From cc83d41ad97fee3703e3db40d44fc6ee3694154d Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Wed, 23 Apr 2008 00:17:12 +0000 Subject: [PATCH] Merge in tmp/jjh-scratch {{{ svn merge -r 18218:18240 https://svn.open-mpi.org/svn/ompi/tmp/jjh-scratch . }}} Contains: * Primarily a fix for a user-reported problem where a cached file descriptor is causing a SIGPIPE on restart. * Clean up some small memory leaks from using mca_base_param_env_var() - Thanks Jeff * Clean up ORTE FT tool compilation in non-FT builds - Thanks Tim P. * Clean up the MPI interface with misplaced {{{OPAL_CR_ENTER_LIBRARY}}} - Thanks Terry * Some other sundry cleanup items all dealing with C/R functionality in the trunk. This commit was SVN r18241. --- ompi/mca/pml/ob1/pml_ob1.c | 12 +- ompi/mpi/c/open_port.c | 4 +- ompi/tools/ompi-server/ompi-server.c | 14 +- opal/mca/crs/base/crs_base_fns.c | 8 +- opal/mca/crs/self/crs_self_module.c | 20 ++- opal/runtime/opal_cr.c | 40 ++++++ opal/tools/opal-checkpoint/opal-checkpoint.c | 12 +- opal/tools/opal-restart/opal-restart.c | 17 ++- orte/mca/ess/env/ess_env_module.c | 126 +++++++++++++++--- orte/mca/ess/ess.h | 2 - orte/mca/oob/tcp/oob_tcp.c | 26 +++- orte/mca/oob/tcp/oob_tcp_peer.c | 3 +- orte/mca/oob/tcp/oob_tcp_peer.h | 2 + orte/mca/routed/routed.h | 12 ++ orte/mca/routed/tree/routed_tree.c | 44 +++++- orte/mca/routed/unity/routed_unity.c | 48 ++++++- orte/mca/snapc/base/base.h | 1 + orte/mca/snapc/base/snapc_base_fns.c | 5 + orte/mca/snapc/base/snapc_base_select.c | 3 +- orte/mca/snapc/full/snapc_full_module.c | 3 +- orte/mca/snapc/snapc.h | 13 ++ orte/orted/orted_main.c | 6 +- orte/runtime/orte_cr.c | 8 -- orte/tools/Makefile.am | 2 - .../orte-checkpoint/help-orte-checkpoint.txt | 10 ++ orte/tools/orte-checkpoint/orte-checkpoint.c | 51 ++++--- orte/tools/orte-clean/orte-clean.c | 12 +- orte/tools/orte-ps/orte-ps.c | 14 +- orte/tools/orte-restart/help-orte-restart.txt | 10 ++ orte/tools/orte-restart/orte-restart.c | 38 +++++- orte/tools/orterun/orterun.c 
| 6 +- orte/util/proc_info.c | 7 +- 32 files changed, 484 insertions(+), 95 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index dbbc944a2d..4afbd25ddf 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -476,15 +476,11 @@ int mca_pml_ob1_ft_event( int state ) /* * Clean out the modex information since it is invalid now. + * orte_grpcomm.purge_proc_attrs(); + * This happens at the ORTE level, so doing it again here will cause + * some issues with socket caching. */ - opal_output_verbose(10, ompi_cr_output, - "pml:ob1: ft_event(Restart): Restart Modex information"); - if (OMPI_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) { - opal_output(0, - "pml:ob1: ft_event(Restart): purge_modex Failed %d", - ret); - return ret; - } + /* * Refresh the proc structure, and publish our proc info in the modex. diff --git a/ompi/mpi/c/open_port.c b/ompi/mpi/c/open_port.c index 3cfe073d87..550d4028fc 100644 --- a/ompi/mpi/c/open_port.c +++ b/ompi/mpi/c/open_port.c @@ -62,9 +62,9 @@ int MPI_Open_port(MPI_Info info, char *port_name) */ } - rc = ompi_dpm.open_port(port_name, OMPI_RML_TAG_INVALID); - OPAL_CR_ENTER_LIBRARY(); + rc = ompi_dpm.open_port(port_name, OMPI_RML_TAG_INVALID); + OMPI_ERRHANDLER_RETURN(rc, MPI_COMM_WORLD, rc, FUNC_NAME); } diff --git a/ompi/tools/ompi-server/ompi-server.c b/ompi/tools/ompi-server/ompi-server.c index a88573f312..52b53faa78 100644 --- a/ompi/tools/ompi-server/ompi-server.c +++ b/ompi/tools/ompi-server/ompi-server.c @@ -103,6 +103,7 @@ int main(int argc, char *argv[]) int ret = 0; opal_cmd_line_t *cmd_line = NULL; char *rml_uri; + char * tmp_env_var = NULL; /* init enough of opal to process cmd lines */ if (OPAL_SUCCESS != opal_init_util()) { @@ -163,15 +164,22 @@ int main(int argc, char *argv[]) opal_cr_set_enabled(false); /* Select the none component, since we don't actually use a checkpointer */ - opal_setenv(mca_base_param_env_var("crs"), + tmp_env_var = mca_base_param_env_var("crs"); 
+ opal_setenv(tmp_env_var, "none", true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; + /* Mark as a tool program */ - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "1", true, &environ); + free(tmp_env_var); #endif - + tmp_env_var = NULL; /* Silence compiler warning */ + /* Perform the standard init, but flag that we are a tool * so that we only open up the communications infrastructure. No * session directories will be created. diff --git a/opal/mca/crs/base/crs_base_fns.c b/opal/mca/crs/base/crs_base_fns.c index 8eb3a50224..395988a145 100644 --- a/opal/mca/crs/base/crs_base_fns.c +++ b/opal/mca/crs/base/crs_base_fns.c @@ -162,8 +162,14 @@ int opal_crs_base_none_prelaunch(int32_t rank, char ***argv, char ***env) { - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + char * tmp_env_var = NULL; + + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "0", true, env); + free(tmp_env_var); + tmp_env_var = NULL; + return OPAL_SUCCESS; } diff --git a/opal/mca/crs/self/crs_self_module.c b/opal/mca/crs/self/crs_self_module.c index 0032a7f3bc..bd6a244153 100644 --- a/opal/mca/crs/self/crs_self_module.c +++ b/opal/mca/crs/self/crs_self_module.c @@ -553,19 +553,31 @@ crs_self_find_function(lt_dlhandle handle, char *prefix, char *suffix){ */ static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd) { + char * tmp_env_var = NULL; + opal_output_verbose(10, mca_crs_self_component.super.output_handle, "crs:self: restart_cmd(%s, ---)", snapshot->cmd_line); - opal_setenv(mca_base_param_env_var("crs"), + tmp_env_var = mca_base_param_env_var("crs"); + opal_setenv(tmp_env_var, "self", true, &environ); - opal_setenv(mca_base_param_env_var("crs_self_do_restart"), + free(tmp_env_var); + tmp_env_var = NULL; + + tmp_env_var = mca_base_param_env_var("crs_self_do_restart"); + opal_setenv(tmp_env_var, "1", true, &environ); - 
opal_setenv(mca_base_param_env_var("crs_self_prefix"), + free(tmp_env_var); + tmp_env_var = NULL; + + tmp_env_var = mca_base_param_env_var("crs_self_prefix"); + opal_setenv(tmp_env_var, mca_crs_self_component.prefix, true, &environ); - + free(tmp_env_var); + tmp_env_var = NULL; /* Instead of adding it to the command line, we should use the environment * to pass the values. This allow sthe OPAL application to be braindead diff --git a/opal/runtime/opal_cr.c b/opal/runtime/opal_cr.c index cdea662433..862f5be398 100644 --- a/opal/runtime/opal_cr.c +++ b/opal/runtime/opal_cr.c @@ -85,6 +85,7 @@ static int cr_notify_response(opal_cr_ckpt_cmd_state_t resp); static int extract_env_vars(int prev_pid); static int cr_entry_point_notify_reopen_files(int *prog_read_fd, int *prog_write_fd); static void opal_cr_entry_point_signal_handler (int signo); +static void opal_cr_sigpipe_debug_signal_handler (int signo); static opal_cr_coord_callback_fn_t cur_coord_callback = NULL; static opal_cr_notify_callback_fn_t cur_notify_callback = NULL; @@ -104,6 +105,8 @@ int opal_cr_checkpointing = OPAL_CR_STATUS_NONE; /* Current checkpoint request channel state */ int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE; +static bool opal_cr_debug_sigpipe = false; + #if OPAL_ENABLE_FT_THREAD == 1 /***************** * Threading Functions and Variables @@ -279,6 +282,22 @@ int opal_cr_init(void ) "opal_cr: init: Checkpoint Signal: %d", opal_cr_entry_point_signal); + mca_base_param_reg_int_name("opal_cr", "debug_sigpipe", + "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)", + false, false, + 0, &val); + opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val); + + opal_output_verbose(10, opal_cr_output, + "opal_cr: init: Debug SIGPIPE: %d (%s)", + val, (opal_cr_debug_sigpipe ? 
"True" : "False")); + + if( opal_cr_debug_sigpipe ) { + if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { + ; + } + } + mca_base_param_reg_string_name("opal_cr", "tmp_dir", "Temporary directory to place rendezvous files for a checkpoint", false, false, @@ -778,6 +797,27 @@ static void opal_cr_entry_point_signal_handler (int signo) opal_cr_checkpoint_request = OPAL_CR_STATUS_REQUESTED; } +/* + * Used only for debugging SIGPIPE problems + */ +static void opal_cr_sigpipe_debug_signal_handler (int signo) +{ + int sleeper = 1; + + if( !opal_cr_debug_sigpipe ) { + opal_output_verbose(10, opal_cr_output, + "opal_cr: sigpipe_debug: Debug SIGPIPE Not enabled :(\n"); + return; + } + + opal_output_verbose(10, opal_cr_output, + "opal_cr: sigpipe_debug: Debug SIGPIPE [%d]: PID (%d)\n", + signo, getpid()); + while(sleeper == 1 ) { + sleep(1); + } +} + /* * Respond to an asynchronous checkpoint request */ diff --git a/opal/tools/opal-checkpoint/opal-checkpoint.c b/opal/tools/opal-checkpoint/opal-checkpoint.c index 45b0498977..5e525e6202 100644 --- a/opal/tools/opal-checkpoint/opal-checkpoint.c +++ b/opal/tools/opal-checkpoint/opal-checkpoint.c @@ -202,6 +202,7 @@ main(int argc, char *argv[]) static int initialize(int argc, char *argv[]) { int ret, exit_status = OPAL_SUCCESS; + char * tmp_env_var = NULL; /* * Make sure to init util before parse_args @@ -242,9 +243,12 @@ static int initialize(int argc, char *argv[]) { * Select the 'none' CRS component, * since we don't actually use a checkpointer */ - opal_setenv(mca_base_param_env_var("crs"), + tmp_env_var = mca_base_param_env_var("crs"); + opal_setenv(tmp_env_var, "none", true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; /* * Initialize OPAL @@ -272,6 +276,7 @@ static int parse_args(int argc, char *argv[]) { int i, ret, len; opal_cmd_line_t cmd_line; char **app_env = NULL, **global_env = NULL; + char * tmp_env_var = NULL; memset(&opal_checkpoint_globals, 0, sizeof(opal_checkpoint_globals_t)); @@ 
-299,9 +304,12 @@ static int parse_args(int argc, char *argv[]) { putenv(global_env[i]); } - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "1", true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; /** * Now start parsing our specific arguments diff --git a/opal/tools/opal-restart/opal-restart.c b/opal/tools/opal-restart/opal-restart.c index 3eb2a45691..0a9fe4d203 100644 --- a/opal/tools/opal-restart/opal-restart.c +++ b/opal/tools/opal-restart/opal-restart.c @@ -150,6 +150,7 @@ main(int argc, char *argv[]) int child_pid; int prev_pid = 0; opal_crs_base_snapshot_t *snapshot = NULL; + char * tmp_env_var = NULL; /*************** * Initialize @@ -170,9 +171,12 @@ main(int argc, char *argv[]) } /* Re-enable the selection of the CRS component, so we can choose the right one */ - opal_setenv(mca_base_param_env_var("crs_base_do_not_select"), + tmp_env_var = mca_base_param_env_var("crs_base_do_not_select"); + opal_setenv(tmp_env_var, "0", /* turn on the selection */ true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; /* * Make sure we are using the correct checkpointer @@ -190,9 +194,12 @@ main(int argc, char *argv[]) "Restart Expects checkpointer: (%s)", expected_crs_comp); - opal_setenv(mca_base_param_env_var("crs"), + tmp_env_var = mca_base_param_env_var("crs"); + opal_setenv(tmp_env_var, expected_crs_comp, true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; /* Select this component or don't continue. 
* If the selection of this component fails, then we can't @@ -299,6 +306,7 @@ main(int argc, char *argv[]) static int initialize(int argc, char *argv[]) { int ret, exit_status = OPAL_SUCCESS; + char * tmp_env_var = NULL; /* * Make sure to init util before parse_args @@ -331,9 +339,12 @@ static int initialize(int argc, char *argv[]) * Turn off the selection of the CRS component, * we need to do that later */ - opal_setenv(mca_base_param_env_var("crs_base_do_not_select"), + tmp_env_var = mca_base_param_env_var("crs_base_do_not_select"); + opal_setenv(tmp_env_var, "1", /* turn off the selection */ true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; /* * Initialize the OPAL layer diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index a81a5d9f9d..2317ea1977 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -95,12 +95,11 @@ orte_ess_base_module_t orte_ess_env_module = { #endif }; - static int rte_init(char flags) { int ret; char *error = NULL; - + /* Start by getting a unique name from the enviro */ env_set_name(); @@ -225,6 +224,14 @@ static int rte_ft_event(int state) /******** Checkpoint Prep ********/ if(OPAL_CRS_CHECKPOINT == state) { + /* + * Notify SnapC + */ + if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) { + exit_status = ret; + goto cleanup; + } + /* * Notify IOF */ @@ -234,7 +241,15 @@ static int rte_ft_event(int state) } /* - * Notify RML & OOB + * Notify Routed + */ + if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) { + exit_status = ret; + goto cleanup; + } + + /* + * Notify RML -> OOB */ if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) { exit_status = ret; @@ -244,13 +259,21 @@ static int rte_ft_event(int state) /******** Continue Recovery ********/ else if (OPAL_CRS_CONTINUE == state ) { /* - * Notify RML & OOB + * Notify RML -> OOB */ if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) { 
exit_status = ret; goto cleanup; } + /* + * Notify Routed + */ + if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) { + exit_status = ret; + goto cleanup; + } + /* * Notify IOF */ @@ -258,16 +281,20 @@ static int rte_ft_event(int state) exit_status = ret; goto cleanup; } + + /* + * Notify SnapC + */ + if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) { + exit_status = ret; + goto cleanup; + } } /******** Restart Recovery ********/ else if (OPAL_CRS_RESTART == state ) { /* - * Notify RML & OOB + * This should follow the ess init() function */ - if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) { - exit_status = ret; - goto cleanup; - } /* * - Reset Contact information @@ -276,11 +303,67 @@ static int rte_ft_event(int state) exit_status = ret; } - /* Session directory stuff: - * orte_process_info.top_session_dir - * orte_process_info.universe_session_dir - * orte_process_info.job_session_dir - * orte_process_info.proc_session_dir + /* + * Notify RML -> OOB + */ + if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) { + exit_status = ret; + goto cleanup; + } + + /* + * Restart the routed framework + * JJH: Lie to the finalize function so it does not try to contact the daemon. 
+ */ + orte_process_info.tool = true; + if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) { + exit_status = ret; + goto cleanup; + } + orte_process_info.tool = false; + if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) { + exit_status = ret; + goto cleanup; + } + + /* + * Group Comm - Clean out stale data + */ + orte_grpcomm.finalize(); + if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) { + exit_status = ret; + goto cleanup; + } + if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) { + exit_status = ret; + goto cleanup; + } + + /* + * Restart the PLM - Does nothing at the moment, but included for completeness + */ + if (ORTE_SUCCESS != (ret = orte_plm.finalize())) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + if (ORTE_SUCCESS != (ret = orte_plm.init())) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * RML - Enable communications + */ + if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { + exit_status = ret; + goto cleanup; + } + + /* + * Session directory re-init */ if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) { exit_status = ret; @@ -299,10 +382,13 @@ static int rte_ft_event(int state) exit_status = ret; } + opal_output_set_output_file_info(orte_process_info.proc_session_dir, + "output-", NULL, NULL); + /* - * Re-enable communication through the RML + * Notify Routed */ - if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { + if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) { exit_status = ret; goto cleanup; } @@ -316,13 +402,9 @@ static int rte_ft_event(int state) } /* - * Re-exchange the routes + * Notify SnapC */ - if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) { - exit_status = ret; - goto cleanup; - } - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { + if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) { exit_status = ret; goto cleanup; } diff --git 
a/orte/mca/ess/ess.h b/orte/mca/ess/ess.h index de68e5f73c..a983dc01d0 100644 --- a/orte/mca/ess/ess.h +++ b/orte/mca/ess/ess.h @@ -73,8 +73,6 @@ typedef int (*orte_ess_base_module_finalize_fn_t)(void); typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report); /** - * Handle fault tolerance updates - * * Handle fault tolerance updates * * @param[in] state Fault tolerance state update diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index af5ce6813c..302cc03828 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -1640,6 +1640,7 @@ int mca_oob_tcp_ft_event(int state) { #else int mca_oob_tcp_ft_event(int state) { int exit_status = ORTE_SUCCESS; + opal_list_item_t *item; if(OPAL_CRS_CHECKPOINT == state) { /* @@ -1647,7 +1648,6 @@ int mca_oob_tcp_ft_event(int state) { */ OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); opal_event_disable(); - } else if(OPAL_CRS_CONTINUE == state) { /* @@ -1657,6 +1657,30 @@ int mca_oob_tcp_ft_event(int state) { OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); } else if(OPAL_CRS_RESTART == state) { + /* + * Clean out cached connection information + * Select pieces of finalize/init + */ + for(item = opal_list_remove_first(&mca_oob_tcp_component.tcp_peer_list); + item != NULL; + item = opal_list_remove_first(&mca_oob_tcp_component.tcp_peer_list)) { + mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)item; + /* JJH: Use the below command for debugging restarts with invalid sockets + * mca_oob_tcp_peer_dump(peer, "RESTART CLEAN") + */ + MCA_OOB_TCP_PEER_RETURN(peer); + } + + OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_free); + OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_names); + OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peers); + OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_list); + + OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_list, opal_list_t); + OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peers, opal_hash_table_t); + OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_names, 
opal_hash_table_t); + OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_free, opal_free_list_t); + /* * Resume event processing */ diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index 5cd3830804..3812439d92 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -76,7 +76,6 @@ static int mca_oob_tcp_peer_send_blocking(mca_oob_tcp_peer_t* peer, int sd, voi static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user); static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user); static void mca_oob_tcp_peer_timer_handler(int sd, short flags, void* user); -static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg); OBJ_CLASS_INSTANCE( @@ -970,7 +969,7 @@ static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user) /* * Routine for debugging to print the connection state and socket options */ -static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg) +void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg) { char src[64]; char dst[64]; diff --git a/orte/mca/oob/tcp/oob_tcp_peer.h b/orte/mca/oob/tcp/oob_tcp_peer.h index a187de6e0b..d0cebea4af 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.h +++ b/orte/mca/oob/tcp/oob_tcp_peer.h @@ -157,6 +157,8 @@ int mca_oob_tcp_peer_send_ident(mca_oob_tcp_peer_t* peer); */ void mca_oob_tcp_peer_dequeue_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg); +void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg); + END_C_DECLS #endif /* _MCA_OOB_TCP_PEER_H */ diff --git a/orte/mca/routed/routed.h b/orte/mca/routed/routed.h index cd45b526b2..72b654f9ce 100644 --- a/orte/mca/routed/routed.h +++ b/orte/mca/routed/routed.h @@ -182,6 +182,16 @@ typedef int (*orte_routed_module_get_wireup_info_fn_t)(orte_jobid_t job, opal_buffer_t *buf); +/** + * Handle fault tolerance updates + * + * @param[in] state Fault tolerance state update + * + * @retval ORTE_SUCCESS The operation completed 
successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_routed_module_ft_event_fn_t)(int state); + /* ******************************************************************** */ @@ -202,6 +212,8 @@ struct orte_routed_module_t { orte_routed_module_init_routes_fn_t init_routes; orte_routed_module_route_lost_fn_t route_lost; orte_routed_module_get_wireup_info_fn_t get_wireup_info; + /* FT Notification */ + orte_routed_module_ft_event_fn_t ft_event; }; /** Convenience typedef */ typedef struct orte_routed_module_t orte_routed_module_t; diff --git a/orte/mca/routed/tree/routed_tree.c b/orte/mca/routed/tree/routed_tree.c index 6cc7ad5a5b..8b873c9f7f 100644 --- a/orte/mca/routed/tree/routed_tree.c +++ b/orte/mca/routed/tree/routed_tree.c @@ -39,6 +39,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf); +#if OPAL_ENABLE_FT == 1 +static int tree_ft_event(int state); +#endif + static orte_process_name_t *lifeline=NULL; orte_routed_module_t orte_routed_tree_module = { @@ -48,7 +52,12 @@ orte_routed_module_t orte_routed_tree_module = { get_route, init_routes, route_lost, - get_wireup_info + get_wireup_info, +#if OPAL_ENABLE_FT == 1 + tree_ft_event +#else + NULL +#endif }; /* local globals */ @@ -571,3 +580,36 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf) return ORTE_SUCCESS; } + +#if OPAL_ENABLE_FT == 1 +static int tree_ft_event(int state) +{ + int ret, exit_status = ORTE_SUCCESS; + + /******** Checkpoint Prep ********/ + if(OPAL_CRS_CHECKPOINT == state) { + } + /******** Continue Recovery ********/ + else if (OPAL_CRS_CONTINUE == state ) { + } + /******** Restart Recovery ********/ + else if (OPAL_CRS_RESTART == state ) { + /* + * Re-exchange the routes + */ + if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { + exit_status = ret; + goto cleanup; + } + 
} + else if (OPAL_CRS_TERM == state ) { + /* Nothing */ + } + else { + /* Error state = Nothing */ + } + + cleanup: + return exit_status; +} +#endif diff --git a/orte/mca/routed/unity/routed_unity.c b/orte/mca/routed/unity/routed_unity.c index 7ce803dea4..d89b3b462f 100644 --- a/orte/mca/routed/unity/routed_unity.c +++ b/orte/mca/routed/unity/routed_unity.c @@ -45,6 +45,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf); +#if OPAL_ENABLE_FT == 1 +static int unity_ft_event(int state); +#endif + static orte_process_name_t *lifeline=NULL; orte_routed_module_t orte_routed_unity_module = { @@ -54,7 +58,12 @@ orte_routed_module_t orte_routed_unity_module = { get_route, init_routes, route_lost, - get_wireup_info + get_wireup_info, +#if OPAL_ENABLE_FT == 1 + unity_ft_event +#else + NULL +#endif }; static int init(void) @@ -66,6 +75,8 @@ static int init(void) OBJ_CONSTRUCT(&peer_list, opal_hash_table_t); opal_hash_table_init(&peer_list, 128); + lifeline = NULL; + return ORTE_SUCCESS; } @@ -107,6 +118,8 @@ static int finalize(void) /* cleanup the global condition */ OBJ_DESTRUCT(&cond); OBJ_DESTRUCT(&lock); + + lifeline = NULL; return ORTE_SUCCESS; } @@ -576,3 +589,36 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf) return ORTE_SUCCESS; } + +#if OPAL_ENABLE_FT == 1 +static int unity_ft_event(int state) +{ + int ret, exit_status = ORTE_SUCCESS; + + /******** Checkpoint Prep ********/ + if(OPAL_CRS_CHECKPOINT == state) { + } + /******** Continue Recovery ********/ + else if (OPAL_CRS_CONTINUE == state ) { + } + /******** Restart Recovery ********/ + else if (OPAL_CRS_RESTART == state ) { + /* + * Re-exchange the routes + */ + if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { + exit_status = ret; + goto cleanup; + } + } + else if (OPAL_CRS_TERM == state ) { + /* Nothing */ + } + else { 
+ /* Error state = Nothing */ + } + + cleanup: + return exit_status; +} +#endif diff --git a/orte/mca/snapc/base/base.h b/orte/mca/snapc/base/base.h index 3c532dbefe..48ad56309a 100644 --- a/orte/mca/snapc/base/base.h +++ b/orte/mca/snapc/base/base.h @@ -113,6 +113,7 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type; ORTE_DECLSPEC int orte_snapc_base_module_finalize(void); ORTE_DECLSPEC int orte_snapc_base_none_setup_job(orte_jobid_t jobid); ORTE_DECLSPEC int orte_snapc_base_none_release_job(orte_jobid_t jobid); + ORTE_DECLSPEC int orte_snapc_base_none_ft_event(int state); ORTE_DECLSPEC extern int orte_snapc_base_output; ORTE_DECLSPEC extern opal_list_t orte_snapc_base_components_available; diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index f07620b54f..e4b6c8b20f 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ b/orte/mca/snapc/base/snapc_base_fns.c @@ -203,6 +203,11 @@ int orte_snapc_base_none_release_job(orte_jobid_t jobid) return ORTE_SUCCESS; } +int orte_snapc_base_none_ft_event(int state) +{ + return ORTE_SUCCESS; +} + /******************** * Local Functions ********************/ diff --git a/orte/mca/snapc/base/snapc_base_select.c b/orte/mca/snapc/base/snapc_base_select.c index e71f283cfb..db9b07dd60 100644 --- a/orte/mca/snapc/base/snapc_base_select.c +++ b/orte/mca/snapc/base/snapc_base_select.c @@ -66,7 +66,8 @@ static orte_snapc_base_module_t none_module = { /** Finalization Function */ orte_snapc_base_module_finalize, orte_snapc_base_none_setup_job, - orte_snapc_base_none_release_job + orte_snapc_base_none_release_job, + orte_snapc_base_none_ft_event }; int orte_snapc_base_select(bool seed, bool app) diff --git a/orte/mca/snapc/full/snapc_full_module.c b/orte/mca/snapc/full/snapc_full_module.c index 57ee4253ff..8fe7e1a353 100644 --- a/orte/mca/snapc/full/snapc_full_module.c +++ b/orte/mca/snapc/full/snapc_full_module.c @@ -47,7 +47,8 @@ static orte_snapc_base_module_t loc_module = { /** 
Finalization Function */ orte_snapc_full_module_finalize, orte_snapc_full_setup_job, - orte_snapc_full_release_job + orte_snapc_full_release_job, + orte_snapc_base_none_ft_event }; /* diff --git a/orte/mca/snapc/snapc.h b/orte/mca/snapc/snapc.h index 5cef666f56..fd6ebec38b 100644 --- a/orte/mca/snapc/snapc.h +++ b/orte/mca/snapc/snapc.h @@ -200,6 +200,17 @@ typedef int (*orte_snapc_base_setup_job_fn_t) typedef int (*orte_snapc_base_release_job_fn_t) (orte_jobid_t jobid); + +/** + * Handle fault tolerance updates + * + * @param[in] state Fault tolerance state update + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_snapc_base_ft_event_fn_t)(int state); + /** * Structure for SNAPC v1.0.0 components. */ @@ -234,6 +245,8 @@ struct orte_snapc_base_module_1_0_0_t { orte_snapc_base_setup_job_fn_t setup_job; /** Release job */ orte_snapc_base_release_job_fn_t release_job; + /** Handle any FT Notifications */ + orte_snapc_base_ft_event_fn_t ft_event; }; typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_1_0_0_t; typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_t; diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 680675ab81..1c824df542 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -187,6 +187,7 @@ int orte_daemon(int argc, char *argv[]) int i; opal_buffer_t *buffer; char hostname[100]; + char *tmp_env_var = NULL; /* initialize the globals */ memset(&orted_globals, 0, sizeof(orted_globals)); @@ -283,10 +284,13 @@ int orte_daemon(int argc, char *argv[]) #if OPAL_ENABLE_FT == 1 /* Mark as a tool program */ - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "1", true, &environ); + free(tmp_env_var); #endif + tmp_env_var = NULL; /* Silence compiler warning */ /* detach from controlling terminal * otherwise, remain 
attached so output can get to us diff --git a/orte/runtime/orte_cr.c b/orte/runtime/orte_cr.c index e3168d7040..f95eef4c9b 100644 --- a/orte/runtime/orte_cr.c +++ b/orte/runtime/orte_cr.c @@ -325,14 +325,6 @@ static int orte_cr_coord_post_restart(void) { /* * Notify the ESS */ - if (ORTE_SUCCESS != (ret = orte_ess_base_open())) { - exit_status = ret; - } - - if (ORTE_SUCCESS != (ret = orte_ess_base_select())) { - exit_status = ret; - } - if( NULL != orte_ess.ft_event ) { if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_RESTART))) { exit_status = ret; diff --git a/orte/tools/Makefile.am b/orte/tools/Makefile.am index 462a3f516d..472ed0adc3 100644 --- a/orte/tools/Makefile.am +++ b/orte/tools/Makefile.am @@ -25,8 +25,6 @@ SUBDIRS += \ tools/orterun \ tools/orte-clean \ tools/orte-ps \ - tools/orte-checkpoint \ - tools/orte-restart \ tools/wrappers DIST_SUBDIRS += \ diff --git a/orte/tools/orte-checkpoint/help-orte-checkpoint.txt b/orte/tools/orte-checkpoint/help-orte-checkpoint.txt index eda5a9598d..260a60956b 100644 --- a/orte/tools/orte-checkpoint/help-orte-checkpoint.txt +++ b/orte/tools/orte-checkpoint/help-orte-checkpoint.txt @@ -24,6 +24,16 @@ ompi-checkpoint PID_OF_MPIRUN %s +[usage-no-cr] +This build of Open MPI does *not* include Checkpoint/Restart functionality. +If you require this functionality re-configure Open MPI with the proper +Checkpoint/Restart options. + +ompi-checkpoint PID_OF_MPIRUN + Open MPI Checkpoint Tool + +%s + [invalid_pid] Error: The PID (%d) is invalid because either you have not provided a PID or provided an invalid PID. 
diff --git a/orte/tools/orte-checkpoint/orte-checkpoint.c b/orte/tools/orte-checkpoint/orte-checkpoint.c index 3adf8db774..568050ec7b 100644 --- a/orte/tools/orte-checkpoint/orte-checkpoint.c +++ b/orte/tools/orte-checkpoint/orte-checkpoint.c @@ -107,6 +107,8 @@ static int global_sequence_num = 0; /***************************************** * Global Vars for Command line Arguments *****************************************/ +static bool listener_started = false; + typedef struct { bool help; int pid; @@ -267,6 +269,7 @@ static int parse_args(int argc, char *argv[]) { int i, ret, len, exit_status = ORTE_SUCCESS ; opal_cmd_line_t cmd_line; char **app_env = NULL, **global_env = NULL; + char * tmp_env_var = NULL; /* Init structure */ memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t)); @@ -301,9 +304,12 @@ static int parse_args(int argc, char *argv[]) { putenv(global_env[i]); } - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "1", true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; /** * Now start parsing our specific arguments @@ -311,6 +317,19 @@ static int parse_args(int argc, char *argv[]) { /* get the remaining bits */ opal_cmd_line_get_tail(&cmd_line, &argc, &argv); +#if OPAL_ENABLE_FT == 0 + /* Warn and exit if not configured with Checkpoint/Restart */ + { + char *args = NULL; + args = opal_cmd_line_get_usage_msg(&cmd_line); + opal_show_help("help-orte-checkpoint.txt", "usage-no-cr", + true, args); + free(args); + exit_status = ORTE_ERROR; + goto cleanup; + } +#endif + if (OPAL_SUCCESS != ret || orte_checkpoint_globals.help || (0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp)) { @@ -322,7 +341,7 @@ static int parse_args(int argc, char *argv[]) { exit_status = ORTE_ERROR; goto cleanup; } - + /* * If the user did not supply an hnp jobid, then they must * supply the PID of MPIRUN @@ -398,6 +417,9 @@ cleanup: static int 
ckpt_init(int argc, char *argv[]) { int exit_status = ORTE_SUCCESS, ret; + char * tmp_env_var = NULL; + + listener_started = false; /* * Make sure to init util before parse_args @@ -432,9 +454,12 @@ static int ckpt_init(int argc, char *argv[]) { opal_cr_set_enabled(false); /* Select the none component, since we don't actually use a checkpointer */ - opal_setenv(mca_base_param_env_var("crs"), + tmp_env_var = mca_base_param_env_var("crs"); + opal_setenv(tmp_env_var, "none", true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; /*************************** * We need all of OPAL and the TOOLS portion of ORTE - this @@ -488,6 +513,8 @@ static int start_listener(void) goto cleanup; } + listener_started = true; + cleanup: return exit_status; } @@ -496,12 +523,18 @@ static int stop_listener(void) { int ret, exit_status = ORTE_SUCCESS; + if( !listener_started ) { + exit_status = ORTE_ERROR; + goto cleanup; + } + if (ORTE_SUCCESS != (ret = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT))) { exit_status = ret; goto cleanup; } + listener_started = false; cleanup: return exit_status; } @@ -597,18 +630,6 @@ static void process_ckpt_update_cmd(orte_process_name_t* sender, pretty_print_status(); } } - /* - * Otherwise only display it if we are going to be terminated soon - */ - else { - /* Since ORTE kills us before we get the Finished message, - * print out the global snapshot handle when we start running - */ /* JJH */ - if(orte_checkpoint_globals.term && - ORTE_SNAPC_CKPT_STATE_RUNNING == orte_checkpoint_globals.ckpt_status ) { - pretty_print_status(); - } - } cleanup: return; diff --git a/orte/tools/orte-clean/orte-clean.c b/orte/tools/orte-clean/orte-clean.c index 4d42f8661e..b02f5fcace 100644 --- a/orte/tools/orte-clean/orte-clean.c +++ b/orte/tools/orte-clean/orte-clean.c @@ -162,6 +162,7 @@ static int parse_args(int argc, char *argv[]) { int ret; opal_cmd_line_t cmd_line; orte_clean_globals_t tmp = { false, false }; + char * tmp_env_var = NULL; /* Parse 
the command line options */ @@ -176,8 +177,11 @@ static int parse_args(int argc, char *argv[]) { opal_cmd_line_create(&cmd_line, cmd_line_opts); ret = opal_cmd_line_parse(&cmd_line, true, argc, argv); - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "1", true, NULL); + free(tmp_env_var); + tmp_env_var = NULL; /** * Now start parsing our specific arguments @@ -199,6 +203,7 @@ static int parse_args(int argc, char *argv[]) { static int orte_clean_init(void) { int exit_status = ORTE_SUCCESS, ret; + char * tmp_env_var = NULL; #if OPAL_ENABLE_FT == 1 /* Disable the checkpoint notification routine for this @@ -208,10 +213,13 @@ static int orte_clean_init(void) { opal_cr_set_enabled(false); /* Select the none component, since we don't actually use a checkpointer */ - opal_setenv(mca_base_param_env_var("crs"), + tmp_env_var = mca_base_param_env_var("crs"); + opal_setenv(tmp_env_var, "none", true, &environ); + free(tmp_env_var); #endif + tmp_env_var = NULL; /* Silence compiler warning */ if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL_WITH_NAME))) { exit_status = ret; diff --git a/orte/tools/orte-ps/orte-ps.c b/orte/tools/orte-ps/orte-ps.c index 0fcc9e0b2d..8f917a2ae1 100644 --- a/orte/tools/orte-ps/orte-ps.c +++ b/orte/tools/orte-ps/orte-ps.c @@ -326,6 +326,7 @@ static int parse_args(int argc, char *argv[]) { static int orte_ps_init(int argc, char *argv[]) { int ret; + char * tmp_env_var = NULL; /* * Make sure to init util before parse_args @@ -361,14 +362,21 @@ static int orte_ps_init(int argc, char *argv[]) { opal_cr_set_enabled(false); /* Select the none component, since we don't actually use a checkpointer */ - opal_setenv(mca_base_param_env_var("crs"), + tmp_env_var = mca_base_param_env_var("crs"); + opal_setenv(tmp_env_var, "none", true, &environ); - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + free(tmp_env_var); + tmp_env_var = NULL; + + tmp_env_var = 
mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "1", true, &environ); + free(tmp_env_var); #endif - + tmp_env_var = NULL; /* Silence compiler warning */ + /*************************** * We need all of OPAL and the TOOL portion of ORTE ***************************/ diff --git a/orte/tools/orte-restart/help-orte-restart.txt b/orte/tools/orte-restart/help-orte-restart.txt index 757fb1087f..49beefbf9d 100644 --- a/orte/tools/orte-restart/help-orte-restart.txt +++ b/orte/tools/orte-restart/help-orte-restart.txt @@ -24,6 +24,16 @@ ompi-restart GLOBAL_SNAPSHOT_REF %s +[usage-no-cr] +This build of Open MPI does *not* include Checkpoint/Restart functionality. +If you require this functionality re-configure Open MPI with the proper +Checkpoint/Restart options. + +ompi-restart GLOBAL_SNAPSHOT_REF + Open MPI Parallel Job Restart Tool + +%s + [invalid_filename] Error: The filename (%s) is invalid because either you have not provided a filename or provided an invalid filename. diff --git a/orte/tools/orte-restart/orte-restart.c b/orte/tools/orte-restart/orte-restart.c index de2d752916..d4e266270a 100644 --- a/orte/tools/orte-restart/orte-restart.c +++ b/orte/tools/orte-restart/orte-restart.c @@ -229,6 +229,7 @@ main(int argc, char *argv[]) static int initialize(int argc, char *argv[]) { int ret, exit_status = ORTE_SUCCESS; + char * tmp_env_var = NULL; /* * Make sure to init util before parse_args @@ -264,9 +265,12 @@ static int initialize(int argc, char *argv[]) { opal_cr_set_enabled(false); /* Select the none component, since we don't actually use a checkpointer */ - opal_setenv(mca_base_param_env_var("crs"), + tmp_env_var = mca_base_param_env_var("crs"); + opal_setenv(tmp_env_var, "none", true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; /* * Setup any ORTE stuff we might need @@ -277,8 +281,15 @@ static int initialize(int argc, char *argv[]) { } /* Unset these now that we no longer need them */ - opal_unsetenv(mca_base_param_env_var("crs"), 
&environ); - opal_unsetenv(mca_base_param_env_var("opal_cr_is_tool"), &environ); + tmp_env_var = mca_base_param_env_var("crs"); + opal_unsetenv(tmp_env_var, &environ); + free(tmp_env_var); + tmp_env_var = NULL; + + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_unsetenv(tmp_env_var, &environ); + free(tmp_env_var); + tmp_env_var = NULL; cleanup: return exit_status; @@ -300,6 +311,7 @@ static int parse_args(int argc, char *argv[]) int i, ret, len; opal_cmd_line_t cmd_line; char **app_env = NULL, **global_env = NULL; + char * tmp_env_var = NULL; orte_restart_globals_t tmp = { false, /* help */ NULL, /* filename */ NULL, /* appfile */ @@ -334,13 +346,29 @@ static int parse_args(int argc, char *argv[]) putenv(global_env[i]); } - - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "1", true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; + /** * Now start parsing our specific arguments */ + +#if OPAL_ENABLE_FT == 0 + /* Warn and exit if not configured with Checkpoint/Restart */ + { + char *args = NULL; + args = opal_cmd_line_get_usage_msg(&cmd_line); + opal_show_help("help-orte-restart.txt", "usage-no-cr", + true, args); + free(args); + return ORTE_ERROR; + } +#endif + if (OPAL_SUCCESS != ret || orte_restart_globals.help || 1 >= argc) { diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 82c66c11b6..b432f72f75 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -314,6 +314,7 @@ int orterun(int argc, char *argv[]) { int rc; opal_cmd_line_t cmd_line; + char * tmp_env_var = NULL; /* find our basename (the name of the executable) so that we can use it in pretty-print error messages */ @@ -378,10 +379,13 @@ int orterun(int argc, char *argv[]) #if OPAL_ENABLE_FT == 1 /* Disable OPAL CR notifications for this tool */ opal_cr_set_enabled(false); - opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), + 
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, "1", true, &environ); + free(tmp_env_var); #endif + tmp_env_var = NULL; /* Silence compiler warning */ /* Intialize our Open RTE environment * Set the flag telling orte_init that I am NOT a diff --git a/orte/util/proc_info.c b/orte/util/proc_info.c index 15073a43ce..c0ddf32286 100644 --- a/orte/util/proc_info.c +++ b/orte/util/proc_info.c @@ -159,7 +159,7 @@ int orte_proc_info_finalize(void) return ORTE_SUCCESS; } - if (NULL != orte_process_info.tmpdir_base) { + if (NULL != orte_process_info.tmpdir_base) { free(orte_process_info.tmpdir_base); orte_process_info.tmpdir_base = NULL; } @@ -177,7 +177,6 @@ int orte_proc_info_finalize(void) if (NULL != orte_process_info.proc_session_dir) { free(orte_process_info.proc_session_dir); orte_process_info.proc_session_dir = NULL; - } if (NULL != orte_process_info.nodename) { @@ -190,12 +189,12 @@ int orte_proc_info_finalize(void) orte_process_info.sock_stdin = NULL; } - if (NULL != orte_process_info.sock_stdout) { + if (NULL != orte_process_info.sock_stdout) { free(orte_process_info.sock_stdout); orte_process_info.sock_stdout = NULL; } - if (NULL != orte_process_info.sock_stderr) { + if (NULL != orte_process_info.sock_stderr) { free(orte_process_info.sock_stderr); orte_process_info.sock_stderr = NULL; }