Merge in tmp/jjh-scratch
{{{ svn merge -r 18218:18240 https://svn.open-mpi.org/svn/ompi/tmp/jjh-scratch . }}} Contains: * Primarily a fix for a user-reported problem where a cached file descriptor is causing a SIGPIPE on restart. * Clean up some small memory leaks from using mca_base_param_env_var() - Thanks Jeff * Clean up ORTE FT tool compilation in non-FT builds - Thanks Tim P. * Clean up the MPI interface with a misplaced {{{OPAL_CR_ENTER_LIBRARY}}} - Thanks Terry * Some other sundry cleanup items, all dealing with C/R functionality in the trunk. This commit was SVN r18241.
Этот коммит содержится в:
родитель
0215474cb8
Коммит
cc83d41ad9
@ -476,15 +476,11 @@ int mca_pml_ob1_ft_event( int state )
|
||||
|
||||
/*
|
||||
* Clean out the modex information since it is invalid now.
|
||||
* orte_grpcomm.purge_proc_attrs();
|
||||
* This happens at the ORTE level, so doing it again here will cause
|
||||
* some issues with socket caching.
|
||||
*/
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"pml:ob1: ft_event(Restart): Restart Modex information");
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) {
|
||||
opal_output(0,
|
||||
"pml:ob1: ft_event(Restart): purge_modex Failed %d",
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Refresh the proc structure, and publish our proc info in the modex.
|
||||
|
@ -62,9 +62,9 @@ int MPI_Open_port(MPI_Info info, char *port_name)
|
||||
*/
|
||||
}
|
||||
|
||||
rc = ompi_dpm.open_port(port_name, OMPI_RML_TAG_INVALID);
|
||||
|
||||
OPAL_CR_ENTER_LIBRARY();
|
||||
|
||||
rc = ompi_dpm.open_port(port_name, OMPI_RML_TAG_INVALID);
|
||||
|
||||
OMPI_ERRHANDLER_RETURN(rc, MPI_COMM_WORLD, rc, FUNC_NAME);
|
||||
}
|
||||
|
@ -103,6 +103,7 @@ int main(int argc, char *argv[])
|
||||
int ret = 0;
|
||||
opal_cmd_line_t *cmd_line = NULL;
|
||||
char *rml_uri;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/* init enough of opal to process cmd lines */
|
||||
if (OPAL_SUCCESS != opal_init_util()) {
|
||||
@ -163,15 +164,22 @@ int main(int argc, char *argv[])
|
||||
opal_cr_set_enabled(false);
|
||||
|
||||
/* Select the none component, since we don't actually use a checkpointer */
|
||||
opal_setenv(mca_base_param_env_var("crs"),
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/* Mark as a tool program */
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
#endif
|
||||
|
||||
tmp_env_var = NULL; /* Silence compiler warning */
|
||||
|
||||
/* Perform the standard init, but flag that we are a tool
|
||||
* so that we only open up the communications infrastructure. No
|
||||
* session directories will be created.
|
||||
|
@ -162,8 +162,14 @@ int opal_crs_base_none_prelaunch(int32_t rank,
|
||||
char ***argv,
|
||||
char ***env)
|
||||
{
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"0", true, env);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -553,19 +553,31 @@ crs_self_find_function(lt_dlhandle handle, char *prefix, char *suffix){
|
||||
*/
|
||||
static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd)
|
||||
{
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
||||
"crs:self: restart_cmd(%s, ---)", snapshot->cmd_line);
|
||||
|
||||
opal_setenv(mca_base_param_env_var("crs"),
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_setenv(tmp_env_var,
|
||||
"self",
|
||||
true, &environ);
|
||||
opal_setenv(mca_base_param_env_var("crs_self_do_restart"),
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
tmp_env_var = mca_base_param_env_var("crs_self_do_restart");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
opal_setenv(mca_base_param_env_var("crs_self_prefix"),
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
tmp_env_var = mca_base_param_env_var("crs_self_prefix");
|
||||
opal_setenv(tmp_env_var,
|
||||
mca_crs_self_component.prefix,
|
||||
true, &environ);
|
||||
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/* Instead of adding it to the command line, we should use the environment
|
||||
* to pass the values. This allows the OPAL application to be braindead
|
||||
|
@ -85,6 +85,7 @@ static int cr_notify_response(opal_cr_ckpt_cmd_state_t resp);
|
||||
static int extract_env_vars(int prev_pid);
|
||||
static int cr_entry_point_notify_reopen_files(int *prog_read_fd, int *prog_write_fd);
|
||||
static void opal_cr_entry_point_signal_handler (int signo);
|
||||
static void opal_cr_sigpipe_debug_signal_handler (int signo);
|
||||
|
||||
static opal_cr_coord_callback_fn_t cur_coord_callback = NULL;
|
||||
static opal_cr_notify_callback_fn_t cur_notify_callback = NULL;
|
||||
@ -104,6 +105,8 @@ int opal_cr_checkpointing = OPAL_CR_STATUS_NONE;
|
||||
/* Current checkpoint request channel state */
|
||||
int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
|
||||
|
||||
static bool opal_cr_debug_sigpipe = false;
|
||||
|
||||
#if OPAL_ENABLE_FT_THREAD == 1
|
||||
/*****************
|
||||
* Threading Functions and Variables
|
||||
@ -279,6 +282,22 @@ int opal_cr_init(void )
|
||||
"opal_cr: init: Checkpoint Signal: %d",
|
||||
opal_cr_entry_point_signal);
|
||||
|
||||
mca_base_param_reg_int_name("opal_cr", "debug_sigpipe",
|
||||
"Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)",
|
||||
false, false,
|
||||
0, &val);
|
||||
opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
opal_output_verbose(10, opal_cr_output,
|
||||
"opal_cr: init: Debug SIGPIPE: %d (%s)",
|
||||
val, (opal_cr_debug_sigpipe ? "True" : "False"));
|
||||
|
||||
if( opal_cr_debug_sigpipe ) {
|
||||
if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
mca_base_param_reg_string_name("opal_cr", "tmp_dir",
|
||||
"Temporary directory to place rendezvous files for a checkpoint",
|
||||
false, false,
|
||||
@ -778,6 +797,27 @@ static void opal_cr_entry_point_signal_handler (int signo)
|
||||
opal_cr_checkpoint_request = OPAL_CR_STATUS_REQUESTED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Used only for debugging SIGPIPE problems
|
||||
*/
|
||||
static void opal_cr_sigpipe_debug_signal_handler (int signo)
|
||||
{
|
||||
int sleeper = 1;
|
||||
|
||||
if( !opal_cr_debug_sigpipe ) {
|
||||
opal_output_verbose(10, opal_cr_output,
|
||||
"opal_cr: sigpipe_debug: Debug SIGPIPE Not enabled :(\n");
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, opal_cr_output,
|
||||
"opal_cr: sigpipe_debug: Debug SIGPIPE [%d]: PID (%d)\n",
|
||||
signo, getpid());
|
||||
while(sleeper == 1 ) {
|
||||
sleep(1);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Respond to an asynchronous checkpoint request
|
||||
*/
|
||||
|
@ -202,6 +202,7 @@ main(int argc, char *argv[])
|
||||
|
||||
static int initialize(int argc, char *argv[]) {
|
||||
int ret, exit_status = OPAL_SUCCESS;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Make sure to init util before parse_args
|
||||
@ -242,9 +243,12 @@ static int initialize(int argc, char *argv[]) {
|
||||
* Select the 'none' CRS component,
|
||||
* since we don't actually use a checkpointer
|
||||
*/
|
||||
opal_setenv(mca_base_param_env_var("crs"),
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Initialize OPAL
|
||||
@ -272,6 +276,7 @@ static int parse_args(int argc, char *argv[]) {
|
||||
int i, ret, len;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char **app_env = NULL, **global_env = NULL;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
memset(&opal_checkpoint_globals, 0, sizeof(opal_checkpoint_globals_t));
|
||||
|
||||
@ -299,9 +304,12 @@ static int parse_args(int argc, char *argv[]) {
|
||||
putenv(global_env[i]);
|
||||
}
|
||||
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
|
@ -150,6 +150,7 @@ main(int argc, char *argv[])
|
||||
int child_pid;
|
||||
int prev_pid = 0;
|
||||
opal_crs_base_snapshot_t *snapshot = NULL;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/***************
|
||||
* Initialize
|
||||
@ -170,9 +171,12 @@ main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* Re-enable the selection of the CRS component, so we can choose the right one */
|
||||
opal_setenv(mca_base_param_env_var("crs_base_do_not_select"),
|
||||
tmp_env_var = mca_base_param_env_var("crs_base_do_not_select");
|
||||
opal_setenv(tmp_env_var,
|
||||
"0", /* turn on the selection */
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Make sure we are using the correct checkpointer
|
||||
@ -190,9 +194,12 @@ main(int argc, char *argv[])
|
||||
"Restart Expects checkpointer: (%s)",
|
||||
expected_crs_comp);
|
||||
|
||||
opal_setenv(mca_base_param_env_var("crs"),
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_setenv(tmp_env_var,
|
||||
expected_crs_comp,
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/* Select this component or don't continue.
|
||||
* If the selection of this component fails, then we can't
|
||||
@ -299,6 +306,7 @@ main(int argc, char *argv[])
|
||||
static int initialize(int argc, char *argv[])
|
||||
{
|
||||
int ret, exit_status = OPAL_SUCCESS;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Make sure to init util before parse_args
|
||||
@ -331,9 +339,12 @@ static int initialize(int argc, char *argv[])
|
||||
* Turn off the selection of the CRS component,
|
||||
* we need to do that later
|
||||
*/
|
||||
opal_setenv(mca_base_param_env_var("crs_base_do_not_select"),
|
||||
tmp_env_var = mca_base_param_env_var("crs_base_do_not_select");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1", /* turn off the selection */
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Initialize the OPAL layer
|
||||
|
126
orte/mca/ess/env/ess_env_module.c
поставляемый
126
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -95,12 +95,11 @@ orte_ess_base_module_t orte_ess_env_module = {
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
static int rte_init(char flags)
|
||||
{
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
|
||||
|
||||
/* Start by getting a unique name from the enviro */
|
||||
env_set_name();
|
||||
|
||||
@ -225,6 +224,14 @@ static int rte_ft_event(int state)
|
||||
|
||||
/******** Checkpoint Prep ********/
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/*
|
||||
* Notify SnapC
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify IOF
|
||||
*/
|
||||
@ -234,7 +241,15 @@ static int rte_ft_event(int state)
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify RML & OOB
|
||||
* Notify Routed
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify RML -> OOB
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
exit_status = ret;
|
||||
@ -244,13 +259,21 @@ static int rte_ft_event(int state)
|
||||
/******** Continue Recovery ********/
|
||||
else if (OPAL_CRS_CONTINUE == state ) {
|
||||
/*
|
||||
* Notify RML & OOB
|
||||
* Notify RML -> OOB
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify Routed
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify IOF
|
||||
*/
|
||||
@ -258,16 +281,20 @@ static int rte_ft_event(int state)
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify SnapC
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
/******** Restart Recovery ********/
|
||||
else if (OPAL_CRS_RESTART == state ) {
|
||||
/*
|
||||
* Notify RML & OOB
|
||||
* This should follow the ess init() function
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* - Reset Contact information
|
||||
@ -276,11 +303,67 @@ static int rte_ft_event(int state)
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
/* Session directory stuff:
|
||||
* orte_process_info.top_session_dir
|
||||
* orte_process_info.universe_session_dir
|
||||
* orte_process_info.job_session_dir
|
||||
* orte_process_info.proc_session_dir
|
||||
/*
|
||||
* Notify RML -> OOB
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Restart the routed framework
|
||||
* JJH: Lie to the finalize function so it does not try to contact the daemon.
|
||||
*/
|
||||
orte_process_info.tool = true;
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
orte_process_info.tool = false;
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Group Comm - Clean out stale data
|
||||
*/
|
||||
orte_grpcomm.finalize();
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Restart the PLM - Does nothing at the moment, but included for completeness
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* RML - Enable communications
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Session directory re-init
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) {
|
||||
exit_status = ret;
|
||||
@ -299,10 +382,13 @@ static int rte_ft_event(int state)
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
|
||||
"output-", NULL, NULL);
|
||||
|
||||
/*
|
||||
* Re-enable communication through the RML
|
||||
* Notify Routed
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -316,13 +402,9 @@ static int rte_ft_event(int state)
|
||||
}
|
||||
|
||||
/*
|
||||
* Re-exchange the routes
|
||||
* Notify SnapC
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
@ -73,8 +73,6 @@ typedef int (*orte_ess_base_module_finalize_fn_t)(void);
|
||||
typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report);
|
||||
|
||||
/**
|
||||
* Handle fault tolerance updates
|
||||
*
|
||||
* Handle fault tolerance updates
|
||||
*
|
||||
* @param[in] state Fault tolerance state update
|
||||
|
@ -1640,6 +1640,7 @@ int mca_oob_tcp_ft_event(int state) {
|
||||
#else
|
||||
int mca_oob_tcp_ft_event(int state) {
|
||||
int exit_status = ORTE_SUCCESS;
|
||||
opal_list_item_t *item;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/*
|
||||
@ -1647,7 +1648,6 @@ int mca_oob_tcp_ft_event(int state) {
|
||||
*/
|
||||
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
opal_event_disable();
|
||||
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
/*
|
||||
@ -1657,6 +1657,30 @@ int mca_oob_tcp_ft_event(int state) {
|
||||
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
}
|
||||
else if(OPAL_CRS_RESTART == state) {
|
||||
/*
|
||||
* Clean out cached connection information
|
||||
* Select pieces of finalize/init
|
||||
*/
|
||||
for(item = opal_list_remove_first(&mca_oob_tcp_component.tcp_peer_list);
|
||||
item != NULL;
|
||||
item = opal_list_remove_first(&mca_oob_tcp_component.tcp_peer_list)) {
|
||||
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)item;
|
||||
/* JJH: Use the below command for debugging restarts with invalid sockets
|
||||
* mca_oob_tcp_peer_dump(peer, "RESTART CLEAN")
|
||||
*/
|
||||
MCA_OOB_TCP_PEER_RETURN(peer);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_free);
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_names);
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peers);
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_list);
|
||||
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peers, opal_hash_table_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_names, opal_hash_table_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_free, opal_free_list_t);
|
||||
|
||||
/*
|
||||
* Resume event processing
|
||||
*/
|
||||
|
@ -76,7 +76,6 @@ static int mca_oob_tcp_peer_send_blocking(mca_oob_tcp_peer_t* peer, int sd, voi
|
||||
static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user);
|
||||
static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user);
|
||||
static void mca_oob_tcp_peer_timer_handler(int sd, short flags, void* user);
|
||||
static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg);
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
@ -970,7 +969,7 @@ static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user)
|
||||
/*
|
||||
* Routine for debugging to print the connection state and socket options
|
||||
*/
|
||||
static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
|
||||
void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
|
||||
{
|
||||
char src[64];
|
||||
char dst[64];
|
||||
|
@ -157,6 +157,8 @@ int mca_oob_tcp_peer_send_ident(mca_oob_tcp_peer_t* peer);
|
||||
*/
|
||||
void mca_oob_tcp_peer_dequeue_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg);
|
||||
|
||||
void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* _MCA_OOB_TCP_PEER_H */
|
||||
|
@ -182,6 +182,16 @@ typedef int (*orte_routed_module_get_wireup_info_fn_t)(orte_jobid_t job,
|
||||
opal_buffer_t *buf);
|
||||
|
||||
|
||||
/**
|
||||
* Handle fault tolerance updates
|
||||
*
|
||||
* @param[in] state Fault tolerance state update
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_routed_module_ft_event_fn_t)(int state);
|
||||
|
||||
/* ******************************************************************** */
|
||||
|
||||
|
||||
@ -202,6 +212,8 @@ struct orte_routed_module_t {
|
||||
orte_routed_module_init_routes_fn_t init_routes;
|
||||
orte_routed_module_route_lost_fn_t route_lost;
|
||||
orte_routed_module_get_wireup_info_fn_t get_wireup_info;
|
||||
/* FT Notification */
|
||||
orte_routed_module_ft_event_fn_t ft_event;
|
||||
};
|
||||
/** Convenience typedef */
|
||||
typedef struct orte_routed_module_t orte_routed_module_t;
|
||||
|
@ -39,6 +39,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
|
||||
static int route_lost(const orte_process_name_t *route);
|
||||
static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf);
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
static int tree_ft_event(int state);
|
||||
#endif
|
||||
|
||||
static orte_process_name_t *lifeline=NULL;
|
||||
|
||||
orte_routed_module_t orte_routed_tree_module = {
|
||||
@ -48,7 +52,12 @@ orte_routed_module_t orte_routed_tree_module = {
|
||||
get_route,
|
||||
init_routes,
|
||||
route_lost,
|
||||
get_wireup_info
|
||||
get_wireup_info,
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
tree_ft_event
|
||||
#else
|
||||
NULL
|
||||
#endif
|
||||
};
|
||||
|
||||
/* local globals */
|
||||
@ -571,3 +580,36 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
static int tree_ft_event(int state)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
/******** Checkpoint Prep ********/
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
}
|
||||
/******** Continue Recovery ********/
|
||||
else if (OPAL_CRS_CONTINUE == state ) {
|
||||
}
|
||||
/******** Restart Recovery ********/
|
||||
else if (OPAL_CRS_RESTART == state ) {
|
||||
/*
|
||||
* Re-exchange the routes
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else if (OPAL_CRS_TERM == state ) {
|
||||
/* Nothing */
|
||||
}
|
||||
else {
|
||||
/* Error state = Nothing */
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
#endif
|
||||
|
@ -45,6 +45,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
|
||||
static int route_lost(const orte_process_name_t *route);
|
||||
static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf);
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
static int unity_ft_event(int state);
|
||||
#endif
|
||||
|
||||
static orte_process_name_t *lifeline=NULL;
|
||||
|
||||
orte_routed_module_t orte_routed_unity_module = {
|
||||
@ -54,7 +58,12 @@ orte_routed_module_t orte_routed_unity_module = {
|
||||
get_route,
|
||||
init_routes,
|
||||
route_lost,
|
||||
get_wireup_info
|
||||
get_wireup_info,
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
unity_ft_event
|
||||
#else
|
||||
NULL
|
||||
#endif
|
||||
};
|
||||
|
||||
static int init(void)
|
||||
@ -66,6 +75,8 @@ static int init(void)
|
||||
OBJ_CONSTRUCT(&peer_list, opal_hash_table_t);
|
||||
opal_hash_table_init(&peer_list, 128);
|
||||
|
||||
lifeline = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -107,6 +118,8 @@ static int finalize(void)
|
||||
/* cleanup the global condition */
|
||||
OBJ_DESTRUCT(&cond);
|
||||
OBJ_DESTRUCT(&lock);
|
||||
|
||||
lifeline = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -576,3 +589,36 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
static int unity_ft_event(int state)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
/******** Checkpoint Prep ********/
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
}
|
||||
/******** Continue Recovery ********/
|
||||
else if (OPAL_CRS_CONTINUE == state ) {
|
||||
}
|
||||
/******** Restart Recovery ********/
|
||||
else if (OPAL_CRS_RESTART == state ) {
|
||||
/*
|
||||
* Re-exchange the routes
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else if (OPAL_CRS_TERM == state ) {
|
||||
/* Nothing */
|
||||
}
|
||||
else {
|
||||
/* Error state = Nothing */
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
#endif
|
||||
|
@ -113,6 +113,7 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
|
||||
ORTE_DECLSPEC int orte_snapc_base_module_finalize(void);
|
||||
ORTE_DECLSPEC int orte_snapc_base_none_setup_job(orte_jobid_t jobid);
|
||||
ORTE_DECLSPEC int orte_snapc_base_none_release_job(orte_jobid_t jobid);
|
||||
ORTE_DECLSPEC int orte_snapc_base_none_ft_event(int state);
|
||||
|
||||
ORTE_DECLSPEC extern int orte_snapc_base_output;
|
||||
ORTE_DECLSPEC extern opal_list_t orte_snapc_base_components_available;
|
||||
|
@ -203,6 +203,11 @@ int orte_snapc_base_none_release_job(orte_jobid_t jobid)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_snapc_base_none_ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/********************
|
||||
* Local Functions
|
||||
********************/
|
||||
|
@ -66,7 +66,8 @@ static orte_snapc_base_module_t none_module = {
|
||||
/** Finalization Function */
|
||||
orte_snapc_base_module_finalize,
|
||||
orte_snapc_base_none_setup_job,
|
||||
orte_snapc_base_none_release_job
|
||||
orte_snapc_base_none_release_job,
|
||||
orte_snapc_base_none_ft_event
|
||||
};
|
||||
|
||||
int orte_snapc_base_select(bool seed, bool app)
|
||||
|
@ -47,7 +47,8 @@ static orte_snapc_base_module_t loc_module = {
|
||||
/** Finalization Function */
|
||||
orte_snapc_full_module_finalize,
|
||||
orte_snapc_full_setup_job,
|
||||
orte_snapc_full_release_job
|
||||
orte_snapc_full_release_job,
|
||||
orte_snapc_base_none_ft_event
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -200,6 +200,17 @@ typedef int (*orte_snapc_base_setup_job_fn_t)
|
||||
typedef int (*orte_snapc_base_release_job_fn_t)
|
||||
(orte_jobid_t jobid);
|
||||
|
||||
|
||||
/**
|
||||
* Handle fault tolerance updates
|
||||
*
|
||||
* @param[in] state Fault tolerance state update
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_snapc_base_ft_event_fn_t)(int state);
|
||||
|
||||
/**
|
||||
* Structure for SNAPC v1.0.0 components.
|
||||
*/
|
||||
@ -234,6 +245,8 @@ struct orte_snapc_base_module_1_0_0_t {
|
||||
orte_snapc_base_setup_job_fn_t setup_job;
|
||||
/** Release job */
|
||||
orte_snapc_base_release_job_fn_t release_job;
|
||||
/** Handle any FT Notifications */
|
||||
orte_snapc_base_ft_event_fn_t ft_event;
|
||||
};
|
||||
typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_1_0_0_t;
|
||||
typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_t;
|
||||
|
@ -187,6 +187,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
int i;
|
||||
opal_buffer_t *buffer;
|
||||
char hostname[100];
|
||||
char *tmp_env_var = NULL;
|
||||
|
||||
/* initialize the globals */
|
||||
memset(&orted_globals, 0, sizeof(orted_globals));
|
||||
@ -283,10 +284,13 @@ int orte_daemon(int argc, char *argv[])
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* Mark as a tool program */
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
#endif
|
||||
tmp_env_var = NULL; /* Silence compiler warning */
|
||||
|
||||
/* detach from controlling terminal
|
||||
* otherwise, remain attached so output can get to us
|
||||
|
@ -325,14 +325,6 @@ static int orte_cr_coord_post_restart(void) {
|
||||
/*
|
||||
* Notify the ESS
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_open())) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
if( NULL != orte_ess.ft_event ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_RESTART))) {
|
||||
exit_status = ret;
|
||||
|
@ -25,8 +25,6 @@ SUBDIRS += \
|
||||
tools/orterun \
|
||||
tools/orte-clean \
|
||||
tools/orte-ps \
|
||||
tools/orte-checkpoint \
|
||||
tools/orte-restart \
|
||||
tools/wrappers
|
||||
|
||||
DIST_SUBDIRS += \
|
||||
|
@ -24,6 +24,16 @@ ompi-checkpoint PID_OF_MPIRUN
|
||||
|
||||
%s
|
||||
|
||||
[usage-no-cr]
|
||||
This build of Open MPI does *not* include Checkpoint/Restart functionality.
|
||||
If you require this functionality re-configure Open MPI with the proper
|
||||
Checkpoint/Restart options.
|
||||
|
||||
ompi-checkpoint PID_OF_MPIRUN
|
||||
Open MPI Checkpoint Tool
|
||||
|
||||
%s
|
||||
|
||||
[invalid_pid]
|
||||
Error: The PID (%d) is invalid because either you have not provided a PID
|
||||
or provided an invalid PID.
|
||||
|
@ -107,6 +107,8 @@ static int global_sequence_num = 0;
|
||||
/*****************************************
|
||||
* Global Vars for Command line Arguments
|
||||
*****************************************/
|
||||
static bool listener_started = false;
|
||||
|
||||
typedef struct {
|
||||
bool help;
|
||||
int pid;
|
||||
@ -267,6 +269,7 @@ static int parse_args(int argc, char *argv[]) {
|
||||
int i, ret, len, exit_status = ORTE_SUCCESS ;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char **app_env = NULL, **global_env = NULL;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/* Init structure */
|
||||
memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t));
|
||||
@ -301,9 +304,12 @@ static int parse_args(int argc, char *argv[]) {
|
||||
putenv(global_env[i]);
|
||||
}
|
||||
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
@ -311,6 +317,19 @@ static int parse_args(int argc, char *argv[]) {
|
||||
/* get the remaining bits */
|
||||
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
|
||||
|
||||
#if OPAL_ENABLE_FT == 0
|
||||
/* Warn and exit if not configured with Checkpoint/Restart */
|
||||
{
|
||||
char *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
opal_show_help("help-orte-checkpoint.txt", "usage-no-cr",
|
||||
true, args);
|
||||
free(args);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (OPAL_SUCCESS != ret ||
|
||||
orte_checkpoint_globals.help ||
|
||||
(0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp)) {
|
||||
@ -322,7 +341,7 @@ static int parse_args(int argc, char *argv[]) {
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* If the user did not supply an hnp jobid, then they must
|
||||
* supply the PID of MPIRUN
|
||||
@ -398,6 +417,9 @@ cleanup:
|
||||
|
||||
static int ckpt_init(int argc, char *argv[]) {
|
||||
int exit_status = ORTE_SUCCESS, ret;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
listener_started = false;
|
||||
|
||||
/*
|
||||
* Make sure to init util before parse_args
|
||||
@ -432,9 +454,12 @@ static int ckpt_init(int argc, char *argv[]) {
|
||||
opal_cr_set_enabled(false);
|
||||
|
||||
/* Select the none component, since we don't actually use a checkpointer */
|
||||
opal_setenv(mca_base_param_env_var("crs"),
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/***************************
|
||||
* We need all of OPAL and the TOOLS portion of ORTE - this
|
||||
@ -488,6 +513,8 @@ static int start_listener(void)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
listener_started = true;
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
@ -496,12 +523,18 @@ static int stop_listener(void)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
if( !listener_started ) {
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.recv_cancel(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_CKPT))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
listener_started = false;
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
@ -597,18 +630,6 @@ static void process_ckpt_update_cmd(orte_process_name_t* sender,
|
||||
pretty_print_status();
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Otherwise only display it if we are going to be terminated soon
|
||||
*/
|
||||
else {
|
||||
/* Since ORTE kills us before we get the Finished message,
|
||||
* print out the global snapshot handle when we start running
|
||||
*/ /* JJH */
|
||||
if(orte_checkpoint_globals.term &&
|
||||
ORTE_SNAPC_CKPT_STATE_RUNNING == orte_checkpoint_globals.ckpt_status ) {
|
||||
pretty_print_status();
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return;
|
||||
|
@ -162,6 +162,7 @@ static int parse_args(int argc, char *argv[]) {
|
||||
int ret;
|
||||
opal_cmd_line_t cmd_line;
|
||||
orte_clean_globals_t tmp = { false, false };
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/* Parse the command line options */
|
||||
|
||||
@ -176,8 +177,11 @@ static int parse_args(int argc, char *argv[]) {
|
||||
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
||||
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
|
||||
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1", true, NULL);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
@ -199,6 +203,7 @@ static int parse_args(int argc, char *argv[]) {
|
||||
|
||||
static int orte_clean_init(void) {
|
||||
int exit_status = ORTE_SUCCESS, ret;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* Disable the checkpoint notification routine for this
|
||||
@ -208,10 +213,13 @@ static int orte_clean_init(void) {
|
||||
opal_cr_set_enabled(false);
|
||||
|
||||
/* Select the none component, since we don't actually use a checkpointer */
|
||||
opal_setenv(mca_base_param_env_var("crs"),
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
#endif
|
||||
tmp_env_var = NULL; /* Silence compiler warning */
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL_WITH_NAME))) {
|
||||
exit_status = ret;
|
||||
|
@ -326,6 +326,7 @@ static int parse_args(int argc, char *argv[]) {
|
||||
|
||||
static int orte_ps_init(int argc, char *argv[]) {
|
||||
int ret;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Make sure to init util before parse_args
|
||||
@ -361,14 +362,21 @@ static int orte_ps_init(int argc, char *argv[]) {
|
||||
opal_cr_set_enabled(false);
|
||||
|
||||
/* Select the none component, since we don't actually use a checkpointer */
|
||||
opal_setenv(mca_base_param_env_var("crs"),
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
#endif
|
||||
|
||||
tmp_env_var = NULL; /* Silence compiler warning */
|
||||
|
||||
/***************************
|
||||
* We need all of OPAL and the TOOL portion of ORTE
|
||||
***************************/
|
||||
|
@ -24,6 +24,16 @@ ompi-restart GLOBAL_SNAPSHOT_REF
|
||||
|
||||
%s
|
||||
|
||||
[usage-no-cr]
|
||||
This build of Open MPI does *not* include Checkpoint/Restart functionality.
|
||||
If you require this functionality, re-configure Open MPI with the proper
|
||||
Checkpoint/Restart options.
|
||||
|
||||
ompi-restart GLOBAL_SNAPSHOT_REF
|
||||
Open MPI Parallel Job Restart Tool
|
||||
|
||||
%s
|
||||
|
||||
[invalid_filename]
|
||||
Error: The filename (%s) is invalid because either you have not provided a filename
|
||||
or provided an invalid filename.
|
||||
|
@ -229,6 +229,7 @@ main(int argc, char *argv[])
|
||||
|
||||
static int initialize(int argc, char *argv[]) {
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Make sure to init util before parse_args
|
||||
@ -264,9 +265,12 @@ static int initialize(int argc, char *argv[]) {
|
||||
opal_cr_set_enabled(false);
|
||||
|
||||
/* Select the none component, since we don't actually use a checkpointer */
|
||||
opal_setenv(mca_base_param_env_var("crs"),
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Setup any ORTE stuff we might need
|
||||
@ -277,8 +281,15 @@ static int initialize(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
/* Unset these now that we no longer need them */
|
||||
opal_unsetenv(mca_base_param_env_var("crs"), &environ);
|
||||
opal_unsetenv(mca_base_param_env_var("opal_cr_is_tool"), &environ);
|
||||
tmp_env_var = mca_base_param_env_var("crs");
|
||||
opal_unsetenv(tmp_env_var, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_unsetenv(tmp_env_var, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
@ -300,6 +311,7 @@ static int parse_args(int argc, char *argv[])
|
||||
int i, ret, len;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char **app_env = NULL, **global_env = NULL;
|
||||
char * tmp_env_var = NULL;
|
||||
orte_restart_globals_t tmp = { false, /* help */
|
||||
NULL, /* filename */
|
||||
NULL, /* appfile */
|
||||
@ -334,13 +346,29 @@ static int parse_args(int argc, char *argv[])
|
||||
putenv(global_env[i]);
|
||||
}
|
||||
|
||||
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
*/
|
||||
|
||||
#if OPAL_ENABLE_FT == 0
|
||||
/* Warn and exit if not configured with Checkpoint/Restart */
|
||||
{
|
||||
char *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
opal_show_help("help-orte-restart.txt", "usage-no-cr",
|
||||
true, args);
|
||||
free(args);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (OPAL_SUCCESS != ret ||
|
||||
orte_restart_globals.help ||
|
||||
1 >= argc) {
|
||||
|
@ -314,6 +314,7 @@ int orterun(int argc, char *argv[])
|
||||
{
|
||||
int rc;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/* find our basename (the name of the executable) so that we can
|
||||
use it in pretty-print error messages */
|
||||
@ -378,10 +379,13 @@ int orterun(int argc, char *argv[])
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* Disable OPAL CR notifications for this tool */
|
||||
opal_cr_set_enabled(false);
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
#endif
|
||||
tmp_env_var = NULL; /* Silence compiler warning */
|
||||
|
||||
/* Initialize our Open RTE environment
|
||||
* Set the flag telling orte_init that I am NOT a
|
||||
|
@ -159,7 +159,7 @@ int orte_proc_info_finalize(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (NULL != orte_process_info.tmpdir_base) {
|
||||
if (NULL != orte_process_info.tmpdir_base) {
|
||||
free(orte_process_info.tmpdir_base);
|
||||
orte_process_info.tmpdir_base = NULL;
|
||||
}
|
||||
@ -177,7 +177,6 @@ int orte_proc_info_finalize(void)
|
||||
if (NULL != orte_process_info.proc_session_dir) {
|
||||
free(orte_process_info.proc_session_dir);
|
||||
orte_process_info.proc_session_dir = NULL;
|
||||
|
||||
}
|
||||
|
||||
if (NULL != orte_process_info.nodename) {
|
||||
@ -190,12 +189,12 @@ int orte_proc_info_finalize(void)
|
||||
orte_process_info.sock_stdin = NULL;
|
||||
}
|
||||
|
||||
if (NULL != orte_process_info.sock_stdout) {
|
||||
if (NULL != orte_process_info.sock_stdout) {
|
||||
free(orte_process_info.sock_stdout);
|
||||
orte_process_info.sock_stdout = NULL;
|
||||
}
|
||||
|
||||
if (NULL != orte_process_info.sock_stderr) {
|
||||
if (NULL != orte_process_info.sock_stderr) {
|
||||
free(orte_process_info.sock_stderr);
|
||||
orte_process_info.sock_stderr = NULL;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user