1
1
{{{
 svn merge -r 18218:18240 https://svn.open-mpi.org/svn/ompi/tmp/jjh-scratch .
}}}

Contains:
 * Primarily a fix for a user reported problem where a cached file descriptor is causing a SIGPIPE on restart.
 * Clean up some small memory leaks from using mca_base_param_env_var() - Thanks Jeff
 * Clean up ORTE FT tool compilation in non-FT builds - Thanks Tim P.
 * Clean up MPI interface with misplaced {{{OPAL_CR_ENTER_LIBRARY}}} - Thanks Terry
 * Some other sundry cleanup items, all dealing with C/R functionality in the trunk.

This commit was SVN r18241.
Этот коммит содержится в:
Josh Hursey 2008-04-23 00:17:12 +00:00
родитель 0215474cb8
Коммит cc83d41ad9
32 изменённых файлов: 484 добавлений и 95 удалений

Просмотреть файл

@ -476,15 +476,11 @@ int mca_pml_ob1_ft_event( int state )
/*
* Clean out the modex information since it is invalid now.
* orte_grpcomm.purge_proc_attrs();
* This happens at the ORTE level, so doing it again here will cause
* some issues with socket caching.
*/
opal_output_verbose(10, ompi_cr_output,
"pml:ob1: ft_event(Restart): Restart Modex information");
if (OMPI_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) {
opal_output(0,
"pml:ob1: ft_event(Restart): purge_modex Failed %d",
ret);
return ret;
}
/*
* Refresh the proc structure, and publish our proc info in the modex.

Просмотреть файл

@ -62,9 +62,9 @@ int MPI_Open_port(MPI_Info info, char *port_name)
*/
}
rc = ompi_dpm.open_port(port_name, OMPI_RML_TAG_INVALID);
OPAL_CR_ENTER_LIBRARY();
rc = ompi_dpm.open_port(port_name, OMPI_RML_TAG_INVALID);
OMPI_ERRHANDLER_RETURN(rc, MPI_COMM_WORLD, rc, FUNC_NAME);
}

Просмотреть файл

@ -103,6 +103,7 @@ int main(int argc, char *argv[])
int ret = 0;
opal_cmd_line_t *cmd_line = NULL;
char *rml_uri;
char * tmp_env_var = NULL;
/* init enough of opal to process cmd lines */
if (OPAL_SUCCESS != opal_init_util()) {
@ -163,15 +164,22 @@ int main(int argc, char *argv[])
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a checkpointer */
opal_setenv(mca_base_param_env_var("crs"),
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/* Mark as a tool program */
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
#endif
tmp_env_var = NULL; /* Silence compiler warning */
/* Perform the standard init, but flag that we are a tool
* so that we only open up the communications infrastructure. No
* session directories will be created.

Просмотреть файл

@ -162,8 +162,14 @@ int opal_crs_base_none_prelaunch(int32_t rank,
char ***argv,
char ***env)
{
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
char * tmp_env_var = NULL;
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"0", true, env);
free(tmp_env_var);
tmp_env_var = NULL;
return OPAL_SUCCESS;
}

Просмотреть файл

@ -553,19 +553,31 @@ crs_self_find_function(lt_dlhandle handle, char *prefix, char *suffix){
*/
static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd)
{
char * tmp_env_var = NULL;
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
"crs:self: restart_cmd(%s, ---)", snapshot->cmd_line);
opal_setenv(mca_base_param_env_var("crs"),
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"self",
true, &environ);
opal_setenv(mca_base_param_env_var("crs_self_do_restart"),
free(tmp_env_var);
tmp_env_var = NULL;
tmp_env_var = mca_base_param_env_var("crs_self_do_restart");
opal_setenv(tmp_env_var,
"1",
true, &environ);
opal_setenv(mca_base_param_env_var("crs_self_prefix"),
free(tmp_env_var);
tmp_env_var = NULL;
tmp_env_var = mca_base_param_env_var("crs_self_prefix");
opal_setenv(tmp_env_var,
mca_crs_self_component.prefix,
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/* Instead of adding it to the command line, we should use the environment
* to pass the values. This allows the OPAL application to be braindead

Просмотреть файл

@ -85,6 +85,7 @@ static int cr_notify_response(opal_cr_ckpt_cmd_state_t resp);
static int extract_env_vars(int prev_pid);
static int cr_entry_point_notify_reopen_files(int *prog_read_fd, int *prog_write_fd);
static void opal_cr_entry_point_signal_handler (int signo);
static void opal_cr_sigpipe_debug_signal_handler (int signo);
static opal_cr_coord_callback_fn_t cur_coord_callback = NULL;
static opal_cr_notify_callback_fn_t cur_notify_callback = NULL;
@ -104,6 +105,8 @@ int opal_cr_checkpointing = OPAL_CR_STATUS_NONE;
/* Current checkpoint request channel state */
int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
static bool opal_cr_debug_sigpipe = false;
#if OPAL_ENABLE_FT_THREAD == 1
/*****************
* Threading Functions and Variables
@ -279,6 +282,22 @@ int opal_cr_init(void )
"opal_cr: init: Checkpoint Signal: %d",
opal_cr_entry_point_signal);
mca_base_param_reg_int_name("opal_cr", "debug_sigpipe",
"Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)",
false, false,
0, &val);
opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val);
opal_output_verbose(10, opal_cr_output,
"opal_cr: init: Debug SIGPIPE: %d (%s)",
val, (opal_cr_debug_sigpipe ? "True" : "False"));
if( opal_cr_debug_sigpipe ) {
if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
;
}
}
mca_base_param_reg_string_name("opal_cr", "tmp_dir",
"Temporary directory to place rendezvous files for a checkpoint",
false, false,
@ -778,6 +797,27 @@ static void opal_cr_entry_point_signal_handler (int signo)
opal_cr_checkpoint_request = OPAL_CR_STATUS_REQUESTED;
}
/*
 * Debug-only SIGPIPE handler.
 *
 * If the opal_cr_debug_sigpipe flag is not set, logs that fact and returns.
 * Otherwise logs the signal number and this process's PID, then spins in
 * one-second sleeps while the local 'sleeper' remains 1.
 *
 * NOTE(review): presumably the spin exists so a developer can attach a
 * debugger to the printed PID and clear 'sleeper' to resume -- confirm.
 * NOTE(review): opal_output_verbose() is being called from signal context;
 * whether it is async-signal-safe is not visible here.
 *
 * @param[in] signo  Signal number delivered to the handler.
 */
static void opal_cr_sigpipe_debug_signal_handler (int signo)
{
    /* volatile: forces the loop condition to be re-read on every iteration,
     * so an external write to this variable (e.g. from a debugger) can end
     * the loop even in an optimized build. */
    volatile int sleeper = 1;

    if( !opal_cr_debug_sigpipe ) {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: sigpipe_debug: Debug SIGPIPE Not enabled :(\n");
        return;
    }

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: sigpipe_debug: Debug SIGPIPE [%d]: PID (%d)\n",
                        signo, getpid());

    /* Park here until 'sleeper' is changed from outside. */
    while(sleeper == 1 ) {
        sleep(1);
    }
}
/*
* Respond to an asynchronous checkpoint request
*/

Просмотреть файл

@ -202,6 +202,7 @@ main(int argc, char *argv[])
static int initialize(int argc, char *argv[]) {
int ret, exit_status = OPAL_SUCCESS;
char * tmp_env_var = NULL;
/*
* Make sure to init util before parse_args
@ -242,9 +243,12 @@ static int initialize(int argc, char *argv[]) {
* Select the 'none' CRS component,
* since we don't actually use a checkpointer
*/
opal_setenv(mca_base_param_env_var("crs"),
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/*
* Initialize OPAL
@ -272,6 +276,7 @@ static int parse_args(int argc, char *argv[]) {
int i, ret, len;
opal_cmd_line_t cmd_line;
char **app_env = NULL, **global_env = NULL;
char * tmp_env_var = NULL;
memset(&opal_checkpoint_globals, 0, sizeof(opal_checkpoint_globals_t));
@ -299,9 +304,12 @@ static int parse_args(int argc, char *argv[]) {
putenv(global_env[i]);
}
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/**
* Now start parsing our specific arguments

Просмотреть файл

@ -150,6 +150,7 @@ main(int argc, char *argv[])
int child_pid;
int prev_pid = 0;
opal_crs_base_snapshot_t *snapshot = NULL;
char * tmp_env_var = NULL;
/***************
* Initialize
@ -170,9 +171,12 @@ main(int argc, char *argv[])
}
/* Re-enable the selection of the CRS component, so we can choose the right one */
opal_setenv(mca_base_param_env_var("crs_base_do_not_select"),
tmp_env_var = mca_base_param_env_var("crs_base_do_not_select");
opal_setenv(tmp_env_var,
"0", /* turn on the selection */
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/*
* Make sure we are using the correct checkpointer
@ -190,9 +194,12 @@ main(int argc, char *argv[])
"Restart Expects checkpointer: (%s)",
expected_crs_comp);
opal_setenv(mca_base_param_env_var("crs"),
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
expected_crs_comp,
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/* Select this component or don't continue.
* If the selection of this component fails, then we can't
@ -299,6 +306,7 @@ main(int argc, char *argv[])
static int initialize(int argc, char *argv[])
{
int ret, exit_status = OPAL_SUCCESS;
char * tmp_env_var = NULL;
/*
* Make sure to init util before parse_args
@ -331,9 +339,12 @@ static int initialize(int argc, char *argv[])
* Turn off the selection of the CRS component,
* we need to do that later
*/
opal_setenv(mca_base_param_env_var("crs_base_do_not_select"),
tmp_env_var = mca_base_param_env_var("crs_base_do_not_select");
opal_setenv(tmp_env_var,
"1", /* turn off the selection */
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/*
* Initialize the OPAL layer

126
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -95,12 +95,11 @@ orte_ess_base_module_t orte_ess_env_module = {
#endif
};
static int rte_init(char flags)
{
int ret;
char *error = NULL;
/* Start by getting a unique name from the enviro */
env_set_name();
@ -225,6 +224,14 @@ static int rte_ft_event(int state)
/******** Checkpoint Prep ********/
if(OPAL_CRS_CHECKPOINT == state) {
/*
* Notify SnapC
*/
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify IOF
*/
@ -234,7 +241,15 @@ static int rte_ft_event(int state)
}
/*
* Notify RML & OOB
* Notify Routed
*/
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify RML -> OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
@ -244,13 +259,21 @@ static int rte_ft_event(int state)
/******** Continue Recovery ********/
else if (OPAL_CRS_CONTINUE == state ) {
/*
* Notify RML & OOB
* Notify RML -> OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify Routed
*/
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify IOF
*/
@ -258,16 +281,20 @@ static int rte_ft_event(int state)
exit_status = ret;
goto cleanup;
}
/*
* Notify SnapC
*/
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
}
/******** Restart Recovery ********/
else if (OPAL_CRS_RESTART == state ) {
/*
* Notify RML & OOB
* This should follow the ess init() function
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* - Reset Contact information
@ -276,11 +303,67 @@ static int rte_ft_event(int state)
exit_status = ret;
}
/* Session directory stuff:
* orte_process_info.top_session_dir
* orte_process_info.universe_session_dir
* orte_process_info.job_session_dir
* orte_process_info.proc_session_dir
/*
* Notify RML -> OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Restart the routed framework
* JJH: Lie to the finalize function so it does not try to contact the daemon.
*/
orte_process_info.tool = true;
if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) {
exit_status = ret;
goto cleanup;
}
orte_process_info.tool = false;
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
exit_status = ret;
goto cleanup;
}
/*
* Group Comm - Clean out stale data
*/
orte_grpcomm.finalize();
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) {
exit_status = ret;
goto cleanup;
}
/*
* Restart the PLM - Does nothing at the moment, but included for completeness
*/
if (ORTE_SUCCESS != (ret = orte_plm.finalize())) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_plm.init())) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/*
* RML - Enable communications
*/
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
exit_status = ret;
goto cleanup;
}
/*
* Session directory re-init
*/
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) {
exit_status = ret;
@ -299,10 +382,13 @@ static int rte_ft_event(int state)
exit_status = ret;
}
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
"output-", NULL, NULL);
/*
* Re-enable communication through the RML
* Notify Routed
*/
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
@ -316,13 +402,9 @@ static int rte_ft_event(int state)
}
/*
* Re-exchange the routes
* Notify SnapC
*/
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}

Просмотреть файл

@ -73,8 +73,6 @@ typedef int (*orte_ess_base_module_finalize_fn_t)(void);
typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report);
/**
* Handle fault tolerance updates
*
* Handle fault tolerance updates
*
* @param[in] state Fault tolerance state update

Просмотреть файл

@ -1640,6 +1640,7 @@ int mca_oob_tcp_ft_event(int state) {
#else
int mca_oob_tcp_ft_event(int state) {
int exit_status = ORTE_SUCCESS;
opal_list_item_t *item;
if(OPAL_CRS_CHECKPOINT == state) {
/*
@ -1647,7 +1648,6 @@ int mca_oob_tcp_ft_event(int state) {
*/
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
opal_event_disable();
}
else if(OPAL_CRS_CONTINUE == state) {
/*
@ -1657,6 +1657,30 @@ int mca_oob_tcp_ft_event(int state) {
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
}
else if(OPAL_CRS_RESTART == state) {
/*
* Clean out cached connection information
* Select pieces of finalize/init
*/
for(item = opal_list_remove_first(&mca_oob_tcp_component.tcp_peer_list);
item != NULL;
item = opal_list_remove_first(&mca_oob_tcp_component.tcp_peer_list)) {
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)item;
/* JJH: Use the below command for debugging restarts with invalid sockets
* mca_oob_tcp_peer_dump(peer, "RESTART CLEAN")
*/
MCA_OOB_TCP_PEER_RETURN(peer);
}
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_free);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_names);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peers);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_list);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_list, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peers, opal_hash_table_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_names, opal_hash_table_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_free, opal_free_list_t);
/*
* Resume event processing
*/

Просмотреть файл

@ -76,7 +76,6 @@ static int mca_oob_tcp_peer_send_blocking(mca_oob_tcp_peer_t* peer, int sd, voi
static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user);
static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user);
static void mca_oob_tcp_peer_timer_handler(int sd, short flags, void* user);
static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg);
OBJ_CLASS_INSTANCE(
@ -970,7 +969,7 @@ static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user)
/*
* Routine for debugging to print the connection state and socket options
*/
static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
{
char src[64];
char dst[64];

Просмотреть файл

@ -157,6 +157,8 @@ int mca_oob_tcp_peer_send_ident(mca_oob_tcp_peer_t* peer);
*/
void mca_oob_tcp_peer_dequeue_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg);
void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg);
END_C_DECLS
#endif /* _MCA_OOB_TCP_PEER_H */

Просмотреть файл

@ -182,6 +182,16 @@ typedef int (*orte_routed_module_get_wireup_info_fn_t)(orte_jobid_t job,
opal_buffer_t *buf);
/**
* Handle fault tolerance updates
*
* @param[in] state Fault tolerance state update
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_routed_module_ft_event_fn_t)(int state);
/* ******************************************************************** */
@ -202,6 +212,8 @@ struct orte_routed_module_t {
orte_routed_module_init_routes_fn_t init_routes;
orte_routed_module_route_lost_fn_t route_lost;
orte_routed_module_get_wireup_info_fn_t get_wireup_info;
/* FT Notification */
orte_routed_module_ft_event_fn_t ft_event;
};
/** Convenience typedef */
typedef struct orte_routed_module_t orte_routed_module_t;

Просмотреть файл

@ -39,6 +39,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf);
#if OPAL_ENABLE_FT == 1
static int tree_ft_event(int state);
#endif
static orte_process_name_t *lifeline=NULL;
orte_routed_module_t orte_routed_tree_module = {
@ -48,7 +52,12 @@ orte_routed_module_t orte_routed_tree_module = {
get_route,
init_routes,
route_lost,
get_wireup_info
get_wireup_info,
#if OPAL_ENABLE_FT == 1
tree_ft_event
#else
NULL
#endif
};
/* local globals */
@ -571,3 +580,36 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf)
return ORTE_SUCCESS;
}
#if OPAL_ENABLE_FT == 1
/*
 * Fault tolerance notification for the tree routed module.
 *
 * Only OPAL_CRS_RESTART triggers any work: the routes for this process's
 * own job are re-initialized through orte_routed.init_routes(). Every
 * other state (checkpoint, continue, term, or anything unrecognized) is a
 * no-op.
 *
 * @param[in] state  Fault tolerance state update
 *
 * @retval ORTE_SUCCESS  Nothing to do, or the routes were re-initialized
 * @retval other         The error code returned by init_routes()
 */
static int tree_ft_event(int state)
{
    int rc;

    /* Checkpoint prep, continue, term and error states need no action */
    if (OPAL_CRS_RESTART != state) {
        return ORTE_SUCCESS;
    }

    /* Restart recovery: re-exchange the routes */
    rc = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    return ORTE_SUCCESS;
}
#endif

Просмотреть файл

@ -45,6 +45,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf);
#if OPAL_ENABLE_FT == 1
static int unity_ft_event(int state);
#endif
static orte_process_name_t *lifeline=NULL;
orte_routed_module_t orte_routed_unity_module = {
@ -54,7 +58,12 @@ orte_routed_module_t orte_routed_unity_module = {
get_route,
init_routes,
route_lost,
get_wireup_info
get_wireup_info,
#if OPAL_ENABLE_FT == 1
unity_ft_event
#else
NULL
#endif
};
static int init(void)
@ -66,6 +75,8 @@ static int init(void)
OBJ_CONSTRUCT(&peer_list, opal_hash_table_t);
opal_hash_table_init(&peer_list, 128);
lifeline = NULL;
return ORTE_SUCCESS;
}
@ -107,6 +118,8 @@ static int finalize(void)
/* cleanup the global condition */
OBJ_DESTRUCT(&cond);
OBJ_DESTRUCT(&lock);
lifeline = NULL;
return ORTE_SUCCESS;
}
@ -576,3 +589,36 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf)
return ORTE_SUCCESS;
}
#if OPAL_ENABLE_FT == 1
/*
 * Fault tolerance notification for the unity routed module.
 *
 * Only OPAL_CRS_RESTART triggers any work: the routes for this process's
 * own job are re-initialized through orte_routed.init_routes(). Every
 * other state (checkpoint, continue, term, or anything unrecognized) is a
 * no-op.
 *
 * @param[in] state  Fault tolerance state update
 *
 * @retval ORTE_SUCCESS  Nothing to do, or the routes were re-initialized
 * @retval other         The error code returned by init_routes()
 */
static int unity_ft_event(int state)
{
    int rc;

    /* Checkpoint prep, continue, term and error states need no action */
    if (OPAL_CRS_RESTART != state) {
        return ORTE_SUCCESS;
    }

    /* Restart recovery: re-exchange the routes */
    rc = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    return ORTE_SUCCESS;
}
#endif

Просмотреть файл

@ -113,6 +113,7 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
ORTE_DECLSPEC int orte_snapc_base_module_finalize(void);
ORTE_DECLSPEC int orte_snapc_base_none_setup_job(orte_jobid_t jobid);
ORTE_DECLSPEC int orte_snapc_base_none_release_job(orte_jobid_t jobid);
ORTE_DECLSPEC int orte_snapc_base_none_ft_event(int state);
ORTE_DECLSPEC extern int orte_snapc_base_output;
ORTE_DECLSPEC extern opal_list_t orte_snapc_base_components_available;

Просмотреть файл

@ -203,6 +203,11 @@ int orte_snapc_base_none_release_job(orte_jobid_t jobid)
return ORTE_SUCCESS;
}
/*
 * Fault tolerance notification stub: ignores the state it is given and
 * always reports success.
 *
 * @param[in] state  Fault tolerance state update (unused)
 *
 * @retval ORTE_SUCCESS  Always
 */
int orte_snapc_base_none_ft_event(int state)
{
    (void)state;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/********************
* Local Functions
********************/

Просмотреть файл

@ -66,7 +66,8 @@ static orte_snapc_base_module_t none_module = {
/** Finalization Function */
orte_snapc_base_module_finalize,
orte_snapc_base_none_setup_job,
orte_snapc_base_none_release_job
orte_snapc_base_none_release_job,
orte_snapc_base_none_ft_event
};
int orte_snapc_base_select(bool seed, bool app)

Просмотреть файл

@ -47,7 +47,8 @@ static orte_snapc_base_module_t loc_module = {
/** Finalization Function */
orte_snapc_full_module_finalize,
orte_snapc_full_setup_job,
orte_snapc_full_release_job
orte_snapc_full_release_job,
orte_snapc_base_none_ft_event
};
/*

Просмотреть файл

@ -200,6 +200,17 @@ typedef int (*orte_snapc_base_setup_job_fn_t)
typedef int (*orte_snapc_base_release_job_fn_t)
(orte_jobid_t jobid);
/**
* Handle fault tolerance updates
*
* @param[in] state Fault tolerance state update
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_snapc_base_ft_event_fn_t)(int state);
/**
* Structure for SNAPC v1.0.0 components.
*/
@ -234,6 +245,8 @@ struct orte_snapc_base_module_1_0_0_t {
orte_snapc_base_setup_job_fn_t setup_job;
/** Release job */
orte_snapc_base_release_job_fn_t release_job;
/** Handle any FT Notifications */
orte_snapc_base_ft_event_fn_t ft_event;
};
typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_1_0_0_t;
typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_t;

Просмотреть файл

@ -187,6 +187,7 @@ int orte_daemon(int argc, char *argv[])
int i;
opal_buffer_t *buffer;
char hostname[100];
char *tmp_env_var = NULL;
/* initialize the globals */
memset(&orted_globals, 0, sizeof(orted_globals));
@ -283,10 +284,13 @@ int orte_daemon(int argc, char *argv[])
#if OPAL_ENABLE_FT == 1
/* Mark as a tool program */
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
#endif
tmp_env_var = NULL; /* Silence compiler warning */
/* detach from controlling terminal
* otherwise, remain attached so output can get to us

Просмотреть файл

@ -325,14 +325,6 @@ static int orte_cr_coord_post_restart(void) {
/*
* Notify the ESS
*/
if (ORTE_SUCCESS != (ret = orte_ess_base_open())) {
exit_status = ret;
}
if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
exit_status = ret;
}
if( NULL != orte_ess.ft_event ) {
if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;

Просмотреть файл

@ -25,8 +25,6 @@ SUBDIRS += \
tools/orterun \
tools/orte-clean \
tools/orte-ps \
tools/orte-checkpoint \
tools/orte-restart \
tools/wrappers
DIST_SUBDIRS += \

Просмотреть файл

@ -24,6 +24,16 @@ ompi-checkpoint PID_OF_MPIRUN
%s
[usage-no-cr]
This build of Open MPI does *not* include Checkpoint/Restart functionality.
If you require this functionality re-configure Open MPI with the proper
Checkpoint/Restart options.
ompi-checkpoint PID_OF_MPIRUN
Open MPI Checkpoint Tool
%s
[invalid_pid]
Error: The PID (%d) is invalid because either you have not provided a PID
or provided an invalid PID.

Просмотреть файл

@ -107,6 +107,8 @@ static int global_sequence_num = 0;
/*****************************************
* Global Vars for Command line Arguments
*****************************************/
static bool listener_started = false;
typedef struct {
bool help;
int pid;
@ -267,6 +269,7 @@ static int parse_args(int argc, char *argv[]) {
int i, ret, len, exit_status = ORTE_SUCCESS ;
opal_cmd_line_t cmd_line;
char **app_env = NULL, **global_env = NULL;
char * tmp_env_var = NULL;
/* Init structure */
memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t));
@ -301,9 +304,12 @@ static int parse_args(int argc, char *argv[]) {
putenv(global_env[i]);
}
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/**
* Now start parsing our specific arguments
@ -311,6 +317,19 @@ static int parse_args(int argc, char *argv[]) {
/* get the remaining bits */
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
#if OPAL_ENABLE_FT == 0
/* Warn and exit if not configured with Checkpoint/Restart */
{
char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
opal_show_help("help-orte-checkpoint.txt", "usage-no-cr",
true, args);
free(args);
exit_status = ORTE_ERROR;
goto cleanup;
}
#endif
if (OPAL_SUCCESS != ret ||
orte_checkpoint_globals.help ||
(0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp)) {
@ -322,7 +341,7 @@ static int parse_args(int argc, char *argv[]) {
exit_status = ORTE_ERROR;
goto cleanup;
}
/*
* If the user did not supply an hnp jobid, then they must
* supply the PID of MPIRUN
@ -398,6 +417,9 @@ cleanup:
static int ckpt_init(int argc, char *argv[]) {
int exit_status = ORTE_SUCCESS, ret;
char * tmp_env_var = NULL;
listener_started = false;
/*
* Make sure to init util before parse_args
@ -432,9 +454,12 @@ static int ckpt_init(int argc, char *argv[]) {
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a checkpointer */
opal_setenv(mca_base_param_env_var("crs"),
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/***************************
* We need all of OPAL and the TOOLS portion of ORTE - this
@ -488,6 +513,8 @@ static int start_listener(void)
goto cleanup;
}
listener_started = true;
cleanup:
return exit_status;
}
@ -496,12 +523,18 @@ static int stop_listener(void)
{
int ret, exit_status = ORTE_SUCCESS;
if( !listener_started ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_rml.recv_cancel(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_CKPT))) {
exit_status = ret;
goto cleanup;
}
listener_started = false;
cleanup:
return exit_status;
}
@ -597,18 +630,6 @@ static void process_ckpt_update_cmd(orte_process_name_t* sender,
pretty_print_status();
}
}
/*
* Otherwise only display it if we are going to be terminated soon
*/
else {
/* Since ORTE kills us before we get the Finished message,
* print out the global snapshot handle when we start running
*/ /* JJH */
if(orte_checkpoint_globals.term &&
ORTE_SNAPC_CKPT_STATE_RUNNING == orte_checkpoint_globals.ckpt_status ) {
pretty_print_status();
}
}
cleanup:
return;

Просмотреть файл

@ -162,6 +162,7 @@ static int parse_args(int argc, char *argv[]) {
int ret;
opal_cmd_line_t cmd_line;
orte_clean_globals_t tmp = { false, false };
char * tmp_env_var = NULL;
/* Parse the command line options */
@ -176,8 +177,11 @@ static int parse_args(int argc, char *argv[]) {
opal_cmd_line_create(&cmd_line, cmd_line_opts);
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1", true, NULL);
free(tmp_env_var);
tmp_env_var = NULL;
/**
* Now start parsing our specific arguments
@ -199,6 +203,7 @@ static int parse_args(int argc, char *argv[]) {
static int orte_clean_init(void) {
int exit_status = ORTE_SUCCESS, ret;
char * tmp_env_var = NULL;
#if OPAL_ENABLE_FT == 1
/* Disable the checkpoint notification routine for this
@ -208,10 +213,13 @@ static int orte_clean_init(void) {
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a checkpointer */
opal_setenv(mca_base_param_env_var("crs"),
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
#endif
tmp_env_var = NULL; /* Silence compiler warning */
if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL_WITH_NAME))) {
exit_status = ret;

Просмотреть файл

@ -326,6 +326,7 @@ static int parse_args(int argc, char *argv[]) {
static int orte_ps_init(int argc, char *argv[]) {
int ret;
char * tmp_env_var = NULL;
/*
* Make sure to init util before parse_args
@ -361,14 +362,21 @@ static int orte_ps_init(int argc, char *argv[]) {
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a checkpointer */
opal_setenv(mca_base_param_env_var("crs"),
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
free(tmp_env_var);
tmp_env_var = NULL;
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
#endif
tmp_env_var = NULL; /* Silence compiler warning */
/***************************
* We need all of OPAL and the TOOL portion of ORTE
***************************/

Просмотреть файл

@ -24,6 +24,16 @@ ompi-restart GLOBAL_SNAPSHOT_REF
%s
[usage-no-cr]
This build of Open MPI does *not* include Checkpoint/Restart functionality.
If you require this functionality re-configure Open MPI with the proper
Checkpoint/Restart options.
ompi-restart GLOBAL_SNAPSHOT_REF
Open MPI Parallel Job Restart Tool
%s
[invalid_filename]
Error: The filename (%s) is invalid because either you have not provided a filename
or provided an invalid filename.

Просмотреть файл

@ -229,6 +229,7 @@ main(int argc, char *argv[])
static int initialize(int argc, char *argv[]) {
int ret, exit_status = ORTE_SUCCESS;
char * tmp_env_var = NULL;
/*
* Make sure to init util before parse_args
@ -264,9 +265,12 @@ static int initialize(int argc, char *argv[]) {
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a checkpointer */
opal_setenv(mca_base_param_env_var("crs"),
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/*
* Setup any ORTE stuff we might need
@ -277,8 +281,15 @@ static int initialize(int argc, char *argv[]) {
}
/* Unset these now that we no longer need them */
opal_unsetenv(mca_base_param_env_var("crs"), &environ);
opal_unsetenv(mca_base_param_env_var("opal_cr_is_tool"), &environ);
tmp_env_var = mca_base_param_env_var("crs");
opal_unsetenv(tmp_env_var, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_unsetenv(tmp_env_var, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
cleanup:
return exit_status;
@ -300,6 +311,7 @@ static int parse_args(int argc, char *argv[])
int i, ret, len;
opal_cmd_line_t cmd_line;
char **app_env = NULL, **global_env = NULL;
char * tmp_env_var = NULL;
orte_restart_globals_t tmp = { false, /* help */
NULL, /* filename */
NULL, /* appfile */
@ -334,13 +346,29 @@ static int parse_args(int argc, char *argv[])
putenv(global_env[i]);
}
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/**
* Now start parsing our specific arguments
*/
#if OPAL_ENABLE_FT == 0
/* Warn and exit if not configured with Checkpoint/Restart */
{
char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
opal_show_help("help-orte-restart.txt", "usage-no-cr",
true, args);
free(args);
return ORTE_ERROR;
}
#endif
if (OPAL_SUCCESS != ret ||
orte_restart_globals.help ||
1 >= argc) {

Просмотреть файл

@ -314,6 +314,7 @@ int orterun(int argc, char *argv[])
{
int rc;
opal_cmd_line_t cmd_line;
char * tmp_env_var = NULL;
/* find our basename (the name of the executable) so that we can
use it in pretty-print error messages */
@ -378,10 +379,13 @@ int orterun(int argc, char *argv[])
#if OPAL_ENABLE_FT == 1
/* Disable OPAL CR notifications for this tool */
opal_cr_set_enabled(false);
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
#endif
tmp_env_var = NULL; /* Silence compiler warning */
/* Initialize our Open RTE environment
* Set the flag telling orte_init that I am NOT a

Просмотреть файл

@ -159,7 +159,7 @@ int orte_proc_info_finalize(void)
return ORTE_SUCCESS;
}
if (NULL != orte_process_info.tmpdir_base) {
if (NULL != orte_process_info.tmpdir_base) {
free(orte_process_info.tmpdir_base);
orte_process_info.tmpdir_base = NULL;
}
@ -177,7 +177,6 @@ int orte_proc_info_finalize(void)
if (NULL != orte_process_info.proc_session_dir) {
free(orte_process_info.proc_session_dir);
orte_process_info.proc_session_dir = NULL;
}
if (NULL != orte_process_info.nodename) {
@ -190,12 +189,12 @@ int orte_proc_info_finalize(void)
orte_process_info.sock_stdin = NULL;
}
if (NULL != orte_process_info.sock_stdout) {
if (NULL != orte_process_info.sock_stdout) {
free(orte_process_info.sock_stdout);
orte_process_info.sock_stdout = NULL;
}
if (NULL != orte_process_info.sock_stderr) {
if (NULL != orte_process_info.sock_stderr) {
free(orte_process_info.sock_stderr);
orte_process_info.sock_stderr = NULL;
}