Use dedicated collective ids for the SNAPC init/finalize barriers.

Everything added by this patch is guarded by "#if OPAL_ENABLE_FT_CR == 1"
(the orte_globals.h additions sit directly above an existing #endif).

What the hunks below do:
  * orte_job_t and orte_proc_info_t gain two orte_grpcomm_coll_id_t members,
    snapc_init_barrier and snapc_fini_barrier.
  * orte_plm_base_setup_job() and orte_daemon() obtain the two ids from
    orte_grpcomm_base_get_coll_id(); setup_job also places them in each
    app's environment under the names produced by mca_base_var_env_name()
    for orte_snapc_init_barrier_id / orte_snapc_fini_barrier_id.
  * orte_odls_base_default_construct_child_list() and orte_daemon() set up a
    collective for each id with one participant: the job's jobid with
    ORTE_VPID_WILDCARD.
  * orte_proc_info() registers two internal int MCA variables (preset to -1)
    and copies them into orte_process_info.snapc_{init,fini}_barrier.
  * rte_ft_event(), app_coord_init() and app_coord_finalize() use the
    snapc_* ids where they previously used the peer_* ids.

Review fix folded in:
  * orte/util/proc_info.c: the ".my_hostid" initializer had no trailing
    comma. With FT_CR enabled the next initializer "-1," was parsed as
    "ORTE_VPID_INVALID - 1", shifting the remaining values by one slot and
    leaving snapc_fini_barrier implicitly zero instead of -1. The comma is
    added; hunk line counts are unchanged.

NOTE(review): the first barrier in app_coord_finalize() still uses the
*init* id (now snapc_init_barrier), mirroring the pre-patch
peer_init_barrier - confirm this reuse is intended.
NOTE(review): none of the hunks shown here assign jdata->snapc_*_barrier on
the odls side before orte_grpcomm_base_setup_collective() reads them -
confirm the ids reach that code path.

diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c
index 9b8009914c..b04f9027fa 100644
--- a/orte/mca/ess/env/ess_env_module.c
+++ b/orte/mca/ess/env/ess_env_module.c
@@ -277,7 +277,7 @@ static int rte_ft_event(int state)
     orte_grpcomm_collective_t coll;
 
     OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
-    coll.id = orte_process_info.peer_init_barrier;
+    coll.id = orte_process_info.snapc_init_barrier;
 
     /******** Checkpoint Prep ********/
     if(OPAL_CRS_CHECKPOINT == state) {
diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c
index cdc2874e94..a9db7e3c7a 100644
--- a/orte/mca/odls/base/odls_base_default_fns.c
+++ b/orte/mca/odls/base/odls_base_default_fns.c
@@ -596,7 +596,21 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
     nm->name.jobid = jdata->jobid;
     nm->name.vpid = ORTE_VPID_WILDCARD;
     opal_list_append(&coll->participants, &nm->super);
-    
+
+#if OPAL_ENABLE_FT_CR == 1
+    coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier);
+    nm = OBJ_NEW(orte_namelist_t);
+    nm->name.jobid = jdata->jobid;
+    nm->name.vpid = ORTE_VPID_WILDCARD;
+    opal_list_append(&coll->participants, &nm->super);
+
+    coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier);
+    nm = OBJ_NEW(orte_namelist_t);
+    nm->name.jobid = jdata->jobid;
+    nm->name.vpid = ORTE_VPID_WILDCARD;
+    opal_list_append(&coll->participants, &nm->super);
+#endif
+
     /* progress any pending collectives */
     orte_grpcomm_base_progress_collectives();
 
diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c
index 702f7c475d..3d493a98a9 100644
--- a/orte/mca/plm/base/plm_base_launch_support.c
+++ b/orte/mca/plm/base/plm_base_launch_support.c
@@ -238,6 +238,10 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
     char *modx_par, *modx_val;
     char *bar1_par, *bar1_val;
     char *bar2_par, *bar2_val;
+#if OPAL_ENABLE_FT_CR == 1
+    char *barcr1_par, *barcr1_val;
+    char *barcr2_par, *barcr2_val;
+#endif
 
     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                          "%s plm:base:setup_job",
@@ -283,6 +287,15 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
     (void) mca_base_var_env_name ("orte_peer_fini_barrier_id", &bar2_par);
     asprintf(&bar2_val, "%d", caddy->jdata->peer_fini_barrier);
 
+#if OPAL_ENABLE_FT_CR == 1
+    caddy->jdata->snapc_init_barrier = orte_grpcomm_base_get_coll_id();
+    (void) mca_base_var_env_name("orte_snapc_init_barrier_id", &barcr1_par);
+    asprintf(&barcr1_val, "%d", caddy->jdata->snapc_init_barrier);
+    caddy->jdata->snapc_fini_barrier = orte_grpcomm_base_get_coll_id();
+    (void) mca_base_var_env_name("orte_snapc_fini_barrier_id", &barcr2_par);
+    asprintf(&barcr2_val, "%d", caddy->jdata->snapc_fini_barrier);
+#endif
+
     /* if app recovery is not defined, set apps to defaults */
     for (i=0; i < caddy->jdata->apps->size; i++) {
         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
@@ -295,6 +308,10 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
         opal_setenv(modx_par, modx_val, true, &app->env);
         opal_setenv(bar1_par, bar1_val, true, &app->env);
         opal_setenv(bar2_par, bar2_val, true, &app->env);
+#if OPAL_ENABLE_FT_CR == 1
+        opal_setenv(barcr1_par, barcr1_val, true, &app->env);
+        opal_setenv(barcr2_par, barcr2_val, true, &app->env);
+#endif
     }
     free(modx_par);
     free(modx_val);
@@ -302,6 +319,12 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
     free(bar1_val);
     free(bar2_par);
     free(bar2_val);
+#if OPAL_ENABLE_FT_CR == 1
+    free(barcr1_par);
+    free(barcr1_val);
+    free(barcr2_par);
+    free(barcr2_val);
+#endif
 
     /* set the job state to the next position */
     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE);
diff --git a/orte/mca/snapc/full/snapc_full_app.c b/orte/mca/snapc/full/snapc_full_app.c
index 68bde032fc..dc83987834 100644
--- a/orte/mca/snapc/full/snapc_full_app.c
+++ b/orte/mca/snapc/full/snapc_full_app.c
@@ -155,7 +155,7 @@ int app_coord_init()
     }
 
     coll = OBJ_NEW(orte_grpcomm_collective_t);
-    coll->id = orte_process_info.peer_init_barrier;
+    coll->id = orte_process_info.snapc_init_barrier;
     if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
         ORTE_ERROR_LOG(ret);
         exit_status = ret;
@@ -231,7 +231,7 @@ int app_coord_finalize()
     }
 
     coll = OBJ_NEW(orte_grpcomm_collective_t);
-    coll->id = orte_process_info.peer_init_barrier;
+    coll->id = orte_process_info.snapc_init_barrier;
     if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
         ORTE_ERROR_LOG(ret);
         exit_status = ret;
@@ -309,7 +309,7 @@ int app_coord_finalize()
                              "app) Shutdown Barrier: Waiting on barrier...!"));
     }
 
-    coll->id = orte_process_info.peer_fini_barrier;
+    coll->id = orte_process_info.snapc_fini_barrier;
     if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
         ORTE_ERROR_LOG(ret);
         exit_status = ret;
diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c
index 285d70fb29..9ad0f8dbd7 100644
--- a/orte/orted/orted_main.c
+++ b/orte/orted/orted_main.c
@@ -606,6 +606,22 @@ int orte_daemon(int argc, char *argv[])
         nm->name.vpid = ORTE_VPID_WILDCARD;
         opal_list_append(&coll->participants, &nm->super);
 
+#if OPAL_ENABLE_FT_CR == 1
+        jdata->snapc_init_barrier = orte_grpcomm_base_get_coll_id();
+        coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier);
+        nm = OBJ_NEW(orte_namelist_t);
+        nm->name.jobid = jdata->jobid;
+        nm->name.vpid = ORTE_VPID_WILDCARD;
+        opal_list_append(&coll->participants, &nm->super);
+
+        jdata->snapc_fini_barrier = orte_grpcomm_base_get_coll_id();
+        coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier);
+        nm = OBJ_NEW(orte_namelist_t);
+        nm->name.jobid = jdata->jobid;
+        nm->name.vpid = ORTE_VPID_WILDCARD;
+        opal_list_append(&coll->participants, &nm->super);
+#endif
+
         /* need to setup a pidmap for it */
         if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(&orte_pidmap, false))) {
             ORTE_ERROR_LOG(ret);
diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h
index 1f2cd18a55..679c3f8df8 100644
--- a/orte/runtime/orte_globals.h
+++ b/orte/runtime/orte_globals.h
@@ -462,6 +462,9 @@ typedef struct {
     char *ckpt_snapshot_ref;
     /* snapshot location */
     char *ckpt_snapshot_loc;
+    /* collective ids */
+    orte_grpcomm_coll_id_t snapc_init_barrier;
+    orte_grpcomm_coll_id_t snapc_fini_barrier;
 #endif
 } orte_job_t;
 ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_t);
diff --git a/orte/util/proc_info.c b/orte/util/proc_info.c
index 54fe4d62e1..dc83d21381 100644
--- a/orte/util/proc_info.c
+++ b/orte/util/proc_info.c
@@ -83,6 +83,10 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
     /* .peer_init_barrier = */ -1,
     /* .peer_fini_barrier = */ -1,
-    /* .my_hostid = */ ORTE_VPID_INVALID
+    /* .my_hostid = */ ORTE_VPID_INVALID,
+#if OPAL_ENABLE_FT_CR == 1
+    /* .snapc_init_barrier = */ -1,
+    /* .snapc_fini_barrier = */ -1,
+#endif
 };
 
 static bool init=false;
@@ -90,6 +94,10 @@ static int orte_ess_node_rank;
 static int orte_peer_modex_id;
 static int orte_peer_init_barrier_id;
 static int orte_peer_fini_barrier_id;
+#if OPAL_ENABLE_FT_CR == 1
+static int orte_snapc_init_barrier_id;
+static int orte_snapc_fini_barrier_id;
+#endif
 static char *orte_strip_prefix;
 
 int orte_proc_info(void)
@@ -286,6 +294,26 @@ int orte_proc_info(void)
                                   &orte_peer_fini_barrier_id);
     orte_process_info.peer_fini_barrier = (orte_grpcomm_coll_id_t) orte_peer_fini_barrier_id;
 
+#if OPAL_ENABLE_FT_CR == 1
+    orte_snapc_init_barrier_id = -1;
+    (void) mca_base_var_register ("orte", "orte", NULL, "snapc_init_barrier_id", "SNAPC init barrier collective id",
+                                  MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                  MCA_BASE_VAR_FLAG_INTERNAL,
+                                  OPAL_INFO_LVL_9,
+                                  MCA_BASE_VAR_SCOPE_CONSTANT,
+                                  &orte_snapc_init_barrier_id);
+    orte_process_info.snapc_init_barrier = (orte_grpcomm_coll_id_t) orte_snapc_init_barrier_id;
+
+    orte_snapc_fini_barrier_id = -1;
+    (void) mca_base_var_register ("orte", "orte", NULL, "snapc_fini_barrier_id", "SNAPC finalize barrier collective id",
+                                  MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                  MCA_BASE_VAR_FLAG_INTERNAL,
+                                  OPAL_INFO_LVL_9,
+                                  MCA_BASE_VAR_SCOPE_CONSTANT,
+                                  &orte_snapc_fini_barrier_id);
+    orte_process_info.snapc_fini_barrier = (orte_grpcomm_coll_id_t) orte_snapc_fini_barrier_id;
+#endif
+
     return ORTE_SUCCESS;
 }
 
diff --git a/orte/util/proc_info.h b/orte/util/proc_info.h
index 765ced8dc4..8795a1f113 100644
--- a/orte/util/proc_info.h
+++ b/orte/util/proc_info.h
@@ -130,6 +130,10 @@ struct orte_proc_info_t {
     orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
     orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
     orte_vpid_t my_hostid; /** identifies the local host for a coprocessor */
+#if OPAL_ENABLE_FT_CR == 1
+    orte_grpcomm_coll_id_t snapc_init_barrier; /**< barrier id during init */
+    orte_grpcomm_coll_id_t snapc_fini_barrier; /**< barrier id during finalize */
+#endif
 };
 typedef struct orte_proc_info_t orte_proc_info_t;
 