1
1

Use unique collective ids for the checkpoint/restart code

This commit was SVN r30552.
Этот коммит содержится в:
Adrian Reber 2014-02-04 14:03:05 +00:00
родитель 5980b7e042
Коммит fde1040d2f
8 изменённых файлов: 93 добавлений и 5 удалений

2
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@@ -277,7 +277,7 @@ static int rte_ft_event(int state)
orte_grpcomm_collective_t coll; orte_grpcomm_collective_t coll;
OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
coll.id = orte_process_info.peer_init_barrier; coll.id = orte_process_info.snapc_init_barrier;
/******** Checkpoint Prep ********/ /******** Checkpoint Prep ********/
if(OPAL_CRS_CHECKPOINT == state) { if(OPAL_CRS_CHECKPOINT == state) {

Просмотреть файл

@@ -597,6 +597,20 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
nm->name.vpid = ORTE_VPID_WILDCARD; nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super); opal_list_append(&coll->participants, &nm->super);
#if OPAL_ENABLE_FT_CR == 1
coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
#endif
/* progress any pending collectives */ /* progress any pending collectives */
orte_grpcomm_base_progress_collectives(); orte_grpcomm_base_progress_collectives();

Просмотреть файл

@@ -238,6 +238,10 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
char *modx_par, *modx_val; char *modx_par, *modx_val;
char *bar1_par, *bar1_val; char *bar1_par, *bar1_val;
char *bar2_par, *bar2_val; char *bar2_par, *bar2_val;
#if OPAL_ENABLE_FT_CR == 1
char *barcr1_par, *barcr1_val;
char *barcr2_par, *barcr2_val;
#endif
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:setup_job", "%s plm:base:setup_job",
@@ -283,6 +287,15 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
(void) mca_base_var_env_name ("orte_peer_fini_barrier_id", &bar2_par); (void) mca_base_var_env_name ("orte_peer_fini_barrier_id", &bar2_par);
asprintf(&bar2_val, "%d", caddy->jdata->peer_fini_barrier); asprintf(&bar2_val, "%d", caddy->jdata->peer_fini_barrier);
#if OPAL_ENABLE_FT_CR == 1
caddy->jdata->snapc_init_barrier = orte_grpcomm_base_get_coll_id();
(void) mca_base_var_env_name("orte_snapc_init_barrier_id", &barcr1_par);
asprintf(&barcr1_val, "%d", caddy->jdata->snapc_init_barrier);
caddy->jdata->snapc_fini_barrier = orte_grpcomm_base_get_coll_id();
(void) mca_base_var_env_name("orte_snapc_fini_barrier_id", &barcr2_par);
asprintf(&barcr2_val, "%d", caddy->jdata->snapc_fini_barrier);
#endif
/* if app recovery is not defined, set apps to defaults */ /* if app recovery is not defined, set apps to defaults */
for (i=0; i < caddy->jdata->apps->size; i++) { for (i=0; i < caddy->jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
@@ -295,6 +308,10 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
opal_setenv(modx_par, modx_val, true, &app->env); opal_setenv(modx_par, modx_val, true, &app->env);
opal_setenv(bar1_par, bar1_val, true, &app->env); opal_setenv(bar1_par, bar1_val, true, &app->env);
opal_setenv(bar2_par, bar2_val, true, &app->env); opal_setenv(bar2_par, bar2_val, true, &app->env);
#if OPAL_ENABLE_FT_CR == 1
opal_setenv(barcr1_par, barcr1_val, true, &app->env);
opal_setenv(barcr2_par, barcr2_val, true, &app->env);
#endif
} }
free(modx_par); free(modx_par);
free(modx_val); free(modx_val);
@@ -302,6 +319,12 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
free(bar1_val); free(bar1_val);
free(bar2_par); free(bar2_par);
free(bar2_val); free(bar2_val);
#if OPAL_ENABLE_FT_CR == 1
free(barcr1_par);
free(barcr1_val);
free(barcr2_par);
free(barcr2_val);
#endif
/* set the job state to the next position */ /* set the job state to the next position */
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE); ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE);

Просмотреть файл

@@ -155,7 +155,7 @@ int app_coord_init()
} }
coll = OBJ_NEW(orte_grpcomm_collective_t); coll = OBJ_NEW(orte_grpcomm_collective_t);
coll->id = orte_process_info.peer_init_barrier; coll->id = orte_process_info.snapc_init_barrier;
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) { if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
@@ -231,7 +231,7 @@ int app_coord_finalize()
} }
coll = OBJ_NEW(orte_grpcomm_collective_t); coll = OBJ_NEW(orte_grpcomm_collective_t);
coll->id = orte_process_info.peer_init_barrier; coll->id = orte_process_info.snapc_init_barrier;
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) { if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
@@ -309,7 +309,7 @@ int app_coord_finalize()
"app) Shutdown Barrier: Waiting on barrier...!")); "app) Shutdown Barrier: Waiting on barrier...!"));
} }
coll->id = orte_process_info.peer_fini_barrier; coll->id = orte_process_info.snapc_fini_barrier;
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) { if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;

Просмотреть файл

@@ -606,6 +606,22 @@ int orte_daemon(int argc, char *argv[])
nm->name.vpid = ORTE_VPID_WILDCARD; nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super); opal_list_append(&coll->participants, &nm->super);
#if OPAL_ENABLE_FT_CR == 1
jdata->snapc_init_barrier = orte_grpcomm_base_get_coll_id();
coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
jdata->snapc_fini_barrier = orte_grpcomm_base_get_coll_id();
coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
#endif
/* need to setup a pidmap for it */ /* need to setup a pidmap for it */
if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(&orte_pidmap, false))) { if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(&orte_pidmap, false))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);

Просмотреть файл

@@ -462,6 +462,9 @@ typedef struct {
char *ckpt_snapshot_ref; char *ckpt_snapshot_ref;
/* snapshot location */ /* snapshot location */
char *ckpt_snapshot_loc; char *ckpt_snapshot_loc;
/* collective ids */
orte_grpcomm_coll_id_t snapc_init_barrier;
orte_grpcomm_coll_id_t snapc_fini_barrier;
#endif #endif
} orte_job_t; } orte_job_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_t); ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_t);

Просмотреть файл

@@ -83,6 +83,10 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .peer_init_barrier = */ -1, /* .peer_init_barrier = */ -1,
/* .peer_fini_barrier = */ -1, /* .peer_fini_barrier = */ -1,
/* .my_hostid = */ ORTE_VPID_INVALID /* .my_hostid = */ ORTE_VPID_INVALID
#if OPAL_ENABLE_FT_CR == 1
/* .snapc_init_barrier = */ -1,
/* .snapc_fini_barrier = */ -1,
#endif
}; };
static bool init=false; static bool init=false;
@@ -90,6 +94,10 @@ static int orte_ess_node_rank;
static int orte_peer_modex_id; static int orte_peer_modex_id;
static int orte_peer_init_barrier_id; static int orte_peer_init_barrier_id;
static int orte_peer_fini_barrier_id; static int orte_peer_fini_barrier_id;
#if OPAL_ENABLE_FT_CR == 1
static int orte_snapc_init_barrier_id;
static int orte_snapc_fini_barrier_id;
#endif
static char *orte_strip_prefix; static char *orte_strip_prefix;
int orte_proc_info(void) int orte_proc_info(void)
@@ -286,6 +294,26 @@ int orte_proc_info(void)
&orte_peer_fini_barrier_id); &orte_peer_fini_barrier_id);
orte_process_info.peer_fini_barrier = (orte_grpcomm_coll_id_t) orte_peer_fini_barrier_id; orte_process_info.peer_fini_barrier = (orte_grpcomm_coll_id_t) orte_peer_fini_barrier_id;
#if OPAL_ENABLE_FT_CR == 1
orte_snapc_init_barrier_id = -1;
(void) mca_base_var_register ("orte", "orte", NULL, "snapc_init_barrier_id", "SNAPC init barrier collective id",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_INTERNAL,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_CONSTANT,
&orte_snapc_init_barrier_id);
orte_process_info.snapc_init_barrier = (orte_grpcomm_coll_id_t) orte_snapc_init_barrier_id;
orte_snapc_fini_barrier_id = -1;
(void) mca_base_var_register ("orte", "orte", NULL, "snapc_fini_barrier_id", "SNAPC finalize barrier collective id",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_INTERNAL,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_CONSTANT,
&orte_snapc_fini_barrier_id);
orte_process_info.snapc_fini_barrier = (orte_grpcomm_coll_id_t) orte_snapc_fini_barrier_id;
#endif
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@@ -130,6 +130,10 @@ struct orte_proc_info_t {
orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */ orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */ orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
orte_vpid_t my_hostid; /** identifies the local host for a coprocessor */ orte_vpid_t my_hostid; /** identifies the local host for a coprocessor */
#if OPAL_ENABLE_FT_CR == 1
orte_grpcomm_coll_id_t snapc_init_barrier; /**< barrier id during init */
orte_grpcomm_coll_id_t snapc_fini_barrier; /**< barrier id during finalize */
#endif
}; };
typedef struct orte_proc_info_t orte_proc_info_t; typedef struct orte_proc_info_t orte_proc_info_t;