diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 66f12c372e..31e444f91e 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -431,18 +431,26 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, opal_list_append(&coll->participants, &nm->super); #if OPAL_ENABLE_FT_CR == 1 - if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR, (void**)gidptr, )) { - coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier); - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = jdata->jobid; - nm->name.vpid = ORTE_VPID_WILDCARD; - opal_list_append(&coll->participants, &nm->super); + { + orte_grpcomm_coll_id_t gid, *gidptr; + gidptr = &gid; + if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR, + (void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) { + coll = orte_grpcomm_base_setup_collective(*gidptr); + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = jdata->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&coll->participants, &nm->super); + } - coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier); - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = jdata->jobid; - nm->name.vpid = ORTE_VPID_WILDCARD; - opal_list_append(&coll->participants, &nm->super); + if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR, + (void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) { + coll = orte_grpcomm_base_setup_collective(*gidptr); + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = jdata->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&coll->participants, &nm->super); + } } #endif diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index a22f6733fd..55b52ce31f 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -290,7 +290,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata) #if OPAL_ENABLE_FT_CR == 1 { - orte_grpcomm_collective_id_t id; + orte_grpcomm_coll_id_t id; id = orte_grpcomm_base_get_coll_id(); orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T); (void) mca_base_var_env_name("orte_snapc_init_barrier_id", &barcr1_par); diff --git a/orte/mca/snapc/full/snapc_full_global.c b/orte/mca/snapc/full/snapc_full_global.c index 73d1981292..9da8859a16 100644 --- a/orte/mca/snapc/full/snapc_full_global.c +++ b/orte/mca/snapc/full/snapc_full_global.c @@ -257,7 +257,7 @@ int global_coord_setup_job(orte_jobid_t jobid) { return ORTE_ERR_NOT_FOUND; } - if( ORTE_JOB_CONTROL_RESTART == jdata->controls ) { + if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Restarting Job %s...", ORTE_JOBID_PRINT(jobid))); diff --git a/orte/mca/sstore/stage/sstore_stage_local.c b/orte/mca/sstore/stage/sstore_stage_local.c index 6e2af2600b..1608f9ffef 100644 --- a/orte/mca/sstore/stage/sstore_stage_local.c +++ b/orte/mca/sstore/stage/sstore_stage_local.c @@ -696,26 +696,27 @@ int orte_sstore_stage_local_fetch_app_deps(orte_app_context_t *app) orte_proc_t *child = NULL; int loc_argc = 0; bool skip_xfer = false; + char *sload = NULL; - if( !app->used_on_node || NULL == app->sstore_load ) { + orte_get_attribute(&app->attributes, ORTE_APP_SSTORE_LOAD, (void **)&sload, OPAL_STRING); + + if(!ORTE_FLAG_TEST(app, ORTE_APP_FLAG_USED_ON_NODE) || NULL == sload) { OPAL_OUTPUT_VERBOSE((30, mca_sstore_stage_component.super.output_handle, "sstore:stage:(local): fetch_app_deps(%3d): Not for this daemon (%s, %d, %s)", - app->idx, - (app->used_on_node ? "T" : "F"), - (int)app->num_procs, - app->sstore_load)); + app->idx, (ORTE_FLAG_TEST(app, ORTE_APP_FLAG_USED_ON_NODE) ? "T" : "F"), + (int)app->num_procs, sload)); /* Nothing to do */ goto cleanup; } OPAL_OUTPUT_VERBOSE((10, mca_sstore_stage_component.super.output_handle, "sstore:stage:(local): fetch_app_deps(%3d): %s", - app->idx, app->sstore_load)); + app->idx, sload)); /* * Extract the 'ref:seq' parameter */ - sstore_args = opal_argv_split(app->sstore_load, ':'); + sstore_args = opal_argv_split(sload, ':'); req_snap_loc = strdup(sstore_args[0]); req_snap_global_ref = strdup(sstore_args[1]); req_snap_ref = strdup(sstore_args[2]); diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 5224f60272..83378eb844 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -607,19 +607,26 @@ int orte_daemon(int argc, char *argv[]) opal_list_append(&coll->participants, &nm->super); #if OPAL_ENABLE_FT_CR == 1 - jdata->snapc_init_barrier = orte_grpcomm_base_get_coll_id(); - coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier); - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = jdata->jobid; - nm->name.vpid = ORTE_VPID_WILDCARD; - opal_list_append(&coll->participants, &nm->super); + { + orte_grpcomm_coll_id_t id; + id = orte_grpcomm_base_get_coll_id(); + coll = orte_grpcomm_base_setup_collective(id); + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = jdata->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&coll->participants, &nm->super); + orte_set_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR, + ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T); - jdata->snapc_fini_barrier = orte_grpcomm_base_get_coll_id(); - coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier); - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = jdata->jobid; - nm->name.vpid = ORTE_VPID_WILDCARD; - opal_list_append(&coll->participants, &nm->super); + id = orte_grpcomm_base_get_coll_id(); + coll = orte_grpcomm_base_setup_collective(id); + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = jdata->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&coll->participants, &nm->super); + orte_set_attribute(&jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR, + ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T); + } #endif /* create a string that contains our uri + sysinfo */ diff --git a/orte/tools/orte-ps/orte-ps.c b/orte/tools/orte-ps/orte-ps.c index 095638fd2f..a9a6959e08 100644 --- a/orte/tools/orte-ps/orte-ps.c +++ b/orte/tools/orte-ps/orte-ps.c @@ -538,6 +538,9 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) { orte_jobid_t mask=0x0000ffff; #if OPAL_ENABLE_FT_CR == 1 char * state_str = NULL; + size_t ckpt_state; + char *snap_ref = NULL; + char *snap_loc = NULL; #endif for(i=0; i < num_jobs; i++) { @@ -561,18 +564,17 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) { len_slots = 6; len_vpid_r = (int) strlen("Num Procs"); #if OPAL_ENABLE_FT_CR == 1 - orte_snapc_ckpt_state_str(&state_str, job->ckpt_state); + orte_get_attribute(&job->attributes, ORTE_JOB_CKPT_STATE, (void**)&ckpt_state, OPAL_INT32); + orte_get_attribute(&job->attributes, ORTE_JOB_SNAPSHOT_REF, (void**)&snap_ref, OPAL_STRING); + orte_get_attribute(&job->attributes, ORTE_JOB_SNAPSHOT_LOC, (void**)&snap_loc, OPAL_STRING); + orte_snapc_ckpt_state_str(&state_str, ckpt_state); len_ckpt_s = (int) (strlen(state_str) < strlen("Ckpt State") ? strlen("Ckpt State") : strlen(state_str) ); - len_ckpt_r = (int) (NULL == job->ckpt_snapshot_ref ? strlen("Ckpt Ref") : - (strlen(job->ckpt_snapshot_ref) < strlen("Ckpt Ref") ? - strlen("Ckpt Ref") : - strlen(job->ckpt_snapshot_ref) ) ); - len_ckpt_l = (int) (NULL == job->ckpt_snapshot_loc ? strlen("Ckpt Loc") : - (strlen(job->ckpt_snapshot_loc) < strlen("Ckpt Loc") ? - strlen("Ckpt Loc") : - strlen(job->ckpt_snapshot_loc) ) ); + len_ckpt_r = (int) (NULL == snap_ref ? strlen("Ckpt Ref") : (strlen(snap_ref) < strlen("Ckpt Ref") ? + strlen("Ckpt Ref") : strlen(snap_ref))); + len_ckpt_l = (int) (NULL == snap_loc ? strlen("Ckpt Loc") : (strlen(snap_loc) < strlen("Ckpt Loc") ? + strlen("Ckpt Loc") : strlen(snap_loc))); #else len_ckpt_s = -3; len_ckpt_r = -3; @@ -614,12 +616,8 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) { printf("%*d | ", len_vpid_r, job->num_procs); #if OPAL_ENABLE_FT_CR == 1 printf("%*s | ", len_ckpt_s, state_str); - printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_snapshot_ref ? - "" : - job->ckpt_snapshot_ref) ); - printf("%*s |", len_ckpt_l, (NULL == job->ckpt_snapshot_loc ? - "" : - job->ckpt_snapshot_loc) ); + printf("%*s | ", len_ckpt_r, (NULL == snap_ref ? "" : snap_ref)); + printf("%*s |", len_ckpt_l, (NULL == snap_loc ? "" : snap_loc)); #endif printf("\n"); @@ -648,6 +646,9 @@ static int pretty_print_vpids(orte_job_t *job) { char *o_proc_name; #if OPAL_ENABLE_FT_CR == 1 char *state_str = NULL; + size_t ckpt_state; + char *snap_ref = NULL; + char *snap_loc = NULL; #endif char *nodename; @@ -715,19 +716,20 @@ static int pretty_print_vpids(orte_job_t *job) { if( (int)strlen(orte_proc_state_to_str(vpid->state)) > len_state) len_state = strlen(orte_proc_state_to_str(vpid->state)); - + #if OPAL_ENABLE_FT_CR == 1 - orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state); + orte_get_attribute(&vpid->attributes, ORTE_PROC_CKPT_STATE, (void**)&ckpt_state, OPAL_INT32); + orte_get_attribute(&vpid->attributes, ORTE_PROC_SNAPSHOT_REF, (void**)&snap_ref, OPAL_STRING); + orte_get_attribute(&vpid->attributes, ORTE_PROC_SNAPSHOT_LOC, (void**)&snap_loc, OPAL_STRING); + orte_snapc_ckpt_state_str(&state_str, ckpt_state); if( (int)strlen(state_str) > len_ckpt_s) len_ckpt_s = strlen(state_str); - - if( NULL != vpid->ckpt_snapshot_ref && - (int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r) - len_ckpt_r = strlen(vpid->ckpt_snapshot_ref); - - if( NULL != vpid->ckpt_snapshot_loc && - (int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l) - len_ckpt_l = strlen(vpid->ckpt_snapshot_loc); + + if(NULL != snap_ref && (int)strlen(snap_ref) > len_ckpt_r) + len_ckpt_r = strlen(snap_ref); + + if(NULL != snap_loc && (int)strlen(snap_loc) > len_ckpt_l) + len_ckpt_l = strlen(snap_loc); #endif } @@ -798,12 +800,8 @@ static int pretty_print_vpids(orte_job_t *job) { #if OPAL_ENABLE_FT_CR == 1 printf("%*s | ", len_ckpt_s, state_str); - printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ? - "" : - vpid->ckpt_snapshot_ref)); - printf("%*s |", len_ckpt_l, (NULL == vpid->ckpt_snapshot_loc ? - "" : - vpid->ckpt_snapshot_loc)); + printf("%*s | ", len_ckpt_r, (NULL == snap_ref ? "" : snap_ref)); + printf("%*s |", len_ckpt_l, (NULL == snap_loc ? "" : snap_loc)); #endif printf("\n"); diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 30a35ab9f4..97821373fb 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -1965,10 +1965,9 @@ static int create_app(int argc, char* argv[], } #if OPAL_ENABLE_FT_CR == 1 - if( NULL != orterun_globals.sstore_load ) { - app->sstore_load = strdup(orterun_globals.sstore_load); - } else { - app->sstore_load = NULL; + if(NULL != orterun_globals.sstore_load) { + orte_set_attribute(&app->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, + orterun_globals.sstore_load, OPAL_STRING); } #endif