1
1

Move all collective IDs into the job attributes and let the job pack/unpack routines take care of them instead of singling them out. Add the corresponding environment variables just prior to forking the children instead of putting them into the launch message itself. Remove a few #if CR guards, as the attributes functionality can handle this condition now.

This commit was SVN r32133.
Этот коммит содержится в:
Ralph Castain 2014-07-03 15:58:13 +00:00
родитель 0a4639308e
Коммит 356e7ea904
6 изменённых файлов: 134 добавлений и 149 удалений

Просмотреть файл

@ -172,20 +172,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc;
}
/* pack the collective ids */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_init_barrier, 1, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_fini_barrier, 1, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
@ -210,6 +196,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_namelist_t *nm;
opal_buffer_t *bptr;
orte_app_context_t *app;
orte_grpcomm_coll_id_t gid, *gidptr;
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:constructing child list",
@ -345,23 +332,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
}
}
/* unpack the collective ids */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc);
return rc;
}
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_init_barrier, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc);
return rc;
}
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_fini_barrier, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* check the procs */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
for (n=0; n < jdata->procs->size; n++) {
@ -412,47 +382,48 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
COMPLETE:
/* create the collectives so the job doesn't stall */
coll = orte_grpcomm_base_setup_collective(jdata->peer_modex);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
coll = orte_grpcomm_base_setup_collective(jdata->peer_init_barrier);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
coll = orte_grpcomm_base_setup_collective(jdata->peer_fini_barrier);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
#if OPAL_ENABLE_FT_CR == 1
{
orte_grpcomm_coll_id_t gid, *gidptr;
gidptr = &gid;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
coll = orte_grpcomm_base_setup_collective(*gidptr);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
coll = orte_grpcomm_base_setup_collective(*gidptr);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
}
gidptr = &gid;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_PEER_MODX_ID,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
coll = orte_grpcomm_base_setup_collective(*gidptr);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_INIT_BAR_ID,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
coll = orte_grpcomm_base_setup_collective(*gidptr);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FINI_BAR_ID,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
coll = orte_grpcomm_base_setup_collective(*gidptr);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
coll = orte_grpcomm_base_setup_collective(*gidptr);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
coll = orte_grpcomm_base_setup_collective(*gidptr);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
}
#endif
/* progress any pending collectives */
orte_grpcomm_base_progress_collectives();
@ -471,7 +442,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
return rc;
}
static int odls_base_default_setup_fork(orte_app_context_t *context,
static int odls_base_default_setup_fork(orte_job_t *jdata,
orte_app_context_t *context,
int32_t num_local_procs,
orte_vpid_t vpid_range,
orte_std_cntr_t total_slots_alloc,
@ -480,6 +452,7 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
{
int i;
char *param, *param2;
orte_grpcomm_coll_id_t gid, *gidptr;
/* setup base environment: copy the current environ and merge
in the app context environ */
@ -489,6 +462,49 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
*environ_copy = opal_argv_copy(orte_launch_environ);
}
/* add any collective id info to the app's environ */
gidptr = &gid;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_PEER_MODX_ID,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
(void) mca_base_var_env_name ("orte_peer_modex_id", &param);
asprintf(&param2, "%d", *gidptr);
opal_setenv(param, param2, true, environ_copy);
free(param);
free(param2);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_INIT_BAR_ID,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
(void) mca_base_var_env_name ("orte_peer_init_barrier_id", &param);
asprintf(&param2, "%d", *gidptr);
opal_setenv(param, param2, true, environ_copy);
free(param);
free(param2);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FINI_BAR_ID,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
(void) mca_base_var_env_name ("orte_peer_fini_barrier_id", &param);
asprintf(&param2, "%d", *gidptr);
opal_setenv(param, param2, true, environ_copy);
free(param);
free(param2);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
(void) mca_base_var_env_name ("orte_snapc_init_barrier_id", &param);
asprintf(&param2, "%d", *gidptr);
opal_setenv(param, param2, true, environ_copy);
free(param);
free(param2);
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR,
(void**)&gidptr, ORTE_GRPCOMM_COLL_ID_T)) {
(void) mca_base_var_env_name ("orte_snapc_fini_barrier_id", &param);
asprintf(&param2, "%d", *gidptr);
opal_setenv(param, param2, true, environ_copy);
free(param);
free(param2);
}
/* special case handling for --prefix: this is somewhat icky,
but at least some users do this. :-\ It is possible that
when using --prefix, the user will also "-x PATH" and/or
@ -1110,7 +1126,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
}
/* setup the environment for this app */
if (ORTE_SUCCESS != (rc = odls_base_default_setup_fork(app,
if (ORTE_SUCCESS != (rc = odls_base_default_setup_fork(jobdat, app,
jobdat->num_local_procs,
jobdat->num_procs,
jobdat->total_slots_alloc,

Просмотреть файл

@ -236,13 +236,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
int i;
orte_app_context_t *app;
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
char *modx_par, *modx_val;
char *bar1_par, *bar1_val;
char *bar2_par, *bar2_val;
#if OPAL_ENABLE_FT_CR == 1
char *barcr1_par, *barcr1_val;
char *barcr2_par, *barcr2_val;
#endif
orte_grpcomm_coll_id_t id;
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:setup_job",
@ -278,28 +272,19 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
}
/* get collective ids for the std MPI operations */
caddy->jdata->peer_modex = orte_grpcomm_base_get_coll_id();
(void) mca_base_var_env_name ("orte_peer_modex_id", &modx_par);
asprintf(&modx_val, "%d", caddy->jdata->peer_modex);
caddy->jdata->peer_init_barrier = orte_grpcomm_base_get_coll_id();
(void) mca_base_var_env_name ("orte_peer_init_barrier_id", &bar1_par);
asprintf(&bar1_val, "%d", caddy->jdata->peer_init_barrier);
caddy->jdata->peer_fini_barrier = orte_grpcomm_base_get_coll_id();
(void) mca_base_var_env_name ("orte_peer_fini_barrier_id", &bar2_par);
asprintf(&bar2_val, "%d", caddy->jdata->peer_fini_barrier);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_PEER_MODX_ID, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_INIT_BAR_ID, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_FINI_BAR_ID, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
#if OPAL_ENABLE_FT_CR == 1
{
orte_grpcomm_coll_id_t id;
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
(void) mca_base_var_env_name("orte_snapc_init_barrier_id", &barcr1_par);
asprintf(&barcr1_val, "%d", id);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
(void) mca_base_var_env_name("orte_snapc_fini_barrier_id", &barcr2_par);
asprintf(&barcr2_val, "%d", id);
}
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
#endif
/* if app recovery is not defined, set apps to defaults */
@ -310,27 +295,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
if (!orte_get_attribute(&app->attributes, ORTE_APP_RECOV_DEF, NULL, OPAL_BOOL)) {
orte_set_attribute(&app->attributes, ORTE_APP_MAX_RESTARTS, ORTE_ATTR_LOCAL, &orte_max_restarts, OPAL_INT32);
}
/* set the envars for the collective ids */
opal_setenv(modx_par, modx_val, true, &app->env);
opal_setenv(bar1_par, bar1_val, true, &app->env);
opal_setenv(bar2_par, bar2_val, true, &app->env);
#if OPAL_ENABLE_FT_CR == 1
opal_setenv(barcr1_par, barcr1_val, true, &app->env);
opal_setenv(barcr2_par, barcr2_val, true, &app->env);
#endif
}
free(modx_par);
free(modx_val);
free(bar1_par);
free(bar1_val);
free(bar2_par);
free(bar2_val);
#if OPAL_ENABLE_FT_CR == 1
free(barcr1_par);
free(barcr1_val);
free(barcr2_par);
free(barcr2_val);
#endif
/* set the job state to the next position */
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE);

Просмотреть файл

@ -531,6 +531,7 @@ int orte_daemon(int argc, char *argv[])
int32_t ljob;
orte_grpcomm_collective_t *coll;
orte_namelist_t *nm;
orte_grpcomm_coll_id_t id;
/* setup the singleton's job */
jdata = OBJ_NEW(orte_job_t);
@ -585,48 +586,46 @@ int orte_daemon(int argc, char *argv[])
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);
/* account for the collectives in its modex/barriers */
jdata->peer_modex = orte_grpcomm_base_get_coll_id();
coll = orte_grpcomm_base_setup_collective(jdata->peer_modex);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&jdata->attributes, ORTE_JOB_PEER_MODX_ID, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
coll = orte_grpcomm_base_setup_collective(id);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
jdata->peer_init_barrier = orte_grpcomm_base_get_coll_id();
coll = orte_grpcomm_base_setup_collective(jdata->peer_init_barrier);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&jdata->attributes, ORTE_JOB_INIT_BAR_ID, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
coll = orte_grpcomm_base_setup_collective(id);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
jdata->peer_fini_barrier = orte_grpcomm_base_get_coll_id();
coll = orte_grpcomm_base_setup_collective(jdata->peer_fini_barrier);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&jdata->attributes, ORTE_JOB_FINI_BAR_ID, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
coll = orte_grpcomm_base_setup_collective(id);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
#if OPAL_ENABLE_FT_CR == 1
{
orte_grpcomm_coll_id_t id;
id = orte_grpcomm_base_get_coll_id();
coll = orte_grpcomm_base_setup_collective(id);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
orte_set_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR,
ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
coll = orte_grpcomm_base_setup_collective(id);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
id = orte_grpcomm_base_get_coll_id();
coll = orte_grpcomm_base_setup_collective(id);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
orte_set_attribute(&jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR,
ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
}
id = orte_grpcomm_base_get_coll_id();
orte_set_attribute(&jdata->attributes, ORTE_JOB_SNAPC_FINI_BAR, ORTE_ATTR_GLOBAL, &id, ORTE_GRPCOMM_COLL_ID_T);
coll = orte_grpcomm_base_setup_collective(id);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = jdata->jobid;
nm->name.vpid = ORTE_VPID_WILDCARD;
opal_list_append(&coll->participants, &nm->super);
#endif
/* create a string that contains our uri + sysinfo */

Просмотреть файл

@ -328,10 +328,6 @@ typedef struct {
* (wildcard), or none (invalid)
*/
orte_vpid_t stdin_target;
/* collective ids */
orte_grpcomm_coll_id_t peer_modex;
orte_grpcomm_coll_id_t peer_init_barrier;
orte_grpcomm_coll_id_t peer_fini_barrier;
/* total slots allocated to this job */
orte_std_cntr_t total_slots_alloc;
/* number of procs in this job */

Просмотреть файл

@ -195,6 +195,12 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key)
return "JOB-FAIL-NOTIFIED";
case ORTE_JOB_TERM_NOTIFIED:
return "JOB-TERM-NOTIFIED";
case ORTE_JOB_PEER_MODX_ID:
return "JOB-PEER-MODX-ID";
case ORTE_JOB_INIT_BAR_ID:
return "JOB-INIT-BAR-ID";
case ORTE_JOB_FINI_BAR_ID:
return "JOB-FINI-BAR-ID";
case ORTE_PROC_NOBARRIER:
return "PROC-NOBARRIER";

Просмотреть файл

@ -115,6 +115,9 @@ typedef uint16_t orte_job_flags_t;
#define ORTE_JOB_GOVERNOR (ORTE_JOB_START_KEY + 27) // string - governor used for nodes in job
#define ORTE_JOB_FAIL_NOTIFIED (ORTE_JOB_START_KEY + 28) // bool - abnormal term of proc within job has been reported
#define ORTE_JOB_TERM_NOTIFIED (ORTE_JOB_START_KEY + 29) // bool - normal term of job has been reported
#define ORTE_JOB_PEER_MODX_ID (ORTE_JOB_START_KEY + 30) // orte_grpcomm_coll_id_t - collective id
#define ORTE_JOB_INIT_BAR_ID (ORTE_JOB_START_KEY + 31) // orte_grpcomm_coll_id_t - collective id
#define ORTE_JOB_FINI_BAR_ID (ORTE_JOB_START_KEY + 32) // orte_grpcomm_coll_id_t - collective id
#define ORTE_JOB_MAX_KEY 300