1
1

Given the amount of pain singletons cause, one can't help but wonder whether it REALLY was that much trouble for people to type "mpirun -n 1 foo"... sigh.

Get the ordering right so that a singleton can start.

Protect the rmgr copy functions for app_context and app_context_map from NULL fields.

Tell the mapper that it is okay for a parent to have no pre-existing mapping plan when dynamically spawning processes.

This commit was SVN r12257.
Этот коммит содержится в:
Ralph Castain 2006-10-23 15:15:45 +00:00
родитель c5b59829aa
Коммит 7a77ef0ae3
4 изменённых файла: 320 добавлений и 288 удалений

Просмотреть файл

@ -523,10 +523,17 @@ int orte_rmaps_base_get_mapping_plan(orte_jobid_t job, opal_list_t *attr_list)
return rc;
}
/* should only be one value returned here since there is only one
* container/job on the segment - error otherwise
/* It is okay for there to be 0 values returned as this just means a mapping plan
* was not previously stored on the registry
*/
if (1 != num_vals) {
if (0 == num_vals) {
return ORTE_SUCCESS;
}
/* should only be one value returned here since there is only one
* container/job on the segment - error otherwise.
*/
if (1 < num_vals) {
ORTE_ERROR_LOG(ORTE_ERR_GPR_DATA_CORRUPT);
return ORTE_ERR_GPR_DATA_CORRUPT;
}

Просмотреть файл

@ -48,11 +48,15 @@ int orte_rmgr_base_copy_app_context(orte_app_context_t **dest, orte_app_context_
/* copy data into it */
(*dest)->idx = src->idx;
if (NULL != src->app) {
(*dest)->app = strdup(src->app);
}
(*dest)->num_procs = src->num_procs;
(*dest)->argv = opal_argv_copy(src->argv);
(*dest)->env = opal_argv_copy(src->env);
if (NULL != src->cwd) {
(*dest)->cwd = strdup(src->cwd);
}
(*dest)->user_specified_cwd = src->user_specified_cwd;
(*dest)->num_map = src->num_map;
@ -91,7 +95,9 @@ int orte_rmgr_base_copy_app_context_map(orte_app_context_map_t **dest, orte_app_
/* copy data into it */
(*dest)->map_type = src->map_type;
if (NULL != src->map_data) {
(*dest)->map_data = strdup(src->map_data);
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -262,106 +262,6 @@ int orte_init_stage1(bool infrastructure)
/* all done with sds - clean up and call it a day */
orte_sds_base_close();
/* initialize the rml module so it can open its interfaces - this
* is needed so that we can get a uri for ourselves if we are an
* HNP
*/
if (ORTE_SUCCESS != (ret = orte_rml.init())) {
ORTE_ERROR_LOG(ret);
error = "orte_rml.init";
goto error;
}
/* if I'm the seed, set the seed uri to be me! */
if (orte_process_info.seed) {
if (NULL != orte_universe_info.seed_uri) {
free(orte_universe_info.seed_uri);
}
orte_universe_info.seed_uri = orte_rml.get_uri();
/* and make sure that the daemon flag is NOT set so that
* components unique to non-HNP orteds can be selected
*/
orte_process_info.daemon = false;
}
/* setup my session directory */
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
ORTE_ERROR_LOG(ret);
error = "orte_ns.get_jobid_string";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_ns.get_vpid_string(&procid_str, orte_process_info.my_name))) {
ORTE_ERROR_LOG(ret);
error = "orte_ns.get_vpid_string";
goto error;
}
if (orte_debug_flag) {
opal_output(0, "[%lu,%lu,%lu] setting up session dir with",
ORTE_NAME_ARGS(orte_process_info.my_name));
if (NULL != orte_process_info.tmpdir_base) {
opal_output(0, "\ttmpdir %s", orte_process_info.tmpdir_base);
}
opal_output(0, "\tuniverse %s", orte_universe_info.name);
opal_output(0, "\tuser %s", orte_system_info.user);
opal_output(0, "\thost %s", orte_system_info.nodename);
opal_output(0, "\tjobid %s", jobid_str);
opal_output(0, "\tprocid %s", procid_str);
}
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
orte_process_info.tmpdir_base,
orte_system_info.user,
orte_system_info.nodename, NULL,
orte_universe_info.name,
jobid_str, procid_str))) {
if (jobid_str != NULL) free(jobid_str);
if (procid_str != NULL) free(procid_str);
ORTE_ERROR_LOG(ret);
error = "orte_session_dir";
goto error;
}
if (NULL != jobid_str) {
free(jobid_str);
}
if (NULL != procid_str) {
free(procid_str);
}
/* Once the session directory location has been established, set
the opal_output default file location to be in the
proc-specific session directory. */
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
"output-", NULL, NULL);
/* if i'm the seed, get my contact info and write my setup file for others to find */
if (orte_process_info.seed) {
if (NULL != orte_universe_info.seed_uri) {
free(orte_universe_info.seed_uri);
orte_universe_info.seed_uri = NULL;
}
if (NULL == (orte_universe_info.seed_uri = orte_rml.get_uri())) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
error = "orte_rml_get_uri";
ret = ORTE_ERR_NOT_FOUND;
goto error;
}
contact_path = opal_os_path(false, orte_process_info.universe_session_dir,
"universe-setup.txt", NULL);
if (orte_debug_flag) {
opal_output(0, "[%lu,%lu,%lu] contact_file %s",
ORTE_NAME_ARGS(orte_process_info.my_name), contact_path);
}
if (ORTE_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
if (orte_debug_flag) {
opal_output(0, "[%lu,%lu,%lu] couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
}
} else if (orte_debug_flag) {
opal_output(0, "[%lu,%lu,%lu] wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
}
free(contact_path);
}
/*
* Now that we know for certain if we are an HNP and/or a daemon,
* setup the resource management frameworks. This includes opening
@ -469,10 +369,11 @@ int orte_init_stage1(bool infrastructure)
if(orte_process_info.singleton || orte_process_info.seed) {
char *site, *resource;
orte_app_context_t *app;
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid(&my_jobid, orte_process_info.my_name))) {
ORTE_ERROR_LOG(ret);
error = "orte_ns.get_jobid";
error = "orte_ns.get_jobid for singleton/seed";
goto error;
}
@ -484,7 +385,7 @@ int orte_init_stage1(bool infrastructure)
ret = orte_ns.create_cellid(&my_cellid, "unknown", orte_system_info.nodename);
if (ORTE_SUCCESS != ret ) {
ORTE_ERROR_LOG(ret);
error = "orte_ns.create_cellid";
error = "orte_ns.create_cellid for singleton/seed";
goto error;
}
@ -494,13 +395,30 @@ int orte_init_stage1(bool infrastructure)
}
else if (ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
error = "orte_ns.get_cell_info";
error = "orte_ns.get_cell_inf for singleton/seedo";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_ns.get_cellid(&my_cellid, orte_process_info.my_name))) {
ORTE_ERROR_LOG(ret);
error = "orte_ns.get_cellid";
error = "orte_ns.get_cellid for singleton/seed";
goto error;
}
/* set the rest of the infrastructure */
app = OBJ_NEW(orte_app_context_t);
app->app = strdup("unknown");
app->num_procs = 1;
if (ORTE_SUCCESS != (ret = orte_rmgr_base_put_app_context(my_jobid, &app, 1))) {
ORTE_ERROR_LOG(ret);
error = "orte_rmgr_base_put_app_context for singleton/seed";
goto error;
}
OBJ_RELEASE(app);
if (ORTE_SUCCESS != (ret = orte_rmgr.set_vpid_range(my_jobid,0,1))) {
ORTE_ERROR_LOG(ret);
error = "orte_rmgr.set_vpid_range for singleton/seed";
goto error;
}
@ -549,7 +467,7 @@ int orte_init_stage1(bool infrastructure)
new_attr = OBJ_NEW(orte_rds_cell_attr_t);
if (NULL == new_attr) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
error = "OBJ_NEW(orte_rds_cell_attr_t) for ORTE_RDS_NAME";
error = "singleton OBJ_NEW(orte_rds_cell_attr_t) for ORTE_RDS_NAME";
ret = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
@ -557,7 +475,7 @@ int orte_init_stage1(bool infrastructure)
new_attr->keyval.value = OBJ_NEW(orte_data_value_t);
if (NULL == new_attr->keyval.value) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
error = "OBJ_NEW(orte_data_value_t) for ORTE_RDS_NAME";
error = "singleton OBJ_NEW(orte_data_value_t) for ORTE_RDS_NAME";
ret = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
@ -568,7 +486,7 @@ int orte_init_stage1(bool infrastructure)
new_attr = OBJ_NEW(orte_rds_cell_attr_t);
if (NULL == new_attr) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
error = "OBJ_NEW(orte_rds_cell_attr_t) for ORTE_CELLID_KEY";
error = "singleton OBJ_NEW(orte_rds_cell_attr_t) for ORTE_CELLID_KEY";
ret = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
@ -576,14 +494,14 @@ int orte_init_stage1(bool infrastructure)
new_attr->keyval.value = OBJ_NEW(orte_data_value_t);
if (NULL == new_attr->keyval.value) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
error = "OBJ_NEW(orte_data_value_t) for ORTE_CELLID";
error = "singleton OBJ_NEW(orte_data_value_t) for ORTE_CELLID";
ret = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
new_attr->keyval.value->type = ORTE_CELLID;
if (ORTE_SUCCESS != (ret = orte_dss.copy(&(new_attr->keyval.value->data), &(rds_item->cellid), ORTE_CELLID))) {
ORTE_ERROR_LOG(ret);
error = "orte_dss.copy for ORTE_CELLID";
error = "singleton orte_dss.copy for ORTE_CELLID";
goto error;
}
opal_list_append(&(rds_item->attributes), &new_attr->super);
@ -594,7 +512,7 @@ int orte_init_stage1(bool infrastructure)
ret = orte_rds.store_resource(&rds_single_host);
if (ORTE_SUCCESS != ret ) {
ORTE_ERROR_LOG(ret);
error = "orte_rds.store_resource";
error = "singleton orte_rds.store_resource";
goto error;
}
@ -605,44 +523,145 @@ int orte_init_stage1(bool infrastructure)
ret = orte_ras_base_node_insert(&single_host);
if (ORTE_SUCCESS != ret ) {
ORTE_ERROR_LOG(ret);
error = "orte_ras.node_insert";
error = "singleton orte_ras.node_insert";
goto error;;
}
/* JMS: Same as above -- fix this after 1.0: force a
selection so that orte_ras has initialized pointers in
case anywhere else tries to use it. This may end up
putting a bunch more nodes on the node segment (e.g.,
putting a bunch more nodes on the node segment - e.g.,
if you're in a SLURM allocation and you "./a.out",
you'll end up with the localhost *and* all the other
nodes in your allocation on the node segment -- which
is probably fine) */
is probably fine */
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_ras.allocate_job(my_jobid, &attrs);
if (ORTE_SUCCESS != (ret = orte_ras.allocate_job(my_jobid, &attrs))) {
ORTE_ERROR_LOG(ret);
error = "allocate for a singleton";
goto error;
}
OBJ_DESTRUCT(&attrs);
OBJ_DESTRUCT(&single_host);
OBJ_DESTRUCT(&rds_single_host);
}
/* set the rest of the infrastructure */
if (ORTE_SUCCESS != (ret = orte_rmgr_base_set_job_slots(my_jobid,1))) {
ORTE_ERROR_LOG(ret);
error = "orte_rmgr_base_set_job_slots";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_rmgr.set_vpid_range(my_jobid,0,1))) {
ORTE_ERROR_LOG(ret);
error = "orte_rmgr.set_vpid_range";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_rmgr_base_proc_stage_gate_init(my_jobid))) {
ORTE_ERROR_LOG(ret);
error = "orte_rmgr_base_proc_stage_gate_init";
error = "singleton orte_rmgr_base_proc_stage_gate_init";
goto error;
}
/* set our state to LAUNCHED */
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name, ORTE_PROC_STATE_LAUNCHED, 0))) {
ORTE_ERROR_LOG(ret);
error = "singleton could not set launched state";
goto error;
}
}
/* initialize the rml module so it can open its interfaces - this
* is needed so that we can get a uri for ourselves if we are an
* HNP
*/
if (ORTE_SUCCESS != (ret = orte_rml.init())) {
ORTE_ERROR_LOG(ret);
error = "orte_rml.init";
goto error;
}
/* if I'm the seed, set the seed uri to be me! */
if (orte_process_info.seed) {
if (NULL != orte_universe_info.seed_uri) {
free(orte_universe_info.seed_uri);
}
orte_universe_info.seed_uri = orte_rml.get_uri();
/* and make sure that the daemon flag is NOT set so that
* components unique to non-HNP orteds can be selected
*/
orte_process_info.daemon = false;
}
/* setup my session directory */
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
ORTE_ERROR_LOG(ret);
error = "orte_ns.get_jobid_string";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_ns.get_vpid_string(&procid_str, orte_process_info.my_name))) {
ORTE_ERROR_LOG(ret);
error = "orte_ns.get_vpid_string";
goto error;
}
if (orte_debug_flag) {
opal_output(0, "[%lu,%lu,%lu] setting up session dir with",
ORTE_NAME_ARGS(orte_process_info.my_name));
if (NULL != orte_process_info.tmpdir_base) {
opal_output(0, "\ttmpdir %s", orte_process_info.tmpdir_base);
}
opal_output(0, "\tuniverse %s", orte_universe_info.name);
opal_output(0, "\tuser %s", orte_system_info.user);
opal_output(0, "\thost %s", orte_system_info.nodename);
opal_output(0, "\tjobid %s", jobid_str);
opal_output(0, "\tprocid %s", procid_str);
}
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
orte_process_info.tmpdir_base,
orte_system_info.user,
orte_system_info.nodename, NULL,
orte_universe_info.name,
jobid_str, procid_str))) {
if (jobid_str != NULL) free(jobid_str);
if (procid_str != NULL) free(procid_str);
ORTE_ERROR_LOG(ret);
error = "orte_session_dir";
goto error;
}
if (NULL != jobid_str) {
free(jobid_str);
}
if (NULL != procid_str) {
free(procid_str);
}
/* Once the session directory location has been established, set
the opal_output default file location to be in the
proc-specific session directory. */
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
"output-", NULL, NULL);
/* if i'm the seed, get my contact info and write my setup file for others to find */
if (orte_process_info.seed) {
if (NULL != orte_universe_info.seed_uri) {
free(orte_universe_info.seed_uri);
orte_universe_info.seed_uri = NULL;
}
if (NULL == (orte_universe_info.seed_uri = orte_rml.get_uri())) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
error = "orte_rml_get_uri";
ret = ORTE_ERR_NOT_FOUND;
goto error;
}
contact_path = opal_os_path(false, orte_process_info.universe_session_dir,
"universe-setup.txt", NULL);
if (orte_debug_flag) {
opal_output(0, "[%lu,%lu,%lu] contact_file %s",
ORTE_NAME_ARGS(orte_process_info.my_name), contact_path);
}
if (ORTE_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
if (orte_debug_flag) {
opal_output(0, "[%lu,%lu,%lu] couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
}
} else if (orte_debug_flag) {
opal_output(0, "[%lu,%lu,%lu] wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
}
free(contact_path);
}
error:
if (ret != ORTE_SUCCESS) {
opal_show_help("help-orte-runtime",