1
1

moved code to job setup first before enabling comm

Этот коммит содержится в:
annu13 2015-12-21 18:05:07 -08:00
родитель 607d7c7545
Коммит 43f44f31c1

Просмотреть файл

@ -300,8 +300,79 @@ int orte_ess_base_orted_setup(char **hosts)
}
}
}
/* setup the global job and node arrays */
orte_job_data = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
1,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
1))) {
ORTE_ERROR_LOG(ret);
error = "setup job array";
goto error;
}
orte_node_pool = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
ORTE_ERROR_LOG(ret);
error = "setup node array";
goto error;
}
orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
ORTE_ERROR_LOG(ret);
error = "setup node topologies array";
goto error;
}
/* Setup the job data object for the daemons */
/* create and store the job data object */
jdata = OBJ_NEW(orte_job_t);
jdata->jobid = ORTE_PROC_MY_NAME->jobid;
opal_pointer_array_set_item(orte_job_data, 0, jdata);
/* every job requires at least one app */
app = OBJ_NEW(orte_app_context_t);
opal_pointer_array_set_item(jdata->apps, 0, app);
jdata->num_apps++;
/* create and store a node object where we are */
node = OBJ_NEW(orte_node_t);
node->name = strdup(orte_process_info.nodename);
node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
/* point our topology to the one detected locally */
node->topology = opal_hwloc_topology;
/* create and store a proc object for us */
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
proc->pid = orte_process_info.pid;
proc->rml_uri = orte_rml.get_contact_info();
proc->state = ORTE_PROC_STATE_RUNNING;
opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
/* record that the daemon (i.e., us) is on this node
* NOTE: we do not add the proc object to the node's
* proc array because we are not an application proc.
* Instead, we record it in the daemon field of the
* node object
*/
OBJ_RETAIN(proc); /* keep accounting straight */
node->daemon = proc;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
node->state = ORTE_NODE_STATE_UP;
/* now point our proc node field to the node */
OBJ_RETAIN(node); /* keep accounting straight */
proc->node = node;
/* record that the daemon job is running */
jdata->num_procs = 1;
jdata->state = ORTE_JOB_STATE_RUNNING;
/* obviously, we have "reported" */
jdata->num_reported = 1;
/* Setup the communication infrastructure */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_open";
goto error;
@ -439,76 +510,6 @@ int orte_ess_base_orted_setup(char **hosts)
goto error;
}
}
/* setup the global job and node arrays */
orte_job_data = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
1,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
1))) {
ORTE_ERROR_LOG(ret);
error = "setup job array";
goto error;
}
orte_node_pool = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
ORTE_ERROR_LOG(ret);
error = "setup node array";
goto error;
}
orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
ORTE_ERROR_LOG(ret);
error = "setup node topologies array";
goto error;
}
/* Setup the job data object for the daemons */
/* create and store the job data object */
jdata = OBJ_NEW(orte_job_t);
jdata->jobid = ORTE_PROC_MY_NAME->jobid;
opal_pointer_array_set_item(orte_job_data, 0, jdata);
/* every job requires at least one app */
app = OBJ_NEW(orte_app_context_t);
opal_pointer_array_set_item(jdata->apps, 0, app);
jdata->num_apps++;
/* create and store a node object where we are */
node = OBJ_NEW(orte_node_t);
node->name = strdup(orte_process_info.nodename);
node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
/* point our topology to the one detected locally */
node->topology = opal_hwloc_topology;
/* create and store a proc object for us */
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
proc->pid = orte_process_info.pid;
proc->rml_uri = orte_rml.get_contact_info();
proc->state = ORTE_PROC_STATE_RUNNING;
opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
/* record that the daemon (i.e., us) is on this node
* NOTE: we do not add the proc object to the node's
* proc array because we are not an application proc.
* Instead, we record it in the daemon field of the
* node object
*/
OBJ_RETAIN(proc); /* keep accounting straight */
node->daemon = proc;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
node->state = ORTE_NODE_STATE_UP;
/* now point our proc node field to the node */
OBJ_RETAIN(node); /* keep accounting straight */
proc->node = node;
/* record that the daemon job is running */
jdata->num_procs = 1;
jdata->state = ORTE_JOB_STATE_RUNNING;
/* obviously, we have "reported" */
jdata->num_reported = 1;
/* setup the PMIx framework - ensure it skips all non-PMIx components,
* but do not override anything we were given */