Fix rank-file mapper launch by correctly setting up the remote map from the provided data
Put a simple protection for the case where procs fail while we are trying to deregister handlers Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
7d07659367
Коммит
8f09929469
@ -219,12 +219,14 @@ static void _event_hdlr(int sd, short args, void *cbdata)
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s _EVENT_HDLR CALLING EVHDLR",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
|
||||
if (NULL != event->handler) {
|
||||
event->handler(cd->status, &cd->pname,
|
||||
cd->info, &cd->results,
|
||||
return_local_event_hdlr, (void*)cd);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if we didn't find a match, we still have to call their final callback */
|
||||
if (NULL != cd->pmixcbfunc) {
|
||||
cd->pmixcbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cd->cbdata);
|
||||
|
@ -279,6 +279,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
int rc;
|
||||
orte_std_cntr_t cnt;
|
||||
orte_job_t *jdata=NULL, *daemons;
|
||||
orte_node_t *node;
|
||||
int32_t n, k;
|
||||
opal_buffer_t *bptr;
|
||||
orte_proc_t *pptr, *dmn;
|
||||
@ -436,7 +437,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
/* not ready for use yet */
|
||||
continue;
|
||||
}
|
||||
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
|
||||
if (!ORTE_PROC_IS_HNP &&
|
||||
orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
|
||||
/* the parser will have already made the connection, but the fully described
|
||||
* case won't have done it, so connect the proc to its node here */
|
||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||
@ -457,6 +459,17 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
}
|
||||
OBJ_RETAIN(dmn->node);
|
||||
pptr->node = dmn->node;
|
||||
/* add the node to the job map, if needed */
|
||||
if (!ORTE_FLAG_TEST(pptr->node, ORTE_NODE_FLAG_MAPPED)) {
|
||||
OBJ_RETAIN(pptr->node);
|
||||
opal_pointer_array_add(jdata->map->nodes, pptr->node);
|
||||
jdata->map->num_nodes++;
|
||||
ORTE_FLAG_SET(pptr->node, ORTE_NODE_FLAG_MAPPED);
|
||||
}
|
||||
/* add this proc to that node */
|
||||
OBJ_RETAIN(pptr);
|
||||
opal_pointer_array_add(pptr->node->procs, pptr);
|
||||
pptr->node->num_procs++;
|
||||
}
|
||||
/* see if it belongs to us */
|
||||
if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
|
||||
@ -485,6 +498,14 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
|
||||
}
|
||||
}
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
|
||||
/* reset the mapped flags */
|
||||
for (n=0; n < jdata->map->nodes->size; n++) {
|
||||
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
|
||||
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
|
||||
/* compute and save bindings of local children */
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user