1
1

Fix rank-file mapper launch by correctly setting up the remote map from the provided data

Put a simple protection for the case where procs fail while we are trying to deregister handlers

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-06-15 08:29:45 -07:00
родитель 7d07659367
Коммит 8f09929469
2 изменённых файлов: 28 добавлений и 5 удалений

Просмотреть файл

@ -219,12 +219,14 @@ static void _event_hdlr(int sd, short args, void *cbdata)
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s _EVENT_HDLR CALLING EVHDLR",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
if (NULL != event->handler) {
event->handler(cd->status, &cd->pname,
cd->info, &cd->results,
return_local_event_hdlr, (void*)cd);
return;
}
}
}
/* if we didn't find a match, we still have to call their final callback */
if (NULL != cd->pmixcbfunc) {
cd->pmixcbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cd->cbdata);

Просмотреть файл

@ -279,6 +279,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
int rc;
orte_std_cntr_t cnt;
orte_job_t *jdata=NULL, *daemons;
orte_node_t *node;
int32_t n, k;
opal_buffer_t *bptr;
orte_proc_t *pptr, *dmn;
@ -436,7 +437,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
/* not ready for use yet */
continue;
}
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
if (!ORTE_PROC_IS_HNP &&
orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
/* the parser will have already made the connection, but the fully described
* case won't have done it, so connect the proc to its node here */
opal_output_verbose(5, orte_odls_base_framework.framework_output,
@ -457,6 +459,17 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
}
OBJ_RETAIN(dmn->node);
pptr->node = dmn->node;
/* add the node to the job map, if needed */
if (!ORTE_FLAG_TEST(pptr->node, ORTE_NODE_FLAG_MAPPED)) {
OBJ_RETAIN(pptr->node);
opal_pointer_array_add(jdata->map->nodes, pptr->node);
jdata->map->num_nodes++;
ORTE_FLAG_SET(pptr->node, ORTE_NODE_FLAG_MAPPED);
}
/* add this proc to that node */
OBJ_RETAIN(pptr);
opal_pointer_array_add(pptr->node->procs, pptr);
pptr->node->num_procs++;
}
/* see if it belongs to us */
if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
@ -485,6 +498,14 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
}
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
/* reset the mapped flags */
for (n=0; n < jdata->map->nodes->size; n++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
}
}
}
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
/* compute and save bindings of local children */