1
1

Correctly clean up the local children and node map info on remote orteds upon job completion. Ensure that register_nspace only includes procs from that job in the proc map.

Thanks to Ashley Pittman for the report

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2016-12-07 13:53:00 -08:00
родитель 309c967946
Коммит e1aa7939ef
2 изменённых файла: 30 добавлений и 1 удаление

Просмотреть файл

@@ -252,7 +252,7 @@ static void track_procs(int fd, short argc, void *cbdata)
orte_job_t *jdata;
orte_proc_t *pdata, *pptr;
opal_buffer_t *alert;
int rc, i;
int rc, i, j;
orte_plm_cmd_flag_t cmd;
char *rtmod;
@@ -416,6 +416,31 @@ static void track_procs(int fd, short argc, void *cbdata)
}
/* mark that we sent it so we ensure we don't do it again */
orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
/* cleanup the procs as these are gone */
for (i=0; i < orte_local_children->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
continue;
}
/* if this child is part of the job... */
if (pptr->name.jobid == jdata->jobid) {
/* clear the entry in the local children */
opal_pointer_array_set_item(orte_local_children, i, NULL);
/* find it in the node->procs array */
for (j=0; j < pptr->node->procs->size; j++) {
if (NULL == (pdata = (orte_proc_t*)opal_pointer_array_get_item(pptr->node->procs, j))) {
continue;
}
if (pdata == pptr) {
/* remove it */
opal_pointer_array_set_item(pptr->node->procs, j, NULL);
OBJ_RELEASE(pdata); // maintain accounting
break;
}
}
OBJ_RELEASE(pptr); // maintain accounting
}
}
}
}

Просмотреть файл

@@ -332,6 +332,10 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
/* only consider procs from this job */
if (pptr->name.jobid != jdata->jobid) {
continue;
}
/* setup the proc map object */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_PROC_DATA);