From e1aa7939efe635757dbaef18115e8ab6808bd1f5 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 7 Dec 2016 13:53:00 -0800 Subject: [PATCH] Correctly clean up the local children and node map info on remote orteds upon job completion. Ensure that register_nspace only includes procs from that job in the proc map. Thanks to Ashley Pittman for the report. Signed-off-by: Ralph Castain --- orte/mca/state/orted/state_orted.c | 27 +++++++++++++++++++++- orte/orted/pmix/pmix_server_register_fns.c | 4 ++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c index d77d6ec178..214e223cf2 100644 --- a/orte/mca/state/orted/state_orted.c +++ b/orte/mca/state/orted/state_orted.c @@ -252,7 +252,7 @@ static void track_procs(int fd, short argc, void *cbdata) orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; - int rc, i; + int rc, i, j; orte_plm_cmd_flag_t cmd; char *rtmod; @@ -416,6 +416,31 @@ static void track_procs(int fd, short argc, void *cbdata) } /* mark that we sent it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); + /* cleanup the procs as these are gone */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + /* if this child is part of the job... 
*/ + if (pptr->name.jobid == jdata->jobid) { + /* clear the entry in the local children */ + opal_pointer_array_set_item(orte_local_children, i, NULL); + /* find it in the node->procs array */ + for (j=0; j < pptr->node->procs->size; j++) { + if (NULL == (pdata = (orte_proc_t*)opal_pointer_array_get_item(pptr->node->procs, j))) { + continue; + } + if (pdata == pptr) { + /* remove it */ + opal_pointer_array_set_item(pptr->node->procs, j, NULL); + OBJ_RELEASE(pdata); // maintain accounting + break; + } + } + OBJ_RELEASE(pptr); // maintain accounting + } + } + } } diff --git a/orte/orted/pmix/pmix_server_register_fns.c b/orte/orted/pmix/pmix_server_register_fns.c index f53f8feb5c..3d0cf381e0 100644 --- a/orte/orted/pmix/pmix_server_register_fns.c +++ b/orte/orted/pmix/pmix_server_register_fns.c @@ -332,6 +332,10 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } + /* only consider procs from this job */ + if (pptr->name.jobid != jdata->jobid) { + continue; + } /* setup the proc map object */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_PROC_DATA);