From b87b69e977cd52a1d91184b545921f20f2e28e93 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 27 Aug 2014 16:16:46 +0000 Subject: [PATCH] Ensure the nodes get added to the job map on the remote nodes, add some debug to grpcomm daemon array construction This commit was SVN r32617. --- orte/mca/grpcomm/base/grpcomm_base_stubs.c | 18 +++++++++++++++- orte/mca/odls/base/odls_base_default_fns.c | 25 ++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/orte/mca/grpcomm/base/grpcomm_base_stubs.c b/orte/mca/grpcomm/base/grpcomm_base_stubs.c index 66e61e2631..6db4ba5b2f 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_stubs.c +++ b/orte/mca/grpcomm/base/grpcomm_base_stubs.c @@ -205,7 +205,6 @@ orte_grpcomm_coll_t* orte_grpcomm_base_get_tracker(orte_grpcomm_signature_t *sig OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output, "%s grpcomm:base:returning existing collective", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return coll; } } @@ -250,6 +249,11 @@ static int create_dmns(orte_grpcomm_signature_t *sig, size_t nds; orte_vpid_t *dns; + OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output, + "%s grpcomm:base:create_dmns called with %s signature", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == sig->signature) ? "NULL" : "NON-NULL")); + /* if NULL == procs, then all daemons are participating */ if (NULL == sig->signature) { *ndmns = orte_process_info.num_procs; @@ -258,6 +262,10 @@ static int create_dmns(orte_grpcomm_signature_t *sig, } if (ORTE_VPID_WILDCARD == sig->signature[0].vpid) { + OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output, + "%s grpcomm:base:create_dmns called for all procs in job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(sig->signature[0].jobid))); /* all daemons hosting this jobid are participating */ if (NULL == (jdata = orte_get_job_data_object(sig->signature[0].jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); @@ -280,6 +288,10 @@ static int create_dmns(orte_grpcomm_signature_t *sig, free(dns); return ORTE_ERROR; } + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, + "%s grpcomm:base:create_dmns adding daemon %s to array", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&node->daemon->name))); dns[nds++] = node->daemon->name.vpid; } } else { @@ -324,6 +336,10 @@ static int create_dmns(orte_grpcomm_signature_t *sig, dns = (orte_vpid_t*)malloc(opal_list_get_size(&ds) * sizeof(orte_vpid_t)); nds = 0; while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&ds))) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, + "%s grpcomm:base:create_dmns adding daemon %s to array", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&nm->name))); dns[nds++] = nm->name.vpid; OBJ_RELEASE(nm); } diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 66fc7519fd..6656316c41 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -189,10 +189,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, int rc; orte_std_cntr_t cnt; orte_job_t *jdata=NULL, *daemons; - int32_t n; + int32_t n, k; orte_proc_t *pptr, *dmn; opal_buffer_t *bptr; orte_app_context_t *app; + bool found; + orte_node_t *node; OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, "%s odls:constructing child list", @@ -312,9 +314,28 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, } OBJ_RETAIN(dmn->node); pptr->node = dmn->node; + /* add proc to node - note that num_procs for the + * node was already correctly unpacked, so don't + * increment it here */ OBJ_RETAIN(pptr); opal_pointer_array_add(dmn->node->procs, pptr); - dmn->node->num_procs++; + + /* add the node to the map, if not already there */ + found = false; + for (k=0; k < jdata->map->nodes->size; k++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, k))) { + continue; + } + if (node->daemon == dmn) { + found = true; + break; + } + } + if (!found) { + OBJ_RETAIN(dmn->node); + opal_pointer_array_add(jdata->map->nodes, dmn->node); + jdata->map->num_nodes++; + } /* see if it belongs to us */ if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {