From a3b08f5800b15077bd8a2ea58d8f33b4a71ecc08 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 29 Aug 2012 03:11:37 +0000 Subject: [PATCH] Fix a few things relating to comm_spawn that causes new daemons to be launched. Ensure that all new daemons receive a full pidmap. Properly mark the daemon job as "updated" when daemons are added This commit was SVN r27177. --- orte/mca/grpcomm/base/grpcomm_base_receive.c | 10 ++++++---- orte/mca/plm/base/plm_base_launch_support.c | 4 ++++ orte/util/nidmap.c | 17 ++++++++++++++++- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/orte/mca/grpcomm/base/grpcomm_base_receive.c b/orte/mca/grpcomm/base/grpcomm_base_receive.c index 6bda5c3e9d..1504d0ea3c 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_receive.c +++ b/orte/mca/grpcomm/base/grpcomm_base_receive.c @@ -155,9 +155,7 @@ static void coll_id_req(int status, orte_process_name_t* sender, orte_grpcomm_coll_id_t id; opal_buffer_t *relay; int rc; - /* collective - only the HNP ever gets this message, but check - * in case a developer makes a mistake! - */ + id = orte_grpcomm_base_get_coll_id(); OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:receive proc %s requested coll id - returned id %d", @@ -316,7 +314,11 @@ static void daemon_local_recv(int status, orte_process_name_t* sender, * our own local procs, but this could involve a proc * running remotely that we don't know about yet */ - do_progress = false; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s cant find job %s - not progressing collective", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(proc.jobid))); + do_progress = false; } } diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index b2ab6696d4..213eb52a0a 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1269,6 +1269,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) OBJ_DESTRUCT(&nodes); /* mark that the daemons have reported so we can proceed */ daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED; + daemons->updated = false; return ORTE_SUCCESS; } @@ -1430,5 +1431,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) orte_routed.update_routing_plan(); } + /* mark that the daemon job changed */ + daemons->updated = true; + return ORTE_SUCCESS; } diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index f42bf8d49b..01c8f41e7f 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -509,10 +509,25 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update) opal_buffer_t buf; int i, j, rc = ORTE_SUCCESS; orte_job_t *jdata; + bool include_all; /* setup the working buffer */ OBJ_CONSTRUCT(&buf, opal_buffer_t); + /* check the daemon job to see if it has changed - perhaps + * new daemons were added as the result of a comm_spawn + */ + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + /* if it did change, then the pidmap will be going + * to new daemons - so we need to include everything. + * also include everything if we were asked to do so + */ + if (jdata->updated || !update) { + include_all = true; + } else { + include_all = false; + } + for (j=1; j < orte_job_data->size; j++) { /* the job array is no longer left-justified and may * have holes in it as we recover resources at job @@ -532,7 +547,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update) continue; } /* if we want an update version and there is nothing to update, ignore it */ - if (update && !jdata->updated) { + if (!include_all && !jdata->updated) { continue; } /* flag that we included it so we don't do so again */