From a3b08f5800b15077bd8a2ea58d8f33b4a71ecc08 Mon Sep 17 00:00:00 2001
From: Ralph Castain <rhc@open-mpi.org>
Date: Wed, 29 Aug 2012 03:11:37 +0000
Subject: [PATCH] Fix a few things relating to comm_spawn operations that cause
 new daemons to be launched. Ensure that all new daemons receive a full pidmap.
 Properly mark the daemon job as "updated" when daemons are added.

This commit was SVN r27177.
---
 orte/mca/grpcomm/base/grpcomm_base_receive.c | 10 ++++++----
 orte/mca/plm/base/plm_base_launch_support.c  |  4 ++++
 orte/util/nidmap.c                           | 17 ++++++++++++++++-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/orte/mca/grpcomm/base/grpcomm_base_receive.c b/orte/mca/grpcomm/base/grpcomm_base_receive.c
index 6bda5c3e9d..1504d0ea3c 100644
--- a/orte/mca/grpcomm/base/grpcomm_base_receive.c
+++ b/orte/mca/grpcomm/base/grpcomm_base_receive.c
@@ -155,9 +155,7 @@ static void coll_id_req(int status, orte_process_name_t* sender,
     orte_grpcomm_coll_id_t id;
     opal_buffer_t *relay;
     int rc;
-    /* collective - only the HNP ever gets this message, but check
-     * in case a developer makes a mistake!
-     */
+
     id = orte_grpcomm_base_get_coll_id();
     OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                          "%s grpcomm:base:receive proc %s requested coll id - returned id %d",
@@ -316,7 +314,11 @@ static void daemon_local_recv(int status, orte_process_name_t* sender,
              * our own local procs, but this could involve a proc
              * running remotely that we don't know about yet
              */
-            do_progress = false;
+            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
+                                 "%s cant find job %s - not progressing collective",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_JOBID_PRINT(proc.jobid)));
+             do_progress = false;
         }
     }
 
diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c
index b2ab6696d4..213eb52a0a 100644
--- a/orte/mca/plm/base/plm_base_launch_support.c
+++ b/orte/mca/plm/base/plm_base_launch_support.c
@@ -1269,6 +1269,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
         OBJ_DESTRUCT(&nodes);
         /* mark that the daemons have reported so we can proceed */
         daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
+	daemons->updated = false;
         return ORTE_SUCCESS;
     }
 
@@ -1430,5 +1431,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
         orte_routed.update_routing_plan();
     }
 
+    /* mark that the daemon job changed */
+    daemons->updated = true;
+
     return ORTE_SUCCESS;
 }
diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c
index f42bf8d49b..01c8f41e7f 100644
--- a/orte/util/nidmap.c
+++ b/orte/util/nidmap.c
@@ -509,10 +509,25 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
     opal_buffer_t buf;
     int i, j, rc = ORTE_SUCCESS;
     orte_job_t *jdata;
+    bool include_all;
 
     /* setup the working buffer */
     OBJ_CONSTRUCT(&buf, opal_buffer_t);
     
+    /* check the daemon job to see if it has changed - perhaps
+     * new daemons were added as the result of a comm_spawn
+     */
+    jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
+    /* if it did change, then the pidmap will be going
+     * to new daemons - so we need to include everything.
+     * also include everything if we were asked to do so
+     */
+    if (jdata->updated || !update) {
+        include_all = true;
+    } else {
+        include_all = false;
+    }
+
     for (j=1; j < orte_job_data->size; j++) {
         /* the job array is no longer left-justified and may
          * have holes in it as we recover resources at job
@@ -532,7 +547,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
             continue;
         }
         /* if we want an update version and there is nothing to update, ignore it */
-        if (update && !jdata->updated) {
+        if (!include_all && !jdata->updated) {
             continue;
         }
         /* flag that we included it so we don't do so again */