Fix a few issues relating to a comm_spawn that causes new daemons to be launched. Ensure that all new daemons receive a full pidmap. Properly mark the daemon job as "updated" when daemons are added.

This commit was SVN r27177.
2012-08-29 03:11:37 +00:00 · 2012-08-29 03:11:37 +00:00 · a3b08f5800
--- a/orte/mca/grpcomm/base/grpcomm_base_receive.c
+++ b/orte/mca/grpcomm/base/grpcomm_base_receive.c
@ -155,9 +155,7 @@ static void coll_id_req(int status, orte_process_name_t* sender,
    orte_grpcomm_coll_id_t id;
    opal_buffer_t *relay;
    int rc;
-    /* collective - only the HNP ever gets this message, but check
-     * in case a developer makes a mistake!
-     */
+
    id = orte_grpcomm_base_get_coll_id();
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:receive proc %s requested coll id - returned id %d",
@ -316,7 +314,11 @@ static void daemon_local_recv(int status, orte_process_name_t* sender,
             * our own local procs, but this could involve a proc
             * running remotely that we don't know about yet
             */
-            do_progress = false;
+            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
+                                 "%s cant find job %s - not progressing collective",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_JOBID_PRINT(proc.jobid)));
+             do_progress = false;
        }
    }

--- a/orte/mca/plm/base/plm_base_launch_support.c
+++ b/orte/mca/plm/base/plm_base_launch_support.c
@ -1269,6 +1269,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
        OBJ_DESTRUCT(&nodes);
        /* mark that the daemons have reported so we can proceed */
        daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
+	daemons->updated = false;
        return ORTE_SUCCESS;
    }

@ -1430,5 +1431,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
        orte_routed.update_routing_plan();
    }

+    /* mark that the daemon job changed */
+    daemons->updated = true;
+
    return ORTE_SUCCESS;
 }
--- a/orte/util/nidmap.c
+++ b/orte/util/nidmap.c
@ -509,10 +509,25 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
    opal_buffer_t buf;
    int i, j, rc = ORTE_SUCCESS;
    orte_job_t *jdata;
+    bool include_all;

    /* setup the working buffer */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    
+    /* check the daemon job to see if it has changed - perhaps
+     * new daemons were added as the result of a comm_spawn
+     */
+    jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
+    /* if it did change, then the pidmap will be going
+     * to new daemons - so we need to include everything.
+     * also include everything if we were asked to do so
+     */
+    if (jdata->updated || !update) {
+        include_all = true;
+    } else {
+        include_all = false;
+    }
+
    for (j=1; j < orte_job_data->size; j++) {
        /* the job array is no longer left-justified and may
         * have holes in it as we recover resources at job
@ -532,7 +547,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
            continue;
        }
        /* if we want an update version and there is nothing to update, ignore it */
-        if (update && !jdata->updated) {
+        if (!include_all && !jdata->updated) {
            continue;
        }
        /* flag that we included it so we don't do so again */