Modify the accounting system to recycle jobids, and properly recover resources from nodes and jobs upon completion. Adjustments were required in several places to handle the sparsely populated job, node, and proc arrays that result from this change.
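The recurring pattern behind these adjustments is that the global job, node, and proc pointer arrays may now contain NULL holes once entries are recycled, so loops must scan the full allocated size and skip holes instead of stopping at the first NULL. A minimal standalone sketch of that pattern, assuming an illustrative job_t and count_live_jobs rather than the real opal_pointer_array_t accessors:

    #include <stddef.h>

    typedef struct { int jobid; } job_t;   /* illustrative stand-in for orte_job_t */

    /* Scan a sparsely populated pointer array: holes (NULL slots) can appear
     * anywhere once completed jobs are recycled, so every slot up to the
     * allocated size must be checked rather than breaking at the first NULL. */
    static int count_live_jobs(job_t **jobs, int size)
    {
        int i, live = 0;
        for (i = 1; i < size; i++) {   /* slot 0 is reserved for the HNP's own job */
            if (NULL == jobs[i]) {
                continue;              /* a hole, not the end of the array */
            }
            live++;
        }
        return live;
    }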
Correct an error in how jobids were computed: the job family field must not be overrun as we increment jobids for comm_spawn. Update the slurm plm module so it uses the new slurm termination procedure (bringing the trunk back into alignment with the 1.3 branch). Update the slurmd ess component so it does not get selected when we are running a singleton inside of a slurm allocation. Clean up HNP init by moving code that had lived in orte_globals.c for historical reasons into the ess hnp module, and remove the call to that code from ess_base_std_prolog.

NOTE: this change allows ORTE to support an unlimited aggregate number of comm_spawns, with up to 64k jobs alive at any one instant. HOWEVER, the MPI layer currently does -not- support re-use of jobids. I did some prototype coding to revise the ompi_proc_t structures, but the BTLs cache their own data and there was no readily apparent way to update it. Thus, attempts to spawn more than the 64k limit will abort to avoid causing the MPI layer to hang.

This commit was SVN r20700.
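For context on the jobid arithmetic below: the patch treats the 32-bit jobid as a 16-bit job family in the upper half (derived from the HNP name) and a 16-bit local jobid in the lower half, which is where the 64k-alive limit comes from. A rough sketch of that encoding, with construct_local_jobid as a hypothetical stand-in for the real ORTE_CONSTRUCT_LOCAL_JOBID macro and an assumed family value:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in: the upper 16 bits of a jobid carry the job
     * family, the lower 16 bits carry the local jobid. */
    static uint32_t construct_local_jobid(uint32_t family_jobid, uint16_t local)
    {
        return (family_jobid & 0xffff0000u) | local;
    }

    int main(void)
    {
        uint32_t jobfam = 0x1234;                         /* assumed family value for the HNP */
        uint32_t hnp_jobid = 0xffff0000u & (jobfam << 16);
        uint16_t next_jobid = 1;                          /* local jobid 0 belongs to the HNP and its daemons */

        /* each comm_spawn consumes one local jobid until UINT16_MAX is reached */
        uint32_t child = construct_local_jobid(hnp_jobid, next_jobid++);
        printf("HNP jobid 0x%08x, first child jobid 0x%08x\n",
               (unsigned)hnp_jobid, (unsigned)child);
        return 0;
    }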
parent fb1ecb7a45
commit f11931306a
@@ -77,11 +77,11 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
      */
     jobs = (orte_job_t**)orte_job_data->addr;
     for (i=1; i < orte_job_data->size; i++) {
-        /* the array is left justfied, so we can quit once
-         * we see a NULL
+        /* the array may have holes in it as we are recovering
+         * jobids as they complete, so check everything
          */
         if (NULL == jobs[i]) {
-            break;
+            continue;
         }
         if (ORTE_JOB_STATE_ABORTED != jobs[i]->state &&
             ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state &&
@@ -46,17 +46,6 @@ int orte_ess_base_std_prolog(void)
         goto error;
     }
 
-    /* if I'm the HNP, make sure that the daemon flag is NOT set so that
-     * components unique to non-HNP orteds can be selected and init
-     * my basic storage elements
-     */
-    if (orte_process_info.hnp) {
-        if (ORTE_SUCCESS != (ret = orte_hnp_globals_init())) {
-            error = "orte_hnp_globals_init";
-            goto error;
-        }
-    }
-
     /*
      * Internal startup
      */
@@ -327,11 +327,32 @@ static int rte_init(char flags)
     }
     free(contact_path);
 
+    /* setup the global job and node arrays */
+    orte_job_data = OBJ_NEW(opal_pointer_array_t);
+    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
+                                                       1,
+                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
+                                                       1))) {
+        ORTE_ERROR_LOG(ret);
+        error = "setup job array";
+        goto error;
+    }
+
+    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
+    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
+                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
+                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
+                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
+        ORTE_ERROR_LOG(ret);
+        error = "setup node array";
+        goto error;
+    }
+
-    /* Setup the job data object for the daemons */
+    /* create and store the job data object */
     jdata = OBJ_NEW(orte_job_t);
     jdata->jobid = ORTE_PROC_MY_NAME->jobid;
-    opal_pointer_array_add(orte_job_data, jdata);
+    opal_pointer_array_set_item(orte_job_data, 0, jdata);
 
     /* create and store a node object where we are */
     node = OBJ_NEW(orte_node_t);
@@ -465,6 +486,7 @@ static int rte_finalize(void)
 {
     char *contact_path;
     opal_list_item_t *item;
+    int i;
 
     /* remove my contact info file */
     contact_path = opal_os_path(false, orte_process_info.top_session_dir,
@@ -517,6 +539,24 @@ static int rte_finalize(void)
     }
     OBJ_DESTRUCT(&orte_local_jobdata);
 
+    /* cleanup the job and node info arrays */
+    if (NULL != orte_node_pool) {
+        for (i=0; i < orte_node_pool->size; i++) {
+            if (NULL != orte_node_pool->addr[i]) {
+                OBJ_RELEASE(orte_node_pool->addr[i]);
+            }
+        }
+        OBJ_RELEASE(orte_node_pool);
+    }
+    if (NULL != orte_job_data) {
+        for (i=0; i < orte_job_data->size; i++) {
+            if (NULL != orte_job_data->addr[i]) {
+                OBJ_RELEASE(orte_job_data->addr[i]);
+            }
+        }
+        OBJ_RELEASE(orte_job_data);
+    }
+
     /* finalize the session directory tree */
     orte_session_dir_finalize(ORTE_PROC_MY_NAME);
@@ -76,6 +76,7 @@ int orte_ess_slurmd_component_query(mca_base_module_t **module, int *priority)
 
     if (orte_process_info.mpi_proc &&
         NULL != getenv("SLURM_JOBID") &&
         NULL != getenv("SLURM_STEPID") &&
+        NULL == orte_process_info.my_hnp_uri) {
         *priority = 30;
         *module = (mca_base_module_t *)&orte_ess_slurmd_module;
@@ -88,6 +88,9 @@ orte_ess_base_module_t orte_ess_slurmd_module = {
     NULL /* ft_event */
 };
 
+/* Local globals */
+static bool app_init_complete;
+
 /**** MODULE FUNCTIONS ****/
 
 static int rte_init(char flags)
@@ -110,6 +113,9 @@ static int rte_init(char flags)
     int *ppn;
     bool block=false, cyclic=false;
 
+    /* init flag */
+    app_init_complete = false;
+
     /* run the prolog */
     if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
         error = "orte_ess_base_std_prolog";
@@ -331,6 +337,9 @@ static int rte_init(char flags)
         goto error;
     }
 
+    /* flag that we completed init */
+    app_init_complete = true;
+
     return ORTE_SUCCESS;
 
 error:
@@ -345,12 +354,17 @@ static int rte_finalize(void)
 {
     int ret;
 
-    /* use the default procedure to finish */
-    if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
-        ORTE_ERROR_LOG(ret);
+    if (app_init_complete) {
+        /* use the default procedure to finish */
+        if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
+            ORTE_ERROR_LOG(ret);
+        }
     }
 
-    /* deconstruct my nidmap and jobmap arrays */
+    /* deconstruct my nidmap and jobmap arrays - this
+     * function protects itself from being called
+     * before things were initialized
+     */
     orte_util_nidmap_finalize();
 
     return ret;
@@ -63,7 +63,7 @@ int orte_plm_base_set_hnp_name(void)
     ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
     ORTE_PROC_MY_NAME->vpid = 0;
 
-    orte_plm_globals.next_jobid = ORTE_PROC_MY_NAME->jobid + 1;
+    orte_plm_globals.next_jobid = 1;
 
     /* copy it to the HNP field */
     ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
@@ -78,12 +78,38 @@ int orte_plm_base_set_hnp_name(void)
  */
 int orte_plm_base_create_jobid(orte_jobid_t *jobid)
 {
-    if (ORTE_JOBID_MAX-1 < orte_plm_globals.next_jobid) {
+#if 0
+    orte_job_t **jobs;
+    int32_t j;
+
+    /* RHC: WHILE ORTE CAN NOW HANDLE RECYCLING OF JOBID'S,
+     * THE MPI LAYER CANNOT SINCE THERE IS NO WAY TO
+     * UPDATE THE OMPI_PROC_T LIST AND/OR THE BTL'S
+     */
+
+    /* see if there is a prior
+     * jobid that has completed and can be re-used. It can
+     * never be 0 as that belongs to the HNP and its daemons
+     */
+    jobs = (orte_job_t**)orte_job_data->addr;
+    for (j=1; j < orte_job_data->size; j++) {
+        if (NULL == jobs[j]) {
+            /* this local jobid is available - reuse it */
+            *jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, j);
+            return ORTE_SUCCESS;
+        }
+    }
+#endif
+
+    if (UINT16_MAX == orte_plm_globals.next_jobid) {
         /* if we get here, then no local jobids are available */
         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
         *jobid = ORTE_JOBID_INVALID;
         return ORTE_ERR_OUT_OF_RESOURCE;
     }
 
-    *jobid = orte_plm_globals.next_jobid++;
+    /* take the next jobid */
+    *jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, orte_plm_globals.next_jobid);
+    orte_plm_globals.next_jobid++;
     return ORTE_SUCCESS;
 }
@@ -66,6 +66,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
 {
     orte_job_t *jdatorted;
     int rc;
+    int32_t ljob;
 
     OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                          "%s plm:base:setup_job for job %s",
@@ -73,7 +74,8 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
                          ORTE_JOBID_PRINT(jdata->jobid)));
 
     /* insert the job object into the global pool */
-    opal_pointer_array_add(orte_job_data, jdata);
+    ljob = ORTE_LOCAL_JOBID(jdata->jobid);
+    opal_pointer_array_set_item(orte_job_data, ljob, jdata);
 
     if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
         ORTE_ERROR_LOG(rc);
@@ -1135,9 +1137,13 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
 void orte_plm_base_check_job_completed(orte_job_t *jdata)
 {
     orte_proc_t **procs;
-    orte_vpid_t i;
+    int i;
     orte_std_cntr_t j;
     orte_job_t **jobs;
+    orte_node_t **nodes;
+    orte_job_map_t *map;
+    orte_std_cntr_t index;
+    bool one_still_alive;
 
     /* if the incoming job data pointer is NULL, then all we can do
      * is check all jobs for complete
@@ -1167,7 +1173,13 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
      * as abnormally terminated, then do not update its state
      */
     if (jdata->state < ORTE_JOB_STATE_TERMINATED) {
-        for (i=0; i < jdata->num_procs; i++) {
+        for (i=0; i < jdata->procs->size; i++) {
+            /* the proc array may no longer be left justified, so
+             * we need to check everything
+             */
+            if (NULL == procs[i]) {
+                continue;
+            }
             if (ORTE_PROC_STATE_FAILED_TO_START == procs[i]->state) {
                 jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
                 if (!jdata->abort) {
@@ -1217,25 +1229,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
         }
     }
 
-    /* Release the resources used by this job. */
-    if( NULL != jdata->map ) {
-        int i, index;
-        orte_node_t* daemon;
-        orte_proc_t* proc;
-        orte_job_map_t* map;
-
-        map = jdata->map;
-        for( index = 0; index < map->num_nodes; index++ ) {
-            daemon = (orte_node_t *) opal_pointer_array_get_item( map->nodes, index );
-            for( i = 0; i < (int)daemon->num_procs; i++ ) {
-                proc = (orte_proc_t *) opal_pointer_array_get_item(daemon->procs, i);
-                if( (NULL != proc) && (proc->name.jobid == jdata->jobid) ) {
-                    daemon->slots_inuse--;
-                }
-            }
-        }
-    }
-
     /* check the resulting job state and notify the appropriate places */
 
     if (ORTE_JOB_STATE_FAILED_TO_START == jdata->state) {
@@ -1299,14 +1292,65 @@ CHECK_ALL_JOBS:
             return;
         }
     }
+    /* Release the resources used by this job. Since some errmgr's may want
+     * to continue using resources allocated to the job as part of their
+     * fault recovery procedure, we only do this once the job is "complete".
+     * Note that an aborted/killed job -is- flagged as complete and will
+     * therefore have its resources released.
+     */
+    if( NULL != jdata->map ) {
+        map = jdata->map;
+        nodes = (orte_node_t**)map->nodes->addr;
+        for( index = 0; index < map->nodes->size; index++ ) {
+            if (NULL == nodes[index]) {
+                /* the nodes in a map are left-justfied and
+                 * there are no holes in the array
+                 */
+                break;
+            }
+            procs = (orte_proc_t**)nodes[index]->procs->addr;
+            for( i = 0; i < nodes[index]->procs->size; i++ ) {
+                if (NULL == procs[i]) {
+                    /* there can be holes in the proc array since
+                     * we are cleaning up as we go
+                     */
+                    continue;
+                }
+                if(procs[i]->name.jobid == jdata->jobid) {
+                    nodes[index]->slots_inuse--;
+                    nodes[index]->num_procs--;
+                    /* release this object, ensuring that the
+                     * pointer array internal accounting
+                     * is maintained!
+                     */
+                    OBJ_RELEASE(procs[i]);
+                    opal_pointer_array_set_item(nodes[index]->procs, i, NULL);
+                }
+            }
+        }
+    }
 
     /* now check to see if all jobs are done - release this jdata
      * object when we find it
      */
     jobs = (orte_job_t**)orte_job_data->addr;
-    for (j=0; j < orte_job_data->size; j++) {
+    one_still_alive = false;
+    for (j=1; j < orte_job_data->size; j++) {
         if (NULL == jobs[j]) {
-            /* the jobs are left-justified in the array, so
-             * if we find a NULL, that means we are past all
-             * the jobs so we can just quit the loop
+            /* since we are releasing jdata objects as we
+             * go, we can no longer assume that the job_data
+             * array is left justified
              */
-            break;
+            continue;
         }
+        if (NULL != jdata && jobs[j]->jobid == jdata->jobid) {
+            /* release this object, ensuring that the
+             * pointer array internal accounting
+             * is maintained!
+             */
+            OBJ_RELEASE(jdata);
+            opal_pointer_array_set_item(orte_job_data, j, NULL);
+            continue;
+        }
         /* if the job is flagged to not be monitored, skip it */
         if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & jobs[j]->controls) {
@@ -1315,20 +1359,33 @@ CHECK_ALL_JOBS:
         /* when checking for job termination, we must be sure to NOT check
         * our own job as it - rather obviously - has NOT terminated!
         */
-        if (ORTE_PROC_MY_NAME->jobid != jobs[j]->jobid &&
-            jobs[j]->num_terminated < jobs[j]->num_procs) {
-            /* we have at least one job that is not done yet */
+        if (jobs[j]->num_terminated < jobs[j]->num_procs) {
+            /* we have at least one job that is not done yet - we cannot
+             * just return, though, as we need to ensure we cleanout the
+             * job data for the job that just completed
+             */
             OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                  "%s plm:base:check_job_completed job %s is not terminated",
                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                  ORTE_JOBID_PRINT(jobs[j]->jobid)));
-            return;
+            one_still_alive = true;
         }
     }
+    /* if a job is still alive, we just return */
+    if (one_still_alive) {
+        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
+                             "%s plm:base:check_job_completed at least one job is not terminated",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+        return;
+    }
     /* if we get here, then all jobs are done, so wakeup */
     OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                          "%s plm:base:check_job_completed all jobs terminated - waking up",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+    /* set the exit status to 0 - this will only happen if it
+     * wasn't already set by an error condition
+     */
+    ORTE_UPDATE_EXIT_STATUS(0);
     orte_trigger_event(&orte_exit);
 }
@@ -52,7 +52,7 @@ typedef struct {
     /* orted cmd cond */
     opal_condition_t orted_cmd_cond;
     /* next jobid */
-    orte_jobid_t next_jobid;
+    uint16_t next_jobid;
     /* time when daemons started launch */
     struct timeval daemonlaunchstart;
     /* rsh launch agent path */
@@ -102,9 +102,10 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
 /*
  * Local variables
  */
-static pid_t srun_pid = 0;
+static pid_t primary_srun_pid = 0;
+static bool primary_pid_set = false;
 static orte_jobid_t active_job = ORTE_JOBID_INVALID;
-static bool failed_launch;
+static bool launching_daemons;
 static bool local_launch_available = false;
 
 /**
@@ -153,7 +154,8 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
     struct timeval launchstart, launchstop;
     int proc_vpid_index;
     orte_jobid_t failed_job;
+    bool failed_launch=true;
 
     if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
         /* if this is a request to launch a local slave,
          * then we will not be launching an orted - we will
@@ -187,7 +189,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
     }
 
     /* indicate the state of the launch */
-    failed_launch = true;
+    launching_daemons = true;
 
     /* create a jobid for this job */
     if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
@@ -398,6 +400,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
 
 launch_apps:
     /* get here if daemons launch okay - any failures now by apps */
+    launching_daemons = false;
     failed_job = active_job;
     if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
         OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
@@ -465,19 +468,31 @@ static int plm_slurm_terminate_job(orte_jobid_t jobid)
 static int plm_slurm_terminate_orteds(void)
 {
     int rc;
+    orte_job_t *jdata;
 
-    /* deregister the waitpid callback to ensure we don't make it look like
-     * srun failed when it didn't. Since the srun may have already completed,
-     * do NOT ERROR_LOG any return code to avoid confusing, duplicate error
-     * messages
+    /* tell them to die without sending a reply - we will rely on the
+     * waitpid to tell us when they have exited!
      */
-    orte_wait_cb_cancel(srun_pid);
-
-    /* tell them to die! */
-    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) {
+    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) {
         ORTE_ERROR_LOG(rc);
     }
 
+    /* check to see if the primary pid is set. If not, this indicates
+     * that we never launched any additional daemons, so we cannot
+     * not wait for a waitpid to fire and tell us it's okay to
+     * exit. Instead, we simply trigger an exit for ourselves
+     */
+    if (!primary_pid_set) {
+        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
+                             "%s plm:slurm: primary daemons complete!",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+        jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
+        jdata->state = ORTE_JOB_STATE_TERMINATED;
+        /* need to set the #terminated value to avoid an incorrect error msg */
+        jdata->num_terminated = jdata->num_procs;
+        orte_trigger_event(&orteds_exit);
+    }
+
     return rc;
 }
 
@@ -512,40 +527,63 @@ static int plm_slurm_finalize(void)
 
 
 static void srun_wait_cb(pid_t pid, int status, void* cbdata){
+    orte_job_t *jdata;
 
     /* According to the SLURM folks, srun always returns the highest exit
       code of our remote processes. Thus, a non-zero exit status doesn't
      necessarily mean that srun failed - it could be that an orted returned
      a non-zero exit status. Of course, that means the orted failed(!), so
      the end result is the same - the job didn't start.
 
      As a result, we really can't do much with the exit status itself - it
      could be something in errno (if srun itself failed), or it could be
      something returned by an orted, or it could be something returned by
      the OS (e.g., couldn't find the orted binary). Somebody is welcome
      to sort out all the options and pretty-print a better error message. For
      now, though, the only thing that really matters is that
      srun failed. Report the error and make sure that orterun
      wakes up - otherwise, do nothing!
 
      Unfortunately, the pid returned here is the srun pid, not the pid of
      the proc that actually died! So, to avoid confusion, just use -1 as the
      pid so nobody thinks this is real
    */
 
-    if (0 != status) {
-        if (failed_launch) {
-            /* report that the daemon has failed so we can exit
-             */
-            orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
-
-        } else {
+    /* if we are in the launch phase, then any termination is bad */
+    if (launching_daemons) {
+        /* report that one or more daemons failed to launch so we can exit */
+        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
+                             "%s plm:slurm: daemon failed during launch",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+        orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
+    } else {
+        /* if this is after launch, then we need to abort only if the status
+         * returned is non-zero - i.e., if the orteds exited with an error
+         */
+        if (0 != status) {
+            /* an orted must have died unexpectedly after launch - report
+             * that the daemon has failed so we exit
+             */
+            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
+                                 "%s plm:slurm: daemon failed while running",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+            orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED);
+        }
+        /* otherwise, check to see if this is the primary pid */
+        if (primary_srun_pid == pid) {
+            /* in this case, we just want to fire the proper trigger so
+             * mpirun can exit
+             */
+            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
+                                 "%s plm:slurm: primary daemons complete!",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+            jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
+            jdata->state = ORTE_JOB_STATE_TERMINATED;
+            /* need to set the #terminated value to avoid an incorrect error msg */
+            jdata->num_terminated = jdata->num_procs;
+            orte_trigger_event(&orteds_exit);
+        }
     }
 
 }
 
@@ -553,6 +591,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
                                 char *prefix)
 {
     int fd;
+    int srun_pid;
     char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
 
     if (NULL == exec_argv) {
@@ -653,6 +692,14 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
        sides of the fork... */
     setpgid(srun_pid, srun_pid);
 
+    /* if this is the primary launch - i.e., not a comm_spawn of a
+     * child job - then save the pid
+     */
+    if (!primary_pid_set) {
+        primary_srun_pid = srun_pid;
+        primary_pid_set = true;
+    }
+
     /* setup the waitpid so we can find out if srun succeeds! */
     orte_wait_cb(srun_pid, srun_wait_cb, NULL);
     free(exec_argv);
@@ -352,7 +352,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
 int orte_rmaps_base_compute_usage(orte_job_t *jdata)
 {
     orte_std_cntr_t i;
-    orte_vpid_t j, k;
+    int j, k;
     orte_node_t **nodes;
     orte_proc_t **procs, *psave, *psave2;
     orte_vpid_t minv, minv2;
@@ -378,13 +378,24 @@ int orte_rmaps_base_compute_usage(orte_job_t *jdata)
         procs = (orte_proc_t**)nodes[i]->procs->addr;
         local_rank = 0;
 
-        for (k=0; k < nodes[i]->num_procs; k++) {
+        /* the node map may have holes in it, so cycle
+         * all the way through and avoid the holes
+         */
+        for (k=0; k < nodes[i]->procs->size; k++) {
+            /* if this proc is NULL, skip it */
+            if (NULL == procs[k]) {
+                continue;
+            }
             minv = ORTE_VPID_MAX;
             minv2 = ORTE_VPID_MAX;
             psave = NULL;
             psave2 = NULL;
             /* find the minimum vpid proc */
-            for (j=0; j < nodes[i]->num_procs; j++) {
+            for (j=0; j < nodes[i]->procs->size; j++) {
+                /* if this proc is NULL, skip it */
+                if (NULL == procs[j]) {
+                    continue;
+                }
                 if (procs[j]->name.jobid == jdata->jobid &&
                     ORTE_LOCAL_RANK_MAX == procs[j]->local_rank &&
                     procs[j]->name.vpid < minv) {
@@ -407,31 +407,6 @@ int orte_dt_init(void)
 }
 
 #if !ORTE_DISABLE_FULL_SUPPORT
-int orte_hnp_globals_init(void)
-{
-    int rc;
-
-    orte_job_data = OBJ_NEW(opal_pointer_array_t);
-    if (ORTE_SUCCESS != (rc = opal_pointer_array_init(orte_job_data,
-                                                      1,
-                                                      ORTE_GLOBAL_ARRAY_MAX_SIZE,
-                                                      1))) {
-        ORTE_ERROR_LOG(rc);
-        return rc;
-    }
-
-    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
-    if (ORTE_SUCCESS != (rc = opal_pointer_array_init(orte_node_pool,
-                                                      ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
-                                                      ORTE_GLOBAL_ARRAY_MAX_SIZE,
-                                                      ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
-        ORTE_ERROR_LOG(rc);
-        return rc;
-    }
-
-    return ORTE_SUCCESS;
-}
-
 
 orte_job_t* orte_get_job_data_object(orte_jobid_t job)
 {
@@ -575,20 +550,22 @@ static void orte_job_construct(orte_job_t* job)
 static void orte_job_destruct(orte_job_t* job)
 {
     orte_std_cntr_t i;
-    orte_vpid_t j;
+    int n;
 
     for (i=0; i < job->num_apps; i++) {
         if (NULL != job->apps->addr[i]) OBJ_RELEASE(job->apps->addr[i]);
     }
     OBJ_RELEASE(job->apps);
 
-    for (j=0; j < job->num_procs; j++) {
-        if (NULL != job->procs->addr[j]) OBJ_RELEASE(job->procs->addr[j]);
+    if (NULL != job->map) OBJ_RELEASE(job->map);
+
+    for (n=0; n < job->procs->size; n++) {
+        if (NULL != job->procs->addr[n]) {
+            OBJ_RELEASE(job->procs->addr[n]);
+        }
     }
     OBJ_RELEASE(job->procs);
 
-    if (NULL != job->map) OBJ_RELEASE(job->map);
-
 #if OPAL_ENABLE_FT == 1
     if (NULL != job->ckpt_snapshot_ref) {
         free(job->ckpt_snapshot_ref);
@@ -635,7 +612,7 @@ static void orte_node_construct(orte_node_t* node)
 
 static void orte_node_destruct(orte_node_t* node)
 {
-    orte_vpid_t i;
+    int i;
 
     if (NULL != node->name) {
         free(node->name);
@@ -645,10 +622,16 @@ static void orte_node_destruct(orte_node_t* node)
         opal_argv_free(node->alias);
     }
 
-    if (NULL != node->daemon) OBJ_RELEASE(node->daemon);
+    if (NULL != node->daemon) {
+        node->daemon->node = NULL;
+        OBJ_RELEASE(node->daemon);
+    }
 
-    for (i=0; i < node->num_procs; i++) {
-        if (NULL != node->procs->addr[i]) OBJ_RELEASE(node->procs->addr[i]);
+    for (i=0; i < node->procs->size; i++) {
+        if (NULL != node->procs->addr[i]) {
+            ((orte_proc_t*)(node->procs->addr[i]))->node = NULL;
+            OBJ_RELEASE(node->procs->addr[i]);
+        }
     }
     OBJ_RELEASE(node->procs);
@@ -29,16 +29,6 @@
 
 BEGIN_C_DECLS
 
-
-#if !ORTE_DISABLE_FULL_SUPPORT
-/**
- * Initialize global storage for HNPs
- */
-ORTE_DECLSPEC int orte_hnp_globals_init(void);
-
-#endif /* !ORTE_DISABLE_FULL_SUPPORT */
-
-
 /**
  * Init the ORTE datatype support
  */
@@ -787,7 +787,6 @@ DONE:
 static void job_completed(int trigpipe, short event, void *arg)
 {
     int rc;
-    orte_job_state_t exit_state;
     orte_job_t *daemons;
 
     /* if the abort exit event is set, delete it */
@@ -796,8 +795,6 @@ static void job_completed(int trigpipe, short event, void *arg)
         free(abort_exit_event);
     }
 
-    exit_state = jdata->state;
-
     /* if we never launched, just skip this part to avoid
      * meaningless error messages
      */
@@ -806,7 +803,7 @@ static void job_completed(int trigpipe, short event, void *arg)
         goto DONE;
     }
 
-    if (ORTE_JOB_STATE_TERMINATED != exit_state) {
+    if (0 != orte_exit_status) {
         /* abnormal termination of some kind */
         dump_aborted_procs();
         /* If we showed more abort messages than were allowed,
@@ -991,8 +988,8 @@ static void dump_aborted_procs(void)
     jobs = (orte_job_t**)orte_job_data->addr;
     for (n=1; n < orte_job_data->size; n++) {
         if (NULL == jobs[n]) {
-            /* the array is left-justified, so we can quit on the first NULL */
-            return;
+            /* the array is no longer left-justified, so we have to continue */
+            continue;
         }
         if (ORTE_JOB_STATE_UNDEF != jobs[n]->state &&
             ORTE_JOB_STATE_INIT != jobs[n]->state &&
@@ -828,7 +828,14 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
 
     jobs = (orte_job_t**)orte_job_data->addr;
     /* for each job... */
-    for (j=0; j < orte_job_data->size && NULL != jobs[j]; j++) {
+    for (j=1; j < orte_job_data->size; j++) {
+        /* the job array is no longer left-justified and may
+         * have holes in it as we recover resources at job
+         * completion
+         */
+        if (NULL == jobs[j]) {
+            continue;
+        }
         jdata = jobs[j];
         /* pack the jobid */
         if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {