
Complete the modifications for handling failed-to-start applications. Modifications for failed-to-start orteds are coming next.
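
For context, a minimal sketch of the cleanup pattern these changes add to each PLS launcher's application-launch path (assembled from the diffs below; the flag name and whether the jobid comes from "jobid" or "map->job" vary by component):

    /* sketch only - mirrors the failed-launch handling added in the diffs below */
    cleanup:
        if (failed_launch) {
            /* mark the job so everyone knows it never started */
            if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
                ORTE_ERROR_LOG(rc);
            }
            /* wake up orterun so the job can cleanly terminate */
            if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;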

This completes the minor changes required to the PLS components. Basically, a small change is required to the parameter list of the orted cmd functions. I caught it and made the change for xcpu and poe, in addition to the components listed in my email - so I think that only leaves xgrid unconverted.
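
For reference, the parameter-list change to the orted cmd functions is the one reflected in the pls.h typedef (shown in the diffs below) - terminate_orteds no longer takes a jobid:

    /* old */
    typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);

    /* new */
    typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(struct timeval *timeout, opal_list_t *attrs);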

The orted fail-to-start mods will also make changes in the PLS components, but those can be localized, so they will come in one at a time.

This commit was SVN r14499.
This commit is contained in:
Ralph Castain 2007-04-24 20:53:54 +00:00
parent a764aa6395
commit 18cb5c9762
25 changed files with 298 additions and 509 deletions

View File

@ -169,13 +169,6 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
break;
case ORTE_PLS_TERMINATE_ORTEDS_CMD:
/* get the jobid whose daemons are to be terminated */
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto SEND_ANSWER;
}
/* get any attributes */
OBJ_CONSTRUCT(&attrs, opal_list_t);
count = 1;
@ -199,7 +192,7 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
timeout.tv_usec = microsecs;
/* issue the command */
if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &timeout, &attrs))) {
if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(&timeout, &attrs))) {
ORTE_ERROR_LOG(rc);
}

View File

@ -46,7 +46,6 @@
#include "opal/mca/installdirs/installdirs.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_list.h"
#include "opal/event/event.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
@ -72,6 +71,7 @@
#include "orte/mca/schema/schema_types.h"
#include "orte/mca/smr/smr.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"
@ -438,6 +438,10 @@ static void orte_pls_bproc_setup_env(char *** env)
* @retval ORTE_SUCCESS
* @retval error
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
int * daemon_list = NULL;
int num_daemons = 0;
@ -452,9 +456,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
orte_vpid_t daemon_vpid_start;
orte_std_cntr_t idx;
struct stat buf;
opal_list_t daemons;
orte_pls_daemon_info_t *dmn;
opal_list_item_t *item;
struct timeval joblaunchstart, launchstart, launchstop;
OPAL_TRACE(1);
@ -468,11 +469,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
/* indicate that the daemons have not completely launched yet */
daemons_launched = false;
/* setup a list that will contain the info for all the daemons
* so we can store it on the registry when done
*/
OBJ_CONSTRUCT(&daemons, opal_list_t);
/* get the number of nodes in this job and allocate an array for
* their names so we can pass that to bproc - populate the list
* with the node names
@ -480,12 +476,12 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
num_daemons = map->num_nodes;
if (0 == num_daemons) {
/* nothing to do */
OBJ_DESTRUCT(&daemons);
return ORTE_SUCCESS;
}
if(NULL == (daemon_list = (int*)malloc(sizeof(int) * num_daemons))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
i = 0;
@ -500,6 +496,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
/* allocate storage for bproc to return the daemon pids */
if(NULL == (pids = (int*)malloc(sizeof(int) * num_daemons))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
@ -632,6 +629,10 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
rc, *pids);
}
/* we need to be smarter here - right now, we stop on the first negative pid. But
* daemons beyond that one might have started. This could leave a daemon stranded
* when we abort
*/
for(i = 0; i < num_daemons; i++) {
if(0 >= pids[i]) {
opal_show_help("help-pls-bproc.txt", "daemon-launch-bad-pid", true,
@ -650,28 +651,10 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
dmn = OBJ_NEW(orte_pls_daemon_info_t);
rc = orte_ns.create_process_name(&(dmn->name), ORTE_PROC_MY_NAME->cellid, 0,
daemon_vpid_start + i);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
dmn->cell = dmn->name->cellid;
dmn->nodename = strdup(param);
dmn->active_job = map->job;
opal_list_append(&daemons, &dmn->super);
free(param);
}
}
/* store the daemon info */
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
ORTE_ERROR_LOG(rc);
}
/* setup the callbacks - this needs to be done *after* we store the
* daemon info so that short-lived apps don't cause mpirun to
* try and terminate the orteds before we record them
@ -718,7 +701,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
}
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
orte_pls_bproc_terminate_job(map->job, &orte_abort_timeout, NULL);
goto cleanup;
}
}
@ -747,10 +729,17 @@ cleanup:
if(NULL != orted_path) {
free(orted_path);
}
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
/* check for failed launch - if so, force terminate */
if (!daemons_launched) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(map->job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(map->job))) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&daemons);
return rc;
}
@ -784,7 +773,7 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
orte_pls_bproc_terminate_job(job, &orte_abort_timeout, NULL);
/* kill the daemons */
orte_pls_bproc_terminate_job(0, &orte_abort_timeout, NULL);
orte_pls_bproc_terminate_orteds(&orte_abort_timeout, NULL);
/* shouldn't ever get here.. */
exit(1);
@ -806,9 +795,16 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
* @retval ORTE_SUCCESS
* @retval error
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate. Since we don't use the ORTE
* daemons to launch the application procs, this is the *only*
* way we have of knowing something went wrong.
*/
static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
orte_vpid_t vpid_start, int app_context) {
int *node_array, num_nodes, cycle;
int *node_array=NULL, num_nodes, cycle;
int rc, i, j, stride;
orte_std_cntr_t num_processes;
int *pids = NULL;
@ -817,6 +813,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
struct bproc_io_t bproc_io[3];
char **env;
int dbg;
bool apps_launched = false;
OPAL_TRACE(1);
@ -862,7 +859,8 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
node_array = (int*)malloc(map->num_nodes * sizeof(int));
if (NULL == node_array) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
/* initialize the cycle count. Computing the process name under Bproc
@ -949,6 +947,10 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
goto cleanup;
}
/* we need to be smarter here - right now, we stop on the first negative pid. But
* processes beyond that one might have started. This leaves those procs stranded
* when we abort
*/
for(j = 0; j < num_nodes; j++) {
if(0 >= pids[j]) {
opal_show_help("help-pls-bproc.txt", "proc-launch-bad-pid", true,
@ -1007,15 +1009,34 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
goto cleanup;
}
}
/* get here if the app procs launched cleanly */
apps_launched = true;
cleanup:
if(NULL != pids) {
free(pids);
}
free(node_array);
if (NULL != node_array) {
free(node_array);
}
if (NULL != env) {
opal_argv_free(env);
}
/* check for failed launch - if so, force terminate */
if (!apps_launched) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(map->job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(map->job))) {
ORTE_ERROR_LOG(rc);
}
}
if (NULL != env) opal_argv_free(env);
return rc;
}
@ -1032,8 +1053,13 @@ cleanup:
* @retval ORTE_SUCCESS
* @retval error
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
int orte_pls_bproc_launch(orte_jobid_t jobid) {
orte_job_map_t* map;
orte_job_map_t* map = NULL;
orte_mapped_node_t *map_node;
orte_vpid_t vpid_launch;
int rc;
@ -1043,26 +1069,31 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
char cwd_save[OMPI_PATH_MAX + 1];
orte_ras_node_t *ras_node;
char **daemon_env;
bool launched;
OPAL_TRACE(1);
/* indicate the launch condition */
launched = false;
/* make sure the pls_bproc receive function has been started */
if (ORTE_SUCCESS != (rc = orte_pls_bproc_comm_start())) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
/* save the current working directory */
if (NULL == getcwd(cwd_save, sizeof(cwd_save))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
cwd_save[sizeof(cwd_save) - 1] = '\0';
/* get the job map */
if(ORTE_SUCCESS != (rc = orte_rmaps.get_job_map(&map, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
/* set the mapping mode */
@ -1158,16 +1189,32 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
vpid_launch += map->apps[context]->num_procs;
}
/* indicate a successful launch */
launched = true;
cleanup:
chdir(cwd_save);
OBJ_RELEASE(map);
if (NULL != map) {
OBJ_RELEASE(map);
}
if (mca_pls_bproc_component.do_not_launch) {
/* indicate that we failed to launch, but do so silently */
return ORTE_ERR_SILENT;
}
/* check for failed launch - if so, force terminate */
if (!launched) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
}
@ -1203,17 +1250,15 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, op
/**
* Terminate the orteds for a given job
*/
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
int orte_pls_bproc_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;
OPAL_TRACE(1);
/* now tell them to die! */
/* tell them to die! */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}

View File

@ -43,7 +43,7 @@
static int orte_pls_cnos_launch_job(orte_jobid_t jobid);
static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int orte_pls_cnos_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int orte_pls_cnos_terminate_proc(const orte_process_name_t* proc_name);
static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32_t signal);
@ -91,18 +91,13 @@ static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeo
}
static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
static int orte_pls_cnos_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid;
/* make sure it's my job */
if (jobid == my_jobid) {
#ifdef HAVE_KILLRANK
killrank(-1, SIGKILL);
killrank(-1, SIGKILL);
#else
exit(0);
exit(0);
#endif
}
return ORTE_ERR_NOT_SUPPORTED;
}

View File

@ -207,7 +207,7 @@ typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, struct time
/**
* Terminate the daemons associated with this jobid
*/
typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);
typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(struct timeval *timeout, opal_list_t *attrs);
/**
* Terminate a specific process.

View File

@ -64,7 +64,7 @@ extern char **environ;
*/
static int pls_poe_launch_job(orte_jobid_t jobid);
static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_poe_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_poe_terminate_proc(const orte_process_name_t *name);
static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal);
@ -477,7 +477,8 @@ static inline int poe_launch_interactive_job(orte_jobid_t jobid)
fclose(hfp);
}
rc = orte_rmgr.get_vpid_range(jobid, &vpid_start, &vpid_range);
vpid_start = 0;
rc = orte_ns.get_vpid_range(jobid, &vpid_range);
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
/* Create a temporary POE command file */
@ -589,7 +590,7 @@ static int pls_poe_terminate_proc(const orte_process_name_t *name)
return ORTE_ERR_NOT_IMPLEMENTED;
}
static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
static int pls_poe_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}

View File

@ -201,7 +201,7 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal
return ORTE_SUCCESS;
}
int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs)
int orte_pls_proxy_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
orte_buffer_t* cmd;
orte_buffer_t* answer;
@ -226,12 +226,6 @@ int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, o
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);

View File

@ -53,7 +53,7 @@ int orte_pls_proxy_finalize(void);
*/
int orte_pls_proxy_launch(orte_jobid_t job);
int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs);
int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs);
int orte_pls_proxy_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
int orte_pls_proxy_terminate_proc(const orte_process_name_t* name);
int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs);
int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal);

View File

@ -55,7 +55,7 @@ int orte_pls_rsh_finalize(void);
*/
int orte_pls_rsh_launch(orte_jobid_t);
int orte_pls_rsh_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
int orte_pls_rsh_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*);
int orte_pls_rsh_terminate_orteds(struct timeval *timeout, opal_list_t*);
int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_rsh_signal_job(orte_jobid_t, int32_t, opal_list_t*);
int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t);

View File

@ -1071,7 +1071,7 @@ int orte_pls_rsh_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal
/**
* Terminate the orteds for a given job
*/
int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
int orte_pls_rsh_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;

View File

@ -75,7 +75,7 @@
*/
static int pls_slurm_launch_job(orte_jobid_t jobid);
static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_proc(const orte_process_name_t *name);
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal);
@ -101,9 +101,10 @@ orte_pls_base_module_1_3_0_t orte_pls_slurm_module = {
};
/*
* Local variable
* Local variables
*/
static pid_t srun_pid = 0;
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
/*
@ -113,16 +114,19 @@ static pid_t srun_pid = 0;
extern char **environ;
#endif /* !defined(__WINDOWS__) */
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
static int pls_slurm_launch_job(orte_jobid_t jobid)
{
orte_job_map_t *map;
orte_job_map_t *map = NULL;
opal_list_item_t *item;
size_t num_nodes;
orte_vpid_t vpid;
orte_vpid_t start_vpid;
char *jobid_string = NULL;
char *param;
char **argv;
char **argv = NULL;
int argc;
int rc;
char *tmp;
@ -136,10 +140,9 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
char **custom_strings;
int num_args, i;
char *cur_prefix;
opal_list_t daemons;
orte_pls_daemon_info_t *dmn;
struct timeval joblaunchstart, launchstart, launchstop;
int proc_name_index = 0;
bool failed_launch = true;
if (mca_pls_slurm_component.timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) {
@ -147,10 +150,8 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
}
/* setup a list that will contain the info for all the daemons
* so we can store it on the registry when done
*/
OBJ_CONSTRUCT(&daemons, opal_list_t);
/* save the active jobid */
active_job = jobid;
/* Query the map for this job.
* We need the entire mapping for a couple of reasons:
@ -161,8 +162,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&daemons);
return rc;
goto cleanup;
}
/* if the user requested that we re-use daemons,
@ -171,9 +171,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&daemons);
return rc;
goto cleanup;
}
}
@ -186,14 +184,13 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
* on existing daemons, so we can just return
*/
OBJ_RELEASE(map);
OBJ_DESTRUCT(&daemons);
return ORTE_SUCCESS;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
start_vpid = vpid;
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
@ -332,31 +329,6 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
}
/* setup the daemon info for each node */
vpid = start_vpid;
for (item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
orte_mapped_node_t* node = (orte_mapped_node_t*)item;
/* record the daemons info for this node */
dmn = OBJ_NEW(orte_pls_daemon_info_t);
dmn->active_job = jobid;
dmn->cell = node->cell;
dmn->nodename = strdup(node->nodename);
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(dmn->name), dmn->cell, 0, vpid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
opal_list_append(&daemons, &dmn->super);
vpid++;
}
/* store the daemon info on the registry */
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
ORTE_ERROR_LOG(rc);
}
/* setup environment */
env = opal_argv_copy(environ);
var = mca_base_param_environ_variable("seed", NULL, NULL);
@ -374,7 +346,19 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
/* exec the daemon */
rc = pls_slurm_start_proc(argc, argv, env, cur_prefix);
if (ORTE_SUCCESS != (rc = pls_slurm_start_proc(argc, argv, env, cur_prefix))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* do NOT wait for srun to complete. Srun only completes when the processes
* it starts - in this case, the orteds - complete. We need to go ahead and
* return so orterun can do the rest of its stuff. Instead, we'll catch
* any srun failures and deal with them elsewhere
*/
/* declare the launch a success */
failed_launch = false;
if (mca_pls_slurm_component.timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
@ -395,21 +379,32 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
/* JMS: should we stash the srun pid in the gpr somewhere for cleanup? */
/* JMS: how do we catch when srun dies? */
cleanup:
OBJ_RELEASE(map);
opal_argv_free(argv);
opal_argv_free(env);
if (NULL != map) {
OBJ_RELEASE(map);
}
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != env) {
opal_argv_free(env);
}
if(NULL != jobid_string) {
free(jobid_string);
}
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
/* check for failed launch - if so, force terminate */
if (failed_launch) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&daemons);
return rc;
}
@ -431,11 +426,18 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout,
/**
* Terminate the orteds for a given job
*/
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
static int pls_slurm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;
/* order them to go away */
/* deregister the waitpid callback to ensure we don't make it look like
* srun failed when it didn't. Since the srun may have already completed,
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
* messages
*/
orte_wait_cb_cancel(srun_pid);
/* tell them to die! */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
ORTE_ERROR_LOG(rc);
}
@ -495,7 +497,7 @@ static int pls_slurm_cancel_operation(void)
static int pls_slurm_finalize(void)
{
int rc;
/* cleanup any pending recvs */
if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {
ORTE_ERROR_LOG(rc);
@ -505,6 +507,46 @@ static int pls_slurm_finalize(void)
}
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
/* According to the SLURM folks, srun always returns the highest exit
code of our remote processes. Thus, a non-zero exit status doesn't
necessarily mean that srun failed - it could be that an orted returned
a non-zero exit status. Of course, that means the orted failed(!), so
the end result is the same - the job didn't start.
As a result, we really can't do much with the exit status itself - it
could be something in errno (if srun itself failed), or it could be
something returned by an orted, or it could be something returned by
the OS (e.g., couldn't find the orted binary). Somebody is welcome
to sort out all the options and pretty-print a better error message. For
now, though, the only thing that really matters is that
srun failed. Report the error and make sure that orterun
wakes up - otherwise, do nothing!
*/
int rc;
if (0 != status) {
/* we have a problem */
opal_output(0, "ERROR: srun failed to start the required daemons.");
opal_output(0, "ERROR: This could be due to an inability to find the orted binary");
opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
opal_output(0, "ERROR: on one or more specified nodes, or other factors.");
/* set the job state so we know it failed to start */
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(active_job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
/* force termination of the job */
if (ORTE_SUCCESS != (rc = orte_wakeup(active_job))) {
ORTE_ERROR_LOG(rc);
}
}
}
static int pls_slurm_start_proc(int argc, char **argv, char **env,
char *prefix)
{
@ -517,9 +559,11 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
srun_pid = fork();
if (-1 == srun_pid) {
opal_output(0, "pls:slurm:start_proc: fork failed");
return ORTE_ERR_IN_ERRNO;
} else if (0 == srun_pid) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
if (0 == srun_pid) { /* child */
char *bin_base = NULL, *lib_base = NULL;
/* Figure out the basenames for the libdir and bindir. There
@ -596,14 +640,16 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
/* don't return - need to exit - returning would be bad -
we're not in the calling process anymore */
exit(1);
} else { /* parent */
/* just in case, make sure that the srun process is not in our
process group any more. Stevens says always do this on both
sides of the fork... */
setpgid(srun_pid, srun_pid);
/* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
free(exec_argv);
}
free(exec_argv);
/* just in case, make sure that the srun process is not in our
process group any more. Stevens says always do this on both
sides of the fork... */
setpgid(srun_pid, srun_pid);
return ORTE_SUCCESS;
}

View File

@ -25,20 +25,17 @@ The first two prefix values supplied for node %s were:
%s
and %s
#
[daemon-not-found]
The TM (PBS / Torque) process starter in Open MPI was unable to find
its daemon executable (orted) on the node where mpirun was executed.
[tm-spawn-failed]
The TM (PBS / Torque) process starter failed to spawn a daemon (orted)
on a remote node.
This sanity check is performed because the back-end PBS / Torque
process launcher does not provide any kind of error to Open MPI if it
tries to launch its daemon on a remote node, but the daemon cannot be
found. Open MPI's check for the daemon locally is somewhat of a lame
workaround / sanity check.
Command line: %s
Node name: %s
Launch id: %d
If you do not understand this error message, please try the following:
1. Try to add the Open MPI executables to your PATH
2. Use the --prefix option to mpirun to indicate where Open MPI can
find its executables
3. Set the MCA parameter "pls_tm_want_path_check" to 0
4. Talk to your local system administration
1. Ensure that the executable "orted" is in your PATH
2. Use the --prefix option to indicate where we can
find that executable
3. Talk to your local system administrator

View File

@ -60,6 +60,7 @@
#include "orte/orte_types.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/smr/smr.h"
@ -80,7 +81,7 @@
*/
static int pls_tm_launch_job(orte_jobid_t jobid);
static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_tm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_tm_terminate_proc(const orte_process_name_t *name);
static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal);
@ -89,7 +90,6 @@ static int pls_tm_finalize(void);
static int pls_tm_connect(void);
static int pls_tm_disconnect(void);
static int pls_tm_check_path(char *exe, char **env);
/*
* Local variables
@ -114,19 +114,23 @@ orte_pls_base_module_t orte_pls_tm_module = {
extern char **environ;
#endif /* !defined(__WINDOWS__) */
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
static int pls_tm_launch_job(orte_jobid_t jobid)
{
orte_job_map_t *map;
orte_job_map_t *map = NULL;
opal_list_item_t *item;
size_t num_nodes;
orte_vpid_t vpid;
int node_name_index;
int proc_name_index;
char *jobid_string;
char *param;
char **env;
char *uri, *param;
char **env = NULL;
char *var;
char **argv;
char **argv = NULL;
int argc;
int rc;
bool connected = false;
@ -136,12 +140,11 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
tm_task_id *tm_task_ids = NULL;
int local_err;
tm_event_t event;
opal_list_t daemons;
orte_pls_daemon_info_t *dmn;
struct timeval launchstart, launchstop, completionstart, completionstop;
struct timeval jobstart, jobstop;
int maxtime=0, mintime=99999999, maxiter = 0, miniter = 0, deltat;
float avgtime=0.0;
bool failed_launch = true;
/* check for timing request - get start time if so */
if (mca_pls_tm_component.timing) {
@ -158,7 +161,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
/* if the user requested that we re-use daemons,
@ -167,8 +170,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
return rc;
goto cleanup;
}
}
@ -184,6 +186,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
*/
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
@ -193,20 +196,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
goto cleanup;
}
/* setup a list that will contain the info for all the daemons
* so we can store it on the registry when done
*/
OBJ_CONSTRUCT(&daemons, opal_list_t);
/* Allocate a bunch of TM events to use for tm_spawn()ing */
tm_events = malloc(sizeof(tm_event_t) * num_nodes);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
tm_task_ids = malloc(sizeof(tm_task_id) * num_nodes);
if (NULL == tm_task_ids) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
@ -294,17 +294,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
}
}
/* Do a quick sanity check to ensure that we can find the
orted in the PATH */
if (ORTE_SUCCESS !=
(rc = pls_tm_check_path(argv[0], env))) {
ORTE_ERROR_LOG(rc);
opal_show_help("help-pls-tm.txt", "daemon-not-found",
true, argv[0]);
goto cleanup;
}
/* Iterate through each of the nodes and spin
* up a daemon.
*/
@ -315,19 +304,10 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
orte_process_name_t* name;
char* name_string;
/* new daemon - setup to record its info */
dmn = OBJ_NEW(orte_pls_daemon_info_t);
dmn->active_job = jobid;
opal_list_append(&daemons, &dmn->super);
/* setup node name */
free(argv[node_name_index]);
argv[node_name_index] = strdup(node->nodename);
/* record the node name in the daemon struct */
dmn->cell = node->cell;
dmn->nodename = strdup(node->nodename);
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, node->cell, 0, vpid);
if (ORTE_SUCCESS != rc) {
@ -335,12 +315,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
goto cleanup;
}
/* save it in the daemon struct */
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup per-node options */
if (mca_pls_tm_component.debug ||
mca_pls_tm_component.verbose) {
@ -352,7 +326,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: unable to create process name");
return rc;
goto cleanup;
}
free(argv[proc_name_index]);
argv[proc_name_index] = strdup(name_string);
@ -377,13 +351,12 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
if (TM_SUCCESS != rc) {
return ORTE_ERROR;
}
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: start_procs returned error %d", rc);
opal_show_help("help-pls-tm.txt", "tm-spawn-failed",
true, argv[0], node->nodename, node->launch_id);
rc = ORTE_ERROR;
goto cleanup;
}
/* check for timing request - get stop time and process if so */
if (mca_pls_tm_component.timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
@ -423,21 +396,19 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
}
}
/* all done, so store the daemon info on the registry */
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
ORTE_ERROR_LOG(rc);
}
/* TM poll for all the spawns */
for (i = 0; i < launched; ++i) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "pls:tm: failed to poll for a spawned proc, return status = %d", rc);
return ORTE_ERR_IN_ERRNO;
goto cleanup;
}
}
/* if we get here, then everything launched okay - record that fact */
failed_launch = false;
/* check for timing request - get stop time for launch completion and report */
if (mca_pls_tm_component.timing) {
if (0 != gettimeofday(&completionstop, NULL)) {
@ -455,7 +426,15 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
cleanup:
OBJ_RELEASE(map);
if (NULL != map) {
OBJ_RELEASE(map);
}
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != env) {
opal_argv_free(env);
}
if (connected) {
pls_tm_disconnect();
@ -474,12 +453,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
free(bin_base);
}
/* deconstruct the daemon list */
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
/* check for failed launch - if so, force terminate */
if (failed_launch) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&daemons);
/* check for timing request - get stop time and process if so */
if (mca_pls_tm_component.timing) {
if (0 != gettimeofday(&jobstop, NULL)) {
@ -502,11 +486,11 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
{
int rc;
/* order them to kill their local procs for this job */
/* order all of the daemons to kill their local procs for this job */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
@ -514,7 +498,7 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
/**
* Terminate the orteds for a given job
*/
int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
int pls_tm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;
@ -620,83 +604,3 @@ static int pls_tm_disconnect(void)
return ORTE_SUCCESS;
}
static int pls_tm_check_path(char *exe, char **env)
{
static int size = 256;
int i;
char *file;
char *cwd;
char *path = NULL;
/* Do we want this check at all? */
if (!mca_pls_tm_component.want_path_check) {
return ORTE_SUCCESS;
}
/* Find the path in the supplied environment */
for (i = 0; NULL != env[i]; ++i) {
if (0 == strncmp("PATH=", env[i], 5)) {
path = strdup(env[i]);
break;
}
}
if (NULL == env[i]) {
path = strdup("NULL");
}
/* Check the already-successful paths (i.e., be a little
friendlier to the filesystem -- if we find the executable
successfully, save it) */
for (i = 0; NULL != mca_pls_tm_component.checked_paths &&
NULL != mca_pls_tm_component.checked_paths[i]; ++i) {
if (0 == strcmp(path, mca_pls_tm_component.checked_paths[i])) {
return ORTE_SUCCESS;
}
}
/* We didn't already find it, so check now. First, get the cwd. */
do {
cwd = malloc(size);
if (NULL == cwd) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL == getcwd(cwd, size)) {
free(cwd);
if (ERANGE == errno) {
size *= 2;
} else {
return ORTE_ERR_IN_ERRNO;
}
} else {
break;
}
} while (1);
/* Now do the search */
file = opal_path_findv(exe, X_OK, env, cwd);
free(cwd);
if (NULL == file) {
free(path);
return ORTE_ERR_NOT_FOUND;
}
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: found %s", file);
}
free(file);
/* Success -- so cache it */
opal_argv_append_nosize(&mca_pls_tm_component.checked_paths, path);
/* All done */
free(path);
return ORTE_SUCCESS;
}

View File

@ -256,7 +256,8 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
num_apps = map->num_apps;
/* next, get the vpid_start and range */
rc = orte_rmgr.get_vpid_range(jobid, &vpid_start, &vpid_range);
vpid_start = 0;
rc = orte_ns.get_vpid_range(jobid, &vpid_range);
if (rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
@ -375,7 +376,7 @@ int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
return ORTE_SUCCESS;
}
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs)
int orte_pls_xcpu_terminate_orteds(struct timeval *timeout, opal_list_t * attrs)
{
return ORTE_SUCCESS;
}

View File

@ -63,7 +63,7 @@ orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file
*/
int orte_pls_xcpu_launch_job(orte_jobid_t);
int orte_pls_xcpu_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *);
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs);
int orte_pls_xcpu_terminate_orteds(struct timeval *timeout, opal_list_t * attrs);
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_xcpu_signal_job(orte_jobid_t jobid, int32_t sig, opal_list_t*);
int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig);

View File

@ -79,9 +79,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
was, barf because they specifically asked for something we
can't provide. */
if (context->user_specified_cwd) {
opal_show_help("help-rmgr-base.txt", "chdir-error",
true, hostname, context->cwd, strerror(errno));
return ORTE_ERR_NOT_FOUND;
return ORTE_ERR_WDIR_NOT_FOUND;
}
/* If the user didn't specifically ask for it, then it
@ -99,9 +97,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
good = false;
}
if (!good) {
opal_show_help("help-rmgr-base.txt", "chdir-error",
true, tmp, strerror(errno));
return ORTE_ERR_NOT_FOUND;
return ORTE_ERR_WDIR_NOT_FOUND;
}
/* Reset the pwd in this local copy of the
@ -154,19 +150,13 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context)
free(tmp);
tmp = opal_path_findv(context->argv[0], X_OK, environ, context->cwd);
if (NULL == tmp) {
opal_show_help("help-rmgr-base.txt",
"argv0-not-found",
true, hostname, context->argv[0]);
return ORTE_ERR_NOT_FOUND;
return ORTE_ERR_EXE_NOT_FOUND;
}
free(context->app);
context->app = tmp;
} else {
if (0 != access(context->app, X_OK)) {
opal_show_help("help-rmgr-base.txt",
"argv0-not-accessible",
true, hostname, context->argv[0]);
return ORTE_ERR_NOT_FOUND;
return ORTE_ERR_EXE_NOT_ACCESSIBLE;
}
}

View File

@ -65,9 +65,8 @@ orte_rmgr_base_module_t orte_rmgr = {
orte_rmgr_base_put_app_context,
orte_rmgr_base_check_context_cwd,
orte_rmgr_base_check_context_app,
orte_rmgr_base_set_vpid_range,
orte_rmgr_base_get_vpid_range
orte_rmgr_base_set_proc_info,
orte_rmgr_base_get_proc_info
};
/*

View File

@ -35,121 +35,6 @@
#include "orte/mca/rmgr/base/rmgr_private.h"
/**
* Set the vpid start and range for a job/pset on the registry
*/
int orte_rmgr_base_set_vpid_range(orte_jobid_t jobid, orte_vpid_t start, orte_vpid_t range)
{
orte_gpr_value_t *value;
char *segment;
int rc;
if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, ORTE_GPR_OVERWRITE, segment, 2, 1))) {
ORTE_ERROR_LOG(rc);
free(segment);
return rc;
}
free(segment);
value->tokens[0] = strdup(ORTE_JOB_GLOBALS);
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_JOB_VPID_START_KEY, ORTE_VPID, &start))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_JOB_VPID_RANGE_KEY, ORTE_VPID, &range))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(value);
return rc;
}
rc = orte_gpr.put(1, &value);
if (ORTE_SUCCESS != rc) ORTE_ERROR_LOG(rc);
OBJ_RELEASE(value);
return rc;
}
/**
* Get the vpid start and range for a job/pset from the registry
*/
int orte_rmgr_base_get_vpid_range(orte_jobid_t jobid, orte_vpid_t *start, orte_vpid_t *range)
{
char *segment;
char *tokens[2];
char *keys[3];
orte_gpr_value_t** values = NULL;
orte_std_cntr_t i, num_values = 0;
orte_vpid_t *vptr;
int rc;
/* query the job segment on the registry */
if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tokens[0] = ORTE_JOB_GLOBALS;
tokens[1] = NULL;
keys[0] = ORTE_JOB_VPID_START_KEY;
keys[1] = ORTE_JOB_VPID_RANGE_KEY;
keys[2] = NULL;
rc = orte_gpr.get(
ORTE_GPR_KEYS_AND|ORTE_GPR_TOKENS_OR,
segment,
tokens,
keys,
&num_values,
&values
);
if(rc != ORTE_SUCCESS) {
free(segment);
ORTE_ERROR_LOG(rc);
return rc;
}
if(num_values != 1) {
rc = ORTE_ERR_NOT_FOUND;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
for(i=0; i<values[0]->cnt; i++) {
if(strcmp(values[0]->keyvals[i]->key, ORTE_JOB_VPID_START_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, values[0]->keyvals[i]->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
*start = *vptr;
continue;
}
if(strcmp(values[0]->keyvals[i]->key, ORTE_JOB_VPID_RANGE_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, values[0]->keyvals[i]->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
*range = *vptr;
continue;
}
}
cleanup:
for(i=0; i<num_values; i++)
OBJ_RELEASE(values[i]);
free(segment);
free(values);
return rc;
}
int orte_rmgr_base_set_proc_info(const orte_process_name_t* name, pid_t pid, char *nodename)
{
orte_gpr_value_t *values[1];

View File

@ -80,11 +80,6 @@ ORTE_DECLSPEC int orte_rmgr_base_check_context_app(orte_app_context_t *context);
ORTE_DECLSPEC int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
bool want_chdir);
ORTE_DECLSPEC int orte_rmgr_base_set_vpid_range(orte_jobid_t jobid, orte_vpid_t start, orte_vpid_t range);
ORTE_DECLSPEC int orte_rmgr_base_get_vpid_range(orte_jobid_t jobid, orte_vpid_t *start, orte_vpid_t *range);
ORTE_DECLSPEC int orte_rmgr_base_set_proc_info(const orte_process_name_t* name, pid_t pid, char * nodename);
ORTE_DECLSPEC int orte_rmgr_base_get_proc_info(const orte_process_name_t* name, pid_t* pid, char **nodename);

View File

@ -75,14 +75,6 @@ static int orte_rmgr_cnos_check_context_app(orte_app_context_t *context);
static int orte_rmgr_cnos_check_context_cwd(orte_app_context_t *context,
bool want_chdir);
static int orte_rmgr_cnos_set_vpid_range(orte_jobid_t jobid,
orte_vpid_t start,
orte_vpid_t range);
static int orte_rmgr_cnos_get_vpid_range(orte_jobid_t jobid,
orte_vpid_t *start,
orte_vpid_t *range);
static orte_gpr_keyval_t* orte_rmgr_cnos_find_attribute(opal_list_t* attr_list, char* key);
static int orte_rmgr_cnos_add_attribute(opal_list_t* attr_list, char* key,
@ -114,8 +106,6 @@ orte_rmgr_base_module_t orte_rmgr_cnos_module = {
orte_rmgr_cnos_put_app_context,
orte_rmgr_cnos_check_context_cwd,
orte_rmgr_cnos_check_context_app,
orte_rmgr_cnos_set_vpid_range,
orte_rmgr_cnos_get_vpid_range,
orte_rmgr_cnos_set_proc_info,
orte_rmgr_cnos_get_proc_info
};
@ -196,20 +186,6 @@ static int orte_rmgr_cnos_check_context_cwd(orte_app_context_t *context,
return ORTE_ERR_NOT_SUPPORTED;
}
static int orte_rmgr_cnos_set_vpid_range(orte_jobid_t jobid,
orte_vpid_t start,
orte_vpid_t range)
{
return ORTE_ERR_NOT_SUPPORTED;
}
static int orte_rmgr_cnos_get_vpid_range(orte_jobid_t jobid,
orte_vpid_t *start,
orte_vpid_t *range)
{
return ORTE_ERR_NOT_SUPPORTED;
}
static orte_gpr_keyval_t* orte_rmgr_cnos_find_attribute(opal_list_t* attr_list, char* key)
{
return NULL;

View File

@ -74,8 +74,6 @@ orte_rmgr_base_module_t orte_rmgr_proxy_module = {
orte_rmgr_base_put_app_context,
orte_rmgr_base_check_context_cwd,
orte_rmgr_base_check_context_app,
orte_rmgr_base_set_vpid_range,
orte_rmgr_base_get_vpid_range,
orte_rmgr_base_set_proc_info,
orte_rmgr_base_get_proc_info
};

View File

@ -242,25 +242,6 @@ typedef int (*orte_rmgr_base_module_check_context_cwd_fn_t)(orte_app_context_t *
*/
typedef int (*orte_rmgr_base_module_check_context_app_fn_t)(orte_app_context_t *context);
/**
* VPID FUNCTIONS
*/
/**
* Store the vpid range of a job
*/
typedef int (*orte_rmgr_base_module_set_vpid_range_fn_t)(orte_jobid_t jobid,
orte_vpid_t start,
orte_vpid_t range);
/**
* Retrieve the vpid range of a job
*/
typedef int (*orte_rmgr_base_module_get_vpid_range_fn_t)(orte_jobid_t jobid,
orte_vpid_t *start,
orte_vpid_t *range);
/**
* Set the process' local PID
*/
@ -290,8 +271,6 @@ struct orte_rmgr_base_module_2_0_0_t {
orte_rmgr_base_module_store_app_context_fn_t store_app_context;
orte_rmgr_base_module_check_context_cwd_fn_t check_context_cwd;
orte_rmgr_base_module_check_context_app_fn_t check_context_app;
orte_rmgr_base_module_set_vpid_range_fn_t set_vpid_range;
orte_rmgr_base_module_get_vpid_range_fn_t get_vpid_range;
orte_rmgr_base_module_set_process_info_fn_t set_process_info;
orte_rmgr_base_module_get_process_info_fn_t get_process_info;
};

View File

@ -89,8 +89,6 @@ orte_rmgr_base_module_t orte_rmgr_urm_module = {
orte_rmgr_base_put_app_context,
orte_rmgr_base_check_context_cwd,
orte_rmgr_base_check_context_app,
orte_rmgr_base_set_vpid_range,
orte_rmgr_base_get_vpid_range,
orte_rmgr_base_set_proc_info,
orte_rmgr_base_get_proc_info
};

View File

@ -27,5 +27,5 @@ here's some additional information (which may only be relevant to an
Open MPI developer):
%s failed
--> Returned value %d instead of ORTE_SUCCESS
--> Returned value %s (%d) instead of ORTE_SUCCESS

View File

@ -206,7 +206,7 @@ int orte_init_stage1(bool infrastructure)
/*
* Initialize the daemon launch system so those types
* are registered (needed by the sds to talk to its
* local daemon)
* local daemon)
*/
if (ORTE_SUCCESS != (ret = orte_odls_base_open())) {
ORTE_ERROR_LOG(ret);
@ -282,10 +282,9 @@ int orte_init_stage1(bool infrastructure)
/*
* Now that we know for certain if we are an HNP and/or a daemon,
* setup the resource management frameworks. This includes opening
* and selecting the daemon launch framework - that framework "knows"
* what to do if it isn't in a daemon, and everyone needs that framework
* to at least register its datatypes.
* setup the resource management frameworks. This includes
* selecting the daemon launch framework - that framework "knows"
* what to do if it isn't in a daemon.
*/
if (ORTE_SUCCESS != (ret = orte_rds_base_open())) {
ORTE_ERROR_LOG(ret);
@ -420,12 +419,6 @@ int orte_init_stage1(bool infrastructure)
}
OBJ_RELEASE(app);
if (ORTE_SUCCESS != (ret = orte_rmgr.set_vpid_range(my_jobid,0,1))) {
ORTE_ERROR_LOG(ret);
error = "orte_rmgr.set_vpid_range for singleton/seed";
goto error;
}
if (orte_process_info.singleton) {
/* setup a fake node structure - this is required to support
* the MPI attributes function that is sitting on a trigger
@ -734,7 +727,7 @@ error:
if (ret != ORTE_SUCCESS) {
opal_show_help("help-orte-runtime",
"orte_init:startup:internal-failure",
true, error, ret);
true, error, ORTE_ERROR_NAME(ret), ret);
}
return ret;

View File

@ -109,12 +109,12 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
goto CLEANUP;
}
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs for job %ld",
ORTE_NAME_ARGS(orte_process_info.my_name), (long)jobs[0]);
}
for (n=0; n < num_jobs; n++) {
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs for job %ld",
ORTE_NAME_ARGS(orte_process_info.my_name), (long)jobs[n]);
}
if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(jobs[n], true))) {
ORTE_ERROR_LOG(ret);
}
@ -382,7 +382,7 @@ static void halt_vm(void)
/* terminate the vm - this will also wake us up so we can exit */
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_DAEMON_HARD_KILL, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
ret = orte_pls.terminate_orteds(0, &orte_abort_timeout, &attrs);
ret = orte_pls.terminate_orteds(&orte_abort_timeout, &attrs);
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);