First cut at direct launch for TM. Able to launch non-ORTE procs and detect their completion for a clean shutdown.
This commit was SVN r17732.
This commit is contained in:
parent 10c2ce7d35
commit 06d3145fe4
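Note: the diff below drives the PBS/Torque TM API directly from the HNP instead of launching orted daemons on the allocated nodes. A minimal sketch of the spawn half of that pattern, using the same TM calls the module relies on (tm_spawn() only queues a request, tm_poll() reaps the completion event); launch_one() and its error handling are illustrative only and are not part of this commit:

#include <stdio.h>
#include "tm.h"    /* PBS/Torque task management API */

/* Spawn one process on a TM node and wait for the spawn event to
 * complete.  tm_spawn() only queues the request; the returned event
 * must be reaped with tm_poll() before the task id is usable.
 */
static int launch_one(int argc, char **argv, char **env,
                      tm_node_id where, tm_task_id *tid)
{
    tm_event_t event, polled;
    int rc, local_err;

    rc = tm_spawn(argc, argv, env, where, tid, &event);
    if (TM_SUCCESS != rc) {
        fprintf(stderr, "tm_spawn failed: %d\n", rc);
        return -1;
    }

    /* block until the outstanding spawn event completes */
    rc = tm_poll(TM_NULL_EVENT, &polled, 1, &local_err);
    if (TM_SUCCESS != rc) {
        fprintf(stderr, "tm_poll failed: rc=%d tm_errno=%d\n", rc, local_err);
        return -1;
    }
    return 0;
}

The module batches all of its tm_spawn() requests first and then polls once per launched task, which is the same idea applied to the whole job.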
@@ -64,8 +64,10 @@
#include "opal/runtime/opal_progress.h"

#include "orte/util/name_fns.h"
#include "orte/util/context_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"

@@ -73,7 +75,8 @@
#include "orte/mca/plm/base/plm_private.h"
#include "plm_tmd.h"


/* define frequency of checking job complete */
#define PLM_CHECK_COMPLETE_TIME 1

/*
* Local functions
@@ -88,6 +91,13 @@ static int plm_tmd_finalize(void);
static int plm_tmd_connect(void);
static int plm_tmd_disconnect(void);

/*
* Local "global" variables
*/
static tm_task_id *tm_task_ids = NULL;
static int num_tids;
static orte_job_t *active_jdata;

/*
* Global variable
*/
@@ -115,6 +125,89 @@ static int plm_tmd_init(void)
}


/* check for job completion */
static void check_job_complete(int fd, short evnt, void *arg)
{
tm_event_t event, *tm_events;
int *exit_codes, i, rc, local_err;
orte_vpid_t v;
orte_proc_t **procs;

opal_output(0, "checking job completion");

tm_events = malloc(sizeof(tm_event_t) * num_tids);
exit_codes = malloc(sizeof(int) * num_tids);

/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}

/* cycle through the task_id's and check for complete */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_obit(tm_task_ids[i], exit_codes + i, tm_events + i))) {
opal_output(0, "CANNOT CHECK PROC %d", i);
}
}
opal_output(0, "obits requested");


/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to poll obit, return status = %d", rc);
}
}
opal_output(0, "obits polled");


/* store the exit codes */
procs = (orte_proc_t**)active_jdata->procs->addr;
for (v=0; v < active_jdata->num_procs; v++) {
procs[v]->exit_code = exit_codes[v];
opal_output(0, "rank %d ecode %d", (int)v, exit_codes[v]);
if (WIFEXITED(exit_codes[v])) {
if (procs[v]->state < ORTE_PROC_STATE_TERMINATED) {
procs[v]->state = ORTE_PROC_STATE_TERMINATED;
active_jdata->num_terminated++;
}
} else {
procs[v]->state = ORTE_PROC_STATE_ABORTED_BY_SIG;
active_jdata->abort = true;
active_jdata->aborted_proc = procs[v];
active_jdata->state = ORTE_JOB_STATE_ABORTED;
}
}

/* disconnect from mom */
plm_tmd_disconnect();

cleanup:
/* free tm_events */
free(tm_events);
free(exit_codes);

/* check for completion */
if (active_jdata->num_terminated >= active_jdata->num_procs) {
active_jdata->state = ORTE_JOB_STATE_TERMINATED;
orte_wakeup(0);
} else if (active_jdata->state == ORTE_JOB_STATE_ABORTED &&
!orte_abnormal_term_ordered && !orte_abort_in_progress) {
orte_errmgr.proc_aborted(&(active_jdata->aborted_proc->name),
active_jdata->aborted_proc->exit_code);
}

/* reset the timer */
ORTE_TIMER_EVENT(1, check_job_complete);
opal_output(0, "timer reset");

}


/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
@@ -124,35 +217,22 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
orte_job_map_t *map = NULL;
orte_app_context_t **apps;
orte_node_t **nodes;
int node_name_index;
int proc_vpid_index;
char *param;
orte_proc_t **procs;
char *param, *param2;
char **env = NULL;
char *var;
char **argv = NULL;
int argc;
int rc;
bool connected = false;
orte_std_cntr_t launched = 0, i;
orte_std_cntr_t launched = 0, i, n;
char *bin_base = NULL, *lib_base = NULL;
tm_event_t *tm_events = NULL;
tm_task_id *tm_task_ids = NULL;
int local_err;
tm_event_t event;
struct timeval launchstart, launchstop, completionstart, completionstop;
struct timeval jobstart, jobstop;
int maxtime=0, mintime=99999999, maxiter = 0, miniter = 0, deltat;
float avgtime=0.0;
bool failed_launch = true;
mode_t current_umask;
orte_vpid_t v;
tm_event_t *tm_events = NULL;

/* record who we are working on */
active_jdata = jdata;


/* check for timing request - get start time if so */
if (orte_timing) {
if (0 != gettimeofday(&jobstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain job start time");
}
}
/* create a jobid for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
ORTE_ERROR_LOG(rc);
@@ -178,175 +258,299 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
}
apps = (orte_app_context_t**)jdata->apps->addr;
nodes = (orte_node_t**)map->nodes->addr;
procs = (orte_proc_t**)jdata->procs->addr;

if (0 == map->num_new_daemons) {
/* have all the daemons we need - launch app */
goto launch_apps;
}

/* Allocate a bunch of TM events to use for tm_spawn()ing */
tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
/* Allocate a bunch of TM events to use for tm_spawn()ing - we will
* need one/process since we are not launching daemons
*/
tm_events = malloc(sizeof(tm_event_t) * jdata->num_procs);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
tm_task_ids = malloc(sizeof(tm_task_id) * jdata->num_procs);
if (NULL == tm_task_ids) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
num_tids = jdata->num_procs;

/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
&node_name_index);

if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: final top-level argv:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
/* Figure out the basenames for the libdir and bindir. There is a
lengthy comment about this in plm_rsh_module.c explaining all
the rationale for how / why we're doing this. */
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);


/* setup the environment for each app_context - we need to insert enviro
* values that tell the procs how to bootstrap themselves
*/
for (n=0; n < jdata->num_apps; n++) {
orte_app_context_t *context = apps[n];
char ***env = &context->env;

/* setup base environment: copy the current environ and merge
* in the app context environ
*/
if (NULL != context->env) {
*env = opal_environ_merge(orte_launch_environ, context->env);
} else {
*env = opal_argv_copy(orte_launch_environ);
}

/* Try to change to the context cwd and check that the app
* exists and is executable The function will
* take care of outputting a pretty error message, if required
*/
if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(context, true))) {
/* do not ERROR_LOG - it will be reported elsewhere */
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_util_check_context_app(context))) {
/* do not ERROR_LOG - it will be reported elsewhere */
goto cleanup;
}

/* special case handling for --prefix: this is somewhat icky,
* but at least some users do this. :-\ It is possible that
* when using --prefix, the user will also "-x PATH" and/or
* "-x LD_LIBRARY_PATH", which would therefore clobber the
* work that was done in the prior pls to ensure that we have
* the prefix at the beginning of the PATH and
* LD_LIBRARY_PATH. So examine the context->env and see if we
* find PATH or LD_LIBRARY_PATH. If found, that means the
* prior work was clobbered, and we need to re-prefix those
* variables.
*/
for (i = 0; NULL != context->env && NULL != context->env[i]; ++i) {
char *newenv;

/* Reset PATH */
if (0 == strncmp("PATH=", context->env[i], 5)) {
asprintf(&newenv, "%s/%s:%s",
apps[n]->prefix_dir, bin_base, context->env[i] + 5);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tmd: resetting PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("PATH", newenv, true, env);
free(newenv);
}

/* Reset LD_LIBRARY_PATH */
else if (0 == strncmp("LD_LIBRARY_PATH=", context->env[i], 16)) {
asprintf(&newenv, "%s/%s:%s",
apps[n]->prefix_dir, lib_base, context->env[i] + 16);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting LD_LIBRARY_PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("LD_LIBRARY_PATH", newenv, true, env);
free(newenv);
}
}

/* there will be no local daemon */
param = mca_base_param_environ_variable("orte","local_daemon","uri");
opal_unsetenv(param, env);
free(param);

/* pass the hnp's contact info to the local proc */
param = mca_base_param_environ_variable("orte","hnp","uri");
opal_setenv(param, orte_process_info.my_hnp_uri, true, env);
free(param);

/* set the app_context number into the environment */
param = mca_base_param_environ_variable("orte","app","num");
asprintf(&param2, "%ld", (long)context->idx);
opal_setenv(param, param2, true, env);
free(param);
free(param2);

/* set the universe size in the environment */
param = mca_base_param_environ_variable("orte","universe","size");
asprintf(&param2, "%ld", (long)jdata->total_slots_alloc);
opal_setenv(param, param2, true, env);
free(param);

/* although the total_slots_alloc is the universe size, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_UNIVERSE_SIZE", param2, true, env);
free(param2);

/* tell the proc to use the "envd" - i.e., the environment direct
* ESS module to set itself up
*/
param = mca_base_param_environ_variable("ess",NULL,"NULL");
opal_setenv(param, "envd", true, env);
free(param);

/* since we want to pass the name as separate components, make sure
* that the "name" environmental variable is cleared!
*/
if(NULL == (param = mca_base_param_environ_variable("orte","ess","name"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_unsetenv(param, env);
free(param);

/* tell the proc the jobid - same for everyone */
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&param2, jdata->jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if(NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, env);
free(param);
free(param2);

/* tell the proc the #procs in this job */
asprintf(&param2, "%ld", (long) jdata->num_procs);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, param2, true, env);
free(param);

/* although the num_procs is the comm_world size, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_SIZE", param2, true, env);
free(param2);

}


rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
connected = true;

/* Figure out the basenames for the libdir and bindir. There is a
lengthy comment about this in plm_rsh_module.c explaining all
the rationale for how / why we're doing this. */
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);

/* setup environment */
env = opal_argv_copy(environ);

/* add our umask -- see big note in orted.c */
current_umask = umask(0);
umask(current_umask);
asprintf(&var, "0%o", current_umask);
opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
free(var);

/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables. We only allow
a single prefix to be specified. Since there will
always be at least one app_context, we take it from
there
*/
if (NULL != apps[0]->prefix_dir) {
char *newenv;
/* Iterate through each of the procs and launch it */
for (v = 0; v < jdata->num_procs; v++) {
orte_proc_t *proc = procs[v];
orte_node_t* node = proc->node;
orte_app_context_t *context = apps[proc->app_idx];
char **environ_copy;
int argc;

for (i = 0; NULL != env && NULL != env[i]; ++i) {
/* Reset PATH */
if (0 == strncmp("PATH=", env[i], 5)) {
asprintf(&newenv, "%s/%s:%s",
apps[0]->prefix_dir, bin_base, env[i] + 5);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("PATH", newenv, true, &env);
free(newenv);
}

/* Reset LD_LIBRARY_PATH */
else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
asprintf(&newenv, "%s/%s:%s",
apps[0]->prefix_dir, lib_base, env[i] + 16);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting LD_LIBRARY_PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
free(newenv);
}
}
}

/* Iterate through each of the nodes and spin
* up a daemon.
*/
for (i = 0; i < map->num_nodes; i++) {
orte_node_t* node = nodes[i];
char* vpid_string;
/* copy the environment */
environ_copy = opal_argv_copy(context->env);

/* if this daemon already exists, don't launch it! */
if (node->daemon_launched) {
continue;
/* pass the proc's vpid */
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&param2, proc->name.vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}

/* setup node name */
free(argv[node_name_index]);
argv[node_name_index] = strdup(node->name);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
/* although the vpid IS the process' rank within the job, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_RANK", param2, true, &environ_copy);
free(param2); /* done with this now */

/* pass the node's launch_id to the process as a "nodeid" so it can
* identify which procs are local to it
*/
asprintf(&param2, "%ld", (long)node->launch_id);
param = mca_base_param_environ_variable("orte","nodeid",NULL);
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);
/* ensure we use the same nodename */
param = mca_base_param_environ_variable("orte", "base", "nodename");
opal_setenv(param, node->name, true, &environ_copy);
free(param);

/* set the local rank */
asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","local_rank"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);

/* setup yield schedule and processor affinity
* We default here to always setting the affinity processor if we want
* it. The processor affinity system then determines
* if processor affinity is enabled/requested - if so, it then uses
* this value to select the process to which the proc is "assigned".
* Otherwise, the paffinity subsystem just ignores this value anyway
*/
param = mca_base_param_environ_variable("mpi", NULL,
"paffinity_processor");
asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);

if (node->oversubscribed) {
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(param, "1", false, &environ_copy);
} else {
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(param, "0", false, &environ_copy);
}
free(param);

asprintf(&param2, "%ld", (long) node->num_procs);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_local_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);

OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));

/* setup process name */
rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid);
if (ORTE_SUCCESS != rc) {
opal_output(0, "plm:tm: unable to get daemon vpid as string");
exit(-1);
}
free(argv[proc_vpid_index]);
argv[proc_vpid_index] = strdup(vpid_string);
free(vpid_string);
/* set the argc count */
argc = opal_argv_count(context->argv);

/* exec the daemon */
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: executing:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
}

/* check for timing request - get start time if so */
if (orte_timing) {
if (0 != gettimeofday(&launchstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain start time");
launchstart.tv_sec = 0;
launchstart.tv_usec = 0;
}
}

rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
rc = tm_spawn(argc, context->argv, environ_copy, node->launch_id, tm_task_ids + launched, tm_events + launched);
if (TM_SUCCESS != rc) {
opal_show_help("help-plm-tm.txt", "tm-spawn-failed",
true, argv[0], node->name, node->launch_id);
true, context->argv[0], node->name, node->launch_id);
rc = ORTE_ERROR;
goto cleanup;
}

/* check for timing request - get stop time and process if so */
if (orte_timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain stop time");
} else {
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
(launchstop.tv_usec - launchstart.tv_usec);
avgtime = avgtime + deltat / map->num_new_daemons;
if (deltat < mintime) {
mintime = deltat;
miniter = launched;
}
if (deltat > maxtime) {
maxtime = deltat;
maxiter = launched;
}
}
}

opal_argv_free(environ_copy);
launched++;

/* Allow some progress to occur */
@@ -354,18 +558,9 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
}

OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm:launch: finished spawning orteds",
"%s plm:tm:launch: finished spawning procs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

/* check for timing request - get start time for launch completion */
if (orte_timing) {
if (0 != gettimeofday(&completionstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain completion start time");
completionstart.tv_sec = 0;
completionstart.tv_usec = 0;
}
}

/* TM poll for all the spawns */
for (i = 0; i < launched; ++i) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
@@ -375,17 +570,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
goto cleanup;
}
}

/* wait for daemons to callback */
if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: daemon launch failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
goto cleanup;
}

launch_apps:
#if 0
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launch of apps failed for job %s on error %s",
@@ -393,30 +578,19 @@ launch_apps:
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
goto cleanup;
}

#endif
/* if we get here, then everything launched okay - record that fact */
failed_launch = false;

/* check for timing request - get stop time for launch completion and report */
if (orte_timing) {
if (0 != gettimeofday(&completionstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain completion stop time");
} else {
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
(launchstop.tv_usec - launchstart.tv_usec);
opal_output(0, "plm_tmd: launch completion required %d usec", deltat);
}
opal_output(0, "plm_tmd: Launch statistics:");
opal_output(0, "plm_tmd: Average time to launch an orted: %f usec", avgtime);
opal_output(0, "plm_tmd: Max time to launch an orted: %d usec at iter %d", maxtime, maxiter);
opal_output(0, "plm_tmd: Min time to launch an orted: %d usec at iter %d", mintime, miniter);
}

/* setup periodic timer so we check for job completion */
ORTE_TIMER_EVENT(1, check_job_complete);

cleanup:
if (NULL != argv) {
opal_argv_free(argv);
/* can't reuse the events, so may as well get rid of them */
if (NULL != tm_events) {
free(tm_events);
}

if (NULL != env) {
opal_argv_free(env);
}
@@ -424,13 +598,6 @@ launch_apps:
if (connected) {
plm_tmd_disconnect();
}
if (NULL != tm_events) {
free(tm_events);
}
if (NULL != tm_task_ids) {
free(tm_task_ids);
}

if (NULL != lib_base) {
free(lib_base);
}
@@ -443,17 +610,6 @@ launch_apps:
orte_plm_base_launch_failed(jdata->jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
}

/* check for timing request - get stop time and process if so */
if (orte_timing) {
if (0 != gettimeofday(&jobstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain stop time");
} else {
deltat = (jobstop.tv_sec - jobstart.tv_sec)*1000000 +
(jobstop.tv_usec - jobstart.tv_usec);
opal_output(0, "plm_tmd: launch of entire job required %d usec", deltat);
}
}

OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm:launch: finished",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -465,41 +621,119 @@ launch_apps:
static int plm_tmd_terminate_job(orte_jobid_t jobid)
{
int rc;
int i, local_err;
tm_event_t event, *tm_events;

/* order all of the daemons to kill their local procs for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(jobid))) {
tm_events = malloc(sizeof(tm_event_t) * num_tids);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
return rc;
}

return rc;
/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}

/* cycle through the task_id's and kill them */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], SIGTERM, &tm_events[i]))) {
opal_output(0, "CANNOT KILL PROC %d", i);
}
}

/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to kill proc, return status = %d", rc);
}
}

/* disconnect from mom */
plm_tmd_disconnect();

cleanup:
/* free tm_events */
free(tm_events);

return ORTE_SUCCESS;
}


/**
* Terminate the orteds for a given job
*/
int plm_tmd_terminate_orteds(void)
static int plm_tmd_terminate_orteds(void)
{
int rc;

/* now tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) {
ORTE_ERROR_LOG(rc);
orte_job_t *daemons;
int data=1;

/* fake the system into thinking the orteds
* reported their clean termination
*/
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
daemons->num_terminated = daemons->num_procs;

return rc;
/* fire the trigger indicating that the orteds are gone */
write(orteds_exit, &data, sizeof(int));
opal_progress();

return ORTE_SUCCESS;
}

static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal)
{
int rc;
int i, local_err;
tm_event_t event, *tm_events;

/* order them to pass this signal to their local procs */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
tm_events = malloc(sizeof(tm_event_t) * num_tids);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
return rc;
}

return rc;
/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}

/* cycle through the task_id's and kill them */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], signal, &tm_events[i]))) {
opal_output(0, "CANNOT SIGNAL PROC %d", i);
}
}

/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to signal proc, return status = %d", rc);
goto cleanup;
}
}

/* disconnect from mom */
plm_tmd_disconnect();

cleanup:
/* free tm_events */
free(tm_events);

return ORTE_SUCCESS;
}


@@ -515,6 +749,10 @@ static int plm_tmd_finalize(void)
ORTE_ERROR_LOG(rc);
}

if (NULL != tm_task_ids) {
free(tm_task_ids);
}

return ORTE_SUCCESS;
}

@@ -245,6 +245,27 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
*(event) = tmp; \
}while(0); \


/**
* There are places in the code where we just want to periodically
* wakeup to do something, and then go back to sleep again. Setting
* a timer allows us to do this
*/
#define ORTE_TIMER_EVENT(time, cbfunc) \
do { \
struct timeval now; \
opal_event_t *tmp; \
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
opal_evtimer_set(tmp, (cbfunc), tmp); \
now.tv_sec = (time); \
now.tv_usec = 0; \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining timer event: %ld sec", \
(long)now.tv_sec)); \
opal_evtimer_add(tmp, &now); \
}while(0); \


/**
* \internal
*
@@ -923,8 +923,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
*/
static void abort_signal_callback(int fd, short flags, void *arg)
{
opal_event_t *event;

/* if we have already ordered this once, or we are already
* aborting the job, don't keep doing it to avoid race conditions
*/
@@ -942,7 +940,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
(which is a Bad Thing), so we can't call it directly.
Instead, we have to exit this handler and setup to call
job_completed() after this. */
ORTE_DETECT_TIMEOUT(&event, 0, 0, 1, abort_exit_callback);
ORTE_TIMER_EVENT(0, abort_exit_callback);
}

/**
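Note: the completion-detection half (check_job_complete() in the diff above) uses the same request/poll pattern, but with obituaries instead of spawns. A minimal standalone sketch, assuming the task ids were filled in by tm_spawn(); wait_for_exits() is a hypothetical helper, and unlike this sketch the module does not block in a loop but re-arms ORTE_TIMER_EVENT and classifies each exit code with WIFEXITED():

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include "tm.h"    /* PBS/Torque task management API */

/* Request an obituary for every spawned task, then reap one event per
 * successful request.  exit_codes[i] is only valid once its obit event
 * has been polled; WIFEXITED() mirrors how the module classifies exits.
 */
static int wait_for_exits(tm_task_id *tids, int num_tids, int *exit_codes)
{
    tm_event_t *events, polled;
    int i, rc, local_err, requested = 0, failures = 0;

    events = (tm_event_t*)malloc(sizeof(tm_event_t) * num_tids);
    if (NULL == events) {
        return -1;
    }

    /* queue an obit request per task id */
    for (i = 0; i < num_tids; i++) {
        exit_codes[i] = 0;
        if (TM_SUCCESS != tm_obit(tids[i], &exit_codes[i], &events[i])) {
            fprintf(stderr, "tm_obit failed for task %d\n", i);
            failures++;
        } else {
            requested++;
        }
    }

    /* reap one completion per outstanding request */
    for (i = 0; i < requested; i++) {
        rc = tm_poll(TM_NULL_EVENT, &polled, 1, &local_err);
        if (TM_SUCCESS != rc) {
            fprintf(stderr, "tm_poll failed: rc=%d tm_errno=%d\n", rc, local_err);
            failures++;
        }
    }

    /* anything not exiting normally corresponds to the ABORTED_BY_SIG path */
    for (i = 0; i < num_tids; i++) {
        if (!WIFEXITED(exit_codes[i])) {
            failures++;
        }
    }

    free(events);
    return failures;
}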