openmpi/orte/mca/plm/tmd/plm_tmd_module.c

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved. 
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 *
 * These symbols are in a file by themselves to provide nice linker
 * semantics.  Since linkers generally pull in symbols by object
 * files, keeping these symbols as the only symbols in this file
 * prevents utility programs such as "ompi_info" from having to import
 * entire components just to query their version and parameters.
 */

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <signal.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#ifdef HAVE_SCHED_H
#include <sched.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <errno.h>
#include <tm.h>

#include "opal/mca/installdirs/installdirs.h"
#include "opal/threads/condition.h"
#include "opal/event/event.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/show_help.h"
#include "opal/util/path.h"
#include "opal/util/basename.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/runtime/opal_progress.h"

#include "orte/util/name_fns.h"
#include "orte/util/context_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"

#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "plm_tmd.h"

/* define frequency of checking job complete */
#define PLM_CHECK_COMPLETE_TIME 1

/*
 * Local functions
 */
static int plm_tmd_init(void);
static int plm_tmd_launch_job(orte_job_t *jdata);
static int plm_tmd_terminate_job(orte_jobid_t jobid);
static int plm_tmd_terminate_orteds(void);
static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal);
static int plm_tmd_finalize(void);

static int plm_tmd_connect(void);
static int plm_tmd_disconnect(void);

/*
 * Local "global" variables
 */
static tm_task_id *tm_task_ids = NULL;
static int num_tids;
static orte_job_t *active_jdata;

/*
 * Global variable
 */
orte_plm_base_module_t orte_plm_tmd_module = {
    plm_tmd_init,
    orte_plm_base_set_hnp_name,
    plm_tmd_launch_job,
    plm_tmd_terminate_job,
    plm_tmd_terminate_orteds,
    plm_tmd_signal_job,
    plm_tmd_finalize
};

/**
* Init the module
 */
static int plm_tmd_init(void)
{
    int rc;
    
    if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}


/* check for job completion */
static void check_job_complete(int fd, short evnt, void *arg)
{
    tm_event_t event, *tm_events;
    int *exit_codes, i, rc, local_err;
    orte_vpid_t v;
    orte_proc_t **procs;
    
opal_output(0, "checking job completion");

    tm_events = malloc(sizeof(tm_event_t) * num_tids);
    exit_codes = malloc(sizeof(int) * num_tids);
    
    /* connect to the mom */
    rc = plm_tmd_connect();
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "CANNOT CONNECT TO MOM");
        goto cleanup;
    }
    
    /* cycle through the task_id's and check for complete */
    for (i=0; i < num_tids; i++) {
        if (ORTE_SUCCESS != (rc = tm_obit(tm_task_ids[i], exit_codes + i, tm_events + i))) {
            opal_output(0, "CANNOT CHECK PROC %d", i);
        }
    }
    opal_output(0, "obits requested");
    
    
    /* cycle through and poll the events */
    for (i=0; i < num_tids; i++) {
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != rc) {
            errno = local_err;
            opal_output(0, "plm:tmd: failed to poll obit, return status = %d", rc);
        }
    }
    opal_output(0, "obits polled");
    
    
    /* store the exit codes */
    procs = (orte_proc_t**)active_jdata->procs->addr;
    for (v=0; v < active_jdata->num_procs; v++) {
        procs[v]->exit_code = exit_codes[v];
        ORTE_UPDATE_EXIT_STATUS(exit_codes[v]);
        opal_output(0, "rank %d ecode %d", (int)v, exit_codes[v]);
        if (WIFEXITED(exit_codes[v])) {
            if (procs[v]->state < ORTE_PROC_STATE_TERMINATED) {
                procs[v]->state = ORTE_PROC_STATE_TERMINATED;
                active_jdata->num_terminated++;
            }
        } else {
            procs[v]->state = ORTE_PROC_STATE_ABORTED_BY_SIG;
            active_jdata->abort = true;
            active_jdata->aborted_proc = procs[v];
            active_jdata->state = ORTE_JOB_STATE_ABORTED;
        }
    }
    
    /* disconnect from mom */
    plm_tmd_disconnect();
    
cleanup:
    /* free tm_events */
    free(tm_events);
    free(exit_codes);

    /* check for completion */
    if (active_jdata->num_terminated >= active_jdata->num_procs) {
        active_jdata->state = ORTE_JOB_STATE_TERMINATED;
        orte_wakeup();
    } else if (active_jdata->state == ORTE_JOB_STATE_ABORTED &&
               !orte_abnormal_term_ordered && !orte_abort_in_progress) {
        orte_errmgr.proc_aborted(&(active_jdata->aborted_proc->name),
                                 active_jdata->aborted_proc->exit_code);
    }

    /* reset the timer */
    ORTE_TIMER_EVENT(1, check_job_complete);
    opal_output(0, "timer reset");
    
}


/* When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static int plm_tmd_launch_job(orte_job_t *jdata)
{
    orte_job_map_t *map = NULL;
    orte_app_context_t **apps;
    orte_node_t **nodes;
    orte_proc_t **procs;
    char *param, *param2;
    char **env = NULL;
    int rc;
    bool connected = false;
    orte_std_cntr_t launched = 0, i, n; 
    char *bin_base = NULL, *lib_base = NULL;
    int local_err;
    tm_event_t event;
    bool failed_launch = true;
    orte_vpid_t v;
    tm_event_t *tm_events = NULL;

    /* record who we are working on */
    active_jdata = jdata;
    
    /* create a jobid for this job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm: launching job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));
    
    /* setup the job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* Get the map for this job */
    if (NULL == (map = orte_rmaps.get_job_map(jdata->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }
    apps = (orte_app_context_t**)jdata->apps->addr;
    nodes = (orte_node_t**)map->nodes->addr;
    procs = (orte_proc_t**)jdata->procs->addr;

    /* Allocate a bunch of TM events to use for tm_spawn()ing - we will
     * need one/process since we are not launching daemons
     */
    tm_events = malloc(sizeof(tm_event_t) * jdata->num_procs);
    if (NULL == tm_events) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    tm_task_ids = malloc(sizeof(tm_task_id) * jdata->num_procs);
    if (NULL == tm_task_ids) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    num_tids = jdata->num_procs;

    /* Figure out the basenames for the libdir and bindir.  There is a
        lengthy comment about this in plm_rsh_module.c explaining all
        the rationale for how / why we're doing this. */
    lib_base = opal_basename(opal_install_dirs.libdir);
    bin_base = opal_basename(opal_install_dirs.bindir);
    
    
    /* setup the environment for each app_context - we need to insert enviro
     * values that tell the procs how to bootstrap themselves
     */
    for (n=0; n < jdata->num_apps; n++) {
        orte_app_context_t *context = apps[n];
        char ***env = &context->env;
        
        /* setup base environment: copy the current environ and merge
         * in the app context environ
         */
        if (NULL != context->env) {
            *env = opal_environ_merge(orte_launch_environ, context->env);
        } else {
            *env = opal_argv_copy(orte_launch_environ);
        }
        
        /* Try to change to the context cwd and check that the app
         * exists and is executable The function will
         * take care of outputting a pretty error message, if required
         */
        if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(context, true))) {
            /* do not ERROR_LOG - it will be reported elsewhere */
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = orte_util_check_context_app(context))) {
            /* do not ERROR_LOG - it will be reported elsewhere */
            goto cleanup;
        }
        
        /* special case handling for --prefix: this is somewhat icky,
         * but at least some users do this.  :-\ It is possible that
         * when using --prefix, the user will also "-x PATH" and/or
         * "-x LD_LIBRARY_PATH", which would therefore clobber the
         * work that was done in the prior pls to ensure that we have
         * the prefix at the beginning of the PATH and
         * LD_LIBRARY_PATH.  So examine the context->env and see if we
         * find PATH or LD_LIBRARY_PATH.  If found, that means the
         * prior work was clobbered, and we need to re-prefix those
         * variables.
         */
        for (i = 0; NULL != context->env && NULL != context->env[i]; ++i) {
            char *newenv;
            
            /* Reset PATH */
            if (0 == strncmp("PATH=", context->env[i], 5)) {
                asprintf(&newenv, "%s/%s:%s", 
                         apps[n]->prefix_dir, bin_base, context->env[i] + 5);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:tmd: resetting PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     newenv));
                opal_setenv("PATH", newenv, true, env);
                free(newenv);
            }
            
            /* Reset LD_LIBRARY_PATH */
            else if (0 == strncmp("LD_LIBRARY_PATH=", context->env[i], 16)) {
                asprintf(&newenv, "%s/%s:%s", 
                         apps[n]->prefix_dir, lib_base, context->env[i] + 16);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:tm: resetting LD_LIBRARY_PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     newenv));
                opal_setenv("LD_LIBRARY_PATH", newenv, true, env);
                free(newenv);
            }
        }
        
        /* there will be no local daemon */
        param = mca_base_param_environ_variable("orte","local_daemon","uri");
        opal_unsetenv(param, env);
        free(param);
        
        /* pass the hnp's contact info to the local proc */
        param = mca_base_param_environ_variable("orte","hnp","uri");
        opal_setenv(param, orte_process_info.my_hnp_uri, true, env);
        free(param);
        
        /* set the app_context number into the environment */
        param = mca_base_param_environ_variable("orte","app","num");
        asprintf(&param2, "%ld", (long)context->idx);
        opal_setenv(param, param2, true, env);
        free(param);
        free(param2);
        
        /* set the universe size in the environment */
        param = mca_base_param_environ_variable("orte","universe","size");
        asprintf(&param2, "%ld", (long)jdata->total_slots_alloc);
        opal_setenv(param, param2, true, env);
        free(param);
        
        /* although the total_slots_alloc is the universe size, users
         * would appreciate being given a public environmental variable
         * that also represents this value - something MPI specific - so
         * do that here.
         *
         * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
         * We know - just live with it
         */
        opal_setenv("OMPI_UNIVERSE_SIZE", param2, true, env);
        free(param2);
        
        /* tell the proc to use the "envd" - i.e., the environment direct
         * ESS module to set itself up
         */
        param = mca_base_param_environ_variable("ess",NULL,"NULL");
        opal_setenv(param, "envd", true, env);
        free(param);
       
        /* since we want to pass the name as separate components, make sure
         * that the "name" environmental variable is cleared!
         */
        if(NULL == (param = mca_base_param_environ_variable("orte","ess","name"))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        opal_unsetenv(param, env);
        free(param);
        
        /* tell the proc the jobid - same for everyone */
        if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&param2, jdata->jobid))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if(NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        opal_setenv(param, param2, true, env);
        free(param);
        free(param2);
        
        /* tell the proc the #procs in this job */
        asprintf(&param2, "%ld", (long) jdata->num_procs);
        if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_procs"))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        opal_setenv(param, param2, true, env);
        free(param);
        
        /* although the num_procs is the comm_world size, users
         * would appreciate being given a public environmental variable
         * that also represents this value - something MPI specific - so
         * do that here.
         *
         * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
         * We know - just live with it
         */
        opal_setenv("OMPI_COMM_WORLD_SIZE", param2, true, env);
        free(param2);
        
    }
    
    rc = plm_tmd_connect();
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }
    connected = true;

    /* Iterate through each of the procs and launch it */
    for (v = 0; v < jdata->num_procs; v++) {
        orte_proc_t *proc = procs[v];
        orte_node_t* node = proc->node;
        orte_app_context_t *context = apps[proc->app_idx];
        char **environ_copy;
        int argc;
        
        /* copy the environment */
        environ_copy = opal_argv_copy(context->env);
        
        /* pass the proc's vpid */
        if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&param2, proc->name.vpid))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if(NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        opal_setenv(param, param2, true, &environ_copy);
        free(param);
        /* although the vpid IS the process' rank within the job, users
         * would appreciate being given a public environmental variable
         * that also represents this value - something MPI specific - so
         * do that here.
         *
         * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
         * We know - just live with it
         */
        opal_setenv("OMPI_COMM_WORLD_RANK", param2, true, &environ_copy);
        free(param2);  /* done with this now */

        /* pass the node's launch_id to the process as a "nodeid" so it can
         * identify which procs are local to it
         */
        asprintf(&param2, "%ld", (long)node->launch_id);
        param = mca_base_param_environ_variable("orte","nodeid",NULL);
        opal_setenv(param, param2, true, &environ_copy);
        free(param);
        free(param2);
        /* ensure we use the same nodename */
        param = mca_base_param_environ_variable("orte", "base", "nodename");
        opal_setenv(param, node->name, true, &environ_copy);
        free(param);
        
        /* set the local rank */
        asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
        if(NULL == (param = mca_base_param_environ_variable("orte","ess","local_rank"))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        opal_setenv(param, param2, true, &environ_copy);
        free(param);
        free(param2);
        
        /* setup yield schedule and processor affinity
         * We default here to always setting the affinity processor if we want
         * it. The processor affinity system then determines
         * if processor affinity is enabled/requested - if so, it then uses
         * this value to select the process to which the proc is "assigned".
         * Otherwise, the paffinity subsystem just ignores this value anyway
         */
        param = mca_base_param_environ_variable("mpi", NULL,
                                                "paffinity_processor");
        asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
        opal_setenv(param, param2, true, &environ_copy);
        free(param);
        free(param2);
        
        if (node->oversubscribed) {
            param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
            opal_setenv(param, "1", false, &environ_copy);
        } else {
            param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
            opal_setenv(param, "0", false, &environ_copy);
        }
        free(param);
        
        asprintf(&param2, "%ld", (long) node->num_procs);
        if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_local_procs"))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        opal_setenv(param, param2, true, &environ_copy);
        free(param);
        free(param2);
        
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tm: launching on node %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name));
        
        /* set the argc count */
        argc = opal_argv_count(context->argv);
        
        rc = tm_spawn(argc, context->argv, environ_copy, node->launch_id, tm_task_ids + launched, tm_events + launched);
        if (TM_SUCCESS != rc) {
            opal_show_help("help-plm-tm.txt", "tm-spawn-failed",
                           true, context->argv[0], node->name, node->launch_id);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        opal_argv_free(environ_copy);
        launched++;

        /* Allow some progress to occur */
        opal_event_loop(OPAL_EVLOOP_NONBLOCK);
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm:launch: finished spawning procs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* TM poll for all the spawns */
    for (i = 0; i < launched; ++i) {
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != rc) {
            errno = local_err;
            opal_output(0, "plm:tm: failed to poll for a spawned proc, return status = %d", rc);
            goto cleanup;
        }
    }
#if 0    
    if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tm: launch of apps failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }
#endif
    /* if we get here, then everything launched okay - record that fact */
    failed_launch = false;
    
    /* setup periodic timer so we check for job completion */
    ORTE_TIMER_EVENT(1, check_job_complete);
    
 cleanup:
    /* can't reuse the events, so may as well get rid of them */
    if (NULL != tm_events) {
        free(tm_events);
    }
    
    if (NULL != env) {
        opal_argv_free(env);
    }
    
    if (connected) {
        plm_tmd_disconnect();
    }
    if (NULL != lib_base) {
        free(lib_base);
    }
    if (NULL != bin_base) {
        free(bin_base);
    }

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_plm_base_launch_failed(jdata->jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
    }
        
    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm:launch: finished",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return rc;
}


static int plm_tmd_terminate_job(orte_jobid_t jobid)
{
    int rc;
    int i, local_err;
    tm_event_t event, *tm_events;
    
    tm_events = malloc(sizeof(tm_event_t) * num_tids);
    if (NULL == tm_events) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* connect to the mom */
    rc = plm_tmd_connect();
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "CANNOT CONNECT TO MOM");
        goto cleanup;
    }

    /* cycle through the task_id's and kill them */
    for (i=0; i < num_tids; i++) {
        if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], SIGTERM, &tm_events[i]))) {
            opal_output(0, "CANNOT KILL PROC %d", i);
        }
    }
    
    /* cycle through and poll the events */
    for (i=0; i < num_tids; i++) {
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != rc) {
            errno = local_err;
            opal_output(0, "plm:tmd: failed to kill proc, return status = %d", rc);
        }
    }

    /* disconnect from mom */
    plm_tmd_disconnect();

cleanup:
    /* free tm_events */
    free(tm_events);
    
    return ORTE_SUCCESS;
}


/**
 * Terminate the orteds for a given job
 */
static int plm_tmd_terminate_orteds(void)
{
    orte_job_t *daemons;
    int data=1;

    /* fake the system into thinking the orteds
     * reported their clean termination
     */
    if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    daemons->num_terminated = daemons->num_procs;
    
    /* fire the trigger indicating that the orteds are gone */
    write(orteds_exit, &data, sizeof(int));
    opal_progress();
    
    return ORTE_SUCCESS;
}

static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal)
{
    int rc;
    int i, local_err;
    tm_event_t event, *tm_events;
    
    tm_events = malloc(sizeof(tm_event_t) * num_tids);
    if (NULL == tm_events) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* connect to the mom */
    rc = plm_tmd_connect();
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "CANNOT CONNECT TO MOM");
        goto cleanup;
    }
    
    /* cycle through the task_id's and kill them */
    for (i=0; i < num_tids; i++) {
        if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], signal, &tm_events[i]))) {
            opal_output(0, "CANNOT SIGNAL PROC %d", i);
        }
    }
    
    /* cycle through and poll the events */
    for (i=0; i < num_tids; i++) {
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != rc) {
            errno = local_err;
            opal_output(0, "plm:tmd: failed to signal proc, return status = %d", rc);
            goto cleanup;
        }
    }
    
    /* disconnect from mom */
    plm_tmd_disconnect();
    
cleanup:
    /* free tm_events */
    free(tm_events);
    
    return ORTE_SUCCESS;
}


/*
 * Free stuff
 */
static int plm_tmd_finalize(void)
{
    int rc;
    
    /* cleanup any pending recvs */
    if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
        ORTE_ERROR_LOG(rc);
    }

    if (NULL != tm_task_ids) {
        free(tm_task_ids);
    }

    return ORTE_SUCCESS;
}


static int plm_tmd_connect(void)
{
    int ret;
    struct tm_roots tm_root;
    int count, progress;

    /* try a couple times to connect - might get busy signals every
       now and then */
    for (count = 0 ; count < 10; ++count) {
        ret = tm_init(NULL, &tm_root);
        if (TM_SUCCESS == ret) {
            return ORTE_SUCCESS;
        }

        for (progress = 0 ; progress < 10 ; ++progress) {
            opal_progress();
#if HAVE_SCHED_YIELD
            sched_yield();
#endif
        }
    }

    return ORTE_ERR_RESOURCE_BUSY;
}


static int plm_tmd_disconnect(void)
{
    tm_finalize();

    return ORTE_SUCCESS;
}