openmpi/orte/orted/orted_comm.c

/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Cisco, Inc.  All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"

#include <stdio.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>

#include "orte/orte_constants.h"

#include "opal/event/event.h"
#include "opal/mca/base/base.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/util/bit_ops.h"
#include "opal/util/cmd_line.h"
#include "opal/util/daemon_init.h"
#include "opal/util/opal_environ.h"
#include "opal/util/os_path.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/util/show_help.h"
#include "opal/util/trace.h"
#include "opal/util/argv.h"
#include "opal/runtime/opal.h"
#include "opal/mca/base/mca_base_param.h"


#include "orte/dss/dss.h"
#include "orte/class/orte_value_array.h"
#include "orte/util/sys_info.h"
#include "orte/util/proc_info.h"
#include "orte/util/univ_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/ras/ras.h"
#include "orte/mca/rds/rds.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/rmgr/base/rmgr_private.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/routed/routed.h"


#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"

#include "orte/orted/orted.h"

/*
 * Globals
 */
/* Cleared by each receive callback before it dispatches a message, and
 * set by process_commands() once it has handled an RML contact-info
 * update. If the flag is set after dispatch has finished, the callback
 * invokes orte_routed.warmup_routes().
 */
static bool warmup_routes;

/* Handle a message that arrived with the binomial routing mode: build a
 * relay copy of it, process the embedded command locally, then send the
 * relay copy on to the peers computed from this daemon's vpid.
 */
static int binomial_route_msg(orte_process_name_t *sender,
                              orte_buffer_t *buf,
                              orte_rml_tag_t tag);

/* Unpack one daemon command from the buffer and carry it out locally. */
static int process_commands(orte_process_name_t* sender,
                            orte_buffer_t *buffer,
                            orte_rml_tag_t tag);


/*
 * Receive callback for messages posted on ORTE_RML_TAG_ORTED_ROUTED.
 *
 * The first item in the buffer is the routing mode. A binomial-mode
 * message is handed to binomial_route_msg(); anything else is treated
 * as a plain daemon command and processed locally. All of that happens
 * while holding orted_comm_mutex. Once the lock has been dropped, the
 * non-persistent receive is posted again so the next message can arrive.
 *
 * status and cbdata are not examined.
 */
void orte_daemon_recv_routed(int status, orte_process_name_t* sender,
                             orte_buffer_t *buffer, orte_rml_tag_t tag,
                             void* cbdata)
{
    orte_daemon_cmd_flag_t mode;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orted_comm_mutex);

    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted_recv_routed: received message from %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(sender));
    }

    /* start every message with the warmup request cleared */
    warmup_routes = false;

    /* pull the routing mode off the front of the buffer, then dispatch */
    count = 1;
    rc = orte_dss.unpack(buffer, &mode, &count, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_DAEMON_ROUTE_BINOMIAL == mode) {
        /* binomial relay has its own handler */
        rc = binomial_route_msg(sender, buffer, tag);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
    } else {
        /* any other mode: the command is executed right here */
        rc = process_commands(sender, buffer, tag);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }

    /* the handlers may have asked for daemon-to-daemon routes to be
     * warmed up; this check runs on the error paths as well
     */
    if (warmup_routes) {
        rc = orte_routed.warmup_routes();
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }

    OPAL_THREAD_UNLOCK(&orted_comm_mutex);

    /* post the non-blocking receive again; "not implemented" is tolerated */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_ROUTED,
                                 ORTE_RML_NON_PERSISTENT, orte_daemon_recv_routed, NULL);
    if (ORTE_SUCCESS != rc && ORTE_ERR_NOT_IMPLEMENTED != rc) {
        ORTE_ERROR_LOG(rc);
    }
}

/*
 * Receive callback for messages posted on ORTE_RML_TAG_DAEMON.
 *
 * The buffer is passed straight to process_commands() while holding
 * orted_comm_mutex. Once the lock has been dropped, the non-persistent
 * receive is posted again so the next command can arrive.
 *
 * status and cbdata are not examined.
 */
void orte_daemon_recv(int status, orte_process_name_t* sender,
                      orte_buffer_t *buffer, orte_rml_tag_t tag,
                      void* cbdata)
{
    int rc;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orted_comm_mutex);

    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted_recv_cmd: received message from %s",
                    ORTE_NAME_PRINT(orte_process_info.my_name),
                    ORTE_NAME_PRINT(sender));
    }

    /* start every message with the warmup request cleared */
    warmup_routes = false;

    /* execute the command carried by this message */
    rc = process_commands(sender, buffer, tag);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    /* the command may have asked for daemon-to-daemon routes to be warmed up */
    if (warmup_routes) {
        rc = orte_routed.warmup_routes();
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }

    OPAL_THREAD_UNLOCK(&orted_comm_mutex);

    /* post the non-blocking receive again; "not implemented" is tolerated */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                                 ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
    if (ORTE_SUCCESS != rc && ORTE_ERR_NOT_IMPLEMENTED != rc) {
        ORTE_ERROR_LOG(rc);
    }
}

static int process_commands(orte_process_name_t* sender,
                            orte_buffer_t *buffer,
                            orte_rml_tag_t tag)
{
    orte_daemon_cmd_flag_t command;
    orte_buffer_t *relay;
    int ret;
    orte_std_cntr_t n;
    int32_t signal;
    orte_gpr_notify_data_t *ndat;
    orte_jobid_t *jobs, job;
    orte_std_cntr_t num_jobs;
    orte_rml_tag_t target_tag;
    opal_list_t attrs;
    opal_list_item_t *item;
    char *contact_info;
    orte_buffer_t *answer;
    orte_rml_cmd_flag_t rml_cmd;

    /* unpack the command */
    n = 1;
    if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /* now process the command locally */
    switch(command) {

        /****    KILL_LOCAL_PROCS   ****/
        case ORTE_DAEMON_KILL_LOCAL_PROCS:
            /* unpack the number of jobids */
            n = 1;
            if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &num_jobs, &n, ORTE_STD_CNTR))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }
            /* unpack the array of jobids */
            jobs = (orte_jobid_t*)malloc(num_jobs * sizeof(orte_jobid_t));
            if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, jobs, &num_jobs, ORTE_JOBID))) {
                ORTE_ERROR_LOG(ret);
                free(jobs);
                goto CLEANUP;
            }

            for (n=0; n < num_jobs; n++) {
                if (orte_debug_daemons_flag) {
                    opal_output(0, "%s orted_cmd: received kill_local_procs for job %ld",
                                ORTE_NAME_PRINT(orte_process_info.my_name), (long)jobs[n]);
                }

                if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(jobs[n], true))) {
                    ORTE_ERROR_LOG(ret);
                }
            }
            free(jobs);
            break;

        /****    SIGNAL_LOCAL_PROCS   ****/
        case ORTE_DAEMON_SIGNAL_LOCAL_PROCS:
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_cmd: received signal_local_procs",
                            ORTE_NAME_PRINT(orte_process_info.my_name));
            }
            /* unpack the number of jobids */
            n = 1;
            if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &num_jobs, &n, ORTE_STD_CNTR))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }
            /* unpack the array of jobids */
            jobs = (orte_jobid_t*)malloc(num_jobs * sizeof(orte_jobid_t));
            if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, jobs, &num_jobs, ORTE_JOBID))) {
                ORTE_ERROR_LOG(ret);
                free(jobs);
                goto CLEANUP;
            }

            /* get the signal */
            n = 1;
            if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &signal, &n, ORTE_INT32))) {
                ORTE_ERROR_LOG(ret);
                free(jobs);
                goto CLEANUP;
            }

            /* signal them */
            if (ORTE_SUCCESS != (ret = orte_odls.signal_local_procs(NULL, signal))) {
                ORTE_ERROR_LOG(ret);
            }
            free(jobs);
            break;

            /****    ADD_LOCAL_PROCS   ****/
        case ORTE_DAEMON_ADD_LOCAL_PROCS:
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_cmd: received add_local_procs",
                            ORTE_NAME_PRINT(orte_process_info.my_name));
            }
            /* unpack the notify data object */
            n = 1;
            if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &ndat, &n, ORTE_GPR_NOTIFY_DATA))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }

            /* launch the processes */
            if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(ndat))) {
                ORTE_ERROR_LOG(ret);
            }

            /* cleanup the memory */
            OBJ_RELEASE(ndat);
            break;

            /****    DELIVER A MESSAGE TO THE LOCAL PROCS    ****/
        case ORTE_DAEMON_MESSAGE_LOCAL_PROCS:
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_cmd: received message_local_procs",
                            ORTE_NAME_PRINT(orte_process_info.my_name));
            }

            /* unpack the jobid of the procs that are to receive the message */
            n = 1;
            if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }

            /* unpack the tag where we are to deliver the message */
            n = 1;
            if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &target_tag, &n, ORTE_RML_TAG))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }

            relay = OBJ_NEW(orte_buffer_t);
            orte_dss.copy_payload(relay, buffer);

            /* if job=0, then this message is for us and not for our children */
            if (0 == job) {
                /* if the target tag is our xcast_barrier or rml_update, then we have
                 * to handle the message as a special case. The RML has logic in it
                 * intended to make it easier to use. This special logic mandates that
                 * any message we "send" actually only goes into the queue for later
                 * transmission. Thus, since we are already in a recv when we enter
                 * the "process_commands" function, any attempt to "send" the relay
                 * buffer to ourselves will only be added to the queue - it won't
                 * actually be delivered until *after* we conclude the processing
                 * of the current recv.
                 *
                 * The problem here is that, for messages where we need to relay
                 * them along the orted chain, the rml_update
                 * message contains contact info we may well need in order to do
                 * the relay! So we need to process those messages immediately.
                 * The only way to accomplish that is to (a) detect that the
                 * buffer is intended for those tags, and then (b) process
                 * those buffers here.
                 *
                 */
                if (ORTE_RML_TAG_RML_INFO_UPDATE == target_tag) {
                    n = 1;
                    if (ORTE_SUCCESS != (ret = orte_dss.unpack(relay, &rml_cmd, &n, ORTE_RML_CMD))) {
                        ORTE_ERROR_LOG(ret);
                        goto CLEANUP;
                    }
                    if (ORTE_SUCCESS != (ret = orte_dss.unpack(relay, &ndat, &n, ORTE_GPR_NOTIFY_DATA))) {
                        ORTE_ERROR_LOG(ret);
                        goto CLEANUP;
                    }
                    /* initialize the routes to my peers */
                    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(0, ndat))) {
                        ORTE_ERROR_LOG(ret);
                        goto CLEANUP;
                    }
                    /* set the warmup flag so we can warmup the routes between all
                     * daemons, as required by the routed framework. We have to set
                     * the flag here, but do the actual warmup later, to avoid blocking
                     * any relayed xcast (e.g., binomial)
                     */
                    warmup_routes = true;
                } else {
                    /* just deliver it to ourselves */
                    if ((ret = orte_rml.send_buffer(ORTE_PROC_MY_NAME, relay, target_tag, 0)) < 0) {
                        ORTE_ERROR_LOG(ret);
                    } else {
                        ret = ORTE_SUCCESS;
                    }
                }
            } else {
                /* must be for our children - deliver the message */
                if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(job, relay, target_tag))) {
                    ORTE_ERROR_LOG(ret);
                }
            }
            OBJ_RELEASE(relay);
            break;

            /****    EXIT COMMAND    ****/
        case ORTE_DAEMON_EXIT_CMD:
            if (orte_orterun) {
                /* if we are mpirun, do nothing - we will
                 * exit at our own sweet time
                 */
                return ORTE_SUCCESS;
            }
            /* eventually, we need to revise this so we only
             * exit if all our children are dead. For now, treat
             * the same as an exit_vm "hard kill" command
             */
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_cmd: received exit",
                            ORTE_NAME_PRINT(orte_process_info.my_name));
            }
            /* no response to send here - we'll send it when nearly exit'd */
            orted_comm_exit_cond = true;
            opal_condition_signal(&orted_comm_cond);
            /* have to unlock here as we are waking up and will
             * do things inside the orted
             */
            return ORTE_SUCCESS;
            break;

            /****    HALT VM COMMAND    ****/
        case ORTE_DAEMON_HALT_VM_CMD:
            if (orte_orterun) {
                /* if we are mpirun, do nothing - we will
                * exit at our own sweet time
                */
                return ORTE_SUCCESS;
            }
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_cmd: received halt vm",
                            ORTE_NAME_PRINT(orte_process_info.my_name));
            }
            /* if we are the HNP, then terminate all orteds reporting to us */
            if (orte_process_info.seed) {
                OBJ_CONSTRUCT(&attrs, opal_list_t);
                orte_rmgr.add_attribute(&attrs, ORTE_DAEMON_HARD_KILL, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
                ret = orte_pls.terminate_orteds(&orte_abort_timeout, &attrs);
                while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
                OBJ_DESTRUCT(&attrs);
            }
            /* wake up so we can exit too */
            orted_comm_exit_cond = true;
            opal_condition_signal(&orted_comm_cond);
            /* have to unlock here as we are waking up and will
            * do things inside the orted
            */
            return ORTE_SUCCESS;
            break;

            /****     CONTACT QUERY COMMAND    ****/
        case ORTE_DAEMON_CONTACT_QUERY_CMD:
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_cmd: received contact query",
                            ORTE_NAME_PRINT(orte_process_info.my_name));
            }
            /* send back contact info */
            contact_info = orte_rml.get_contact_info();

            if (NULL == contact_info) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                ret = ORTE_ERROR;
                goto CLEANUP;
            }

                /* setup buffer with answer */
            answer = OBJ_NEW(orte_buffer_t);
            if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                goto CLEANUP;
            }

            if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                ret = ORTE_ERR_COMM_FAILURE;
            }
            OBJ_RELEASE(answer);
            break;

            /****     HOSTFILE COMMAND    ****/
        case ORTE_DAEMON_HOSTFILE_CMD:
            ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
            ret = ORTE_ERR_NOT_IMPLEMENTED;
            break;

            /****     SCRIPTFILE COMMAND    ****/
        case ORTE_DAEMON_SCRIPTFILE_CMD:
            ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
            ret = ORTE_ERR_NOT_IMPLEMENTED;
            break;

            /****     HEARTBEAT COMMAND    ****/
        case ORTE_DAEMON_HEARTBEAT_CMD:
            ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
            ret = ORTE_ERR_NOT_IMPLEMENTED;
            break;

            /****    WARMUP CONNECTION TO LOCAL PROC    ****/
        case ORTE_DAEMON_WARMUP_LOCAL_CONN:
            /* nothing to do here - just ignore it */
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_recv: received connection from local proc",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            }
            ret = ORTE_SUCCESS;
            break;

            /****    SYNC FROM LOCAL PROC    ****/
        case ORTE_DAEMON_SYNC_BY_PROC:
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_recv: received sync from local proc %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(sender));
            }
            if (ORTE_SUCCESS != (ret = orte_odls.require_sync(sender))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }
            break;

        default:
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            ret = ORTE_ERR_BAD_PARAM;
    }

CLEANUP:
    return ret;
}


/*
 * Handle a message that arrived with the binomial routing mode.
 *
 * A relay buffer is assembled first (routing mode, daemon count, then a
 * copy of the remaining payload). The command is then processed locally,
 * and finally the relay buffer is sent on ORTE_RML_TAG_ORTED_ROUTED to
 * each peer computed from this daemon's vpid and the daemon count.
 *
 * @param sender  original sender, passed through to process_commands
 * @param buf     incoming buffer, positioned just past the routing mode
 * @param tag     tag the message arrived on, passed through to
 *                process_commands
 *
 * @return ORTE_SUCCESS if the relay buffer was built, every relay send
 *         succeeded and the local command succeeded. Otherwise the error
 *         from building or sending the relay, or - when the relay went
 *         out fine - the error from the local command. A failed local
 *         command does not stop the relay.
 */
static int binomial_route_msg(orte_process_name_t *sender,
                              orte_buffer_t *buf,
                              orte_rml_tag_t tag)
{
    orte_daemon_cmd_flag_t mode;
    orte_std_cntr_t n, num_daemons;
    int i, bitmap, peer, size, rank, hibit, mask;
    orte_process_name_t target;
    orte_buffer_t *relay;
    int ret, local_ret;

    /* initialize the relay buffer */
    relay = OBJ_NEW(orte_buffer_t);
    if (NULL == relay) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* tell the downstream daemons the routing algorithm is binomial */
    mode = ORTE_DAEMON_ROUTE_BINOMIAL;
    if (ORTE_SUCCESS != (ret = orte_dss.pack(relay, &mode, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }

    /* unpack the current number of daemons - we need it here! */
    n = 1;
    if (ORTE_SUCCESS != (ret = orte_dss.unpack(buf, &num_daemons, &n, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }

    /* pass that value to the downstream daemons */
    if (ORTE_SUCCESS != (ret = orte_dss.pack(relay, &num_daemons, 1, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }

    /* copy the message payload to the relay buffer - this is non-destructive
     * Note that this still includes the target job and target tag data
     * required for eventual delivery of the payload
     */
    if (ORTE_SUCCESS != (ret = orte_dss.copy_payload(relay, buf))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }

    /* process the command locally - we need to do this prior to attempting
     * to send the message to the next recipient in case this message
     * contains address information for that recipient. If we don't, then
     * the send will fail. A failure here is logged and remembered, but we
     * still relay so the downstream daemons get the message.
     */
    local_ret = process_commands(sender, buf, tag);
    if (ORTE_SUCCESS != local_ret) {
        ORTE_ERROR_LOG(local_ret);
    }

    /* compute the bitmap */
    bitmap = opal_cube_dim((int)num_daemons);
    rank = (int)ORTE_PROC_MY_NAME->vpid;
    size = (int)num_daemons;

    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    /* our peers are the ranks obtained by setting, one at a time, each
     * bit above our own highest set bit - skipping any that fall beyond
     * the number of daemons
     */
    target.jobid = 0;
    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < size) {
            target.vpid = (orte_vpid_t)peer;
            /* only a negative return from send_buffer is a failure */
            if (0 > (ret = orte_rml.send_buffer(&target, relay, ORTE_RML_TAG_ORTED_ROUTED, 0))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }
        }
    }

    /* the relay went out cleanly, so the outcome of the local command is
     * what the caller gets back (ret may hold a non-negative send result
     * at this point, which is not a status code)
     */
    ret = local_ret;

CLEANUP:
    OBJ_RELEASE(relay);

    return ret;
}