openmpi/orte/mca/plm/base/plm_base_receive.c

/* -*- C -*-
 *
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file:
 *
 */

/*
 * includes
 */
#include "orte_config.h"

#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif

#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/dss/dss.h"
#include "opal/threads/threads.h"

#include "orte/constants.h"
#include "orte/types.h"
#include "orte/util/proc_info.h"
#include "orte/util/error_strings.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/ras/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_quit.h"

#include "orte/mca/plm/plm_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/base.h"

/* true between a comm_start and the matching comm_stop; makes both calls
 * no-ops when repeated
 */
static bool recv_issued=false;
/* lock/cond/processing are always handed together to the
 * OPAL_ACQUIRE_THREAD / OPAL_RELEASE_THREAD macros around message handling
 */
static opal_mutex_t lock;
static opal_condition_t cond;
/* queue of message packets: filled through ORTE_PROCESS_MESSAGE,
 * drained by process_msg
 */
static opal_list_t recvs;
/* read event on ready_fd[0] whose callback is process_msg */
static opal_event_t ready;
/* pipe (socketpair on Windows): [0] is watched by the "ready" event,
 * [1] is passed to ORTE_PROCESS_MESSAGE - presumably written to wake up
 * the event, TODO confirm against the macro definition
 */
static int ready_fd[2];
/* flag paired with lock/cond above; reset to false by comm_start/comm_stop */
static bool processing;

/* event callback that handles every packet currently queued on "recvs" */
static void process_msg(int fd, short event, void *data);

int orte_plm_base_comm_start(void)
{
    int rc;

    if (recv_issued) {
        return ORTE_SUCCESS;
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:receive start comm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    processing = false;
    OBJ_CONSTRUCT(&lock, opal_mutex_t);
    OBJ_CONSTRUCT(&cond, opal_condition_t);
    OBJ_CONSTRUCT(&recvs, opal_list_t);
#ifndef __WINDOWS__
    pipe(ready_fd);
#else
    if (create_socketpair(AF_UNIX, SOCK_STREAM, 0, ready_fd) == -1) {
        return ORTE_ERROR;
    }
#endif

    memset(&ready, 0, sizeof(opal_event_t));
    opal_event_set(opal_event_base, &ready, ready_fd[0], OPAL_EV_READ, process_msg, NULL);
    opal_event_add(&ready, 0);
    
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_PLM,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      orte_plm_base_recv,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
    recv_issued = true;
    
    return rc;
}


int orte_plm_base_comm_stop(void)
{
    if (!recv_issued) {
        return ORTE_SUCCESS;
    }
    
    OBJ_DESTRUCT(&recvs);
    opal_event_del(&ready);
#ifndef __WINDOWS__
    close(ready_fd[0]);
#else
    closesocket(ready_fd[0]);
#endif
    processing = false;
    OBJ_DESTRUCT(&lock);
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:receive stop comm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLM);
    recv_issued = false;
    
    return ORTE_SUCCESS;
}


/*
 * Process incoming messages in order of receipt.
 *
 * Event callback registered on the read end of the wakeup descriptors.
 * It drains the descriptor, re-arms the event, and then handles every
 * packet queued on "recvs". Each packet starts with an ORTE_PLM_CMD:
 *
 *   ORTE_PLM_LAUNCH_JOB_CMD    - unpack a job, spawn it, and send the
 *                                resulting jobid (ORTE_JOBID_INVALID on
 *                                failure) back to the sender on
 *                                ORTE_RML_TAG_PLM_PROXY
 *   ORTE_PLM_UPDATE_PROC_STATE - unpack per-job, per-vpid state records
 *                                and forward them to orte_errmgr.update_state
 *   ORTE_PLM_INIT_ROUTES_CMD   - mark the listed vpids as registered and
 *                                hand the rest of the buffer to
 *                                orte_routed.init_routes
 *
 * Processing stops at the first packet that leaves rc != ORTE_SUCCESS;
 * on the HNP that additionally triggers orte_jobs_complete().
 *
 * @param fd    descriptor that became readable (read end of ready_fd)
 * @param event event flags (unused)
 * @param data  callback data (unused; registered as NULL)
 */
static void process_msg(int fd, short event, void *data)
{
    orte_msg_packet_t *msgpkt;
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_jobid_t job;
    orte_job_t *jdata, *parent;
    opal_buffer_t answer;
    orte_vpid_t vpid;
#if ORTE_ENABLE_EPOCH
    orte_epoch_t epoch;
#endif
    orte_proc_t *proc;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    int rc=ORTE_SUCCESS, ret;
    orte_app_context_t *app, *child_app;
    opal_list_item_t *item;
    int dump[128];
    orte_process_name_t name;
    pid_t pid;
    bool running;
    
    OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:receive processing msg",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* clear the file descriptor to stop the event from refiring */
#ifndef __WINDOWS__
    read(fd, &dump, sizeof(dump));
#else
    recv(fd, (char *) &dump, sizeof(dump), 0);
#endif
    
    /* reset the event for the next message */
    opal_event_add(&ready, 0);
    
    while (NULL != (item = opal_list_remove_first(&recvs))) {
        msgpkt = (orte_msg_packet_t*)item;

        /* setup a default response */
        OBJ_CONSTRUCT(&answer, opal_buffer_t);
        job = ORTE_JOBID_INVALID;
        
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &command, &count, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }
        
        switch (command) {
        case ORTE_PLM_LAUNCH_JOB_CMD:
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive job launch command",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                
            /* unpack the job object */
            count = 1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &jdata, &count, ORTE_JOB))) {
                ORTE_ERROR_LOG(rc);
                goto ANSWER_LAUNCH;
            }
            
            /* flag that this is a dynamic spawn */
            jdata->dyn_spawn_active = true;

            /* get the parent's job object */
            if (NULL == (parent = orte_get_job_data_object(msgpkt->sender.jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                goto ANSWER_LAUNCH;
            }
                    
            /* if the prefix was set in the parent's job, we need to transfer
             * that prefix to the child's app_context so any further launch of
             * orteds can find the correct binary. There always has to be at
             * least one app_context in both parent and child, so we don't
             * need to check that here. However, be sure not to overwrite
             * the prefix if the user already provided it!
             */
            app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
            child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
            if (NULL != app->prefix_dir &&
                NULL == child_app->prefix_dir) {
                child_app->prefix_dir = strdup(app->prefix_dir);
            }
                    
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive adding hosts",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    
            /* process any add-hostfile and add-host options that were provided */
            if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
                ORTE_ERROR_LOG(rc);
                goto ANSWER_LAUNCH;
            }
                    
            if( NULL == parent->bookmark ) {
                /* find the sender's node in the job map */
                if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, msgpkt->sender.vpid))) {
                    /* set the bookmark so the child starts from that place - this means
                     * that the first child process could be co-located with the proc
                     * that called comm_spawn, assuming slots remain on that node. Otherwise,
                     * the procs will start on the next available node
                     */
                    jdata->bookmark = proc->node;
                }
            } else {
                jdata->bookmark = parent->bookmark;
            }
                    
            /* launch it - the thread is released around the spawn call and
             * re-acquired afterwards on both the success and failure paths
             */
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive calling spawn",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            OPAL_RELEASE_THREAD(&lock, &cond, &processing);
            if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
                ORTE_ERROR_LOG(rc);
                OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
                /* still answer the requester (job is ORTE_JOBID_INVALID at
                 * this point) and fall through CLEANUP so the packet and
                 * the answer buffer are released; the non-success rc then
                 * ends processing after CLEANUP
                 */
                goto ANSWER_LAUNCH;
            }
            OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);

            job = jdata->jobid;
                    
            /* output debugger proctable, if requested */
            if (orte_debugger_dump_proctable && !jdata->map->display_map) {
                char *output;
                opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
                if (orte_xml_output) {
                    fprintf(orte_xml_fp, "%s\n", output);
                    fflush(orte_xml_fp);
                } else {
                    opal_output(orte_clean_output, "%s", output);
                }
                free(output);
            }

            /* return the favor so that any repetitive comm_spawns track each other */
            parent->bookmark = jdata->bookmark;
                
            /* if the child is an ORTE job, wait for the procs to report they are alive */
            if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:receive waiting for procs to report",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                OPAL_RELEASE_THREAD(&lock, &cond, &processing);
                /* we will wait here until the thread is released,
                 * indicating that all procs have reported
                 */
                OPAL_ACQUIRE_THREAD(&jdata->dyn_spawn_lock,
                                    &jdata->dyn_spawn_cond,
                                    &jdata->dyn_spawn_active);
                OPAL_THREAD_UNLOCK(&jdata->dyn_spawn_lock);
                OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
            }
                
        ANSWER_LAUNCH:
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive job %s launched",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job)));
                
            /* pack the jobid to be returned */
            if (ORTE_SUCCESS != (ret = opal_dss.pack(&answer, &job, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(ret);
            }
                
            /* send the response back to the sender */
            if (0 > (ret = orte_rml.send_buffer(&msgpkt->sender, &answer, ORTE_RML_TAG_PLM_PROXY, 0))) {
                ORTE_ERROR_LOG(ret);
            }
            break;
                
        case ORTE_PLM_UPDATE_PROC_STATE:
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive update proc state command from %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&(msgpkt->sender)) ));
            count = 1;
            running = false;
            /* no job seen yet for this message - jdata must not carry over
             * a value from a previous packet (or be left uninitialized)
             */
            jdata = NULL;
            while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
                    
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:receive got update_proc_state for job %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(job)));
                    
                name.jobid = job;
                running = false;
                /* get the job object */
                if (NULL == (jdata = orte_get_job_data_object(job))) {
                    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                    goto CLEANUP;
                }
                /* if we are timing, the daemon will have included the time it
                 * recvd the launch msg - the maximum time between when we sent
                 * that message and a daemon recvd it tells us the time reqd
                 * to wireup the daemon comm network
                 */
                if (orte_timing) {
                    int64_t tmpsec, tmpusec;
                    count = 1;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpsec, &count, OPAL_INT64))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                    count = 1;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpusec, &count, OPAL_INT64))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                    /* keep the maximum time */
                    if (tmpsec > jdata->max_launch_msg_recvd.tv_sec) {
                        jdata->max_launch_msg_recvd.tv_sec = tmpsec;
                        jdata->max_launch_msg_recvd.tv_usec = tmpusec;
                    } else if (tmpsec == jdata->max_launch_msg_recvd.tv_sec &&
                               tmpusec > jdata->max_launch_msg_recvd.tv_usec) {
                        jdata->max_launch_msg_recvd.tv_usec = tmpusec;
                    }
                    if (orte_timing_details) {
                        int64_t sec, usec;
                        char *timestr;
                        ORTE_COMPUTE_TIME_DIFF(sec, usec, jdata->launch_msg_sent.tv_sec, jdata->launch_msg_sent.tv_usec,
                                               tmpsec, tmpusec);
                        timestr = orte_pretty_print_timing(sec, usec);
                        fprintf(orte_timing_output, "Time for launch msg to reach daemon %s: %s\n",
                                ORTE_VPID_PRINT(msgpkt->sender.vpid), timestr);
                        free(timestr);
                    }
                }
                count = 1;
                while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID))) {
                    if (ORTE_VPID_INVALID == vpid) {
                        /* flag indicates that this job is complete - move on */
                        break;
                    }
                    name.vpid = vpid;
                    ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name));

                    /* unpack the pid */
                    count = 1;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &pid, &count, OPAL_PID))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                    /* if we are timing things, unpack the time this proc was started */
                    if (orte_timing) {
                        int64_t tmpsec, tmpusec;
                        count = 1;
                        if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpsec, &count, OPAL_INT64))) {
                            ORTE_ERROR_LOG(rc);
                            goto CLEANUP;
                        }
                        count = 1;
                        if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpusec, &count, OPAL_INT64))) {
                            ORTE_ERROR_LOG(rc);
                            goto CLEANUP;
                        }
                        if (orte_timing_details) {
                            time_t tmptime;
                            char *tmpstr;
                            tmptime = tmpsec;
                            tmpstr = ctime(&tmptime);
                            /* ctime may return NULL for a time it cannot
                             * represent - skip the report in that case
                             */
                            if (NULL != tmpstr) {
                                /* remove the newline and the year at the end */
                                tmpstr[strlen(tmpstr)-6] = '\0';
                                fprintf(orte_timing_output, "Time rank %s was launched: %s.%3lu\n",
                                        ORTE_VPID_PRINT(vpid), tmpstr, (unsigned long)(tmpusec/1000));
                            }
                        }
                    }
                    /* unpack the state */
                    count = 1;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &state, &count, ORTE_PROC_STATE))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                    if (ORTE_PROC_STATE_RUNNING == state) {
                        running = true;
                    }
                    /* unpack the exit code */
                    count = 1;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                        
                    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                         "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));
                        
                    /* update the state - the thread is released while the
                     * errmgr runs and re-acquired afterwards
                     */
                    OPAL_RELEASE_THREAD(&lock, &cond, &processing);
                    orte_errmgr.update_state(job, ORTE_JOB_STATE_UNDEF,
                                             &name, state, pid, exit_code);
                    OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
                }
                count = 1;
            }
            /* running off the end of the buffer is the normal way out of
             * the loops above; anything else is a real error
             */
            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
            /* only count the report if the message actually named a job */
            if (NULL != jdata) {
                jdata->num_daemons_reported++;
                if (orte_report_launch_progress && running) {
                    if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) {
                        opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
                                    (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
                                    (int)jdata->num_launched, (int)jdata->num_procs);
                    }
                }
            }
            break;
                
        case ORTE_PLM_INIT_ROUTES_CMD:
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive init routes command",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            count=1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto CLEANUP;
            }
            name.jobid = job;
            count=1;
            while (ORTE_SUCCESS == opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID)) {
                if (ORTE_VPID_INVALID == vpid) {
                    break;
                }
                name.vpid = vpid;
                
#if ORTE_ENABLE_EPOCH
                count=1;
                opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH);
                name.epoch = epoch;
#endif

                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:receive Described rank %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&name)));
                /* update the errmgr state */
                orte_errmgr.update_state(job, ORTE_JOB_STATE_REGISTERED,
                                         &name, ORTE_PROC_STATE_REGISTERED,
                                         0, ORTE_ERROR_DEFAULT_EXIT_CODE);
                count=1;
            }
            /* pass the remainder of the buffer to the active module's
             * init_routes API
             */
            if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) {
                ORTE_ERROR_LOG(rc);
            }
            
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive done with init routes command",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            break;

        default:
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive unknown command",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
            rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
            break;
        }
        
    CLEANUP:
        /* release the message */
        OBJ_RELEASE(msgpkt);
        OBJ_DESTRUCT(&answer);
        if (ORTE_SUCCESS != rc) {
            goto DEPART;
        }
    }
        
 DEPART:
    /* release the thread */
    OPAL_RELEASE_THREAD(&lock, &cond, &processing);
    
    /* see if an error occurred - if so, wakeup the HNP so we can exit */
    if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
        orte_jobs_complete();
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:receive done processing commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}

/*
 * Receive callback for messages from proxies on ORTE_RML_TAG_PLM.
 *
 * NOTE: the incoming "buffer" is OBJ_RELEASED by the caller of this
 * callback - DO NOT RELEASE IT HERE.
 *
 * status, tag and cbdata are not used.
 */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    int reissue_rc;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:receive got message from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* Handling the message right here is not an option: it may require
     * further messaging, and we are still inside the recv. So just queue
     * it and let the event fire process_msg once we have returned.
     *
     * The macro works on its own copy of the buffer (released later by
     * process_msg); the incoming buffer itself is left for the caller to
     * release, although its payload is handed over to the queued packet.
     */
    ORTE_PROCESS_MESSAGE(&recvs, &lock, processing, ready_fd[1], true, sender, &buffer);

    /* the recv is non-persistent, so post it again for the next message */
    reissue_rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                         ORTE_RML_TAG_PLM,
                                         ORTE_RML_NON_PERSISTENT,
                                         orte_plm_base_recv,
                                         NULL);
    if (ORTE_SUCCESS != reissue_rc) {
        ORTE_ERROR_LOG(reissue_rc);
    }
}

/*
 * Entry point for messages delivered to the HNP as a message event.
 *
 * "data" carries an orte_message_event_t. Its sender and buffer are queued
 * for process_msg through ORTE_PROCESS_MESSAGE (invoked with false where
 * orte_plm_base_recv passes true), after which the event object itself is
 * released. fd and event are not used.
 */
void orte_plm_base_receive_process_msg(int fd, short event, void *data)
{
    orte_message_event_t *msg_event;

    msg_event = (orte_message_event_t*)data;

    /* queue the message for process_msg */
    ORTE_PROCESS_MESSAGE(&recvs, &lock, processing, ready_fd[1], false, &msg_event->sender, &msg_event->buffer);

    /* the event object is no longer needed once the message is queued */
    OBJ_RELEASE(msg_event);
}