openmpi/orte/mca/routed/tree/routed_tree.c

/*
 * Copyright (c) 2007      Los Alamos National Security, LLC.
 *                         All rights reserved. 
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/orte_constants.h"

#include "opal/util/output.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/odls/odls.h"

#include "orte/mca/smr/smr.h"
#include "orte/mca/rml/base/rml_contact.h"

#include "orte/mca/routed/base/base.h"
#include "routed_tree.h"

int
orte_routed_tree_update_route(orte_process_name_t *target,
                               orte_process_name_t *route)
{ 
    if (target->jobid == ORTE_JOBID_INVALID ||
        target->vpid == ORTE_VPID_INVALID) {
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                         "%s routed_tree_update: %s --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target), 
                         ORTE_NAME_PRINT(route)));


    /* exact match */
    if (target->jobid != ORTE_JOBID_WILDCARD &&
        target->vpid != ORTE_VPID_WILDCARD) {
        opal_list_item_t *item;
        orte_routed_tree_entry_t *entry;

        for (item = opal_list_get_first(&orte_routed_tree_module.peer_list) ;
             item != opal_list_get_end(&orte_routed_tree_module.peer_list) ;
             item = opal_list_get_next(item)) {
            entry = (orte_routed_tree_entry_t*) item;

            if (0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, 
                                            target, &entry->target)) {
                entry->route = *route;
                return ORTE_SUCCESS;
            }
        }

        entry = OBJ_NEW(orte_routed_tree_entry_t);
        entry->target = *target;
        entry->route = *route;
        opal_list_append(&orte_routed_tree_module.peer_list, &entry->super);
        return ORTE_SUCCESS;
    }

    /* vpid wildcard */
    if (target->jobid != ORTE_JOBID_WILDCARD &&
        target->vpid == ORTE_VPID_WILDCARD) {
        opal_list_item_t *item;
        orte_routed_tree_entry_t *entry;

        for (item = opal_list_get_first(&orte_routed_tree_module.vpid_wildcard_list) ;
             item != opal_list_get_end(&orte_routed_tree_module.vpid_wildcard_list) ;
             item = opal_list_get_next(item)) {
            entry = (orte_routed_tree_entry_t*) item;

            if (0 == orte_ns.compare_fields(ORTE_NS_CMP_JOBID, 
                                            target, &entry->target)) {
                entry->route = *route;
                return ORTE_SUCCESS;
            }
        }

        entry = OBJ_NEW(orte_routed_tree_entry_t);
        entry->target = *target;
        entry->route = *route;
        opal_list_append(&orte_routed_tree_module.vpid_wildcard_list, &entry->super);
        return ORTE_SUCCESS;
    }

    /* wildcard */
    if (target->jobid == ORTE_JOBID_WILDCARD &&
        target->vpid == ORTE_VPID_WILDCARD) {
        orte_routed_tree_module.full_wildcard_entry.route = *route;
        return ORTE_SUCCESS;
    }

    return ORTE_ERR_NOT_SUPPORTED;
}


orte_process_name_t
orte_routed_tree_get_route(orte_process_name_t *target)
{
    orte_process_name_t ret;
    opal_list_item_t *item;

    /* if it is me, then the route is just direct */
    if (ORTE_EQUAL == orte_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
        ret = *target;
        goto found;
    }
    
    /* check exact matches */
    for (item = opal_list_get_first(&orte_routed_tree_module.peer_list) ;
         item != opal_list_get_end(&orte_routed_tree_module.peer_list) ;
         item = opal_list_get_next(item)) {
        orte_routed_tree_entry_t *entry = 
            (orte_routed_tree_entry_t*) item;

        if (0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, 
                                        target, &entry->target)) {
            ret = entry->route;
            goto found;
        }
    }

    ret = orte_routed_tree_module.full_wildcard_entry.route;

 found:

    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
                         "%s routed_tree_get(%s) --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target), 
                         ORTE_NAME_PRINT(&ret)));
    
    return ret;
}

int orte_routed_tree_init_routes(orte_jobid_t job, orte_gpr_notify_data_t *ndat)
{
    /* the tree module routes all proc communications through
     * the local daemon. Daemons must identify which of their
     * daemon-peers is "hosting" the specified recipient and
     * route the message to that daemon. Daemon contact info
     * is handled elsewhere, so all we need to do here is
     * ensure that the procs are told to route through their
     * local daemon, and that daemons are told how to route
     * for each proc
     */
    int rc;

    /* if I am a daemon or HNP, then I have to extract the routing info for this job
     * from the data sent to me for launch and update the routing tables to
     * point at the daemon for each proc
     */
    if (orte_process_info.daemon || orte_process_info.seed) {
        orte_std_cntr_t j;
        orte_process_name_t daemon, proc;
        orte_gpr_value_t **values, *value;
        opal_list_t proc_list;
        opal_list_item_t *item;
        orte_namelist_t *nitem;
        
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                             "%s routed_tree: init routes for daemon/seed job %ld",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)job));
        
        if (0 == job) {
            if (NULL == ndat) {
                /* if ndat is NULL, then this is being called during init,
                 * so just seed the routing table with a path back to the HNP...
                 */
                if (ORTE_SUCCESS != (rc = orte_routed_tree_update_route(ORTE_PROC_MY_HNP,
                                                                        ORTE_PROC_MY_HNP))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                /* ...and register our contact info with the HNP */
                if (ORTE_SUCCESS != (rc = orte_rml_base_register_contact_info())) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            } else {
                /* ndat != NULL means we are getting an update of RML info
                 * for the daemons - so update our contact info and routes
                 */
                orte_rml_base_contact_info_notify(ndat, NULL);
            }

            return ORTE_SUCCESS;
        }
        
        /* if ndat=NULL, then I can just ignore anything else - this is
         * being called because there are places where other routing
         * algos need to be called and we don't
         */
        if (NULL == ndat) {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
                                 "%s routed_tree: no routing info provided for daemons",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            return ORTE_SUCCESS;
        }
        
        /* for any other job, extract the contact map from the launch
         * message since it contains info from every daemon
         */
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
                             "%s routed_tree: extract proc routing info",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
        values = (orte_gpr_value_t**)(ndat->values)->addr;
        daemon.jobid = 0;
        /* loop through all returned values - the first position contains only
         * job-global data, so we can skip it
         */
        for (j=1; j < ndat->cnt; j++) {
            value = values[j];
            
            /* extract the relevant data for this node */
            OBJ_CONSTRUCT(&proc_list, opal_list_t);
            if (ORTE_SUCCESS != (rc = orte_odls.extract_proc_map_info(&daemon, &proc_list, value))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }

            while (NULL != (item = opal_list_remove_first(&proc_list))) {
                nitem = (orte_namelist_t*)item;
                proc.jobid = job;
                proc.vpid = nitem->name->vpid;
                if (ORTE_PROC_MY_NAME->vpid != daemon.vpid) {
                    /* Setup the route to the remote proc via its daemon */
                    if (ORTE_SUCCESS != (rc = orte_routed_tree_update_route(&proc, &daemon))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                } else {
                    /* setup the route for my own procs as they may not have talked
                     * to me yet - if they have, this will simply overwrite the existing
                     * entry, so no harm done
                     */
                    if (ORTE_SUCCESS != (rc = orte_routed_tree_update_route(&proc, &proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                }
                OBJ_RELEASE(nitem);
            }
            OBJ_DESTRUCT(&proc_list);
        }
        
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
                             "%s routed_tree: completed init routes",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_SUCCESS;
    }
    

    /* I must be a proc - just setup my route to the local daemon */
    {
        int id;
        char *rml_uri;

        if (ORTE_EQUAL == orte_dss.compare(ORTE_NAME_INVALID, &orte_process_info.my_daemon, ORTE_NAME)) {
            /* the daemon wasn't previously defined, so look for it */
            id = mca_base_param_register_string("orte", "local_daemon", "uri", NULL, NULL);
            mca_base_param_lookup_string(id, &rml_uri);
            if (NULL == rml_uri) {
                /* in this module, we absolutely MUST have this information - if
                 * we didn't get it, then error out
                 */
                opal_output(0, "%s ERROR: Failed to identify the local daemon's URI",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                opal_output(0, "%s ERROR: This is a fatal condition when the tree router",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                opal_output(0, "%s ERROR: has been selected - either select the unity router",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                opal_output(0, "%s ERROR: or ensure that the local daemon info is provided",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                return ORTE_ERR_FATAL;
            }
            /* Set the contact info in the RML - this won't actually establish
             * the connection, but just tells the RML how to reach the daemon
             * if/when we attempt to send to it
             */
            if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
                ORTE_ERROR_LOG(rc);
                free(rml_uri);
                return(rc);
            }
            /* extract the daemon's name so we can update the routing table */
            if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &orte_process_info.my_daemon, NULL))) {
                ORTE_ERROR_LOG(rc);
                free(rml_uri);
                return rc;
            }
            free(rml_uri);  /* done with this */
        }
        
        /* setup the route to all other procs to flow through the daemon */
        if (ORTE_SUCCESS != (rc = orte_routed_tree_update_route(ORTE_NAME_WILDCARD, &orte_process_info.my_daemon))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* set my proc state - this will fire the corresponding trigger so
         * everything else in this procedure can happen. Note that it involves a
         * a communication, which means that a connection to the local daemon
         * will be initiated. Thus, the local daemon will subsequently know
         * my contact info
         */
        if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_AT_STG1, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        return ORTE_SUCCESS;
    }
    
}

int orte_routed_tree_warmup_routes(void)
{
    orte_std_cntr_t i, j, simultaneous, world_size, istop;
    orte_process_name_t next, prev;
    struct iovec inmsg[1], outmsg[1];
    int ret;

    if (orte_process_info.seed) {
        /* the HNP does not need to participate as it already has
         * a warmed-up connection to every daemon, so just return
         */
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                         "%s routed_tree: warming up daemon wireup for %ld procs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.num_procs));
    
    world_size = orte_process_info.num_procs;
    istop = world_size/2;
    simultaneous = 1;
    if (world_size < simultaneous) {
        simultaneous = world_size;
    }
    next.jobid = 0;
    prev.jobid = 0;
    inmsg[0].iov_base = outmsg[0].iov_base = NULL;
    inmsg[0].iov_len = outmsg[0].iov_len = 0;

    for (i = 1 ; i <= istop ; i += simultaneous) {
#if 0
        if (simultaneous > (istop - i)) {
            /* only fill in the rest */
            simultaneous = istop - i;
        }
#endif
        /* the HNP does not need to participate as it already has
         * a warmed-up connection to every daemon, so we exclude
         * vpid=0 from both send and receive
         */
        for (j = 0 ; j < simultaneous ; ++j) {
            next.vpid = (ORTE_PROC_MY_NAME->vpid + (i + j )) % world_size;
            if (next.vpid == 0) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                                 "%s routed_tree: daemon wireup sending to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&next)));
            
            /* sends do not wait for a match */
            ret = orte_rml.send(&next,
                                outmsg,
                                1,
                                ORTE_RML_TAG_WIREUP,
                                0);
            if (ret < 0) return ret;
        }
        for (j = 0 ; j < simultaneous ; ++j) {
            prev.vpid = (ORTE_PROC_MY_NAME->vpid - (i + j) + world_size) % world_size;
            if (prev.vpid == 0) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                                 "%s routed_tree: daemon wireup recving from %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&prev)));
            
            ret = orte_rml.recv(&prev,
                                inmsg,
                                1,
                                ORTE_RML_TAG_WIREUP,
                                0);
            if (ret < 0) return ret;
        }
    }
    return ORTE_SUCCESS;
    
}