openmpi/orte/mca/grpcomm/brks/grpcomm_brks.c
Ralph Castain aec5cd08bd Per the PMIx RFC:
WHAT:    Merge the PMIx branch into the devel repo, creating a new
               OPAL “pmix” framework to abstract PMI support for all RTEs.
               Replace the ORTE daemon-level collectives with a new PMIx
               server and update the ORTE grpcomm framework to support
               server-to-server collectives

WHY:      We’ve had problems dealing with variations in PMI implementations,
               and need to extend the existing PMI definitions to meet exascale
               requirements.

WHEN:   Mon, Aug 25

WHERE:  https://github.com/rhc54/ompi-svn-mirror.git

Several community members have been working on a refactoring of the current PMI support within OMPI. Although the APIs are common, Slurm and Cray implement a different range of capabilities, and package them differently. For example, Cray provides an integrated PMI-1/2 library, while Slurm separates the two and requires the user to specify the one to be used at runtime. In addition, several bugs in the Slurm implementations have caused problems requiring extra coding.

All this has led to a slew of #if’s in the PMI code and bugs when the corner-case logic for one implementation accidentally traps the other. Extending this support to other implementations would have increased this complexity to an unacceptable level.

Accordingly, we have:

* created a new OPAL “pmix” framework to abstract the PMI support, with separate components for Cray, Slurm PMI-1, and Slurm PMI-2 implementations.

* replaced the current ORTE grpcomm daemon-based collective operation with an integrated PMIx server, and updated the grpcomm APIs to provide more flexible, multi-algorithm support for collective operations. At this time, only the xcast and allgather operations are supported.

* replaced the current global collective id with a signature based on the names of the participating procs (a rough sketch follows this list). This allows an unlimited number of collectives to be executed by any group of processes, subject to the requirement that only one collective can be active at a time for a given combination of procs. Note that a proc can be involved in any number of simultaneous collectives - it is the specific combination of procs that is subject to the constraint.

* removed the prior OMPI/OPAL modex code

* added new macros for executing modex send/recv to simplify use of the new APIs. The send macros allow the caller to specify whether or not the BTL supports async modex operations - if so, then the non-blocking “fence” operation is used, if the active PMIx component supports it. Otherwise, the default is a full blocking modex exchange as we currently perform.

* retained the current flag that directs us to use a blocking fence operation, but only to retrieve data upon demand
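
The sketch below illustrates the idea of a proc-name-based signature. It is only a rough illustration under assumed names: the struct and the match helper are hypothetical stand-ins, not the actual ORTE definitions, and the real orte_grpcomm_signature_t may carry different fields.

/* hypothetical sketch of a proc-name-based collective signature */
#include <stdbool.h>
#include <string.h>

typedef struct {
    orte_process_name_t *signature;   /* names of the participating procs, in a canonical order */
    size_t sz;                        /* number of names in the array */
} example_grpcomm_signature_t;

/* two collectives map to the same tracker only if they involve
 * exactly the same set of procs */
static bool example_sig_match(const example_grpcomm_signature_t *a,
                              const example_grpcomm_signature_t *b)
{
    return a->sz == b->sz &&
           0 == memcmp(a->signature, b->signature,
                       a->sz * sizeof(orte_process_name_t));
}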

This commit was SVN r32570.
2014-08-21 18:56:47 +00:00


/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2007 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
* rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include "orte/runtime/orte_wait.h"
#include <string.h>
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/mca/grpcomm/base/base.h"
#include "grpcomm_brks.h"
/* Static APIs */
static int init(void);
static void finalize(void);
static int xcast(orte_vpid_t *vpids,
size_t nprocs,
opal_buffer_t *msg);
static int allgather(orte_grpcomm_coll_t *coll,
opal_buffer_t *buf);
static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distance);
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
static int brks_finalize_coll(orte_grpcomm_coll_t *coll, int ret);
/* Module def */
orte_grpcomm_base_module_t orte_grpcomm_brks_module = {
init,
finalize,
xcast,
allgather
};
/**
* Initialize the module
*/
static int init(void)
{
return OPAL_SUCCESS;
}
/**
* Finalize the module
*/
static void finalize(void)
{
return;
}
static int xcast(orte_vpid_t *vpids,
size_t nprocs,
opal_buffer_t *msg)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
static int allgather(orte_grpcomm_coll_t *coll,
opal_buffer_t *sendbuf)
{
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:bruck algo employed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* Bruck's algorithm as implemented here requires the number of
 * participating daemons to be a power of 2: (n & (n - 1)) is non-zero
 * exactly when n is not a power of 2, so bail out in that case */
if (coll->ndmns & (coll->ndmns - 1)) {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:bruck number of participating daemons (%d) is not a power of 2",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int) coll->ndmns ));
return ORTE_ERROR;
}
/* start by seeding the collection with our own data */
opal_dss.copy_payload(&coll->bucket, sendbuf);
/* Communication step:
At every step i, rank r:
- doubles the distance
- sends message containing all data collected so far to rank r - distance
- receives message containing all data collected so far from rank (r + distance)
*/
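/* Worked example (for illustration): with 8 participating daemons,
 * the daemon at rank 2 performs log2(8) = 3 exchange rounds:
 *   distance 1: send bucket to rank 1, receive from rank 3
 *   distance 2: send bucket to rank 0, receive from rank 4
 *   distance 4: send bucket to rank 6, receive from rank 6
 * After the third round the distance reaches 8 >= ndmns, the collective
 * is finalized, and every daemon holds all 8 contributions.
 */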
/* find my position in the group of participants. This
* value is the "rank" we will use in the algo
*/
/* seed the exchange at distance 1 and propagate any error */
return brks_allgather_send_dist(coll, 1);
}
static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distance) {
orte_process_name_t peer_send, peer_recv;
opal_buffer_t *send_buf;
int rc;
peer_send.jobid = ORTE_PROC_MY_NAME->jobid;
peer_recv.jobid = ORTE_PROC_MY_NAME->jobid;
if (1 == coll->ndmns) {
peer_send.vpid = ORTE_PROC_MY_NAME->vpid;
peer_recv.vpid = ORTE_PROC_MY_NAME->vpid;
} else {
orte_vpid_t nv, rank;
rank = ORTE_VPID_INVALID;
for (nv = 0; nv < coll->ndmns; nv++) {
if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
rank = nv;
break;
}
}
/* check for bozo case */
if (ORTE_VPID_INVALID == rank) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* first send my current contents */
nv = (rank - distance + coll->ndmns) % coll->ndmns;
peer_send.vpid = coll->dmns[nv];
/* now setup to recv from my other partner */
nv = (rank + distance) % coll->ndmns;
peer_recv.vpid = coll->dmns[nv];
}
send_buf = OBJ_NEW(opal_buffer_t);
/* pack the signature */
if (OPAL_SUCCESS != (rc = opal_dss.pack(send_buf, &coll->sig, 1, ORTE_SIGNATURE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
}
/* pack the current distance */
if (OPAL_SUCCESS != (rc = opal_dss.pack(send_buf, &distance, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
}
/* pack the number of reported processes */
if (OPAL_SUCCESS != (rc = opal_dss.pack(send_buf, &coll->nreported, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
}
/* pack the data */
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(send_buf, &coll->bucket))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:bruck sending to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer_send)));
if (0 > (rc = orte_rml.send_buffer_nb(&peer_send, send_buf,
ORTE_RML_TAG_ALLGATHER,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:bruck receiving from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer_recv)));
/* setup recv for distance data */
orte_rml.recv_buffer_nb(&peer_recv,
ORTE_RML_TAG_ALLGATHER,
ORTE_RML_NON_PERSISTENT,
brks_allgather_recv_dist, NULL);
return ORTE_SUCCESS;
}
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
int32_t cnt, num_remote;
int rc;
orte_grpcomm_signature_t *sig;
orte_grpcomm_coll_t *coll;
orte_vpid_t distance, new_distance;
/* unpack the signature */
cnt = 1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
ORTE_ERROR_LOG(rc);
return;
}
/* check for the tracker and create it if not found */
if (NULL == (coll = orte_grpcomm_base_get_tracker(sig, true))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OBJ_RELEASE(sig);
return;
}
/* unpack the distance */
distance = 1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_INT32))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
/* unpack number of reported processes */
num_remote = 0;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &num_remote, &cnt, OPAL_INT32))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
coll->nreported += num_remote;
/* capture any provided content */
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
/* double the distance and either continue the exchange or finalize */
new_distance = distance << 1;
if (new_distance < coll->ndmns) {
brks_allgather_send_dist(coll, new_distance);
} else {
brks_finalize_coll(coll, ORTE_SUCCESS);
}
OBJ_RELEASE(sig);
return;
}
static int brks_finalize_coll(orte_grpcomm_coll_t *coll, int ret) {
/* execute the callback */
if (NULL != coll->cbfunc) {
coll->cbfunc(ret, &coll->bucket, coll->cbdata);
}
opal_list_remove_item(&orte_grpcomm_base.ongoing, &coll->super);
return ORTE_SUCCESS;
}