openmpi/oshmem/mca/scoll/basic/scoll_basic_reduce.c

/*
 * Copyright (c) 2013      Mellanox Technologies, Inc.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "oshmem_config.h"
#include <stdio.h>
#include <stdlib.h>

#include "opal/util/bit_ops.h"

#include "oshmem/constants.h"
#include "oshmem/op/op.h"
#include "oshmem/mca/spml/spml.h"
#include "oshmem/mca/scoll/scoll.h"
#include "oshmem/mca/scoll/base/base.h"
#include "scoll_basic.h"

static int _algorithm_central_counter(struct oshmem_group_t *group,
                                       struct oshmem_op_t *op,
                                       void *target,
                                       const void *source,
                                       size_t nlong,
                                       long *pSync,
                                       void *pWrk);
static int _algorithm_tournament(struct oshmem_group_t *group,
                                  struct oshmem_op_t *op,
                                  void *target,
                                  const void *source,
                                  size_t nlong,
                                  long *pSync,
                                  void *pWrk);
static int _algorithm_recursive_doubling(struct oshmem_group_t *group,
                                          struct oshmem_op_t *op,
                                          void *target,
                                          const void *source,
                                          size_t nlong,
                                          long *pSync,
                                          void *pWrk);
static int _algorithm_linear(struct oshmem_group_t *group,
                              struct oshmem_op_t *op,
                              void *target,
                              const void *source,
                              size_t nlong,
                              long *pSync,
                              void *pWrk);
static int _algorithm_log(struct oshmem_group_t *group,
                           struct oshmem_op_t *op,
                           void *target,
                           const void *source,
                           size_t nlong,
                           long *pSync,
                           void *pWrk);

int mca_scoll_basic_reduce(struct oshmem_group_t *group,
                           struct oshmem_op_t *op,
                           void *target,
                           const void *source,
                           size_t nlong,
                           long *pSync,
                           void *pWrk,
                           int alg)
{
    int rc = OSHMEM_SUCCESS;

    /* Arguments validation */
    if (!group) {
        SCOLL_ERROR("Active set (group) of PE is not defined");
        rc = OSHMEM_ERR_BAD_PARAM;
    }

    /* Check if this PE is part of the group */
    if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) {
        int i = 0;

        if (pSync) {
            alg = (alg == SCOLL_DEFAULT_ALG ?
                    mca_scoll_basic_param_reduce_algorithm : alg);
            switch (alg) {
            case SCOLL_ALG_REDUCE_CENTRAL_COUNTER:
                {
                    rc = _algorithm_central_counter(group,
                                                     op,
                                                     target,
                                                     source,
                                                     nlong,
                                                     pSync,
                                                     pWrk);
                    break;
                }
            case SCOLL_ALG_REDUCE_TOURNAMENT:
                {
                    rc = _algorithm_tournament(group,
                                                op,
                                                target,
                                                source,
                                                nlong,
                                                pSync,
                                                pWrk);
                    break;
                }
            case SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING:
                {
                    rc = _algorithm_recursive_doubling(group,
                                                        op,
                                                        target,
                                                        source,
                                                        nlong,
                                                        pSync,
                                                        pWrk);
                    break;
                }
            case SCOLL_ALG_REDUCE_LEGACY_LINEAR:
                {
                    rc = _algorithm_linear(group,
                                            op,
                                            target,
                                            source,
                                            nlong,
                                            pSync,
                                            pWrk);
                    break;
                }
            case SCOLL_ALG_REDUCE_LEGACY_LOG:
                {
                    rc = _algorithm_log(group,
                                         op,
                                         target,
                                         source,
                                         nlong,
                                         pSync,
                                         pWrk);
                    break;
                }
            default:
                {
                    rc = _algorithm_central_counter(group,
                                                     op,
                                                     target,
                                                     source,
                                                     nlong,
                                                     pSync,
                                                     pWrk);
                }
            }
        } else {
            SCOLL_ERROR("Incorrect argument pSync");
            rc = OSHMEM_ERR_BAD_PARAM;
        }

        /* Restore initial values */
        SCOLL_VERBOSE(12,
                      "PE#%d Restore special synchronization array",
                      group->my_pe);
        for (i = 0; pSync && (i < _SHMEM_REDUCE_SYNC_SIZE); i++) {
            pSync[i] = _SHMEM_SYNC_VALUE;
        }
    }

    return rc;
}

/*
 This algorithm is quite simple and straightforward for PEs with identical data size.
 One node gathers data from peers and send final result to them.
 Outlay:
 NP-1 competing network transfers are needed.
 */
static int _algorithm_central_counter(struct oshmem_group_t *group,
                                       struct oshmem_op_t *op,
                                       void *target,
                                       const void *source,
                                       size_t nlong,
                                       long *pSync,
                                       void *pWrk)
{
    int rc = OSHMEM_SUCCESS;
    int i = 0;
    int PE_root = oshmem_proc_pe(group->proc_array[0]);

    SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Central Counter", group->my_pe);

    if (PE_root == group->my_pe) {
        int pe_cur = 0;
        void *target_cur = NULL;

        target_cur = malloc(nlong);
        if (target_cur) {
            memcpy(target, (void *) source, nlong);

            SCOLL_VERBOSE(14,
                          "[#%d] Gather data from all PEs in the group",
                          group->my_pe);
            for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS);
                    i++) {
                /* Get PE ID of a peer from the group */
                pe_cur = oshmem_proc_pe(group->proc_array[i]);

                if (pe_cur == group->my_pe)
                    continue;

                SCOLL_VERBOSE(14,
                              "[#%d] Gather data (%d bytes) from #%d",
                              group->my_pe, (int)nlong, pe_cur);

                /* Clean up temporary buffer */
                memset(target_cur, 0, nlong);

                /* Get data from the current peer */
                rc = MCA_SPML_CALL(get((void *)source, nlong, target_cur, pe_cur));

                /* Do reduction operation */
                if (rc == OSHMEM_SUCCESS) {
                    op->o_func.c_fn(target_cur, target, nlong / op->dt_size);
                }
            }

            free(target_cur);
        } else {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        }
    }

    /* Send result to all PE in group */
    if (rc == OSHMEM_SUCCESS) {
        SCOLL_VERBOSE(14,
                      "[#%d] Broadcast from the root #%d",
                      group->my_pe, PE_root);
        rc = group->g_scoll.scoll_broadcast(group,
                                            PE_root,
                                            target,
                                            target,
                                            nlong,
                                            (pSync + 1),
                                            SCOLL_DEFAULT_ALG);
    }

    return rc;
}

static int _algorithm_tournament(struct oshmem_group_t *group,
                                  struct oshmem_op_t *op,
                                  void *target,
                                  const void *source,
                                  size_t nlong,
                                  long *pSync,
                                  void *pWrk)
{
    int rc = OSHMEM_SUCCESS;
    int round = 0;
    int exit_flag = group->proc_count - 1;
    long value = SHMEM_SYNC_INIT;
    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
    int peer_id = 0;
    int peer_pe = 0;
    void *target_cur = NULL;
    int PE_root = oshmem_proc_pe(group->proc_array[0]);

    SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Tournament", group->my_pe);
    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    /* Set current state as WAIT */
    pSync[0] = SHMEM_SYNC_WAIT;

    target_cur = malloc(nlong);
    if (target_cur) {
        memcpy(target_cur, (void *) source, nlong);
    } else {
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    while (exit_flag && (rc == OSHMEM_SUCCESS)) {
        /* Define a peer for competition */
        peer_id = my_id ^ (1 << round);

        /* Update exit condition and round counter */
        exit_flag >>= 1;
        round++;

        /* Do not have peer for tournament */
        if (peer_id >= group->proc_count)
            continue;

        if (my_id < peer_id) {
            pSync[0] = peer_id;
            value = my_id;

            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));

            /* Do reduction operation */
            if (rc == OSHMEM_SUCCESS) {
                op->o_func.c_fn(target, target_cur, nlong / op->dt_size);
            }
        } else {
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

#if 1 /* It is ugly implementation of compare and swap operation
         Usage of this hack does not give performance improvement but
         it is expected that shmem_long_cswap() will make it faster.
       */
            do {
                MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
            } while (value != my_id);

            SCOLL_VERBOSE(14,
                          "[#%d] round = %d send data to #%d",
                          group->my_pe, round, peer_pe);
            rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe));

            MCA_SPML_CALL(fence());

            SCOLL_VERBOSE(14,
                          "[#%d] round = %d signals to #%d",
                          group->my_pe, round, peer_pe);
            value = peer_id;
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
#endif
            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
            value = SHMEM_SYNC_RUN;
            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));

            break;
        }
    }

    /* Send result to all PE in group */
    if ((my_id == 0) && (rc == OSHMEM_SUCCESS)) {
        SCOLL_VERBOSE(14, "[#%d] signals to all", group->my_pe);

        memcpy(target, target_cur, nlong);

        value = SHMEM_SYNC_RUN;
        for (peer_id = 1;
                (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS);
                peer_id++) {
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
        }
    }

    /* Send result to all PE in group */
    if (rc == OSHMEM_SUCCESS) {
        SCOLL_VERBOSE(14,
                      "[#%d] Broadcast from the root #%d",
                      group->my_pe, PE_root);
        rc = group->g_scoll.scoll_broadcast(group,
                                            PE_root,
                                            target,
                                            target,
                                            nlong,
                                            (pSync + 1),
                                            SCOLL_DEFAULT_ALG);
    }

    free(target_cur);

    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    return rc;
}

static int _algorithm_recursive_doubling(struct oshmem_group_t *group,
                                          struct oshmem_op_t *op,
                                          void *target,
                                          const void *source,
                                          size_t nlong,
                                          long *pSync,
                                          void *pWrk)
{
    int rc = OSHMEM_SUCCESS;
    int round = 0;
    int floor2_proc = 0;
    int exit_flag = 0;
    long value = SHMEM_SYNC_INIT;
    void *target_cur = NULL;
    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
    int peer_id = 0;
    int peer_pe = 0;
    int i = 0;

    floor2_proc = 1;
    i = group->proc_count;
    i >>= 1;
    while (i) {
        i >>= 1;
        floor2_proc <<= 1;
    }

    target_cur = malloc(nlong);
    if (target_cur) {
        memcpy(target_cur, (void *) source, nlong);
    } else {
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    SCOLL_VERBOSE(12,
                  "[#%d] Reduce algorithm: Recursive Doubling",
                  group->my_pe);
    SCOLL_VERBOSE(15,
                  "[#%d] pSync[0] = %ld floor2_proc = %d",
                  group->my_pe, pSync[0], floor2_proc);

    if (my_id >= floor2_proc) {
        /* I am in extra group, my partner is node (my_id-y) in basic group */
        peer_id = my_id - floor2_proc;
        peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

        /* Special procedure is needed in case target and source are the same */
        if (source == target) {
            SCOLL_VERBOSE(14,
                          "[#%d] wait for peer #%d is ready",
                          group->my_pe, peer_pe);
            value = SHMEM_SYNC_WAIT;
            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
        }

        SCOLL_VERBOSE(14,
                      "[#%d] is extra send data to #%d",
                      group->my_pe, peer_pe);
        rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe));

        MCA_SPML_CALL(fence());

        SCOLL_VERBOSE(14,
                      "[#%d] is extra and signal to #%d",
                      group->my_pe, peer_pe);
        value = SHMEM_SYNC_RUN;
        rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));

        SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe);
        value = SHMEM_SYNC_RUN;
        rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
    } else {
        /* Wait for a peer from extra group */
        if ((group->proc_count - floor2_proc) > my_id) {
            /* I am in basic group, my partner is node (my_id+y) in extra group */
            peer_id = my_id + floor2_proc;
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

            /* Special procedure is needed in case target and source are the same */
            if (source == target) {
                SCOLL_VERBOSE(14,
                              "[#%d] signal to #%d that I am ready",
                              group->my_pe, peer_pe);
                value = SHMEM_SYNC_WAIT;
                rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
            }

            SCOLL_VERBOSE(14,
                          "[#%d] wait a signal from #%d",
                          group->my_pe, peer_pe);
            value = SHMEM_SYNC_RUN;
            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));

            /* Do reduction operation */
            if (rc == OSHMEM_SUCCESS) {
                op->o_func.c_fn(target, target_cur, nlong / op->dt_size);
            }
        }

        /* Pairwise exchange  */
        exit_flag = floor2_proc - 1;
        pSync[0] = round;
        while (exit_flag && (rc == OSHMEM_SUCCESS)) {
            /* Define a peer for competition */
            peer_id = my_id ^ (1 << round);

            /* Update exit condition and round counter */
            exit_flag >>= 1;
            round++;

            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

#if 1 /* It is ugly implementation of compare and swap operation
         Usage of this hack does not give performance improvement but
         it is expected that shmem_long_cswap() will make it faster.
       */
            do {
                MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
            } while (value != (round - 1));

            SCOLL_VERBOSE(14,
                          "[#%d] round = %d send data to #%d",
                          group->my_pe, round, peer_pe);
            rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe));

            MCA_SPML_CALL(fence());

            SCOLL_VERBOSE(14,
                          "[#%d] round = %d signals to #%d",
                          group->my_pe, round, peer_pe);
            value = SHMEM_SYNC_RUN;
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
#endif

            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
            value = SHMEM_SYNC_RUN;
            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));

            /* Do reduction operation */
            if (rc == OSHMEM_SUCCESS) {
                op->o_func.c_fn(target, target_cur, nlong / op->dt_size);
            }

            pSync[0] = round;
        }

        memcpy(target, target_cur, nlong);

        /* Notify a peer from extra group */
        if ((group->proc_count - floor2_proc) > my_id) {
            /* I am in basic group, my partner is node (my_id+y) in extra group */
            peer_id = my_id + floor2_proc;
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

            SCOLL_VERBOSE(14,
                          "[#%d] is extra send data to #%d",
                          group->my_pe, peer_pe);
            rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe));

            MCA_SPML_CALL(fence());

            SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe);
            value = SHMEM_SYNC_RUN;
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
        }
    }

    free(target_cur);

    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    return rc;
}

static int _algorithm_linear(struct oshmem_group_t *group,
                              struct oshmem_op_t *op,
                              void *target,
                              const void *source,
                              size_t nlong,
                              long *pSync,
                              void *pWrk)
{
    int rc = OSHMEM_SUCCESS;
    int i, rank, size;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;
    char *inbuf;
    int peer_id = 0;
    int peer_pe = 0;

    /* Initialize */
    rank = group->my_pe;
    size = group->proc_count;
    int root_id = size - 1;
    int root_pe = oshmem_proc_pe(group->proc_array[root_id]);

    SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Basic", group->my_pe);

    /* If not root, send data to the root. */

    if (rank != root_pe) {
        rc = MCA_SPML_CALL(send((void*)source, nlong, root_pe, MCA_SPML_BASE_PUT_STANDARD));
    } else {

        /* for reducing buffer allocation lengths.... */

        if (size > 1) {
            free_buffer = (char*) malloc(nlong);
            if (NULL == free_buffer) {
                return OSHMEM_ERR_OUT_OF_RESOURCE;
            }
            pml_buffer = free_buffer;
        }

        /* Initialize the receive buffer. */

        if (root_id == (size - 1)) {
            memcpy(target, (void *) source, nlong);
        } else {
            peer_id = size - 1;
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
            rc = MCA_SPML_CALL(recv(target, nlong, peer_pe));
        }
        if (OSHMEM_SUCCESS != rc) {
            if (NULL != free_buffer) {
                free(free_buffer);
            }
            return rc;
        }

        /* Loop receiving and calling reduction function (C or Fortran). */

        for (i = size - 2; i >= 0; --i) {
            if (root_id == i) {
                inbuf = (char*) source;
            } else {
                peer_id = i;
                peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
                rc = MCA_SPML_CALL(recv(pml_buffer, nlong, peer_pe));
                if (OSHMEM_SUCCESS != rc) {
                    if (NULL != free_buffer) {
                        free(free_buffer);
                    }
                    return rc;
                }

                inbuf = pml_buffer;
            }

            /* Perform the reduction */
            op->o_func.c_fn(inbuf, target, nlong / op->dt_size);
        }

        if (NULL != free_buffer) {
            free(free_buffer);
        }
    }

    /* Send result to all PE in group */
    if (rc == OSHMEM_SUCCESS) {
        SCOLL_VERBOSE(14,
                      "[#%d] Broadcast from the root #%d",
                      group->my_pe, root_pe);
        rc = group->g_scoll.scoll_broadcast(group,
                                            root_pe,
                                            target,
                                            target,
                                            nlong,
                                            (pSync + 1),
                                            SCOLL_DEFAULT_ALG);
    }

    /* All done */
    return rc;
}

static int _algorithm_log(struct oshmem_group_t *group,
                           struct oshmem_op_t *op,
                           void *target,
                           const void *source,
                           size_t nlong,
                           long *pSync,
                           void *pWrk)
{
    int rc = OSHMEM_SUCCESS;
    int i, size, rank, vrank;
    int mask;
    void *sbuf = (void*) source;
    void *rbuf = target;
    char *free_buffer = NULL;
    char *free_rbuf = NULL;
    char *pml_buffer = NULL;
    char *snd_buffer = NULL;
    char *rcv_buffer = (char*) rbuf;
    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
    int peer_id = 0;
    int peer_pe = 0;
    int root_id = 0;
    int root_pe = oshmem_proc_pe(group->proc_array[root_id]);
    int dim = 0;

    /* Initialize */
    rank = group->my_pe;
    size = group->proc_count;
    dim = opal_cube_dim(group->proc_count);
    vrank = (my_id + size - root_id) % size;

    SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Log", rank);

    /* Allocate the incoming and resulting message buffers.  See lengthy
     * rationale above. */

    free_buffer = (char*) malloc(nlong);
    if (NULL == free_buffer) {
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    pml_buffer = free_buffer;
    rcv_buffer = pml_buffer;

    /* Allocate sendbuf in case the MPI_IN_PLACE option has been used. See lengthy
     * rationale above. */

    snd_buffer = (char*) sbuf;

    if (my_id != root_id && 0 == (vrank & 1)) {
        /* root is the only one required to provide a valid rbuf.
         * Assume rbuf is invalid for all other ranks, so fix it up
         * here to be valid on all non-leaf ranks */
        free_rbuf = (char*) malloc(nlong);
        if (NULL == free_rbuf) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto cleanup_and_return;
        }
        rbuf = free_rbuf;
    }

    /* Loop over cube dimensions. High processes send to low ones in the
     * dimension. */

    for (i = 0, mask = 1; i < dim; ++i, mask <<= 1) {

        /* A high-proc sends to low-proc and stops. */
        if (vrank & mask) {
            peer_id = vrank & ~mask;
            peer_id = (peer_id + root_id) % size;
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

            rc = MCA_SPML_CALL(send((void*)snd_buffer, nlong, peer_pe, MCA_SPML_BASE_PUT_STANDARD));
            if (OSHMEM_SUCCESS != rc) {
                goto cleanup_and_return;
            }
            snd_buffer = (char*) rbuf;
            break;
        }

        /* A low-proc receives, reduces, and moves to a higher
         * dimension. */

        else {
            peer_id = vrank | mask;
            if (peer_id >= size) {
                continue;
            }
            peer_id = (peer_id + root_id) % size;
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

            /* Most of the time (all except the first one for commutative
             * operations) we receive in the user provided buffer
             * (rbuf). But the exception is here to allow us to dont have
             * to copy from the sbuf to a temporary location. If the
             * operation is commutative we dont care in which order we
             * apply the operation, so for the first time we can receive
             * the data in the pml_buffer and then apply to operation
             * between this buffer and the user provided data. */

            rc = MCA_SPML_CALL(recv(rcv_buffer, nlong, peer_pe));
            if (OSHMEM_SUCCESS != rc) {
                goto cleanup_and_return;
            }
            /* Perform the operation. The target is always the user
             * provided buffer We do the operation only if we receive it
             * not in the user buffer */
            if (snd_buffer != sbuf) {
                /* the target buffer is the locally allocated one */
                op->o_func.c_fn(rcv_buffer, pml_buffer, nlong / op->dt_size);
            } else {
                /* If we're commutative, we don't care about the order of
                 * operations and we can just reduce the operations now.
                 * If we are not commutative, we have to copy the send
                 * buffer into a temp buffer (pml_buffer) and then reduce
                 * what we just received against it. */
                {
                    op->o_func.c_fn(sbuf, pml_buffer, nlong / op->dt_size);
                }
                /* now we have to send the buffer containing the computed data */
                snd_buffer = pml_buffer;
                /* starting from now we always receive in the user
                 * provided buffer */
                rcv_buffer = (char*) rbuf;
            }
        }
    }

    /* Get the result to the root if needed. */
    rc = OSHMEM_SUCCESS;
    if (0 == vrank) {
        if (root_id == my_id) {
            memcpy(rbuf, snd_buffer, nlong);
        } else {
            rc = MCA_SPML_CALL(send((void*)snd_buffer, nlong, root_pe, MCA_SPML_BASE_PUT_STANDARD));
        }
    } else if (my_id == root_id) {
        rc = MCA_SPML_CALL(recv(rcv_buffer, nlong, root_pe));
        if (rcv_buffer != rbuf) {
            op->o_func.c_fn(rcv_buffer, rbuf, nlong / op->dt_size);
        }
    }

    cleanup_and_return: if (NULL != free_buffer) {
        free(free_buffer);
    }
    if (NULL != free_rbuf) {
        free(free_rbuf);
    }

    /* Send result to all PE in group */
    if (rc == OSHMEM_SUCCESS) {
        SCOLL_VERBOSE(14,
                      "[#%d] Broadcast from the root #%d",
                      rank, root_pe);
        rc = group->g_scoll.scoll_broadcast(group,
                                            root_pe,
                                            target,
                                            target,
                                            nlong,
                                            (pSync + 1),
                                            SCOLL_DEFAULT_ALG);
    }

    /* All done */
    return rc;
}