openmpi/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.h

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory.  All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies.  All rights reserved.
 * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_PTPCOLL_BCAST_H
#define MCA_BCOL_PTPCOLL_BCAST_H

#include "ompi_config.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"

BEGIN_C_DECLS

/*
 * Entry points of the ptpcoll broadcast implementation.
 *
 * bcol_ptpcoll_bcast_init() takes the bcol base module; what it sets up is
 * defined in the implementation file and is not visible from this header.
 *
 * Every other declaration comes as a pair, <algorithm>() and
 * <algorithm>_progress(), and all of them share the
 * (bcol_function_args_t *, struct mca_bcol_base_function_t *) signature.
 * NOTE(review): presumably the first function starts the collective and the
 * second one is polled until it completes - confirm against the .c file.
 *
 * The "gatther" spelling is part of the declared symbol names; correcting it
 * here alone would break the definitions and callers.
 */
int bcol_ptpcoll_bcast_init(mca_bcol_base_module_t *super);

/* K-nomial broadcast, root not known in advance. */
int bcol_ptpcoll_bcast_k_nomial_anyroot (bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_k_nomial_anyroot_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);

/* K-nomial broadcast, root known in advance. */
int bcol_ptpcoll_bcast_k_nomial_known_root(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_k_nomial_known_root_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);

/* Binomial scatter + gather broadcast, root not known in advance. */
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);

/* Same as above, "_extra" variant.
 * NOTE(review): presumably used by ranks outside the power-of-two group -
 * confirm against the .c file. */
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);

/* Binomial scatter + gather broadcast, root known in advance. */
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);

/* Same as above, "_extra" variant (see the note on the anyroot "_extra" pair). */
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);


/* macros */
/*
 * K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER
 *
 * Posts one non-blocking send per tree level, walking from level
 * radix_mask_pow down to level 0.  At each level:
 *   - the peer is my_group_index XOR (1 << level), translated to a
 *     communicator rank through group_list;
 *   - the send starts at byte offset segment_size * (peer index with its low
 *     `level` bits cleared) inside data_buffer;
 *   - the send size is min(count - offset, segment_size * (1 << level)), or 0
 *     when the offset is already at or beyond count (a zero-byte message is
 *     still posted so the peer does not wait forever).
 * Every request is stored in send_requests[*num_pending_sends] and
 * *num_pending_sends is incremented.
 *
 * Usage hazards:
 *   - radix_mask_pow is decremented by the macro and ends at -1, so it must be
 *     a modifiable lvalue owned by the caller.
 *   - On an isend failure the macro executes `return OMPI_ERROR;` in the
 *     ENCLOSING function.
 *   - group_size is accepted but never expanded in the body.
 *   - Arguments are not parenthesized in the expansion; pass plain identifiers.
 *   - The locals declared below (rc, dst, ...) shadow same-named variables of
 *     the caller for the duration of the block.
 *   - NOTE(review): `(~(int)0) << radix_mask_pow` left-shifts a negative
 *     value, which ISO C leaves undefined; it relies on the usual
 *     two's-complement compiler behavior.
 */
#define K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(                                                                    \
        radix_mask_pow,                                                                                             \
        my_group_index, group_size, group_list,                                                                     \
        data_buffer, segment_size, count, tag,                                                                      \
        comm, send_requests, num_pending_sends)                                                                     \
do {                                                                                                                \
    int rc = OMPI_SUCCESS;                                                                                          \
    int dst;                                                                                                        \
    int comm_dst;                                                                                                   \
    int send_size;                                                                                                  \
    int send_offset;                                                                                                \
    int delta;                                                                                                      \
    int dst_boundary_rank;                                                                                          \
    int radix_mask = radix_mask_pow >= 0 ? 1 << radix_mask_pow : 0;                                                 \
                                                                                                                    \
    while(radix_mask_pow >= 0) {                                                                                    \
        /* For each level of tree, do sends */                                                                      \
        dst = my_group_index ^ radix_mask;                                                                          \
        comm_dst = group_list[dst];                                                                                 \
                                                                                                                    \
        dst_boundary_rank = dst & ((~(int)0) << (radix_mask_pow));                                                  \
                                                                                                                    \
        send_offset = segment_size * dst_boundary_rank;                                                             \
        /* Pasha: make sure that we handle the corner cases */                                                      \
        delta = count - send_offset;                                                                                \
        if (delta <= 0) {                                                                                           \
            send_size = 0; /* we have to send something, other way it will hang */                                  \
        } else  {                                                                                                   \
            /* the tail case */                                                                                     \
            send_size = (int)                                                                                       \
            (delta - (int)segment_size * radix_mask) < 0 ? delta :                                                  \
            (int)segment_size * radix_mask;                                                                         \
        }                                                                                                           \
                                                                                                                    \
        /* Non blocking send .... */                                                                                \
        PTPCOLL_VERBOSE(9 ,                                                                                         \
                ("Bcast p2s, Isend to %d[%d],count %d,tag %d,addr %p [%p] send_size %d,send_offset %d, radix %d %d",\
                 dst, comm_dst, count, tag,                                                                         \
                 data_buffer, (void *)((unsigned char *)data_buffer + (size_t)send_offset),                         \
                 send_size,                                                                                         \
                 send_offset,                                                                                       \
                 radix_mask,                                                                                        \
                 radix_mask_pow                                                                                     \
                ));                                                                                                 \
        rc = MCA_PML_CALL(isend((void *)((unsigned char *)data_buffer + (size_t)send_offset),                       \
                    send_size, MPI_BYTE,                                                                            \
                    comm_dst, tag,                                                                                  \
                    MCA_PML_BASE_SEND_STANDARD, comm,                                                               \
                    &(send_requests[*num_pending_sends])));                                                         \
        PTPCOLL_VERBOSE(10, ("send request addr is %p", send_requests[*num_pending_sends]));                        \
        if( OMPI_SUCCESS != rc ) {                                                                                  \
            PTPCOLL_VERBOSE(10, ("Failed to isend data"));                                                          \
            return OMPI_ERROR;                                                                                      \
        }                                                                                                           \
        ++(*num_pending_sends);                                                                                     \
        radix_mask >>= 1;                                                                                           \
        radix_mask_pow--;                                                                                           \
    }                                                                                                               \
} while(0)

/*
 * NARRAY_SCATTER_NB
 *
 * Posts non-blocking sends that hand consecutive base_block_size-byte blocks
 * of data_buffer to the children of narray_node: child n receives the block
 * at byte offset n * base_block_size.  The loop ends when all children were
 * served or when the remaining byte budget (initialised from count) drops to
 * zero or below; the last block is shortened to the bytes that remain.
 *
 * A child's group index is children_ranks[n] + process_shift, wrapped by
 * subtracting group_size once when it reaches or exceeds group_size.
 *
 * Usage hazards:
 *   - `group_list` is NOT a macro parameter; it is taken from the scope of the
 *     caller, which must have a variable of that name.
 *   - `count` is assigned inside the loop, so the caller's variable passed as
 *     `count` is overwritten with the size of the last block sent; it must be
 *     a modifiable lvalue.
 *   - On an isend failure the macro executes `return OMPI_ERROR;` in the
 *     ENCLOSING function.
 *   - The verbose message prints data_buffer itself, not the offset address
 *     that is actually passed to isend.
 *   - Arguments are not parenthesized in the expansion; pass plain identifiers.
 */
#define NARRAY_SCATTER_NB(narray_node, process_shift, group_size,                          \
        data_buffer, base_block_size, count, tag, comm, send_requests,                     \
        num_pending_sends)                                                                 \
do {                                                                                       \
    int n, rc = OMPI_SUCCESS;                                                              \
    int dst;                                                                               \
    int comm_dst;                                                                          \
    int offset;                                                                            \
    int size_count = count;                                                                \
    \
    /* Send out data to all relevant childrens  */                                         \
    for (n = 0; n < narray_node->n_children && size_count > 0; n++) {                      \
        \
        dst = narray_node->children_ranks[n] + process_shift;                              \
        if (dst >= group_size) {                                                           \
            dst -= group_size;                                                             \
        }                                                                                  \
        \
        comm_dst = group_list[dst];                                                        \
        offset = n * base_block_size;                                                      \
        size_count -= base_block_size;                                                     \
        if (OPAL_UNLIKELY(size_count < 0)) {                                               \
            count = base_block_size + size_count;                                          \
        } else {                                                                           \
            count = base_block_size;                                                       \
        }                                                                                  \
        \
        /* Non blocking send .... */                                                       \
        PTPCOLL_VERBOSE(9 , ("Bcast, Isend data to %d[%d], count %d, tag %d, addr %p",     \
                    dst, comm_dst, count, tag,                                             \
                    data_buffer));                                                         \
        rc = MCA_PML_CALL(isend((void *)((char *)data_buffer + (size_t)offset), count, MPI_BYTE,\
                    comm_dst, tag,                                                         \
                    MCA_PML_BASE_SEND_STANDARD, comm,                                      \
                    &(send_requests[*num_pending_sends])));                                \
        if( OMPI_SUCCESS != rc ) {                                                         \
            PTPCOLL_VERBOSE(10, ("Failed to isend data"));                                 \
            return OMPI_ERROR;                                                             \
        }                                                                                  \
        ++(*num_pending_sends);                                                            \
    }                                                                                      \
} while(0)

/*
 * NARRAY_SCATTER_B
 *
 * Runs NARRAY_SCATTER_NB with the same arguments and then tests the posted
 * sends once with mca_bcol_ptpcoll_test_all_for_match().  `completed` receives
 * that function's result, or 1 when no send is pending.
 *
 * Usage hazards (in addition to those of NARRAY_SCATTER_NB):
 *   - `rc` used here is the CALLER's variable: the `rc` declared by
 *     NARRAY_SCATTER_NB lives inside that macro's own do-block and is out of
 *     scope at this point.  The caller must therefore declare an int `rc`.
 *   - If the test reports an error through rc, the macro executes
 *     `return OMPI_ERROR;` in the ENCLOSING function.
 *   - `completed` must be a modifiable lvalue.
 */
#define NARRAY_SCATTER_B(narray_node, process_shift, group_size,                                                    \
                          data_buffer, base_block_size, count, tag, comm, send_requests,                            \
                          num_pending_sends, completed)                                                             \
do {                                                                                                                \
    NARRAY_SCATTER_NB(narray_node, process_shift, group_size,                                                       \
            data_buffer, base_block_size, count, tag, comm, send_requests,                                          \
            num_pending_sends);                                                                                     \
    if (*num_pending_sends > 0) {                                                                                   \
        completed = mca_bcol_ptpcoll_test_all_for_match(num_pending_sends, send_requests, &rc);                     \
        if (OMPI_SUCCESS != rc) {                                                                                   \
            return OMPI_ERROR;                                                                                      \
        }                                                                                                           \
    } else {                                                                                                        \
        completed = 1;                                                                                              \
    }                                                                                                               \
} while (0)

/*
 * Evaluates to non-zero when buffer descriptor `i` of `module` has a
 * radix_mask_pow equal to the module's pow_2 value.
 * NOTE(review): going by the name, this is how a root / virtual root of the
 * scatter is recognised - confirm against the callers.
 *
 * `module` is a pointer expression and is evaluated twice; do not pass an
 * expression with side effects.  Both arguments are parenthesized so that
 * expressions such as `&mod` or `idx + 1` expand correctly.
 */
#define CHECK_IF_ROOT_OR_VROOT(module, i)  \
    ((module)->pow_2 == (module)->ml_mem.ml_buf_desc[(i)].radix_mask_pow)

/* inline functions */
/*
 * Post one non-blocking send of `count` bytes from data_buffer to the peer
 * whose group index is extra_peer, then test the outstanding requests once.
 *
 * The request is stored at requests[*active_requests] and *active_requests is
 * incremented before the test.  The caller supplies the tag as-is.
 *
 * Returns:
 *   OMPI_ERROR        the send could not be posted;
 *   BCOL_FN_COMPLETE  the completion test reported all requests done;
 *   the test's rc     the test reported "not done" together with an error;
 *   BCOL_FN_STARTED   the test reported "not done" without an error.
 */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra(
        mca_bcol_ptpcoll_module_t *ptpcoll_module,
        void *data_buffer, int count, int tag,
        int extra_peer, ompi_communicator_t *comm,
        int *active_requests, ompi_request_t **requests)
{
    int *peer_ranks = ptpcoll_module->super.sbgp_partner_module->group_list;
    int rc = OMPI_SUCCESS;
    int all_done;

    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra to %d tag %d",
                extra_peer, tag));

    /* The whole buffer goes to the extra peer in a single message. */
    rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                peer_ranks[extra_peer], tag,
                MCA_PML_BASE_SEND_STANDARD, comm,
                &(requests[*active_requests])));
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to send data"));
        return OMPI_ERROR;
    }
    ++(*active_requests);

    all_done = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
    if (0 != all_done) {
        return BCOL_FN_COMPLETE;
    }

    /* Still in flight (or failed): let the caller come back later. */
    PTPCOLL_VERBOSE(10, ("PR Extra send was not completed"));
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
    return BCOL_FN_STARTED;
}

/*
 * Post a non-blocking send of `count` bytes from data_buffer to every peer
 * listed in extra_peers[0 .. num_peers-1], except the entry equal to `skip`,
 * then test the outstanding requests once.
 *
 * Each posted request is appended at requests[*active_requests] and
 * *active_requests is incremented.
 *
 * Returns:
 *   OMPI_ERROR        a send could not be posted (earlier sends stay posted);
 *   BCOL_FN_COMPLETE  the completion test reported all requests done;
 *   the test's rc     the test reported "not done" together with an error;
 *   BCOL_FN_STARTED   the test reported "not done" without an error.
 */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_send_n_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        void *data_buffer, int count, int tag,
        int *extra_peers, int num_peers, int skip,
        ompi_communicator_t *comm,
        int *active_requests, ompi_request_t **requests)
{
    int *peer_ranks = ptpcoll_module->super.sbgp_partner_module->group_list;
    int rc = OMPI_SUCCESS;
    int all_done;
    int idx;

    for (idx = 0; idx < num_peers; ++idx) {
        PTPCOLL_VERBOSE(10, ("send_n_extra to %d tag %d",
                    extra_peers[idx], tag));

        if (skip != extra_peers[idx]) {
            /* Same full buffer for every extra peer. */
            rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                        peer_ranks[extra_peers[idx]], tag,
                        MCA_PML_BASE_SEND_STANDARD, comm,
                        &(requests[*active_requests])));
            if (OMPI_SUCCESS != rc) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
        } else {
            PTPCOLL_VERBOSE(10, ("SKIP"));
        }
    }

    all_done = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
    if (0 != all_done) {
        return BCOL_FN_COMPLETE;
    }

    /* Still in flight (or failed): let the caller come back later. */
    PTPCOLL_VERBOSE(10, ("PR Extra send was not completed"));
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
    return BCOL_FN_STARTED;
}

/*
 * Gather phase of the binomial scatter/gather broadcast.
 *
 * Iterates i from the iteration stored in the buffer descriptor up to
 * pow_2 - 1.  In iteration i the peer is my_group_index XOR (1 << i) and the
 * block exchanged is (1 << i) * base_block_size bytes, located at
 * base_block_size * (index with its low i bits cleared) and clipped so that
 * nothing beyond `count` bytes is touched.  What is posted depends on how i
 * compares with the descriptor's radix_mask_pow:
 *     i >  radix_mask_pow : irecv the peer's block and isend our own block;
 *     i == radix_mask_pow : irecv only;
 *     i <  radix_mask_pow : isend only.
 * Messages use the descriptor's tag minus one.
 *
 * After posting, the requests are tested once.  If they are not all complete,
 * i is written back to the descriptor's iteration field and the function
 * returns the test's rc when it signals an error, BCOL_FN_STARTED otherwise.
 *
 * Returns BCOL_FN_COMPLETE when every iteration finished, OMPI_ERROR when a
 * send or receive could not be posted.
 *
 * NOTE(review): `(~(int)0) << i` left-shifts a negative value, which ISO C
 * leaves undefined; the code relies on the usual two's-complement behavior.
 * NOTE(review): on re-entry the loop starts at the stored iteration and would
 * post its messages again; the comment before the loop says the caller is
 * expected to have completed that iteration already - confirm in the
 * progress function.
 */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_gather_anyroot(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        int buffer_index, void *data_buffer, int count, int base_block_size)
{
    int rc;
    int completed = 0; /* not completed */
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int i;
    int *iteration =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    void *curr_data_sbuffer = NULL,
         *curr_data_rbuffer = NULL;
    int radix_mask_pow = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow;
    int delta;
    /* the gather phase uses the descriptor's tag minus one */
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;

    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_gather_anyroot %d %d %d",
                ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration,
                ptpcoll_module->pow_2,
                1 << ptpcoll_module->pow_2));

    /* we assume the iteration #iteration already was completed with probe */
    for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
            i < ptpcoll_module->pow_2; i++) {
        int pow2 = 1 << i;
        int peer_index = my_group_index ^ pow2;
        int comm_rank  = group_list[peer_index];
        int slen, rlen,
            send_offset,
            recv_offset;

        if (i > radix_mask_pow) {
            /* *active_requests = 0; */
            /* send - receive data from the peer */
            slen = rlen = pow2 * base_block_size;
            /* offsets are aligned down to a multiple of 2^i blocks */
            send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
            recv_offset = base_block_size * ((peer_index)     & ((~(int)0) << i));
            curr_data_sbuffer = (void *)((unsigned char *)data_buffer + send_offset);
            curr_data_rbuffer = (void *)((unsigned char *)data_buffer + recv_offset);

            /* post the receive only if the peer's block starts inside the data */
            delta = count - recv_offset;
            if (delta > 0) {
                if (delta < rlen) {
                    /* recv the tail */
                    rlen = delta;
                }
                PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] recv data %p (offset %d) , len %d , dest %d",
                            pow2,
                            1 << ptpcoll_module->pow_2,
                            curr_data_rbuffer,
                            recv_offset,
                            rlen,
                            comm_rank));
                rc = MCA_PML_CALL(irecv(curr_data_rbuffer, rlen, MPI_BYTE,
                            comm_rank, tag, comm, &requests[*active_requests]));
                if( OMPI_SUCCESS != rc ) {
                    PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                    return OMPI_ERROR;
                }
                ++(*active_requests);
            }

            /* post the send only if our own block starts inside the data */
            delta = count - send_offset;
            if (delta > 0) {
                if (delta < slen) {
                    /* send the tail */
                    slen = delta;
                }
                PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] sending data %p (offset %d) , len %d , dest %d",
                            pow2,
                            1 << ptpcoll_module->pow_2,
                            curr_data_sbuffer,
                            send_offset,
                            slen,
                            comm_rank));
                rc = MCA_PML_CALL(isend(curr_data_sbuffer, slen, MPI_BYTE,
                            comm_rank, tag,
                            MCA_PML_BASE_SEND_STANDARD, comm,
                            &(requests[*active_requests])));
                if( OMPI_SUCCESS != rc ) {
                    PTPCOLL_VERBOSE(10, ("Failed to send data"));
                    return OMPI_ERROR;
                }
                ++(*active_requests);
            }

            /* both posts may have been skipped, in which case there is nothing to test */
            if (*active_requests > 0) {
                completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
                if (0 == completed) {
                    *iteration = i;
                    /* we have to store the iteration number somewhere */
                    return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
                }
            }
        } else if (i == radix_mask_pow) {
            /* only receive data */
            rlen = pow2 * base_block_size;
            recv_offset = base_block_size * ((peer_index)     & ((~(int)0) << i));
            curr_data_rbuffer = (void *)((unsigned char *)data_buffer + recv_offset);
            delta = count - recv_offset;
            if (0 >= delta) {
                /* we have nothing to receive, skip the iteration */
                continue;
            }
            if (delta < rlen) {
                /* recv the tail */
                rlen = delta;
            }
            /* receive data from the peer */
            PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] recv data %p (offset %d) , len %d , dest %d",
                        pow2,
                        1 << ptpcoll_module->pow_2,
                        curr_data_rbuffer,
                        recv_offset,
                        rlen,
                        comm_rank));
            rc = MCA_PML_CALL(irecv(curr_data_rbuffer, rlen, MPI_BYTE,
                        comm_rank, tag, comm, &(requests[*active_requests])));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
            completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
            if (0 == completed) {
                *iteration = i;
                PTPCOLL_VERBOSE(10, ("Recv was not completed"));
                /* we have to store the iteration number somewhere */
                return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
            }
            PTPCOLL_VERBOSE(10, ("Recv was completed"));
        } else if (i < radix_mask_pow) {
            /* Only send data */
            slen = pow2 * base_block_size;
            send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
            curr_data_sbuffer = (void *)((unsigned char *)data_buffer + send_offset);
            delta = count - send_offset;
            if (0 >= delta) {
                /* we have nothing to send, skip the iteration */
                continue;
            }
            if (delta < slen) {
                /* send the tail */
                slen = delta;
            }
            PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] sending data %p (offset %d) , len %d , dest %d",
                        pow2,
                        1 << ptpcoll_module->pow_2,
                        curr_data_sbuffer,
                        send_offset,
                        slen,
                        comm_rank));
            rc = MCA_PML_CALL(isend(curr_data_sbuffer, slen, MPI_BYTE,
                        comm_rank, tag, MCA_PML_BASE_SEND_STANDARD, comm,
                        &(requests[*active_requests])));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
            completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
            if (0 == completed) {
                *iteration = i;
                /* we have to store the iteration number somewhere */
                return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
            }
        }
    }

    return BCOL_FN_COMPLETE;
}

/*
 * Probe for the incoming broadcast data (root unknown), receive it, and
 * forward it down the binomial scatter tree.
 *
 * Steps:
 *   1. iprobe for `tag` from any source, at most cm->num_to_probe times.
 *      No match -> return BCOL_FN_NOT_STARTED.
 *   2. The matched source rank (status.MPI_SOURCE) decides the receive:
 *        - proxy module and the source is its extra peer: receive all `count`
 *          bytes at data_buffer, set the descriptor's radix_mask_pow to pow_2
 *          and scatter starting at level pow_2 - 1;
 *        - otherwise: look the source up with
 *          get_group_index_and_distance_for_binomial(); receive up to
 *          base_block_size * 2^distance bytes at
 *          base_block_size * (my index with its low `distance` bits cleared),
 *          set radix_mask_pow to distance and scatter from level distance - 1.
 *   3. Blocking recv, mark the descriptor status PTPCOLL_SCATTER_STARTED, post
 *      the scatter sends via K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER.
 *
 * Returns BCOL_FN_COMPLETE once the scatter sends are posted (their completion
 * is not awaited here), BCOL_FN_NOT_STARTED when nothing was probed,
 * OMPI_ERROR on lookup/recv failure.  The scatter macro can also
 * `return OMPI_ERROR` from this function when an isend fails.
 *
 * Notes for maintainers:
 *   - `group_size`, passed to the scatter macro, is not declared in this
 *     function; the call compiles because the macro body never expands that
 *     parameter.
 *   - The "receive data from" trace prints group_root_index, which is still 0
 *     on the proxy path, and prints my_group_index * base_block_size as the
 *     offset rather than the offset actually used for curr_data_buffer.
 */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        int buffer_index, void *data_buffer, int count, int base_block_size)
{
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int rc;
    int completed = 0; /* not completed */
    int comm_root;
    int i;
    int *radix_mask_pow =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow);
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_status_public_t status;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int pow2_group_size = ptpcoll_module->pow_2num;
    int pow2_distance;
    int my_left_boundary_rank;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int group_root_index = 0;
    void *curr_data_buffer = NULL;
    int tag =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
    int recv_count = 0;
    int *coll_status =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;

    /* the request array is reused for the scatter sends posted below */
    assert(0 == *active_requests);

    PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot"));
    /* bounded polling: stop at the first match or after num_to_probe tries */
    for (i = 0; i < cm->num_to_probe &&
            0 == completed; i++) {
        MCA_PML_CALL(iprobe(MPI_ANY_SOURCE, tag,
                    comm, &completed, &status));
        PTPCOLL_VERBOSE(10, ("Bcast, iprobe tag %d",
                    tag));
    }

    /* iprobe always returns OMPI_SUCCESS (per the original author), so its
       return code is not checked above */
    if (0 == completed) {
        PTPCOLL_VERBOSE(10, ("IPROBE was not matched"));
        /* No data was received, return no match error */
        return BCOL_FN_NOT_STARTED;
    }

    comm_root = status.MPI_SOURCE;


    PTPCOLL_VERBOSE(9, ("IPROBE was matched, root of the data on communicator is %d", comm_root));

    /* For proxy we have to check if we got something from extra node */
    if (PTPCOLL_PROXY & ptpcoll_module->pow_2type) {
        if (group_list[ptpcoll_module->proxy_extra_index] == comm_root) {
            PTPCOLL_VERBOSE(9, ("IPROBE was matched, root of the data on communicator is extra node %d",
                        comm_root));
            /* scatter the data among other peer in the pow2 group */
            *radix_mask_pow =  ptpcoll_module->pow_2;

            /* the full buffer arrives from the extra node; scatter from the top level */
            pow2_distance  = ptpcoll_module->pow_2 - 1;
            curr_data_buffer = data_buffer;
            recv_count = count;
            goto PR_SCATTHER;
        }
    }

    /* Find group index for communicator root of the data */
    group_root_index = get_group_index_and_distance_for_binomial
        (my_group_index, comm_root, pow2_group_size, group_list, &pow2_distance);
    if (OPAL_UNLIKELY(group_root_index < 0)) {
        PTPCOLL_ERROR(("Fatal error, no group root index found, my id %d, pow2_g_size %d comm_root %d",
                    my_group_index, pow2_group_size, comm_root));
        return OMPI_ERROR;
    }

    PTPCOLL_VERBOSE(10, ("Group root index is %d distance is %d",
                group_root_index, pow2_distance));

    /* Post receive that will fetch the data */
    /* Pasha: Who is packing data ?
       Should I assume that we get contiguous buffer ?
       Or should I pack by myself
       ===================================================================================================
       === On this stage I assume that data is contiguous. So I use MPI_BYTE datatype and COUNT = size ===
       ===================================================================================================
     */

    /* upper bound on the incoming message size; the actual message may be
       shorter (tail of the data) */
    recv_count = base_block_size * (1 << pow2_distance); /* we may receive larger data */

    /* first index of the 2^pow2_distance-aligned range this rank belongs to */
    my_left_boundary_rank = my_group_index & ((~(int)0) << pow2_distance );

    curr_data_buffer = (void *)((unsigned char *)data_buffer +
            (size_t) base_block_size * my_left_boundary_rank);

    *radix_mask_pow = pow2_distance;

    /* forwarding starts one level below the level we received on */
    pow2_distance--;

PR_SCATTHER:
    PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], "
                "recv_count %d, tag %d, addr %p, offset %d, pow2_distace %d",
                comm_root, group_root_index, recv_count,
                tag, curr_data_buffer,
                my_group_index * base_block_size, pow2_distance));

    /* blocking receive: the probe above already matched this message */
    rc = MCA_PML_CALL(recv(curr_data_buffer, recv_count, MPI_BYTE,
                comm_root, tag, comm, MPI_STATUS_IGNORE));
    if( OMPI_SUCCESS != rc ) {
        PTPCOLL_VERBOSE(10, ("Failed to receive data"));
        return OMPI_ERROR;
    }

    PTPCOLL_VERBOSE(10, ("Bcast, Data was received"));

    /* Sending forward the data over K-nomial tree */
    *coll_status = PTPCOLL_SCATTER_STARTED;
    /* the macro decrements pow2_distance and may return OMPI_ERROR from here */
    K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(
            pow2_distance,
            my_group_index, group_size, group_list,
            data_buffer, base_block_size,
            count, tag, comm, requests,
            active_requests);

    /* Since the next step (gather) does not really require
       completion on scatter , we may return complete  */
    return BCOL_FN_COMPLETE;
}

/*
 * For a known root, work out which rank this process receives from in the
 * binomial tree over the power-of-two group.
 *
 * group_root  group index of the data's root; values >= pow2_size denote an
 *             "extra" rank, represented inside the group by
 *             group_root - pow2_size.
 * my_rank     group index of the calling process.
 * pow2_size   size of the power-of-two group.
 * group_size  unused.
 * distance    out: tree level (bit position) on which the data arrives, or -1.
 *
 * Returns the group index of the source rank.  When the root is an extra rank
 * represented by my_rank, returns group_root itself with *distance = -1.
 * Returns -1 with *distance = -1 when no source exists (my_rank is the
 * in-group root).
 */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_binomial_root_to_src(int group_root, int my_rank,
        int pow2_size, int group_size, int *distance)
{
    int tree_root = group_root;
    int rel_rank;
    int level;
    int bit;

    if (group_root >= pow2_size) {
        /* data originates at an extra rank; map it to its in-group proxy */
        tree_root = group_root - pow2_size;
        if (tree_root == my_rank) {
            /* we are that proxy: receive straight from the extra rank */
            *distance = -1;
            return group_root;
        }
    }

    /* rank relative to the root, wrapped into [0, pow2_size) */
    rel_rank = my_rank - tree_root;
    if (rel_rank < 0) {
        rel_rank += pow2_size;
    }

    /* the lowest set bit of the relative rank selects the parent */
    level = 0;
    bit = 1;
    while (bit < pow2_size) {
        if (0 != (rel_rank & bit)) {
            int parent = my_rank ^ bit;

            if (parent >= pow2_size) {
                parent -= pow2_size;
            }
            *distance = level;
            return parent;
        }
        bit <<= 1;
        ++level;
    }

    /* error case */
    *distance = -1;
    return -1;
}

/*
 * Known-root variant: test the requests already outstanding on the buffer
 * descriptor and, once they are complete, forward the data down the binomial
 * scatter tree.
 *
 * ptpcoll_module   module owning the buffer descriptors.
 * buffer_index     index of the descriptor in ml_mem.ml_buf_desc.
 * data_buffer      start of the broadcast payload.
 * count            payload size in bytes.
 * base_block_size  size in bytes of one scatter block.
 *
 * The scatter starts one level below the descriptor's radix_mask_pow.
 *
 * Returns:
 *   BCOL_FN_STARTED   the outstanding requests are not complete yet;
 *   the test's rc     the completion test reported "not done" with an error;
 *   OMPI_ERROR        a scatter isend failed (returned from inside the
 *                     K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER expansion);
 *   BCOL_FN_COMPLETE  the scatter sends were posted.
 *
 * Note: `group_size`, passed to the scatter macro, is not declared in this
 * function; the call compiles because the macro body never expands that
 * parameter.  It is kept to preserve the macro's argument list.
 */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        int buffer_index, void *data_buffer, int count, int base_block_size)
{
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int rc = OMPI_SUCCESS;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    /* forwarding starts one level below the level the data arrived on */
    int tmp_radix_mask_pow =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow - 1;
    int tag =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
    int *status =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;

    /* Trace message now names this function (it used to carry the name of
       the anyroot probe routine, which made debug logs misleading). */
    PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root"));

    if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
                requests, &rc)) {
        PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    PTPCOLL_VERBOSE(10, ("Bcast, Data was received"));

    /* Sending forward the data over the binomial tree */
    *status = PTPCOLL_SCATTER_STARTED;
    /* the macro decrements tmp_radix_mask_pow and may return OMPI_ERROR from here */
    K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(
            tmp_radix_mask_pow,
            my_group_index, group_size, group_list,
            data_buffer, base_block_size,
            count, tag, comm, requests,
            active_requests);


    return BCOL_FN_COMPLETE;
}

/*
 * NARRAY_BLOCK_SIZE(size, module, level_size)
 *
 * Computes
 *     ceil(size / leafs) * (leafs / divisor)
 * using integer arithmetic, where leafs is
 * (module)->full_narray_tree_num_leafs and divisor is level_size, or
 * mca_bcol_ptpcoll_component.narray_knomial_radix when level_size is 0.
 *
 * Every argument and the whole expansion are parenthesized so the macro can
 * be used inside larger expressions and with compound arguments.
 * level_size is evaluated twice and module three times: do not pass
 * expressions with side effects.
 */
#define NARRAY_BLOCK_SIZE(size, module, level_size)                                      \
                     ((((size) + (module)->full_narray_tree_num_leafs - 1) /             \
                       (module)->full_narray_tree_num_leafs) *                           \
                      ((module)->full_narray_tree_num_leafs /                            \
                       ((0 == (level_size)) ?                                            \
                        mca_bcol_ptpcoll_component.narray_knomial_radix :                \
                        (level_size))))

/**
 * Known-root n-array scatter step: test the outstanding requests of the
 * buffer and, once they have matched, hand this node's slice of the payload
 * to NARRAY_SCATTER_NB.
 *
 * @param ptpcoll_module        module that owns the ML buffer descriptors and
 *                              the narray_knomial_node table
 * @param buffer_index          index of the ML buffer descriptor to operate on
 * @param data_buffer           start of the payload; offsets are applied to it
 *                              in bytes
 * @param count                 total payload size
 * @param process_shift         passed unchanged to NARRAY_SCATTER_NB
 * @param relative_group_index  index into narray_knomial_node; index 0 is
 *                              treated as the (virtual) root
 *
 * @return BCOL_FN_STARTED when a request test did not match, the error code
 *         reported by the request test when it is not OMPI_SUCCESS, or
 *         BCOL_FN_COMPLETE.
 */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_narray_test_and_scatter_known_root(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        int buffer_index, void *data_buffer, int count, int process_shift,
        int relative_group_index)
{
    /* NOTE(review): group_list is not referenced directly below; presumably
       NARRAY_SCATTER_NB picks it up by name - confirm before removing. */
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int rc;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
    int *status =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;
    int scatter_count = 0;
    int offset = 0;
    int base_block_size = 0;
    void *curr_data_buffer = NULL;

    PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_narray_test_and_scatter_known_root"));

    /* Nothing to forward until the pending requests have matched */
    if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
                requests, &rc)) {
        PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    /* Forward the data down the n-array tree */
    *status = PTPCOLL_SCATTER_STARTED;
    if(0 == relative_group_index) {
        /* the root scatters the whole message */
        scatter_count = count;
    } else {
        /* other nodes scatter the block that belongs to their tree level */
        scatter_count = NARRAY_BLOCK_SIZE(count, ptpcoll_module,
                ptpcoll_module->narray_knomial_node[relative_group_index].level_size);
    }

    /* this node's block starts rank_on_level blocks into the message */
    offset = scatter_count *
        ptpcoll_module->narray_knomial_node[relative_group_index].rank_on_level;

    /* make sure that we do not overrun memory: clamp the last block to the
       end of the message.
       NOTE(review): if offset can exceed count the clamped value is
       negative - confirm that cannot happen. */
    if (OPAL_UNLIKELY(offset + scatter_count > count)) {
        scatter_count = count - offset;
    }

    PTPCOLL_VERBOSE(10, ("Bcast, Data was received %d %d %d",
                scatter_count,
                ptpcoll_module->narray_knomial_node[relative_group_index].level_size,
                ptpcoll_module->narray_knomial_node[relative_group_index].rank_on_level));


    /* offset is applied in bytes */
    curr_data_buffer = (void *)((unsigned char *)data_buffer + (size_t)offset);

    /* calculating scatter block size for next level of tree
       (this level's size multiplied by the radix) */
    base_block_size = NARRAY_BLOCK_SIZE(count, ptpcoll_module,
        ptpcoll_module->narray_knomial_node[relative_group_index].level_size *
        mca_bcol_ptpcoll_component.narray_knomial_radix);

    /* NOTE(review): this trace prints level_size / radix while the block size
       above is computed from level_size * radix - confirm which is intended. */
    PTPCOLL_VERBOSE(10, ("scatter_known_rootaaa %d %d %d %d %d",scatter_count, offset, base_block_size,
                ptpcoll_module->narray_knomial_node[relative_group_index].level_size /mca_bcol_ptpcoll_component.narray_knomial_radix,
                ptpcoll_module->full_narray_tree_num_leafs));

    NARRAY_SCATTER_NB((&ptpcoll_module->narray_knomial_node[relative_group_index]),
            process_shift, ptpcoll_module->full_narray_tree_size,
            curr_data_buffer, base_block_size, scatter_count, tag, comm,
            requests, active_requests);

    /* Special case for the virtual root: test the requests again right after
       the scatter and, if they have not matched, record
       PTPCOLL_ROOT_SEND_STARTED and report the step as still in progress. */
    if(0 == relative_group_index) {
        if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
                    requests, &rc)) {
            PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
            *status = PTPCOLL_ROOT_SEND_STARTED;
            return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
        }
    }

    return BCOL_FN_COMPLETE;
}

/**
 * Gather phase of the n-array/k-nomial broadcast: exchange blocks with the
 * k-nomial partners of this node, level by level.
 *
 * Starting at the iteration cached in the buffer descriptor, every step
 * posts one non-blocking receive per exchange partner (radix - 1 of them)
 * followed by one non-blocking send per partner, then tests the requests.
 * The number of blocks moved per step (blocks_in_step) is multiplied by the
 * radix after each step.  Messages use the descriptor tag minus one.
 *
 * @param ptpcoll_module        module that owns the ML buffer descriptors and
 *                              the narray_knomial_node table
 * @param buffer_index          index of the ML buffer descriptor to operate on
 * @param data_buffer           start of the payload; offsets are in bytes
 * @param count                 total payload size in bytes
 * @param relative_group_index  index into narray_knomial_node
 *
 * @return BCOL_FN_COMPLETE when all steps finished, BCOL_FN_STARTED when the
 *         requests of a step have not completed yet (iteration and
 *         radix_mask are cached in the descriptor), the error code reported
 *         by the request test when it is not OMPI_SUCCESS, or OMPI_ERROR when
 *         posting a send or receive failed.
 */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_narray_knomial_gather(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        const int buffer_index, void *data_buffer, const int count,
        const int relative_group_index)
{
    int completed = 0; /* not completed */
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int blocks_in_step =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask;
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int group_size = ptpcoll_module->full_narray_tree_size;
    int i, k,
        rc,
        len, slen, rlen,
        peer, group_peer;
    size_t s_offset,
           r_offset;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    netpatterns_narray_knomial_tree_node_t *narray_node =
        &ptpcoll_module->narray_knomial_node[relative_group_index];
    netpatterns_k_exchange_node_t *k_node =
        &narray_node->k_node;
    mca_bcol_ptpcoll_component_t *cm =
        &mca_bcol_ptpcoll_component;
    size_t base_block_size =
        NARRAY_BLOCK_SIZE(count, ptpcoll_module, narray_node->level_size);

    /* base_block_size is a size_t: print it with %zu, not %d */
    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_narray_knomial_gather %d %zu %d %d %d %d %d",
                ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration,
                base_block_size, count, narray_node->level_size,
                relative_group_index, k_node->n_exchanges, tag));

    /* we assume the iteration #iteration already was completed with probe */
    for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
            i < k_node->n_exchanges; i++, blocks_in_step *= cm->narray_knomial_radix) {

        /* nominal amount of data exchanged with each partner in this step */
        len = base_block_size * blocks_in_step;

        /* post the receives first ... */
        for (k = 0; k < cm->narray_knomial_radix - 1; k++) {
            /* translate the partner's rank on this level into a group index
               relative to our own, wrapping around the tree size */
            group_peer = my_group_index +
                (k_node->rank_exchanges[i][k] - narray_node->rank_on_level);
            if (group_peer >= group_size) {
                group_peer -= group_size;
            } else if (group_peer < 0) {
                group_peer += group_size;
            }
            peer = group_list[group_peer];

            r_offset = (size_t)k_node->rank_exchanges[i][k] / blocks_in_step *
                len;

            /* check that we do not run out of message boundary */
            if (OPAL_UNLIKELY(r_offset + len > (size_t)count)) {
                /* compare in the unsigned domain instead of narrowing a
                   wrapped unsigned difference to int */
                if (OPAL_UNLIKELY(r_offset >= (size_t)count)) {
                    continue;
                }
                rlen = count - (int)r_offset;
            } else {
                rlen = len;
            }
            PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p offset %zu len %d %d %d tag %d",
                        peer, data_buffer, r_offset, rlen, len, blocks_in_step, tag));
            rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + r_offset),
                        rlen, MPI_BYTE,
                        peer, tag, comm, &requests[*active_requests]));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
        }

        /* ... then send our own block to every partner */
        for (k = 0; k < cm->narray_knomial_radix - 1; k++) {
            group_peer = my_group_index +
                (k_node->rank_exchanges[i][k] - narray_node->rank_on_level);
            if (group_peer >= group_size) {
                group_peer -= group_size;
            } else if (group_peer < 0) {
                group_peer += group_size;
            }
            peer = group_list[group_peer];

            s_offset = (size_t)narray_node->rank_on_level / blocks_in_step *
                len;

            /* check that we do not run out of message boundary */
            if (OPAL_UNLIKELY(s_offset + len > (size_t)count)) {
                if (OPAL_UNLIKELY(s_offset >= (size_t)count)) {
                    continue;
                }
                slen = count - (int)s_offset;
            } else {
                slen = len;
            }

            PTPCOLL_VERBOSE(10, ("Send data to %d, addr %p offset %zu len %d %d %d tag %d",
                        peer, data_buffer, s_offset, slen, len, blocks_in_step, tag));
            rc = MCA_PML_CALL(isend((void *)((unsigned char *)data_buffer + s_offset),
                        slen, MPI_BYTE,
                        peer, tag, MCA_PML_BASE_SEND_STANDARD, comm,
                        &(requests[*active_requests])));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
        }

        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if (0 == completed) {
            /* cache data for next iteration: the current step index and the
               block count already advanced to the next step */
            ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration =
                i; /* why not to store step for next iteration ?! */
            ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask =
                blocks_in_step * cm->narray_knomial_radix;
            return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
        }
    }

    return BCOL_FN_COMPLETE;
}

END_C_DECLS

#endif