openmpi/ompi/mca/bcol/iboffload/bcol_iboffload_allgather.c
Nathan Hjelm 1a021b8f2d coll/ml: add support for blocking and non-blocking allreduce, reduce, and
allgather.

The new collectives provide a significant performance increase over the tuned
component for small and medium messages. We are initially setting the priority
lower than tuned until this has had some time to soak in the trunk. Please set
coll_ml_priority to 90 for MTT runs.

Credit for this work goes to Manjunath Gorentla Venkata (ORNL), Pavel Shamis (ORNL),
and Nathan Hjelm (LANL).

Commit details (for reference):

Import ORNL's collectives for MPI_Allreduce, MPI_Reduce, and MPI_Allgather.

We need to take the basesmuma header into account when calculating the
ptpcoll small message thresholds. Add a define to bcol.h indicating the
maximum header size so we can take the header into account while not
making ptpcoll dependent on information from basesmuma.

This resolves an issue with allreduce where ptpcoll overwrites the
header of the next buffer in the basesmuma bank.
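
A minimal sketch of the idea (the names below are illustrative, not the actual
bcol.h symbols): the ptpcoll threshold leaves room for the largest header any
bcol may prepend, so a send can never spill into the header of the next
basesmuma buffer.

    /* Illustrative only -- bcol.h provides the real maximum header size. */
    #define BCOL_MAX_HEADER_SIZE 128  /* assumed upper bound on any bcol header */

    static inline size_t ptpcoll_small_msg_threshold(size_t ml_buffer_size)
    {
        /* reserve space for the largest possible header so the payload
         * never overwrites the header of the next buffer in the bank */
        return ml_buffer_size - BCOL_MAX_HEADER_SIZE;
    }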

Fix reduce and make a sequential collective launcher in coll_ml_inlines.h

The root calculation for reduce was wrong for any root != 0. There are
four possibilities for the root:

 - The root is not the current process but is in the current hierarchy. In
   this case the root is the index of the global root as specified in the
   root vector.

 - The root is not the current process and is not in the next level of the
   hierarchy. In this case 0 must be the local root since this process will
   never communicate with the real root.

 - The root is not the current process but will be in next level of the
   hierarchy. In this case the current process must be the root.

 - I am the root. The root is my index.

Tested with IMB which rotates the root on every call to MPI_Reduce. Consider
IMB the reproducer for the issue this commit solves.
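
A hedged sketch of the four-way selection described above (illustrative names;
the real logic lives in coll_ml_inlines.h):

    /* Illustrative only: choose the root index used at one level of the hierarchy. */
    static int local_root_index(int my_index, int root_index,
                                bool root_in_this_level, bool root_in_next_level)
    {
        if (my_index == root_index) {
            return my_index;        /* I am the root */
        }
        if (root_in_this_level) {
            return root_index;      /* index of the global root from the root vector */
        }
        if (root_in_next_level) {
            return my_index;        /* I stand in for the root at this level */
        }
        return 0;                   /* never communicate with the real root */
    }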

Make the bcast algorithm decision an enumerated variable

Resolve various assert failures when destructing coll ml requests.

Two issues:

 - Always reset the request to be invalid before returning it to the
   free list. This will avoid an assert in ompi_request_t's destructor.
   OMPI_REQUEST_FINI does this (and also releases the fortran handle
   index).

 - Never explicitly construct or destruct the superclass of an opal
   object. This screws up the class function tables and will cause
   either an assert failure or a segmentation fault when destructing
   coll ml requests.

Cleanup allgather.

I removed the duplicate non-blocking and blocking functions and modeled
the cleanup after what I found in allreduce. Also cleaned up the code
somewhat.

Don't bother copying from the send to the receive buffer in
bcol_basesmuma_allreduce_intra_fanin_fanout if the pointers are the
same.

This eliminates a warning about memcpy and aliasing and avoids an
unnecessary call to memcpy.
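
The guard is the obvious one; a minimal sketch (sbuf, rbuf, count and
dtype_size are placeholder names, not the exact basesmuma code):

    /* skip the copy when the send and receive buffers alias each other */
    if (sbuf != rbuf) {
        memcpy(rbuf, sbuf, count * dtype_size);
    }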

Always call CHECK_AND_RELEASE on memsync collectives.

There was a call to OBJ_RETAIN on the collective communicator but, because
CHECK_AND_RECYCLE was never called, there was no matching call to
OBJ_RELEASE. This caused coll ml to leak communicators.

Make allreduce use the sequential collective launcher in coll_ml_inlines.h

Just launch the next collective in the component progress.

I am a little unsure about this patch. There appears to be some sort
of race between collectives that causes buffer exhaustion in some cases
(IMB Allreduce is a reproducer). Changing progress to only launch the
next bcol seems to resolve the issue but might not be the best fix.

Note that I see little to no performance penalty for this change.

Fix allreduce when there are extra sources.

There was an issue with the buffer offset calculation when there are
extra sources. In the case of extra sources == 1 the offset was set
to buffer_size (just past the header of the next buffer). I adjusted
the buffer size to take into account the maximum header size (see the
earlier commit that added this) and simplified the offset calculation.
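
A hedged sketch of the corrected calculation (illustrative names only; the
exact relationship between the buffer size and the header define is assumed
here): each buffer slot is sized as the payload plus the maximum header, and
the extra source's offset is a whole number of slots.

    /* Illustrative only: one slot = payload plus the largest possible header. */
    size_t slot_size = buffer_size + BCOL_MAX_HEADER_SIZE;  /* assumed define, see above */
    size_t offset    = (size_t) n_extra_sources * slot_size;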

Make reduce/allreduce non-blocking. This is required for MPI_Comm_idup
to work correctly.

This has been tested with various layouts using the IBM test suite and
IMB and appears to have the same performance as the old blocking version.

Fix allgather for non-contiguous layouts and simplify parsing the
topology.

Some things in this patch:

 - There were several comments to the effect that level 0 of the
   hierarchy MUST contain all of the ranks. At least one function
   made this assumption but it was not true. I changed the sbgp
   components and the coll ml initialization code to enforce this
   requirement.

 - Ensure that hierarchy level 0 has the ranks in the correct
   scatter gather order. This removes the need for a separate
   sort list and fixes the offset calculation for allgather.

 - There were several passes over the hierarchy to determine
   properties of the hierarchy. I eliminated these extra passes
   and the memory allocation associated with them and calculate the
   tree properties on the fly. The same DFS recursion also handles
   the re-order of level 0.

All these changes have been verified with MPI_Allreduce, MPI_Reduce, and
MPI_Allgather. All functions now pass the IBM/Open MPI and IMB tests.

coll/ml: correct pointer usage for MPI_BOTTOM

Since contiguous datatypes are copied via memcpy (bypassing the convertor) we
need to adjust for the lb of the datatype. This corrects problems found testing
code that uses MPI_BOTTOM (NULL) as the send pointer.
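
A minimal sketch of the adjustment (hedged; coll/ml uses OMPI's internal
datatype interface, and dtype, sbuf, dest, count and dtype_size are
placeholders -- the standard MPI calls are shown only for illustration):

    /* MPI_BOTTOM is a NULL base pointer; the data actually starts at the
     * datatype's lower bound, so shift before the raw memcpy. */
    MPI_Aint lb, extent;
    MPI_Type_get_extent(dtype, &lb, &extent);
    const char *src = (const char *) sbuf + lb;   /* sbuf may be MPI_BOTTOM */
    memcpy(dest, src, (size_t) count * dtype_size);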

Add fallback collectives for allreduce and reduce.

cmr=v1.7.5:reviewer=pasha

This commit was SVN r30363.
2014-01-22 15:39:19 +00:00


/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <inttypes.h>
#include "bcol_iboffload.h"
#include "bcol_iboffload_alltoall.h"
#include "bcol_iboffload_bcast.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"
#include "opal/include/opal/types.h"
static int mca_bcol_iboffload_allgather_init(
bcol_function_args_t *fn_arguments,
mca_bcol_iboffload_module_t *iboffload_module,
mca_bcol_iboffload_collreq_t **coll_request,
bool if_bcol_last, int mq_credits,
collective_message_progress_function progress_fn)
{
int rc;
ompi_free_list_item_t *item;
mca_bcol_iboffload_collfrag_t *coll_fragment;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
OMPI_FREE_LIST_WAIT(&cm->collreqs_free, item, rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_ERROR(("Wait for free list failed.\n"));
return rc;
}
/* setup call request */
(*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
(*coll_request)->n_fragments = 0;
(*coll_request)->n_frags_sent = 0;
(*coll_request)->n_frag_mpi_complete = 0;
(*coll_request)->n_frag_net_complete = 0;
(*coll_request)->if_bcol_last = if_bcol_last;
(*coll_request)->ml_buffer_index = fn_arguments->buffer_index;
(*coll_request)->completion_cb_fn = NULL;
(*coll_request)->buffer_info[SBUF].buf = (void *) (
(unsigned char *)fn_arguments->sbuf +
fn_arguments->sbuf_offset);
(*coll_request)->buffer_info[RBUF].buf = (void *) (
(unsigned char *)fn_arguments->rbuf +
fn_arguments->rbuf_offset);
(*coll_request)->buffer_info[SBUF].offset = fn_arguments->sbuf_offset;
(*coll_request)->buffer_info[RBUF].offset = fn_arguments->rbuf_offset;
/* seems like we should initialize the memory registration pointer to NULL here */
(*coll_request)->buffer_info[SBUF].iboffload_reg = NULL;
(*coll_request)->buffer_info[RBUF].iboffload_reg = NULL;
(*coll_request)->dtype = fn_arguments->dtype;
(*coll_request)->count = fn_arguments->count;
(*coll_request)->module = iboffload_module;
/* TODO Pasha: we need it for pending queue. Set it later. */
(*coll_request)->progress_fn = progress_fn;
/* TODO Pasha: fix it later */
(*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;
(*coll_request)->order_info = &fn_arguments->order_info;
coll_fragment = &((*coll_request)->first_collfrag);
mca_bcol_iboffload_collfrag_init(coll_fragment);
/** Vasily ????? */
/* mq_credits = (*coll_request)->total_tasks_num; */
coll_fragment->mq_credits = mq_credits;
coll_fragment->mq_index = COLL_MQ;
/* pasha: just set it to zero */
coll_fragment->last_wait_num = 0;
coll_fragment->alg = -2; /* used only for debug */
/*
if (my_rank == algthm_ptr->root) {
coll_fragment->last_wait_num = 0;
} else {
coll_fragment->last_wait_num = algth_lst->last_wait_num;
}
*/
/* Pasha: we have nothing to unpack */
coll_fragment->unpack_size = 0;
/* coll_fragment->unpack_size = pack_len; */
/* coll_fragment->alg = RECURSIVE_DOUBLING_TREE_BCAST; */
/* set pointers for (coll frag) <-> (coll full request) */
(*coll_request)->user_handle_freed = false;
fn_arguments->bcol_opaque_data = (void *) (*coll_request);
/* We don't have root..
if (true == fn_arguments->root_flag) {
(*coll_request)->root = my_group_index;
} else {
(*coll_request)->root = fn_arguments->root_route->rank;
}
*/
MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS((*coll_request), coll_fragment);
return OMPI_SUCCESS;
}
#if 1
static inline void bcol_iboffload_setup_allgather_endpoints_connection(mca_bcol_iboffload_module_t *iboffload)
{
int i, j;
/*Seems that we don't require this*/
netpatterns_k_exchange_node_t *exchange_node = &iboffload->knomial_allgather_tree;
mca_bcol_iboffload_endpoint_t *ep;
IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));
#if 0
fprintf(stderr,"Entering Open Connections\n");
#endif
/* start with extras and proxy connections */
if(exchange_node->n_extra_sources > 0) {
/* connect to endpoint */
/*ep = iboffload->endpoints[comm_to_ibnet[exchange_node->rank_extra_sources_array[0]]];*/
ep = iboffload->endpoints[exchange_node->rank_extra_sources_array[0]];
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
/* now move through the recursive k-ing exchanges */
if(NULL != exchange_node->rank_exchanges) {
for( i = 0; i < exchange_node->log_tree_order; i++) {
for( j = 0; j < ( exchange_node->tree_order - 1 ); j++) {
if( exchange_node->rank_exchanges[i][j] < 0 ){
continue;
}
/* connect to endpoint */
/*ep = iboffload->endpoints[comm_to_ibnet[exchange_node->rank_exchanges[i][j]]];*/
ep = iboffload->endpoints[exchange_node->rank_exchanges[i][j]];
if (iboffload->ibnet->super.my_index < ep->index) {
while(0 == (ep)->remote_zero_rdma_addr.addr) {
opal_progress();
}
} else {
IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
}
}
}
/* set the connection status to connected */
iboffload->connection_status[ALLGATHER_KNOMIAL_ALG] = true;
}
#endif
static inline void bcol_iboffload_setup_allgather_ring_endpoints_connection(mca_bcol_iboffload_module_t *iboffload)
{
int i;
const int group_size = iboffload->ibnet->super.group_size;
mca_bcol_iboffload_endpoint_t *ep;
IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));
/* this is algorithm specific - need to move through the algorithm here basically to set up connections, should be
*
*/
/* I'm going to leave this alone for now, because I'm
* not sure how these endpoints map back to ibnet. Is it mapped to ibnet ids or to communicator ids?
*/
for (i = 0; i < group_size; i++) {
ep = iboffload->endpoints[i];
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
/* set the connection status to connected */
/*JSL - change this macro */
iboffload->connection_status[ALLGATHER_NEIGHBOR_ALG] = true;
}
#if 0
/* allgather neighbor exchange algorithm N/2 communication steps, 2 connections */
static int mca_bcol_iboffload_neighbor_allgather_userbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module,
mca_bcol_iboffload_collreq_t *coll_request)
{
int rc,
src, dst;
uint32_t pack_len;
int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
int group_size = iboffload_module->group_size;
int step, roffset, soffset;
int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
int even_rank;
int parity;
struct mqe_task *last_send = NULL,
*last_wait = NULL;
mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;
#if 0
fprintf(stderr,"entering large msg neighbor exchange allgather\n");
#endif
IBOFFLOAD_VERBOSE(10,("Entering large msg iboffload allgather"));
if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_NEIGHBOR_ALG])) {
IBOFFLOAD_VERBOSE(10,("Allgather open new connection "));
bcol_iboffload_setup_allgather_ring_endpoints_connection(iboffload_module);
}
pack_len = coll_request->count * coll_request->dtype->super.size;
IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
pack_len,
coll_request->count,
coll_request->dtype->super.size));
/* register send and receive sides */
/* send side, only sending pack_len data */
/* I think that probably I will only register the rbuf */
/* on receive side I need to register pack_len*group_size data */
rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[RBUF].buf, pack_len * group_size,
&coll_request->buffer_info[RBUF].iboffload_reg, iboffload_module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_ERROR(("Cannot register memory: "
"addr - %p, %d bytes.\n",
coll_request->buffer_info[RBUF].buf, pack_len));
return OMPI_ERROR;
}
coll_request->buffer_info[RBUF].lkey = coll_request->buffer_info[RBUF].iboffload_reg->mr->lkey;
/* it is estimated mq consumption... */
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
coll_fragment->tail_next = &coll_fragment->to_post;
/* start the neighbor exchange */
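/* Neighbor exchange: even ranks pair first with their right neighbor, odd
 * ranks with their left, and the direction alternates on every step. The
 * first step moves a single block; every later step forwards the two blocks
 * received in the previous step, so after group_size/2 steps each rank holds
 * all group_size blocks. */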
even_rank = !(my_group_index % 2);
if (even_rank) {
neighbor[0] = (my_group_index + 1) % group_size;
neighbor[1] = (my_group_index - 1 + group_size) % group_size;
recv_data_from[0] = my_group_index;
recv_data_from[1] = my_group_index;
offset_at_step[0] = (+2);
offset_at_step[1] = (-2);
} else {
neighbor[0] = (my_group_index - 1 + group_size) % group_size;
neighbor[1] = (my_group_index + 1) % group_size;
recv_data_from[0] = neighbor[0];
recv_data_from[1] = neighbor[0];
offset_at_step[0] = (-2);
offset_at_step[1] = (+2);
}
/* first step is special step, only send one block */
roffset = neighbor[0]*pack_len;
soffset = my_group_index*pack_len;
/* send receive this */
dst = neighbor[0];
src = neighbor[0];
rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
src, iboffload_module,
coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
rc = mca_bcol_iboffload_recv_rtr_setup(
&last_wait, dst, iboffload_module, coll_fragment);
/* send the data */
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_recv_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
rc = mca_bcol_iboffload_send_large_buff_setup(
&last_send, RBUF,
coll_request->buffer_info[RBUF].offset +
soffset/* offset calc */ ,
pack_len, dst,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_send_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
/* send is done */
rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF,
coll_request->buffer_info[RBUF].offset +
roffset,
pack_len, src,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
/* now for the actual neighbor exchange algorithm */
/* determine initial send location */
if(even_rank) {
send_data_from = my_group_index;
}else {
send_data_from = recv_data_from[0];
}
for( step = 1; step < (group_size/2); step++) {
parity = step % 2;
recv_data_from[parity] =
(recv_data_from[parity] + offset_at_step[parity] + group_size) % group_size;
src = neighbor[parity];
dst = src;
roffset = recv_data_from[parity] * pack_len;
soffset = send_data_from * pack_len;
/* post send rtr and recev rtr together */
if( 1 == step ){
rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
src, iboffload_module,
coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
rc = mca_bcol_iboffload_recv_rtr_setup(
&last_wait, dst, iboffload_module, coll_fragment);
/* send the data */
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_recv_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
}
/* I'm using the hierarchy offset used in the k-nomial allgather */
/* this won't work...*/
rc = mca_bcol_iboffload_send_large_buff_setup(
&last_send, RBUF,
coll_request->buffer_info[RBUF].offset +
soffset/* offset calc */ ,
2 * pack_len, dst,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_send_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
/* send is done */
rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF,
coll_request->buffer_info[RBUF].offset +
roffset,
2 * pack_len, src,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
send_data_from = recv_data_from[parity];
}
/* end of list */
*coll_fragment->tail_next = NULL;
/* finish initializing full message descriptor */
(coll_request)->n_fragments = 1;
(coll_request)->n_frags_sent = 1;
assert(NULL != last_wait);
last_wait->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_wait->wr_id;
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
assert(MCA_COLL_ML_NO_BUFFER == coll_request->ml_buffer_index);
/* post the mwr */
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);
IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
return BCOL_FN_STARTED;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n"));
rc =
mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
#endif
#if 0
/* debug connection routine */
static inline void bcol_iboffload_setup_allgather_endpoints_connection(mca_bcol_iboffload_module_t *iboffload)
{
int i;
const int group_size = iboffload->ibnet->super.group_size;
mca_bcol_iboffload_endpoint_t *ep;
IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));
/* this is algorithm specific - need to move through the algorithm here basically to set up connections, should be
*
*/
/* I'm going to leave this alone for now, because I'm
* not sure how these endpoints map back to ibnet. Is it mapped to ibnet ids or to communicator ids?
*/
for (i = 0; i < group_size; i++) {
ep = iboffload->endpoints[i];
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
/* set the connection status to connected */
/*JSL - change this macro */
iboffload->connection_status[ALLGATHER_KNOMIAL_ALG] = true;
}
#endif
static int mca_bcol_iboffload_k_nomial_allgather_userbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module,
mca_bcol_iboffload_collreq_t *coll_request)
{
int rc,
src, dst, comm_dst, comm_src;
int tree_order, pow_k, i, j;
uint32_t pack_len;
int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
int group_size = iboffload_module->group_size;
int *group_list = iboffload_module->super.sbgp_partner_module->group_list;
int my_comm_index = group_list[my_group_index];
netpatterns_k_exchange_node_t *exchange_node = &iboffload_module->knomial_allgather_tree;
struct mqe_task *last_send = NULL,
*last_wait = NULL;
mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;
#if 0
fprintf(stderr,"entering large msg allgather\n");
#endif
IBOFFLOAD_VERBOSE(10,("Entering large msg iboffload allgather"));
if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_KNOMIAL_ALG])) {
IBOFFLOAD_VERBOSE(10,("Allgather open new connection "));
bcol_iboffload_setup_allgather_endpoints_connection(iboffload_module);
}
pack_len = coll_request->count * coll_request->dtype->super.size;
IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
pack_len,
coll_request->count,
coll_request->dtype->super.size));
/* register send and receive sides */
/* send side, only sending pack_len data */
/* I think that probably I will only register the rbuf */
/* on receive side I need to register pack_len*group_size data */
rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[RBUF].buf, pack_len * group_size,
&coll_request->buffer_info[RBUF].iboffload_reg, iboffload_module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_ERROR(("Cannot register memory: "
"addr - %p, %d bytes.\n",
coll_request->buffer_info[RBUF].buf, pack_len));
return OMPI_ERROR;
}
coll_request->buffer_info[RBUF].lkey = coll_request->buffer_info[RBUF].iboffload_reg->mr->lkey;
/* it is estimated mq consumption... */
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
coll_fragment->tail_next = &coll_fragment->to_post;
/* start with the extra / proxy phase */
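/* Extra / proxy phase: when the group size is not a power of the tree order,
 * "extra" ranks sit outside the recursive exchange. An extra rank sends its
 * block to its proxy and then simply waits to receive the fully gathered
 * buffer at the end; a proxy first absorbs the extra rank's block so it can
 * exchange on behalf of both of them. */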
if( EXTRA_NODE == exchange_node->node_type ) {
/* send pack_len data to proxy */
comm_dst = exchange_node->rank_extra_sources_array[0];
/* get ib subnet id */
dst = comm_dst; /* comm_to_ibnet[comm_dst];*/
/* post ready-to-receive receive on sender's side */
rc = mca_bcol_iboffload_recv_rtr_setup(
&last_wait, dst, iboffload_module, coll_fragment);
/* send the data */
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_recv_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
rc = mca_bcol_iboffload_send_large_buff_setup(
&last_send, RBUF, coll_request->buffer_info[RBUF].offset + my_comm_index*pack_len,
pack_len, dst,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_send_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
/* send is done */
/* post the receive */
comm_src = comm_dst;
src = dst;
/* Sending this results in a race condition where if the rtr send bypasses
the large msg receive on proxy's side, then it triggers the start of the
recursive k-ing phase prematurely, causing random data corruption.
*/
/*
rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
src, iboffload_module,
coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
*/
rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
RBUF, coll_request->buffer_info[RBUF].offset,
pack_len*group_size, src,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
goto FINISHED;
} else if( 0 < exchange_node->n_extra_sources ) {
/* am a proxy, receive pack_len data from extra */
comm_src = exchange_node->rank_extra_sources_array[0];
/* get ib subnet */
src = comm_src; /*comm_to_ibnet[comm_src];*/
rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
src, iboffload_module,
coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
RBUF, coll_request->buffer_info[RBUF].offset + pack_len*comm_src,
pack_len, src,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
}
/* start recursive k - ing */
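/* Recursive k-ing: log_k(group_size) rounds. In each round a rank exchanges
 * with up to (tree_order - 1) partners and the amount of data it contributes
 * grows by a factor of tree_order. Each round posts the ready-to-receive
 * handshakes first, then the (k-1) sends, and finally the blocking receives,
 * so one round completes before the next one starts. */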
tree_order = exchange_node->tree_order;
pow_k = exchange_node->log_tree_order;
for( i = 0; i < pow_k; i++) {
/* Post ready-to-recv messages - I am here */
for( j = 0; j <( tree_order - 1); j++) {
comm_src = exchange_node->rank_exchanges[i][j];
if( comm_src < 0 ){
continue;
}
/* get ib subnet */
src = comm_src; /*comm_to_ibnet[comm_src];*/
rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
src, iboffload_module,
coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
}
/* Post receive ready-to-receive message - I can send to you */
for( j = 0; j < (tree_order - 1); j++) {
/* receive ready-to-receive message */
comm_dst = exchange_node->rank_exchanges[i][j];
/* remember, if we have extra ranks, then we won't participate
* with at least one peer. Make a check:
*/
if( comm_dst < 0 ){
continue;
}
/* get ib subnet id */
dst = comm_dst; /*comm_to_ibnet[comm_dst];*/
/* post ready-to-receive receive on sender's side */
rc = mca_bcol_iboffload_recv_rtr_setup(
&last_wait, dst, iboffload_module, coll_fragment);
/* send the data */
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_recv_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
}
/* (k-1) sends */
for( j = 0; j < (tree_order - 1); j++ ) {
/* send phase
*/
comm_dst = exchange_node->rank_exchanges[i][j];
/* remember, if we have extra ranks, then we won't participate
* with at least one peer. Make a check
*/
if( comm_dst < 0 ){
continue;
}
/* get ib subnet id */
dst = comm_dst; /*comm_to_ibnet[comm_dst];*/
rc = mca_bcol_iboffload_send_large_buff_setup(
&last_send, RBUF,
coll_request->buffer_info[RBUF].offset + pack_len*exchange_node->payload_info[i][j].s_offset/* offset calc */ ,
exchange_node->payload_info[i][j].s_len*pack_len, dst,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_send_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
/* send is done */
}
/* we post receives after all sends in order to achieve concurrent
* sends as well as assuring blocking until completely receiving
* all data at level k before starting level k+1 sends
*/
/* (k-1) receives - these are blocking */
for( j = 0; j < (tree_order - 1); j++) {
/*recv phase */
comm_src = exchange_node->rank_exchanges[i][j];
if( comm_src < 0 ){
continue;
}
/* get ib subnet */
src = comm_src; /*comm_to_ibnet[comm_src];*/
rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF,
coll_request->buffer_info[RBUF].offset + pack_len*exchange_node->payload_info[i][j].r_offset,
exchange_node->payload_info[i][j].r_len*pack_len, src,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
}
}
/* last step, just send it back to the extra if I have one */
if( 0 < exchange_node->n_extra_sources ) {
comm_dst = exchange_node->rank_extra_sources_array[0];
/* get ib subnet id */
dst = comm_dst; /*comm_to_ibnet[comm_dst];*/
/*
rc = mca_bcol_iboffload_recv_rtr_setup(
&last_wait, dst, iboffload_module, coll_fragment);
// send the data
we are already guaranteed that extra rank is waiting
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_recv_rtr_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
*/
rc = mca_bcol_iboffload_send_large_buff_setup(
&last_send, RBUF, coll_request->buffer_info[RBUF].offset,
pack_len*group_size, dst,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
"mca_bcol_iboffload_send_large_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
/* send is done */
}
FINISHED:
/* end of list */
*coll_fragment->tail_next = NULL;
/* finish initializing full message descriptor */
(coll_request)->n_fragments = 1;
(coll_request)->n_frags_sent = 1;
assert(NULL != last_wait);
last_wait->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_wait->wr_id;
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
assert(MCA_COLL_ML_NO_BUFFER == coll_request->ml_buffer_index);
/* post the mwr */
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);
IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
return BCOL_FN_STARTED;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n"));
rc =
mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
static int mca_bcol_iboffload_k_nomial_allgather_mlbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module,
mca_bcol_iboffload_collreq_t *coll_request)
{
int rc,
src, dst, comm_dst, comm_src, i, j;
int tree_order, pow_k, knt;
uint32_t pack_len;
int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
int group_size = iboffload_module->group_size;
netpatterns_k_exchange_node_t *exchange_node =
&iboffload_module->knomial_allgather_tree;
struct mqe_task *last_send = NULL,
*last_wait = NULL;
mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;
int *list_connected = iboffload_module->super.list_n_connected;
/* test test */
int buff_offset = iboffload_module->super.hier_scather_offset;
IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload bcast"));
if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_KNOMIAL_ALG])) {
IBOFFLOAD_VERBOSE(10,("Bcast open new connection "));
bcol_iboffload_setup_allgather_endpoints_connection(iboffload_module);
}
pack_len = coll_request->count * coll_request->dtype->super.size;
IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
pack_len,
coll_request->count,
coll_request->dtype->super.size));
/* now we calculate the actual buff_offset */
buff_offset = buff_offset*pack_len;
/* it is estimated mq consumption... */
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
coll_fragment->tail_next = &coll_fragment->to_post;
/* we put this in to propagate the lkey into this local data structure */
coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey;
/* end hack */
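/* ML-buffer (small message) path: the payload already lives in the
 * pre-registered ML buffer, so no registration is done here and the
 * RTR/send handshake used by the zero-copy path is replaced by RDMA
 * writes with immediate data into the peer's ML buffer. */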
if( EXTRA_NODE == exchange_node->node_type ) {
/* setup the rdma "send" pack_len data to proxy rank */
comm_dst = exchange_node->rank_extra_sources_array[0];
/* get ib subnet id */
dst = comm_dst;
/* now I need to calculate my own offset info */
knt = 0;
for( i = 0; i < my_group_index; i++){
knt += list_connected[i];
}
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, pack_len*list_connected[my_group_index], pack_len*knt /* source offset */,
pack_len*knt /* destination offset */, dst,
iboffload_module, coll_fragment);
#if 0
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, pack_len, pack_len*group_list[my_group_index] /* source offset */,
pack_len*group_list[my_group_index] /* destination offset */, dst,
iboffload_module, coll_fragment);
#endif
/* old flow with ml offset */
#if 0
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, pack_len, pack_len*group_list[my_group_index] /* source offset */,
coll_request->buffer_info[RBUF].offset + pack_len*group_list[my_group_index] /* destination offset */, dst,
iboffload_module, coll_fragment);
#endif
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
" mca_bcol_iboffload_send_small_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
/* send is done */
/* setup the rdma "receive" from proxy */
comm_src = comm_dst;
src = dst;
/* more general is the number connected */
knt = 0;
for( i = 0; i < group_size; i++) {
knt += list_connected[i];
}
rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
pack_len*knt, src,
iboffload_module, coll_fragment);
/*
rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
pack_len*group_size, src,
iboffload_module, coll_fragment);
*/
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
goto FINISHED;
} else if( 0 < exchange_node->n_extra_sources ) {
/* am a proxy, receive pack_len data from extra */
comm_src = exchange_node->rank_extra_sources_array[0];
/* get ib subnet */
src = comm_src;
rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
pack_len*list_connected[src], src,
iboffload_module, coll_fragment);
/*
rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
pack_len, src,
iboffload_module, coll_fragment);
*/
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
}
/* start recursive k - ing */
tree_order = exchange_node->tree_order;
pow_k = exchange_node->log_tree_order;
/*fprintf(stderr,"tree order %d pow_k %d\n",tree_order,pow_k);*/
for( i = 0; i < pow_k; i++) {
for( j = 0; j < (tree_order - 1); j++ ) {
/* send phase
*/
comm_dst = exchange_node->rank_exchanges[i][j];
/* remember, if we have extra ranks, then we won't participate
* with at least one peer. Make a check
*/
/*fprintf(stderr,"AAA my index %d comm_dst %d\n",my_group_index,comm_dst);*/
if( comm_dst < 0 ){
continue;
}
/* get ib subnet id */
/* again, don't think we need this */
/*dst = ibnet_map[comm_dst];*/
dst = comm_dst;
/*
fprintf(stderr,"BBB my index %d dst %d pack len %d s_len %d src offset %d r_len %d \n",my_group_index,dst,
pack_len,exchange_node->payload_info[i][j].s_len,exchange_node->payload_info[i][j].s_offset,
exchange_node->payload_info[i][j].r_len);
*/
/* rdma "send" setup */
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, exchange_node->payload_info[i][j].s_len * pack_len,
exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */,
exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst,
iboffload_module, coll_fragment);
#if 0
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, exchange_node->payload_info[i][j].s_len * pack_len,
exchange_node->payload_info[i][j].s_offset * exchange_node->payload_info[i][j].s_len*pack_len /* source offset */,
exchange_node->payload_info[i][j].s_offset * exchange_node->payload_info[i][j].s_len*pack_len /* destination offset */, dst,
iboffload_module, coll_fragment);
#endif
#if 0
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, exchange_node->payload_info[i][j].s_len * pack_len,
exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */,
exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst,
iboffload_module, coll_fragment);
#endif
#if 0
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, exchange_node->payload_info[i][j].s_len * pack_len,
coll_request->buffer_info[SBUF].offset + exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */,
coll_request->buffer_info[SBUF].offset + exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst,
iboffload_module, coll_fragment);
#endif
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
" mca_bcol_iboffload_send_small_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
/* send is done */
}
for( j = 0; j < (tree_order - 1); j++) {
/* rdma "recv" phase */
comm_src = exchange_node->rank_exchanges[i][j];
/* remember, if we have extra ranks, then we won't participate
* with at least one peer. Make a check
*/
if( comm_src < 0 ){
continue;
}
/* get ib subnet id */
/* shouldn't need this */
src = comm_src;
rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
exchange_node->payload_info[i][j].r_len * pack_len, src,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
}
return OMPI_ERROR;
}
}
}
/* last step, proxies send full data back to the extra ranks */
if( 0 < exchange_node->n_extra_sources ) {
/* send pack_len data to proxy */
comm_dst = exchange_node->rank_extra_sources_array[0];
/* get ibnet id */
dst = comm_dst;
knt = 0;
for( i = 0; i < group_size; i++){
knt += list_connected[i];
}
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, pack_len*knt, 0 /* source offset */,
0 /* destination offset */, dst,
iboffload_module, coll_fragment);
#if 0
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, pack_len*group_size, 0 /* source offset */,
0 /* destination offset */, dst,
iboffload_module, coll_fragment);
#endif
#if 0
rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
&last_send, pack_len*group_size, coll_request->buffer_info[RBUF].offset /* source offset */,
coll_request->buffer_info[SBUF].offset /* destination offset */, dst,
iboffload_module, coll_fragment);
#endif
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to"
" mca_bcol_iboffload_send_small_buff_setup"));
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
goto out_of_resources;
fprintf(stderr,"I'm out of resources \n");
}
return OMPI_ERROR;
}
/* send is done */
}
FINISHED:
/* end of list */
*coll_fragment->tail_next = NULL;
/* finish initializing full message descriptor */
(coll_request)->n_fragments = 1;
(coll_request)->n_frags_sent = 1;
assert(NULL != last_wait);
last_wait->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_wait->wr_id;
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
assert(MCA_COLL_ML_NO_BUFFER != coll_request->ml_buffer_index);
/* post the mwr */
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);
IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
return BCOL_FN_STARTED;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n"));
rc =
mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
#if 0
static int mca_bcol_iboffload_neighbor_allgather_userbuffer_intra(
bcol_function_args_t *fn_arguments,
struct coll_ml_function_t *const_args)
{
mca_bcol_iboffload_module_t *iboffload_module =
(mca_bcol_iboffload_module_t *)const_args->bcol_module;
int rc;
int mq_credits = iboffload_module->group_size * 2 * 2; /* large message protocol consumes
* twice as many mq credits
*/
bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args);
mca_bcol_iboffload_collreq_t *coll_request;
MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments);
rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module,
&coll_request, if_bcol_last, mq_credits,
mca_bcol_iboffload_neighbor_allgather_userbuffer_exec);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
}
rc = coll_request->progress_fn(iboffload_module, coll_request);
IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra was started [%d]\n", rc));
return rc;
}
#endif
#if 1
static int mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra(bcol_function_args_t *fn_arguments,
struct coll_ml_function_t *const_args)
{
mca_bcol_iboffload_module_t *iboffload_module =
(mca_bcol_iboffload_module_t *)const_args->bcol_module;
int rc;
int mq_credits = ((iboffload_module->knomial_allgather_tree.tree_order - 1)*
iboffload_module->knomial_allgather_tree.log_tree_order + 1) * 2 * 2; /* large message protocol
* consumes twice as much
*/
bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args);
mca_bcol_iboffload_collreq_t *coll_request;
MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments);
rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module,
&coll_request, if_bcol_last, mq_credits,
mca_bcol_iboffload_k_nomial_allgather_userbuffer_exec);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
}
rc = coll_request->progress_fn(iboffload_module, coll_request);
IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra was started [%d]\n", rc));
return rc;
}
#endif
static int mca_bcol_iboffload_k_nomial_allgather_mlbuffer_intra(bcol_function_args_t *fn_arguments,
struct coll_ml_function_t *const_args)
{
mca_bcol_iboffload_module_t *iboffload_module =
(mca_bcol_iboffload_module_t *)const_args->bcol_module;
int rc;
/* I'll add one for everyone, since nobody wants to feel left out */
int mq_credits = ((iboffload_module->knomial_allgather_tree.tree_order - 1)*
iboffload_module->knomial_allgather_tree.log_tree_order + 1) * 2 ;
bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args);
mca_bcol_iboffload_collreq_t *coll_request;
MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments);
rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module,
&coll_request, if_bcol_last, mq_credits,
mca_bcol_iboffload_k_nomial_allgather_mlbuffer_exec);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
}
rc = coll_request->progress_fn(iboffload_module, coll_request);
IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_small_msg_bcast_intra was started [%d]\n", rc));
return rc;
}
/* these progress engines are shared between alltoall and allgather and exist in both files,
* should be moved to a common .h file
*/
static int mca_bcol_iboffload_collreq_mlbuffer_progress(
bcol_function_args_t *input_args,
struct coll_ml_function_t *const_args)
{
int i;
mca_bcol_iboffload_collreq_t *coll_request =
(mca_bcol_iboffload_collreq_t *)
input_args->bcol_opaque_data;
IBOFFLOAD_VERBOSE(10, ("Run progress (ml buffer).\n"));
for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) {
if (BCOL_IS_COMPLETED(coll_request)) {
coll_request->user_handle_freed = true;
if (COLLREQ_IS_DONE(coll_request)) {
IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
RELEASE_COLLREQ(coll_request);
}
IBOFFLOAD_VERBOSE(10, ("Collective finished (ml buffer).\n"));
return BCOL_FN_COMPLETE;
}
}
IBOFFLOAD_VERBOSE(10, ("Collective not finished (ml buffer).\n"));
return BCOL_FN_STARTED;
}
static int mca_bcol_iboffload_collreq_userbuffer_progress(
bcol_function_args_t *input_args,
struct coll_ml_function_t *const_args)
{
int i;
mca_bcol_iboffload_collreq_t *coll_request =
(mca_bcol_iboffload_collreq_t *)
input_args->bcol_opaque_data;
IBOFFLOAD_VERBOSE(10, ("Run progress (user buffer)\n"));
/* Complete the allgather - progress releases full request descriptors */
for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) {
if (coll_request->n_frag_mpi_complete == coll_request->n_fragments &&
coll_request->n_frag_net_complete == coll_request->n_fragments) {
IBOFFLOAD_VERBOSE(10, ("Deregister user buff.\n"));
if (NULL != coll_request->buffer_info[SBUF].iboffload_reg) {
coll_request->module->device->mpool->mpool_deregister(
coll_request->module->device->mpool,
(mca_mpool_base_registration_t *) coll_request->buffer_info[SBUF].iboffload_reg);
coll_request->buffer_info[SBUF].iboffload_reg = NULL;
}
if (NULL != coll_request->buffer_info[RBUF].iboffload_reg) {
coll_request->module->device->mpool->mpool_deregister(
coll_request->module->device->mpool,
(mca_mpool_base_registration_t *) coll_request->buffer_info[RBUF].iboffload_reg);
coll_request->buffer_info[RBUF].iboffload_reg = NULL;
}
RELEASE_COLLREQ(coll_request);
IBOFFLOAD_VERBOSE(10, ("New bcast done !!!"));
return BCOL_FN_COMPLETE;
}
}
IBOFFLOAD_VERBOSE(10, ("Collective finished (user buffer).\n"));
/* We are not done */
return BCOL_FN_STARTED;
}
int mca_bcol_iboffload_allgather_register(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
IBOFFLOAD_VERBOSE(10, ("Register iboffload Allgather.\n"));
comm_attribs.bcoll_type = BCOL_ALLGATHER;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
mca_bcol_iboffload_k_nomial_allgather_mlbuffer_intra,
mca_bcol_iboffload_collreq_mlbuffer_progress);
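/* The registration above covers the small-message range (bcol_msg_max = 20000)
 * with the ML-buffer k-nomial algorithm; the range below registers the
 * zero-copy (user buffer) k-nomial variant for large messages. */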
inv_attribs.bcol_msg_min = 10000000;
inv_attribs.bcol_msg_max = 10485760; /* range 4 */
/* zero-copy k-nomial algorithm */
#if 1
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra,
mca_bcol_iboffload_collreq_userbuffer_progress);
#endif
/* zero-copy neighbor exchange algorithm */
#if 0
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
mca_bcol_iboffload_neighbor_allgather_userbuffer_intra,
mca_bcol_iboffload_collreq_userbuffer_progress);
#endif
return OMPI_SUCCESS;
}