openmpi/ompi/mca/btl/openib/btl_openib_endpoint.c

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2006-2007 Los Alamos National Security, LLC.  All rights
 *                         reserved. 
 * Copyright (c) 2006-2007 Voltaire All rights reserved.
 *
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */

#include "ompi_config.h"

#include <sys/time.h>
#include <time.h>
#include <errno.h> 
#include <string.h> 

#include "orte/mca/ns/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss.h"

#include "ompi/types.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/class/ompi_free_list.h" 

#include "btl_openib.h"
#include "btl_openib_endpoint.h" 
#include "btl_openib_proc.h"
#include "btl_openib_frag.h"
#include "connect/base.h"

static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);

static int post_send(mca_btl_openib_endpoint_t *ep,
        mca_btl_openib_send_frag_t *frag, const bool rdma)
{
    mca_btl_openib_module_t *openib_btl = ep->endpoint_btl;
    mca_btl_base_segment_t *seg = &to_base_frag(frag)->segment;
    struct ibv_sge *sg = &to_com_frag(frag)->sg_entry;
    struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc;
    struct ibv_send_wr *bad_wr;
    int qp = to_base_frag(frag)->base.order;

    sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) +
        (rdma ? sizeof(mca_btl_openib_footer_t) : 0);

    if(sg->length <= openib_btl->ib_inline_max) {
        sr_desc->send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
    } else {
        sr_desc->send_flags = IBV_SEND_SIGNALED;
    }

    if(ep->nbo)
        BTL_OPENIB_HEADER_HTON(*frag->hdr);

    if(rdma) {
        int32_t head;
        mca_btl_openib_footer_t* ftr =
            (mca_btl_openib_footer_t*)(((char*)seg->seg_addr.pval) +
                    seg->seg_len);
        sr_desc->opcode = IBV_WR_RDMA_WRITE;
        MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length);
        MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
#if OMPI_ENABLE_DEBUG
        ((mca_btl_openib_footer_t*)(((char*)seg->seg_addr.pval) +
            seg->seg_len))->seq = ep->eager_rdma_remote.seq++;
#endif
        if(ep->nbo)
            BTL_OPENIB_FOOTER_HTON(*ftr);

        sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey;
        MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head);
        sr_desc->wr.rdma.remote_addr =
            ep->eager_rdma_remote.base.lval +
            head * openib_btl->eager_rdma_frag_size +
            sizeof(mca_btl_openib_header_t) +
            mca_btl_openib_component.eager_limit +
            sizeof(mca_btl_openib_footer_t);
        sr_desc->wr.rdma.remote_addr -= sg->length;
    } else {
        if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
            sr_desc->opcode = IBV_WR_SEND_WITH_IMM;
            sr_desc->imm_data = ep->rem_info.rem_index;
        } else {
            sr_desc->opcode = IBV_WR_SEND;
        }
    }

    assert(sg->addr == (uint64_t)frag->hdr);

    return ibv_post_send(ep->qps[qp].lcl_qp, sr_desc, &bad_wr);
 }

static inline int acruire_wqe(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    int qp = to_base_frag(frag)->base.order;
    int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);

    if(OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, -1) < 0) {
        OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);
        opal_list_append(&endpoint->qps[qp].pending_frags[prio],
                (opal_list_item_t *)frag);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    return OMPI_SUCCESS;
}

static inline int
acquire_eager_rdma_send_credit(mca_btl_openib_endpoint_t *endpoint)
{
    if(OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, -1) < 0) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    return OMPI_SUCCESS;
}

static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    int qp = to_base_frag(frag)->base.order;
    int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);

    if(BTL_OPENIB_QP_TYPE_PP(qp)) { 
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
            opal_list_append(&endpoint->qps[qp].pending_frags[prio],
                    (opal_list_item_t *)frag);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    } else {
        if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0) {
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
            OPAL_THREAD_LOCK(&openib_btl->ib_lock);
            opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
                             (opal_list_item_t *)frag);
            OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    return OMPI_SUCCESS;
}

#define GET_CREDITS(FROM, TO)                                        \
            do {                                                     \
                TO = FROM;                                           \
            } while(0 == OPAL_ATOMIC_CMPSET_32(&FROM, TO, 0))

/* this function is called with endpoint->endpoint_lock held */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{ 
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc;
    int32_t cm_return;
    bool do_rdma = false;

    if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
        des->order = frag->qp_idx;

    qp = des->order;

    if(acruire_wqe(endpoint, frag) != OMPI_SUCCESS)
        return OMPI_ERR_OUT_OF_RESOURCE;

    if(des->des_src->seg_len <= mca_btl_openib_component.eager_limit &&
            (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
        /* High priority frag. Try to send over eager RDMA */
        if(acquire_eager_rdma_send_credit(endpoint) == OMPI_SUCCESS)
            do_rdma = true;
    }

    if(!do_rdma && acquire_send_credit(endpoint, frag) != OMPI_SUCCESS) {
        OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if(hdr->credits)
        hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;

    hdr->cm_seen = 0;
    if(!do_rdma) {
        if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
        }

        GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
        /* cm_seen is only 8 bytes, but cm_return is 32 bytes */
        if(cm_return > 255) {
            hdr->cm_seen = 255;
            cm_return -= 255;
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
        } else {
            hdr->cm_seen = cm_return;
        }
    }

    ib_rc = post_send(endpoint, frag, do_rdma); 

    if(!ib_rc)
        return OMPI_SUCCESS;

    if(endpoint->nbo)
        BTL_OPENIB_HEADER_NTOH(*hdr);

    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                BTL_OPENIB_CREDITS(hdr->credits));
    }

    OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);

    if(do_rdma) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else {
        if(BTL_OPENIB_QP_TYPE_PP(qp)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                    hdr->credits);
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        } else {
            mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
        }
    }
    BTL_ERROR(("error posting send request error %d: %s\n", 
               ib_rc, strerror(ib_rc))); 
    return OMPI_ERROR; 
}


OBJ_CLASS_INSTANCE(mca_btl_openib_endpoint_t, 
                   opal_list_item_t, mca_btl_openib_endpoint_construct, 
                   mca_btl_openib_endpoint_destruct);

/*
 * Initialize state of the endpoint instance.
 *
 */
static void mca_btl_openib_endpoint_construct_qp(mca_btl_base_endpoint_t *endpoint, int qp)
{
 
    endpoint->qps[qp].lcl_qp = NULL;
    endpoint->qps[qp].rd_credit_send_lock = 0;
    /* setup rem_info */
    endpoint->rem_info.rem_qps[qp].rem_qp_num = 0; 
    endpoint->rem_info.rem_qps[qp].rem_psn = 0;

    OBJ_CONSTRUCT(&endpoint->qps[qp].pending_frags[0], opal_list_t);
    OBJ_CONSTRUCT(&endpoint->qps[qp].pending_frags[1], opal_list_t);
    if(BTL_OPENIB_QP_TYPE_PP(qp)) { 
        /* local credits are set here such that on initial posting
         * of the receive buffers we end up with zero credits to return
         * to our peer. The peer initializes his sd_credits to reflect this 
         * below. Note that this may be a problem for iWARP as the sender 
         * now has credits even if the receive buffers are not yet posted 
         */
        endpoint->qps[qp].u.pp_qp.rd_credits =
            -mca_btl_openib_component.qp_infos[qp].rd_num;
        
        endpoint->qps[qp].u.pp_qp.rd_posted = 0;
        endpoint->qps[qp].u.pp_qp.cm_sent = 0;
        endpoint->qps[qp].u.pp_qp.cm_return = 
            -mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
        endpoint->qps[qp].u.pp_qp.cm_received =
            mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;

        /* initialize the local view of credits */
        endpoint->qps[qp].u.pp_qp.sd_credits = 
            mca_btl_openib_component.qp_infos[qp].rd_num;
        
        /* number of available send wqes */
        endpoint->qps[qp].sd_wqe = mca_btl_openib_component.qp_infos[qp].rd_num;
    
    } else { 
        /* number of available send wqes */
        endpoint->qps[qp].sd_wqe = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
    } 
    
}

static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{

    int qp;
    
    /* setup qp structures */
    if( mca_btl_openib_component.num_qps > 0 ) { 
        endpoint->qps = 
            (mca_btl_openib_endpoint_qp_t*) 
            malloc(sizeof(mca_btl_openib_endpoint_qp_t) * 
                   mca_btl_openib_component.num_qps);
        memset(endpoint->qps, 0, sizeof(mca_btl_openib_endpoint_qp_t) *
                mca_btl_openib_component.num_qps);
        endpoint->rem_info.rem_qps = 
            (mca_btl_openib_rem_qp_info_t*)
            malloc(sizeof(mca_btl_openib_rem_qp_info_t) *
                   mca_btl_openib_component.num_qps);
        memset(endpoint->rem_info.rem_qps, 0,
                sizeof(mca_btl_openib_rem_qp_info_t) *
                mca_btl_openib_component.num_qps);
    }

    endpoint->endpoint_btl = 0;
    endpoint->endpoint_proc = 0;
    endpoint->endpoint_tstamp = 0.0;
    endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
    endpoint->endpoint_retries = 0;
    OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&endpoint->pending_lazy_frags, opal_list_t);
    OBJ_CONSTRUCT(&endpoint->pending_get_frags, opal_list_t);
    OBJ_CONSTRUCT(&endpoint->pending_put_frags, opal_list_t);
    
    endpoint->get_tokens = mca_btl_openib_component.ib_qp_ous_rd_atom;

    /* initialize RDMA eager related parts */
    endpoint->eager_recv_count = 0;
    memset(&endpoint->eager_rdma_remote, 0,
           sizeof(mca_btl_openib_eager_rdma_remote_t));
    memset(&endpoint->eager_rdma_local, 0,
           sizeof(mca_btl_openib_eager_rdma_local_t));
    OBJ_CONSTRUCT(&endpoint->eager_rdma_local.lock, opal_mutex_t);

    endpoint->rem_info.rem_lid = 0; 
    endpoint->rem_info.rem_subnet_id = 0; 
    endpoint->rem_info.rem_mtu = 0;
    endpoint->nbo = false;
    endpoint->use_eager_rdma = false;
    endpoint->eager_rdma_remote.tokens = 0;
    endpoint->eager_rdma_local.credits = 0;
    
    for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { 
        mca_btl_openib_endpoint_construct_qp(endpoint, qp);
    }
}

/*
 * Destroy a endpoint
 *
 */

static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
{
    bool pval_clean = false;
    int qp;
    /* Release memory resources */
    do {
        /* Make sure that mca_btl_openib_endpoint_connect_eager_rdma ()
         * was not in "connect" or "bad" flow (failed to allocate memory)
         * and changed the pointer back to NULL 
         */
        if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                    (void*)1)) {
            if ((void*)1 != endpoint->eager_rdma_local.base.pval && 
                    NULL != endpoint->eager_rdma_local.base.pval) {
                endpoint->endpoint_btl->super.btl_mpool->mpool_free(endpoint->endpoint_btl->super.btl_mpool,
                        endpoint->eager_rdma_local.base.pval, 
                        (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
                pval_clean=true;
            }
        } else {
            pval_clean=true;
        }
    } while (!pval_clean);

    /* Close opened QPs if we have them*/
    if(MCA_BTL_IB_CLOSED != endpoint->endpoint_state) {
        
        for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { 
            MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].pending_frags[0]);
            MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].pending_frags[1]);
            OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[0]);
            OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[1]);
            if(ibv_destroy_qp(endpoint->qps[qp].lcl_qp)) {
                BTL_ERROR(("Failed to destroy QP:%d\n", qp));
            }
        }
        /* free the qps */
        free(endpoint->qps);
    }
    OBJ_DESTRUCT(&endpoint->endpoint_lock);
    /* Clean pending lists */
    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags);
    OBJ_DESTRUCT(&endpoint->pending_lazy_frags);
    
    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags);
    OBJ_DESTRUCT(&endpoint->pending_get_frags);
    
    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags);
    OBJ_DESTRUCT(&endpoint->pending_put_frags);
}


/*
 * call when the connect module has created all the qp's on an
 * endpoint and needs to have some receive buffers posted
 */
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
{
    int qp;

    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { 
        if (BTL_OPENIB_QP_TYPE_SRQ(qp)) { 
            mca_btl_openib_post_srr(endpoint->endpoint_btl, 1, qp);
        } else { 
            mca_btl_openib_endpoint_post_rr(endpoint, qp);
        }
    }
    
    return OMPI_SUCCESS;
}


/*
 * called when the connect module has completed setup of an endpoint
 */
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
    opal_list_item_t *frag_item;
    mca_btl_openib_send_frag_t *frag;
   
    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
    
    /* The connection is correctly setup. Now we can decrease the
       event trigger. */
    opal_progress_event_users_decrement();

    /* While there are frags in the list, process them */
    while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
        frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
        frag = to_send_frag(frag_item);
        /* We need to post this one */
        
        if(OMPI_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag))
            BTL_ERROR(("Error posting send")); 
    }
}

/*
 * Attempt to send a fragment using a given endpoint. If the endpoint is not
 * connected, queue the fragment and start the connection as required.
 */
int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* endpoint,
                                 mca_btl_openib_send_frag_t* frag)
{
    int rc;
    bool call_progress = false;
    
    OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
    switch(endpoint->endpoint_state) {
        case MCA_BTL_IB_CONNECTING:

            BTL_VERBOSE(("Queing because state is connecting"));
            
            opal_list_append(&endpoint->pending_lazy_frags,
                    (opal_list_item_t *)frag);
            call_progress = true;
            rc = OMPI_SUCCESS;
            break;

        case MCA_BTL_IB_CONNECT_ACK:
        case MCA_BTL_IB_WAITING_ACK:
            BTL_VERBOSE(("Queuing because waiting for ack"));

            opal_list_append(&endpoint->pending_lazy_frags,
                    (opal_list_item_t *)frag);
            call_progress = true;
            rc = OMPI_SUCCESS;
            break;

        case MCA_BTL_IB_CLOSED:

            BTL_VERBOSE(("Connection to endpoint closed ... connecting ..."));
            opal_list_append(&endpoint->pending_lazy_frags,
                    (opal_list_item_t *)frag);
            rc = ompi_btl_openib_connect.bcf_start_connect(endpoint);
            /*
             * As long as we expect a message from the peer (in order
             * to setup the connection) let the event engine pool the
             * OOB events. Note: we increment it once peer active
             * connection.
             */
            opal_progress_event_users_increment();
            call_progress = true;
            break;

        case MCA_BTL_IB_FAILED:

            rc = OMPI_ERR_UNREACH;
            break;

        case MCA_BTL_IB_CONNECTED:
            BTL_VERBOSE(("Send to : %d, len : %lu, frag : %p",
                        endpoint->endpoint_proc->proc_guid.vpid,
                        frag->sg_entry.length, frag));
            rc = mca_btl_openib_endpoint_post_send(endpoint, frag);
            if(rc == OMPI_ERR_OUT_OF_RESOURCE )
                rc = OMPI_SUCCESS;
            break; 
        default:
            rc = OMPI_ERR_UNREACH;
            break;
    }
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
    if(call_progress) opal_progress();
    return rc;
}

/**
 * Return control fragment.
 */

static void mca_btl_openib_endpoint_credits(
    mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* ep,
    struct mca_btl_base_descriptor_t* des,
    int status)
{
    
    int qp;
    
    mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(des);
    
    qp = frag->qp_idx;
   
    /* we don't acquire a WQE for credit message - so decrement.
     * Note: doing it for QP used for credit management */
    OPAL_THREAD_ADD32(&ep->qps[des->order].sd_wqe, -1);

    if(check_send_credits(ep, qp) || check_eager_rdma_credits(ep))
        mca_btl_openib_endpoint_send_credits(ep, qp);
    else {
        BTL_OPENIB_CREDITS_SEND_UNLOCK(ep, qp);
        /* check one more time if credits are available after unlock */
        send_credits(ep, qp);
    }
}

/**
 * Return credits to peer
 */
                                                                                                                             
void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
        const int qp)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_rdma_credits_header_t *credits_hdr;
    int ib_rc;
    bool do_rdma = false;
    int32_t cm_return;

    frag = endpoint->qps[qp].credit_frag;

    if(OPAL_UNLIKELY(NULL == frag)) {
        MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, frag, ib_rc);
        frag->qp_idx = qp;
        endpoint->qps[qp].credit_frag = frag;
        /* set those once and forever */
        to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
        to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
        to_base_frag(frag)->base.des_cbdata = NULL;
        to_com_frag(frag)->endpoint = endpoint;
        frag->hdr->tag = MCA_BTL_TAG_BTL;
        to_base_frag(frag)->segment.seg_len =
            sizeof(mca_btl_openib_rdma_credits_header_t);
    }

    assert(frag->qp_idx == qp);
    credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
        to_base_frag(frag)->segment.seg_addr.pval;
    if(acquire_eager_rdma_send_credit(endpoint) == MPI_SUCCESS) {
        do_rdma = true;
    } else {
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
                (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
     }

    GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);

    frag->hdr->cm_seen = 0;
    GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    if(cm_return > 255) {
        frag->hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        frag->hdr->cm_seen = cm_return;
    }

    GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
    credits_hdr->qpn = qp;
    credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;

    if(endpoint->nbo)
         BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);

    ib_rc = post_send(endpoint, frag, do_rdma);

    if(0 == ib_rc)
        return;

    if(endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*frag->hdr);
        BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
    }
    BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
    OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
            frag->hdr->credits);
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
            credits_hdr->rdma_credits);
    if(do_rdma)
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    else
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);

    BTL_ERROR(("error posting send request errno %d says %s", ib_rc,
                strerror(errno)));
}

/* local callback function for completion of eager rdma connect */
static void mca_btl_openib_endpoint_eager_rdma_connect_cb(
    mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_btl_base_descriptor_t* descriptor,
    int status)
{
    MCA_BTL_IB_FRAG_RETURN(descriptor);
}

/* send the eager rdma connect message to the remote endpoint */
static int mca_btl_openib_endpoint_send_eager_rdma(
    mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_eager_rdma_header_t *rdma_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    int rc;

    MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, frag, rc);
    if(NULL == frag) {
        return -1;
    }

    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_eager_rdma_connect_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.seg_len =
        sizeof(mca_btl_openib_eager_rdma_header_t);
    to_com_frag(frag)->endpoint = endpoint;

    frag->hdr->tag = MCA_BTL_TAG_BTL;
    rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval;
    rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
    rdma_hdr->rdma_start.lval = ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval);
    BTL_VERBOSE(("sending rkey %lu, rdma_start.lval %llu, pval %p, ival %u type %d and sizeof(rdma_hdr) %d\n", 
               rdma_hdr->rkey, 
               rdma_hdr->rdma_start.lval, 
               rdma_hdr->rdma_start.pval,
               rdma_hdr->rdma_start.ival,
               rdma_hdr->control.type, 
               sizeof(mca_btl_openib_eager_rdma_header_t)
               ));
    
    if(endpoint->nbo) {
        BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));
        
        BTL_VERBOSE(("after HTON: sending rkey %lu, rdma_start.lval %llu, pval %p, ival %u\n", 
                   rdma_hdr->rkey, 
                   rdma_hdr->rdma_start.lval,
                   rdma_hdr->rdma_start.pval,
                   rdma_hdr->rdma_start.ival
                   ));
    }
    if (mca_btl_openib_endpoint_send(endpoint, frag) != OMPI_SUCCESS) {
        MCA_BTL_IB_FRAG_RETURN(frag);
        BTL_ERROR(("Error sending RDMA buffer", strerror(errno)));
        return -1;
    }
    return 0;
}

/* Setup eager RDMA buffers and notify the remote endpoint*/
void mca_btl_openib_endpoint_connect_eager_rdma(
        mca_btl_openib_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    char *buf;
    mca_btl_openib_recv_frag_t *headers_buf;
    int i;
    orte_std_cntr_t index;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                (void*)1))
        return;

    headers_buf = (mca_btl_openib_recv_frag_t*)
        malloc(sizeof(mca_btl_openib_recv_frag_t) *
            mca_btl_openib_component.eager_rdma_num);

    if(NULL == headers_buf)
       goto unlock_rdma_local;

    buf = openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
            openib_btl->eager_rdma_frag_size *
            mca_btl_openib_component.eager_rdma_num,
            mca_btl_openib_component.buffer_alignment,
            MCA_MPOOL_FLAGS_CACHE_BYPASS,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);

    if(!buf)
       goto free_headers_buf;

    buf = buf + openib_btl->eager_rdma_frag_size -
        sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit -
        sizeof(mca_btl_openib_header_t);

    for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
        ompi_free_list_item_t *item;
        mca_btl_openib_recv_frag_t * frag;
        mca_btl_openib_frag_init_data_t init_data;

        item = (ompi_free_list_item_t*)&headers_buf[i];
        item->registration = (void*)endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t);

        init_data.order = MCA_BTL_NO_ORDER;
        init_data.list = NULL;
        
        mca_btl_openib_frag_init(item, &init_data);                                 
        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        frag->ftr = (mca_btl_openib_footer_t*)
            ((char*)to_base_frag(frag)->segment.seg_addr.pval +
             mca_btl_openib_component.eager_limit);
        
        MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    endpoint->eager_rdma_local.rd_win =
        mca_btl_openib_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1,
            buf);

    if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == 0) {
        /* This can never fail because max number of entries allocated
         * at init time */
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        orte_pointer_array_add(&index, openib_btl->eager_rdma_buffers,
                endpoint);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&openib_btl->eager_rdma_buffers_count, 1);
        return;
    }

    openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
           buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, 
            endpoint->eager_rdma_local.base.pval, NULL);
    endpoint->eager_rdma_local.frags = NULL;
}