/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2006-2007 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2006-2007 Voltaire All rights reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

/* NOTE(review): the names of the four system headers included here were
 * lost from this copy of the file.  The set below covers the standard
 * facilities this file visibly uses (assert, errno, malloc/free,
 * memset/strerror) -- confirm against the original list. */
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "orte/mca/ns/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss.h"

#include "ompi/types.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/class/ompi_free_list.h"

#include "btl_openib.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_proc.h"
#include "btl_openib_frag.h"
#include "connect/base.h"

static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);

/*
 * Fill in the send work request of a fragment and hand it to
 * ibv_post_send() on the QP selected by the fragment's "order" field.
 *
 * ep    endpoint the fragment is sent on
 * frag  fragment to post
 * rdma  true to write the fragment into the peer's eager RDMA buffer,
 *       false to use a regular send
 *
 * Returns the value of ibv_post_send(): 0 on success, non-zero otherwise.
 */
static int post_send(mca_btl_openib_endpoint_t *ep,
        mca_btl_openib_send_frag_t *frag, const bool rdma)
{
    mca_btl_openib_module_t *openib_btl = ep->endpoint_btl;
    mca_btl_base_segment_t *seg = &to_base_frag(frag)->segment;
    struct ibv_sge *sg = &to_com_frag(frag)->sg_entry;
    struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc;
    struct ibv_send_wr *bad_wr;
    int qp = to_base_frag(frag)->base.order;

    /* payload + header, plus the footer when writing via eager RDMA */
    sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) +
        (rdma ? sizeof(mca_btl_openib_footer_t) : 0);

    if(sg->length <= openib_btl->ib_inline_max) {
        sr_desc->send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
    } else {
        sr_desc->send_flags = IBV_SEND_SIGNALED;
    }

    if(ep->nbo) {
        BTL_OPENIB_HEADER_HTON(*frag->hdr);
    }

    if(rdma) {
        int32_t head;
        /* the footer sits directly behind the payload */
        mca_btl_openib_footer_t* ftr =
            (mca_btl_openib_footer_t*)(((char*)seg->seg_addr.pval) +
                                       seg->seg_len);

        sr_desc->opcode = IBV_WR_RDMA_WRITE;
        MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length);
        MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
#if OMPI_ENABLE_DEBUG
        ftr->seq = ep->eager_rdma_remote.seq++;
#endif
        if(ep->nbo) {
            BTL_OPENIB_FOOTER_HTON(*ftr);
        }

        sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey;
        MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head);
        /* start from the end of remote slot "head" and step back by the
         * message length, so the message ends where the slot ends */
        sr_desc->wr.rdma.remote_addr =
            ep->eager_rdma_remote.base.lval +
            head * openib_btl->eager_rdma_frag_size +
            sizeof(mca_btl_openib_header_t) +
            mca_btl_openib_component.eager_limit +
            sizeof(mca_btl_openib_footer_t);
        sr_desc->wr.rdma.remote_addr -= sg->length;
    } else {
        if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
            sr_desc->opcode = IBV_WR_SEND_WITH_IMM;
            sr_desc->imm_data = ep->rem_info.rem_index;
        } else {
            sr_desc->opcode = IBV_WR_SEND;
        }
    }

    assert(sg->addr == (uint64_t)frag->hdr);

    return ibv_post_send(ep->qps[qp].lcl_qp, sr_desc, &bad_wr);
}

/*
 * Reserve one send WQE on the fragment's QP.
 *
 * When no WQE is left the reservation is undone, the fragment is appended
 * to the QP's pending list (index 0 for priority fragments, 1 otherwise)
 * and OMPI_ERR_OUT_OF_RESOURCE is returned.  Returns OMPI_SUCCESS
 * otherwise.
 */
static inline int acquire_wqe(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    int qp = to_base_frag(frag)->base.order;
    int prio = !(to_base_frag(frag)->base.des_flags &
                 MCA_BTL_DES_FLAGS_PRIORITY);

    if(OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, -1) < 0) {
        OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);
        opal_list_append(&endpoint->qps[qp].pending_frags[prio],
                (opal_list_item_t *)frag);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    return OMPI_SUCCESS;
}

/*
 * Take one eager RDMA token for the endpoint.
 *
 * Returns OMPI_SUCCESS when a token was taken, or
 * OMPI_ERR_OUT_OF_RESOURCE (with the counter restored) when none is left.
 */
static inline int
acquire_eager_rdma_send_credit(mca_btl_openib_endpoint_t *endpoint)
{
    if(OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, -1) < 0) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    return OMPI_SUCCESS;
}

/*
 * Take one send credit for the fragment's QP: from the endpoint for a
 * per-peer QP, from the BTL module for an SRQ QP.
 *
 * When no credit is left the counter is restored, the fragment is queued
 * on the matching pending list (the module-level SRQ list is modified
 * under ib_lock) and OMPI_ERR_OUT_OF_RESOURCE is returned.
 */
static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    int qp = to_base_frag(frag)->base.order;
    int prio = !(to_base_frag(frag)->base.des_flags &
                 MCA_BTL_DES_FLAGS_PRIORITY);

    if(BTL_OPENIB_QP_TYPE_PP(qp)) {
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
            opal_list_append(&endpoint->qps[qp].pending_frags[prio],
                    (opal_list_item_t *)frag);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    } else {
        if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0) {
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
            OPAL_THREAD_LOCK(&openib_btl->ib_lock);
            opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
                    (opal_list_item_t *)frag);
            OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    return OMPI_SUCCESS;
}

/* Move the whole value of counter FROM into TO and reset FROM to zero,
 * retrying until the compare-and-set succeeds. */
#define GET_CREDITS(FROM, TO)                                        \
    do {                                                             \
        TO = FROM;                                                   \
    } while(0 == OPAL_ATOMIC_CMPSET_32(&FROM, TO, 0))

/*
 * Post a send fragment on a connected endpoint.
 *
 * Reserves a WQE and either an eager RDMA token (small priority
 * fragments) or a regular send credit, piggybacks any credits owed to the
 * peer on the header and posts the fragment.
 *
 * Returns OMPI_SUCCESS when posted, OMPI_ERR_OUT_OF_RESOURCE when the
 * fragment was queued for lack of a WQE or credit, and OMPI_ERROR when
 * the post itself failed (all reserved resources are handed back).
 *
 * this function is called with endpoint->endpoint_lock held
 */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc;
    int32_t cm_return;
    bool do_rdma = false;

    if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER)) {
        des->order = frag->qp_idx;
    }

    qp = des->order;

    if(acquire_wqe(endpoint, frag) != OMPI_SUCCESS) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if(des->des_src->seg_len <= mca_btl_openib_component.eager_limit &&
            (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
        /* High priority frag. Try to send over eager RDMA */
        if(acquire_eager_rdma_send_credit(endpoint) == OMPI_SUCCESS) {
            do_rdma = true;
        }
    }

    if(!do_rdma && acquire_send_credit(endpoint, frag) != OMPI_SUCCESS) {
        /* the frag was queued by acquire_send_credit; give the WQE back */
        OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if(hdr->credits) {
        hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
    }

    hdr->cm_seen = 0;
    if(!do_rdma) {
        if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
        }

        GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
        /* cm_seen is only 8 bits, but cm_return is 32 bits */
        if(cm_return > 255) {
            hdr->cm_seen = 255;
            cm_return -= 255;
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
        } else {
            hdr->cm_seen = cm_return;
        }
    }

    ib_rc = post_send(endpoint, frag, do_rdma);
    if(!ib_rc) {
        return OMPI_SUCCESS;
    }

    /* The post failed: undo the byte swap and return every resource
     * taken above. */
    if(endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*hdr);
    }

    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                BTL_OPENIB_CREDITS(hdr->credits));
    }

    OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);

    if(do_rdma) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else {
        if(BTL_OPENIB_QP_TYPE_PP(qp)) {
            /* hdr->credits holds per-peer receive credits only when it is
             * not flagged as eager RDMA credits; flagged credits were
             * never taken from rd_credits and were returned above */
            if(!BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
                OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                        hdr->credits);
            }
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        } else {
            mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
        }
    }

    BTL_ERROR(("error posting send request error %d: %s\n",
               ib_rc, strerror(ib_rc)));
    return OMPI_ERROR;
}

OBJ_CLASS_INSTANCE(mca_btl_openib_endpoint_t,
                   opal_list_item_t, mca_btl_openib_endpoint_construct,
                   mca_btl_openib_endpoint_destruct);

/*
 * Initialize state of the endpoint instance.
* */ static void mca_btl_openib_endpoint_construct_qp(mca_btl_base_endpoint_t *endpoint, int qp) { endpoint->qps[qp].lcl_qp = NULL; endpoint->qps[qp].rd_credit_send_lock = 0; /* setup rem_info */ endpoint->rem_info.rem_qps[qp].rem_qp_num = 0; endpoint->rem_info.rem_qps[qp].rem_psn = 0; OBJ_CONSTRUCT(&endpoint->qps[qp].pending_frags[0], opal_list_t); OBJ_CONSTRUCT(&endpoint->qps[qp].pending_frags[1], opal_list_t); if(BTL_OPENIB_QP_TYPE_PP(qp)) { /* local credits are set here such that on initial posting * of the receive buffers we end up with zero credits to return * to our peer. The peer initializes his sd_credits to reflect this * below. Note that this may be a problem for iWARP as the sender * now has credits even if the receive buffers are not yet posted */ endpoint->qps[qp].u.pp_qp.rd_credits = -mca_btl_openib_component.qp_infos[qp].rd_num; endpoint->qps[qp].u.pp_qp.rd_posted = 0; endpoint->qps[qp].u.pp_qp.cm_sent = 0; endpoint->qps[qp].u.pp_qp.cm_return = -mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; endpoint->qps[qp].u.pp_qp.cm_received = mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; /* initialize the local view of credits */ endpoint->qps[qp].u.pp_qp.sd_credits = mca_btl_openib_component.qp_infos[qp].rd_num; /* number of available send wqes */ endpoint->qps[qp].sd_wqe = mca_btl_openib_component.qp_infos[qp].rd_num; } else { /* number of available send wqes */ endpoint->qps[qp].sd_wqe = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max; } } static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) { int qp; /* setup qp structures */ if( mca_btl_openib_component.num_qps > 0 ) { endpoint->qps = (mca_btl_openib_endpoint_qp_t*) malloc(sizeof(mca_btl_openib_endpoint_qp_t) * mca_btl_openib_component.num_qps); memset(endpoint->qps, 0, sizeof(mca_btl_openib_endpoint_qp_t) * mca_btl_openib_component.num_qps); endpoint->rem_info.rem_qps = (mca_btl_openib_rem_qp_info_t*) malloc(sizeof(mca_btl_openib_rem_qp_info_t) * 
mca_btl_openib_component.num_qps); memset(endpoint->rem_info.rem_qps, 0, sizeof(mca_btl_openib_rem_qp_info_t) * mca_btl_openib_component.num_qps); } endpoint->endpoint_btl = 0; endpoint->endpoint_proc = 0; endpoint->endpoint_tstamp = 0.0; endpoint->endpoint_state = MCA_BTL_IB_CLOSED; endpoint->endpoint_retries = 0; OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->pending_lazy_frags, opal_list_t); OBJ_CONSTRUCT(&endpoint->pending_get_frags, opal_list_t); OBJ_CONSTRUCT(&endpoint->pending_put_frags, opal_list_t); endpoint->get_tokens = mca_btl_openib_component.ib_qp_ous_rd_atom; /* initialize RDMA eager related parts */ endpoint->eager_recv_count = 0; memset(&endpoint->eager_rdma_remote, 0, sizeof(mca_btl_openib_eager_rdma_remote_t)); memset(&endpoint->eager_rdma_local, 0, sizeof(mca_btl_openib_eager_rdma_local_t)); OBJ_CONSTRUCT(&endpoint->eager_rdma_local.lock, opal_mutex_t); endpoint->rem_info.rem_lid = 0; endpoint->rem_info.rem_subnet_id = 0; endpoint->rem_info.rem_mtu = 0; endpoint->nbo = false; endpoint->use_eager_rdma = false; endpoint->eager_rdma_remote.tokens = 0; endpoint->eager_rdma_local.credits = 0; for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { mca_btl_openib_endpoint_construct_qp(endpoint, qp); } } /* * Destroy a endpoint * */ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) { bool pval_clean = false; int qp; /* Release memory resources */ do { /* Make sure that mca_btl_openib_endpoint_connect_eager_rdma () * was not in "connect" or "bad" flow (failed to allocate memory) * and changed the pointer back to NULL */ if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL, (void*)1)) { if ((void*)1 != endpoint->eager_rdma_local.base.pval && NULL != endpoint->eager_rdma_local.base.pval) { endpoint->endpoint_btl->super.btl_mpool->mpool_free(endpoint->endpoint_btl->super.btl_mpool, endpoint->eager_rdma_local.base.pval, 
(mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg); pval_clean=true; } } else { pval_clean=true; } } while (!pval_clean); /* Close opened QPs if we have them*/ if(MCA_BTL_IB_CLOSED != endpoint->endpoint_state) { for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].pending_frags[0]); MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].pending_frags[1]); OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[0]); OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[1]); if(ibv_destroy_qp(endpoint->qps[qp].lcl_qp)) { BTL_ERROR(("Failed to destroy QP:%d\n", qp)); } } /* free the qps */ free(endpoint->qps); } OBJ_DESTRUCT(&endpoint->endpoint_lock); /* Clean pending lists */ MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags); OBJ_DESTRUCT(&endpoint->pending_lazy_frags); MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags); OBJ_DESTRUCT(&endpoint->pending_get_frags); MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags); OBJ_DESTRUCT(&endpoint->pending_put_frags); } /* * call when the connect module has created all the qp's on an * endpoint and needs to have some receive buffers posted */ int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint) { int qp; for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { if (BTL_OPENIB_QP_TYPE_SRQ(qp)) { mca_btl_openib_post_srr(endpoint->endpoint_btl, 1, qp); } else { mca_btl_openib_endpoint_post_rr(endpoint, qp); } } return OMPI_SUCCESS; } /* * called when the connect module has completed setup of an endpoint */ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) { opal_list_item_t *frag_item; mca_btl_openib_send_frag_t *frag; endpoint->endpoint_state = MCA_BTL_IB_CONNECTED; /* The connection is correctly setup. Now we can decrease the event trigger. 
*/ opal_progress_event_users_decrement(); /* While there are frags in the list, process them */ while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) { frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags)); frag = to_send_frag(frag_item); /* We need to post this one */ if(OMPI_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) BTL_ERROR(("Error posting send")); } } /* * Attempt to send a fragment using a given endpoint. If the endpoint is not * connected, queue the fragment and start the connection as required. */ int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* endpoint, mca_btl_openib_send_frag_t* frag) { int rc; bool call_progress = false; OPAL_THREAD_LOCK(&endpoint->endpoint_lock); switch(endpoint->endpoint_state) { case MCA_BTL_IB_CONNECTING: BTL_VERBOSE(("Queing because state is connecting")); opal_list_append(&endpoint->pending_lazy_frags, (opal_list_item_t *)frag); call_progress = true; rc = OMPI_SUCCESS; break; case MCA_BTL_IB_CONNECT_ACK: case MCA_BTL_IB_WAITING_ACK: BTL_VERBOSE(("Queuing because waiting for ack")); opal_list_append(&endpoint->pending_lazy_frags, (opal_list_item_t *)frag); call_progress = true; rc = OMPI_SUCCESS; break; case MCA_BTL_IB_CLOSED: BTL_VERBOSE(("Connection to endpoint closed ... connecting ...")); opal_list_append(&endpoint->pending_lazy_frags, (opal_list_item_t *)frag); rc = ompi_btl_openib_connect.bcf_start_connect(endpoint); /* * As long as we expect a message from the peer (in order * to setup the connection) let the event engine pool the * OOB events. Note: we increment it once peer active * connection. 
*/ opal_progress_event_users_increment(); call_progress = true; break; case MCA_BTL_IB_FAILED: rc = OMPI_ERR_UNREACH; break; case MCA_BTL_IB_CONNECTED: BTL_VERBOSE(("Send to : %d, len : %lu, frag : %p", endpoint->endpoint_proc->proc_guid.vpid, frag->sg_entry.length, frag)); rc = mca_btl_openib_endpoint_post_send(endpoint, frag); if(rc == OMPI_ERR_OUT_OF_RESOURCE ) rc = OMPI_SUCCESS; break; default: rc = OMPI_ERR_UNREACH; break; } OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); if(call_progress) opal_progress(); return rc; } /** * Return control fragment. */ static void mca_btl_openib_endpoint_credits( mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, struct mca_btl_base_descriptor_t* des, int status) { int qp; mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(des); qp = frag->qp_idx; /* we don't acquire a WQE for credit message - so decrement. * Note: doing it for QP used for credit management */ OPAL_THREAD_ADD32(&ep->qps[des->order].sd_wqe, -1); if(check_send_credits(ep, qp) || check_eager_rdma_credits(ep)) mca_btl_openib_endpoint_send_credits(ep, qp); else { BTL_OPENIB_CREDITS_SEND_UNLOCK(ep, qp); /* check one more time if credits are available after unlock */ send_credits(ep, qp); } } /** * Return credits to peer */ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, const int qp) { mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; mca_btl_openib_send_control_frag_t* frag; mca_btl_openib_rdma_credits_header_t *credits_hdr; int ib_rc; bool do_rdma = false; int32_t cm_return; frag = endpoint->qps[qp].credit_frag; if(OPAL_UNLIKELY(NULL == frag)) { MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, frag, ib_rc); frag->qp_idx = qp; endpoint->qps[qp].credit_frag = frag; /* set those once and forever */ to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp; to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits; to_base_frag(frag)->base.des_cbdata = NULL; 
to_com_frag(frag)->endpoint = endpoint; frag->hdr->tag = MCA_BTL_TAG_BTL; to_base_frag(frag)->segment.seg_len = sizeof(mca_btl_openib_rdma_credits_header_t); } assert(frag->qp_idx == qp); credits_hdr = (mca_btl_openib_rdma_credits_header_t*) to_base_frag(frag)->segment.seg_addr.pval; if(acquire_eager_rdma_send_credit(endpoint) == MPI_SUCCESS) { do_rdma = true; } else { if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) > (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) { OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1); BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp); return; } } GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits); frag->hdr->cm_seen = 0; GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return); if(cm_return > 255) { frag->hdr->cm_seen = 255; cm_return -= 255; OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return); } else { frag->hdr->cm_seen = cm_return; } GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits); credits_hdr->qpn = qp; credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS; if(endpoint->nbo) BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr); ib_rc = post_send(endpoint, frag, do_rdma); if(0 == ib_rc) return; if(endpoint->nbo) { BTL_OPENIB_HEADER_NTOH(*frag->hdr); BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr); } BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp); OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits); OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits); if(do_rdma) OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1); else OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1); BTL_ERROR(("error posting send request errno %d says %s", ib_rc, strerror(errno))); } /* local callback function for completion of eager rdma connect */ static void mca_btl_openib_endpoint_eager_rdma_connect_cb( mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, struct 
mca_btl_base_descriptor_t* descriptor, int status) { MCA_BTL_IB_FRAG_RETURN(descriptor); } /* send the eager rdma connect message to the remote endpoint */ static int mca_btl_openib_endpoint_send_eager_rdma( mca_btl_base_endpoint_t* endpoint) { mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; mca_btl_openib_eager_rdma_header_t *rdma_hdr; mca_btl_openib_send_control_frag_t* frag; int rc; MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, frag, rc); if(NULL == frag) { return -1; } to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_eager_rdma_connect_cb; to_base_frag(frag)->base.des_cbdata = NULL; to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp; to_base_frag(frag)->segment.seg_len = sizeof(mca_btl_openib_eager_rdma_header_t); to_com_frag(frag)->endpoint = endpoint; frag->hdr->tag = MCA_BTL_TAG_BTL; rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval; rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA; rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey; rdma_hdr->rdma_start.lval = ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval); BTL_VERBOSE(("sending rkey %lu, rdma_start.lval %llu, pval %p, ival %u type %d and sizeof(rdma_hdr) %d\n", rdma_hdr->rkey, rdma_hdr->rdma_start.lval, rdma_hdr->rdma_start.pval, rdma_hdr->rdma_start.ival, rdma_hdr->control.type, sizeof(mca_btl_openib_eager_rdma_header_t) )); if(endpoint->nbo) { BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr)); BTL_VERBOSE(("after HTON: sending rkey %lu, rdma_start.lval %llu, pval %p, ival %u\n", rdma_hdr->rkey, rdma_hdr->rdma_start.lval, rdma_hdr->rdma_start.pval, rdma_hdr->rdma_start.ival )); } if (mca_btl_openib_endpoint_send(endpoint, frag) != OMPI_SUCCESS) { MCA_BTL_IB_FRAG_RETURN(frag); BTL_ERROR(("Error sending RDMA buffer", strerror(errno))); return -1; } return 0; } /* Setup eager RDMA buffers and notify the remote endpoint*/ void 
mca_btl_openib_endpoint_connect_eager_rdma( mca_btl_openib_endpoint_t* endpoint) { mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; char *buf; mca_btl_openib_recv_frag_t *headers_buf; int i; orte_std_cntr_t index; /* Set local rdma pointer to 1 temporarily so other threads will not try * to enter the function */ if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL, (void*)1)) return; headers_buf = (mca_btl_openib_recv_frag_t*) malloc(sizeof(mca_btl_openib_recv_frag_t) * mca_btl_openib_component.eager_rdma_num); if(NULL == headers_buf) goto unlock_rdma_local; buf = openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool, openib_btl->eager_rdma_frag_size * mca_btl_openib_component.eager_rdma_num, mca_btl_openib_component.buffer_alignment, MCA_MPOOL_FLAGS_CACHE_BYPASS, (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg); if(!buf) goto free_headers_buf; buf = buf + openib_btl->eager_rdma_frag_size - sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit - sizeof(mca_btl_openib_header_t); for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) { ompi_free_list_item_t *item; mca_btl_openib_recv_frag_t * frag; mca_btl_openib_frag_init_data_t init_data; item = (ompi_free_list_item_t*)&headers_buf[i]; item->registration = (void*)endpoint->eager_rdma_local.reg; item->ptr = buf + i * openib_btl->eager_rdma_frag_size; OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t); init_data.order = MCA_BTL_NO_ORDER; init_data.list = NULL; mca_btl_openib_frag_init(item, &init_data); frag = to_recv_frag(item); to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA; to_com_frag(frag)->endpoint = endpoint; frag->ftr = (mca_btl_openib_footer_t*) ((char*)to_base_frag(frag)->segment.seg_addr.pval + mca_btl_openib_component.eager_limit); MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr); } endpoint->eager_rdma_local.frags = headers_buf; endpoint->eager_rdma_local.rd_win = mca_btl_openib_component.eager_rdma_num >> 2; 
endpoint->eager_rdma_local.rd_win = endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1; /* set local rdma pointer to real value */ opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1, buf); if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == 0) { /* This can never fail because max number of entries allocated * at init time */ OBJ_RETAIN(endpoint); assert(((opal_object_t*)endpoint)->obj_reference_count == 2); orte_pointer_array_add(&index, openib_btl->eager_rdma_buffers, endpoint); /* from this point progress function starts to poll new buffer */ OPAL_THREAD_ADD32(&openib_btl->eager_rdma_buffers_count, 1); return; } openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool, buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg); free_headers_buf: free(headers_buf); unlock_rdma_local: /* set local rdma pointer back to zero. Will retry later */ opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, endpoint->eager_rdma_local.base.pval, NULL); endpoint->eager_rdma_local.frags = NULL; }