diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 7dd8fc630d..7c4a19338b 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -505,23 +505,124 @@ ib_frag_alloc(mca_btl_openib_module_t *btl, size_t size, uint8_t order) return &to_base_frag(item)->base; } +/* check if pending fragment has enough space for coalescing */ +static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list, + opal_mutex_t *lock, mca_btl_base_endpoint_t *ep, size_t size) +{ + mca_btl_openib_send_frag_t *frag = NULL; + + if(opal_list_is_empty(frag_list)) + return NULL; + + OPAL_THREAD_LOCK(lock); + if(!opal_list_is_empty(frag_list)) { + int qp; + size_t total_length; + opal_list_item_t *i = opal_list_get_first(frag_list); + frag = to_send_frag(i); + if(to_com_frag(frag)->endpoint != ep || + MCA_BTL_OPENIB_FRAG_CONTROL == openib_frag_type(frag)) { + OPAL_THREAD_UNLOCK(lock); + return NULL; + } + + total_length = size + frag->coalesced_length + + to_base_frag(frag)->segment.seg_len + + sizeof(mca_btl_openib_header_coalesced_t); + + qp = to_base_frag(frag)->base.order; + + if(total_length <= mca_btl_openib_component.qp_infos[qp].size) + opal_list_remove_first(frag_list); + else + frag = NULL; + } + OPAL_THREAD_UNLOCK(lock); + + return frag; +} + /** * Allocate a segment. * * @param btl (IN) BTL module * @param size (IN) Request segment size. 
- * @param size (IN) Size of segment to allocate + * @param size (IN) Size of segment to allocate * * When allocating a segment we pull a pre-alllocated segment * from one of two free lists, an eager list and a max list */ mca_btl_base_descriptor_t* mca_btl_openib_alloc( struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, + struct mca_btl_base_endpoint_t* ep, uint8_t order, size_t size) { - return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order); + mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl; + int qp = frag_size_to_order(obtl, size); + mca_btl_openib_send_frag_t *sfrag = NULL; + mca_btl_openib_coalesced_frag_t *cfrag; + + assert(qp != MCA_BTL_NO_ORDER); + + if(mca_btl_openib_component.use_message_coalescing) { + sfrag = check_coalescing(&ep->qps[qp].qp->pending_frags[0], + &ep->qps[qp].qp->lock, ep, size); + + if(NULL == sfrag) { + if(BTL_OPENIB_QP_TYPE_PP(qp)) { + sfrag = check_coalescing(&ep->qps[qp].pending_frags[0], + &ep->endpoint_lock, ep, size); + } else { + sfrag = check_coalescing( + &obtl->qps[qp].u.srq_qp.pending_frags[0], + &obtl->ib_lock, ep, size); + } + } + } + + if(NULL == sfrag) + return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order); + + /* begin coalescing message */ + MCA_BTL_IB_FRAG_ALLOC_COALESCED(obtl, cfrag); + cfrag->send_frag = sfrag; + + /* fix up new coalescing header if this is the first coalesced frag */ + if(sfrag->hdr != sfrag->chdr) { + mca_btl_openib_control_header_t *ctrl_hdr; + mca_btl_openib_header_coalesced_t *clsc_hdr; + uint8_t org_tag; + + org_tag = sfrag->hdr->tag; + sfrag->hdr = sfrag->chdr; + ctrl_hdr = (mca_btl_openib_control_header_t*)(sfrag->hdr + 1); + clsc_hdr = (mca_btl_openib_header_coalesced_t*)(ctrl_hdr + 1); + sfrag->hdr->tag = MCA_BTL_TAG_BTL; + ctrl_hdr->type = MCA_BTL_OPENIB_CONTROL_COALESCED; + clsc_hdr->tag = org_tag; + clsc_hdr->size = to_base_frag(sfrag)->segment.seg_len; + clsc_hdr->alloc_size = to_base_frag(sfrag)->segment.seg_len; + 
sfrag->coalesced_length = sizeof(mca_btl_openib_control_header_t) + + sizeof(mca_btl_openib_header_coalesced_t); + to_com_frag(sfrag)->sg_entry.addr = (uint64_t)sfrag->hdr; + } + + cfrag->hdr = (mca_btl_openib_header_coalesced_t*) + (((unsigned char*)(sfrag->hdr + 1)) + sfrag->coalesced_length + + to_base_frag(sfrag)->segment.seg_len); + cfrag->hdr->alloc_size = size; + + /* point coalesced frag pointer into a data buffer */ + to_base_frag(cfrag)->segment.seg_addr.pval = cfrag->hdr + 1; + to_base_frag(cfrag)->segment.seg_len = size; + + /* save coalesced fragment on a main fragment; we will need it after send + * completion to free it and to call upper layer callback */ + opal_list_append(&sfrag->coalesced_frags, (opal_list_item_t*)cfrag); + sfrag->coalesced_length += (size+sizeof(mca_btl_openib_header_coalesced_t)); + + return &to_base_frag(cfrag)->base; } /** @@ -548,16 +649,27 @@ int mca_btl_openib_free( /* reset those field on free so we will not have to do it on alloc */ to_base_frag(des)->base.des_flags = 0; - if(MCA_BTL_OPENIB_FRAG_RECV == openib_frag_type(des) || - MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) { - to_base_frag(des)->base.des_src = NULL; - to_base_frag(des)->base.des_src_cnt = 0; - } else if(MCA_BTL_OPENIB_FRAG_SEND == openib_frag_type(des) || - MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) { - to_base_frag(des)->base.des_dst = NULL; - to_base_frag(des)->base.des_dst_cnt = 0; - if(MCA_BTL_OPENIB_FRAG_SEND == openib_frag_type(des)) + switch(openib_frag_type(des)) { + case MCA_BTL_OPENIB_FRAG_RECV: + case MCA_BTL_OPENIB_FRAG_RECV_USER: + to_base_frag(des)->base.des_src = NULL; + to_base_frag(des)->base.des_src_cnt = 0; + break; + case MCA_BTL_OPENIB_FRAG_SEND: + to_send_frag(des)->hdr = (mca_btl_openib_header_t*) + (((unsigned char*)to_send_frag(des)->chdr) + + sizeof(mca_btl_openib_header_coalesced_t) + + sizeof(mca_btl_openib_control_header_t)); to_com_frag(des)->sg_entry.addr = (uint64_t)to_send_frag(des)->hdr; + 
to_send_frag(des)->coalesced_length = 0; + assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags)); + /* fall through */ + case MCA_BTL_OPENIB_FRAG_SEND_USER: + to_base_frag(des)->base.des_dst = NULL; + to_base_frag(des)->base.des_dst_cnt = 0; + break; + default: + break; } MCA_BTL_IB_FRAG_RETURN(des); @@ -664,9 +776,10 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( if(max_data + reserve > btl->btl_max_send_size) { max_data = btl->btl_max_send_size - reserve; } - - frag = (mca_btl_openib_com_frag_t*) - ib_frag_alloc(openib_btl, max_data + reserve, order); + + frag = (mca_btl_openib_com_frag_t*)(reserve ? + ib_frag_alloc(openib_btl, max_data + reserve, order) : + mca_btl_openib_alloc(btl, endpoint, order, max_data)); if(NULL == frag) return NULL; @@ -941,19 +1054,27 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl) int mca_btl_openib_send( struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* descriptor, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, mca_btl_base_tag_t tag) { - mca_btl_openib_send_frag_t* frag = to_send_frag(descriptor); + mca_btl_openib_send_frag_t *frag; - assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND); - - to_com_frag(frag)->endpoint = endpoint; - frag->hdr->tag = tag; + assert(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND || + openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED); + + if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) { + to_coalesced_frag(des)->hdr->tag = tag; + to_coalesced_frag(des)->hdr->size = des->des_src->seg_len; + frag = to_coalesced_frag(des)->send_frag; + } else { + frag = to_send_frag(des); + to_com_frag(des)->endpoint = ep; + frag->hdr->tag = tag; + } - return mca_btl_openib_endpoint_send(endpoint, frag); + return mca_btl_openib_endpoint_send(ep, frag); } /* diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 
30a414898b..4cfccc24e2 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -195,6 +195,7 @@ struct mca_btl_openib_component_t { if_[in|ex]clude list that we use for error checking (to ensure that they all exist) */ char **if_list; + bool use_message_coalescing; #ifdef HAVE_IBV_FORK_INIT /** Whether we want fork support or not */ int want_fork_support; @@ -316,6 +317,8 @@ struct mca_btl_openib_module_t { ompi_free_list_t send_free_control; /**< frags for control massages */ + ompi_free_list_t send_free_coalesced; /**< frags for coalesced messages */ + opal_mutex_t ib_lock; /**< module level lock */ size_t ib_inline_max; /**< max size of inline send*/ diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 0a67b5daff..673f1dd3a6 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -215,10 +215,14 @@ static void btl_openib_control(mca_btl_base_module_t* btl, void* cbdata) { /* don't return credits used for control messages */ + mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl; mca_btl_openib_endpoint_t* ep = to_com_frag(des)->endpoint; mca_btl_openib_control_header_t *ctl_hdr = to_base_frag(des)->segment.seg_addr.pval; mca_btl_openib_eager_rdma_header_t *rdma_hdr; + mca_btl_openib_header_coalesced_t *clsc_hdr = + (mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1); + size_t len = des->des_dst->seg_len - sizeof(*ctl_hdr); switch (ctl_hdr->type) { case MCA_BTL_OPENIB_CONTROL_CREDITS: @@ -251,6 +255,27 @@ static void btl_openib_control(mca_btl_base_module_t* btl, ep->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval; ep->eager_rdma_remote.tokens=mca_btl_openib_component.eager_rdma_num - 1; break; + case MCA_BTL_OPENIB_CONTROL_COALESCED: + while(len > 0) { + size_t skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size); + mca_btl_base_descriptor_t tmp_des; + mca_btl_base_segment_t tmp_seg; + + assert(len >= 
sizeof(*clsc_hdr)); + + tmp_des.des_dst = &tmp_seg; + tmp_des.des_dst_cnt = 1; + tmp_seg.seg_addr.pval = clsc_hdr + 1; + tmp_seg.seg_len = clsc_hdr->size; + + /* call registered callback */ + obtl->ib_reg[clsc_hdr->tag].cbfunc(&obtl->super, clsc_hdr->tag, + &tmp_des, obtl->ib_reg[clsc_hdr->tag].cbdata); + len -= skip; + clsc_hdr = (mca_btl_openib_header_coalesced_t*) + (((unsigned char*)clsc_hdr) + skip); + } + break; default: BTL_ERROR(("Unknown message type received by BTL")); break; @@ -739,6 +764,7 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t); OBJ_CONSTRUCT(&openib_btl->send_free_control, ompi_free_list_t); + OBJ_CONSTRUCT(&openib_btl->send_free_coalesced, ompi_free_list_t); OBJ_CONSTRUCT(&openib_btl->send_user_free, ompi_free_list_t); OBJ_CONSTRUCT(&openib_btl->recv_user_free, ompi_free_list_t); @@ -771,7 +797,7 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) init_data->order = mca_btl_openib_component.rdma_qp; init_data->list = &openib_btl->recv_user_free; - if(OMPI_SUCCESS != ompi_free_list_init_ex_new(&openib_btl->recv_user_free, + if(OMPI_SUCCESS != ompi_free_list_init_ex_new(&openib_btl->recv_user_free, sizeof(mca_btl_openib_get_frag_t), 2, OBJ_CLASS(mca_btl_openib_get_frag_t), 0, 0, @@ -802,7 +828,21 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) init_data)) { return OMPI_ERROR; } - + + init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t)); + length = sizeof(mca_btl_openib_coalesced_frag_t); + + init_data->list = &openib_btl->send_free_coalesced; + + if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->send_free_coalesced, + length, 2, OBJ_CLASS(mca_btl_openib_coalesced_frag_t), + mca_btl_openib_component.ib_free_list_num, + mca_btl_openib_component.ib_free_list_max, + mca_btl_openib_component.ib_free_list_inc, + NULL, mca_btl_openib_frag_init, init_data)) { + return OMPI_ERROR; + } + /* setup all the qps */ for(qp = 0; qp < 
mca_btl_openib_component.num_qps; qp++) { OBJ_CONSTRUCT(&openib_btl->qps[qp].send_free, ompi_free_list_t); @@ -819,9 +859,11 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t)); /* Initialize pool of send fragments */ - length = sizeof(mca_btl_openib_header_t) + - sizeof(mca_btl_openib_footer_t) + - mca_btl_openib_component.qp_infos[qp].size; + length = sizeof(mca_btl_openib_header_t) + + sizeof(mca_btl_openib_header_coalesced_t) + + sizeof(mca_btl_openib_control_header_t) + + sizeof(mca_btl_openib_footer_t) + + mca_btl_openib_component.qp_infos[qp].size; init_data->order = qp; init_data->list = &openib_btl->qps[qp].send_free; @@ -840,6 +882,8 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t)); length = sizeof(mca_btl_openib_header_t) + + sizeof(mca_btl_openib_header_coalesced_t) + + sizeof(mca_btl_openib_control_header_t) + sizeof(mca_btl_openib_footer_t) + mca_btl_openib_component.qp_infos[qp].size; @@ -867,6 +911,8 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) openib_btl->eager_rdma_frag_size = OPAL_ALIGN( sizeof(mca_btl_openib_header_t) + + sizeof(mca_btl_openib_header_coalesced_t) + + sizeof(mca_btl_openib_control_header_t) + sizeof(mca_btl_openib_footer_t) + openib_btl->super.btl_eager_limit, mca_btl_openib_component.buffer_alignment, size_t); @@ -1598,8 +1644,16 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca) case IBV_WC_RDMA_WRITE: case IBV_WC_SEND: + if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { + opal_list_item_t *i; + while((i = opal_list_remove_first( + &to_send_frag(des)->coalesced_frags))) { + to_base_frag(i)->base.des_cbfunc(&openib_btl->super, + endpoint, &to_base_frag(i)->base, OMPI_SUCCESS); + } + } /* Process a completed send/put/get */ - des->des_cbfunc(&openib_btl->super, endpoint, des, OMPI_SUCCESS); + des->des_cbfunc(&openib_btl->super, 
endpoint, des,OMPI_SUCCESS); /* return send wqe */ qp_put_wqe(endpoint, qp); diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index fd40ded457..70a8576614 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -39,10 +39,8 @@ #include "ompi/mca/pml/base/pml_base_sendreq.h" #include "ompi/class/ompi_free_list.h" -#include "btl_openib.h" #include "btl_openib_endpoint.h" #include "btl_openib_proc.h" -#include "btl_openib_frag.h" #include "btl_openib_xrc.h" static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint); @@ -59,7 +57,7 @@ static int post_send(mca_btl_openib_endpoint_t *ep, int qp = to_base_frag(frag)->base.order; sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) + - (rdma ? sizeof(mca_btl_openib_footer_t) : 0); + (rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length; if(sg->length <= openib_btl->ib_inline_max) { sr_desc->send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE; @@ -73,8 +71,8 @@ static int post_send(mca_btl_openib_endpoint_t *ep, if(rdma) { int32_t head; mca_btl_openib_footer_t* ftr = - (mca_btl_openib_footer_t*)(((char*)seg->seg_addr.pval) + - seg->seg_len); + (mca_btl_openib_footer_t*)(((char*)frag->hdr) + sg->length - + sizeof(mca_btl_openib_footer_t)); sr_desc->opcode = IBV_WR_RDMA_WRITE; MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length); MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr); @@ -184,6 +182,7 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint, int qp, ib_rc; int32_t cm_return; bool do_rdma = false; + size_t eager_limit; if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER)) des->order = frag->qp_idx; @@ -193,7 +192,10 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint, if(acruire_wqe(endpoint, frag) != OMPI_SUCCESS) return OMPI_ERR_OUT_OF_RESOURCE; - if(des->des_src->seg_len <= mca_btl_openib_component.eager_limit && + eager_limit = 
mca_btl_openib_component.eager_limit + + sizeof(mca_btl_openib_header_coalesced_t) + + sizeof(mca_btl_openib_control_header_t); + if(des->des_src->seg_len + frag->coalesced_length <= eager_limit && (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) { /* High priority frag. Try to send over eager RDMA */ if(acquire_eager_rdma_send_credit(endpoint) == OMPI_SUCCESS) diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index cf98ee4e6c..961c1ccaa2 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -28,8 +28,8 @@ #include "opal/event/event.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/btl/btl.h" -#include "btl_openib_frag.h" #include "btl_openib.h" +#include "btl_openib_frag.h" #include "btl_openib_eager_rdma.h" #include #include diff --git a/ompi/mca/btl/openib/btl_openib_frag.c b/ompi/mca/btl/openib/btl_openib_frag.c index 6010a7c32a..144e09f084 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.c +++ b/ompi/mca/btl/openib/btl_openib_frag.c @@ -19,6 +19,7 @@ * $HEADER$ */ +#include "btl_openib.h" #include "btl_openib_frag.h" #include "btl_openib_eager_rdma.h" @@ -31,7 +32,9 @@ void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx) to_recv_frag(frag)->qp_idx = init_data->order; to_com_frag(frag)->sg_entry.length = mca_btl_openib_component.qp_infos[init_data->order].size + - sizeof(mca_btl_openib_header_t); + sizeof(mca_btl_openib_header_t) + + sizeof(mca_btl_openib_header_coalesced_t) + + sizeof(mca_btl_openib_control_header_t); } if(MCA_BTL_OPENIB_FRAG_SEND == frag->type) @@ -92,10 +95,15 @@ static void send_constructor(mca_btl_openib_send_frag_t *frag) base_frag->type = MCA_BTL_OPENIB_FRAG_SEND; - frag->hdr = (mca_btl_openib_header_t*)base_frag->base.super.ptr; - base_frag->segment.seg_addr.pval = - ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); + frag->chdr = (mca_btl_openib_header_t*)base_frag->base.super.ptr; + frag->hdr = 
(mca_btl_openib_header_t*) + (((unsigned char*)base_frag->base.super.ptr) + + sizeof(mca_btl_openib_header_coalesced_t) + + sizeof(mca_btl_openib_control_header_t)); + base_frag->segment.seg_addr.pval = frag->hdr + 1; to_com_frag(frag)->sg_entry.addr = (uint64_t)frag->hdr; + frag->coalesced_length = 0; + OBJ_CONSTRUCT(&frag->coalesced_frags, opal_list_t); } static void recv_constructor(mca_btl_openib_recv_frag_t *frag) @@ -138,6 +146,18 @@ static void get_constructor(mca_btl_openib_get_frag_t *frag) frag->sr_desc.next = NULL; } +static void coalesced_constructor(mca_btl_openib_coalesced_frag_t *frag) +{ + mca_btl_openib_frag_t *base_frag = to_base_frag(frag); + + base_frag->type = MCA_BTL_OPENIB_FRAG_COALESCED; + + base_frag->base.des_src = &base_frag->segment; + base_frag->base.des_src_cnt = 1; + base_frag->base.des_dst = NULL; + base_frag->base.des_dst_cnt = 0; +} + OBJ_CLASS_INSTANCE( mca_btl_openib_frag_t, mca_btl_base_descriptor_t, @@ -191,3 +211,9 @@ OBJ_CLASS_INSTANCE( mca_btl_openib_in_frag_t, get_constructor, NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_openib_coalesced_frag_t, + mca_btl_openib_frag_t, + coalesced_constructor, + NULL); diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index 4bbe3bdd90..3f18e19656 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -53,6 +53,11 @@ do { \ (h).credits = ntohs((h).credits); \ } while (0) +typedef struct mca_btl_openib_header_coalesced_t { + mca_btl_base_tag_t tag; + uint32_t size; + uint32_t alloc_size; +} mca_btl_openib_header_coalesced_t; struct mca_btl_openib_footer_t { #if OMPI_ENABLE_DEBUG @@ -101,8 +106,9 @@ typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t; #endif -#define MCA_BTL_OPENIB_CONTROL_CREDITS 0 -#define MCA_BTL_OPENIB_CONTROL_RDMA 1 +#define MCA_BTL_OPENIB_CONTROL_CREDITS 0 +#define MCA_BTL_OPENIB_CONTROL_RDMA 1 +#define MCA_BTL_OPENIB_CONTROL_COALESCED 2 struct mca_btl_openib_control_header_t { 
uint8_t type; @@ -153,7 +159,8 @@ enum mca_btl_openib_frag_type_t { MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER, MCA_BTL_OPENIB_FRAG_EAGER_RDMA, - MCA_BTL_OPENIB_FRAG_CONTROL + MCA_BTL_OPENIB_FRAG_CONTROL, + MCA_BTL_OPENIB_FRAG_COALESCED }; typedef enum mca_btl_openib_frag_type_t mca_btl_openib_frag_type_t; @@ -199,9 +206,11 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_in_frag_t); typedef struct mca_btl_openib_send_frag_t { mca_btl_openib_out_frag_t super; - mca_btl_openib_header_t *hdr; + mca_btl_openib_header_t *hdr, *chdr; mca_btl_openib_footer_t *ftr; uint8_t qp_idx; + uint32_t coalesced_length; + opal_list_t coalesced_frags; } mca_btl_openib_send_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t); @@ -235,6 +244,16 @@ typedef struct mca_btl_openib_send_frag_t mca_btl_openib_send_control_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t); #define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t*)(f)) + +typedef struct mca_btl_openib_coalesced_frag_t { + mca_btl_openib_frag_t super; + mca_btl_openib_send_frag_t *send_frag; + mca_btl_openib_header_coalesced_t *hdr; +} mca_btl_openib_coalesced_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t); + +#define to_coalesced_frag(f) ((mca_btl_openib_coalesced_frag_t*)(f)) + /* * Allocate an IB send descriptor * @@ -247,6 +266,17 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t); frag = to_send_control_frag(item); \ } while(0) +static inline uint8_t frag_size_to_order(mca_btl_openib_module_t* btl, + size_t size) +{ + int qp; + for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) + if(mca_btl_openib_component.qp_infos[qp].size >= size) + return qp; + + return MCA_BTL_NO_ORDER; +} + #define MCA_BTL_IB_FRAG_ALLOC_SEND_USER(btl, frag, rc) \ do { \ ompi_free_list_item_t *item; \ @@ -261,6 +291,14 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t); frag = to_com_frag(item); \ } while(0) +#define MCA_BTL_IB_FRAG_ALLOC_COALESCED(btl, frag) \ + do { \ + 
int ign_rc; \ + ompi_free_list_item_t *item; \ + OMPI_FREE_LIST_GET(&(btl)->send_free_coalesced, item, ign_rc) \ + frag = to_coalesced_frag(item); \ + } while(0) + #define MCA_BTL_IB_FRAG_RETURN(frag) \ do { \ OMPI_FREE_LIST_RETURN(to_base_frag(frag)->list, \ diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index ca21b0a58f..047d7600f9 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -416,6 +416,10 @@ int btl_openib_register_mca_params(void) mca_btl_openib_component.buffer_alignment = (uint32_t) ival; } + CHECK(reg_int("use_message_coalescing", + "Use message coalescing", 1, &ival, 0)); + mca_btl_openib_component.use_message_coalescing = (0 != ival); + /* Info only */ mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version,