From b6d50a5733706c77f4b08d3e18bccbb29c5ba15d Mon Sep 17 00:00:00 2001 From: Mike Dubman Date: Wed, 26 Dec 2012 10:19:12 +0000 Subject: [PATCH] Performance optimizations by alexm: * btl sendi(): if the message can be sent inline, try to avoid requesting a signal * a signal is requested once per 64 sends, when there are no send wqes, when the message cannot be sent inline, or by any btl method other than sendi() This commit was SVN r27724. --- ompi/mca/btl/openib/btl_openib.c | 27 ++++++-- ompi/mca/btl/openib/btl_openib_component.c | 6 +- ompi/mca/btl/openib/btl_openib_endpoint.c | 11 +++- ompi/mca/btl/openib/btl_openib_endpoint.h | 72 ++++++++++++++++++++-- ompi/mca/btl/openib/btl_openib_frag.c | 1 + ompi/mca/btl/openib/btl_openib_frag.h | 2 + 6 files changed, 106 insertions(+), 13 deletions(-) diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 708278d946..2f62fb241f 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -1544,6 +1544,7 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl, ompi_free_list_item_t* item = NULL; mca_btl_openib_frag_t *frag; mca_btl_openib_header_t *hdr; + int send_signaled; OPAL_THREAD_LOCK(&ep->endpoint_lock); @@ -1644,12 +1645,22 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl, hdr->cm_seen = cm_return; } - ib_rc = post_send(ep, to_send_frag(item), do_rdma); +#if BTL_OPENIB_FAILOVER_ENABLED + send_signaled = 0; +#else + send_signaled = qp_need_signal(ep, qp, payload_size + header_size, do_rdma); +#endif + ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled); if(!ib_rc) { + if (0 == send_signaled) { + MCA_BTL_IB_FRAG_RETURN(frag); + } #if BTL_OPENIB_FAILOVER_ENABLED - /* Return up in case needed for failover */ - *descriptor = (struct mca_btl_base_descriptor_t *) frag; + else { + /* Return up in case needed for failover */ + *descriptor = (struct mca_btl_base_descriptor_t *) frag; + } #endif OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return OMPI_SUCCESS; @@ -1784,7 
+1795,11 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, /* Setting opcode on a frag constructor isn't enough since prepare_src * may return send_frag instead of put_frag */ frag->sr_desc.opcode = IBV_WR_RDMA_WRITE; - frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp])); + frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]), 1); + + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); + qp_reset_signal_count(ep, qp); + if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) return OMPI_ERROR; @@ -1863,6 +1878,10 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num; #endif descriptor->order = qp; + + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); + qp_reset_signal_count(ep, qp); + if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) return OMPI_ERROR; diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 9d6397e556..d3206b066d 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -3280,6 +3280,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, mca_btl_openib_module_t *openib_btl = NULL; ompi_proc_t* remote_proc = NULL; int qp, btl_ownership; + int n; des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id; frag = to_com_frag(des); @@ -3343,8 +3344,11 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, /* return send wqe */ qp_put_wqe(endpoint, qp); + /* return wqes that were sent before this frag */ + n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des)); + if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) { - OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); + OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n); /* new SRQ credit available. 
Try to progress pending frags*/ progress_pending_frags_srq(openib_btl, qp); diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index d160ffe478..e1ed96a46d 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -152,7 +152,8 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint, hdr->cm_seen = cm_return; } - ib_rc = post_send(endpoint, frag, do_rdma); + qp_reset_signal_count(endpoint, qp); + ib_rc = post_send(endpoint, frag, do_rdma, 1); if(!ib_rc) return OMPI_SUCCESS; @@ -287,8 +288,11 @@ static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp) break; default: BTL_ERROR(("Wrong QP type")); - break; + return; } + + ep_qp->qp->sd_wqe_inflight = 0; + ep_qp->qp->wqe_count = QP_TX_BATCH_COUNT; } void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl, @@ -815,7 +819,8 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, if(endpoint->nbo) BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr); - if((rc = post_send(endpoint, frag, do_rdma)) == 0) + qp_reset_signal_count(endpoint, qp); + if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0) return; if(endpoint->nbo) { diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 72583ac1e0..a62d238f5f 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -37,6 +37,8 @@ #include "ompi/mca/btl/base/btl_base_error.h" #include "connect/base.h" +#define QP_TX_BATCH_COUNT 64 + BEGIN_C_DECLS struct mca_btl_openib_frag_t; @@ -133,6 +135,8 @@ typedef struct mca_btl_openib_qp_t { struct ibv_qp *lcl_qp; uint32_t lcl_psn; int32_t sd_wqe; /**< number of available send wqe entries */ + int32_t sd_wqe_inflight; + int wqe_count; int users; opal_mutex_t lock; } mca_btl_openib_qp_t; @@ -270,6 +274,54 @@ static inline int32_t qp_put_wqe(mca_btl_openib_endpoint_t *ep, const int qp) 
return OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe, 1); } + +static inline int32_t qp_inc_inflight_wqe(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag) +{ + frag->n_wqes_inflight = 0; + return OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe_inflight, 1); +} + +static inline void qp_inflight_wqe_to_frag(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag) +{ + + frag->n_wqes_inflight = ep->qps[qp].qp->sd_wqe_inflight; + ep->qps[qp].qp->sd_wqe_inflight = 0; +} + +static inline int qp_frag_to_wqe(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag) +{ + int n; + n = frag->n_wqes_inflight; + OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe, n); + frag->n_wqes_inflight = 0; + + return n; +} + +static inline int qp_need_signal(mca_btl_openib_endpoint_t *ep, const int qp, size_t size, int rdma) +{ + + /* note that size here is payload only */ + if (ep->qps[qp].qp->sd_wqe <= 0 || + size + sizeof(mca_btl_openib_header_t) + (rdma ? sizeof(mca_btl_openib_footer_t) : 0) > ep->qps[qp].ib_inline_max) { + ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT; + return 1; + } + + if (0 < --ep->qps[qp].qp->wqe_count) { + return 0; + } + + ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT; + return 1; +} + +static inline void qp_reset_signal_count(mca_btl_openib_endpoint_t *ep, const int qp) +{ + ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT; +} + + int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t*, mca_btl_openib_send_frag_t*); int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*, @@ -457,10 +509,14 @@ static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep, } static inline __opal_attribute_always_inline__ int -ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp) +ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp, int do_signal) { - return IBV_SEND_SIGNALED | - ((size <= qp->ib_inline_max) ? 
IBV_SEND_INLINE : 0); + if (do_signal) { + return IBV_SEND_SIGNALED | + ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0); + } else { + return ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0); + } } static inline int @@ -475,7 +531,7 @@ acquire_eager_rdma_send_credit(mca_btl_openib_endpoint_t *endpoint) } static inline int post_send(mca_btl_openib_endpoint_t *ep, - mca_btl_openib_send_frag_t *frag, const bool rdma) + mca_btl_openib_send_frag_t *frag, const bool rdma, int do_signal) { mca_btl_openib_module_t *openib_btl = ep->endpoint_btl; mca_btl_openib_segment_t *seg = &to_base_frag(frag)->segment; @@ -487,7 +543,7 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep, sg->length = seg->base.seg_len + sizeof(mca_btl_openib_header_t) + (rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length; - sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp])); + sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp]), do_signal); if(ep->nbo) BTL_OPENIB_HEADER_HTON(*frag->hdr); @@ -545,6 +601,12 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep, #endif assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr); + if (sr_desc->send_flags & IBV_SEND_SIGNALED) { + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); + } else { + qp_inc_inflight_wqe(ep, qp, to_com_frag(frag)); + } + return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr); } diff --git a/ompi/mca/btl/openib/btl_openib_frag.c b/ompi/mca/btl/openib/btl_openib_frag.c index c357771d84..382f28955a 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.c +++ b/ompi/mca/btl/openib/btl_openib_frag.c @@ -61,6 +61,7 @@ static void com_constructor(mca_btl_openib_com_frag_t *frag) frag->sg_entry.lkey = reg->mr->lkey; base_frag->segment.key = reg->mr->lkey; } + frag->n_wqes_inflight = 0; } static void out_constructor(mca_btl_openib_out_frag_t *frag) diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index 1e8b88471e..a4b4611fef 100644 --- 
a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -306,6 +306,8 @@ typedef struct mca_btl_openib_com_frag_t { struct ibv_sge sg_entry; struct mca_btl_openib_reg_t *registration; struct mca_btl_base_endpoint_t *endpoint; + /* number of unsignaled frags sent before this frag. */ + uint32_t n_wqes_inflight; } mca_btl_openib_com_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t);