diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 6be048855a..dfb6738522 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -228,16 +228,14 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl, /* if not sent via rdma */ if(!MCA_BTL_OPENIB_RDMA_FRAG(frag) && ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) { - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, -1); - /* assert(endpoint->qps[qp].u.pp_qp.rd_credits >= -(mca_btl_openib_component.qp_infos[qp].rd_num - mca_btl_openib_component.qp_infos[qp].rd_low)); */ + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, 1); + OPAL_THREAD_ADD32((int32_t*)&endpoint->qps[qp].u.pp_qp.rd_posted, 1); } } else if (ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) { - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, -1); - /* assert(endpoint->qps[qp].u.pp_qp.rd_credits >= -(mca_btl_openib_component.qp_infos[qp].rd_num - mca_btl_openib_component.qp_infos[qp].rd_low)); */ + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, 1); + OPAL_THREAD_ADD32((int32_t*)&endpoint->qps[qp].u.pp_qp.rd_posted, 1); } - - switch (ctl_hdr->type) { case MCA_BTL_OPENIB_CONTROL_CREDITS: credits_hdr = (mca_btl_openib_rdma_credits_header_t*)ctl_hdr; @@ -1163,6 +1161,11 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl, } } + if(frag->hdr->cm_seen) { + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, + -frag->hdr->cm_seen); + } + /* We may receive credits here so try to progress only things that * may be pending because of credit shortage */ if(MCA_BTL_OPENIB_PP_QP == endpoint->qps[qp].qp_type || @@ -1606,10 +1609,10 @@ error: } if(wc.status != IBV_WC_WR_FLUSH_ERR || !flush_err_printed[cq]++) { BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s " - "status number %d for wr_id %llu opcode %d", + "status number %d for wr_id %llu opcode %d qp_idx %d", 
cq_name[cq], btl_openib_component_status_to_string(wc.status), - wc.status, wc.wr_id, wc.opcode)); + wc.status, wc.wr_id, wc.opcode, frag->qp_idx)); abort(); } if(wc.status == IBV_WC_RETRY_EXC_ERR) { diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index 7d71ebdb15..37399788ae 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -220,6 +220,14 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope } else { frag->hdr->credits = 0; } + + if(endpoint->qps[qp].u.pp_qp.cm_return) { + frag->hdr->cm_seen = endpoint->qps[qp].u.pp_qp.cm_return; + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, + -frag->hdr->cm_seen); + } else { + frag->hdr->cm_seen = 0; + } ib_rc = post_send(openib_btl, endpoint, frag, qp, do_rdma); if(ib_rc) { @@ -287,10 +295,14 @@ static void mca_btl_openib_endpoint_construct_qp(mca_btl_base_endpoint_t *endpoi * now has credits even if the receive buffers are not yet posted */ endpoint->qps[qp].u.pp_qp.rd_credits = - -(mca_btl_openib_component.qp_infos[qp].rd_num + - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv); + -mca_btl_openib_component.qp_infos[qp].rd_num; endpoint->qps[qp].u.pp_qp.rd_posted = 0; + endpoint->qps[qp].u.pp_qp.cm_sent = 0; + endpoint->qps[qp].u.pp_qp.cm_return = + -mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; + endpoint->qps[qp].u.pp_qp.cm_received = + mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; /* initialize the local view of credits */ endpoint->qps[qp].u.pp_qp.sd_credits = @@ -1302,6 +1314,16 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, } } + if(0 == do_rdma) { + if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) > + (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) { + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1); + OPAL_THREAD_ADD32(&endpoint->qps[qp].rd_pending_credit_chks, + 
-endpoint->qps[qp].rd_pending_credit_chks); + return; + } + } + frag->base.des_cbfunc = mca_btl_openib_endpoint_credits; frag->base.des_cbdata = NULL; frag->endpoint = endpoint; @@ -1315,6 +1337,14 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, } else { frag->hdr->credits = 0; } + if(endpoint->qps[qp].u.pp_qp.cm_return) { + frag->hdr->cm_seen = endpoint->qps[qp].u.pp_qp.cm_return; + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, + -frag->hdr->cm_seen); + } else { + frag->hdr->cm_seen = 0; + } + /* send eager RDMA credits only for high prio */ if(BTL_OPENIB_EAGER_RDMA_QP(qp) && endpoint->eager_rdma_local.credits > 0) { credits_hdr->rdma_credits = endpoint->eager_rdma_local.credits; @@ -1340,6 +1370,8 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits); if(do_rdma) OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1); + else + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1); BTL_ERROR(("error posting send request errno %d says %s", ib_rc, strerror(errno))); } diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 0173a992fc..fc130351db 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -97,6 +97,9 @@ struct mca_btl_openib_endpoint_pp_qp_t { */ int32_t rd_posted; /**< number of descriptors posted to the nic*/ int32_t rd_credits; /**< number of credits to return to peer */ + int32_t cm_received; /**< Credit messages received */ + int32_t cm_return; /**< how many credits to return */ + int32_t cm_sent; /**< Outstanding number of credit messages */ }; typedef struct mca_btl_openib_endpoint_pp_qp_t mca_btl_openib_endpoint_pp_qp_t; @@ -211,26 +214,25 @@ static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpo const int qp) { mca_btl_openib_module_t *openib_btl = 
endpoint->endpoint_btl; - int rd_num = - mca_btl_openib_component.qp_infos[qp].rd_num + - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; + int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num; + int rd_rsv = mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; + int cm_received, rd_posted, rd_low; assert(MCA_BTL_OPENIB_PP_QP == endpoint->qps[qp].qp_type); OPAL_THREAD_LOCK(&openib_btl->ib_lock); - if((endpoint->qps[qp].u.pp_qp.rd_posted - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) <= - mca_btl_openib_component.qp_infos[qp].rd_low + additional && - endpoint->qps[qp].u.pp_qp.rd_posted < - rd_num) { + cm_received = endpoint->qps[qp].u.pp_qp.cm_received; + rd_posted = endpoint->qps[qp].u.pp_qp.rd_posted; + rd_low = mca_btl_openib_component.qp_infos[qp].rd_low; + + if(cm_received >= (rd_rsv >> 2) || rd_posted <= rd_low) { int rc; - int32_t i, num_post = rd_num - endpoint->qps[qp].u.pp_qp.rd_posted; + int32_t i, num_post = rd_num - rd_posted; struct ibv_recv_wr* bad_wr; ompi_free_list_t *free_list; - assert(num_post >= 0); - free_list = &openib_btl->qps[qp].recv_free; - for(i = 0; i < num_post; i++) { + for(i = 0; i < (num_post + cm_received); i++) { ompi_free_list_item_t* item; mca_btl_openib_frag_t* frag; OMPI_FREE_LIST_WAIT(free_list, item, rc); @@ -246,8 +248,16 @@ static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpo return OMPI_ERROR; } } - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, num_post); - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, num_post); + if(num_post > 0) { + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, num_post); + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, num_post); + } + if(cm_received > 0) { + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, + cm_received); + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, + -cm_received); + } assert(endpoint->qps[qp].u.pp_qp.rd_credits < rd_num); assert(endpoint->qps[qp].u.pp_qp.rd_credits >= 0); 
} diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index d5a15566df..288dbe0abe 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -36,6 +36,7 @@ struct mca_btl_openib_header_t { uint8_t padding[1]; #endif uint16_t credits; + uint16_t cm_seen; }; typedef struct mca_btl_openib_header_t mca_btl_openib_header_t; #define BTL_OPENIB_RDMA_CREDITS_FLAG (1<<15)