diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index bcc176caf6..c41cb3122a 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -148,12 +148,14 @@ int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size if(!first_time) { struct ibv_srq_attr srq_attr; srq_attr.max_wr = openib_btl->rd_num; - rc = ibv_modify_srq( openib_btl->srq_hp, &srq_attr, IBV_SRQ_MAX_WR); + rc = ibv_modify_srq(openib_btl->srq[BTL_OPENIB_HP_QP], + &srq_attr, IBV_SRQ_MAX_WR); if(rc) { BTL_ERROR(("cannot resize high priority shared receive queue, error: %d", rc)); return OMPI_ERROR; } - rc = ibv_modify_srq(openib_btl->srq_lp, &srq_attr, IBV_SRQ_MAX_WR); + rc = ibv_modify_srq(openib_btl->srq[BTL_OPENIB_LP_QP], + &srq_attr, IBV_SRQ_MAX_WR); if(rc) { BTL_ERROR(("cannot resize low priority shared receive queue, error: %d", rc)); return OMPI_ERROR; @@ -673,7 +675,7 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, frag->sg_entry.addr = (unsigned long) frag->base.des_src->seg_addr.pval; frag->sg_entry.length = frag->base.des_src->seg_len; - if(ibv_post_send(endpoint->lcl_qp_lp, + if(ibv_post_send(endpoint->lcl_qp[BTL_OPENIB_LP_QP], &frag->wr_desc.sr_desc, &bad_wr)){ rc = OMPI_ERROR; @@ -682,11 +684,11 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, } if(mca_btl_openib_component.use_srq) { - MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 1); - MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 1); + mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_HP_QP); + mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_LP_QP); } else { - MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1); - MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1); + btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_HP_QP); + btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_LP_QP); } } return rc; @@ -735,7 +737,7 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl, frag->sg_entry.addr = (unsigned long) frag->base.des_dst->seg_addr.pval; 
frag->sg_entry.length = frag->base.des_dst->seg_len; - if(ibv_post_send(endpoint->lcl_qp_lp, + if(ibv_post_send(endpoint->lcl_qp[BTL_OPENIB_LP_QP], &frag->wr_desc.sr_desc, &bad_wr)){ BTL_ERROR(("error posting send request errno (%d) says %s", errno, strerror(errno))); @@ -745,11 +747,11 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl, } if(mca_btl_openib_component.use_srq) { - MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 1); - MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 1); + mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_HP_QP); + mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_LP_QP); } else { - MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1); - MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1); + btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_HP_QP); + btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_LP_QP); } } return rc; @@ -770,25 +772,27 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl) attr.attr.max_wr = mca_btl_openib_component.srq_rd_max; attr.attr.max_sge = mca_btl_openib_component.ib_sg_list_size; - openib_btl->srd_posted_hp = 0; - openib_btl->srd_posted_lp = 0; + openib_btl->srd_posted[BTL_OPENIB_HP_QP] = 0; + openib_btl->srd_posted[BTL_OPENIB_LP_QP] = 0; - openib_btl->srq_hp = ibv_create_srq(openib_btl->hca->ib_pd, &attr); - if(NULL == openib_btl->srq_hp) { + openib_btl->srq[BTL_OPENIB_HP_QP] = + ibv_create_srq(openib_btl->hca->ib_pd, &attr); + if(NULL == openib_btl->srq[BTL_OPENIB_HP_QP]) { BTL_ERROR(("error in ibv_create_srq\n")); return OMPI_ERROR; } - openib_btl->srq_lp = ibv_create_srq(openib_btl->hca->ib_pd, &attr); - if(NULL == openib_btl->srq_hp) { + openib_btl->srq[BTL_OPENIB_LP_QP] = + ibv_create_srq(openib_btl->hca->ib_pd, &attr); + if(NULL == openib_btl->srq[BTL_OPENIB_LP_QP]) { BTL_ERROR(("error in ibv_create_srq\n")); return OMPI_ERROR; } } else { - openib_btl->srq_hp = NULL; - openib_btl->srq_lp = NULL; + openib_btl->srq[BTL_OPENIB_HP_QP] = NULL; + openib_btl->srq[BTL_OPENIB_LP_QP] = NULL; } /* Create 
the low and high priority queue pairs */ diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 3c76743a01..d5916ccadb 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -40,7 +40,8 @@ #include "ompi/mca/btl/btl.h" #include "ompi/mca/btl/base/base.h" -#include "btl_openib_endpoint.h" + +#include "btl_openib_frag.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { @@ -138,6 +139,11 @@ extern mca_btl_openib_component_t mca_btl_openib_component; typedef mca_btl_base_recv_reg_t mca_btl_openib_recv_reg_t; +struct mca_btl_openib_port_info_t { + uint32_t mtu; + uint16_t subnet; +}; +typedef struct mca_btl_openib_port_info_t mca_btl_openib_port_info_t; struct mca_btl_openib_hca_t { struct ibv_device *ib_dev; /* the ib device */ @@ -181,11 +187,8 @@ struct mca_btl_openib_module_t { size_t ib_inline_max; /**< max size of inline send*/ bool poll_cq; - - struct ibv_srq *srq_hp; - struct ibv_srq *srq_lp; - int32_t srd_posted_hp; - int32_t srd_posted_lp; + struct ibv_srq *srq[2]; + int32_t srd_posted[2]; int32_t num_peers; int32_t rd_num; int32_t rd_low; @@ -205,7 +208,6 @@ struct mca_btl_openib_module_t { orte_pointer_array_t *endpoints; }; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; -struct mca_btl_openib_frag_t; extern mca_btl_openib_module_t mca_btl_openib_module; /** @@ -407,75 +409,45 @@ extern void mca_btl_openib_send_frag_return( int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t* openib_btl); - - -#define MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, additional) \ -{ \ - do{ \ - OPAL_THREAD_LOCK(&openib_btl->ib_lock); \ - if(openib_btl->srd_posted_hp <= openib_btl->rd_low+additional && \ - openib_btl->srd_posted_hp < openib_btl->rd_num){ \ - MCA_BTL_OPENIB_POST_SRR_SUB(openib_btl->rd_num - \ - openib_btl->srd_posted_hp, \ - openib_btl, \ - &openib_btl->recv_free_eager, \ - &openib_btl->srd_posted_hp, \ - openib_btl->srq_hp); \ - } \ - 
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); \ - } while(0); \ -} - -#define MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, additional) \ -{ \ - do { \ - OPAL_THREAD_LOCK(&openib_btl->ib_lock); \ - if(openib_btl->srd_posted_lp <= openib_btl->rd_low+additional && \ - openib_btl->srd_posted_lp < openib_btl->rd_num){ \ - MCA_BTL_OPENIB_POST_SRR_SUB(openib_btl->rd_num - \ - openib_btl->srd_posted_lp, \ - openib_btl, \ - &openib_btl->recv_free_max, \ - &openib_btl->srd_posted_lp, \ - openib_btl->srq_lp); \ - } \ - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); \ - } while(0); \ -} - - -#define MCA_BTL_OPENIB_POST_SRR_SUB(cnt, \ - openib_btl, \ - frag_list, \ - srd_posted, \ - srq) \ -{\ - do { \ - int32_t i; \ - int32_t num_post = cnt; \ - ompi_free_list_item_t* item = NULL; \ - mca_btl_openib_frag_t* frag = NULL; \ - struct ibv_recv_wr *bad_wr; \ - int32_t rc; \ - for(i = 0; i < num_post; i++) { \ - OMPI_FREE_LIST_WAIT(frag_list, item, rc); \ - frag = (mca_btl_openib_frag_t*) item; \ - frag->sg_entry.length = frag->size + \ - ((unsigned char*) frag->segment.seg_addr.pval- \ - (unsigned char*) frag->hdr); \ - if(ibv_post_srq_recv(srq, &frag->wr_desc.rd_desc, &bad_wr)) { \ - BTL_ERROR(("error posting receive descriptors to shared receive queue: %s",\ - strerror(errno))); \ - return OMPI_ERROR; \ - }\ - }\ - OPAL_THREAD_ADD32(srd_posted, num_post); \ - } while(0);\ -} - #define BTL_OPENIB_HP_QP 0 #define BTL_OPENIB_LP_QP 1 +/* Refill the shared receive queue of priority prio up to rd_num once its posted + * count is at or below rd_low + additional; OMPI_ERROR if a post fails. */ +static inline int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, + const int additional, const int prio) +{ + OPAL_THREAD_LOCK(&openib_btl->ib_lock); + if(openib_btl->srd_posted[prio] <= openib_btl->rd_low + additional && + openib_btl->srd_posted[prio] < openib_btl->rd_num) { + int32_t i, rc; + int32_t num_post = openib_btl->rd_num - openib_btl->srd_posted[prio]; + ompi_free_list_item_t* item; + mca_btl_openib_frag_t* frag; + struct ibv_recv_wr *bad_wr; + ompi_free_list_t *free_list = 
(BTL_OPENIB_HP_QP == prio) ? + &openib_btl->recv_free_eager : &openib_btl->recv_free_max; + + for(i = 0; i < num_post; i++) { + OMPI_FREE_LIST_WAIT(free_list, item, rc); + frag = (mca_btl_openib_frag_t*)item; + frag->sg_entry.length = frag->size + + ((unsigned char*)frag->segment.seg_addr.pval - + (unsigned char*)frag->hdr); + if(ibv_post_srq_recv(openib_btl->srq[prio], &frag->wr_desc.rd_desc, + &bad_wr)) { + BTL_ERROR(("error posting receive descriptors to shared " + "receive queue: %s", strerror(errno))); + OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); /* don't leak the lock */ + return OMPI_ERROR; + } + } + OPAL_THREAD_ADD32(&openib_btl->srd_posted[prio], num_post); + } + OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); + return OMPI_SUCCESS; +} + #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 5cf19b03cd..cf22b76d0e 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -203,10 +203,10 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl, /* if not sent via rdma */ if(!MCA_BTL_OPENIB_RDMA_FRAG(frag) && ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) { - OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -1); + OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_HP_QP], -1); } } else { - OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -1); + OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_LP_QP], -1); } switch (ctl_hdr->type) { @@ -745,21 +745,23 @@ static int btl_openib_handle_incoming_hp(mca_btl_openib_module_t *openib_btl, /* check to see if we need to return credits */ - if((endpoint->rd_credits_hp >= mca_btl_openib_component.rd_win || + if((endpoint->rd_credits[BTL_OPENIB_HP_QP] >= + mca_btl_openib_component.rd_win || endpoint->eager_rdma_local.credits >= mca_btl_openib_component.rd_win) && - OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, 1) == 1) { + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_HP_QP],1) == 1) { mca_btl_openib_endpoint_send_credits_hp(endpoint); } /* repost receive descriptors 
if receive not by RDMA */ if(!MCA_BTL_OPENIB_RDMA_FRAG(frag)) { if(mca_btl_openib_component.use_srq) { - OPAL_THREAD_ADD32((int32_t*) &openib_btl->srd_posted_hp, -1); - MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 0); + OPAL_THREAD_ADD32((int32_t*)&openib_btl->srd_posted[BTL_OPENIB_HP_QP], -1); + mca_btl_openib_post_srr(openib_btl, 0, BTL_OPENIB_HP_QP); } else { - OPAL_THREAD_ADD32((int32_t*) &endpoint->rd_posted_hp, -1); - MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 0); + OPAL_THREAD_ADD32((int32_t*)&endpoint->rd_posted[BTL_OPENIB_HP_QP], + -1); + btl_openib_endpoint_post_rr(endpoint, 0, BTL_OPENIB_HP_QP); } } @@ -1018,8 +1020,11 @@ static int btl_openib_component_progress(void) if(!mca_btl_openib_component.use_srq) { /* check to see if we need to return credits */ - if((endpoint->rd_credits_hp >= mca_btl_openib_component.rd_win || endpoint->eager_rdma_local.credits >= mca_btl_openib_component.rd_win) && - OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, 1) == 1) { + if((endpoint->rd_credits[BTL_OPENIB_HP_QP] >= + mca_btl_openib_component.rd_win || + endpoint->eager_rdma_local.credits >= + mca_btl_openib_component.rd_win) && + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_HP_QP], 1) == 1) { mca_btl_openib_endpoint_send_credits_hp(endpoint); } @@ -1075,8 +1080,9 @@ static int btl_openib_component_progress(void) if(!mca_btl_openib_component.use_srq) { /* check to see if we need to return credits */ - if( endpoint->rd_credits_lp >= mca_btl_openib_component.rd_win && - OPAL_THREAD_ADD32(&endpoint->sd_credits_lp, 1) == 1) { + if( endpoint->rd_credits[BTL_OPENIB_LP_QP] >= + mca_btl_openib_component.rd_win && + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_LP_QP], 1) == 1) { mca_btl_openib_endpoint_send_credits_lp(endpoint); } } @@ -1119,13 +1125,13 @@ static int btl_openib_component_progress(void) if(mca_btl_openib_component.use_srq) { /* repost receive descriptors */ - OPAL_THREAD_ADD32((int32_t*) &openib_btl->srd_posted_lp, -1); - 
MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 0); + OPAL_THREAD_ADD32((int32_t*)&openib_btl->srd_posted[BTL_OPENIB_LP_QP], -1); + mca_btl_openib_post_srr(openib_btl, 0, BTL_OPENIB_LP_QP); } else { /* repost receive descriptors */ - OPAL_THREAD_ADD32((int32_t*) &endpoint->rd_posted_lp, -1); - MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 0); - + OPAL_THREAD_ADD32((int32_t*) + &endpoint->rd_posted[BTL_OPENIB_LP_QP], -1); + btl_openib_endpoint_post_rr(endpoint, 0, BTL_OPENIB_LP_QP); OPAL_THREAD_ADD32(&endpoint->sd_tokens[BTL_OPENIB_LP_QP], credits); @@ -1135,8 +1141,9 @@ static int btl_openib_component_progress(void) BTL_OPENIB_LP_QP); /* check to see if we need to return credits */ - if(endpoint->rd_credits_lp >= mca_btl_openib_component.rd_win && - OPAL_THREAD_ADD32(&endpoint->sd_credits_lp, 1) == 1) { + if(endpoint->rd_credits[BTL_OPENIB_LP_QP] >= + mca_btl_openib_component.rd_win && + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_LP_QP], 1) == 1) { mca_btl_openib_endpoint_send_credits_lp(endpoint); } } diff --git a/ompi/mca/btl/openib/btl_openib_eager_rdma.h b/ompi/mca/btl/openib/btl_openib_eager_rdma.h index 034ae7a499..89c7a5adc4 100644 --- a/ompi/mca/btl/openib/btl_openib_eager_rdma.h +++ b/ompi/mca/btl/openib/btl_openib_eager_rdma.h @@ -12,7 +12,6 @@ #include "ompi_config.h" #include "btl_openib.h" -#include "btl_openib_endpoint.h" #include "ompi/mca/mpool/openib/mpool_openib.h" #if defined(c_plusplus) || defined(__cplusplus) diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index d2cc82c3e6..7fa221d38e 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -62,7 +62,6 @@ int mca_btl_openib_endpoint_qp_init_query( uint32_t port_num ); - /* * post a send to the work queue */ @@ -131,25 +130,27 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, -frag->hdr->credits); 
frag->hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG; - } else if(endpoint->rd_credits_hp > 0) { - frag->hdr->credits = endpoint->rd_credits_hp; - OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits); + } else if(endpoint->rd_credits[BTL_OPENIB_HP_QP] > 0) { + frag->hdr->credits = endpoint->rd_credits[BTL_OPENIB_HP_QP]; + OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_HP_QP], + -frag->hdr->credits); } else { frag->hdr->credits = 0; } - ib_qp = endpoint->lcl_qp_hp; + ib_qp = endpoint->lcl_qp[BTL_OPENIB_HP_QP]; } else { if(btl_openib_acquire_send_resources(openib_btl, endpoint, frag, BTL_OPENIB_LP_QP, NULL) == OMPI_ERR_OUT_OF_RESOURCE) return MPI_SUCCESS; - if(endpoint->rd_credits_lp > 0) { - frag->hdr->credits = endpoint->rd_credits_lp; - OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -frag->hdr->credits); + if(endpoint->rd_credits[BTL_OPENIB_LP_QP] > 0) { + frag->hdr->credits = endpoint->rd_credits[BTL_OPENIB_LP_QP]; + OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_LP_QP], + -frag->hdr->credits); } else { frag->hdr->credits = 0; } - ib_qp = endpoint->lcl_qp_lp; + ib_qp = endpoint->lcl_qp[BTL_OPENIB_LP_QP]; } frag->sg_entry.length = @@ -202,11 +203,11 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope } if(mca_btl_openib_component.use_srq) { - MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 1); - MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 1); + mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_HP_QP); + mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_LP_QP); } else { - MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1); - MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1); + btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_HP_QP); + btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_LP_QP); } return OMPI_SUCCESS; @@ -240,8 +241,8 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) memset(endpoint->lcl_qp_attr_hp, 0, sizeof(struct ibv_qp_attr)); memset(endpoint->lcl_qp_attr_lp, 0, sizeof(struct 
ibv_qp_attr)); - endpoint->rd_posted_hp = 0; - endpoint->rd_posted_lp = 0; + endpoint->rd_posted[BTL_OPENIB_HP_QP] = 0; + endpoint->rd_posted[BTL_OPENIB_LP_QP] = 0; /* number of available send wqes */ endpoint->sd_wqe[BTL_OPENIB_HP_QP] = mca_btl_openib_component.rd_num; @@ -250,10 +251,10 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) /* zero these out w/ initial posting, so that we start out w/ * zero credits to return to peer */ - endpoint->rd_credits_hp = -(mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv); - endpoint->rd_credits_lp = -(mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv); - endpoint->sd_credits_hp = 0; - endpoint->sd_credits_lp = 0; + endpoint->rd_credits[BTL_OPENIB_HP_QP] = -(mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv); + endpoint->rd_credits[BTL_OPENIB_LP_QP] = -(mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv); + endpoint->sd_credits[BTL_OPENIB_HP_QP] = 0; + endpoint->sd_credits[BTL_OPENIB_LP_QP] = 0; /* initialize the high and low priority tokens */ endpoint->sd_tokens[BTL_OPENIB_HP_QP] = mca_btl_openib_component.rd_num; @@ -313,13 +314,13 @@ static int mca_btl_openib_endpoint_send_connect_data(mca_btl_base_endpoint_t* en /* pack the info in the send buffer */ - rc = orte_dss.pack(buffer, &endpoint->lcl_qp_hp->qp_num, 1, ORTE_UINT32); + rc = orte_dss.pack(buffer, &endpoint->lcl_qp[BTL_OPENIB_HP_QP]->qp_num, 1, ORTE_UINT32); if(rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); return rc; } - rc = orte_dss.pack(buffer, &endpoint->lcl_qp_lp->qp_num, 1, ORTE_UINT32); + rc = orte_dss.pack(buffer, &endpoint->lcl_qp[BTL_OPENIB_LP_QP]->qp_num, 1, ORTE_UINT32); if(rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); return rc; @@ -367,8 +368,8 @@ static int mca_btl_openib_endpoint_send_connect_data(mca_btl_base_endpoint_t* en BTL_VERBOSE(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d", - endpoint->lcl_qp_hp->qp_num, - 
endpoint->lcl_qp_lp->qp_num, + endpoint->lcl_qp[BTL_OPENIB_HP_QP]->qp_num, + endpoint->lcl_qp[BTL_OPENIB_LP_QP]->qp_num, endpoint->endpoint_btl->lid)); if(rc < 0) { @@ -418,9 +419,9 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, openib_btl->hca->ib_pd, openib_btl->ib_cq_hp, - openib_btl->srq_hp, + openib_btl->srq[BTL_OPENIB_HP_QP], endpoint->lcl_qp_attr_hp, - &endpoint->lcl_qp_hp))) { + &endpoint->lcl_qp[BTL_OPENIB_HP_QP]))) { BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } @@ -431,17 +432,17 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, openib_btl->hca->ib_pd, openib_btl->ib_cq_lp, - openib_btl->srq_lp, + openib_btl->srq[BTL_OPENIB_LP_QP], endpoint->lcl_qp_attr_lp, - &endpoint->lcl_qp_lp))) { + &endpoint->lcl_qp[BTL_OPENIB_LP_QP]))) { BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } endpoint->lcl_psn_lp = lrand48() & 0xffffff; BTL_VERBOSE(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", - endpoint->lcl_qp_hp->qp_num, - endpoint->lcl_qp_lp->qp_num, + endpoint->lcl_qp[BTL_OPENIB_HP_QP]->qp_num, + endpoint->lcl_qp[BTL_OPENIB_LP_QP]->qp_num, openib_btl->lid)); /* Send connection info over to remote endpoint */ @@ -468,10 +469,10 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, openib_btl->hca->ib_pd, openib_btl->ib_cq_hp, - openib_btl->srq_hp, + openib_btl->srq[BTL_OPENIB_HP_QP], endpoint->lcl_qp_attr_hp, - &endpoint->lcl_qp_hp))) { + &endpoint->lcl_qp[BTL_OPENIB_HP_QP]))) { BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } @@ -482,18 +483,18 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t if(OMPI_SUCCESS != (rc = 
mca_btl_openib_endpoint_create_qp(openib_btl, openib_btl->hca->ib_pd, openib_btl->ib_cq_lp, - openib_btl->srq_lp, + openib_btl->srq[BTL_OPENIB_LP_QP], endpoint->lcl_qp_attr_lp, - &endpoint->lcl_qp_lp))) { + &endpoint->lcl_qp[BTL_OPENIB_LP_QP]))) { BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } endpoint->lcl_psn_lp = lrand48() & 0xffffff; BTL_VERBOSE(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", - endpoint->lcl_qp_hp->qp_num, - endpoint->lcl_qp_lp->qp_num, + endpoint->lcl_qp[BTL_OPENIB_HP_QP]->qp_num, + endpoint->lcl_qp[BTL_OPENIB_LP_QP]->qp_num, openib_btl->lid)); @@ -871,7 +872,7 @@ int mca_btl_openib_endpoint_connect( /* Connection establishment RC */ rc = mca_btl_openib_endpoint_qp_init_query( openib_btl, - endpoint->lcl_qp_hp, + endpoint->lcl_qp[BTL_OPENIB_HP_QP], endpoint->lcl_qp_attr_hp, endpoint->lcl_psn_hp, endpoint->rem_info.rem_qp_num_hp, @@ -888,7 +889,7 @@ int mca_btl_openib_endpoint_connect( } rc = mca_btl_openib_endpoint_qp_init_query( openib_btl, - endpoint->lcl_qp_lp, + endpoint->lcl_qp[BTL_OPENIB_LP_QP], endpoint->lcl_qp_attr_lp, endpoint->lcl_psn_lp, endpoint->rem_info.rem_qp_num_lp, @@ -908,11 +909,11 @@ int mca_btl_openib_endpoint_connect( MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, endpoint->lp_credit_frag, rc); if(mca_btl_openib_component.use_srq) { - MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 1); - MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 1); + mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_HP_QP); + mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_LP_QP); } else { - MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1); - MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1); + btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_HP_QP); + btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_LP_QP); } return OMPI_SUCCESS; @@ -1066,10 +1067,11 @@ static void mca_btl_openib_endpoint_credits_lp( OPAL_THREAD_ADD32(&endpoint->sd_wqe[BTL_OPENIB_LP_QP],-1); /* check to see if there are addditional 
credits to return */ - if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,-1)) > 0) { - OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,-credits); - if (endpoint->rd_credits_lp >= mca_btl_openib_component.rd_win && - OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,1) == 1) { + if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_LP_QP],-1)) > 0) { + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_LP_QP],-credits); + if (endpoint->rd_credits[BTL_OPENIB_LP_QP] >= + mca_btl_openib_component.rd_win && + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_LP_QP],1) == 1) { mca_btl_openib_endpoint_send_credits_lp(endpoint); } } @@ -1095,9 +1097,10 @@ void mca_btl_openib_endpoint_send_credits_lp( frag->endpoint = endpoint; frag->hdr->tag = MCA_BTL_TAG_BTL; - if(endpoint->rd_credits_lp > 0) { - frag->hdr->credits = endpoint->rd_credits_lp; - OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -frag->hdr->credits); + if(endpoint->rd_credits[BTL_OPENIB_LP_QP] > 0) { + frag->hdr->credits = endpoint->rd_credits[BTL_OPENIB_LP_QP]; + OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_LP_QP], + -frag->hdr->credits); } else { frag->hdr->credits = 0; } @@ -1120,11 +1123,12 @@ void mca_btl_openib_endpoint_send_credits_lp( frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; } - if(ibv_post_send(endpoint->lcl_qp_lp, + if(ibv_post_send(endpoint->lcl_qp[BTL_OPENIB_LP_QP], &frag->wr_desc.sr_desc, &bad_wr)) { - OPAL_THREAD_ADD32(&endpoint->sd_credits_lp, -1); - OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, frag->hdr->credits); + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_LP_QP], -1); + OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_LP_QP], + frag->hdr->credits); MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); - BTL_ERROR(("error posting send request errno %d says %s", strerror(errno))); + BTL_ERROR(("error posting send request errno %d says %s", errno, strerror(errno))); return; @@ -1148,11 +1152,11 @@ static void mca_btl_openib_endpoint_credits_hp( OPAL_THREAD_ADD32(&endpoint->sd_wqe[BTL_OPENIB_HP_QP],-1); /* check to see if there are addditional credits to return 
*/ - if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,-1)) > 0) { - OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,-credits); - if ((endpoint->rd_credits_hp >= mca_btl_openib_component.rd_win || + if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_HP_QP],-1)) > 0) { + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_HP_QP],-credits); + if ((endpoint->rd_credits[BTL_OPENIB_HP_QP] >= mca_btl_openib_component.rd_win || endpoint->eager_rdma_local.credits >= mca_btl_openib_component.rd_win) && - OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,1) == 1) { + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_HP_QP],1) == 1) { mca_btl_openib_endpoint_send_credits_hp(endpoint); } } @@ -1178,9 +1182,10 @@ void mca_btl_openib_endpoint_send_credits_hp( frag->endpoint = endpoint; frag->hdr->tag = MCA_BTL_TAG_BTL; - if(endpoint->rd_credits_hp > 0) { - frag->hdr->credits = endpoint->rd_credits_hp; - OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits); + if(endpoint->rd_credits[BTL_OPENIB_HP_QP] > 0) { + frag->hdr->credits = endpoint->rd_credits[BTL_OPENIB_HP_QP]; + OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_HP_QP], + -frag->hdr->credits); } else frag->hdr->credits = 0; if(endpoint->eager_rdma_local.credits > 0) { @@ -1208,11 +1213,12 @@ void mca_btl_openib_endpoint_send_credits_hp( frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; } - if(ibv_post_send(endpoint->lcl_qp_hp, + if(ibv_post_send(endpoint->lcl_qp[BTL_OPENIB_HP_QP], &frag->wr_desc.sr_desc, &bad_wr)) { - OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, -1); - OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, frag->hdr->credits); + OPAL_THREAD_ADD32(&endpoint->sd_credits[BTL_OPENIB_HP_QP], -1); + OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_HP_QP], + frag->hdr->credits); MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); BTL_ERROR(("error posting send request errno %d says %s", errno, strerror(errno))); diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h 
b/ompi/mca/btl/openib/btl_openib_endpoint.h index ab62935f73..6f3d96cd5a 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -37,13 +37,6 @@ extern "C" { struct mca_btl_openib_frag_t; -struct mca_btl_openib_port_info_t { - uint32_t mtu; - uint16_t subnet; -}; -typedef struct mca_btl_openib_port_info_t mca_btl_openib_port_info_t; - - /** * State of IB endpoint connection. */ @@ -138,9 +131,7 @@ struct mca_btl_base_endpoint_t { uint32_t lcl_psn_lp; /* Local processes port sequence number (Low and High) */ - struct ibv_qp* lcl_qp_hp; - struct ibv_qp* lcl_qp_lp; - /* Local QP (Low and High) */ + struct ibv_qp* lcl_qp[2]; /* Local QP (Low and High) */ struct ibv_qp_attr* lcl_qp_attr_hp; struct ibv_qp_attr* lcl_qp_attr_lp; @@ -149,12 +140,9 @@ struct mca_btl_base_endpoint_t { int32_t sd_tokens[2]; /**< number of send tokens */ int32_t get_tokens; /**< number of available get tokens */ - int32_t rd_posted_hp; /**< number of high priority descriptors posted to the nic*/ - int32_t rd_posted_lp; /**< number of low priority descriptors posted to the nic*/ - int32_t rd_credits_hp; /**< number of high priority credits to return to peer */ - int32_t rd_credits_lp; /**< number of low priority credits to return to peer */ - int32_t sd_credits_hp; /**< number of send wqe entries being used to return credits */ - int32_t sd_credits_lp; /**< number of send wqe entries being used to return credits */ + int32_t rd_posted[2]; /**< number of descriptors posted to the nic*/ + int32_t rd_credits[2]; /**< number of credits to return to peer */ + int32_t sd_credits[2]; /**< number of send wqe entries being used to return credits */ int32_t sd_wqe[2]; /**< number of available send wqe entries */ uint16_t subnet; /**< subnet of this endpoint*/ @@ -181,76 +169,47 @@ void mca_btl_openib_post_recv(void); void mca_btl_openib_endpoint_send_credits_hp(mca_btl_base_endpoint_t*); void 
mca_btl_openib_endpoint_send_credits_lp(mca_btl_base_endpoint_t*); void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*); - -#define MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, \ - additional) \ -{ \ - do { \ - mca_btl_openib_module_t * openib_btl = endpoint->endpoint_btl; \ - OPAL_THREAD_LOCK(&openib_btl->ib_lock); \ - if(endpoint->rd_posted_hp <= mca_btl_openib_component.rd_low+additional && \ - endpoint->rd_posted_hp < openib_btl->rd_num) { \ - MCA_BTL_OPENIB_ENDPOINT_POST_RR_SUB(openib_btl->rd_num - \ - endpoint->rd_posted_hp, \ - endpoint, \ - &openib_btl->recv_free_eager, \ - endpoint->rd_posted_hp, \ - endpoint->rd_credits_hp, \ - endpoint->lcl_qp_hp); \ - } \ - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); \ - } while(0); \ -} -#define MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, \ - additional) { \ - do { \ - mca_btl_openib_module_t * openib_btl = endpoint->endpoint_btl; \ - OPAL_THREAD_LOCK(&openib_btl->ib_lock); \ - if(endpoint->rd_posted_lp <= mca_btl_openib_component.rd_low+additional && \ - endpoint->rd_posted_lp < openib_btl->rd_num){ \ - MCA_BTL_OPENIB_ENDPOINT_POST_RR_SUB(openib_btl->rd_num - \ - endpoint->rd_posted_lp, \ - endpoint, \ - &openib_btl->recv_free_max, \ - endpoint->rd_posted_lp, \ - endpoint->rd_credits_lp, \ - endpoint->lcl_qp_lp \ - ); } \ - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); \ - } while(0); \ -} +/* Refill the receive queue of endpoint's prio QP up to rd_num once rd_posted[prio] + * is at or below rd_low + additional; returns OMPI_ERROR if a post fails. */ +static inline int btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpoint, + const int additional, const int prio) +{ + mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl; -#define MCA_BTL_OPENIB_ENDPOINT_POST_RR_SUB(cnt, \ - my_endpoint, \ - frag_list, \ - rd_posted, \ - rd_credits, \ - qp ) \ -do { \ - int32_t i; \ - int rc; \ - int32_t num_post = cnt; \ - struct ibv_recv_wr* bad_wr; \ - for(i = 0; i < num_post; i++) { \ - ompi_free_list_item_t* item; \ - mca_btl_openib_frag_t* frag; \ - OMPI_FREE_LIST_WAIT(frag_list, item, rc); \ - frag = (mca_btl_openib_frag_t*) item; \ - frag->endpoint = 
my_endpoint; \ - frag->sg_entry.length = frag->size + \ - ((unsigned char*) frag->segment.seg_addr.pval- \ - (unsigned char*) frag->hdr); \ - if(ibv_post_recv(qp, \ - &frag->wr_desc.rd_desc, \ - &bad_wr)) { \ - BTL_ERROR(("error posting receive errno says %s\n", strerror(errno))); \ - return OMPI_ERROR; \ - }\ - }\ - OPAL_THREAD_ADD32(&(rd_posted), num_post); \ - OPAL_THREAD_ADD32(&(rd_credits), num_post); \ -} while(0); + OPAL_THREAD_LOCK(&openib_btl->ib_lock); + if(endpoint->rd_posted[prio] <= + mca_btl_openib_component.rd_low + additional && + endpoint->rd_posted[prio] < openib_btl->rd_num) { + int rc; + int32_t i, num_post = openib_btl->rd_num - endpoint->rd_posted[prio]; + struct ibv_recv_wr* bad_wr; + ompi_free_list_t *free_list = (BTL_OPENIB_HP_QP == prio) ? + &openib_btl->recv_free_eager : &openib_btl->recv_free_max; + + for(i = 0; i < num_post; i++) { + ompi_free_list_item_t* item; + mca_btl_openib_frag_t* frag; + OMPI_FREE_LIST_WAIT(free_list, item, rc); + frag = (mca_btl_openib_frag_t*)item; + frag->endpoint = endpoint; + frag->sg_entry.length = frag->size + + ((unsigned char*)frag->segment.seg_addr.pval - + (unsigned char*)frag->hdr); + if(ibv_post_recv(endpoint->lcl_qp[prio], &frag->wr_desc.rd_desc, + &bad_wr)) { + BTL_ERROR(("error posting receive errno says %s\n", + strerror(errno))); + OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); /* don't leak the lock */ + return OMPI_ERROR; + } + } + OPAL_THREAD_ADD32(&endpoint->rd_posted[prio], num_post); + OPAL_THREAD_ADD32(&endpoint->rd_credits[prio], num_post); + } + OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); + return OMPI_SUCCESS; +} #if defined(c_plusplus) || defined(__cplusplus) } diff --git a/ompi/mca/btl/openib/btl_openib_frag.c b/ompi/mca/btl/openib/btl_openib_frag.c index 1c0c6d6d4f..18a6b82670 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.c +++ b/ompi/mca/btl/openib/btl_openib_frag.c @@ -17,6 +17,7 @@ */ #include "btl_openib_frag.h" +#include "btl_openib_eager_rdma.h" #include "ompi/mca/mpool/openib/mpool_openib.h" diff --git 
a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index ecf6cfe399..a14b74740c 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -20,10 +20,10 @@ #define MCA_BTL_IB_FRAG_H #include "ompi_config.h" -#include "btl_openib.h" #include #include "ompi/mca/mpool/openib/mpool_openib.h" +#include "ompi/mca/btl/btl.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" {