diff --git a/ompi/mca/btl/mvapi/btl_mvapi.c b/ompi/mca/btl/mvapi/btl_mvapi.c index dba6ff9d44..ee96746cdc 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.c +++ b/ompi/mca/btl/mvapi/btl_mvapi.c @@ -31,7 +31,7 @@ #include "mca/mpool/mpool.h" #include "mca/mpool/mvapi/mpool_mvapi.h" #include "mca/btl/base/btl_base_error.h" - +#include mca_btl_mvapi_module_t mca_btl_mvapi_module = { { &mca_btl_mvapi_component.super, @@ -790,9 +790,13 @@ int mca_btl_mvapi_put( mca_btl_base_module_t* btl, if(VAPI_OK != frag->ret){ return OMPI_ERROR; } - MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1); - MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1); - + if(mca_btl_mvapi_component.use_srq) { + MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1); + MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1); + } else { + MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1); + MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1); + } return OMPI_SUCCESS; } @@ -854,7 +858,8 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) /* Allocate Protection Domain */ VAPI_ret_t ret; uint32_t cqe_cnt = 0; - + VAPI_srq_attr_t srq_attr, srq_attr_out; + ret = VAPI_alloc_pd(mvapi_btl->nic, &mvapi_btl->ptag); if(ret != VAPI_OK) { @@ -862,6 +867,35 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) return OMPI_ERROR; } + if(mca_btl_mvapi_component.use_srq) { + mvapi_btl->srr_posted_high = 0; + mvapi_btl->srr_posted_low = 0; + srq_attr.pd_hndl = mvapi_btl->ptag; + srq_attr.max_outs_wr = mca_btl_mvapi_component.ib_wq_size; + srq_attr.max_sentries = mca_btl_mvapi_component.ib_sg_list_size; + srq_attr.srq_limit = mca_btl_mvapi_component.ib_wq_size; + + ret = VAPI_create_srq(mvapi_btl->nic, + &srq_attr, + &mvapi_btl->srq_hndl_high, + &srq_attr_out); + if(ret != VAPI_OK) { + BTL_ERROR("error in VAPI_create_srq: %s", VAPI_strerror(ret)); + return OMPI_ERROR; + } + ret = VAPI_create_srq(mvapi_btl->nic, + &srq_attr, + &mvapi_btl->srq_hndl_low, + &srq_attr_out); + if(ret != VAPI_OK) { + BTL_ERROR("error in VAPI_create_srq: 
%s", VAPI_strerror(ret)); + return OMPI_ERROR; + } + + } else { + mvapi_btl->srq_hndl_high = VAPI_INVAL_SRQ_HNDL; + mvapi_btl->srq_hndl_low = VAPI_INVAL_SRQ_HNDL; + } ret = VAPI_create_cq(mvapi_btl->nic, mca_btl_mvapi_component.ib_cq_size, &mvapi_btl->cq_hndl_low, &cqe_cnt); diff --git a/ompi/mca/btl/mvapi/btl_mvapi.h b/ompi/mca/btl/mvapi/btl_mvapi.h index a9eb0049f7..6dac862d0f 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.h +++ b/ompi/mca/btl/mvapi/btl_mvapi.h @@ -99,7 +99,7 @@ struct mca_btl_mvapi_component_t { uint32_t leave_pinned; uint32_t reg_mru_len; - + uint32_t use_srq; uint32_t ib_cq_size; /**< Max outstanding CQE on the CQ */ uint32_t ib_wq_size; /**< Max outstanding WR on the WQ */ @@ -116,7 +116,7 @@ struct mca_btl_mvapi_component_t { uint32_t ib_service_level; uint32_t ib_static_rate; uint32_t ib_src_path_bits; - + }; typedef struct mca_btl_mvapi_component_t mca_btl_mvapi_component_t; @@ -163,14 +163,92 @@ struct mca_btl_mvapi_module_t { mca_mpool_base_module_t* ib_pool; /**< ib memory pool */ VAPI_rr_desc_t* rr_desc_post; - + VAPI_srq_hndl_t srq_hndl_high; /**< A high priority shared receive queue + runtime optional, can also use a receive queue + per queue pair.. 
*/ + VAPI_srq_hndl_t srq_hndl_low; /**< A low priority shared receive queue */ + + uint32_t srr_posted_high; /**< number of high priority shared rr posted to the nic*/ + uint32_t srr_posted_low; /**< number of low priority shared rr posted to the nic*/ + /**< an array to allow posting of rr in one swoop */ size_t ib_inline_max; /**< max size of inline send*/ }; typedef struct mca_btl_mvapi_module_t mca_btl_mvapi_module_t; - struct mca_btl_mvapi_frag_t; + +#define MCA_BTL_MVAPI_POST_SRR_HIGH(post_srr_high_mvapi_btl, \ + post_srr_high_additional) \ +{ /* with ib_lock held: when srr_posted_high is at most ib_rr_buf_min plus the additional count and below ib_rr_buf_max, post enough recv_free_eager frags to the high priority SRQ to reach ib_rr_buf_max */ \ + OPAL_THREAD_LOCK(&post_srr_high_mvapi_btl->ib_lock); \ + if(post_srr_high_mvapi_btl->srr_posted_high <= mca_btl_mvapi_component.ib_rr_buf_min+post_srr_high_additional && \ + post_srr_high_mvapi_btl->srr_posted_high < mca_btl_mvapi_component.ib_rr_buf_max){ \ + MCA_BTL_MVAPI_POST_SRR_SUB(mca_btl_mvapi_component.ib_rr_buf_max - \ + post_srr_high_mvapi_btl->srr_posted_high, \ + post_srr_high_mvapi_btl, \ + &post_srr_high_mvapi_btl->recv_free_eager, \ + &post_srr_high_mvapi_btl->srr_posted_high, \ + post_srr_high_mvapi_btl->nic, \ + post_srr_high_mvapi_btl->srq_hndl_high); \ + } \ + OPAL_THREAD_UNLOCK(&post_srr_high_mvapi_btl->ib_lock); \ +} + +#define MCA_BTL_MVAPI_POST_SRR_LOW(post_srr_low_mvapi_btl, \ + post_srr_low_additional) \ +{ /* with ib_lock held: when srr_posted_low is at most ib_rr_buf_min plus the additional count and below ib_rr_buf_max, post enough recv_free_max frags to the low priority SRQ to reach ib_rr_buf_max */ \ + OPAL_THREAD_LOCK(&post_srr_low_mvapi_btl->ib_lock); \ + if(post_srr_low_mvapi_btl->srr_posted_low <= mca_btl_mvapi_component.ib_rr_buf_min+post_srr_low_additional && \ + post_srr_low_mvapi_btl->srr_posted_low < mca_btl_mvapi_component.ib_rr_buf_max){ \ + MCA_BTL_MVAPI_POST_SRR_SUB(mca_btl_mvapi_component.ib_rr_buf_max - \ + post_srr_low_mvapi_btl->srr_posted_low, \ + post_srr_low_mvapi_btl, \ + &post_srr_low_mvapi_btl->recv_free_max, \ + &post_srr_low_mvapi_btl->srr_posted_low, \ + post_srr_low_mvapi_btl->nic, \ + post_srr_low_mvapi_btl->srq_hndl_low); \ + } \ + OPAL_THREAD_UNLOCK(&post_srr_low_mvapi_btl->ib_lock); \ +} + + +#define MCA_BTL_MVAPI_POST_SRR_SUB(post_srr_sub_cnt, \ + 
post_srr_sub_mvapi_btl, \ + post_srr_sub_frag_list, \ + post_srr_sub_srr_posted, \ + post_srr_sub_nic, \ + post_srr_sub_srq_hndl) \ +{ /* takes cnt frags from the free list, copies each rr_desc into rr_desc_post, posts them with one VAPI_post_srq call and adds cnt to the posted counter on success; cnt must be at least 1 because post_srr_sub_frag is dereferenced after the loop */ \ + uint32_t post_srr_sub_i; \ + uint32_t post_srr_sub_rwqe_posted; \ + int post_srr_sub_rc; \ + opal_list_item_t* post_srr_sub_item; \ + mca_btl_mvapi_frag_t* post_srr_sub_frag; \ + VAPI_rr_desc_t* post_srr_sub_desc_post = post_srr_sub_mvapi_btl->rr_desc_post; \ + for(post_srr_sub_i = 0; post_srr_sub_i < post_srr_sub_cnt; post_srr_sub_i++) { \ + OMPI_FREE_LIST_WAIT(post_srr_sub_frag_list, post_srr_sub_item, post_srr_sub_rc); \ + post_srr_sub_frag = (mca_btl_mvapi_frag_t*) post_srr_sub_item; \ + post_srr_sub_frag->sg_entry.len = post_srr_sub_frag->size + \ + ((unsigned char*) post_srr_sub_frag->segment.seg_addr.pval- \ + (unsigned char*) post_srr_sub_frag->hdr); \ + post_srr_sub_desc_post[post_srr_sub_i] = post_srr_sub_frag->rr_desc; \ + }\ + post_srr_sub_frag->ret = VAPI_post_srq( post_srr_sub_nic, \ + post_srr_sub_srq_hndl, \ + post_srr_sub_cnt, \ + post_srr_sub_desc_post, \ + &post_srr_sub_rwqe_posted); \ + if(VAPI_OK != post_srr_sub_frag->ret) { \ + BTL_ERROR("error posting receive descriptors to shared receive queue: %s",\ + VAPI_strerror(post_srr_sub_frag->ret)); \ + } else if(post_srr_sub_rwqe_posted < 1) { \ + BTL_ERROR("error posting receive descriptors to shared receive queue, number of entries posted is %d", post_srr_sub_rwqe_posted); \ + } else {\ + OPAL_THREAD_ADD32(post_srr_sub_srr_posted, post_srr_sub_cnt); \ + }\ +} +struct mca_btl_mvapi_frag_t; extern mca_btl_mvapi_module_t mca_btl_mvapi_module; /** diff --git a/ompi/mca/btl/mvapi/btl_mvapi_component.c b/ompi/mca/btl/mvapi/btl_mvapi_component.c index 25034a3ca2..fa15937db3 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_component.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_component.c @@ -128,6 +128,9 @@ int mca_btl_mvapi_component_open(void) mca_btl_mvapi_param_register_int("rr_buf_min", 8); mca_btl_mvapi_component.reg_mru_len = mca_btl_mvapi_param_register_int("reg_mru_len", 
16); + mca_btl_mvapi_component.use_srq = + mca_btl_mvapi_param_register_int("use_srq", 0); + mca_btl_mvapi_component.ib_cq_size = mca_btl_mvapi_param_register_int("ib_cq_size", 40000); @@ -527,9 +530,15 @@ int mca_btl_mvapi_component_progress() mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata); OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_eager), (opal_list_item_t*) frag); - OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1); - MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0); + + if(mca_btl_mvapi_component.use_srq) { + OPAL_THREAD_ADD32(&mvapi_btl->srr_posted_high, -1); + MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 0); + } else { + OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1); + MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0); + } count++; break; @@ -577,11 +586,15 @@ int mca_btl_mvapi_component_progress() mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata); OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_max), (opal_list_item_t*) frag); - OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1); - MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0); - + if(mca_btl_mvapi_component.use_srq) { + OPAL_THREAD_ADD32(&mvapi_btl->srr_posted_low, -1); + MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 0); + } else { + OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1); + MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0); + } count++; break; diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c index a1cfc9ff59..8ebb07c25d 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c @@ -39,6 +39,7 @@ int mca_btl_mvapi_endpoint_create_qp( VAPI_hca_hndl_t nic, VAPI_pd_hndl_t ptag, VAPI_cq_hndl_t cq_hndl, + VAPI_srq_hndl_t srq_hndl, 
VAPI_qp_hndl_t* qp_hndl, VAPI_qp_prop_t* qp_prop, int transport_type); @@ -62,7 +63,7 @@ static inline int mca_btl_mvapi_endpoint_post_send(mca_btl_mvapi_module_t* mvapi frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr; VAPI_qp_hndl_t qp_hndl; - if(frag->base.des_flags && MCA_BTL_DES_FLAGS_PRIORITY && frag->size <= mvapi_btl->super.btl_eager_limit){ + if(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY && frag->size <= mvapi_btl->super.btl_eager_limit){ frag->sr_desc.remote_qp = endpoint->rem_qp_num_high; qp_hndl = endpoint->lcl_qp_hndl_high; } else { @@ -87,9 +88,13 @@ static inline int mca_btl_mvapi_endpoint_post_send(mca_btl_mvapi_module_t* mvapi if(VAPI_OK != frag->ret) return OMPI_ERROR; - - MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1); - MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1); + if(mca_btl_mvapi_component.use_srq) { + MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1); + MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1); + } else { + MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1); + MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1); + } return OMPI_SUCCESS; } @@ -281,12 +286,13 @@ static int mca_btl_mvapi_endpoint_start_connect(mca_btl_base_endpoint_t* endpoin /* Create the High Priority Queue Pair */ if(OMPI_SUCCESS != (rc = mca_btl_mvapi_endpoint_create_qp(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->endpoint_btl->ptag, - endpoint->endpoint_btl->cq_hndl_high, - &endpoint->lcl_qp_hndl_high, - &endpoint->lcl_qp_prop_high, - VAPI_TS_RC))) { + endpoint->endpoint_btl->nic, + endpoint->endpoint_btl->ptag, + endpoint->endpoint_btl->cq_hndl_high, + endpoint->endpoint_btl->srq_hndl_high, + &endpoint->lcl_qp_hndl_high, + &endpoint->lcl_qp_prop_high, + VAPI_TS_RC))) { BTL_ERROR("error creating queue pair, error code %d", rc); return rc; } @@ -294,11 +300,12 @@ static int mca_btl_mvapi_endpoint_start_connect(mca_btl_base_endpoint_t* endpoin /* Create the Low Priority Queue Pair */ if(OMPI_SUCCESS != (rc = 
mca_btl_mvapi_endpoint_create_qp(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->endpoint_btl->ptag, - endpoint->endpoint_btl->cq_hndl_low, - &endpoint->lcl_qp_hndl_low, - &endpoint->lcl_qp_prop_low, + endpoint->endpoint_btl->nic, + endpoint->endpoint_btl->ptag, + endpoint->endpoint_btl->cq_hndl_low, + endpoint->endpoint_btl->srq_hndl_low, + &endpoint->lcl_qp_hndl_low, + &endpoint->lcl_qp_prop_low, VAPI_TS_RC))) { BTL_ERROR("error creating queue pair, error code %d", rc); @@ -330,12 +337,13 @@ static int mca_btl_mvapi_endpoint_reply_start_connect(mca_btl_mvapi_endpoint_t * /* Create the High Priority Queue Pair */ if(OMPI_SUCCESS != (rc = mca_btl_mvapi_endpoint_create_qp(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->endpoint_btl->ptag, - endpoint->endpoint_btl->cq_hndl_high, - &endpoint->lcl_qp_hndl_high, - &endpoint->lcl_qp_prop_high, - VAPI_TS_RC))) { + endpoint->endpoint_btl->nic, + endpoint->endpoint_btl->ptag, + endpoint->endpoint_btl->cq_hndl_high, + endpoint->endpoint_btl->srq_hndl_high, + &endpoint->lcl_qp_hndl_high, + &endpoint->lcl_qp_prop_high, + VAPI_TS_RC))) { BTL_ERROR("error creating queue pair, error code %d", rc); return rc; } @@ -343,12 +351,13 @@ static int mca_btl_mvapi_endpoint_reply_start_connect(mca_btl_mvapi_endpoint_t * /* Create the Low Priority Queue Pair */ if(OMPI_SUCCESS != (rc = mca_btl_mvapi_endpoint_create_qp(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->endpoint_btl->ptag, - endpoint->endpoint_btl->cq_hndl_low, - &endpoint->lcl_qp_hndl_low, - &endpoint->lcl_qp_prop_low, - VAPI_TS_RC))) { + endpoint->endpoint_btl->nic, + endpoint->endpoint_btl->ptag, + endpoint->endpoint_btl->cq_hndl_low, + endpoint->endpoint_btl->srq_hndl_low, + &endpoint->lcl_qp_hndl_low, + &endpoint->lcl_qp_prop_low, + VAPI_TS_RC))) { BTL_ERROR("error creating queue pair, error code %d", rc); return rc; } @@ -619,13 +628,18 @@ int mca_btl_mvapi_endpoint_connect( endpoint->endpoint_btl->port_id); + if(rc 
!= OMPI_SUCCESS) { return rc; } - - MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 0); - MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 0); + if(mca_btl_mvapi_component.use_srq) { + MCA_BTL_MVAPI_POST_SRR_HIGH(endpoint->endpoint_btl, 1); + MCA_BTL_MVAPI_POST_SRR_LOW(endpoint->endpoint_btl, 1); + } else { + MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 0); + MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 0); + } return OMPI_SUCCESS; } @@ -641,6 +655,7 @@ int mca_btl_mvapi_endpoint_create_qp( VAPI_hca_hndl_t nic, VAPI_pd_hndl_t ptag, VAPI_cq_hndl_t cq_hndl, + VAPI_srq_hndl_t srq_hndl, VAPI_qp_hndl_t* qp_hndl, VAPI_qp_prop_t* qp_prop, int transport_type) @@ -648,7 +663,7 @@ int mca_btl_mvapi_endpoint_create_qp( VAPI_ret_t ret; VAPI_qp_init_attr_t qp_init_attr; - + VAPI_qp_init_attr_ext_t qp_init_attr_ext; switch(transport_type) { case VAPI_TS_RC: /* Set up RC qp parameters */ @@ -679,9 +694,14 @@ int mca_btl_mvapi_endpoint_create_qp( return OMPI_ERR_NOT_IMPLEMENTED; } - ret = VAPI_create_qp(nic, &qp_init_attr, - qp_hndl, qp_prop); + qp_init_attr_ext.srq_hndl = srq_hndl; + ret = VAPI_create_qp_ext(nic, + &qp_init_attr, + &qp_init_attr_ext, + qp_hndl, + qp_prop); + if(VAPI_OK != ret) { BTL_ERROR("error creating the queue pair: %s", VAPI_strerror(ret)); return OMPI_ERROR;