From 4d2d39b0a63788b3e8d9be293eba795ac0dc248d Mon Sep 17 00:00:00 2001 From: Galen Shipman Date: Tue, 18 Oct 2005 14:55:11 +0000 Subject: [PATCH] initial checking of SRQ flow control support for mvapi This commit was SVN r7796. --- ompi/mca/btl/mvapi/btl_mvapi.c | 31 +++++++++-- ompi/mca/btl/mvapi/btl_mvapi.h | 71 +++++++++++++++--------- ompi/mca/btl/mvapi/btl_mvapi_component.c | 71 +++++++++++++++++++++--- ompi/mca/btl/mvapi/btl_mvapi_endpoint.c | 29 ++++++++-- 4 files changed, 157 insertions(+), 45 deletions(-) diff --git a/ompi/mca/btl/mvapi/btl_mvapi.c b/ompi/mca/btl/mvapi/btl_mvapi.c index b624504590..e4bf23b31c 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.c +++ b/ompi/mca/btl/mvapi/btl_mvapi.c @@ -127,6 +127,9 @@ int mca_btl_mvapi_add_procs( mvapi_btl->num_peers += nprocs; if(mca_btl_mvapi_component.use_srq) { mvapi_btl->rd_buf_max = mca_btl_mvapi_component.ib_rr_buf_max + log2(nprocs) * mca_btl_mvapi_component.rd_per_peer; + free(mvapi_btl->rr_desc_post); + mvapi_btl->rr_desc_post = (VAPI_rr_desc_t*) malloc((mvapi_btl->rd_buf_max * sizeof(VAPI_rr_desc_t))); + mvapi_btl->rd_buf_min = mvapi_btl->rd_buf_max / 2; } } return OMPI_SUCCESS; @@ -568,18 +571,26 @@ int mca_btl_mvapi_put( mca_btl_base_module_t* btl, int rc; mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*) btl; mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*) descriptor; - + frag->endpoint = endpoint; assert(endpoint->endpoint_state == MCA_BTL_IB_CONNECTED || endpoint->endpoint_state == MCA_BTL_IB_WAITING_ACK); frag->sr_desc.opcode = VAPI_RDMA_WRITE; /* atomically test and acquire a token */ - if(OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_lp,-1) < 0) { + if(!mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_lp,-1) < 0) { BTL_VERBOSE(("Queing because no rdma write tokens \n")); BTL_MVAPI_INSERT_PENDING(frag, endpoint->pending_frags_lp, endpoint->wr_sq_tokens_lp, endpoint->endpoint_lock, rc); + rc = OMPI_SUCCESS; + } else if(mca_btl_mvapi_component.use_srq && + 
OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_lp,-1) < 0) { + opal_list_append(&mvapi_btl->pending_frags_lp, (opal_list_item_t *)frag); + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_lp,1); + rc = OMPI_SUCCESS; + } else { - frag->endpoint = endpoint; + frag->sr_desc.remote_qp = endpoint->rem_info.rem_qp_num_low; @@ -624,14 +635,22 @@ int mca_btl_mvapi_get( mca_btl_base_module_t* btl, assert(endpoint->endpoint_state == MCA_BTL_IB_CONNECTED || endpoint->endpoint_state == MCA_BTL_IB_WAITING_ACK); frag->sr_desc.opcode = VAPI_RDMA_READ; - + frag->endpoint = endpoint; /* atomically test and acquire a token */ - if(OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_lp,-1) < 0) { + if(!mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_lp,-1) < 0) { BTL_VERBOSE(("Queing because no rdma write tokens \n")); BTL_MVAPI_INSERT_PENDING(frag, endpoint->pending_frags_lp, endpoint->wr_sq_tokens_lp, endpoint->endpoint_lock, rc); + rc = OMPI_SUCCESS; + } else if(mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_lp,-1) < 0) { + opal_list_append(&mvapi_btl->pending_frags_lp, (opal_list_item_t *)frag); + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_lp,1); + rc = OMPI_SUCCESS; + } else { - frag->endpoint = endpoint; + frag->sr_desc.remote_qp = endpoint->rem_info.rem_qp_num_low; frag->sr_desc.remote_addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_src->seg_addr.pval; frag->sr_desc.r_key = frag->base.des_src->seg_key.key32[0]; diff --git a/ompi/mca/btl/mvapi/btl_mvapi.h b/ompi/mca/btl/mvapi/btl_mvapi.h index 7ea9629544..84215bc4fb 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.h +++ b/ompi/mca/btl/mvapi/btl_mvapi.h @@ -118,6 +118,7 @@ struct mca_btl_mvapi_component_t { uint32_t ib_src_path_bits; /* number of send tokes available */ uint32_t max_wr_sq_tokens; + uint32_t max_total_wr_sq_tokens; }; typedef struct mca_btl_mvapi_component_t mca_btl_mvapi_component_t; @@ -174,44 +175,60 @@ struct mca_btl_mvapi_module_t { uint32_t num_peers; uint32_t 
rd_buf_max; - + uint32_t rd_buf_min; + int32_t wr_sq_tokens_hp; + /**< number of high priority frags that can be outstanding (down counter) */ + int32_t wr_sq_tokens_lp; + /**< number of low priority frags that can be outstanding (down counter) */ + + opal_list_t pending_frags_hp; + /**< list of pending high priority frags */ + + opal_list_t pending_frags_lp; + /**< list of pending low priority frags */ + + }; typedef struct mca_btl_mvapi_module_t mca_btl_mvapi_module_t; -#define MCA_BTL_MVAPI_POST_SRR_HIGH(post_srr_high_mvapi_btl, \ - post_srr_high_additional) \ +#define MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, \ + additional) \ { \ - OPAL_THREAD_LOCK(&post_srr_high_mvapi_btl->ib_lock); \ - if(post_srr_high_mvapi_btl->srr_posted_high <= mca_btl_mvapi_component.ib_rr_buf_min+post_srr_high_additional && \ - post_srr_high_mvapi_btl->srr_posted_high < mca_btl_mvapi_component.ib_rr_buf_max){ \ - MCA_BTL_MVAPI_POST_SRR_SUB(mca_btl_mvapi_component.ib_rr_buf_max - \ - post_srr_high_mvapi_btl->srr_posted_high, \ - post_srr_high_mvapi_btl, \ - &post_srr_high_mvapi_btl->recv_free_eager, \ - &post_srr_high_mvapi_btl->srr_posted_high, \ - post_srr_high_mvapi_btl->nic, \ - post_srr_high_mvapi_btl->srq_hndl_high); \ + do { \ + OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); \ + if(mvapi_btl->srr_posted_high <= mvapi_btl->rd_buf_min+additional && \ + mvapi_btl->srr_posted_high < mvapi_btl->rd_buf_max){ \ + MCA_BTL_MVAPI_POST_SRR_SUB(mvapi_btl->rd_buf_max - \ + mvapi_btl->srr_posted_high, \ + mvapi_btl, \ + &mvapi_btl->recv_free_eager, \ + &mvapi_btl->srr_posted_high, \ + mvapi_btl->nic, \ + mvapi_btl->srq_hndl_high); \ } \ - OPAL_THREAD_UNLOCK(&post_srr_high_mvapi_btl->ib_lock); \ + OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); \ + }while(0);\ } -#define MCA_BTL_MVAPI_POST_SRR_LOW(post_srr_low_mvapi_btl, \ - post_srr_low_additional) \ +#define MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, \ + additional) \ { \ - OPAL_THREAD_LOCK(&post_srr_low_mvapi_btl->ib_lock); \ - 
if(post_srr_low_mvapi_btl->srr_posted_low <= mca_btl_mvapi_component.ib_rr_buf_min+post_srr_low_additional && \ - post_srr_low_mvapi_btl->srr_posted_low < mca_btl_mvapi_component.ib_rr_buf_max){ \ - MCA_BTL_MVAPI_POST_SRR_SUB(mca_btl_mvapi_component.ib_rr_buf_max - \ - post_srr_low_mvapi_btl->srr_posted_low, \ - post_srr_low_mvapi_btl, \ - &post_srr_low_mvapi_btl->recv_free_max, \ - &post_srr_low_mvapi_btl->srr_posted_low, \ - post_srr_low_mvapi_btl->nic, \ - post_srr_low_mvapi_btl->srq_hndl_low); \ + do { \ + OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); \ + if(mvapi_btl->srr_posted_low <= mvapi_btl->rd_buf_min+additional && \ + mvapi_btl->srr_posted_low < mvapi_btl->rd_buf_max){ \ + MCA_BTL_MVAPI_POST_SRR_SUB(mvapi_btl->rd_buf_max - \ + mvapi_btl->srr_posted_low, \ + mvapi_btl, \ + &mvapi_btl->recv_free_max, \ + &mvapi_btl->srr_posted_low, \ + mvapi_btl->nic, \ + mvapi_btl->srq_hndl_low); \ } \ - OPAL_THREAD_UNLOCK(&post_srr_low_mvapi_btl->ib_lock); \ + OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); \ + } while(0); \ } diff --git a/ompi/mca/btl/mvapi/btl_mvapi_component.c b/ompi/mca/btl/mvapi/btl_mvapi_component.c index fc6fbe0043..511264591e 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_component.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_component.c @@ -187,11 +187,11 @@ int mca_btl_mvapi_component_open(void) mca_btl_mvapi_module.super.btl_exclusivity = mca_btl_mvapi_param_register_int ("exclusivity", MCA_BTL_EXCLUSIVITY_DEFAULT); mca_btl_mvapi_module.super.btl_eager_limit = - mca_btl_mvapi_param_register_int ("eager_limit", (64*1024)) + mca_btl_mvapi_param_register_int ("eager_limit", (32*1024)) - sizeof(mca_btl_mvapi_header_t); mca_btl_mvapi_module.super.btl_min_send_size = - mca_btl_mvapi_param_register_int ("min_send_size", (64*1024)) + mca_btl_mvapi_param_register_int ("min_send_size", (32*1024)) - sizeof(mca_btl_mvapi_header_t); mca_btl_mvapi_module.super.btl_max_send_size = @@ -221,7 +221,14 @@ int mca_btl_mvapi_component_open(void) 16, 
&(mca_btl_mvapi_component.max_wr_sq_tokens)); - + mca_base_param_reg_int(&mca_btl_mvapi_component.super.btl_version, + "max_total_wr_sq_tokens", + "Maximum number of send/rdma work request tokens peer btl", + false, + false, + 32, + &(mca_btl_mvapi_component.max_total_wr_sq_tokens)); + mca_btl_mvapi_component.max_send_size = mca_btl_mvapi_module.super.btl_max_send_size; mca_btl_mvapi_component.eager_limit = mca_btl_mvapi_module.super.btl_eager_limit; @@ -413,10 +420,17 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules, mvapi_btl = &mca_btl_mvapi_component.mvapi_btls[i]; mvapi_btl->rd_buf_max = mca_btl_mvapi_component.ib_rr_buf_max; + mvapi_btl->rd_buf_min = mca_btl_mvapi_component.ib_rr_buf_min; mvapi_btl->num_peers = 0; + mvapi_btl->wr_sq_tokens_hp = + mvapi_btl->wr_sq_tokens_lp = mca_btl_mvapi_component.max_total_wr_sq_tokens; /* Initialize module state */ + OBJ_CONSTRUCT(&mvapi_btl->pending_frags_hp, opal_list_t); + OBJ_CONSTRUCT(&mvapi_btl->pending_frags_lp, opal_list_t); + + OBJ_CONSTRUCT(&mvapi_btl->ib_lock, opal_mutex_t); OBJ_CONSTRUCT(&mvapi_btl->send_free_eager, ompi_free_list_t); OBJ_CONSTRUCT(&mvapi_btl->send_free_max, ompi_free_list_t); @@ -577,8 +591,9 @@ int mca_btl_mvapi_component_progress() frag->base.des_cbfunc(&mvapi_btl->super, frag->endpoint, &frag->base, frag->rc); count++; /* check and see if we need to progress pending sends */ - if(OPAL_THREAD_ADD32(&frag->endpoint->wr_sq_tokens_hp, 1) > 0 - && !opal_list_is_empty(&(frag->endpoint->pending_frags_hp))) { + if( !mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&frag->endpoint->wr_sq_tokens_hp, 1) > 0 + && !opal_list_is_empty(&(frag->endpoint->pending_frags_hp))) { opal_list_item_t *frag_item; OPAL_THREAD_LOCK(&frag->endpoint->endpoint_lock); frag_item = opal_list_remove_first(&(frag->endpoint->pending_frags_hp)); @@ -589,6 +604,17 @@ int mca_btl_mvapi_component_progress() BTL_ERROR(("error in posting pending send\n")); } } + if( mca_btl_mvapi_component.use_srq 
&& + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_hp, 1) > 0 + && !opal_list_is_empty(&mvapi_btl->pending_frags_hp)) { + opal_list_item_t *frag_item; + frag_item = opal_list_remove_first(&mvapi_btl->pending_frags_hp); + frag = (mca_btl_mvapi_frag_t *) frag_item; + if(OMPI_SUCCESS != mca_btl_mvapi_endpoint_send(frag->endpoint, frag)) { + BTL_ERROR(("error in posting pending send\n")); + } + } + break; @@ -648,8 +674,9 @@ int mca_btl_mvapi_component_progress() frag->base.des_cbfunc(&mvapi_btl->super, frag->endpoint, &frag->base, frag->rc); count++; /* check and see if we need to progress pending sends */ - if(OPAL_THREAD_ADD32(&frag->endpoint->wr_sq_tokens_lp, 1) > 0 - && !opal_list_is_empty(&(frag->endpoint->pending_frags_lp))) { + if(!mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&frag->endpoint->wr_sq_tokens_lp, 1) > 0 && + !opal_list_is_empty(&(frag->endpoint->pending_frags_lp))) { opal_list_item_t *frag_item; OPAL_THREAD_LOCK(&frag->endpoint->endpoint_lock); frag_item = opal_list_remove_first(&(frag->endpoint->pending_frags_lp)); @@ -679,6 +706,36 @@ int mca_btl_mvapi_component_progress() BTL_ERROR(("error in posting pending operation, invalide opcode %d\n", frag->sr_desc.opcode)); } } + if(mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_lp, 1) > 0 + && !opal_list_is_empty(&mvapi_btl->pending_frags_lp)) { + opal_list_item_t *frag_item; + frag_item = opal_list_remove_first(&mvapi_btl->pending_frags_lp); + frag = (mca_btl_mvapi_frag_t *) frag_item; + switch(frag->sr_desc.opcode){ + case VAPI_SEND: + if(OMPI_SUCCESS != mca_btl_mvapi_endpoint_send(frag->endpoint, frag)) { + BTL_ERROR(("error in posting pending send\n")); + } + break; + case VAPI_RDMA_WRITE: + if(OMPI_SUCCESS != mca_btl_mvapi_put((mca_btl_base_module_t*) mvapi_btl, + frag->endpoint, + (mca_btl_base_descriptor_t*) frag)) { + BTL_ERROR(("error in posting pending rdma write\n")); + } + break; + case VAPI_RDMA_READ: + if(OMPI_SUCCESS != 
mca_btl_mvapi_put((mca_btl_base_module_t *) mvapi_btl, + frag->endpoint, + (mca_btl_base_descriptor_t*) frag)) { + BTL_ERROR(("error in posting pending rdma read\n")); + } + break; + default: + BTL_ERROR(("error in posting pending operation, invalide opcode %d\n", frag->sr_desc.opcode)); + } + } break; diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c index 39fea1c755..e11c0cf4fd 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c @@ -71,12 +71,23 @@ static inline int mca_btl_mvapi_endpoint_post_send( if(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY && frag->size <= mvapi_btl->super.btl_eager_limit){ /* atomically test and acquire a token */ - if(OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_hp,-1) < 0) { + if(!mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_hp,-1) < 0) { BTL_VERBOSE(("Queing because no send tokens \n")); opal_list_append(&endpoint->pending_frags_hp, (opal_list_item_t *)frag); OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_hp,1); + + /* OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); */ +/* mca_btl_mvapi_component_progress(); */ +/* OPAL_THREAD_LOCK(&endpoint->endpoint_lock); */ + return OMPI_SUCCESS; - } else { + } else if( mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_hp,-1) < 0) { + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_hp,1); + opal_list_append(&mvapi_btl->pending_frags_hp, (opal_list_item_t *)frag); + return OMPI_SUCCESS; + }else { frag->sr_desc.remote_qp = endpoint->rem_info.rem_qp_num_high; qp_hndl = endpoint->lcl_qp_hndl_high; } @@ -84,10 +95,18 @@ static inline int mca_btl_mvapi_endpoint_post_send( } else { /* atomically test and acquire a token */ - if(OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_lp,-1) < 0) { + if(!mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_lp,-1) < 0 ) { BTL_VERBOSE(("Queing because no send tokens \n")); 
opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag); OPAL_THREAD_ADD32(&endpoint->wr_sq_tokens_lp,1); + + return OMPI_SUCCESS; + } else if(mca_btl_mvapi_component.use_srq && + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_lp,-1) < 0) { + OPAL_THREAD_ADD32(&mvapi_btl->wr_sq_tokens_lp,1); + opal_list_append(&mvapi_btl->pending_frags_lp, (opal_list_item_t *)frag); + return OMPI_SUCCESS; } else { frag->sr_desc.remote_qp = endpoint->rem_info.rem_qp_num_low; @@ -761,8 +780,8 @@ int mca_btl_mvapi_endpoint_connect( } if(mca_btl_mvapi_component.use_srq) { - MCA_BTL_MVAPI_POST_SRR_HIGH(endpoint->endpoint_btl, 1); - MCA_BTL_MVAPI_POST_SRR_LOW(endpoint->endpoint_btl, 1); + MCA_BTL_MVAPI_POST_SRR_HIGH(endpoint->endpoint_btl, 0); + MCA_BTL_MVAPI_POST_SRR_LOW(endpoint->endpoint_btl, 0); } else { MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 0); MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 0);