From 54b40aef9154c3ff5ca1d7531c4da49bece488c7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 1 Jul 2007 11:34:23 +0000 Subject: [PATCH] Schedule SEND traffic of pipeline protocol between BTLs in accordance with relative bandwidths of each BTL. Precalculate what part of a message should be sent via each BTL in advance instead of doing it during scheduling. This commit was SVN r15248. --- ompi/mca/bml/r2/bml_r2.c | 13 +++++++ ompi/mca/pml/ob1/pml_ob1.c | 12 +++++++ ompi/mca/pml/ob1/pml_ob1.h | 52 ++++++++++++++++++++++++++-- ompi/mca/pml/ob1/pml_ob1_component.c | 11 +++--- ompi/mca/pml/ob1/pml_ob1_rdma.c | 42 +++------------------- ompi/mca/pml/ob1/pml_ob1_rdma.h | 15 ++------ ompi/mca/pml/ob1/pml_ob1_recvreq.h | 2 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 51 ++++++++++++++++----------- ompi/mca/pml/ob1/pml_ob1_sendreq.h | 5 ++- 9 files changed, 123 insertions(+), 80 deletions(-) diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 61f5db8281..97f5c8aab8 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -157,6 +157,14 @@ static int mca_bml_r2_add_btls( void ) return OMPI_SUCCESS; } +static int btl_bandwidth_compare(const void *v1, const void *v2) +{ + mca_bml_base_btl_t *b1 = (mca_bml_base_btl_t*)v1, + *b2 = (mca_bml_base_btl_t*)v2; + + return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth; +} + /* * For each proc setup a datastructure that indicates the PTLs * that can be used to reach the destination. @@ -380,6 +388,11 @@ int mca_bml_r2_add_procs( * start using the weight to compute the correct amount. 
*/ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + + /* sort BTLs in descending order according to bandwidth value */ + qsort(bml_endpoint->btl_send.bml_btls, n_size, + sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); + bml_endpoint->bml_max_send_length = 0; bml_endpoint->bml_max_rdma_length = 0; bml_endpoint->btl_rdma_index = 0; diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index fd528fadb7..8b06c99a84 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -536,3 +536,15 @@ int mca_pml_ob1_ft_event( int state ) return OMPI_SUCCESS; } + +int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2) +{ + const mca_pml_ob1_com_btl_t *b1 = v1, *b2 = v2; + + if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight) + return 1; + if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight) + return -1; + + return 0; +} diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index a9d42899ac..d10b6a3235 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -56,6 +56,7 @@ struct mca_pml_ob1_t { size_t recv_pipeline_depth; size_t rdma_put_retries_limit; int max_rdma_per_request; + int max_send_per_range; bool leave_pinned; int leave_pinned_pipeline; @@ -231,8 +232,6 @@ extern int mca_pml_ob1_ft_event( MCA_BML_BASE_BTL_DES_ALLOC(bml_btl, des, order, \ sizeof(mca_pml_ob1_hdr_t) + (sizeof(mca_btl_base_segment_t) << 4), size) -#define MCA_PML_OB1_MAX_REGISTRATIONS 4 - struct mca_pml_ob1_pckt_pending_t { ompi_free_list_item_t super; ompi_proc_t* proc; @@ -316,4 +315,53 @@ do { \ length -= hdrlen; \ } while(0) +/* represent BTL chosen for sending request */ +struct mca_pml_ob1_com_btl_t { + mca_bml_base_btl_t *bml_btl; + struct mca_mpool_base_registration_t* btl_reg; + size_t length; +}; +typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t; + +int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2); + +/* Calculate what percentage of a message to send through each BTL according 
to + * relative weight */ +static inline void mca_pml_ob1_calc_weighted_length( + mca_pml_ob1_com_btl_t *btls, int num_btls, size_t size, + double weight_total) +{ + int i; + size_t length_left = size; + + /* shortcut for common case for only one BTL */ + if(num_btls == 1) { + btls[0].length = size; + return; + } + + /* sort BTLs according of their weights so BTLs with smaller weight will + * not hijack all of the traffic */ + qsort(btls, num_btls, sizeof(mca_pml_ob1_com_btl_t), + mca_pml_ob1_com_btl_comp); + + for(i = 0; i < num_btls; i++) { + mca_bml_base_btl_t* bml_btl = btls[i].bml_btl; + size_t length = 0; + if(length_left != 0) { + length = (length_left > bml_btl->btl_eager_limit)? + ((size_t)(size * (bml_btl->btl_weight / weight_total))) : + length_left; + + if(length > length_left) + length = length_left; + length_left -= length; + } + btls[i].length = length; + } + + /* account for rounding errors */ + btls[0].length += length_left; +} + #endif diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index dd7023597c..b35f4ee3a2 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -115,6 +115,8 @@ int mca_pml_ob1_component_open(void) mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5); mca_pml_ob1.max_rdma_per_request = mca_pml_ob1_param_register_int("max_rdma_per_request", 4); + mca_pml_ob1.max_send_per_range = + mca_pml_ob1_param_register_int("max_send_per_range", 4); mca_pml_ob1.unexpected_limit = mca_pml_ob1_param_register_int("unexpected_limit", 128); @@ -149,7 +151,7 @@ int mca_pml_ob1_component_open(void) ompi_free_list_init( &mca_pml_ob1.send_requests, sizeof(mca_pml_ob1_send_request_t) + - (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t), + (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_send_request_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, @@ -160,7 +162,7 @@ int 
mca_pml_ob1_component_open(void) ompi_free_list_init( &mca_pml_ob1.recv_requests, sizeof(mca_pml_ob1_recv_request_t) + - (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t), + (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_recv_request_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, @@ -189,8 +191,6 @@ int mca_pml_ob1_component_open(void) mca_pml_ob1.free_list_inc, NULL); - - OBJ_CONSTRUCT(&mca_pml_ob1.pending_pckts, ompi_free_list_t); ompi_free_list_init( &mca_pml_ob1.pending_pckts, @@ -206,7 +206,8 @@ int mca_pml_ob1_component_open(void) OBJ_CONSTRUCT(&mca_pml_ob1.send_ranges, ompi_free_list_t); ompi_free_list_init( &mca_pml_ob1.send_ranges, - sizeof(mca_pml_ob1_send_range_t), + sizeof(mca_pml_ob1_send_range_t) + + (mca_pml_ob1.max_send_per_range - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_send_range_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c index b1a2e24d9d..4e64a8729e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c @@ -35,39 +35,6 @@ * from case when registration is not needed */ static mca_mpool_base_registration_t pml_ob1_dummy_reg; -/* Calculate what percentage of a message to send through each BTL according to - * relative weight */ -static inline void calc_weighted_length(mca_pml_ob1_rdma_btl_t *rdma_btls, - int num_btls, size_t size, double weight_total) -{ - int i; - size_t length_left = size; - - /* shortcut for common case for only one BTL */ - if(num_btls == 1) { - rdma_btls[0].length = size; - return; - } - - for(i = 0; i < num_btls; i++) { - mca_bml_base_btl_t* bml_btl = rdma_btls[i].bml_btl; - size_t length = 0; - if(length_left != 0) { - length = (length_left > bml_btl->btl_eager_limit)? 
- ((size_t)(size * (bml_btl->btl_weight / weight_total))) : - length_left; - - if(length > length_left) - length = length_left; - length_left -= length; - } - rdma_btls[i].length = length; - } - - /* account for rounding errors */ - rdma_btls[0].length += length_left; -} - /* * Check to see if memory is registered or can be registered. Build a * set of registrations on the request. @@ -77,7 +44,7 @@ size_t mca_pml_ob1_rdma_btls( mca_bml_base_endpoint_t* bml_endpoint, unsigned char* base, size_t size, - mca_pml_ob1_rdma_btl_t* rdma_btls) + mca_pml_ob1_com_btl_t* rdma_btls) { int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); double weight_total = 0; @@ -127,7 +94,8 @@ size_t mca_pml_ob1_rdma_btls( if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) return 0; - calc_weighted_length(rdma_btls, num_btls_used, size, weight_total); + mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size, + weight_total); bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls; return num_btls_used; @@ -136,7 +104,7 @@ size_t mca_pml_ob1_rdma_btls( size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint, size_t size, - mca_pml_ob1_rdma_btl_t* rdma_btls) + mca_pml_ob1_com_btl_t* rdma_btls) { int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); double weight_total = 0; @@ -152,7 +120,7 @@ size_t mca_pml_ob1_rdma_pipeline_btls( weight_total += rdma_btls[i].bml_btl->btl_weight; } - calc_weighted_length(rdma_btls, i, size, weight_total); + mca_pml_ob1_calc_weighted_length(rdma_btls, i, size, weight_total); return i; } diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.h b/ompi/mca/pml/ob1/pml_ob1_rdma.h index abd07e248c..3ed0655795 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.h +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.h @@ -24,29 +24,18 @@ struct mca_bml_base_endpoint_t; -/** - * structure to associate RDMA capable BTL(s) with corresponding registration - */ - -struct 
mca_pml_ob1_rdma_btl_t { - struct mca_bml_base_btl_t* bml_btl; - struct mca_mpool_base_registration_t* btl_reg; - size_t length; -}; -typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t; - /* * Of the set of available btls that support RDMA, * find those that already have registrations - or * register if required (for leave_pinned option) */ size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint, - unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls); + unsigned char* base, size_t size, struct mca_pml_ob1_com_btl_t* btls); /* Choose RDMA BTLs to use for sending of a request by pipeline protocol. * Calculate number of bytes to send through each BTL according to available * bandwidth */ size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint, - size_t size, mca_pml_ob1_rdma_btl_t* rdma_btls); + size_t size, mca_pml_ob1_com_btl_t* rdma_btls); #endif diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 1239c143fb..1c67d1d764 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -52,7 +52,7 @@ struct mca_pml_ob1_recv_request_t { uint32_t req_rdma_idx; bool req_pending; bool req_ack_sent; /**< whether ack was sent to the sender */ - mca_pml_ob1_rdma_btl_t req_rdma[1]; + mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t; diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 31549bc0cc..94923f1623 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -883,7 +883,10 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, { mca_pml_ob1_send_range_t *sr; ompi_free_list_item_t *i; - int rc = OMPI_SUCCESS; + mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + int rc = OMPI_SUCCESS, n; + double 
weight_total = 0; if(0 == send_length) return; @@ -894,6 +897,18 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, sr->range_send_offset = send_offset; sr->range_send_length = send_length; + sr->range_btl_idx = 0; + + for(n = 0; n < num_btls && n < mca_pml_ob1.max_send_per_range; n++) { + sr->range_btls[n].bml_btl = + mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send); + weight_total += sr->range_btls[n].bml_btl->btl_weight; + } + + sr->range_btl_cnt = n; + mca_pml_ob1_calc_weighted_length(sr->range_btls, n, send_length, + weight_total); + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr); OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); @@ -910,22 +925,19 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, int mca_pml_ob1_send_request_schedule_exclusive( mca_pml_ob1_send_request_t* sendreq) { - mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; - size_t num_btl_avail = - mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); do { - size_t prev_bytes_remaining = 0, num_fail = 0; + size_t prev_bytes_remaining = 0; mca_pml_ob1_send_range_t *range = NULL; + int num_fail = 0; while(true) { mca_pml_ob1_frag_hdr_t* hdr; mca_btl_base_descriptor_t* des; - int rc; + int rc, btl_idx; size_t size, offset; opal_list_item_t *item; - mca_bml_base_btl_t* bml_btl = - mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send); + mca_bml_base_btl_t* bml_btl; if(NULL == range || 0 == range->range_send_length) { OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); @@ -957,7 +969,7 @@ int mca_pml_ob1_send_request_schedule_exclusive( prev_bytes_remaining = range->range_send_length; - if (num_fail == num_btl_avail) { + if (num_fail == range->range_btl_cnt) { assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE); OPAL_THREAD_LOCK(&mca_pml_ob1.lock); sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_SCHEDULE; @@ -967,17 +979,13 @@ int 
mca_pml_ob1_send_request_schedule_exclusive( return OMPI_ERR_OUT_OF_RESOURCE; } - if(num_btl_avail == 1 || - range->range_send_length < bml_btl->btl_min_send_size) { - size = range->range_send_length; - } else { - /* otherwise attempt to give the BTL a percentage of the message - * based on a weighting factor. for simplicity calculate this as - * a percentage of the overall message length (regardless of - * amount previously assigned) - */ - size = (size_t)(bml_btl->btl_weight * range->range_send_length); - } + do { + btl_idx = range->range_btl_idx; + bml_btl = range->range_btls[btl_idx].bml_btl; + size = range->range_btls[btl_idx].length; + if(++range->range_btl_idx == range->range_btl_cnt) + range->range_btl_idx = 0; + } while(!size); /* makes sure that we don't exceed BTL max send size */ if (bml_btl->btl_max_send_size != 0 && @@ -1035,8 +1043,9 @@ int mca_pml_ob1_send_request_schedule_exclusive( rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); if(rc == OMPI_SUCCESS) { - range->range_send_length -= size; /* update state */ + range->range_btls[btl_idx].length -= size; + range->range_send_length -= size; range->range_send_offset += size; OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1); } else { diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index 809a4d4cba..f1394c0d47 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -58,7 +58,7 @@ struct mca_pml_ob1_send_request_t { mca_pml_ob1_send_pending_t req_pending; opal_mutex_t req_send_range_lock; opal_list_t req_send_ranges; - mca_pml_ob1_rdma_btl_t req_rdma[1]; + mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; @@ -68,6 +68,9 @@ struct mca_pml_ob1_send_range_t { ompi_free_list_item_t base; uint64_t range_send_offset; uint64_t range_send_length; + int range_btl_idx; + int range_btl_cnt; + mca_pml_ob1_com_btl_t range_btls[1]; }; typedef struct mca_pml_ob1_send_range_t 
mca_pml_ob1_send_range_t; OBJ_CLASS_DECLARATION(mca_pml_ob1_send_range_t);