1
1

Schedule SEND traffic of pipeline protocol between BTLs in accordance with

relative bandwidths of each BTL. Precalculate what part of a message should
be sent via each BTL in advance instead of doing it during scheduling.

This commit was SVN r15248.
Этот коммит содержится в:
Gleb Natapov 2007-07-01 11:34:23 +00:00
родитель e74aa6b295
Коммит 54b40aef91
9 изменённых файлов: 123 добавлений и 80 удалений

Просмотреть файл

@ -157,6 +157,14 @@ static int mca_bml_r2_add_btls( void )
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/*
 * qsort() comparator that orders mca_bml_base_btl_t entries by the bandwidth
 * of their BTL, highest bandwidth first.
 *
 * Returns 1 if v1 has less bandwidth than v2, -1 if it has more, 0 if equal.
 *
 * The values are compared explicitly instead of being subtracted: the
 * difference of two bandwidth values need not fit in an int (and wraps around
 * if the field is unsigned), which would make the comparator report the wrong
 * order.
 */
static int btl_bandwidth_compare(const void *v1, const void *v2)
{
    const mca_bml_base_btl_t *b1 = v1;
    const mca_bml_base_btl_t *b2 = v2;

    if(b1->btl->btl_bandwidth < b2->btl->btl_bandwidth)
        return 1;
    if(b1->btl->btl_bandwidth > b2->btl->btl_bandwidth)
        return -1;
    return 0;
}
/* /*
* For each proc setup a datastructure that indicates the PTLs * For each proc setup a datastructure that indicates the PTLs
* that can be used to reach the destination. * that can be used to reach the destination.
@ -380,6 +388,11 @@ int mca_bml_r2_add_procs(
* start using the weight to compute the correct amount. * start using the weight to compute the correct amount.
*/ */
n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
/* sort BTLs in descending order according to bandwidth value */
qsort(bml_endpoint->btl_send.bml_btls, n_size,
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
bml_endpoint->bml_max_send_length = 0; bml_endpoint->bml_max_send_length = 0;
bml_endpoint->bml_max_rdma_length = 0; bml_endpoint->bml_max_rdma_length = 0;
bml_endpoint->btl_rdma_index = 0; bml_endpoint->btl_rdma_index = 0;

Просмотреть файл

@ -536,3 +536,15 @@ int mca_pml_ob1_ft_event( int state )
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/*
 * qsort() comparator for mca_pml_ob1_com_btl_t entries: orders them by the
 * weight of the BTL they reference, heaviest first.
 */
int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2)
{
    const mca_pml_ob1_com_btl_t *lhs = v1;
    const mca_pml_ob1_com_btl_t *rhs = v2;

    /* 1 when lhs is lighter than rhs, -1 when it is heavier, 0 otherwise */
    return (lhs->bml_btl->btl_weight < rhs->bml_btl->btl_weight) -
           (lhs->bml_btl->btl_weight > rhs->bml_btl->btl_weight);
}

Просмотреть файл

@ -56,6 +56,7 @@ struct mca_pml_ob1_t {
size_t recv_pipeline_depth; size_t recv_pipeline_depth;
size_t rdma_put_retries_limit; size_t rdma_put_retries_limit;
int max_rdma_per_request; int max_rdma_per_request;
int max_send_per_range;
bool leave_pinned; bool leave_pinned;
int leave_pinned_pipeline; int leave_pinned_pipeline;
@ -231,8 +232,6 @@ extern int mca_pml_ob1_ft_event(
MCA_BML_BASE_BTL_DES_ALLOC(bml_btl, des, order, \ MCA_BML_BASE_BTL_DES_ALLOC(bml_btl, des, order, \
sizeof(mca_pml_ob1_hdr_t) + (sizeof(mca_btl_base_segment_t) << 4), size) sizeof(mca_pml_ob1_hdr_t) + (sizeof(mca_btl_base_segment_t) << 4), size)
#define MCA_PML_OB1_MAX_REGISTRATIONS 4
struct mca_pml_ob1_pckt_pending_t { struct mca_pml_ob1_pckt_pending_t {
ompi_free_list_item_t super; ompi_free_list_item_t super;
ompi_proc_t* proc; ompi_proc_t* proc;
@ -316,4 +315,53 @@ do { \
length -= hdrlen; \ length -= hdrlen; \
} while(0) } while(0)
/* Represents a BTL chosen for sending (part of) a request, together with the
 * share of the data assigned to it. */
struct mca_pml_ob1_com_btl_t {
/* the BML-level BTL the data is sent through */
mca_bml_base_btl_t *bml_btl;
/* memory pool registration associated with this BTL
 * NOTE(review): presumably only meaningful on the RDMA paths - confirm */
struct mca_mpool_base_registration_t* btl_reg;
/* number of bytes assigned to this BTL */
size_t length;
};
typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;
int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2);
/**
 * Calculate how many bytes of a message to send through each BTL according
 * to the BTLs' relative weights.
 *
 * The btls array is sorted in place with mca_pml_ob1_com_btl_comp and the
 * `length` field of every entry is set; the lengths always add up to `size`.
 * Nothing is done when num_btls is not positive.
 *
 * @param btls          BTLs to distribute the message over (reordered)
 * @param num_btls      number of entries in btls
 * @param size          total number of bytes to distribute
 * @param weight_total  sum of btl_weight over all entries in btls
 */
static inline void mca_pml_ob1_calc_weighted_length(
        mca_pml_ob1_com_btl_t *btls, int num_btls, size_t size,
        double weight_total)
{
    int i;
    size_t length_left = size;

    /* no BTLs at all: touching btls[0] below would be out of bounds */
    if(num_btls <= 0)
        return;

    /* shortcut for common case for only one BTL */
    if(num_btls == 1) {
        btls[0].length = size;
        return;
    }

    /* sort BTLs according to their weights so BTLs with smaller weight will
     * not hijack all of the traffic */
    qsort(btls, num_btls, sizeof(mca_pml_ob1_com_btl_t),
          mca_pml_ob1_com_btl_comp);

    /* without a positive total weight the ratio computed below is inf/NaN and
     * its conversion to size_t is undefined; give everything to the first
     * BTL instead */
    if(!(weight_total > 0)) {
        btls[0].length = size;
        for(i = 1; i < num_btls; i++)
            btls[i].length = 0;
        return;
    }

    for(i = 0; i < num_btls; i++) {
        mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
        size_t length = 0;

        if(length_left != 0) {
            /* once the remainder fits under the eager limit hand all of it
             * to this BTL, otherwise give the BTL its weighted share */
            length = (length_left > bml_btl->btl_eager_limit)?
                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
                length_left;
            if(length > length_left)
                length = length_left;
            length_left -= length;
        }
        btls[i].length = length;
    }

    /* account for rounding errors */
    btls[0].length += length_left;
}
#endif #endif

Просмотреть файл

@ -115,6 +115,8 @@ int mca_pml_ob1_component_open(void)
mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5); mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
mca_pml_ob1.max_rdma_per_request = mca_pml_ob1.max_rdma_per_request =
mca_pml_ob1_param_register_int("max_rdma_per_request", 4); mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
mca_pml_ob1.max_send_per_range =
mca_pml_ob1_param_register_int("max_send_per_range", 4);
mca_pml_ob1.unexpected_limit = mca_pml_ob1.unexpected_limit =
mca_pml_ob1_param_register_int("unexpected_limit", 128); mca_pml_ob1_param_register_int("unexpected_limit", 128);
@ -149,7 +151,7 @@ int mca_pml_ob1_component_open(void)
ompi_free_list_init( ompi_free_list_init(
&mca_pml_ob1.send_requests, &mca_pml_ob1.send_requests,
sizeof(mca_pml_ob1_send_request_t) + sizeof(mca_pml_ob1_send_request_t) +
(mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t), (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t),
OBJ_CLASS(mca_pml_ob1_send_request_t), OBJ_CLASS(mca_pml_ob1_send_request_t),
mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_num,
mca_pml_ob1.free_list_max, mca_pml_ob1.free_list_max,
@ -160,7 +162,7 @@ int mca_pml_ob1_component_open(void)
ompi_free_list_init( ompi_free_list_init(
&mca_pml_ob1.recv_requests, &mca_pml_ob1.recv_requests,
sizeof(mca_pml_ob1_recv_request_t) + sizeof(mca_pml_ob1_recv_request_t) +
(mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t), (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t),
OBJ_CLASS(mca_pml_ob1_recv_request_t), OBJ_CLASS(mca_pml_ob1_recv_request_t),
mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_num,
mca_pml_ob1.free_list_max, mca_pml_ob1.free_list_max,
@ -189,8 +191,6 @@ int mca_pml_ob1_component_open(void)
mca_pml_ob1.free_list_inc, mca_pml_ob1.free_list_inc,
NULL); NULL);
OBJ_CONSTRUCT(&mca_pml_ob1.pending_pckts, ompi_free_list_t); OBJ_CONSTRUCT(&mca_pml_ob1.pending_pckts, ompi_free_list_t);
ompi_free_list_init( ompi_free_list_init(
&mca_pml_ob1.pending_pckts, &mca_pml_ob1.pending_pckts,
@ -206,7 +206,8 @@ int mca_pml_ob1_component_open(void)
OBJ_CONSTRUCT(&mca_pml_ob1.send_ranges, ompi_free_list_t); OBJ_CONSTRUCT(&mca_pml_ob1.send_ranges, ompi_free_list_t);
ompi_free_list_init( ompi_free_list_init(
&mca_pml_ob1.send_ranges, &mca_pml_ob1.send_ranges,
sizeof(mca_pml_ob1_send_range_t), sizeof(mca_pml_ob1_send_range_t) +
(mca_pml_ob1.max_send_per_range - 1) * sizeof(mca_pml_ob1_com_btl_t),
OBJ_CLASS(mca_pml_ob1_send_range_t), OBJ_CLASS(mca_pml_ob1_send_range_t),
mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_num,
mca_pml_ob1.free_list_max, mca_pml_ob1.free_list_max,

Просмотреть файл

@ -35,39 +35,6 @@
* from case when registration is not needed */ * from case when registration is not needed */
static mca_mpool_base_registration_t pml_ob1_dummy_reg; static mca_mpool_base_registration_t pml_ob1_dummy_reg;
/* Calculate what percentage of a message to send through each BTL according to
* relative weight */
static inline void calc_weighted_length(mca_pml_ob1_rdma_btl_t *rdma_btls,
int num_btls, size_t size, double weight_total)
{
int i;
size_t length_left = size;
/* shortcut for common case for only one BTL */
if(num_btls == 1) {
rdma_btls[0].length = size;
return;
}
for(i = 0; i < num_btls; i++) {
mca_bml_base_btl_t* bml_btl = rdma_btls[i].bml_btl;
size_t length = 0;
if(length_left != 0) {
length = (length_left > bml_btl->btl_eager_limit)?
((size_t)(size * (bml_btl->btl_weight / weight_total))) :
length_left;
if(length > length_left)
length = length_left;
length_left -= length;
}
rdma_btls[i].length = length;
}
/* account for rounding errors */
rdma_btls[0].length += length_left;
}
/* /*
* Check to see if memory is registered or can be registered. Build a * Check to see if memory is registered or can be registered. Build a
* set of registrations on the request. * set of registrations on the request.
@ -77,7 +44,7 @@ size_t mca_pml_ob1_rdma_btls(
mca_bml_base_endpoint_t* bml_endpoint, mca_bml_base_endpoint_t* bml_endpoint,
unsigned char* base, unsigned char* base,
size_t size, size_t size,
mca_pml_ob1_rdma_btl_t* rdma_btls) mca_pml_ob1_com_btl_t* rdma_btls)
{ {
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
double weight_total = 0; double weight_total = 0;
@ -127,7 +94,8 @@ size_t mca_pml_ob1_rdma_btls(
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
return 0; return 0;
calc_weighted_length(rdma_btls, num_btls_used, size, weight_total); mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
weight_total);
bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls; bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
return num_btls_used; return num_btls_used;
@ -136,7 +104,7 @@ size_t mca_pml_ob1_rdma_btls(
size_t mca_pml_ob1_rdma_pipeline_btls( size_t mca_pml_ob1_rdma_pipeline_btls(
mca_bml_base_endpoint_t* bml_endpoint, mca_bml_base_endpoint_t* bml_endpoint,
size_t size, size_t size,
mca_pml_ob1_rdma_btl_t* rdma_btls) mca_pml_ob1_com_btl_t* rdma_btls)
{ {
int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
double weight_total = 0; double weight_total = 0;
@ -152,7 +120,7 @@ size_t mca_pml_ob1_rdma_pipeline_btls(
weight_total += rdma_btls[i].bml_btl->btl_weight; weight_total += rdma_btls[i].bml_btl->btl_weight;
} }
calc_weighted_length(rdma_btls, i, size, weight_total); mca_pml_ob1_calc_weighted_length(rdma_btls, i, size, weight_total);
return i; return i;
} }

Просмотреть файл

@ -24,29 +24,18 @@
struct mca_bml_base_endpoint_t; struct mca_bml_base_endpoint_t;
/**
* structure to associate RDMA capable BTL(s) with corresponding registration
*/
struct mca_pml_ob1_rdma_btl_t {
struct mca_bml_base_btl_t* bml_btl;
struct mca_mpool_base_registration_t* btl_reg;
size_t length;
};
typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t;
/* /*
* Of the set of available btls that support RDMA, * Of the set of available btls that support RDMA,
* find those that already have registrations - or * find those that already have registrations - or
* register if required (for leave_pinned option) * register if required (for leave_pinned option)
*/ */
size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint, size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls); unsigned char* base, size_t size, struct mca_pml_ob1_com_btl_t* btls);
/* Choose RDMA BTLs to use for sending of a request by pipeline protocol. /* Choose RDMA BTLs to use for sending of a request by pipeline protocol.
* Calculate number of bytes to send through each BTL according to available * Calculate number of bytes to send through each BTL according to available
* bandwidth */ * bandwidth */
size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint, size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
size_t size, mca_pml_ob1_rdma_btl_t* rdma_btls); size_t size, mca_pml_ob1_com_btl_t* rdma_btls);
#endif #endif

Просмотреть файл

@ -52,7 +52,7 @@ struct mca_pml_ob1_recv_request_t {
uint32_t req_rdma_idx; uint32_t req_rdma_idx;
bool req_pending; bool req_pending;
bool req_ack_sent; /**< whether ack was sent to the sender */ bool req_ack_sent; /**< whether ack was sent to the sender */
mca_pml_ob1_rdma_btl_t req_rdma[1]; mca_pml_ob1_com_btl_t req_rdma[1];
}; };
typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t; typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;

Просмотреть файл

@ -883,7 +883,10 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
{ {
mca_pml_ob1_send_range_t *sr; mca_pml_ob1_send_range_t *sr;
ompi_free_list_item_t *i; ompi_free_list_item_t *i;
int rc = OMPI_SUCCESS; mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint;
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
int rc = OMPI_SUCCESS, n;
double weight_total = 0;
if(0 == send_length) if(0 == send_length)
return; return;
@ -894,6 +897,18 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
sr->range_send_offset = send_offset; sr->range_send_offset = send_offset;
sr->range_send_length = send_length; sr->range_send_length = send_length;
sr->range_btl_idx = 0;
for(n = 0; n < num_btls && n < mca_pml_ob1.max_send_per_range; n++) {
sr->range_btls[n].bml_btl =
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send);
weight_total += sr->range_btls[n].bml_btl->btl_weight;
}
sr->range_btl_cnt = n;
mca_pml_ob1_calc_weighted_length(sr->range_btls, n, send_length,
weight_total);
OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr); opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr);
OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
@ -910,22 +925,19 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
int mca_pml_ob1_send_request_schedule_exclusive( int mca_pml_ob1_send_request_schedule_exclusive(
mca_pml_ob1_send_request_t* sendreq) mca_pml_ob1_send_request_t* sendreq)
{ {
mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint;
size_t num_btl_avail =
mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
do { do {
size_t prev_bytes_remaining = 0, num_fail = 0; size_t prev_bytes_remaining = 0;
mca_pml_ob1_send_range_t *range = NULL; mca_pml_ob1_send_range_t *range = NULL;
int num_fail = 0;
while(true) { while(true) {
mca_pml_ob1_frag_hdr_t* hdr; mca_pml_ob1_frag_hdr_t* hdr;
mca_btl_base_descriptor_t* des; mca_btl_base_descriptor_t* des;
int rc; int rc, btl_idx;
size_t size, offset; size_t size, offset;
opal_list_item_t *item; opal_list_item_t *item;
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_t* bml_btl;
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send);
if(NULL == range || 0 == range->range_send_length) { if(NULL == range || 0 == range->range_send_length) {
OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
@ -957,7 +969,7 @@ int mca_pml_ob1_send_request_schedule_exclusive(
prev_bytes_remaining = range->range_send_length; prev_bytes_remaining = range->range_send_length;
if (num_fail == num_btl_avail) { if (num_fail == range->range_btl_cnt) {
assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE); assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE);
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_SCHEDULE; sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_SCHEDULE;
@ -967,17 +979,13 @@ int mca_pml_ob1_send_request_schedule_exclusive(
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
if(num_btl_avail == 1 || do {
range->range_send_length < bml_btl->btl_min_send_size) { btl_idx = range->range_btl_idx;
size = range->range_send_length; bml_btl = range->range_btls[btl_idx].bml_btl;
} else { size = range->range_btls[btl_idx].length;
/* otherwise attempt to give the BTL a percentage of the message if(++range->range_btl_idx == range->range_btl_cnt)
* based on a weighting factor. for simplicity calculate this as range->range_btl_idx = 0;
* a percentage of the overall message length (regardless of } while(!size);
* amount previously assigned)
*/
size = (size_t)(bml_btl->btl_weight * range->range_send_length);
}
/* makes sure that we don't exceed BTL max send size */ /* makes sure that we don't exceed BTL max send size */
if (bml_btl->btl_max_send_size != 0 && if (bml_btl->btl_max_send_size != 0 &&
@ -1035,8 +1043,9 @@ int mca_pml_ob1_send_request_schedule_exclusive(
rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML);
if(rc == OMPI_SUCCESS) { if(rc == OMPI_SUCCESS) {
range->range_send_length -= size;
/* update state */ /* update state */
range->range_btls[btl_idx].length -= size;
range->range_send_length -= size;
range->range_send_offset += size; range->range_send_offset += size;
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
} else { } else {

Просмотреть файл

@ -58,7 +58,7 @@ struct mca_pml_ob1_send_request_t {
mca_pml_ob1_send_pending_t req_pending; mca_pml_ob1_send_pending_t req_pending;
opal_mutex_t req_send_range_lock; opal_mutex_t req_send_range_lock;
opal_list_t req_send_ranges; opal_list_t req_send_ranges;
mca_pml_ob1_rdma_btl_t req_rdma[1]; mca_pml_ob1_com_btl_t req_rdma[1];
}; };
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
@ -68,6 +68,9 @@ struct mca_pml_ob1_send_range_t {
ompi_free_list_item_t base; ompi_free_list_item_t base;
uint64_t range_send_offset; uint64_t range_send_offset;
uint64_t range_send_length; uint64_t range_send_length;
int range_btl_idx;
int range_btl_cnt;
mca_pml_ob1_com_btl_t range_btls[1];
}; };
typedef struct mca_pml_ob1_send_range_t mca_pml_ob1_send_range_t; typedef struct mca_pml_ob1_send_range_t mca_pml_ob1_send_range_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_send_range_t); OBJ_CLASS_DECLARATION(mca_pml_ob1_send_range_t);