
Schedule RDMA traffic between BTLs in accordance with the relative bandwidth of
each BTL. Precalculate what part of a message should be sent via each BTL in
advance instead of doing it during scheduling.

This commit was SVN r15247.
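For example (assuming each share stays above the BTL eager limit), two BTLs weighted 0.75 and 0.25 would have a 1 MiB message pre-split into 768 KiB and 256 KiB portions before fragment scheduling begins.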
This commit is contained in:
Gleb Natapov 2007-07-01 11:31:26 +00:00
parent 086624a4fe
commit e74aa6b295
7 changed files with 127 additions and 76 deletions

View file

@@ -55,6 +55,7 @@ struct mca_pml_ob1_t {
size_t send_pipeline_depth;
size_t recv_pipeline_depth;
size_t rdma_put_retries_limit;
int max_rdma_per_request;
bool leave_pinned;
int leave_pinned_pipeline;

View file

@@ -113,6 +113,8 @@ int mca_pml_ob1_component_open(void)
mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
mca_pml_ob1.rdma_put_retries_limit =
mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
mca_pml_ob1.max_rdma_per_request =
mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
mca_pml_ob1.unexpected_limit =
mca_pml_ob1_param_register_int("unexpected_limit", 128);
@@ -146,7 +148,8 @@ int mca_pml_ob1_component_open(void)
OBJ_CONSTRUCT(&mca_pml_ob1.send_requests, ompi_free_list_t);
ompi_free_list_init(
&mca_pml_ob1.send_requests,
-sizeof(mca_pml_ob1_send_request_t),
+sizeof(mca_pml_ob1_send_request_t) +
+(mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t),
OBJ_CLASS(mca_pml_ob1_send_request_t),
mca_pml_ob1.free_list_num,
mca_pml_ob1.free_list_max,
@@ -156,7 +159,8 @@ int mca_pml_ob1_component_open(void)
OBJ_CONSTRUCT(&mca_pml_ob1.recv_requests, ompi_free_list_t);
ompi_free_list_init(
&mca_pml_ob1.recv_requests,
-sizeof(mca_pml_ob1_recv_request_t),
+sizeof(mca_pml_ob1_recv_request_t) +
+(mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t),
OBJ_CLASS(mca_pml_ob1_recv_request_t),
mca_pml_ob1.free_list_num,
mca_pml_ob1.free_list_max,
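The larger element size makes room for the trailing one-element req_rdma array in the request structs (changed further down): each freelist item carries max_rdma_per_request RDMA slots even though the struct declares only one. A minimal sketch of this pre-C99 "struct hack", with hypothetical names:

    #include <stdlib.h>

    /* Illustrative stand-ins, not the ob1 types. */
    struct rdma_slot { double weight; size_t length; };

    struct request {
        int used;
        struct rdma_slot rdma[1];   /* really max_rdma_per_request slots */
    };

    /* Fold the extra slots into the allocation size, as the freelist
     * initialization above does. */
    static struct request *request_alloc(int max_rdma_per_request)
    {
        return malloc(sizeof(struct request) +
                      (max_rdma_per_request - 1) * sizeof(struct rdma_slot));
    }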

View file

@@ -30,6 +30,44 @@
#include "pml_ob1.h"
#include "pml_ob1_rdma.h"
/* Use this registration, instead of NULL, when no registration is needed for a
* BTL. This helps other code distinguish the case when memory is not registered
* from the case when registration is not needed */
static mca_mpool_base_registration_t pml_ob1_dummy_reg;
/* Calculate what percentage of a message to send through each BTL according to
* its relative weight */
static inline void calc_weighted_length(mca_pml_ob1_rdma_btl_t *rdma_btls,
int num_btls, size_t size, double weight_total)
{
int i;
size_t length_left = size;
/* shortcut for the common case of only one BTL */
if(num_btls == 1) {
rdma_btls[0].length = size;
return;
}
for(i = 0; i < num_btls; i++) {
mca_bml_base_btl_t* bml_btl = rdma_btls[i].bml_btl;
size_t length = 0;
if(length_left != 0) {
length = (length_left > bml_btl->btl_eager_limit)?
((size_t)(size * (bml_btl->btl_weight / weight_total))) :
length_left;
if(length > length_left)
length = length_left;
length_left -= length;
}
rdma_btls[i].length = length;
}
/* account for rounding errors */
rdma_btls[0].length += length_left;
}
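A worked example of calc_weighted_length: with three entries weighted 0.5, 0.25, and 0.25 (weight_total = 1.0) and a 1 MiB message, the loop assigns 512 KiB, 256 KiB, and 256 KiB. Once length_left falls to btl_eager_limit or below, the current BTL simply takes everything that remains, and whatever is lost to (size_t) truncation with uneven weights is added back to rdma_btls[0].length at the end.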
/*
* Check to see if memory is registered or can be registered. Build a
* set of registrations on the request.
@@ -41,9 +79,9 @@ size_t mca_pml_ob1_rdma_btls(
size_t size,
mca_pml_ob1_rdma_btl_t* rdma_btls)
{
-size_t num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
-size_t num_btls_used = 0;
-size_t n;
+int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+double weight_total = 0;
+int num_btls_used = 0, n;
/* shortcut when there are no rdma capable btls */
if(num_btls == 0) {
@@ -51,7 +89,7 @@ size_t mca_pml_ob1_rdma_btls(
}
/* check to see if memory is registered */
-for(n = 0; n < num_btls && num_btls_used < MCA_PML_OB1_MAX_RDMA_PER_REQUEST;
+for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
n++) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
@@ -59,10 +97,7 @@ size_t mca_pml_ob1_rdma_btls(
mca_mpool_base_registration_t* reg = NULL;
mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool;
-/* btl is rdma capable and registration is not required */
-if(NULL == btl_mpool) {
-reg = NULL;
-} else {
+if(NULL != btl_mpool) {
if(!mca_pml_ob1.leave_pinned) {
/* look through existing registrations */
btl_mpool->mpool_find(btl_mpool, base, size, &reg);
@@ -73,14 +108,51 @@ size_t mca_pml_ob1_rdma_btls(
if(NULL == reg)
bml_btl = NULL; /* skip it */
} else {
/* if registration is not required use dummy registration */
reg = &pml_ob1_dummy_reg;
}
if(bml_btl != NULL) {
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg;
weight_total += bml_btl->btl_weight;
num_btls_used++;
}
}
/* if we don't use leave_pinned and all BTLs that already have this memory
* registered amount to less than half of available bandwidth - fall back to
* pipeline protocol */
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
return 0;
calc_weighted_length(rdma_btls, num_btls_used, size, weight_total);
bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
return num_btls_used;
}
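The dummy registration is a sentinel: a statically allocated object whose address means "registration unnecessary", as opposed to NULL, which means "not registered". A minimal standalone sketch of the pattern, with invented names rather than the real ob1/mpool API:

    #include <stddef.h>

    /* Illustrative stand-ins, not the real ob1/mpool types. */
    struct mpool;
    struct registration { struct mpool *mpool; };
    struct mpool {
        void (*mpool_deregister)(struct mpool *, struct registration *);
    };

    /* Sentinel: its address means "no registration needed"; mpool stays NULL. */
    static struct registration dummy_reg;

    static void release_registration(struct registration *reg)
    {
        /* NULL means "not registered"; a NULL mpool marks the dummy.
         * Only real registrations reach mpool_deregister. */
        if (reg != NULL && reg->mpool != NULL)
            reg->mpool->mpool_deregister(reg->mpool, reg);
    }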
size_t mca_pml_ob1_rdma_pipeline_btls(
mca_bml_base_endpoint_t* bml_endpoint,
size_t size,
mca_pml_ob1_rdma_btl_t* rdma_btls)
{
int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
double weight_total = 0;
for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
rdma_btls[i].bml_btl =
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
if(rdma_btls[i].bml_btl->btl_mpool != NULL)
rdma_btls[i].btl_reg = NULL;
else
rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
weight_total += rdma_btls[i].bml_btl->btl_weight;
}
calc_weighted_length(rdma_btls, i, size, weight_total);
return i;
}
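Unlike mca_pml_ob1_rdma_btls above, this pipeline variant never rejects a BTL: it round-robins through every RDMA-capable BTL (up to max_rdma_per_request), leaves btl_reg NULL where a memory pool exists so fragments are registered on the fly, and splits the byte counts by weight exactly as before.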

View file

@@ -31,13 +31,10 @@ struct mca_bml_base_endpoint_t;
struct mca_pml_ob1_rdma_btl_t {
struct mca_bml_base_btl_t* bml_btl;
struct mca_mpool_base_registration_t* btl_reg;
size_t length;
};
typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t;
-#define MCA_PML_OB1_MAX_RDMA_PER_REQUEST 4
/*
* Of the set of available btls that support RDMA,
* find those that already have registrations - or
@@ -46,5 +43,10 @@ typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t;
size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls);
/* Choose RDMA BTLs to use for sending a request via the pipeline protocol.
* Calculate the number of bytes to send through each BTL according to available
* bandwidth */
size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
size_t size, mca_pml_ob1_rdma_btl_t* rdma_btls);
#endif

View file

@@ -240,13 +240,15 @@ static int mca_pml_ob1_recv_request_ack(
/* by default copy everything */
recvreq->req_send_offset = bytes_received;
if(hdr->hdr_msg_length > bytes_received) {
size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
/*
* lookup request buffer to determine if memory is already
* registered.
*/
if(ompi_convertor_need_buffers(&recvreq->req_recv.req_convertor) == 0 &&
-hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG) {
+hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG &&
+rdma_num != 0) {
unsigned char *base;
ompi_convertor_get_current_pointer( &recvreq->req_recv.req_convertor, (void**)&(base) );
@@ -261,19 +263,23 @@ static int mca_pml_ob1_recv_request_ack(
recvreq->req_send_offset = hdr->hdr_msg_length;
/* are rdma devices available for long rdma protocol */
-} else if (bml_endpoint->btl_send_limit < hdr->hdr_msg_length &&
-bml_endpoint->btl_pipeline_send_length <
-hdr->hdr_msg_length &&
-mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) {
+} else if(bml_endpoint->btl_send_limit < hdr->hdr_msg_length) {
/* use convertor to figure out the rdma offset for this request */
recvreq->req_send_offset = hdr->hdr_msg_length -
bml_endpoint->btl_pipeline_send_length;
-if(recvreq->req_send_offset < bytes_received) {
+if(recvreq->req_send_offset < bytes_received)
recvreq->req_send_offset = bytes_received;
-}
-ompi_convertor_set_position( &recvreq->req_recv.req_convertor,
-&recvreq->req_send_offset );
+/* use the convertor to figure out the rdma offset for this
+* request */
+ompi_convertor_set_position(&recvreq->req_recv.req_convertor,
+&recvreq->req_send_offset);
recvreq->req_rdma_cnt =
mca_pml_ob1_rdma_pipeline_btls(bml_endpoint,
recvreq->req_send_offset - bytes_received,
recvreq->req_rdma);
}
}
/* nothing to send by copy in/out - no need to ack */
@@ -592,14 +598,8 @@ void mca_pml_ob1_recv_request_matched_probe(
int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* recvreq )
{
ompi_proc_t* proc = recvreq->req_recv.req_base.req_proc;
-mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
mca_bml_base_btl_t* bml_btl;
-int num_btl_avail =
-mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
-int num_tries = num_btl_avail;
-if(recvreq->req_rdma_cnt)
-num_tries = recvreq->req_rdma_cnt;
+int num_tries = recvreq->req_rdma_cnt;
do {
size_t bytes_remaining = recvreq->req_send_offset -
@@ -615,8 +615,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
mca_btl_base_descriptor_t* dst;
mca_btl_base_descriptor_t* ctl;
mca_mpool_base_registration_t * reg = NULL;
-int rc;
+int rc, rdma_idx;
if(prev_bytes_remaining == bytes_remaining) {
if( ++num_fail == num_tries ) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@@ -636,47 +636,18 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
ompi_convertor_set_position(&recvreq->req_recv.req_convertor,
&recvreq->req_rdma_offset);
-if(recvreq->req_rdma_cnt) {
-/*
-* Select the next btl out of the list w/ preregistered
-* memory.
-*/
-bml_btl = recvreq->req_rdma[recvreq->req_rdma_idx].bml_btl;
-num_btl_avail = recvreq->req_rdma_cnt - recvreq->req_rdma_idx;
-reg = recvreq->req_rdma[recvreq->req_rdma_idx].btl_reg;
-if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
+do {
+rdma_idx = recvreq->req_rdma_idx;
+bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
+reg = recvreq->req_rdma[rdma_idx].btl_reg;
+size = recvreq->req_rdma[rdma_idx].length;
+if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
recvreq->req_rdma_idx = 0;
-} else {
-/*
-* Otherwise, schedule round-robin across the
-* available RDMA nics dynamically registering/deregister
-* as required.
-*/
-bml_btl =
-mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
-}
+} while(!size);
-/*
-* If more than one NIC is available - try to use both for
-* anything larger than the eager limit
-*/
-if( num_btl_avail == 1 ||
-bytes_remaining < bml_btl->btl_eager_limit ) {
-size = bytes_remaining;
-} else {
-/*
-* otherwise attempt to give the BTL a percentage of
-* the message based on a weighting factor. for
-* simplicity calculate this as a percentage of the
-* overall message length (regardless of amount
-* previously assigned)
-*/
-size = (size_t)(bml_btl->btl_weight * bytes_remaining);
-}
/* makes sure that we don't exceed BTL max rdma size
* if memory is not pinned already */
-if(0 == recvreq->req_rdma_cnt &&
+if(NULL == reg &&
bml_btl->btl_rdma_pipeline_frag_size != 0 &&
size > bml_btl->btl_rdma_pipeline_frag_size) {
size = bml_btl->btl_rdma_pipeline_frag_size;
@@ -684,8 +655,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
/* prepare a descriptor for RDMA */
mca_bml_base_prepare_dst(bml_btl, reg,
-&recvreq->req_recv.req_convertor, MCA_BTL_NO_ORDER,
-0, &size, &dst);
+&recvreq->req_recv.req_convertor,
+MCA_BTL_NO_ORDER, 0, &size, &dst);
if(dst == NULL) {
continue;
}
@@ -751,6 +722,7 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
/* update request state */
recvreq->req_rdma_offset += size;
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,1);
recvreq->req_rdma[rdma_idx].length -= size;
bytes_remaining -= size;
} else {
mca_bml_base_free(bml_btl,ctl);
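The new do/while walks the precomputed per-BTL budgets round-robin and skips entries whose remaining length has dropped to zero. A self-contained sketch of that selection step, with invented names:

    #include <stddef.h>

    struct rdma_slot { size_t length; };   /* invented, not the ob1 struct */

    /* Round-robin to the next slot that still has bytes budgeted; assumes
     * at least one slot has a non-zero length, as the scheduler guarantees
     * while bytes remain to be scheduled. */
    static int next_rdma_slot(struct rdma_slot *slots, int cnt, int *idx)
    {
        int i;
        do {
            i = *idx;
            if (++*idx >= cnt)
                *idx = 0;
        } while (slots[i].length == 0);
        return i;
    }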

View file

@@ -48,11 +48,11 @@ struct mca_pml_ob1_recv_request_t {
size_t req_bytes_delivered;
size_t req_rdma_offset;
size_t req_send_offset;
-mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST];
uint32_t req_rdma_cnt;
uint32_t req_rdma_idx;
bool req_pending;
bool req_ack_sent; /**< whether ack was sent to the sender */
+mca_pml_ob1_rdma_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;
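Shrinking req_rdma to one element and moving it to the end of the struct is what the adjusted free-list sizes earlier rely on: the remaining max_rdma_per_request - 1 entries live in the extra bytes appended to each freelist allocation.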
@@ -135,7 +135,7 @@ do {
\
for( r = 0; r < recvreq->req_rdma_cnt; r++ ) { \
mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[r].btl_reg; \
-if( NULL != btl_reg ) { \
+if( NULL != btl_reg && btl_reg->mpool != NULL) { \
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); \
} \
} \
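The added btl_reg->mpool != NULL guard is what keeps the dummy registration, whose mpool field is NULL, from ever being passed to mpool_deregister.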

View file

@@ -54,11 +54,11 @@ struct mca_pml_ob1_send_request_t {
bool req_throttle_sends;
size_t req_pipeline_depth;
size_t req_bytes_delivered;
-mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST];
uint32_t req_rdma_cnt;
mca_pml_ob1_send_pending_t req_pending;
opal_mutex_t req_send_range_lock;
opal_list_t req_send_ranges;
+mca_pml_ob1_rdma_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
@@ -122,7 +122,7 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s
/* return mpool resources */
for(r = 0; r < sendreq->req_rdma_cnt; r++) {
mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
-if( NULL != reg ) {
+if( NULL != reg && reg->mpool != NULL ) {
reg->mpool->mpool_deregister(reg->mpool, reg);
}
}