Schedule RDMA traffic between BTLs in accordance with the relative bandwidth of
each BTL. Precalculate in advance what part of a message should be sent via each
BTL, instead of computing it during scheduling.

This commit was SVN r15247.
This commit is contained in:
parent 086624a4fe
commit e74aa6b295

@@ -55,6 +55,7 @@ struct mca_pml_ob1_t {
     size_t send_pipeline_depth;
     size_t recv_pipeline_depth;
     size_t rdma_put_retries_limit;
+    int max_rdma_per_request;
     bool leave_pinned;
     int leave_pinned_pipeline;

@@ -113,6 +113,8 @@ int mca_pml_ob1_component_open(void)
         mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
     mca_pml_ob1.rdma_put_retries_limit =
         mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
+    mca_pml_ob1.max_rdma_per_request =
+        mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
 
     mca_pml_ob1.unexpected_limit =
         mca_pml_ob1_param_register_int("unexpected_limit", 128);

@@ -146,7 +148,8 @@ int mca_pml_ob1_component_open(void)
     OBJ_CONSTRUCT(&mca_pml_ob1.send_requests, ompi_free_list_t);
     ompi_free_list_init(
         &mca_pml_ob1.send_requests,
-        sizeof(mca_pml_ob1_send_request_t),
+        sizeof(mca_pml_ob1_send_request_t) +
+        (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t),
         OBJ_CLASS(mca_pml_ob1_send_request_t),
         mca_pml_ob1.free_list_num,
         mca_pml_ob1.free_list_max,

@@ -156,7 +159,8 @@ int mca_pml_ob1_component_open(void)
     OBJ_CONSTRUCT(&mca_pml_ob1.recv_requests, ompi_free_list_t);
     ompi_free_list_init(
         &mca_pml_ob1.recv_requests,
-        sizeof(mca_pml_ob1_recv_request_t),
+        sizeof(mca_pml_ob1_recv_request_t) +
+        (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t),
         OBJ_CLASS(mca_pml_ob1_recv_request_t),
         mca_pml_ob1.free_list_num,
         mca_pml_ob1.free_list_max,

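The sizing formula here pairs with a change further down: req_rdma in the
request structures becomes a one-element trailing array (req_rdma[1], the
classic C89 struct hack), and the extra
(max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t) bytes per free-list
element give it room to grow. A minimal standalone sketch of the trick, with
placeholder types standing in for the real OB1 structs:

    #include <stdio.h>
    #include <stdlib.h>

    /* placeholder stand-ins for the real OB1 types */
    typedef struct { void *bml_btl; void *btl_reg; size_t length; } rdma_btl_t;
    typedef struct {
        int req_other_fields;   /* stands in for the rest of the request */
        rdma_btl_t req_rdma[1]; /* grows into the trailing bytes */
    } request_t;

    int main(void)
    {
        int max_rdma_per_request = 4; /* same default as the MCA parameter */
        /* same sizing formula as the ompi_free_list_init() calls above */
        size_t elem_size = sizeof(request_t) +
            (max_rdma_per_request - 1) * sizeof(rdma_btl_t);
        request_t *req = malloc(elem_size);
        int i;

        if (NULL == req)
            return 1;
        /* req->req_rdma[0] .. req_rdma[3] all lie within the allocation */
        for (i = 0; i < max_rdma_per_request; i++)
            req->req_rdma[i].length = 0;
        printf("element size: %zu bytes\n", elem_size);
        free(req);
        return 0;
    }
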
@@ -30,6 +30,44 @@
 #include "pml_ob1.h"
 #include "pml_ob1_rdma.h"
 
+/* Use this registration if no registration needed for a BTL instead of NULL.
+ * This will help other code to distinguish case when memory is not registered
+ * from case when registration is not needed */
+static mca_mpool_base_registration_t pml_ob1_dummy_reg;
+
+/* Calculate what percentage of a message to send through each BTL according to
+ * relative weight */
+static inline void calc_weighted_length(mca_pml_ob1_rdma_btl_t *rdma_btls,
+        int num_btls, size_t size, double weight_total)
+{
+    int i;
+    size_t length_left = size;
+
+    /* shortcut for common case for only one BTL */
+    if(num_btls == 1) {
+        rdma_btls[0].length = size;
+        return;
+    }
+
+    for(i = 0; i < num_btls; i++) {
+        mca_bml_base_btl_t* bml_btl = rdma_btls[i].bml_btl;
+        size_t length = 0;
+        if(length_left != 0) {
+            length = (length_left > bml_btl->btl_eager_limit)?
+                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
+                length_left;
+
+            if(length > length_left)
+                length = length_left;
+            length_left -= length;
+        }
+        rdma_btls[i].length = length;
+    }
+
+    /* account for rounding errors */
+    rdma_btls[0].length += length_left;
+}
+
 /*
  * Check to see if memory is registered or can be registered. Build a
  * set of registrations on the request.

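To make the split concrete, here is a self-contained sketch of the same
algorithm (simplified stand-in types and hypothetical weights, not the real
OB1 structures): two BTLs weighted 0.7 and 0.3 splitting a 1,000,000-byte
message.

    #include <stdio.h>

    /* simplified stand-ins for mca_bml_base_btl_t / mca_pml_ob1_rdma_btl_t */
    typedef struct { double btl_weight; size_t btl_eager_limit; } btl_t;
    typedef struct { btl_t *bml_btl; size_t length; } rdma_btl_t;

    /* same logic as calc_weighted_length() in the hunk above */
    static void split(rdma_btl_t *rdma_btls, int num_btls, size_t size,
                      double weight_total)
    {
        size_t length_left = size;
        int i;

        for (i = 0; i < num_btls; i++) {
            size_t length = 0;
            if (length_left != 0) {
                btl_t *btl = rdma_btls[i].bml_btl;
                length = (length_left > btl->btl_eager_limit) ?
                    (size_t)(size * (btl->btl_weight / weight_total)) :
                    length_left;
                if (length > length_left)
                    length = length_left;
                length_left -= length;
            }
            rdma_btls[i].length = length;
        }
        rdma_btls[0].length += length_left; /* rounding remainder */
    }

    int main(void)
    {
        btl_t fast = { 0.7, 4096 }, slow = { 0.3, 4096 }; /* hypothetical */
        rdma_btl_t btls[2] = { { &fast, 0 }, { &slow, 0 } };

        split(btls, 2, 1000000, 0.7 + 0.3);
        /* prints btl0=700000 btl1=300000: each BTL gets its share */
        printf("btl0=%zu btl1=%zu\n", btls[0].length, btls[1].length);
        return 0;
    }

Short final chunks (at or below the eager limit) are handed whole to the
current BTL rather than split further, and any rounding shortfall is credited
to the first BTL.
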
@@ -41,9 +79,9 @@ size_t mca_pml_ob1_rdma_btls(
     size_t size,
     mca_pml_ob1_rdma_btl_t* rdma_btls)
 {
-    size_t num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
-    size_t num_btls_used = 0;
-    size_t n;
+    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+    int num_btls_used = 0, n;
 
     /* shortcut when there are no rdma capable btls */
     if(num_btls == 0) {

@@ -51,7 +89,7 @@ size_t mca_pml_ob1_rdma_btls(
     }
 
     /* check to see if memory is registered */
-    for(n = 0; n < num_btls && num_btls_used < MCA_PML_OB1_MAX_RDMA_PER_REQUEST;
+    for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
             n++) {
         mca_bml_base_btl_t* bml_btl =
             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,

@@ -59,10 +97,7 @@ size_t mca_pml_ob1_rdma_btls(
         mca_mpool_base_registration_t* reg = NULL;
         mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool;
 
-        /* btl is rdma capable and registration is not required */
-        if(NULL == btl_mpool) {
-            reg = NULL;
-        } else {
+        if(NULL != btl_mpool) {
             if(!mca_pml_ob1.leave_pinned) {
                 /* look through existing registrations */
                 btl_mpool->mpool_find(btl_mpool, base, size, &reg);

@@ -73,14 +108,51 @@ size_t mca_pml_ob1_rdma_btls(
 
             if(NULL == reg)
                 bml_btl = NULL; /* skip it */
+        } else {
+            /* if registration is not required use dummy registration */
+            reg = &pml_ob1_dummy_reg;
         }
 
         if(bml_btl != NULL) {
             rdma_btls[num_btls_used].bml_btl = bml_btl;
             rdma_btls[num_btls_used].btl_reg = reg;
+            weight_total += bml_btl->btl_weight;
             num_btls_used++;
         }
     }
 
+    /* if we don't use leave_pinned and all BTLs that already have this memory
+     * registered amount to less then half of available bandwidth - fall back to
+     * pipeline protocol */
+    if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
+        return 0;
+
+    calc_weighted_length(rdma_btls, num_btls_used, size, weight_total);
+
+    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
     return num_btls_used;
 }
+
+size_t mca_pml_ob1_rdma_pipeline_btls(
+    mca_bml_base_endpoint_t* bml_endpoint,
+    size_t size,
+    mca_pml_ob1_rdma_btl_t* rdma_btls)
+{
+    int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+
+    for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
+        rdma_btls[i].bml_btl =
+            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
+        if(rdma_btls[i].bml_btl->btl_mpool != NULL)
+            rdma_btls[i].btl_reg = NULL;
+        else
+            rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
+
+        weight_total += rdma_btls[i].bml_btl->btl_weight;
+    }
+
+    calc_weighted_length(rdma_btls, i, size, weight_total);
+
+    return i;
+}

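The weight_total < 0.5 cutoff above reads naturally once you know that BTL
weights are normalized fractions of the endpoint's aggregate bandwidth: summed
over just the BTLs that already hold a registration, weight_total is the share
of total bandwidth the single-shot RDMA path could use. A toy illustration
with hypothetical numbers (weights 0.6 and 0.4, only the 0.4 BTL registered):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
        /* hypothetical endpoint state, not real OB1 calls */
        double weight_total = 0.4; /* sum over BTLs with registrations */
        bool leave_pinned = false;
        int num_btls_used = 1;

        /* same predicate as in mca_pml_ob1_rdma_btls() above */
        if (0 == num_btls_used || (!leave_pinned && weight_total < 0.5))
            printf("fall back to the pipeline protocol\n");
        else
            printf("use single-shot RDMA over the registered BTLs\n");
        return 0;
    }
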
@@ -31,13 +31,10 @@ struct mca_bml_base_endpoint_t;
 struct mca_pml_ob1_rdma_btl_t {
     struct mca_bml_base_btl_t* bml_btl;
     struct mca_mpool_base_registration_t* btl_reg;
+    size_t length;
 };
 typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t;
 
-
-#define MCA_PML_OB1_MAX_RDMA_PER_REQUEST 4
-
-
 /*
  * Of the set of available btls that support RDMA,
  * find those that already have registrations - or

@@ -46,5 +43,10 @@ typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t;
 size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
     unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls);
 
+/* Choose RDMA BTLs to use for sending of a request by pipeline protocol.
+ * Calculate number of bytes to send through each BTL according to available
+ * bandwidth */
+size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
+    size_t size, mca_pml_ob1_rdma_btl_t* rdma_btls);
 #endif

@@ -240,13 +240,15 @@ static int mca_pml_ob1_recv_request_ack(
     /* by default copy everything */
     recvreq->req_send_offset = bytes_received;
     if(hdr->hdr_msg_length > bytes_received) {
+        size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
         /*
          * lookup request buffer to determine if memory is already
          * registered.
          */
 
         if(ompi_convertor_need_buffers(&recvreq->req_recv.req_convertor) == 0 &&
-           hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG) {
+           hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG &&
+           rdma_num != 0) {
             unsigned char *base;
             ompi_convertor_get_current_pointer( &recvreq->req_recv.req_convertor, (void**)&(base) );

@@ -261,19 +263,23 @@ static int mca_pml_ob1_recv_request_ack(
 
             recvreq->req_send_offset = hdr->hdr_msg_length;
         /* are rdma devices available for long rdma protocol */
-        } else if (bml_endpoint->btl_send_limit < hdr->hdr_msg_length &&
-                   bml_endpoint->btl_pipeline_send_length <
-                   hdr->hdr_msg_length &&
-                   mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) {
-
+        } else if(bml_endpoint->btl_send_limit < hdr->hdr_msg_length) {
             /* use convertor to figure out the rdma offset for this request */
             recvreq->req_send_offset = hdr->hdr_msg_length -
                 bml_endpoint->btl_pipeline_send_length;
-            if(recvreq->req_send_offset < bytes_received) {
+
+            if(recvreq->req_send_offset < bytes_received)
                 recvreq->req_send_offset = bytes_received;
-            }
-            ompi_convertor_set_position( &recvreq->req_recv.req_convertor,
-                                         &recvreq->req_send_offset );
+
+            /* use converter to figure out the rdma offset for this
+             * request */
+            ompi_convertor_set_position(&recvreq->req_recv.req_convertor,
+                    &recvreq->req_send_offset);
+
+            recvreq->req_rdma_cnt =
+                mca_pml_ob1_rdma_pipeline_btls(bml_endpoint,
+                        recvreq->req_send_offset - bytes_received,
+                        recvreq->req_rdma);
         }
     }
     /* nothing to send by copy in/out - no need to ack */

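A worked example of the offset arithmetic in this hunk, with hypothetical
sizes: the final btl_pipeline_send_length bytes stay on the send/recv
pipeline, the first bytes_received bytes arrived eagerly, and the region in
between is what gets handed to mca_pml_ob1_rdma_pipeline_btls() for RDMA.

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical sizes */
        size_t hdr_msg_length = 10u << 20;          /* 10 MB message  */
        size_t bytes_received = 64u << 10;          /* 64 KB eager    */
        size_t btl_pipeline_send_length = 1u << 20; /* 1 MB           */

        /* same computation as the hunk above, clamp included */
        size_t req_send_offset = hdr_msg_length - btl_pipeline_send_length;
        if (req_send_offset < bytes_received)
            req_send_offset = bytes_received;

        /* bytes that mca_pml_ob1_rdma_pipeline_btls() will spread
         * across the BTLs: 9 MB - 64 KB here */
        printf("rdma bytes: %zu\n", req_send_offset - bytes_received);
        return 0;
    }
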
@@ -592,14 +598,8 @@ void mca_pml_ob1_recv_request_matched_probe(
 int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* recvreq )
 {
     ompi_proc_t* proc = recvreq->req_recv.req_base.req_proc;
     mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
     mca_bml_base_btl_t* bml_btl;
-    int num_btl_avail =
-        mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
-    int num_tries = num_btl_avail;
-
-    if(recvreq->req_rdma_cnt)
-        num_tries = recvreq->req_rdma_cnt;
+    int num_tries = recvreq->req_rdma_cnt;
 
     do {
         size_t bytes_remaining = recvreq->req_send_offset -

@@ -615,8 +615,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
         mca_btl_base_descriptor_t* dst;
         mca_btl_base_descriptor_t* ctl;
         mca_mpool_base_registration_t * reg = NULL;
-        int rc;
+        int rc, rdma_idx;
 
         if(prev_bytes_remaining == bytes_remaining) {
             if( ++num_fail == num_tries ) {
                 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);

@@ -636,47 +636,18 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
         ompi_convertor_set_position(&recvreq->req_recv.req_convertor,
                 &recvreq->req_rdma_offset);
 
-        if(recvreq->req_rdma_cnt) {
-            /*
-             * Select the next btl out of the list w/ preregistered
-             * memory.
-             */
-            bml_btl = recvreq->req_rdma[recvreq->req_rdma_idx].bml_btl;
-            num_btl_avail = recvreq->req_rdma_cnt - recvreq->req_rdma_idx;
-            reg = recvreq->req_rdma[recvreq->req_rdma_idx].btl_reg;
-
-            if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
+        do {
+            rdma_idx = recvreq->req_rdma_idx;
+            bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
+            reg = recvreq->req_rdma[rdma_idx].btl_reg;
+            size = recvreq->req_rdma[rdma_idx].length;
+            if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
                 recvreq->req_rdma_idx = 0;
-        } else {
-            /*
-             * Otherwise, schedule round-robin across the
-             * available RDMA nics dynamically registering/deregister
-             * as required.
-             */
-            bml_btl =
-                mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
-        }
+        } while(!size);
 
-        /*
-         * If more than one NIC is available - try to use both for
-         * anything larger than the eager limit
-         */
-        if( num_btl_avail == 1 ||
-            bytes_remaining < bml_btl->btl_eager_limit ) {
-            size = bytes_remaining;
-        } else {
-            /*
-             * otherwise attempt to give the BTL a percentage of
-             * the message based on a weighting factor. for
-             * simplicity calculate this as a percentage of the
-             * overall message length (regardless of amount
-             * previously assigned)
-             */
-            size = (size_t)(bml_btl->btl_weight * bytes_remaining);
-        }
         /* makes sure that we don't exceed BTL max rdma size
          * if memory is not pinned already */
-        if(0 == recvreq->req_rdma_cnt &&
+        if(NULL == reg &&
            bml_btl->btl_rdma_pipeline_frag_size != 0 &&
            size > bml_btl->btl_rdma_pipeline_frag_size) {
             size = bml_btl->btl_rdma_pipeline_frag_size;

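The do/while introduced above walks the precomputed req_rdma list round-robin
and skips entries whose byte budget is already spent; the per-BTL length
counters are decremented as fragments complete (see the last hunk of this
function below). A toy sketch of the selection loop, assuming at least one
non-zero budget is left, which the scheduler guarantees while bytes remain:

    #include <stdio.h>

    int main(void)
    {
        /* toy budgets: BTL 0 is already exhausted */
        size_t length[3] = { 0, 8192, 4096 };
        unsigned idx = 0, cnt = 3, rdma_idx;
        size_t size;

        /* same shape as the do/while in the hunk above */
        do {
            rdma_idx = idx;
            size = length[rdma_idx];
            if (++idx >= cnt)
                idx = 0;
        } while (!size);

        printf("picked btl %u with %zu bytes of budget\n", rdma_idx, size);
        return 0;
    }
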
@@ -684,8 +655,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
 
         /* prepare a descriptor for RDMA */
         mca_bml_base_prepare_dst(bml_btl, reg,
-                &recvreq->req_recv.req_convertor, MCA_BTL_NO_ORDER,
-                0, &size, &dst);
+                &recvreq->req_recv.req_convertor,
+                MCA_BTL_NO_ORDER, 0, &size, &dst);
         if(dst == NULL) {
             continue;
         }

@@ -751,6 +722,7 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
             /* update request state */
             recvreq->req_rdma_offset += size;
             OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,1);
+            recvreq->req_rdma[rdma_idx].length -= size;
             bytes_remaining -= size;
         } else {
             mca_bml_base_free(bml_btl,ctl);

@@ -48,11 +48,11 @@ struct mca_pml_ob1_recv_request_t {
     size_t req_bytes_delivered;
     size_t req_rdma_offset;
     size_t req_send_offset;
-    mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST];
     uint32_t req_rdma_cnt;
     uint32_t req_rdma_idx;
     bool req_pending;
     bool req_ack_sent; /**< whether ack was sent to the sender */
+    mca_pml_ob1_rdma_btl_t req_rdma[1];
 };
 typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;

@@ -135,7 +135,7 @@ do {
                                                                         \
     for( r = 0; r < recvreq->req_rdma_cnt; r++ ) {                      \
         mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[r].btl_reg; \
-        if( NULL != btl_reg ) {                                          \
+        if( NULL != btl_reg && btl_reg->mpool != NULL) {                 \
             btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); \
         }                                                                \
     }                                                                    \

@@ -54,11 +54,11 @@ struct mca_pml_ob1_send_request_t {
     bool req_throttle_sends;
     size_t req_pipeline_depth;
     size_t req_bytes_delivered;
-    mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST];
     uint32_t req_rdma_cnt;
     mca_pml_ob1_send_pending_t req_pending;
     opal_mutex_t req_send_range_lock;
     opal_list_t req_send_ranges;
+    mca_pml_ob1_rdma_btl_t req_rdma[1];
 };
 typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;

@@ -122,7 +122,7 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s
     /* return mpool resources */
     for(r = 0; r < sendreq->req_rdma_cnt; r++) {
         mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
-        if( NULL != reg ) {
+        if( NULL != reg && reg->mpool != NULL ) {
             reg->mpool->mpool_deregister(reg->mpool, reg);
         }
     }