Allow the PML pipeline to cache memory registrations.
To enable this (off by default), set both parameters: -mca pml_ob1_leave_pinned_pipeline 1 AND -mca mpool_use_mem_hooks 1. This commit was SVN r8949.
Этот коммит содержится в:
родитель
1abe8ef368
Коммит
44fe6c3896
@ -31,7 +31,9 @@ static void mca_bml_base_endpoint_construct(mca_bml_base_endpoint_t* ep)
|
||||
ep->btl_rdma_offset = 0;
|
||||
ep->btl_max_send_size = 0;
|
||||
ep->btl_flags = 0;
|
||||
|
||||
ep->btl_rdma_size = 0;
|
||||
ep->btl_rdma_align = 0;
|
||||
|
||||
OBJ_CONSTRUCT(&ep->btl_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&ep->btl_eager, mca_bml_base_btl_array_t);
|
||||
OBJ_CONSTRUCT(&ep->btl_send, mca_bml_base_btl_array_t);
|
||||
|
@ -209,6 +209,8 @@ struct mca_bml_base_endpoint_t {
|
||||
int btl_flags; /**< preferred method of accessing this peer */
|
||||
size_t btl_rdma_offset; /**< max of min rdma size for available rdma btls */
|
||||
size_t btl_max_send_size; /**< min of max send size for available send btls */
|
||||
size_t btl_rdma_size; /**< min of max rdma size for available rdma btls */
|
||||
size_t btl_rdma_align; /**< log2 (rounded down) of btl_rdma_size, used when aligning the rdma start address */
|
||||
mca_bml_base_btl_array_t btl_eager; /**< array of btls to use for first fragments */
|
||||
mca_bml_base_btl_array_t btl_send; /**< array of btls to use for remaining fragments */
|
||||
mca_bml_base_btl_array_t btl_rdma; /**< array of btls that support (prefer) rdma */
|
||||
|
@ -51,6 +51,15 @@ mca_bml_r2_module_t mca_bml_r2 = {
|
||||
};
|
||||
|
||||
|
||||
/**
 * Integer base-2 logarithm, rounded down.
 *
 * @param val  Value whose logarithm is wanted.
 * @return     Index of the highest set bit of val, i.e. floor(log2(val)).
 *             Both 0 and 1 map to 0.
 */
static inline unsigned int bml_base_log2(unsigned long val)
{
    unsigned int log = 0;

    /* each halving that still leaves a non-zero value is one more power of two */
    while ((val >>= 1) != 0) {
        log++;
    }
    return log;
}
|
||||
|
||||
static int btl_exclusivity_compare(const void* arg1, const void* arg2)
|
||||
{
|
||||
mca_btl_base_module_t* btl1 = *(struct mca_btl_base_module_t**)arg1;
|
||||
@ -249,6 +258,7 @@ int mca_bml_r2_add_procs(
|
||||
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules);
|
||||
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules);
|
||||
bml_endpoint->btl_max_send_size = -1;
|
||||
bml_endpoint->btl_rdma_size = -1;
|
||||
bml_endpoint->btl_proc = proc;
|
||||
proc->proc_pml = (struct mca_pml_proc_t*) bml_endpoint;
|
||||
|
||||
@ -381,6 +391,10 @@ int mca_bml_r2_add_procs(
|
||||
if(bml_endpoint->btl_rdma_offset < bml_btl_rdma->btl_min_rdma_size) {
|
||||
bml_endpoint->btl_rdma_offset = bml_btl_rdma->btl_min_rdma_size;
|
||||
}
|
||||
if(bml_endpoint->btl_rdma_size > btl->btl_max_rdma_size) {
|
||||
bml_endpoint->btl_rdma_size = btl->btl_max_rdma_size;
|
||||
bml_endpoint->btl_rdma_align = bml_base_log2(bml_endpoint->btl_rdma_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -53,7 +53,8 @@ struct mca_pml_ob1_t {
|
||||
size_t send_pipeline_depth;
|
||||
size_t recv_pipeline_depth;
|
||||
bool leave_pinned;
|
||||
|
||||
int leave_pinned_pipeline;
|
||||
|
||||
/* lock queue access */
|
||||
opal_mutex_t lock;
|
||||
|
||||
|
@ -96,6 +96,8 @@ int mca_pml_ob1_component_open(void)
|
||||
mca_pml_ob1_param_register_int("send_pipeline_depth", 3);
|
||||
mca_pml_ob1.recv_pipeline_depth =
|
||||
mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
|
||||
/* leave_pinned_pipeline is opt-in: any non-zero value enables the cached
 * registration pipeline, so the default must be 0 (disabled). */
mca_pml_ob1.leave_pinned_pipeline =
    mca_pml_ob1_param_register_int("leave_pinned_pipeline", 0);
|
||||
|
||||
OBJ_CONSTRUCT(&mca_pml_ob1.lock, opal_mutex_t);
|
||||
|
||||
|
@ -203,3 +203,73 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
|
||||
OBJ_DESTRUCT(®s);
|
||||
return fit;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* For a given btl - find the best fit registration or
|
||||
* optionally create one for leave pinned.
|
||||
*/
|
||||
|
||||
mca_mpool_base_registration_t* mca_pml_ob1_rdma_register(
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
unsigned char* base,
|
||||
size_t size)
|
||||
{
|
||||
ompi_pointer_array_t regs;
|
||||
mca_mpool_base_registration_t* fit = NULL;
|
||||
mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool;
|
||||
uint32_t reg_cnt;
|
||||
size_t r;
|
||||
int rc;
|
||||
|
||||
/* btl is rdma capable and registration is not required */
|
||||
if(NULL == btl_mpool) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* check to see if memory is registered */
|
||||
OBJ_CONSTRUCT(®s, ompi_pointer_array_t);
|
||||
ompi_pointer_array_remove_all(®s);
|
||||
|
||||
/* look through existing registrations */
|
||||
btl_mpool->mpool_find(btl_mpool,
|
||||
base,
|
||||
size,
|
||||
®s,
|
||||
®_cnt);
|
||||
|
||||
|
||||
/*
|
||||
* find the best fit when there are multiple registrations
|
||||
*/
|
||||
for(r = 0; r < reg_cnt; r++) {
|
||||
mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(®s, r);
|
||||
size_t reg_len = reg->bound - base + 1;
|
||||
if(reg->base <= base && reg_len >= size) {
|
||||
fit = reg;
|
||||
} else {
|
||||
btl_mpool->mpool_deregister(btl_mpool, reg);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* if the leave pinned option is set - and there is not an existing
|
||||
* registration that satisfies this request, create one.
|
||||
*/
|
||||
if(NULL == fit) {
|
||||
/* register the memory */
|
||||
rc = btl_mpool->mpool_register(
|
||||
btl_mpool,
|
||||
base,
|
||||
size,
|
||||
MCA_MPOOL_FLAGS_CACHE,
|
||||
&fit);
|
||||
if(ORTE_SUCCESS != rc || NULL == fit) {
|
||||
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(®s);
|
||||
return fit;
|
||||
}
|
||||
|
@ -60,5 +60,14 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
|
||||
unsigned char* base,
|
||||
size_t size);
|
||||
|
||||
/*
|
||||
* Create a registration
|
||||
*/
|
||||
|
||||
mca_mpool_base_registration_t* mca_pml_ob1_rdma_register(
|
||||
struct mca_bml_base_btl_t* bml_btl,
|
||||
unsigned char* base,
|
||||
size_t size);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -208,20 +208,48 @@ static void mca_pml_ob1_recv_request_ack(
|
||||
/* start rdma at current fragment offset - no need to ack */
|
||||
recvreq->req_rdma_offset = recvreq->req_bytes_received;
|
||||
return;
|
||||
|
||||
/* are rdma devices available for long rdma protocol */
|
||||
} else if (bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length &&
|
||||
|
||||
/* are rdma devices available for long rdma protocol */
|
||||
} else if (mca_pml_ob1.leave_pinned_pipeline &&
|
||||
hdr->hdr_msg_length > bml_endpoint->btl_rdma_size &&
|
||||
mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) {
|
||||
|
||||
char* base;
|
||||
char* align;
|
||||
long lb;
|
||||
|
||||
/* round this up/down to the next aligned address */
|
||||
ompi_ddt_type_lb(recvreq->req_recv.req_convertor.pDesc, &lb);
|
||||
base = recvreq->req_recv.req_convertor.pBaseBuf + lb;
|
||||
align = (char*)up_align_addr(base, bml_endpoint->btl_rdma_align)+1;
|
||||
recvreq->req_rdma_offset = align - base;
|
||||
|
||||
/* still w/in range */
|
||||
if(recvreq->req_rdma_offset < bytes_received) {
|
||||
recvreq->req_rdma_offset = bytes_received;
|
||||
}
|
||||
if(recvreq->req_rdma_offset > hdr->hdr_msg_length) {
|
||||
recvreq->req_rdma_offset = hdr->hdr_msg_length;
|
||||
} else {
|
||||
ompi_convertor_set_position(
|
||||
&recvreq->req_recv.req_convertor,
|
||||
&recvreq->req_rdma_offset);
|
||||
}
|
||||
|
||||
|
||||
/* are rdma devices available for long rdma protocol */
|
||||
} else if (!mca_pml_ob1.leave_pinned_pipeline &&
|
||||
bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length &&
|
||||
mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) {
|
||||
|
||||
/* use convertor to figure out the rdma offset for this request */
|
||||
recvreq->req_rdma_offset = bml_endpoint->btl_rdma_offset;
|
||||
if(recvreq->req_rdma_offset < recvreq->req_bytes_received) {
|
||||
recvreq->req_rdma_offset = recvreq->req_bytes_received;
|
||||
}
|
||||
ompi_convertor_set_position(
|
||||
&recvreq->req_recv.req_convertor,
|
||||
&recvreq->req_rdma_offset);
|
||||
}
|
||||
&recvreq->req_recv.req_convertor,
|
||||
&recvreq->req_rdma_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -572,9 +600,11 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq)
|
||||
mca_mpool_base_registration_t * reg = NULL;
|
||||
size_t num_btl_avail;
|
||||
int rc;
|
||||
|
||||
bool release = false;
|
||||
|
||||
ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset);
|
||||
if(recvreq->req_rdma_cnt) {
|
||||
|
||||
|
||||
/*
|
||||
* Select the next btl out of the list w/ preregistered
|
||||
* memory.
|
||||
@ -602,6 +632,8 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq)
|
||||
}
|
||||
|
||||
} else {
|
||||
char* base;
|
||||
long lb;
|
||||
|
||||
/*
|
||||
* Otherwise, schedule round-robin across the
|
||||
@ -631,10 +663,16 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq)
|
||||
if (bml_btl->btl_max_rdma_size != 0 && size > bml_btl->btl_max_rdma_size) {
|
||||
size = bml_btl->btl_max_rdma_size;
|
||||
}
|
||||
if(mca_pml_ob1.leave_pinned_pipeline) {
|
||||
/* lookup and/or create a cached registration */
|
||||
ompi_ddt_type_lb(recvreq->req_recv.req_convertor.pDesc, &lb);
|
||||
base = recvreq->req_recv.req_convertor.pBaseBuf + lb + recvreq->req_rdma_offset;
|
||||
reg = mca_pml_ob1_rdma_register(bml_btl, base, size);
|
||||
release = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* prepare a descriptor for RDMA */
|
||||
ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset);
|
||||
mca_bml_base_prepare_dst(
|
||||
bml_btl,
|
||||
reg,
|
||||
@ -648,6 +686,9 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq)
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
break;
|
||||
}
|
||||
if(release == true && NULL != bml_btl->btl_mpool) {
|
||||
bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg);
|
||||
}
|
||||
dst->des_cbfunc = mca_pml_ob1_put_completion;
|
||||
dst->des_cbdata = recvreq;
|
||||
|
||||
|
@ -945,6 +945,7 @@ void mca_pml_ob1_send_request_put(
|
||||
size_t offset = hdr->hdr_rdma_offset;
|
||||
size_t i, size = 0;
|
||||
int rc;
|
||||
bool release = false;
|
||||
|
||||
bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag, rc);
|
||||
@ -971,9 +972,21 @@ void mca_pml_ob1_send_request_put(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* set convertor at current offset */
|
||||
ompi_convertor_set_position(&sendreq->req_send.req_convertor, &offset);
|
||||
|
||||
/* if registration doesnt exist - create one */
|
||||
if (mca_pml_ob1.leave_pinned_pipeline && reg == NULL) {
|
||||
unsigned char* base;
|
||||
long lb;
|
||||
ompi_ddt_type_lb(sendreq->req_send.req_convertor.pDesc, &lb);
|
||||
base = (unsigned char*)sendreq->req_send.req_convertor.pBaseBuf + lb + offset;
|
||||
reg = mca_pml_ob1_rdma_register(bml_btl, base, size);
|
||||
release = true;
|
||||
}
|
||||
|
||||
/* setup descriptor */
|
||||
ompi_convertor_set_position(&sendreq->req_send.req_convertor, &offset);
|
||||
mca_bml_base_prepare_src(
|
||||
bml_btl,
|
||||
reg,
|
||||
@ -988,6 +1001,11 @@ void mca_pml_ob1_send_request_put(
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
}
|
||||
|
||||
if(release == true && NULL != bml_btl->btl_mpool) {
|
||||
bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg);
|
||||
}
|
||||
|
||||
frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
|
||||
frag->rdma_length = size;
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user