
allow pml pipeline to cache memory registrations

to enable this (off by default) use:
-mca pml_ob1_leave_pinned_pipeline 1
AND
-mca mpool_use_mem_hooks 1
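
For example (a sketch of a job launch; the process count and application name are illustrative):

mpirun -np 2 \
    -mca pml_ob1_leave_pinned_pipeline 1 \
    -mca mpool_use_mem_hooks 1 \
    ./my_app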

This commit was SVN r8949.
This commit is contained in:
Galen Shipman 2006-02-09 15:49:51 +00:00
parent 1abe8ef368
commit 44fe6c3896
9 changed files: 172 additions and 13 deletions


@@ -31,7 +31,9 @@ static void mca_bml_base_endpoint_construct(mca_bml_base_endpoint_t* ep)
ep->btl_rdma_offset = 0;
ep->btl_max_send_size = 0;
ep->btl_flags = 0;
ep->btl_rdma_size = 0;
ep->btl_rdma_align = 0;
OBJ_CONSTRUCT(&ep->btl_lock, opal_mutex_t);
OBJ_CONSTRUCT(&ep->btl_eager, mca_bml_base_btl_array_t);
OBJ_CONSTRUCT(&ep->btl_send, mca_bml_base_btl_array_t);


@@ -209,6 +209,8 @@ struct mca_bml_base_endpoint_t {
int btl_flags; /**< preferred method of accessing this peer */
size_t btl_rdma_offset; /**< max of min rdma size for available rdma btls */
size_t btl_max_send_size; /**< min of max send size for available send btls */
size_t btl_rdma_size; /**< min of max rdma size for available rdma btls */
size_t btl_rdma_align; /**< log2 of btl_rdma_size - alignment for pipelined rdma */
mca_bml_base_btl_array_t btl_eager; /**< array of btls to use for first fragments */
mca_bml_base_btl_array_t btl_send; /**< array of btls to use for remaining fragments */
mca_bml_base_btl_array_t btl_rdma; /**< array of btls that support (prefer) rdma */


@@ -51,6 +51,15 @@ mca_bml_r2_module_t mca_bml_r2 = {
};
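/* floor(log2(val)) computed by counting right-shifts,
 * e.g. bml_base_log2(4096) == 12 and bml_base_log2(0) == 0;
 * used in add_procs below to derive btl_rdma_align from btl_rdma_size */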
static inline unsigned int bml_base_log2(unsigned long val) {
unsigned int count = 0;
while(val > 0) {
val = val >> 1;
count++;
}
return count > 0 ? count-1: 0;
}
static int btl_exclusivity_compare(const void* arg1, const void* arg2)
{
mca_btl_base_module_t* btl1 = *(struct mca_btl_base_module_t**)arg1;
@@ -249,6 +258,7 @@ int mca_bml_r2_add_procs(
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules);
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules);
bml_endpoint->btl_max_send_size = -1;
bml_endpoint->btl_rdma_size = -1;
bml_endpoint->btl_proc = proc;
proc->proc_pml = (struct mca_pml_proc_t*) bml_endpoint;
@@ -381,6 +391,10 @@ int mca_bml_r2_add_procs(
if(bml_endpoint->btl_rdma_offset < bml_btl_rdma->btl_min_rdma_size) {
bml_endpoint->btl_rdma_offset = bml_btl_rdma->btl_min_rdma_size;
}
if(bml_endpoint->btl_rdma_size > btl->btl_max_rdma_size) {
bml_endpoint->btl_rdma_size = btl->btl_max_rdma_size;
bml_endpoint->btl_rdma_align = bml_base_log2(bml_endpoint->btl_rdma_size);
}
}
}
}
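
Taken together, the add_procs changes compute btl_rdma_size as the minimum btl_max_rdma_size across the endpoint's rdma-capable btls (starting from (size_t)-1, i.e. SIZE_MAX) and btl_rdma_align as its log2. With two illustrative btls:

/* btl A: btl_max_rdma_size = 1048576; btl B: btl_max_rdma_size = 262144 */
/* btl_rdma_size  = min(1048576, 262144) = 262144                        */
/* btl_rdma_align = bml_base_log2(262144) = 18                           */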


@@ -53,7 +53,8 @@ struct mca_pml_ob1_t {
size_t send_pipeline_depth;
size_t recv_pipeline_depth;
bool leave_pinned;
int leave_pinned_pipeline;
/* lock queue access */
opal_mutex_t lock;


@@ -96,6 +96,8 @@ int mca_pml_ob1_component_open(void)
mca_pml_ob1_param_register_int("send_pipeline_depth", 3);
mca_pml_ob1.recv_pipeline_depth =
mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
mca_pml_ob1.leave_pinned_pipeline =
mca_pml_ob1_param_register_int("leave_pinned_pipeline", 0);
OBJ_CONSTRUCT(&mca_pml_ob1.lock, opal_mutex_t);


@@ -203,3 +203,73 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
OBJ_DESTRUCT(&regs);
return fit;
}
/*
* For a given btl - find the best fit registration or
* optionally create one for leave pinned.
*/
mca_mpool_base_registration_t* mca_pml_ob1_rdma_register(
mca_bml_base_btl_t* bml_btl,
unsigned char* base,
size_t size)
{
ompi_pointer_array_t regs;
mca_mpool_base_registration_t* fit = NULL;
mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool;
uint32_t reg_cnt;
size_t r;
int rc;
/* btl is rdma capable and registration is not required */
if(NULL == btl_mpool) {
return NULL;
}
/* check to see if memory is registered */
OBJ_CONSTRUCT(&regs, ompi_pointer_array_t);
ompi_pointer_array_remove_all(&regs);
/* look through existing registrations */
btl_mpool->mpool_find(btl_mpool,
base,
size,
&regs,
&reg_cnt);
/*
* find the best fit when there are multiple registrations
*/
for(r = 0; r < reg_cnt; r++) {
mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, r);
size_t reg_len = reg->bound - base + 1;
if(reg->base <= base && reg_len >= size) {
fit = reg;
} else {
btl_mpool->mpool_deregister(btl_mpool, reg);
}
}
/* if the leave pinned option is set - and there is not an existing
* registration that satisfies this request, create one.
*/
if(NULL == fit) {
/* register the memory */
rc = btl_mpool->mpool_register(
btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&fit);
if(OMPI_SUCCESS != rc || NULL == fit) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed\n", __FILE__, __LINE__, base, (unsigned long) size);
OBJ_DESTRUCT(&regs);
return NULL;
}
}
OBJ_DESTRUCT(&regs);
return fit;
}
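
The expected call pattern, mirroring how the recv/send scheduling code below uses it (a minimal sketch; buf and len are illustrative):

mca_mpool_base_registration_t* reg;
reg = mca_pml_ob1_rdma_register(bml_btl, buf, len);
if(NULL != reg) {
    /* hand reg to mca_bml_base_prepare_src/prepare_dst, then drop this
     * reference; presumably MCA_MPOOL_FLAGS_CACHE keeps the registration
     * cached in the mpool for reuse by later requests */
    bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg);
}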


@@ -60,5 +60,14 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
unsigned char* base,
size_t size);
/*
* Create a registration
*/
mca_mpool_base_registration_t* mca_pml_ob1_rdma_register(
struct mca_bml_base_btl_t* bml_btl,
unsigned char* base,
size_t size);
#endif


@@ -208,20 +208,48 @@ static void mca_pml_ob1_recv_request_ack(
/* start rdma at current fragment offset - no need to ack */
recvreq->req_rdma_offset = recvreq->req_bytes_received;
return;
/* are rdma devices available for long rdma protocol */
} else if (mca_pml_ob1.leave_pinned_pipeline &&
hdr->hdr_msg_length > bml_endpoint->btl_rdma_size &&
mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) {
char* base;
char* align;
long lb;
/* round this up/down to the next aligned address */
ompi_ddt_type_lb(recvreq->req_recv.req_convertor.pDesc, &lb);
base = recvreq->req_recv.req_convertor.pBaseBuf + lb;
align = (char*)up_align_addr(base, bml_endpoint->btl_rdma_align)+1;
recvreq->req_rdma_offset = align - base;
/* still w/in range */
if(recvreq->req_rdma_offset < bytes_received) {
recvreq->req_rdma_offset = bytes_received;
}
if(recvreq->req_rdma_offset > hdr->hdr_msg_length) {
recvreq->req_rdma_offset = hdr->hdr_msg_length;
} else {
ompi_convertor_set_position(
&recvreq->req_recv.req_convertor,
&recvreq->req_rdma_offset);
}
/* are rdma devices available for long rdma protocol */
} else if (!mca_pml_ob1.leave_pinned_pipeline &&
bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length &&
mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) {
/* use convertor to figure out the rdma offset for this request */
recvreq->req_rdma_offset = bml_endpoint->btl_rdma_offset;
if(recvreq->req_rdma_offset < recvreq->req_bytes_received) {
recvreq->req_rdma_offset = recvreq->req_bytes_received;
}
ompi_convertor_set_position(
&recvreq->req_recv.req_convertor,
&recvreq->req_rdma_offset);
}
}
}
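
To see what the alignment arithmetic above does, assume up_align_addr(addr, p) returns the last address of addr's 2^p-aligned block (illustrative values):

/* btl_rdma_align = 12 (btl_rdma_size = 4096)            */
/* base  = 0x1003                                        */
/* align = (char*)up_align_addr(0x1003, 12) + 1 = 0x2000 */
/* req_rdma_offset = 0x2000 - 0x1003 = 0xffd             */

so the pipelined RDMA portion starts on an aligned boundary, which helps cached registrations line up across requests.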
@@ -572,9 +600,11 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq)
mca_mpool_base_registration_t * reg = NULL;
size_t num_btl_avail;
int rc;
bool release = false;
ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset);
if(recvreq->req_rdma_cnt) {
/*
* Select the next btl out of the list w/ preregistered
* memory.
@@ -602,6 +632,8 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq)
}
} else {
char* base;
long lb;
/*
* Otherwise, schedule round-robin across the
@@ -631,10 +663,16 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq)
if (bml_btl->btl_max_rdma_size != 0 && size > bml_btl->btl_max_rdma_size) {
size = bml_btl->btl_max_rdma_size;
}
if(mca_pml_ob1.leave_pinned_pipeline) {
/* lookup and/or create a cached registration */
ompi_ddt_type_lb(recvreq->req_recv.req_convertor.pDesc, &lb);
base = recvreq->req_recv.req_convertor.pBaseBuf + lb + recvreq->req_rdma_offset;
reg = mca_pml_ob1_rdma_register(bml_btl, base, size);
release = true;
}
}
/* prepare a descriptor for RDMA */
ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset);
mca_bml_base_prepare_dst(
bml_btl,
reg,
@@ -648,6 +686,9 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq)
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
break;
}
if(release == true && NULL != bml_btl->btl_mpool) {
bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg);
}
dst->des_cbfunc = mca_pml_ob1_put_completion;
dst->des_cbdata = recvreq;


@@ -945,6 +945,7 @@ void mca_pml_ob1_send_request_put(
size_t offset = hdr->hdr_rdma_offset;
size_t i, size = 0;
int rc;
bool release = false;
bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag, rc);
@@ -971,9 +972,21 @@ void mca_pml_ob1_send_request_put(
break;
}
}
/* set convertor at current offset */
ompi_convertor_set_position(&sendreq->req_send.req_convertor, &offset);
/* if a registration doesn't exist - create one */
if (mca_pml_ob1.leave_pinned_pipeline && reg == NULL) {
unsigned char* base;
long lb;
ompi_ddt_type_lb(sendreq->req_send.req_convertor.pDesc, &lb);
base = (unsigned char*)sendreq->req_send.req_convertor.pBaseBuf + lb + offset;
reg = mca_pml_ob1_rdma_register(bml_btl, base, size);
release = true;
}
/* setup descriptor */
ompi_convertor_set_position(&sendreq->req_send.req_convertor, &offset);
mca_bml_base_prepare_src(
bml_btl,
reg,
@@ -988,6 +1001,11 @@ void mca_pml_ob1_send_request_put(
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
}
if(release == true && NULL != bml_btl->btl_mpool) {
bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg);
}
frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
frag->rdma_length = size;