From 44fe6c38968e4e1d32c2ade36b22c5ca9c81b473 Mon Sep 17 00:00:00 2001 From: Galen Shipman Date: Thu, 9 Feb 2006 15:49:51 +0000 Subject: [PATCH] allow pml pipeline to cache memory registrations to enable this (off by default) use: -mca pml_ob1_leave_pinned_pipeline 1 !!AND!!! -mca mpool_use_mem_hooks 1 This commit was SVN r8949. --- ompi/mca/bml/base/bml_base_endpoint.c | 4 +- ompi/mca/bml/bml.h | 2 + ompi/mca/bml/r2/bml_r2.c | 14 ++++++ ompi/mca/pml/ob1/pml_ob1.h | 3 +- ompi/mca/pml/ob1/pml_ob1_component.c | 2 + ompi/mca/pml/ob1/pml_ob1_rdma.c | 70 +++++++++++++++++++++++++++ ompi/mca/pml/ob1/pml_ob1_rdma.h | 9 ++++ ompi/mca/pml/ob1/pml_ob1_recvreq.c | 61 +++++++++++++++++++---- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 20 +++++++- 9 files changed, 172 insertions(+), 13 deletions(-) diff --git a/ompi/mca/bml/base/bml_base_endpoint.c b/ompi/mca/bml/base/bml_base_endpoint.c index c833959d9a..62844d7719 100644 --- a/ompi/mca/bml/base/bml_base_endpoint.c +++ b/ompi/mca/bml/base/bml_base_endpoint.c @@ -31,7 +31,9 @@ static void mca_bml_base_endpoint_construct(mca_bml_base_endpoint_t* ep) ep->btl_rdma_offset = 0; ep->btl_max_send_size = 0; ep->btl_flags = 0; - + ep->btl_rdma_size = 0; + ep->btl_rdma_align = 0; + OBJ_CONSTRUCT(&ep->btl_lock, opal_mutex_t); OBJ_CONSTRUCT(&ep->btl_eager, mca_bml_base_btl_array_t); OBJ_CONSTRUCT(&ep->btl_send, mca_bml_base_btl_array_t); diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index bce4677819..68b92ce705 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -209,6 +209,8 @@ struct mca_bml_base_endpoint_t { int btl_flags; /**< prefered method of accessing this peer */ size_t btl_rdma_offset; /**< max of min rdma size for available rmda btls */ size_t btl_max_send_size; /**< min of max send size for available send btls */ + size_t btl_rdma_size; /**< min of max rdma size for available rdma btls */ + size_t btl_rdma_align; /**< floor(log2(btl_rdma_size)), used to align the rdma start address */ mca_bml_base_btl_array_t btl_eager; /**< array of btls 
to use for first fragments */ mca_bml_base_btl_array_t btl_send; /**< array of btls to use for remaining fragments */ mca_bml_base_btl_array_t btl_rdma; /**< array of btls that support (prefer) rdma */ diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index c8c0b1c5f4..6b2807108b 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -51,6 +51,15 @@ mca_bml_r2_module_t mca_bml_r2 = { }; +static inline unsigned int bml_base_log2(unsigned long val) { /* floor(log2(val)); yields 0 when val is 0 or 1 */ + unsigned int count = 0; + while(val > 0) { /* count ends up as the number of significant bits in val */ + val = val >> 1; + count++; + } + return count > 0 ? count-1: 0; +} + static int btl_exclusivity_compare(const void* arg1, const void* arg2) { mca_btl_base_module_t* btl1 = *(struct mca_btl_base_module_t**)arg1; @@ -249,6 +258,7 @@ int mca_bml_r2_add_procs( mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); bml_endpoint->btl_max_send_size = -1; + bml_endpoint->btl_rdma_size = -1; /* -1 converts to the largest size_t, so any smaller btl_max_rdma_size replaces it */ bml_endpoint->btl_proc = proc; proc->proc_pml = (struct mca_pml_proc_t*) bml_endpoint; @@ -381,6 +391,10 @@ int mca_bml_r2_add_procs( if(bml_endpoint->btl_rdma_offset < bml_btl_rdma->btl_min_rdma_size) { bml_endpoint->btl_rdma_offset = bml_btl_rdma->btl_min_rdma_size; } + if(bml_endpoint->btl_rdma_size > btl->btl_max_rdma_size) { /* keep the smallest btl_max_rdma_size seen so far */ + bml_endpoint->btl_rdma_size = btl->btl_max_rdma_size; + bml_endpoint->btl_rdma_align = bml_base_log2(bml_endpoint->btl_rdma_size); /* log2 of the reduced size, rounded down */ + } } } } diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index 99117b6da9..ab9ce35f54 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -53,7 +53,8 @@ struct mca_pml_ob1_t { size_t send_pipeline_depth; size_t recv_pipeline_depth; bool leave_pinned; - + int leave_pinned_pipeline; /* tested as a boolean flag by the recv/send request code in this patch */ + /* lock queue access */ opal_mutex_t lock; diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index cb526b2aab..4987c3edcb 100644 --- 
a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -96,6 +96,8 @@ int mca_pml_ob1_component_open(void) mca_pml_ob1_param_register_int("send_pipeline_depth", 3); mca_pml_ob1.recv_pipeline_depth = mca_pml_ob1_param_register_int("recv_pipeline_depth", 4); + mca_pml_ob1.leave_pinned_pipeline = + mca_pml_ob1_param_register_int("leave_pinned_pipeline", 0); /* off by default, as the commit message states */ OBJ_CONSTRUCT(&mca_pml_ob1.lock, opal_mutex_t); diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c index 6d5ba97396..4a4dc1ebc9 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c @@ -203,3 +203,73 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration( OBJ_DESTRUCT(&regs); return fit; } + + + +/* + * For a given btl - find an existing registration that covers + * [base, base+size) or create one; NULL if no mpool or on failure. + */ + +mca_mpool_base_registration_t* mca_pml_ob1_rdma_register( + mca_bml_base_btl_t* bml_btl, + unsigned char* base, + size_t size) +{ + ompi_pointer_array_t regs; + mca_mpool_base_registration_t* fit = NULL; + mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool; + uint32_t reg_cnt; + size_t r; + int rc; + + /* btl is rdma capable and registration is not required */ + if(NULL == btl_mpool) { + return NULL; + } + + /* check to see if memory is registered */ + OBJ_CONSTRUCT(&regs, ompi_pointer_array_t); + ompi_pointer_array_remove_all(&regs); + + /* look through existing registrations */ + btl_mpool->mpool_find(btl_mpool, + base, + size, + &regs, + &reg_cnt); + + + /* + * keep a registration that fits; deregister the ones that do not + */ + for(r = 0; r < reg_cnt; r++) { + mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, r); + size_t reg_len = reg->bound - base + 1; + if(reg->base <= base && reg_len >= size) { + fit = reg; + } else { + btl_mpool->mpool_deregister(btl_mpool, reg); + } + } + + + /* no existing registration satisfies this request - create one; + * callers decide whether leave_pinned_pipeline is in effect. 
+ */ + if(NULL == fit) { + /* register the memory */ + rc = btl_mpool->mpool_register( + btl_mpool, + base, + size, + MCA_MPOOL_FLAGS_CACHE, + &fit); + if(ORTE_SUCCESS != rc || NULL == fit) { + opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, (void*)base, (unsigned long)size); + fit = NULL; /* fall through so regs is still destructed */ + } + } + OBJ_DESTRUCT(&regs); + return fit; +} diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.h b/ompi/mca/pml/ob1/pml_ob1_rdma.h index 1e6ef4bf94..bdefd692ff 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.h +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.h @@ -60,5 +60,14 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration( unsigned char* base, size_t size); +/* + * Find an existing registration covering the buffer or create one + */ + +mca_mpool_base_registration_t* mca_pml_ob1_rdma_register( + struct mca_bml_base_btl_t* bml_btl, + unsigned char* base, + size_t size); + #endif diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index db8f539650..a22d5a692a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -208,20 +208,48 @@ static void mca_pml_ob1_recv_request_ack( /* start rdma at current fragment offset - no need to ack */ recvreq->req_rdma_offset = recvreq->req_bytes_received; return; - - /* are rdma devices available for long rdma protocol */ - } else if (bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length && + + /* are rdma devices available for long rdma protocol */ + } else if (mca_pml_ob1.leave_pinned_pipeline && + hdr->hdr_msg_length > bml_endpoint->btl_rdma_size && mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) { - + char* base; + char* align; + long lb; + + /* round this up/down to the next aligned address */ + ompi_ddt_type_lb(recvreq->req_recv.req_convertor.pDesc, &lb); + base = recvreq->req_recv.req_convertor.pBaseBuf + lb; + align = (char*)up_align_addr(base, bml_endpoint->btl_rdma_align)+1; + recvreq->req_rdma_offset = align - base; + + /* still w/in range */ + if(recvreq->req_rdma_offset < bytes_received) { 
recvreq->req_rdma_offset = bytes_received; + } + if(recvreq->req_rdma_offset > hdr->hdr_msg_length) { + recvreq->req_rdma_offset = hdr->hdr_msg_length; + } else { + ompi_convertor_set_position( + &recvreq->req_recv.req_convertor, + &recvreq->req_rdma_offset); + } + + + /* are rdma devices available for long rdma protocol */ + } else if (!mca_pml_ob1.leave_pinned_pipeline && + bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length && + mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) { + /* use convertor to figure out the rdma offset for this request */ recvreq->req_rdma_offset = bml_endpoint->btl_rdma_offset; if(recvreq->req_rdma_offset < recvreq->req_bytes_received) { recvreq->req_rdma_offset = recvreq->req_bytes_received; } ompi_convertor_set_position( - &recvreq->req_recv.req_convertor, - &recvreq->req_rdma_offset); - } + &recvreq->req_recv.req_convertor, + &recvreq->req_rdma_offset); + } } } @@ -572,9 +600,11 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) mca_mpool_base_registration_t * reg = NULL; size_t num_btl_avail; int rc; - + bool release = false; + + ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset); if(recvreq->req_rdma_cnt) { - + /* * Select the next btl out of the list w/ preregistered * memory. 
@@ -602,6 +632,8 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) } } else { + char* base; + long lb; /* * Otherwise, schedule round-robin across the @@ -631,10 +663,16 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) if (bml_btl->btl_max_rdma_size != 0 && size > bml_btl->btl_max_rdma_size) { size = bml_btl->btl_max_rdma_size; } + if(mca_pml_ob1.leave_pinned_pipeline) { + /* lookup and/or create a cached registration */ + ompi_ddt_type_lb(recvreq->req_recv.req_convertor.pDesc, &lb); + base = recvreq->req_recv.req_convertor.pBaseBuf + lb + recvreq->req_rdma_offset; + reg = mca_pml_ob1_rdma_register(bml_btl, base, size); + release = true; + } } /* prepare a descriptor for RDMA */ - ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset); mca_bml_base_prepare_dst( bml_btl, reg, @@ -648,6 +686,9 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); break; } + if(release == true && NULL != bml_btl->btl_mpool) { + bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg); + } dst->des_cbfunc = mca_pml_ob1_put_completion; dst->des_cbdata = recvreq; diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 258b63d233..7bd84220aa 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -945,6 +945,7 @@ void mca_pml_ob1_send_request_put( size_t offset = hdr->hdr_rdma_offset; size_t i, size = 0; int rc; + bool release = false; bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); MCA_PML_OB1_RDMA_FRAG_ALLOC(frag, rc); @@ -971,9 +972,21 @@ void mca_pml_ob1_send_request_put( break; } } + + /* set convertor at current offset */ + ompi_convertor_set_position(&sendreq->req_send.req_convertor, &offset); + + /* if registration doesnt exist - create one */ + if (mca_pml_ob1.leave_pinned_pipeline && reg == NULL) { + unsigned char* base; + long lb; 
+ ompi_ddt_type_lb(sendreq->req_send.req_convertor.pDesc, &lb); + base = (unsigned char*)sendreq->req_send.req_convertor.pBaseBuf + lb + offset; + reg = mca_pml_ob1_rdma_register(bml_btl, base, size); + release = true; + } /* setup descriptor */ - ompi_convertor_set_position(&sendreq->req_send.req_convertor, &offset); mca_bml_base_prepare_src( bml_btl, reg, @@ -988,6 +1001,11 @@ void mca_pml_ob1_send_request_put( opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); } + + if(release == true && NULL != bml_btl->btl_mpool) { + bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg); + } + frag->rdma_state = MCA_PML_OB1_RDMA_PUT; frag->rdma_length = size;