diff --git a/ompi/class/ompi_free_list.c b/ompi/class/ompi_free_list.c index b28aa6df70..3f42bfdd39 100644 --- a/ompi/class/ompi_free_list.c +++ b/ompi/class/ompi_free_list.c @@ -155,7 +155,7 @@ int ompi_free_list_grow(ompi_free_list_t* flist, size_t num_elements) if (NULL != flist->fl_mpool) alloc_ptr = (ompi_free_list_memory_t*)flist->fl_mpool->mpool_alloc(flist->fl_mpool, - alloc_size, 0, 0, &user_out); + alloc_size, 0, MCA_MPOOL_FLAGS_CACHE_BYPASS, &user_out); else alloc_ptr = (ompi_free_list_memory_t*)malloc(alloc_size); diff --git a/ompi/mca/bml/base/bml_base_endpoint.c b/ompi/mca/bml/base/bml_base_endpoint.c index 7dbf456541..20f5a88f1f 100644 --- a/ompi/mca/bml/base/bml_base_endpoint.c +++ b/ompi/mca/bml/base/bml_base_endpoint.c @@ -29,7 +29,6 @@ static void mca_bml_base_endpoint_construct(mca_bml_base_endpoint_t* ep) { ep->btl_rdma_offset = 0; ep->btl_max_send_size = 0; - ep->btl_rdma_size = 0; ep->btl_rdma_align = 0; OBJ_CONSTRUCT(&ep->btl_eager, mca_bml_base_btl_array_t); diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index a7c7504ea4..bb232d5d85 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -238,7 +238,6 @@ struct mca_bml_base_endpoint_t { struct ompi_proc_t* btl_proc; /**< backpointer to target ompi_proc_t */ size_t btl_rdma_offset; /**< max of min rdma size for available rmda btls */ size_t btl_max_send_size; /**< min of max send size for available send btls */ - size_t btl_rdma_size; /**< max of min rdma size for available rmda btls */ size_t btl_rdma_align; /**< max of min rdma size for available rmda btls */ mca_bml_base_btl_array_t btl_eager; /**< array of btls to use for first fragments */ mca_bml_base_btl_array_t btl_send; /**< array of btls to use for remaining fragments */ diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 02111f387e..81ff4c57fb 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -265,7 +265,6 @@ int mca_bml_r2_add_procs( 
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); bml_endpoint->btl_max_send_size = -1; - bml_endpoint->btl_rdma_size = -1; bml_endpoint->btl_proc = proc; proc->proc_bml = bml_endpoint; @@ -426,10 +425,6 @@ int mca_bml_r2_add_procs( if(bml_endpoint->btl_rdma_offset < bml_btl_rdma->btl_min_rdma_size) { bml_endpoint->btl_rdma_offset = bml_btl_rdma->btl_min_rdma_size; } - if(bml_endpoint->btl_rdma_size > btl->btl_max_rdma_size) { - bml_endpoint->btl_rdma_size = btl->btl_max_rdma_size; - bml_endpoint->btl_rdma_align = bml_base_log2(bml_endpoint->btl_rdma_size); - } } } } @@ -677,10 +672,6 @@ int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl) if (ep->btl_rdma_offset < bml_btl->btl_min_rdma_size) { ep->btl_rdma_offset = bml_btl->btl_min_rdma_size; } - if (ep->btl_rdma_size > bml_btl->btl_max_rdma_size) { - ep->btl_rdma_size = bml_btl->btl_max_rdma_size; - ep->btl_rdma_align = bml_base_log2(ep->btl_rdma_size); - } } /* compute weighting factor for this btl */ diff --git a/ompi/mca/btl/gm/btl_gm.c b/ompi/mca/btl/gm/btl_gm.c index 24009a4b17..44f58a671f 100644 --- a/ompi/mca/btl/gm/btl_gm.c +++ b/ompi/mca/btl/gm/btl_gm.c @@ -245,7 +245,8 @@ int mca_btl_gm_free( struct mca_btl_base_module_t* btl, mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*)des; if( NULL != frag->registration ) { - btl->btl_mpool->mpool_release(btl->btl_mpool, (mca_mpool_base_registration_t*) frag->registration); + btl->btl_mpool->mpool_deregister(btl->btl_mpool, (mca_mpool_base_registration_t*) frag->registration); + frag->registration = NULL; } MCA_BTL_GM_FRAG_RETURN(btl, frag); @@ -268,142 +269,95 @@ mca_btl_base_descriptor_t* mca_btl_gm_prepare_src( size_t* size ) { - mca_btl_gm_frag_t* frag; + mca_btl_gm_frag_t *frag = NULL; struct iovec iov; uint32_t iov_count = 1; size_t max_data = *size; int rc; #if (OMPI_MCA_BTL_GM_HAVE_RDMA_GET || 
OMPI_MCA_BTL_GM_HAVE_RDMA_PUT) - /* - * If the data has already been pinned and is contigous than we can - * use it in place. - */ - if (NULL != registration && 0 == ompi_convertor_need_buffers(convertor)) { - size_t reg_len; - MCA_BTL_GM_FRAG_ALLOC_USER(btl, frag, rc); - if(NULL == frag){ - return NULL; + if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) { + if(registration != NULL || max_data > btl->btl_max_send_size) { + MCA_BTL_GM_FRAG_ALLOC_USER(btl, frag, rc); + if(NULL == frag) { + return NULL; + } + + /* + * just assign it something.. + * we will assign the real value in put/get + */ + frag->type = MCA_BTL_GM_PUT; + iov.iov_len = max_data; + iov.iov_base = NULL; + + + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); + + *size = max_data; + + if(NULL == registration) { + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, + iov.iov_base, max_data, 0, &registration); + if(OMPI_SUCCESS != rc || NULL == registration) { + MCA_BTL_GM_FRAG_RETURN(btl, frag); + return NULL; + } + /* keep track of the registration we did */ + frag->registration = registration; + } + + frag->segment.seg_len = max_data; + frag->segment.seg_addr.pval = iov.iov_base; + + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + + return &frag->base; } - /* - * just assign it something.. 
- * we will assign the real value in put/get - */ - frag->type = MCA_BTL_GM_PUT; - iov.iov_len = max_data; - iov.iov_base = NULL; - - ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - - reg_len = (unsigned char*)registration->bound - (unsigned char*)iov.iov_base + 1; - - /* bump reference count as so that the registration - * doesn't go away when the operation completes - */ - btl->btl_mpool->mpool_retain(btl->btl_mpool, - (mca_mpool_base_registration_t*) registration); - - frag->registration = registration; - - /* - * if the data is not already pinned - but the leave pinned option is set, - * then go ahead and pin contigous data. however, if a reserve is required - * then we must allocated a fragment w/ buffer space - */ - } else if (max_data > btl->btl_max_send_size && - ompi_convertor_need_buffers(convertor) == 0 && - reserve == 0) { - - mca_mpool_base_module_t* mpool = btl->btl_mpool; - MCA_BTL_GM_FRAG_ALLOC_USER(btl, frag, rc); - if(NULL == frag){ - return NULL; - } - /* - * just assign it something.. 
- * we will assign the real value in put/get - */ - frag->type = MCA_BTL_GM_PUT; - iov.iov_len = max_data; - iov.iov_base = NULL; - - ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - - rc = mpool->mpool_register( mpool, iov.iov_base, max_data, 0, &registration ); - - if(rc != OMPI_SUCCESS) { - MCA_BTL_GM_FRAG_RETURN(btl,frag); - return NULL; - } - - frag->registration = registration; - } - - /* - * if we aren't pinning the data and the requested size is less - * than the eager limit pack into a fragment from the eager pool - */ - else + } #endif - if (max_data+reserve <= btl->btl_eager_limit) { - + if (max_data + reserve <= btl->btl_eager_limit) { + /* the data is small enough to fit in the eager frag and + * memory is not prepinned */ MCA_BTL_GM_FRAG_ALLOC_EAGER(btl, frag, rc); - if(NULL == frag) { - return NULL; + if(frag != NULL) { + frag->type = MCA_BTL_GM_EAGER; } - frag->type = MCA_BTL_GM_EAGER; - - iov.iov_len = max_data; - iov.iov_base = (unsigned char*) frag->segment.seg_addr.pval + reserve; - - rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - *size = max_data; - if( rc < 0 ) { - MCA_BTL_GM_FRAG_RETURN(btl, frag); - return NULL; - } - frag->segment.seg_len = max_data + reserve; } - /* - * otherwise pack as much data as we can into a fragment - * that is the max send size. 
- */ - else { - + if(NULL == frag) { + /* the data doesn't fit into eager frag or eager frag is + * not available */ MCA_BTL_GM_FRAG_ALLOC_MAX(btl, frag, rc); if(NULL == frag) { return NULL; } frag->type = MCA_BTL_GM_SEND; - if(max_data + reserve > btl->btl_max_send_size){ + if(max_data + reserve > btl->btl_max_send_size) { max_data = btl->btl_max_send_size - reserve; } - iov.iov_len = max_data; - iov.iov_base = (unsigned char*) frag->segment.seg_addr.pval + reserve; - - rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - *size = max_data; - - if( rc < 0 ) { - MCA_BTL_GM_FRAG_RETURN(btl, frag); - return NULL; - } - frag->segment.seg_len = max_data + reserve; } + iov.iov_len = max_data; + iov.iov_base = (unsigned char*) frag->segment.seg_addr.pval + reserve; + rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); + if(rc < 0) { + MCA_BTL_GM_FRAG_RETURN(btl, frag); + return NULL; + } + *size = max_data; + frag->segment.seg_len = max_data + reserve; frag->base.des_src = &frag->segment; frag->base.des_src_cnt = 1; frag->base.des_dst = NULL; frag->base.des_dst_cnt = 0; frag->base.des_flags = 0; + return &frag->base; } @@ -462,18 +416,7 @@ mca_btl_base_descriptor_t* mca_btl_gm_prepare_dst( frag->base.des_dst = &frag->segment; frag->base.des_dst_cnt = 1; frag->base.des_flags = 0; - if(NULL != registration) { - /* bump reference count as so that the registration - * doesn't go away when the operation completes - */ - - mpool->mpool_retain(mpool, - (mca_mpool_base_registration_t*) registration); - - frag->registration = registration; - - } else { - + if(NULL == registration) { rc = mpool->mpool_register( mpool, frag->segment.seg_addr.pval, frag->segment.seg_len, @@ -483,7 +426,6 @@ mca_btl_base_descriptor_t* mca_btl_gm_prepare_dst( MCA_BTL_GM_FRAG_RETURN(btl,frag); return NULL; } - frag->registration = registration; } return &frag->base; diff --git a/ompi/mca/btl/gm/btl_gm_component.c b/ompi/mca/btl/gm/btl_gm_component.c index 
b9ee922b4c..8c857698f2 100644 --- a/ompi/mca/btl/gm/btl_gm_component.c +++ b/ompi/mca/btl/gm/btl_gm_component.c @@ -30,7 +30,7 @@ #include "opal/mca/base/mca_base_param.h" #include "orte/mca/errmgr/errmgr.h" #include "ompi/mca/mpool/base/base.h" -#include "ompi/mca/mpool/gm/mpool_gm.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include "btl_gm.h" #include "btl_gm_frag.h" #include "btl_gm_endpoint.h" @@ -45,6 +45,9 @@ #if OMPI_ENABLE_PROGRESS_THREADS static void* mca_btl_gm_progress_thread( opal_object_t* arg ); #endif +static int gm_reg_mr(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg); +static int gm_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg); mca_btl_gm_component_t mca_btl_gm_component = { @@ -131,7 +134,7 @@ int mca_btl_gm_component_open(void) mca_btl_gm_component.gm_debug = mca_btl_gm_param_register_int("debug", 0); mca_btl_gm_component.gm_mpool_name = - mca_btl_gm_param_register_string("mpool", "gm"); + mca_btl_gm_param_register_string("mpool", "rdma"); mca_btl_gm_component.gm_max_ports = mca_btl_gm_param_register_int("max_ports", 16); mca_btl_gm_component.gm_max_boards = @@ -201,6 +204,35 @@ int mca_btl_gm_component_close(void) return OMPI_SUCCESS; } +static int gm_reg_mr(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg) +{ + struct gm_port *port = (struct gm_port*)reg_data; + int rc; + + rc = gm_register_memory(port, base, size); + + if(rc != GM_SUCCESS) + return OMPI_ERR_OUT_OF_RESOURCE; + + return MPI_SUCCESS; +} + +static int gm_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg) +{ + struct gm_port *port = (struct gm_port*)reg_data; + int rc; + + rc = gm_deregister_memory(port, reg->base, reg->bound - reg->base + 1); + + if(rc != GM_SUCCESS) { + opal_output(0, "%s: error unpinning gm memory errno says %s\n", + __func__, strerror(errno)); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} /** * Initialize module instance @@ -243,7 +275,10 @@ mca_btl_gm_module_init 
(mca_btl_gm_module_t * btl) } /* initialize memory pool */ - resources.port = btl->port; + resources.reg_data = (void*)btl->port; + resources.sizeof_reg = sizeof(mca_mpool_base_registration_t); + resources.register_mem = gm_reg_mr; + resources.deregister_mem = gm_dereg_mr; btl->super.btl_mpool = mca_mpool_base_module_create( mca_btl_gm_component.gm_mpool_name, &btl->super, @@ -419,8 +454,6 @@ static int mca_btl_gm_discover( void ) return OMPI_SUCCESS; } - - /* * Register GM component addressing information. The MCA framework * will make this available to all peers. diff --git a/ompi/mca/btl/mvapi/btl_mvapi.c b/ompi/mca/btl/mvapi/btl_mvapi.c index c25c8f08df..6857e3b45c 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.c +++ b/ompi/mca/btl/mvapi/btl_mvapi.c @@ -31,7 +31,7 @@ #include "ompi/datatype/datatype.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/mpool.h" -#include "ompi/mca/mpool/mvapi/mpool_mvapi.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include "ompi/mca/btl/base/btl_base_error.h" #include #include /* for log2 */ @@ -225,8 +225,9 @@ int mca_btl_mvapi_free( mca_btl_base_descriptor_t* des) { mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*)des; - if (MCA_BTL_MVAPI_FRAG_FRAG == frag->type) { - btl->btl_mpool->mpool_release(btl->btl_mpool, (mca_mpool_base_registration_t*) frag->vapi_reg); + if (MCA_BTL_MVAPI_FRAG_FRAG == frag->type && frag->registration != NULL) { + btl->btl_mpool->mpool_deregister(btl->btl_mpool, (mca_mpool_base_registration_t*) frag->registration); + frag->registration = NULL; } MCA_BTL_IB_FRAG_RETURN(btl, frag); @@ -267,164 +268,99 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src( ) { mca_btl_mvapi_module_t* mvapi_btl; - mca_btl_mvapi_frag_t* frag; - mca_mpool_mvapi_registration_t * vapi_reg; + mca_btl_mvapi_frag_t* frag = NULL; + mca_btl_mvapi_reg_t *mvapi_reg; struct iovec iov; uint32_t iov_count = 1; size_t max_data = *size; int rc; - mvapi_btl = (mca_btl_mvapi_module_t*) btl; - vapi_reg = 
(mca_mpool_mvapi_registration_t*) registration; - - if(NULL != vapi_reg && 0 == ompi_convertor_need_buffers(convertor)){ - size_t reg_len; - /* the memory is already pinned and we have contiguous user data */ - MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); - if(NULL == frag){ - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = NULL; - - ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - *size = max_data; - - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - reg_len = (unsigned char*)vapi_reg->base_reg.bound - (unsigned char*)iov.iov_base + 1; + mvapi_btl = (mca_btl_mvapi_module_t*)btl; - frag->sg_entry.len = max_data; - frag->sg_entry.lkey = vapi_reg->l_key; - frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) iov.iov_base; - - frag->segment.seg_key.key32[0] = (uint32_t) vapi_reg->l_key; - - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->base.des_flags = 0; - frag->vapi_reg = vapi_reg; - - btl->btl_mpool->mpool_retain(btl->btl_mpool, (mca_mpool_base_registration_t*) vapi_reg); - if(vapi_reg->base_reg.flags & MCA_MPOOL_FLAGS_CACHE) { - assert(vapi_reg->base_reg.ref_count >= 4); - } else { - assert(vapi_reg->base_reg.ref_count >= 2); + if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) { + if(registration != NULL || max_data > btl->btl_max_send_size) { + MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); + if(NULL == frag) { + return NULL; + } + + iov.iov_len = max_data; + iov.iov_base = NULL; + + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); + + *size = max_data; + + if(NULL == registration) { + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, + iov.iov_base, max_data, 0, &registration); + if(OMPI_SUCCESS != rc || NULL == registration) { + MCA_BTL_IB_FRAG_RETURN(mvapi_btl, frag); + return NULL; + } + frag->registration = (mca_btl_mvapi_reg_t*)registration; + } + mvapi_reg = 
(mca_btl_mvapi_reg_t*)registration; + + frag->base.des_flags = 0; + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + + frag->sg_entry.len = max_data; + frag->sg_entry.lkey = mvapi_reg->l_key; + frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t)iov.iov_base; + + frag->segment.seg_len = max_data; + frag->segment.seg_addr.pval = iov.iov_base; + frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey; + + BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu " + "frag->segment.seg_key.key32[0] = %lu", + frag->sg_entry.lkey, frag->sg_entry.addr, + frag->segment.seg_key.key32[0])); + return &frag->base; } - - return &frag->base; - - } else if( max_data > btl->btl_max_send_size && - ompi_convertor_need_buffers(convertor) == 0 && - reserve == 0) - { - /* The user buffer is contigous and we are asked to send more than the max send size. */ - - MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); - if(NULL == frag){ - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = NULL; - - ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - *size = max_data; - - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - frag->base.des_flags = 0; - - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, - iov.iov_base, - max_data, - 0, - (mca_mpool_base_registration_t**) &vapi_reg); - if(OMPI_SUCCESS != rc || NULL == vapi_reg) { - BTL_ERROR(("mpool_register(%p,%lu) failed", iov.iov_base, max_data)); - MCA_BTL_IB_FRAG_RETURN(btl, frag); - return NULL; - } - - frag->sg_entry.len = max_data; - frag->sg_entry.lkey = vapi_reg->l_key; - frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) iov.iov_base; - - frag->segment.seg_key.key32[0] = (uint32_t) vapi_reg->l_key; - - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->vapi_reg = vapi_reg; - - return 
&frag->base; - - } else if (max_data+reserve <= btl->btl_eager_limit) { - /* the data is small enough to fit in the eager frag and - either we received no prepinned memory or leave pinned is - not set - */ - MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc); - if(NULL == frag) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; - - rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - *size = max_data; - if( rc < 0 ) { - MCA_BTL_IB_FRAG_RETURN(btl, frag); - return NULL; - } - - frag->segment.seg_len = max_data + reserve; - frag->segment.seg_key.key32[0] = (uint32_t) frag->sg_entry.lkey; - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->base.des_flags = 0; - - return &frag->base; - - } else { - - MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc); - if(NULL == frag) { - return NULL; - } - if(max_data + reserve > btl->btl_max_send_size){ - max_data = btl->btl_max_send_size - reserve; - } - iov.iov_len = max_data; - iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; - - rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - *size = max_data; - - if( rc < 0 ) { - MCA_BTL_IB_FRAG_RETURN(btl, frag); - return NULL; - } - - frag->segment.seg_len = max_data + reserve; - frag->segment.seg_key.key32[0] = (uint32_t) frag->sg_entry.lkey; - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->base.des_flags=0; - - return &frag->base; } - return NULL; + + if(max_data + reserve <= btl->btl_eager_limit) { + /* the data is small enough to fit in the eager frag and + * memory is not prepinned */ + MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc); + } + + if(NULL == frag) { + /* the data doesn't fit into eager frag or eager frag is + * not available */ + MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc); + if(NULL == frag) { + return 
NULL; + } + if(max_data + reserve > btl->btl_max_send_size) { + max_data = btl->btl_max_send_size - reserve; + } + } + + iov.iov_len = max_data; + iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; + rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); + if( rc < 0 ) { + MCA_BTL_IB_FRAG_RETURN(mvapi_btl, frag); + return NULL; + } + *size = max_data; + frag->segment.seg_len = max_data + reserve; + frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey; + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + + return &frag->base; } @@ -453,12 +389,11 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst( { mca_btl_mvapi_module_t* mvapi_btl; mca_btl_mvapi_frag_t* frag; - mca_mpool_mvapi_registration_t * vapi_reg; + mca_btl_mvapi_reg_t *mvapi_reg; ptrdiff_t lb; int rc; mvapi_btl = (mca_btl_mvapi_module_t*) btl; - vapi_reg = (mca_mpool_mvapi_registration_t*) registration; MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); @@ -471,42 +406,32 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst( frag->segment.seg_addr.pval = convertor->pBaseBuf + lb + convertor->bConverted; frag->base.des_flags = 0; - if(NULL!= vapi_reg){ - /* the memory is already pinned- use it*/ - btl->btl_mpool->mpool_retain(btl->btl_mpool, (mca_mpool_base_registration_t*) vapi_reg); - if(vapi_reg->base_reg.flags & MCA_MPOOL_FLAGS_CACHE) { - assert(vapi_reg->base_reg.ref_count >= 4); - } else { - assert(vapi_reg->base_reg.ref_count >= 2); - } - } else { + if(NULL == registration) { /* we didn't get a memory registration passed in, so we have to register the region * ourselves */ rc = btl->btl_mpool->mpool_register(btl->btl_mpool, - frag->segment.seg_addr.pval, - *size, - 0, - (mca_mpool_base_registration_t**) &vapi_reg); - if(OMPI_SUCCESS != rc || NULL == vapi_reg) { + frag->segment.seg_addr.pval, *size, 0, &registration); + if(OMPI_SUCCESS != rc || NULL == 
registration) { BTL_ERROR(("mpool_register(%p,%lu) failed: base %p lb %lu offset %lu", frag->segment.seg_addr.pval, *size, convertor->pBaseBuf, lb, convertor->bConverted)); MCA_BTL_IB_FRAG_RETURN(btl, frag); return NULL; } + frag->registration = (mca_btl_mvapi_reg_t*)registration; } + mvapi_reg = (mca_btl_mvapi_reg_t*)registration; - frag->sg_entry.len = *size; - frag->sg_entry.lkey = vapi_reg->l_key; + frag->sg_entry.len = *size; + frag->sg_entry.lkey = mvapi_reg->l_key; frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->segment.seg_addr.pval; - frag->segment.seg_key.key32[0] = (uint32_t) vapi_reg->r_key; - + frag->segment.seg_key.key32[0] =mvapi_reg->r_key; + frag->base.des_dst = &frag->segment; frag->base.des_dst_cnt = 1; frag->base.des_src = NULL; frag->base.des_src_cnt = 0; - frag->vapi_reg = vapi_reg; return &frag->base; diff --git a/ompi/mca/btl/mvapi/btl_mvapi.h b/ompi/mca/btl/mvapi/btl_mvapi.h index 40ce6daae0..7c880007c9 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.h +++ b/ompi/mca/btl/mvapi/btl_mvapi.h @@ -192,7 +192,13 @@ struct mca_btl_mvapi_module_t { uint32_t eager_rdma_buffers_count; /**< number of RDMA buffers */ }; typedef struct mca_btl_mvapi_module_t mca_btl_mvapi_module_t; - +struct mca_btl_mvapi_reg_t { + mca_mpool_base_registration_t base; + VAPI_mr_hndl_t hndl; /* Memory region handle */ + VAPI_lkey_t l_key; /* Local key to registered memory */ + VAPI_rkey_t r_key; /* Remote key to registered memory */ +}; +typedef struct mca_btl_mvapi_reg_t mca_btl_mvapi_reg_t; #define MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, \ additional) \ diff --git a/ompi/mca/btl/mvapi/btl_mvapi_component.c b/ompi/mca/btl/mvapi/btl_mvapi_component.c index f65b0a7b18..2b5e3672d9 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_component.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_component.c @@ -47,10 +47,13 @@ #include #include #include "ompi/datatype/convertor.h" -#include "ompi/mca/mpool/mvapi/mpool_mvapi.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include 
"btl_mvapi_endpoint.h" #include "ompi/mca/pml/base/pml_base_module_exchange.h" +static int mvapi_reg_mr(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg); +static int mvapi_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg); mca_btl_mvapi_component_t mca_btl_mvapi_component = { { @@ -147,7 +150,7 @@ int mca_btl_mvapi_component_open(void) mca_btl_mvapi_param_register_int ("free_list_inc", "increment size of free lists", 32, &mca_btl_mvapi_component.ib_free_list_inc); mca_btl_mvapi_param_register_string("mpool", "name of the memory pool to be used", - "mvapi", &mca_btl_mvapi_component.ib_mpool_name); + "rdma", &mca_btl_mvapi_component.ib_mpool_name); mca_btl_mvapi_param_register_int("reg_mru_len", "length of the registration cache most recently used list", 16, (int*) &mca_btl_mvapi_component.reg_mru_len); #ifdef VAPI_FEATURE_SRQ @@ -337,7 +340,51 @@ static void mca_btl_mvapi_control( } } +static int mvapi_reg_mr(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg) +{ + mca_btl_mvapi_module_t *mvapi_btl = (mca_btl_mvapi_module_t*)reg_data; + mca_btl_mvapi_reg_t *mvapi_reg = (mca_btl_mvapi_reg_t*)reg; + VAPI_mrw_t mr_in, mr_out; + VAPI_ret_t ret; + memset(&mr_in, 0, sizeof(VAPI_mrw_t)); + memset(&mr_out, 0, sizeof(VAPI_mrw_t)); + mr_in.acl = + VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE | VAPI_EN_REMOTE_READ; + mr_in.pd_hndl = mvapi_btl->ptag; + mr_in.size = size; + mr_in.start = (VAPI_virt_addr_t)(MT_virt_addr_t)base; + mr_in.type = VAPI_MR; + mvapi_reg->hndl = VAPI_INVAL_HNDL; + + ret = VAPI_register_mr(mvapi_btl->nic, &mr_in, &mvapi_reg->hndl, &mr_out); + + if(ret != VAPI_OK) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + mvapi_reg->l_key = mr_out.l_key; + mvapi_reg->r_key = mr_out.r_key; + return OMPI_SUCCESS; +} + +static int mvapi_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg) +{ + mca_btl_mvapi_module_t *mvapi_btl = (mca_btl_mvapi_module_t*)reg_data; + mca_btl_mvapi_reg_t *mvapi_reg = 
(mca_btl_mvapi_reg_t*)reg; + VAPI_ret_t ret; + + if(mvapi_reg->hndl != VAPI_INVAL_HNDL) { + ret = VAPI_deregister_mr(mvapi_btl->nic, mvapi_reg->hndl); + if(ret != VAPI_OK) { + opal_output(0, "%s: error unpinning mvapi memory errno says %s\n", + __func__, strerror(errno)); + return OMPI_ERROR; + } + } + return OMPI_SUCCESS; +} /* * IB component initialization: @@ -513,9 +560,10 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules, return NULL; } - hca_pd.hca = mvapi_btl->nic; - hca_pd.pd_tag = mvapi_btl->ptag; - + hca_pd.reg_data = mvapi_btl; + hca_pd.sizeof_reg = sizeof(mca_btl_mvapi_reg_t); + hca_pd.register_mem = mvapi_reg_mr; + hca_pd.deregister_mem = mvapi_dereg_mr; /* initialize the memory pool using the hca */ mvapi_btl->super.btl_mpool = mca_mpool_base_module_create(mca_btl_mvapi_component.ib_mpool_name, diff --git a/ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h b/ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h index f100129f99..dd33349a9f 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h +++ b/ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h @@ -12,16 +12,16 @@ #include "ompi_config.h" #include "btl_mvapi.h" -#include "btl_mvapi_endpoint.h" -#include "ompi/mca/mpool/mvapi/mpool_mvapi.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif +struct mca_btl_mvapi_reg_t; + struct mca_btl_mvapi_eager_rdma_local_t { ompi_ptr_t base; /**< buffer for RDMAing eager messages */ - mca_mpool_mvapi_registration_t *reg; + struct mca_btl_mvapi_reg_t *reg; uint16_t head; /**< RDMA buffer to poll */ uint16_t tail; /**< Needed for credit managment */ int32_t credits; /**< number of RDMA credits */ diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c index 105da43ba9..3ea1ea1000 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c @@ -1230,7 +1230,8 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma( buf = 
mvapi_btl->super.btl_mpool->mpool_alloc(mvapi_btl->super.btl_mpool, mvapi_btl->eager_rdma_frag_size * - mca_btl_mvapi_component.eager_rdma_num, 0, 0, + mca_btl_mvapi_component.eager_rdma_num, 0, + MCA_MPOOL_FLAGS_CACHE_BYPASS, (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg); if(!buf) @@ -1239,7 +1240,7 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma( for(i = 0; i < mca_btl_mvapi_component.eager_rdma_num; i++) { ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf + i*mvapi_btl->eager_rdma_frag_size); - item->user_data = endpoint->eager_rdma_local.reg; + item->user_data = (void*)endpoint->eager_rdma_local.reg; OBJ_CONSTRUCT(item, mca_btl_mvapi_recv_frag_eager_t); ((mca_btl_mvapi_frag_t*)item)->endpoint = endpoint; ((mca_btl_mvapi_frag_t*)item)->type = MCA_BTL_MVAPI_FRAG_EAGER_RDMA; diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h index 4335966460..828ab6cc46 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h @@ -26,7 +26,7 @@ #include "btl_mvapi_frag.h" #include "btl_mvapi.h" #include "btl_mvapi_eager_rdma.h" -#include "ompi/mca/mpool/mvapi/mpool_mvapi.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include #include diff --git a/ompi/mca/btl/mvapi/btl_mvapi_frag.c b/ompi/mca/btl/mvapi/btl_mvapi_frag.c index 7b3e7f38d9..6879dafa8d 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_frag.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_frag.c @@ -18,25 +18,22 @@ #include "btl_mvapi_frag.h" -#include "ompi/mca/mpool/mvapi/mpool_mvapi.h" - - static void mca_btl_mvapi_frag_common_constructor( mca_btl_mvapi_frag_t* frag) { - mca_mpool_mvapi_registration_t* mem_hndl = (mca_mpool_mvapi_registration_t*) frag->base.super.user_data; + mca_btl_mvapi_reg_t* mem_hndl = + (mca_btl_mvapi_reg_t*)frag->base.super.user_data; frag->hdr = (mca_btl_mvapi_header_t*) (frag+1); /* initialize btl header to start at end of frag */ frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + 
sizeof(mca_btl_mvapi_header_t); /* init the segment address to start after the btl header */ frag->segment.seg_len = frag->size; - frag->segment.seg_key.key32[0] = (uint32_t) mem_hndl->l_key; - frag->sg_entry.lkey = mem_hndl->l_key; + frag->sg_entry.lkey = mem_hndl->l_key; + frag->segment.seg_key.key32[0] = frag->sg_entry.lkey; frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr; frag->base.des_flags = 0; } - static void mca_btl_mvapi_send_frag_common_constructor(mca_btl_mvapi_frag_t* frag) { diff --git a/ompi/mca/btl/mvapi/btl_mvapi_frag.h b/ompi/mca/btl/mvapi/btl_mvapi_frag.h index a756302e6a..aead65f170 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_frag.h +++ b/ompi/mca/btl/mvapi/btl_mvapi_frag.h @@ -27,12 +27,13 @@ #include #include #include -#include "ompi/mca/mpool/mvapi/mpool_mvapi.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif +struct mca_btl_mvapi_reg_t; + struct mca_btl_mvapi_header_t { mca_btl_base_tag_t tag; int16_t credits; @@ -95,7 +96,7 @@ struct mca_btl_mvapi_frag_t { VAPI_sg_lst_entry_t sg_entry; mca_btl_mvapi_header_t *hdr; mca_btl_mvapi_footer_t *ftr; - mca_mpool_mvapi_registration_t * vapi_reg; + struct mca_btl_mvapi_reg_t *registration; ompi_free_list_t* my_list; }; typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_frag_t; diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index ed330274f9..f872ba3241 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -32,7 +32,7 @@ #include "ompi/datatype/datatype.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/mpool.h" -#include "ompi/mca/mpool/openib/mpool_openib.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include #include #include @@ -292,10 +292,11 @@ int mca_btl_openib_free( { mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des; - if(frag->size == 0) { - btl->btl_mpool->mpool_release(btl->btl_mpool, - (mca_mpool_base_registration_t*) - frag->openib_reg); + 
if(MCA_BTL_OPENIB_FRAG_FRAG == frag->type && frag->registration != NULL) { + btl->btl_mpool->mpool_deregister(btl->btl_mpool, + (mca_mpool_base_registration_t*) + frag->registration); + frag->registration = NULL; } MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*) btl), frag); @@ -335,165 +336,101 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( size_t* size ) { - mca_btl_openib_module_t* openib_btl; - mca_btl_openib_frag_t* frag; - mca_mpool_openib_registration_t * openib_reg; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = *size; - int rc; - - openib_btl = (mca_btl_openib_module_t*) btl; - openib_reg = (mca_mpool_openib_registration_t*) registration; + mca_btl_openib_module_t *openib_btl; + mca_btl_openib_frag_t *frag = NULL; + mca_btl_openib_reg_t *openib_reg; + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data = *size; + int rc; - - if(NULL != openib_reg && 0 == ompi_convertor_need_buffers(convertor)){ - size_t reg_len; + openib_btl = (mca_btl_openib_module_t*)btl; - /* the memory is already pinned and we have contiguous user data */ + if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) { + if(registration != NULL || max_data > btl->btl_max_send_size) { + MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); + if(NULL == frag) { + return NULL; + } - MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); - if(NULL == frag){ - return NULL; - } + iov.iov_len = max_data; + iov.iov_base = NULL; - iov.iov_len = max_data; - iov.iov_base = NULL; - - ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - - - reg_len = (unsigned char*)openib_reg->base_reg.bound - (unsigned char*)iov.iov_base + 1; - - frag->mr = openib_reg->mr; - frag->sg_entry.length = max_data; - frag->sg_entry.lkey = frag->mr->lkey; - - frag->sg_entry.addr = (unsigned long) iov.iov_base; - - 
frag->segment.seg_key.key32[0] = (uint32_t) frag->sg_entry.lkey; - - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->base.des_flags = 0; - frag->openib_reg= openib_reg; - btl->btl_mpool->mpool_retain(btl->btl_mpool, (mca_mpool_base_registration_t*) openib_reg); - return &frag->base; - - } else if( max_data > btl->btl_max_send_size && - ompi_convertor_need_buffers(convertor) == 0 && - reserve == 0) { - /* The user buffer is contigous and we are asked to send more than the max send size. */ - - MCA_BTL_IB_FRAG_ALLOC_FRAG(openib_btl, frag, rc); - if(NULL == frag){ - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = NULL; - - ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - frag->base.des_flags = 0; + *size = max_data; - - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, - iov.iov_base, - max_data, - 0, - (mca_mpool_base_registration_t**) &openib_reg); - if(OMPI_SUCCESS != rc || NULL == openib_reg) { - MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); + if(NULL == registration) { + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, + iov.iov_base, max_data, 0, &registration); + if(OMPI_SUCCESS != rc || NULL == registration) { + MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); + return NULL; + } + /* keep track of the registration we did */ + frag->registration = (mca_btl_openib_reg_t*)registration; + } + openib_reg = (mca_btl_openib_reg_t*)registration; + + frag->base.des_flags = 0; + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + + frag->sg_entry.length = max_data; + frag->sg_entry.lkey = openib_reg->mr->lkey; + frag->sg_entry.addr = (unsigned long)iov.iov_base; + + frag->segment.seg_len = max_data; + frag->segment.seg_addr.pval = iov.iov_base; + frag->segment.seg_key.key32[0] = 
(uint32_t)frag->sg_entry.lkey; + + BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu " + "frag->segment.seg_key.key32[0] = %lu", + frag->sg_entry.lkey, frag->sg_entry.addr, + frag->segment.seg_key.key32[0])); + + return &frag->base; + } + } + + if(max_data + reserve <= btl->btl_eager_limit) { + /* the data is small enough to fit in the eager frag and + * memory is not prepinned */ + MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc); + } + + if(NULL == frag) { + /* the data doesn't fit into eager frag or eger frag is + * not available */ + MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc); + if(NULL == frag) { return NULL; } - - - frag->mr = openib_reg->mr; - frag->sg_entry.length = max_data; - frag->sg_entry.lkey = openib_reg->mr->lkey; - - frag->sg_entry.addr = (unsigned long) iov.iov_base; - - frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->rkey; - - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->openib_reg = openib_reg; - BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu", frag->sg_entry.lkey, frag->sg_entry.addr)); - - return &frag->base; - - } else if (max_data+reserve <= btl->btl_eager_limit) { - /* the data is small enough to fit in the eager frag and - either we received no prepinned memory or leave pinned is - not set - */ - MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc); - if(NULL == frag) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; - - rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - *size = max_data; - if( rc < 0 ) { - MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); - return NULL; - } - - frag->segment.seg_len = max_data + reserve; - frag->segment.seg_key.key32[0] = (uint32_t) frag->sg_entry.lkey; - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->base.des_flags = 0; - - return &frag->base; - - } else { 
- - MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc); - if(NULL == frag) { - return NULL; - } - if(max_data + reserve > btl->btl_max_send_size){ - max_data = btl->btl_max_send_size - reserve; + if(max_data + reserve > btl->btl_max_send_size) { + max_data = btl->btl_max_send_size - reserve; } - iov.iov_len = max_data; - iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; - - rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - *size = max_data; - - if( rc < 0 ) { - MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); - return NULL; - } - - frag->segment.seg_len = max_data + reserve; - frag->segment.seg_key.key32[0] = (uint32_t) frag->sg_entry.lkey; - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->base.des_flags=0; - - return &frag->base; } - return NULL; + + iov.iov_len = max_data; + iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; + rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); + if(rc < 0) { + MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); + return NULL; + } + *size = max_data; + frag->segment.seg_len = max_data + reserve; + frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey; + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + + return &frag->base; } /** @@ -513,69 +450,62 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, + mca_mpool_base_registration_t* registration, struct ompi_convertor_t* convertor, size_t reserve, size_t* size) { - mca_btl_openib_module_t* openib_btl; - mca_btl_openib_frag_t* frag; - mca_mpool_openib_registration_t * openib_reg; - int rc; + mca_btl_openib_module_t *openib_btl; + mca_btl_openib_frag_t *frag; 
+ mca_btl_openib_reg_t *openib_reg; + int rc; ptrdiff_t lb; - size_t reg_len; - openib_btl = (mca_btl_openib_module_t*) btl; - openib_reg = (mca_mpool_openib_registration_t*) registration; + openib_btl = (mca_btl_openib_module_t*)btl; - MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); - if(NULL == frag){ - return NULL; + MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); + if(NULL == frag) { + return NULL; } ompi_ddt_type_lb(convertor->pDesc, &lb); - frag->segment.seg_len = *size; - frag->segment.seg_addr.pval = convertor->pBaseBuf + lb + convertor->bConverted; - frag->base.des_flags = 0; + frag->segment.seg_addr.pval = convertor->pBaseBuf + lb + + convertor->bConverted; - if(NULL!= openib_reg){ - /* the memory is already pinned try to use it if the pinned region is large enough*/ - reg_len = (unsigned char*)openib_reg->base_reg.bound - (unsigned char*)frag->segment.seg_addr.pval + 1; - btl->btl_mpool->mpool_retain(btl->btl_mpool, - (mca_mpool_base_registration_t*) openib_reg); - } else { - /* we didn't get a memory registration passed in, so we have to register the region - * ourselves + if(NULL == registration){ + /* we didn't get a memory registration passed in, so we have to + * register the region ourselves */ - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, - frag->segment.seg_addr.pval, - *size, - 0, - (mca_mpool_base_registration_t**) &openib_reg); - if(OMPI_SUCCESS != rc || NULL == openib_reg) { + frag->segment.seg_addr.pval, *size, 0, &registration); + if(OMPI_SUCCESS != rc || NULL == registration) { MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); return NULL; } + /* keep track of the registration we did */ + frag->registration = (mca_btl_openib_reg_t*)registration; } + openib_reg = (mca_btl_openib_reg_t*)registration; - - frag->mr = openib_reg->mr; - frag->sg_entry.length = *size; - frag->sg_entry.lkey = openib_reg->mr->lkey; - frag->sg_entry.addr = (unsigned long) frag->segment.seg_addr.pval; - - frag->segment.seg_key.key32[0] = frag->mr->rkey; - - frag->base.des_dst = 
&frag->segment; - frag->base.des_dst_cnt = 1; - frag->base.des_src = NULL; - frag->base.des_src_cnt = 0; - frag->openib_reg = openib_reg; - BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu frag->segment.seg_key.key32[0] = %lu" , frag->sg_entry.lkey, frag->sg_entry.addr, frag->segment.seg_key.key32[0])); + frag->sg_entry.length = *size; + frag->sg_entry.lkey = openib_reg->mr->lkey; + frag->sg_entry.addr = (unsigned long) frag->segment.seg_addr.pval; - return &frag->base; - + frag->segment.seg_len = *size; + frag->segment.seg_key.key32[0] = openib_reg->mr->rkey; + + frag->base.des_dst = &frag->segment; + frag->base.des_dst_cnt = 1; + frag->base.des_src = NULL; + frag->base.des_src_cnt = 0; + frag->base.des_flags = 0; + + BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu " + "frag->segment.seg_key.key32[0] = %lu", + frag->sg_entry.lkey, frag->sg_entry.addr, + frag->segment.seg_key.key32[0])); + + return &frag->base; } int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl) diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 62c41db63e..4c42f42679 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -220,9 +220,15 @@ struct mca_btl_openib_module_t { orte_pointer_array_t *endpoints; }; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; - + extern mca_btl_openib_module_t mca_btl_openib_module; +struct mca_btl_openib_reg_t { + mca_mpool_base_registration_t base; + struct ibv_mr *mr; +}; +typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t; + #if OMPI_ENABLE_PROGRESS_THREADS == 1 extern void* mca_btl_openib_progress_thread(opal_object_t*); #endif @@ -417,10 +423,8 @@ extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( * @param frag (IN) IB send fragment * */ -extern void mca_btl_openib_send_frag_return( - struct mca_btl_base_module_t* btl, - struct mca_btl_openib_frag_t* - ); +extern void mca_btl_openib_send_frag_return(mca_btl_base_module_t* btl, + 
mca_btl_openib_frag_t*); int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t* openib_btl); diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index ac30b92587..26b4f613cb 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -35,6 +35,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/util/sys_info.h" #include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include "ompi/mca/btl/base/base.h" #include "btl_openib.h" #include "btl_openib_frag.h" @@ -81,6 +82,9 @@ static int btl_openib_module_progress(mca_btl_openib_module_t *openib_btl); static void btl_openib_frag_progress_pending( mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint, const int prio); +static int openib_reg_mr(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg); +static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg); mca_btl_openib_component_t mca_btl_openib_component = { @@ -235,6 +239,36 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl, } } +static int openib_reg_mr(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg) +{ + mca_btl_openib_hca_t *hca = (mca_btl_openib_hca_t*)reg_data; + mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg; + + openib_reg->mr = ibv_reg_mr(hca->ib_pd, base, size, IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); + + if(NULL == openib_reg->mr) + return OMPI_ERR_OUT_OF_RESOURCE; + + return OMPI_SUCCESS; +} + +static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg) +{ + mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg; + + if(openib_reg->mr != NULL) { + if(ibv_dereg_mr(openib_reg->mr)) { + opal_output(0, "%s: error unpinning openib memory errno says %s\n", + __func__, strerror(errno)); + return OMPI_ERROR; + } + } + openib_reg->mr = NULL; + return 
OMPI_SUCCESS; +} + static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca, uint8_t port_num, struct ibv_port_attr *ib_port_attr) { @@ -399,7 +433,10 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) goto close_hca; } - mpool_resources.ib_pd = hca->ib_pd; + mpool_resources.reg_data = (void*)hca; + mpool_resources.sizeof_reg = sizeof(mca_btl_openib_reg_t); + mpool_resources.register_mem = openib_reg_mr; + mpool_resources.deregister_mem = openib_dereg_mr; hca->mpool = mca_mpool_base_module_create(mca_btl_openib_component.ib_mpool_name, hca, &mpool_resources); @@ -469,6 +506,7 @@ free_hca: free(hca); return ret; } + /* * IB component initialization: * (1) read interface list from kernel and compare against component parameters diff --git a/ompi/mca/btl/openib/btl_openib_eager_rdma.h b/ompi/mca/btl/openib/btl_openib_eager_rdma.h index 89c7a5adc4..17a60436f8 100644 --- a/ompi/mca/btl/openib/btl_openib_eager_rdma.h +++ b/ompi/mca/btl/openib/btl_openib_eager_rdma.h @@ -12,7 +12,6 @@ #include "ompi_config.h" #include "btl_openib.h" -#include "ompi/mca/mpool/openib/mpool_openib.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { @@ -20,7 +19,7 @@ extern "C" { struct mca_btl_openib_eager_rdma_local_t { ompi_ptr_t base; /**< buffer for RDMAing eager messages */ - mca_mpool_openib_registration_t *reg; + mca_btl_openib_reg_t *reg; uint16_t head; /**< RDMA buffer to poll */ uint16_t tail; /**< Needed for credit managment */ int32_t credits; /**< number of RDMA credits */ diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index 093eaa56ec..fe62685d2f 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -1206,7 +1206,8 @@ void mca_btl_openib_endpoint_connect_eager_rdma( openib_btl->eager_rdma_frag_size * mca_btl_openib_component.eager_rdma_num + mca_btl_openib_component.buffer_alignment + - 
sizeof(mca_btl_openib_recv_frag_eager_t), 0, 0, + sizeof(mca_btl_openib_recv_frag_eager_t), 0, + MCA_MPOOL_FLAGS_CACHE_BYPASS, (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg); if(!buf) @@ -1221,7 +1222,7 @@ void mca_btl_openib_endpoint_connect_eager_rdma( for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) { ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf + i*openib_btl->eager_rdma_frag_size); - item->user_data = endpoint->eager_rdma_local.reg; + item->user_data = (void*)endpoint->eager_rdma_local.reg; OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_eager_t); ((mca_btl_openib_frag_t*)item)->endpoint = endpoint; ((mca_btl_openib_frag_t*)item)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 9b683eaa03..806a6d52ca 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -29,7 +29,6 @@ #include #include #include "ompi/mca/btl/base/btl_base_error.h" -#include "ompi/mca/mpool/openib/mpool_openib.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { diff --git a/ompi/mca/btl/openib/btl_openib_frag.c b/ompi/mca/btl/openib/btl_openib_frag.c index 5ba626b6af..e0f61d06ed 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.c +++ b/ompi/mca/btl/openib/btl_openib_frag.c @@ -18,23 +18,21 @@ #include "btl_openib_frag.h" #include "btl_openib_eager_rdma.h" -#include "ompi/mca/mpool/openib/mpool_openib.h" - static void mca_btl_openib_frag_common_constructor( mca_btl_openib_frag_t* frag) { - mca_mpool_openib_registration_t* registration = - (mca_mpool_openib_registration_t*) frag->base.super.user_data; + mca_btl_openib_reg_t* registration = + (mca_btl_openib_reg_t*)frag->base.super.user_data; frag->hdr = (mca_btl_openib_header_t*) (frag+1); /* initialize the btl header to start at end of frag */ frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); /* init the 
segment address to start after the btl header */ if(registration) { - frag->mr = registration->mr; - frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->lkey; - frag->sg_entry.lkey = frag->mr->lkey; + frag->registration = registration; + frag->sg_entry.lkey = registration->mr->lkey; + frag->segment.seg_key.key32[0] = frag->sg_entry.lkey; } frag->segment.seg_len = frag->size; frag->sg_entry.addr = (unsigned long) frag->hdr; diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index 10e4bee8d1..fd23c186ea 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -22,13 +22,14 @@ #include "ompi_config.h" #include -#include "ompi/mca/mpool/openib/mpool_openib.h" #include "ompi/mca/btl/btl.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif +struct mca_btl_openib_reg_t; + struct mca_btl_openib_header_t { mca_btl_base_tag_t tag; #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT @@ -173,8 +174,7 @@ struct mca_btl_openib_frag_t { struct ibv_send_wr sr_desc; } wr_desc; struct ibv_sge sg_entry; - struct ibv_mr *mr; - mca_mpool_openib_registration_t * openib_reg; + struct mca_btl_openib_reg_t *registration; }; typedef struct mca_btl_openib_frag_t mca_btl_openib_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t); diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index 484c005e30..886face1f8 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -145,7 +145,7 @@ int btl_openib_register_mca_params(void) REGINT_GE_ONE)); CHECK(reg_string("mpool", "Name of the memory pool to be used (it is unlikely that you will ever want to change this", - "openib", &mca_btl_openib_component.ib_mpool_name, + "rdma", &mca_btl_openib_component.ib_mpool_name, 0)); CHECK(reg_int("reg_mru_len", "Length of the registration cache most recently used list " diff --git a/ompi/mca/btl/udapl/btl_udapl.c b/ompi/mca/btl/udapl/btl_udapl.c index 
dc2363690f..c3ca29a94a 100644 --- a/ompi/mca/btl/udapl/btl_udapl.c +++ b/ompi/mca/btl/udapl/btl_udapl.c @@ -34,9 +34,12 @@ #include "ompi/datatype/convertor.h" #include "ompi/datatype/datatype.h" #include "ompi/mca/mpool/base/base.h" -#include "ompi/mca/mpool/udapl/mpool_udapl.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include "ompi/proc/proc.h" +static int udapl_reg_mr(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg); +static int udapl_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg); mca_btl_udapl_module_t mca_btl_udapl_module = { { @@ -67,6 +70,49 @@ mca_btl_udapl_module_t mca_btl_udapl_module = { } }; +static int udapl_reg_mr(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg) +{ + mca_btl_udapl_module_t *btl = (mca_btl_udapl_module_t*)reg_data; + mca_btl_udapl_reg_t *udapl_reg = (mca_btl_udapl_reg_t*)reg; + DAT_REGION_DESCRIPTION region; + DAT_VLEN dat_size; + DAT_VADDR dat_addr; + int rc; + + region.for_va = base; + udapl_reg->lmr_triplet.virtual_address = (DAT_VADDR)base; + udapl_reg->lmr_triplet.segment_length = size; + udapl_reg->lmr = NULL; + + rc = dat_lmr_create(btl->udapl_ia, DAT_MEM_TYPE_VIRTUAL, region, size, + btl->udapl_pz, DAT_MEM_PRIV_ALL_FLAG, &udapl_reg->lmr, + &udapl_reg->lmr_triplet.lmr_context, &udapl_reg->rmr_context, + &dat_size, &dat_addr); + + if(rc != DAT_SUCCESS) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + return OMPI_SUCCESS; +} + +static int udapl_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg) +{ + mca_btl_udapl_reg_t *udapl_reg = (mca_btl_udapl_reg_t*)reg; + int rc; + + if(udapl_reg->lmr != NULL) { + rc = dat_lmr_free(udapl_reg->lmr); + if(rc != DAT_SUCCESS) { + opal_output(0, "%s: error unpinning dapl memory errno says %s\n", + __func__, strerror(errno)); + return OMPI_ERROR; + } + } + + return OMPI_SUCCESS; +} /** * Initialize module module resources. 
@@ -153,9 +199,10 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl) ((struct sockaddr_in*)&btl->udapl_addr.addr)->sin_port = htons(port); /* initialize the memory pool */ - res.udapl_ia = btl->udapl_ia; - res.udapl_pz = btl->udapl_pz; - + res.reg_data = btl; + res.sizeof_reg = sizeof(mca_btl_udapl_reg_t); + res.register_mem = udapl_reg_mr; + res.deregister_mem = udapl_dereg_mr; btl->super.btl_mpool = mca_mpool_base_module_create( mca_btl_udapl_component.udapl_mpool_name, &btl->super, &res); @@ -200,7 +247,6 @@ failure: return OMPI_ERROR; } - /* * Cleanup/release module resources. */ @@ -352,8 +398,7 @@ mca_btl_base_descriptor_t* mca_btl_udapl_alloc( ((char *)frag->segment.seg_addr.pval + frag->segment.seg_len); frag->triplet.segment_length = frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t); - assert(frag->triplet.lmr_context == - ((mca_mpool_udapl_registration_t*)frag->registration)->lmr_triplet.lmr_context); + assert(frag->triplet.lmr_context == frag->registration->lmr_triplet.lmr_context); frag->btl = udapl_btl; frag->base.des_src = &frag->segment; @@ -376,8 +421,8 @@ int mca_btl_udapl_free( { mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)des; - if(frag->size == 0) { - btl->btl_mpool->mpool_release(btl->btl_mpool, frag->registration); + if(frag->size == 0 && frag->registration != NULL) { + btl->btl_mpool->mpool_deregister(btl->btl_mpool, frag->registration); MCA_BTL_UDAPL_FRAG_RETURN_USER(btl, frag); } else if(frag->size == mca_btl_udapl_component.udapl_eager_frag_size) { MCA_BTL_UDAPL_FRAG_RETURN_EAGER(btl, frag); diff --git a/ompi/mca/btl/udapl/btl_udapl.h b/ompi/mca/btl/udapl/btl_udapl.h index 38bae0d15a..2f1fdc3da7 100644 --- a/ompi/mca/btl/udapl/btl_udapl.h +++ b/ompi/mca/btl/udapl/btl_udapl.h @@ -106,6 +106,14 @@ struct mca_btl_udapl_module_t { typedef struct mca_btl_udapl_module_t mca_btl_udapl_module_t; extern mca_btl_udapl_module_t mca_btl_udapl_module; +struct mca_btl_udapl_reg_t { + mca_mpool_base_registration_t base; 
+ DAT_LMR_HANDLE lmr; /* local memory region (LMR) */ + DAT_LMR_TRIPLET lmr_triplet; /* LMR triplet - context, address, length */ + DAT_RMR_CONTEXT rmr_context; /* remote memory region context handle */ + +}; +typedef struct mca_btl_udapl_reg_t mca_btl_udapl_reg_t; /** * Report a uDAPL error - for debugging diff --git a/ompi/mca/btl/udapl/btl_udapl_component.c b/ompi/mca/btl/udapl/btl_udapl_component.c index 0e0fcda5f5..18e6684a6e 100644 --- a/ompi/mca/btl/udapl/btl_udapl_component.c +++ b/ompi/mca/btl/udapl/btl_udapl_component.c @@ -33,7 +33,7 @@ #include "opal/mca/base/mca_base_param.h" #include "orte/mca/errmgr/errmgr.h" #include "ompi/mca/mpool/base/base.h" -#include "ompi/mca/mpool/udapl/mpool_udapl.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include "btl_udapl.h" #include "btl_udapl_frag.h" #include "btl_udapl_endpoint.h" @@ -149,7 +149,7 @@ int mca_btl_udapl_component_open(void) mca_btl_udapl_component.udapl_free_list_inc = mca_btl_udapl_param_register_int("free_list_inc", 8); mca_btl_udapl_component.udapl_mpool_name = - mca_btl_udapl_param_register_string("mpool", "udapl"); + mca_btl_udapl_param_register_string("mpool", "rdma"); mca_btl_udapl_component.udapl_max_btls = mca_btl_udapl_param_register_int("max_modules", 8); mca_btl_udapl_component.udapl_evd_qlen = diff --git a/ompi/mca/btl/udapl/btl_udapl_endpoint.c b/ompi/mca/btl/udapl/btl_udapl_endpoint.c index a4cac125e2..70bb0bd162 100644 --- a/ompi/mca/btl/udapl/btl_udapl_endpoint.c +++ b/ompi/mca/btl/udapl/btl_udapl_endpoint.c @@ -30,7 +30,7 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/dss/dss.h" -#include "ompi/mca/mpool/udapl/mpool_udapl.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include "btl_udapl.h" #include "btl_udapl_endpoint.h" #include "btl_udapl_proc.h" diff --git a/ompi/mca/btl/udapl/btl_udapl_frag.c b/ompi/mca/btl/udapl/btl_udapl_frag.c index 411400ecd2..f5e4c3f00a 100644 --- a/ompi/mca/btl/udapl/btl_udapl_frag.c +++ 
b/ompi/mca/btl/udapl/btl_udapl_frag.c @@ -21,12 +21,12 @@ #include "btl_udapl.h" #include "btl_udapl_frag.h" -#include "ompi/mca/mpool/udapl/mpool_udapl.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" static void mca_btl_udapl_frag_common_constructor(mca_btl_udapl_frag_t* frag) { - mca_mpool_udapl_registration_t* reg = frag->base.super.user_data; + mca_btl_udapl_reg_t* reg = (mca_btl_udapl_reg_t*)frag->base.super.user_data; #if OMPI_ENABLE_DEBUG frag->base.des_src = NULL; diff --git a/ompi/mca/btl/udapl/btl_udapl_frag.h b/ompi/mca/btl/udapl/btl_udapl_frag.h index 6f6803f540..14f9366e78 100644 --- a/ompi/mca/btl/udapl/btl_udapl_frag.h +++ b/ompi/mca/btl/udapl/btl_udapl_frag.h @@ -58,7 +58,7 @@ struct mca_btl_udapl_frag_t { struct mca_btl_udapl_module_t* btl; struct mca_btl_base_endpoint_t *endpoint; - struct mca_mpool_base_registration_t* registration; + struct mca_btl_udapl_reg_t* registration; DAT_LMR_TRIPLET triplet; mca_btl_udapl_footer_t *ftr; diff --git a/ompi/mca/mpool/base/mpool_base_alloc.c b/ompi/mca/mpool/base/mpool_base_alloc.c index 40b1d98722..d5102e5271 100644 --- a/ompi/mca/mpool/base/mpool_base_alloc.c +++ b/ompi/mca/mpool/base/mpool_base_alloc.c @@ -40,7 +40,9 @@ static void mca_mpool_base_registration_constructor( mca_mpool_base_registration reg->mpool = NULL; reg->base = NULL; reg->bound = NULL; + reg->alloc_base = NULL; reg->ref_count = 0; + reg->flags = 0; } static void mca_mpool_base_registration_destructor( mca_mpool_base_registration_t * reg ) @@ -74,58 +76,37 @@ OBJ_CLASS_INSTANCE( * @retval pointer to the allocated memory * @retval NULL on failure */ -void * mca_mpool_base_alloc(size_t size, ompi_info_t * info) +void *mca_mpool_base_alloc(size_t size, ompi_info_t *info) { opal_list_item_t * item; int num_modules = opal_list_get_size(&mca_mpool_base_modules); int reg_module_num = 0; - int i, j, num_keys; + int i, num_keys; mca_mpool_base_selected_module_t * current; mca_mpool_base_selected_module_t * no_reg_function = NULL; 
mca_mpool_base_selected_module_t ** has_reg_function = NULL; mca_mpool_base_registration_t * registration; - mca_mpool_base_tree_item_t* mpool_tree_item; - + mca_mpool_base_tree_item_t* mpool_tree_item = NULL; + mca_mpool_base_module_t *mpool; void * mem = NULL; char * key = NULL; char * value = NULL; int flag = 0; - bool match_found = false; - bool mpool_requested = false; + bool match_found = false, mpool_requested = false; - if (mca_mpool_base_use_mem_hooks && - 0 != (OPAL_MEMORY_FREE_SUPPORT & opal_mem_hooks_support_level())) { - /* if we're using memory hooks, it's possible (likely, based - on testing) that for some tests the memory returned from - any of the malloc functions below will be part of a larger - (lazily) freed chunk and therefore already be pinned. - Which causes our caches to get a little confused, as the - alloc/free pair are supposed to always have an exact match - in the rcache. This wasn't happening, leading to badness. - Instead, just malloc and we'll get to the pinning later, - when we try to first use it. Since we're leaving things - pinned, there's no advantage to doing it now over first - use, and it works if we wait ... 
*/ - return malloc(size); - } - - - if (num_modules > 0) { + if(num_modules > 0) { has_reg_function = (mca_mpool_base_selected_module_t **) - malloc(num_modules * sizeof(mca_mpool_base_module_t *)); - if(!has_reg_function){ - return NULL; - } + malloc(num_modules * sizeof(mca_mpool_base_module_t *)); + if(!has_reg_function) + goto out; } mpool_tree_item = mca_mpool_base_tree_item_get(); - if(NULL == mpool_tree_item){ - if(has_reg_function) { - free(has_reg_function); - } - return NULL; - } + if(!mpool_tree_item) + goto out; + + mpool_tree_item->count = 0; if(&ompi_mpi_info_null == info) { @@ -182,10 +163,7 @@ void * mca_mpool_base_alloc(size_t size, ompi_info_t * info) /* there was more than one requested mpool that lacks * a registration function, so return failure */ free(key); - if(has_reg_function) { - free(has_reg_function); - } - return NULL; + goto out; } no_reg_function = current; } @@ -200,10 +178,7 @@ void * mca_mpool_base_alloc(size_t size, ompi_info_t * info) /* one of the keys given to us by the user did not match any * mpools, so return an error */ free(key); - if(has_reg_function) { - free(has_reg_function); - } - return NULL; + goto out; } } free(key); @@ -211,76 +186,59 @@ void * mca_mpool_base_alloc(size_t size, ompi_info_t * info) if(NULL == no_reg_function && 0 == reg_module_num) { - if(has_reg_function) { - free(has_reg_function); - } if(!mpool_requested) { /* if the info argument was NULL and there were no useable mpools * or there user provided info object but did not specifiy a "mpool" key, * just malloc the memory and return it */ mem = malloc(size); - if(NULL != mem){ - /* don't need the tree */ - mca_mpool_base_tree_item_put(mpool_tree_item); - return mem; - } + goto out; } /* the user passed info but we were not able to use any of the mpools * specified */ - return NULL; + goto out; } - i = j = 0; - num_modules = 0; if(NULL != no_reg_function) { - mca_mpool_base_module_t* mpool = no_reg_function->mpool_module; - mem = 
mpool->mpool_alloc(mpool, size, 0, MCA_MPOOL_FLAGS_PERSIST, &registration); - num_modules++; - mpool_tree_item->key = mem; - mpool_tree_item->mpools[j] = mpool; - mpool_tree_item->regs[j++] = registration; - } - else - { - mca_mpool_base_module_t* mpool = has_reg_function[i]->mpool_module; - mem = mpool->mpool_alloc(mpool, size, 0, MCA_MPOOL_FLAGS_PERSIST, &registration); - i++; - num_modules++; - mpool_tree_item->key = mem; - mpool_tree_item->mpools[j] = mpool; - mpool_tree_item->regs[j++] = registration; + mpool = no_reg_function->mpool_module; + i = 0; + } else { + mpool = has_reg_function[0]->mpool_module; + i = 1; } + mem = mpool->mpool_alloc(mpool, size, 0, MCA_MPOOL_FLAGS_PERSIST, + &registration); + if(NULL == mem) + goto out; + + mpool_tree_item->key = mem; + mpool_tree_item->mpools[mpool_tree_item->count] = mpool; + mpool_tree_item->regs[mpool_tree_item->count++] = registration; while(i < reg_module_num) { - mca_mpool_base_module_t* mpool = has_reg_function[i]->mpool_module; - if(OMPI_SUCCESS != mpool->mpool_register(mpool, mem, size, MCA_MPOOL_FLAGS_PERSIST, &registration)) - { - if (has_reg_function) { - free(has_reg_function); - } - return NULL; - } else { - mpool_tree_item->mpools[j] = mpool; - mpool_tree_item->regs[j++] = registration; - num_modules++; + mpool = has_reg_function[i]->mpool_module; + if(mpool->mpool_register(mpool, mem, size, MCA_MPOOL_FLAGS_PERSIST, + &registration) != OMPI_SUCCESS) { + goto out; } + mpool_tree_item->mpools[mpool_tree_item->count] = mpool; + mpool_tree_item->regs[mpool_tree_item->count++] = registration; i++; } - if(has_reg_function) { - free(has_reg_function); - } - - /* null terminated array */ - mpool_tree_item->mpools[j] = NULL; - mpool_tree_item->regs[j] = NULL; mca_mpool_base_tree_insert(mpool_tree_item); - + mpool_tree_item = NULL; /* prevent it to be deleted below */ +out: + if(mpool_tree_item) + mca_mpool_base_tree_item_put(mpool_tree_item); + + if(has_reg_function) + free(has_reg_function); + return mem; } @@ -292,49 
+250,38 @@ void * mca_mpool_base_alloc(size_t size, ompi_info_t * info) * @retval OMPI_SUCCESS * @retval OMPI_ERR_BAD_PARAM if the passed base pointer was invalid */ -int mca_mpool_base_free(void * base) +int mca_mpool_base_free(void *base) { - int i = 0, rc = OMPI_SUCCESS; - mca_mpool_base_tree_item_t* mpool_tree_item = NULL; - mca_mpool_base_module_t* mpool; - mca_mpool_base_registration_t* reg; - - if(!base) { + mca_mpool_base_tree_item_t *mpool_tree_item = NULL; + mca_mpool_base_module_t *mpool; + mca_mpool_base_registration_t *reg; + int i, rc; + + if(!base) { return OMPI_ERROR; } - /* see comment in alloc function above */ - if (mca_mpool_base_use_mem_hooks && - 0 != (OPAL_MEMORY_FREE_SUPPORT & opal_mem_hooks_support_level())) { + mpool_tree_item = mca_mpool_base_tree_find(base); + + if(!mpool_tree_item) { + /* nothing in the tree this was just plain old malloc'd memory */ free(base); return OMPI_SUCCESS; } - mpool_tree_item = mca_mpool_base_tree_find(base); - - if(!mpool_tree_item) { - /* nothing in the tree this was just - plain old malloc'd memory */ - free(base); - return OMPI_SUCCESS; - } - - for(i = 1; i < MCA_MPOOL_BASE_TREE_MAX; i++) { + for(i = 1; i < mpool_tree_item->count; i++) { mpool = mpool_tree_item->mpools[i]; reg = mpool_tree_item->regs[i]; - if(mpool) { + if(mpool && mpool->mpool_deregister) { mpool->mpool_deregister(mpool, reg); - } else { - break; } } mpool = mpool_tree_item->mpools[0]; reg = mpool_tree_item->regs[0]; mpool->mpool_free(mpool, base, reg); - + rc = mca_mpool_base_tree_delete(mpool_tree_item); return rc; } - diff --git a/ompi/mca/mpool/base/mpool_base_mem_cb.c b/ompi/mca/mpool/base/mpool_base_mem_cb.c index 003b1dcdc4..c27f54a55f 100644 --- a/ompi/mca/mpool/base/mpool_base_mem_cb.c +++ b/ompi/mca/mpool/base/mpool_base_mem_cb.c @@ -19,6 +19,7 @@ * @file */ #include "ompi_config.h" +#include "opal/util/output.h" #include "mpool_base_mem_cb.h" #include "base.h" #include "orte/util/proc_info.h" @@ -35,51 +36,30 @@ 
ompi_pointer_array_t mca_mpool_base_mem_cb_array; void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata, bool from_alloc) { - uint32_t i, cnt; - mca_mpool_base_registration_t* reg; mca_mpool_base_selected_module_t* current; int rc; opal_list_item_t* item; - void* base_addr; - void* bound_addr; if(size == 0) { return; } - base_addr = down_align_addr( base, mca_mpool_base_page_size_log); - bound_addr = up_align_addr((void*) ((ptrdiff_t) base + size - 1), mca_mpool_base_page_size_log); for(item = opal_list_get_first(&mca_mpool_base_modules); item != opal_list_get_end(&mca_mpool_base_modules); item = opal_list_get_next(item)) { + bool warn = true; current = (mca_mpool_base_selected_module_t*) item; - - if(NULL != current->mpool_module->mpool_find) { - rc = current->mpool_module->mpool_find( - current->mpool_module, - base_addr, - size, - &mca_mpool_base_mem_cb_array, - &cnt - ); - if(OMPI_SUCCESS != rc) { - continue; + + if(current->mpool_module->mpool_release_memory != NULL) { + rc = current->mpool_module->mpool_release_memory(current->mpool_module, + base, size); + + if(rc != OMPI_SUCCESS && true == warn) { + opal_output(0, "Memory %p:%llu cannot be freed from the " + "registration cache. 
Possible memory corruption.\n", + base, size); + warn = false; } - for(i = 0; i < cnt; i++) { - reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(&mca_mpool_base_mem_cb_array, i); -#if !defined(NDEBUG) - if(reg->flags & MCA_MPOOL_FLAGS_CACHE) { - assert(reg->ref_count <= 3); - } else if(reg->flags & MCA_MPOOL_FLAGS_PERSIST) { - assert(reg->ref_count <= 2); - } else { - assert(reg->ref_count <= 1); - } -#endif - current->mpool_module->mpool_deregister(current->mpool_module, reg); - } - ompi_pointer_array_remove_all(&mca_mpool_base_mem_cb_array); } } - } diff --git a/ompi/mca/mpool/base/mpool_base_tree.h b/ompi/mca/mpool/base/mpool_base_tree.h index 048322061b..363be3f1d4 100644 --- a/ompi/mca/mpool/base/mpool_base_tree.h +++ b/ompi/mca/mpool/base/mpool_base_tree.h @@ -38,7 +38,8 @@ struct mca_mpool_base_tree_item_t ompi_free_list_item_t super; /**< the parent class */ void* key; /* the address this was alloc'd on */ mca_mpool_base_module_t* mpools[MCA_MPOOL_BASE_TREE_MAX]; /**< the mpools */ - mca_mpool_base_registration_t* regs[MCA_MPOOL_BASE_TREE_MAX]; /**< the registrations */ + mca_mpool_base_registration_t* regs[MCA_MPOOL_BASE_TREE_MAX]; /**< the registrations */ + uint8_t count; }; typedef struct mca_mpool_base_tree_item_t mca_mpool_base_tree_item_t; diff --git a/ompi/mca/mpool/gm/.ompi_ignore b/ompi/mca/mpool/gm/.ompi_ignore new file mode 100644 index 0000000000..60bf58b271 --- /dev/null +++ b/ompi/mca/mpool/gm/.ompi_ignore @@ -0,0 +1 @@ +quilt diff --git a/ompi/mca/mpool/mpool.h b/ompi/mca/mpool/mpool.h index 3b20dbf296..f3be488117 100644 --- a/ompi/mca/mpool/mpool.h +++ b/ompi/mca/mpool/mpool.h @@ -26,7 +26,7 @@ #include "ompi/class/ompi_free_list.h" #include "ompi/class/ompi_pointer_array.h" -#define MCA_MPOOL_FLAGS_CACHE 0x1 +#define MCA_MPOOL_FLAGS_CACHE_BYPASS 0x1 #define MCA_MPOOL_FLAGS_PERSIST 0x2 #define MCA_MPOOL_FLAGS_MPI_ALLOC_MEM 0x4 @@ -38,7 +38,6 @@ struct mca_mpool_base_registration_t { unsigned char* base; unsigned char* 
bound; unsigned char* alloc_base; - void* user_data; int32_t ref_count; uint32_t flags; }; @@ -47,19 +46,6 @@ typedef struct mca_mpool_base_registration_t mca_mpool_base_registration_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_base_registration_t); -#define MCA_MPOOL_REG_RETAIN(reg) { \ - do{ \ - OPAL_THREAD_ADD32(&reg->ref_count, 1); \ - } while(0); \ -} - -#define MCA_MPOOL_REG_RELEASE(reg) { \ - do{ \ - OPAL_THREAD_ADD32(&reg->ref_count, -1); \ - } while(0); \ -} - - /** * component initialize */ @@ -111,26 +97,12 @@ typedef int (*mca_mpool_base_module_deregister_fn_t)( mca_mpool_base_registration_t* registration); /** - * find registrations in this memory pool + * find registration in this memory pool */ typedef int (*mca_mpool_base_module_find_fn_t) ( - struct mca_mpool_base_module_t* mpool, - void* addr, - size_t size, - ompi_pointer_array_t* regs, - uint32_t *cnt - ); - - -/** - * retain registration - */ - -typedef int (*mca_mpool_base_module_retain_fn_t) ( - struct mca_mpool_base_module_t* mpool, - mca_mpool_base_registration_t* registration); - + struct mca_mpool_base_module_t* mpool, void* addr, size_t size, + mca_mpool_base_registration_t **reg); /** * release registration @@ -141,6 +113,12 @@ typedef int (*mca_mpool_base_module_release_fn_t) ( mca_mpool_base_registration_t* registration); +/** + * release memory region + */ +typedef int (*mca_mpool_base_module_release_memory_fn_t) ( + struct mca_mpool_base_module_t* mpool, void *base, size_t size); + /** * if appropriate - returns base address of memory pool */ @@ -185,8 +163,8 @@ struct mca_mpool_base_module_t { mca_mpool_base_module_register_fn_t mpool_register; /**< register memory */ mca_mpool_base_module_deregister_fn_t mpool_deregister; /**< deregister memory */ mca_mpool_base_module_find_fn_t mpool_find; /**< find regisrations in the cache */ - mca_mpool_base_module_retain_fn_t mpool_retain; /**< retain a registration from the cache */ mca_mpool_base_module_release_fn_t mpool_release; /**<
release a registration from the cache */ + mca_mpool_base_module_release_memory_fn_t mpool_release_memory; /**< release memor region from the cache */ mca_mpool_base_module_finalize_fn_t mpool_finalize; /**< finalize */ struct mca_rcache_base_module_t *rcache; /* the rcache associated with this mpool */ uint32_t flags; /**< mpool flags */ diff --git a/ompi/mca/mpool/mvapi/.ompi_ignore b/ompi/mca/mpool/mvapi/.ompi_ignore new file mode 100644 index 0000000000..60bf58b271 --- /dev/null +++ b/ompi/mca/mpool/mvapi/.ompi_ignore @@ -0,0 +1 @@ +quilt diff --git a/ompi/mca/mpool/openib/.ompi_ignore b/ompi/mca/mpool/openib/.ompi_ignore new file mode 100644 index 0000000000..60bf58b271 --- /dev/null +++ b/ompi/mca/mpool/openib/.ompi_ignore @@ -0,0 +1 @@ +quilt diff --git a/ompi/mca/mpool/rdma/Makefile.am b/ompi/mca/mpool/rdma/Makefile.am new file mode 100644 index 0000000000..c7b6676be3 --- /dev/null +++ b/ompi/mca/mpool/rdma/Makefile.am @@ -0,0 +1,55 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Use the top-level Makefile.options + + + +AM_CPPFLAGS = $(mpool_rdma_CPPFLAGS) + +sources = \ + mpool_rdma.h \ + mpool_rdma_module.c \ + mpool_rdma_component.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds).
+ +if OMPI_BUILD_mpool_rdma_DSO +component_noinst = +component_install = mca_mpool_rdma.la +else +component_noinst = libmca_mpool_rdma.la +component_install = +endif + +mcacomponentdir = $(libdir)/openmpi +mcacomponent_LTLIBRARIES = $(component_install) +mca_mpool_rdma_la_SOURCES = $(sources) +mca_mpool_rdma_la_LDFLAGS = -module -avoid-version +mca_mpool_rdma_la_LIBADD = \ + $(mpool_rdma_LIBS) \ + $(top_ompi_builddir)/ompi/libmpi.la \ + $(top_ompi_builddir)/orte/libopen-rte.la \ + $(top_ompi_builddir)/opal/libopen-pal.la + +noinst_LTLIBRARIES = $(component_noinst) +libmca_mpool_rdma_la_SOURCES = $(sources) +libmca_mpool_rdma_la_LDFLAGS = -module -avoid-version +libmca_mpool_rdma_la_LIBADD = $(mpool_rdma_LIBS) diff --git a/ompi/mca/mpool/rdma/configure.params b/ompi/mca/mpool/rdma/configure.params new file mode 100644 index 0000000000..ca96d12ffa --- /dev/null +++ b/ompi/mca/mpool/rdma/configure.params @@ -0,0 +1,25 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 Voltaire. All rights reserved. 
+# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_INIT_FILE=mpool_rdma_component.c +PARAM_CONFIG_FILES="Makefile" diff --git a/ompi/mca/mpool/rdma/mpool_rdma.h b/ompi/mca/mpool/rdma/mpool_rdma.h new file mode 100644 index 0000000000..aa3dc8b1c4 --- /dev/null +++ b/ompi/mca/mpool/rdma/mpool_rdma.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_MPOOL_OPENIB_H +#define MCA_MPOOL_OPENIB_H + +#include "opal/class/opal_list.h" +#include "ompi/class/ompi_free_list.h" +#include "opal/event/event.h" +#include "ompi/mca/mpool/mpool.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +struct mca_mpool_rdma_component_t { + mca_mpool_base_component_t super; + char* rcache_name; + size_t rcache_size_limit; + bool print_stats; + uint32_t leave_pinned; +}; +typedef struct mca_mpool_rdma_component_t mca_mpool_rdma_component_t; + +OMPI_DECLSPEC extern mca_mpool_rdma_component_t mca_mpool_rdma_component; + +struct mca_mpool_base_resources_t { + void *reg_data; + size_t sizeof_reg; + int (*register_mem)(void *reg_data, void *base, size_t size, + mca_mpool_base_registration_t *reg); + int (*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg); +}; +typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t; + +struct 
mca_mpool_rdma_module_t { + mca_mpool_base_module_t super; + struct mca_mpool_base_resources_t resources; + ompi_free_list_t reg_list; + opal_list_t mru_list; + uint32_t stat_cache_hit; + uint32_t stat_cache_miss; + uint32_t stat_evicted; + uint32_t stat_cache_found; + uint32_t stat_cache_notfound; +}; typedef struct mca_mpool_rdma_module_t mca_mpool_rdma_module_t; + +/* + * Initializes the mpool module. + */ +void mca_mpool_rdma_module_init(mca_mpool_rdma_module_t *mpool); + +/* + * Returns base address of shared memory mapping. + */ +void *mca_mpool_rdma_base(mca_mpool_base_module_t *mpool); + +/** + * Allocate block of registered memory. + */ +void* mca_mpool_rdma_alloc(mca_mpool_base_module_t *mpool, size_t size, + size_t align, uint32_t flags, + mca_mpool_base_registration_t** registration); + +/** + * realloc block of registered memory + */ +void* mca_mpool_rdma_realloc( mca_mpool_base_module_t *mpool, void* addr, + size_t size, mca_mpool_base_registration_t** registration); + +/** + * register block of memory + */ +int mca_mpool_rdma_register(mca_mpool_base_module_t* mpool, void *addr, + size_t size, uint32_t flags, mca_mpool_base_registration_t **reg); + +/** + * deregister memory + */ +int mca_mpool_rdma_deregister(mca_mpool_base_module_t *mpool, + mca_mpool_base_registration_t *reg); + +/** + * free memory allocated by alloc function + */ +void mca_mpool_rdma_free(mca_mpool_base_module_t *mpool, void * addr, + mca_mpool_base_registration_t *reg); + +/** + * find registration for a given block of memory + */ +int mca_mpool_rdma_find(struct mca_mpool_base_module_t* mpool, void* addr, + size_t size, mca_mpool_base_registration_t **reg); + +/** + * unregister all registration covering the block of memory + */ +int mca_mpool_rdma_release_memory(mca_mpool_base_module_t* mpool, void *base, + size_t size); + +/** + * finalize mpool + */ +void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool); +#if defined(c_plusplus) || defined(__cplusplus) +} 
+#endif +#endif diff --git a/ompi/mca/mpool/rdma/mpool_rdma_component.c b/ompi/mca/mpool/rdma/mpool_rdma_component.c new file mode 100644 index 0000000000..5578eb62a3 --- /dev/null +++ b/ompi/mca/mpool/rdma/mpool_rdma_component.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "mpool_rdma.h" +#include "orte/util/proc_info.h" +#include "orte/util/sys_info.h" +#include +#include + +/* + * Local functions + */ +static int mca_mpool_rdma_open(void); +static mca_mpool_base_module_t* mca_mpool_rdma_init( + struct mca_mpool_base_resources_t* resources); + +mca_mpool_rdma_component_t mca_mpool_rdma_component = { + { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + + { + /* Indicate that we are a mpool v1.0.0 component (which also + implies a specific MCA version) */ + + MCA_MPOOL_BASE_VERSION_1_0_0, + + "rdma", /* MCA component name */ + OMPI_MAJOR_VERSION, /* MCA component major version */ + OMPI_MINOR_VERSION, /* MCA component minor version */ + OMPI_RELEASE_VERSION, /* MCA component release version */ + mca_mpool_rdma_open, /* component open */ + NULL + }, + + /* Next the MCA v1.0.0 component meta data */ + + { + /* Whether the component is checkpointable or not 
*/ + false + }, + + mca_mpool_rdma_init + } +}; + +/** + * component open/close/init function + */ +static int mca_mpool_rdma_open(void) +{ + int param, val; + + mca_base_param_reg_string(&mca_mpool_rdma_component.super.mpool_version, + "rcache_name", + "The name of the registration cache the mpool should use", + false, false, "vma", &mca_mpool_rdma_component.rcache_name); + + mca_base_param_reg_int(&mca_mpool_rdma_component.super.mpool_version, + "rcache_size_limit", + "the maximum size of registration cache in bytes. " + "0 is unlimited (default 0)", false, false, 0, &val); + + mca_mpool_rdma_component.rcache_size_limit = (size_t)val; + + mca_base_param_reg_int(&mca_mpool_rdma_component.super.mpool_version, + "print_stats", + "print pool usage statistics at the end of the run", + false, false, 0, &val); + + mca_mpool_rdma_component.print_stats = val?true:false; + + mca_base_param_register_int("mpi", NULL, "leave_pinned", "leave_pinned", 0); + param = mca_base_param_find("mpi", NULL, "leave_pinned"); + mca_base_param_lookup_int(param, (int*)&mca_mpool_rdma_component.leave_pinned); + + if(0 == mca_mpool_rdma_component.leave_pinned) { + /* and now check leave_pinned_pipeline if necessary */ + mca_base_param_register_int("mpi", NULL, "leave_pinned_pipeline", + "leave_pinned_pipeline", 0); + param = mca_base_param_find("mpi", NULL, "leave_pinned_pipeline"); + mca_base_param_lookup_int(param, (int*)&mca_mpool_rdma_component.leave_pinned); + } + + return OMPI_SUCCESS; +} + +static mca_mpool_base_module_t* mca_mpool_rdma_init( + struct mca_mpool_base_resources_t *resources) +{ + mca_mpool_rdma_module_t* mpool_module; + + mpool_module = + (mca_mpool_rdma_module_t*)malloc(sizeof(mca_mpool_rdma_module_t)); + + mpool_module->resources = *resources; + + mca_mpool_rdma_module_init(mpool_module); + + return &mpool_module->super; +} diff --git a/ompi/mca/mpool/rdma/mpool_rdma_module.c b/ompi/mca/mpool/rdma/mpool_rdma_module.c new file mode 100644 index 0000000000..b4572f14ed --- 
/dev/null +++ b/ompi/mca/mpool/rdma/mpool_rdma_module.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "orte/util/proc_info.h" +#include "opal/util/output.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" +#include +#include +#include +#include "ompi/mca/rcache/rcache.h" +#include "ompi/mca/rcache/base/base.h" +#include "ompi/mca/mpool/base/base.h" + +extern uint32_t mca_mpool_base_page_size; +extern uint32_t mca_mpool_base_page_size_log; + +/* + * Initializes the mpool module. + */ +void mca_mpool_rdma_module_init(mca_mpool_rdma_module_t* mpool) +{ + mpool->super.mpool_component = &mca_mpool_rdma_component.super; + mpool->super.mpool_base = NULL; /* no base .. 
*/ + mpool->super.mpool_alloc = mca_mpool_rdma_alloc; + mpool->super.mpool_realloc = mca_mpool_rdma_realloc; + mpool->super.mpool_free = mca_mpool_rdma_free; + mpool->super.mpool_register = mca_mpool_rdma_register; + mpool->super.mpool_find = mca_mpool_rdma_find; + mpool->super.mpool_deregister = mca_mpool_rdma_deregister; + mpool->super.mpool_release_memory = mca_mpool_rdma_release_memory; + if(mca_mpool_rdma_component.print_stats == true) + mpool->super.mpool_finalize = mca_mpool_rdma_finalize; + else + mpool->super.mpool_finalize = NULL; + mpool->super.rcache = + mca_rcache_base_module_create(mca_mpool_rdma_component.rcache_name); + mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM; + + OBJ_CONSTRUCT(&mpool->reg_list, ompi_free_list_t); + ompi_free_list_init(&mpool->reg_list, mpool->resources.sizeof_reg, + OBJ_CLASS(mca_mpool_base_registration_t), 0, -1, 32, + NULL); + OBJ_CONSTRUCT(&mpool->mru_list, opal_list_t); + mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0; + mpool->stat_cache_found = mpool->stat_cache_notfound = 0; +} + +static inline int dereg_mem(mca_mpool_base_module_t *mpool, + mca_mpool_base_registration_t *reg) +{ + mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t *)mpool; + + assert(reg->ref_count == 0); + return mpool_rdma->resources.deregister_mem(mpool_rdma->resources.reg_data, + reg); +} + +/** + * allocate function + */ +void* mca_mpool_rdma_alloc(mca_mpool_base_module_t *mpool, size_t size, + size_t align, uint32_t flags, mca_mpool_base_registration_t **reg) +{ + void *addr; + + if(posix_memalign(&addr, mca_mpool_base_page_size, size) != 0) + return NULL; + + if(OMPI_SUCCESS != mca_mpool_rdma_register(mpool, addr, size, flags, reg)) { + free(addr); + return NULL; + } + (*reg)->alloc_base = addr; + + return addr; +} + +static int register_cache_bypass(mca_mpool_base_module_t *mpool, + void *addr, size_t size, uint32_t flags, + mca_mpool_base_registration_t **reg) +{ + mca_mpool_rdma_module_t *mpool_rdma 
= (mca_mpool_rdma_module_t*)mpool; + mca_mpool_base_registration_t *rdma_reg; + ompi_free_list_item_t *item; + unsigned char *base, *bound; + int rc; + + base = down_align_addr(addr, mca_mpool_base_page_size_log); + bound = up_align_addr( (void*) ((char*) addr + size - 1), + mca_mpool_base_page_size_log); + OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc); + if(OMPI_SUCCESS != rc) { + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + return rc; + } + rdma_reg = (mca_mpool_base_registration_t*)item; + + rdma_reg->mpool = mpool; + rdma_reg->base = base; + rdma_reg->bound = bound; + rdma_reg->flags = flags; + + rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data, + base, bound - base + 1, rdma_reg); + + if(rc != OMPI_SUCCESS) { + OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item); + return rc; + } + + *reg = rdma_reg; + (*reg)->ref_count++; + return OMPI_SUCCESS; +} + +/* + * register memory + */ +int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr, + size_t size, uint32_t flags, + mca_mpool_base_registration_t **reg) +{ + mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool; + mca_mpool_base_registration_t *rdma_reg; + ompi_free_list_item_t *item; + unsigned char *base, *bound; + int rc; + + /* if cache bypass is requested don't use the cache */ + if(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) { + return register_cache_bypass(mpool, addr, size, flags, reg); + } + + base = down_align_addr(addr, mca_mpool_base_page_size_log); + bound = up_align_addr((void*)((char*) addr + size - 1), + mca_mpool_base_page_size_log); + OPAL_THREAD_LOCK(&mpool->rcache->lock); + /* look through existing regs if not persistent registration requested. 
+ * Persistent registration are always registered and placed in the cache */ + if(!(flags & MCA_MPOOL_FLAGS_PERSIST)) { + /* check to see if memory is registered */ + mpool->rcache->rcache_find(mpool->rcache, addr, size, reg); + if(*reg != NULL && + (mca_mpool_rdma_component.leave_pinned || + ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) || + ((*reg)->base == base && (*reg)->bound == bound))) { + if(0 == (*reg)->ref_count && + mca_mpool_rdma_component.leave_pinned) { + opal_list_remove_item(&mpool_rdma->mru_list, + (opal_list_item_t*)(*reg)); + } + mpool_rdma->stat_cache_hit++; + (*reg)->ref_count++; + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + return MPI_SUCCESS; + } + + mpool_rdma->stat_cache_miss++; + *reg = NULL; /* in case previous find found something */ + + /* If no suitable registration is in cache and leave_pinned isn't + * set and size of registration cache is unlimited don't use the cache. + * This is optimisation in case limit is not set. If limit is set we + * have to put registration into the cache to determine when we hit + * memory registration limit. 
+ * NONE: cache is still used for persistent registrations so previous + * find can find something */ + if(!mca_mpool_rdma_component.leave_pinned && + mca_mpool_rdma_component.rcache_size_limit == 0) { + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + return register_cache_bypass(mpool, addr, size, flags, reg); + } + } + + OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc); + if(OMPI_SUCCESS != rc) { + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + return rc; + } + rdma_reg = (mca_mpool_base_registration_t*)item; + + rdma_reg->mpool = mpool; + rdma_reg->base = base; + rdma_reg->bound = bound; + rdma_reg->flags = flags; + + while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg, + mca_mpool_rdma_component.rcache_size_limit)) == + OMPI_ERR_TEMP_OUT_OF_RESOURCE) { + mca_mpool_base_registration_t *old_reg; + /* try to remove one unused reg and retry */ + old_reg = (mca_mpool_base_registration_t*) + opal_list_get_last(&mpool_rdma->mru_list); + if(opal_list_get_end(&mpool_rdma->mru_list) != + (opal_list_item_t*)old_reg) { + rc = dereg_mem(mpool, old_reg); + if(MPI_SUCCESS == rc) { + mpool->rcache->rcache_delete(mpool->rcache, old_reg); + opal_list_remove_item(&mpool_rdma->mru_list, + (opal_list_item_t*)old_reg); + OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, + (ompi_free_list_item_t*)old_reg); + mpool_rdma->stat_evicted++; + } else + break; + } else + break; + } + + if(rc != OMPI_SUCCESS) { + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item); + return rc; + } + + rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data, + base, bound - base + 1, rdma_reg); + + if(rc != OMPI_SUCCESS) { + mpool->rcache->rcache_delete(mpool->rcache, rdma_reg); + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item); + return rc; + } + + *reg = rdma_reg; + (*reg)->ref_count++; + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + return OMPI_SUCCESS; +} + + +/** + * realloc function + */ +void* 
mca_mpool_rdma_realloc(mca_mpool_base_module_t *mpool, void *addr, + size_t size, mca_mpool_base_registration_t **reg) +{ + mca_mpool_base_registration_t *old_reg = *reg; + void *new_mem = mca_mpool_rdma_alloc(mpool, size, 0, old_reg->flags, reg); + memcpy(new_mem, addr, old_reg->bound - old_reg->base + 1); + mca_mpool_rdma_free(mpool, addr, old_reg); + + return new_mem; +} + +/** + * free function + */ +void mca_mpool_rdma_free(mca_mpool_base_module_t *mpool, void *addr, + mca_mpool_base_registration_t *registration) +{ + mca_mpool_rdma_deregister(mpool, registration); + free(registration->alloc_base); +} + +int mca_mpool_rdma_find(struct mca_mpool_base_module_t *mpool, void *addr, + size_t size, mca_mpool_base_registration_t **reg) +{ + mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool; + int rc; + unsigned char *base, *bound; + + base = down_align_addr(addr, mca_mpool_base_page_size_log); + bound = up_align_addr((void*)((char*) addr + size - 1), + mca_mpool_base_page_size_log); + + OPAL_THREAD_LOCK(&mpool->rcache->lock); + rc = mpool->rcache->rcache_find(mpool->rcache, addr, size, reg); + if(*reg != NULL && + (mca_mpool_rdma_component.leave_pinned || + ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) || + ((*reg)->base == base && (*reg)->bound == bound))) { + assert(((void*)(*reg)->bound) >= addr); + if(0 == (*reg)->ref_count && + mca_mpool_rdma_component.leave_pinned) { + opal_list_remove_item(&mpool_rdma->mru_list, + (opal_list_item_t*)(*reg)); + } + mpool_rdma->stat_cache_found++; + (*reg)->ref_count++; + } else { + mpool_rdma->stat_cache_notfound++; + } + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + + return rc; +} + +int mca_mpool_rdma_deregister(struct mca_mpool_base_module_t *mpool, + mca_mpool_base_registration_t *reg) +{ + mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool; + int rc = OMPI_SUCCESS; + assert(reg->ref_count > 0); + + OPAL_THREAD_LOCK(&mpool->rcache->lock); + reg->ref_count--; + if(reg->ref_count > 0) { + 
OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + return OMPI_SUCCESS; + } + if(mca_mpool_rdma_component.leave_pinned && + !(reg->flags & (MCA_MPOOL_FLAGS_CACHE_BYPASS|MCA_MPOOL_FLAGS_PERSIST))) { + /* if leave_pinned is set don't deregister memory, but put it + * on MRU list for future use */ + opal_list_prepend(&mpool_rdma->mru_list, (opal_list_item_t*)reg); + } else { + rc = dereg_mem(mpool, reg); + if(OMPI_SUCCESS == rc) { + if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) + mpool->rcache->rcache_delete(mpool->rcache, reg); + OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, + (ompi_free_list_item_t*)reg); + } + } + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + + return rc; +} + +int mca_mpool_rdma_release_memory(struct mca_mpool_base_module_t *mpool, + void *base, size_t size) +{ + mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool; + mca_mpool_base_registration_t *reg; + ompi_pointer_array_t regs; + int reg_cnt, i, err = 0; + + OBJ_CONSTRUCT(&regs, ompi_pointer_array_t); + + OPAL_THREAD_LOCK(&mpool->rcache->lock); + reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, base, size, &regs); + + for(i = 0; i < reg_cnt; i++) { + reg = (mca_mpool_base_registration_t*) + ompi_pointer_array_get_item(&regs, i); + + if(0 == reg->ref_count) { + if(dereg_mem(mpool, reg) != OMPI_SUCCESS) { + err++; + continue; + } + } else { + /* remove registration from cache and wait for ref_count goes to + * zero before unregister memory. Note that our registered memory + * statistic can go wrong at this point, but it is better than + * potential memory corruption.
And we return error in this case to + * the caller */ + reg->flags |= MCA_MPOOL_FLAGS_CACHE_BYPASS; + err++; /* tell caller that something was wrong */ + } + mpool->rcache->rcache_delete(mpool->rcache, reg); + if(0 == reg->ref_count) { + opal_list_remove_item(&mpool_rdma->mru_list, + (opal_list_item_t*)reg); + OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, + (ompi_free_list_item_t*)reg); + } + } + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + ompi_pointer_array_remove_all(&regs); + + return err?OMPI_ERROR:OMPI_SUCCESS; +} + +void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool) +{ + mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool; + opal_output(0, "[%lu,%lu,%lu] rdma: stats " + "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n", + ORTE_NAME_ARGS(orte_process_info.my_name), + mpool_rdma->stat_cache_hit, mpool_rdma->stat_cache_miss, + mpool_rdma->stat_cache_found, mpool_rdma->stat_cache_notfound, + mpool_rdma->stat_evicted); +} diff --git a/ompi/mca/mpool/sm/mpool_sm_module.c b/ompi/mca/mpool/sm/mpool_sm_module.c index 6318133bac..023d1ea00c 100644 --- a/ompi/mca/mpool/sm/mpool_sm_module.c +++ b/ompi/mca/mpool/sm/mpool_sm_module.c @@ -36,6 +36,7 @@ void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool) mpool->super.mpool_find = NULL; mpool->super.mpool_register = NULL; mpool->super.mpool_deregister = NULL; + mpool->super.mpool_release_memory = NULL; mpool->super.mpool_finalize = NULL; mpool->super.flags = 0; } diff --git a/ompi/mca/mpool/udapl/.ompi_ignore b/ompi/mca/mpool/udapl/.ompi_ignore new file mode 100644 index 0000000000..60bf58b271 --- /dev/null +++ b/ompi/mca/mpool/udapl/.ompi_ignore @@ -0,0 +1 @@ +quilt diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c index 28c7fda0db..68797507cd 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c @@ -42,10 +42,8 @@ size_t mca_pml_ob1_rdma_btls( mca_pml_ob1_rdma_btl_t* rdma_btls) { size_t num_btls =
mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); - ompi_pointer_array_t regs; size_t num_btls_used = 0; size_t n; - int rc; /* shortcut when there are no rdma capable btls */ if(num_btls == 0) { @@ -53,223 +51,34 @@ size_t mca_pml_ob1_rdma_btls( } /* check to see if memory is registered */ - OBJ_CONSTRUCT(&regs, ompi_pointer_array_t); - for(n = 0; n < num_btls && num_btls_used < MCA_PML_OB1_MAX_RDMA_PER_REQUEST; n++) { - - mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n); - mca_mpool_base_registration_t* fit = NULL; - mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool; - uint32_t reg_cnt; - size_t r; + for(n = 0; n < num_btls && num_btls_used < MCA_PML_OB1_MAX_RDMA_PER_REQUEST; + n++) { + mca_bml_base_btl_t* bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n); + mca_mpool_base_registration_t* reg = NULL; + mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool; /* btl is rdma capable and registration is not required */ if(NULL == btl_mpool) { - rdma_btls[num_btls_used].bml_btl = bml_btl; - rdma_btls[num_btls_used].btl_reg = NULL; - num_btls_used++; - continue; - } - - /* look through existing registrations */ - ompi_pointer_array_remove_all(&regs); - btl_mpool->mpool_find(btl_mpool, - base, - size, - &regs, - &reg_cnt); - - /* - * find the best fit when there are multiple registrations - */ - for(r = 0; r < reg_cnt; r++) { - mca_mpool_base_registration_t* reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(&regs, r); - size_t reg_len = reg->bound - base + 1; + reg = NULL; + } else { + if(!mca_pml_ob1.leave_pinned) { + /* look through existing registrations */ + btl_mpool->mpool_find(btl_mpool, base, size, &reg); + } else { + /* register the memory */ + btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg); + } - if(reg->flags & MCA_MPOOL_FLAGS_CACHE) { - assert(reg->ref_count >= 3); - } - if(reg->base <= base && reg_len >= size) { - fit = reg; - } else if(mca_pml_ob1.leave_pinned){ -
btl_mpool->mpool_deregister(btl_mpool, reg); - } else { - btl_mpool->mpool_release(btl_mpool, reg); - } - - } - - - /* if the leave pinned option is set - and there is not an existing - * registration that satisfies this request, create one. - */ - if(NULL == fit && mca_pml_ob1.leave_pinned) { - /* register the memory */ - rc = btl_mpool->mpool_register( - btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &fit); - if(ORTE_SUCCESS != rc || NULL == fit) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - continue; - } - assert(fit->ref_count == 3); + if(NULL == reg) + bml_btl = NULL; /* skip it */ } - - if(NULL != fit) { + if(bml_btl != NULL) { rdma_btls[num_btls_used].bml_btl = bml_btl; - rdma_btls[num_btls_used].btl_reg = fit; + rdma_btls[num_btls_used].btl_reg = reg; num_btls_used++; } } return num_btls_used; } - - -/* - * For a given btl - find the best fit registration or - * optionally create one for leave pinned. - */ - -mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration( - mca_bml_base_btl_t* bml_btl, - unsigned char* base, - size_t size) -{ - ompi_pointer_array_t regs; - mca_mpool_base_registration_t* fit = NULL; - mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool; - uint32_t reg_cnt; - size_t r; - int rc; - - /* btl is rdma capable and registration is not required */ - if(NULL == btl_mpool) { - return NULL; - } - - /* check to see if memory is registered */ - OBJ_CONSTRUCT(®s, ompi_pointer_array_t); - ompi_pointer_array_remove_all(®s); - - /* look through existing registrations */ - btl_mpool->mpool_find(btl_mpool, - base, - size, - ®s, - ®_cnt); - - - /* - * find the best fit when there are multiple registrations - */ - for(r = 0; r < reg_cnt; r++) { - mca_mpool_base_registration_t* reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(®s, r); - size_t reg_len = reg->bound - base + 1; - - if(reg->flags & MCA_MPOOL_FLAGS_CACHE) { - assert(reg->ref_count >= 3); - } - if(reg->base 
<= base && reg_len >= size) { - fit = reg; - } else if(mca_pml_ob1.leave_pinned){ - btl_mpool->mpool_deregister(btl_mpool, reg); - } else { - btl_mpool->mpool_release(btl_mpool, reg); - } - } - - - /* if the leave pinned option is set - and there is not an existing - * registration that satisfies this request, create one. - */ - if(NULL == fit && mca_pml_ob1.leave_pinned) { - /* register the memory */ - rc = btl_mpool->mpool_register( - btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &fit); - if(ORTE_SUCCESS != rc || NULL == fit) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - return NULL; - } - assert(fit->ref_count == 3); - } - - OBJ_DESTRUCT(®s); - return fit; -} - - - -/* - * For a given btl - find the best fit registration or - * optionally create one for leave pinned. - */ - -mca_mpool_base_registration_t* mca_pml_ob1_rdma_register( - mca_bml_base_btl_t* bml_btl, - unsigned char* base, - size_t size) -{ - ompi_pointer_array_t regs; - mca_mpool_base_registration_t* fit = NULL; - mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool; - uint32_t reg_cnt; - size_t r; - int rc; - - /* btl is rdma capable and registration is not required */ - if(NULL == btl_mpool) { - return NULL; - } - - /* check to see if memory is registered */ - OBJ_CONSTRUCT(®s, ompi_pointer_array_t); - ompi_pointer_array_remove_all(®s); - - /* look through existing registrations */ - btl_mpool->mpool_find(btl_mpool, - base, - size, - ®s, - ®_cnt); - - - /* - * find the best fit when there are multiple registrations - */ - for(r = 0; r < reg_cnt; r++) { - mca_mpool_base_registration_t* reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(®s, r); - size_t reg_len = reg->bound - base + 1; - if(reg->base <= base && reg_len >= size) { - fit = reg; - } else { - btl_mpool->mpool_deregister(btl_mpool, reg); - } - } - - - /* if the leave pinned option is set - and there is not an existing - * registration that satisfies this 
request, create one. - */ - if(NULL == fit) { - /* register the memory */ - rc = btl_mpool->mpool_register( - btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &fit); - if(ORTE_SUCCESS != rc || NULL == fit) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - return NULL; - } - } - OBJ_DESTRUCT(®s); - return fit; -} diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.h b/ompi/mca/pml/ob1/pml_ob1_rdma.h index bdefd692ff..d2c983ccee 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.h +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.h @@ -43,31 +43,8 @@ typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t; * find those that already have registrations - or * register if required (for leave_pinned option) */ - -size_t mca_pml_ob1_rdma_btls( - struct mca_bml_base_endpoint_t* endpoint, - unsigned char* base, - size_t size, - struct mca_pml_ob1_rdma_btl_t* btls); - -/* - * For a given rdma capable btl - find the best fit - * registration or create one for leave pinned. 
- */ - -mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration( - struct mca_bml_base_btl_t* bml_btl, - unsigned char* base, - size_t size); - -/* - * Create a registration - */ - -mca_mpool_base_registration_t* mca_pml_ob1_rdma_register( - struct mca_bml_base_btl_t* bml_btl, - unsigned char* base, - size_t size); +size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint, + unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls); #endif diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index cd5fc6ecfb..66876c6970 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -257,38 +257,9 @@ static int mca_pml_ob1_recv_request_ack( if (hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_PIN && recvreq->req_rdma_cnt != 0) { - /* start rdma at current fragment offset - no need to ack */ recvreq->req_rdma_offset = bytes_received; - return OMPI_SUCCESS; - } - /* are rdma devices available for long rdma protocol */ - if( mca_pml_ob1.leave_pinned_pipeline && - hdr->hdr_msg_length > bml_endpoint->btl_rdma_size && - mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma) ) { - char* base; - char* align; - ptrdiff_t lb; - - /* round this up/down to the next aligned address */ - ompi_ddt_type_lb(recvreq->req_recv.req_convertor.pDesc, &lb); - base = recvreq->req_recv.req_convertor.pBaseBuf + lb; - align = (char*)up_align_addr(base, bml_endpoint->btl_rdma_align)+1; - recvreq->req_rdma_offset = align - base; - - /* still w/in range */ - if(recvreq->req_rdma_offset < bytes_received) { - recvreq->req_rdma_offset = bytes_received; - } - if(recvreq->req_rdma_offset > hdr->hdr_msg_length) { - recvreq->req_rdma_offset = hdr->hdr_msg_length; - } else { - ompi_convertor_set_position( &recvreq->req_recv.req_convertor, - &recvreq->req_rdma_offset ); - } - /* are rdma devices available for long rdma protocol */ - } else if (!mca_pml_ob1.leave_pinned_pipeline && - 
bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length && + } else if (bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length && mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) { /* use convertor to figure out the rdma offset for this request */ @@ -300,6 +271,9 @@ static int mca_pml_ob1_recv_request_ack( &recvreq->req_rdma_offset ); } } + /* start rdma at current fragment offset - no need to ack */ + if(recvreq->req_rdma_offset == bytes_received) + return OMPI_SUCCESS; } /* let know to shedule function there is no need to put ACK flag */ recvreq->req_ack_sent = true; @@ -359,7 +333,6 @@ int mca_pml_ob1_recv_request_get_frag( mca_bml_base_endpoint_t* bml_endpoint = frag->rdma_ep; mca_bml_base_btl_t* bml_btl; mca_btl_base_descriptor_t* descriptor; - mca_mpool_base_registration_t* reg; size_t save_size = frag->rdma_length; int rc; @@ -370,21 +343,10 @@ int mca_pml_ob1_recv_request_get_frag( orte_errmgr.abort(); } - /* is there an existing registration for this btl */ - reg = mca_pml_ob1_rdma_registration( - bml_btl, - (unsigned char*)recvreq->req_recv.req_base.req_addr, - recvreq->req_recv.req_bytes_packed); - if(NULL != reg) { - recvreq->req_rdma[0].bml_btl = bml_btl; - recvreq->req_rdma[0].btl_reg = reg; - recvreq->req_rdma_cnt = 1; - } - /* prepare descriptor */ mca_bml_base_prepare_dst( bml_btl, - reg, + NULL, &recvreq->req_recv.req_convertor, 0, &frag->rdma_length, @@ -622,7 +584,6 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec mca_btl_base_descriptor_t* ctl; mca_mpool_base_registration_t * reg = NULL; int rc; - bool release = false; if(prev_bytes_remaining == bytes_remaining) { if( ++num_fail == num_tries ) { @@ -689,29 +650,9 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec size = bml_btl->btl_max_rdma_size; } - if(0 == recvreq->req_rdma_cnt) { - char* base; - ptrdiff_t lb; - - if(mca_pml_ob1.leave_pinned_pipeline) { - /* lookup and/or create a cached registration */ - 
ompi_ddt_type_lb(recvreq->req_recv.req_convertor.pDesc, - &lb); - base = recvreq->req_recv.req_convertor.pBaseBuf + lb + - recvreq->req_rdma_offset; - reg = mca_pml_ob1_rdma_register(bml_btl, - (unsigned char*)base, size); - release = true; - } - } - /* prepare a descriptor for RDMA */ mca_bml_base_prepare_dst(bml_btl, reg, &recvreq->req_recv.req_convertor, 0, &size, &dst); - if(reg && release == true && NULL != bml_btl->btl_mpool) { - bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg); - } - if(dst == NULL) { continue; } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index aaf1abbc17..be2b8cb160 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -135,7 +135,7 @@ do { for( r = 0; r < recvreq->req_rdma_cnt; r++ ) { \ mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[r].btl_reg; \ if( NULL != btl_reg ) { \ - btl_reg->mpool->mpool_release( btl_reg->mpool, btl_reg ); \ + btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); \ } \ } \ recvreq->req_rdma_cnt = 0; \ diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 78bb1c9da3..9e9c704a94 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -652,7 +652,7 @@ int mca_pml_ob1_send_request_start_rdma( bml_btl->btl_flags & MCA_BTL_FLAGS_GET) { size_t old_position = sendreq->req_send.req_convertor.bConverted; - /* prepare source descriptor/segment(s) */ + /* prepare source descriptor/segment(s) */ mca_bml_base_prepare_src( bml_btl, reg, @@ -846,6 +846,7 @@ int mca_pml_ob1_send_request_start_rndv( des->des_cbdata = sendreq; des->des_cbfunc = mca_pml_ob1_rndv_completion; sendreq->req_send_offset = size; + sendreq->req_rdma_offset = size; /* send */ rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); @@ -1023,15 +1024,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, /* check for request completion */ if( 
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length) >= sendreq->req_send.req_bytes_packed) { - /* bump up the req_state after the last fin was sent.. - if rndv completion occurs after this (can happen!) then - the rndv completion will properly clean up after the request - we can't just do this on the first RDMA PUT + ACK ctl message in - mca_pml_ob1_send_request_put because then we might fall into sender - side scheduleing (pml pipeline protocol) */ - if(true == sendreq->req_got_put_ack) { - MCA_PML_OB1_SEND_REQUEST_ADVANCE_NO_SCHEDULE(sendreq); - } + /* if we've got completion on rndv packet */ if (sendreq->req_state == 2) { MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq); @@ -1058,7 +1051,6 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag ) size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset; size_t i, save_size = frag->rdma_length; int rc; - bool release = false; bml_btl = mca_bml_base_btl_array_find(&frag->rdma_ep->btl_rdma, frag->rdma_btl); @@ -1074,16 +1066,6 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag ) /* set convertor at current offset */ ompi_convertor_set_position(&sendreq->req_send.req_convertor, &offset); - /* if registration doesnt exist - create one */ - if (mca_pml_ob1.leave_pinned_pipeline && reg == NULL) { - unsigned char* base; - ptrdiff_t lb; - ompi_ddt_type_lb(sendreq->req_send.req_convertor.pDesc, &lb); - base = (unsigned char*)sendreq->req_send.req_convertor.pBaseBuf + lb + offset; - reg = mca_pml_ob1_rdma_register(bml_btl, base, frag->rdma_length); - release = true; - } - /* setup descriptor */ mca_bml_base_prepare_src( bml_btl, reg, @@ -1092,10 +1074,6 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag ) &frag->rdma_length, &des ); - if(reg && release == true && bml_btl->btl_mpool) { - bml_btl->btl_mpool->mpool_release(bml_btl->btl_mpool, reg); - } - if(NULL == des) { frag->rdma_length = save_size; OPAL_THREAD_LOCK(&mca_pml_ob1.lock); @@ 
-1148,7 +1126,7 @@ void mca_pml_ob1_send_request_put( size_t i, size = 0; if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) { - sendreq->req_got_put_ack = true; + MCA_PML_OB1_SEND_REQUEST_ADVANCE_NO_SCHEDULE(sendreq); } MCA_PML_OB1_RDMA_FRAG_ALLOC(frag, rc); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index a1e7f40476..906d418fb4 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -55,7 +55,6 @@ struct mca_pml_ob1_send_request_t { size_t req_bytes_delivered; size_t req_send_offset; size_t req_rdma_offset; - bool req_got_put_ack; mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST]; uint32_t req_rdma_cnt; mca_pml_ob1_send_pending_t req_pending; @@ -116,7 +115,7 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s for(r = 0; r < sendreq->req_rdma_cnt; r++) { mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; if( NULL != reg ) { - reg->mpool->mpool_release(reg->mpool, reg); + reg->mpool->mpool_deregister(reg->mpool, reg); } } sendreq->req_rdma_cnt = 0; @@ -359,7 +358,6 @@ static inline int mca_pml_ob1_send_request_start( sendreq->req_pipeline_depth = 0; sendreq->req_bytes_delivered = 0; sendreq->req_send_offset = 0; - sendreq->req_got_put_ack = false; sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_NONE; sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32( &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1); diff --git a/ompi/mca/rcache/rb/.ompi_ignore b/ompi/mca/rcache/rb/.ompi_ignore new file mode 100644 index 0000000000..60bf58b271 --- /dev/null +++ b/ompi/mca/rcache/rb/.ompi_ignore @@ -0,0 +1 @@ +quilt diff --git a/ompi/mca/rcache/rcache.h b/ompi/mca/rcache/rcache.h index 0e377cf4a7..d7cf69718b 100644 --- a/ompi/mca/rcache/rcache.h +++ b/ompi/mca/rcache/rcache.h @@ -35,34 +35,26 @@ typedef struct mca_rcache_base_module_t* (*mca_rcache_base_component_init_fn_t)( typedef int 
(*mca_rcache_base_module_find_fn_t) ( - struct mca_rcache_base_module_t* rcache, - void* addr, - size_t size, - ompi_pointer_array_t *regs, - uint32_t *cnt - ); + struct mca_rcache_base_module_t* rcache, void* addr, size_t size, + mca_mpool_base_registration_t **reg); -typedef int (*mca_rcache_base_module_insert_fn_t)( - struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* registration, - uint32_t flags - ); +typedef int (*mca_rcache_base_module_find_all_fn_t)( + struct mca_rcache_base_module_t* rcache, void* addr, size_t size, + ompi_pointer_array_t *regs); -typedef int (*mca_rcache_base_module_delete_fn_t) ( - struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* registration, - uint32_t flags - ); +typedef int (*mca_rcache_base_module_insert_fn_t)( + struct mca_rcache_base_module_t* rcache, + mca_mpool_base_registration_t* registration, size_t limit); +typedef int (*mca_rcache_base_module_delete_fn_t)( + struct mca_rcache_base_module_t* rcache, + mca_mpool_base_registration_t* registration); /** * finalize */ typedef void (*mca_rcache_base_module_finalize_fn_t)( - struct mca_rcache_base_module_t* - ); - - + struct mca_rcache_base_module_t*); /** * rcache component descriptor. 
Contains component version information and @@ -83,19 +75,16 @@ typedef struct mca_rcache_base_component_1_0_0_t mca_rcache_base_component_t; /** * rcache module descriptor */ -struct mca_rcache_base_module_t { - mca_rcache_base_component_t *rcache_component; /**< component struct */ - mca_rcache_base_module_find_fn_t rcache_find; - mca_rcache_base_module_insert_fn_t rcache_insert; - mca_rcache_base_module_delete_fn_t rcache_delete; - mca_rcache_base_module_finalize_fn_t rcache_finalize; +struct mca_rcache_base_module_t { + mca_rcache_base_component_t *rcache_component; /**< component struct */ + mca_rcache_base_module_find_fn_t rcache_find; + mca_rcache_base_module_find_all_fn_t rcache_find_all; + mca_rcache_base_module_insert_fn_t rcache_insert; + mca_rcache_base_module_delete_fn_t rcache_delete; + mca_rcache_base_module_finalize_fn_t rcache_finalize; opal_mutex_t lock; -}; -typedef struct mca_rcache_base_module_t mca_rcache_base_module_t; - - - - +}; +typedef struct mca_rcache_base_module_t mca_rcache_base_module_t; /** * Macro for use in components that are of type rcache v1.0.0 diff --git a/ompi/mca/rcache/vma/Makefile.am b/ompi/mca/rcache/vma/Makefile.am index 776d96b6b9..745a5ce120 100644 --- a/ompi/mca/rcache/vma/Makefile.am +++ b/ompi/mca/rcache/vma/Makefile.am @@ -25,9 +25,7 @@ sources = \ rcache_vma.h \ rcache_vma_component.c \ rcache_vma_tree.c \ - rcache_vma_tree.h \ - rcache_vma_mru.c \ - rcache_vma_mru.h + rcache_vma_tree.h # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/ompi/mca/rcache/vma/rcache_vma.c b/ompi/mca/rcache/vma/rcache_vma.c index 9f7a55a653..9d69a989df 100644 --- a/ompi/mca/rcache/vma/rcache_vma.c +++ b/ompi/mca/rcache/vma/rcache_vma.c @@ -22,7 +22,6 @@ #include "ompi/mca/rcache/rcache.h" #include "rcache_vma.h" #include "rcache_vma_tree.h" -#include "rcache_vma_mru.h" #include "opal/util/output.h" #include "ompi/mca/mpool/base/base.h" @@ -34,26 +33,18 @@ extern 
unsigned int mca_mpool_base_page_size_log; */ void mca_rcache_vma_module_init( mca_rcache_vma_module_t* rcache ) { - rcache->base.rcache_find = mca_rcache_vma_find; + rcache->base.rcache_find_all = mca_rcache_vma_find_all; rcache->base.rcache_insert = mca_rcache_vma_insert; rcache->base.rcache_delete = mca_rcache_vma_delete; rcache->base.rcache_finalize = mca_rcache_vma_finalize; OBJ_CONSTRUCT(&rcache->base.lock, opal_mutex_t); mca_rcache_vma_tree_init(rcache); - mca_rcache_vma_mru_init(rcache); } -int mca_rcache_vma_find ( - struct mca_rcache_base_module_t* rcache, - void* addr, - size_t size, - ompi_pointer_array_t* regs, - uint32_t *cnt - ){ - - int rc = OMPI_SUCCESS; - mca_mpool_base_registration_t *reg; +int mca_rcache_vma_find(struct mca_rcache_base_module_t* rcache, + void* addr, size_t size, mca_mpool_base_registration_t **reg) +{ void* base_addr; void* bound_addr; @@ -61,108 +52,56 @@ int mca_rcache_vma_find ( return OMPI_ERROR; } - OPAL_THREAD_LOCK(&rcache->lock); - *cnt = 0; - base_addr = down_align_addr(addr, mca_mpool_base_page_size_log); bound_addr = up_align_addr((void*) ((unsigned long) addr + size - 1), mca_mpool_base_page_size_log); - reg = mca_rcache_vma_tree_find((mca_rcache_vma_module_t*)rcache, base_addr, + *reg = mca_rcache_vma_tree_find((mca_rcache_vma_module_t*)rcache, base_addr, bound_addr); - if (reg != NULL) { - ompi_pointer_array_add(regs, (void*) reg); - if(reg->flags & MCA_MPOOL_FLAGS_CACHE) { - rc = mca_rcache_vma_mru_touch((mca_rcache_vma_module_t*)rcache, reg); - if(OMPI_SUCCESS != rc) { - OPAL_THREAD_UNLOCK(&rcache->lock); - return OMPI_ERROR; - } - } - - OPAL_THREAD_ADD32((int32_t*) ®->ref_count, 1); - (*cnt)++; - assert(((void*)reg->bound) >= addr); - } - OPAL_THREAD_UNLOCK(&rcache->lock); return OMPI_SUCCESS; } -int mca_rcache_vma_insert ( - struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* reg, - uint32_t flags - ) { +int mca_rcache_vma_find_all(struct mca_rcache_base_module_t* rcache, + void* addr, 
size_t size, ompi_pointer_array_t *regs) +{ + void *base_addr, *bound_addr; + + if(size == 0) { + return OMPI_ERROR; + } + + base_addr = down_align_addr(addr, mca_mpool_base_page_size_log); + bound_addr = up_align_addr((void*) ((unsigned long) addr + size - 1), mca_mpool_base_page_size_log); + + return mca_rcache_vma_tree_find_all((mca_rcache_vma_module_t*)rcache, + base_addr, bound_addr, regs); +} + +int mca_rcache_vma_insert(struct mca_rcache_base_module_t* rcache, + mca_mpool_base_registration_t* reg, size_t limit) +{ size_t reg_size = reg->bound - reg->base + 1; - mca_mpool_base_registration_t* old_reg; + mca_rcache_vma_module_t *vma_rcache = (mca_rcache_vma_module_t*)rcache; - OPAL_THREAD_LOCK(&rcache->lock); - - if((flags & MCA_MPOOL_FLAGS_CACHE) && - reg_size > ((mca_rcache_vma_module_t*)rcache)->reg_max_mru_size) - { - OPAL_THREAD_UNLOCK(&rcache->lock); - /* if the registration is too big for the rcache, - don't cache it and reset the flags so the upper level - handles things appropriatly */ - reg->flags = 0; - return OMPI_SUCCESS; + if(limit != 0 && reg_size > limit) { + /* return out of resources if request is bigger than cache size + * return temp out of resources otherwise */ + return OMPI_ERR_OUT_OF_RESOURCE; } - reg->flags = flags; - - while(mca_rcache_vma_tree_insert((mca_rcache_vma_module_t*)rcache, reg) == - OMPI_ERR_TEMP_OUT_OF_RESOURCE) { - /* call deregister - which removes the registration from - * the tree and mru list. memory will be deregistered when - * the reference count goes to zero. 
- */ - old_reg = (mca_mpool_base_registration_t*)opal_list_get_first(&((mca_rcache_vma_module_t*)rcache)->mru_list); - /* we need to retain first, because we only want the registration - removed from the tree and the mru */ - old_reg->mpool->mpool_retain(old_reg->mpool, old_reg); - old_reg->mpool->mpool_deregister(old_reg->mpool, old_reg); - } - OPAL_THREAD_ADD32((int32_t*) ®->ref_count, 1); - - if(flags & MCA_MPOOL_FLAGS_CACHE) { - mca_rcache_vma_mru_insert((mca_rcache_vma_module_t*)rcache, reg); - OPAL_THREAD_ADD32((int32_t*)®->ref_count, 1); - } - OPAL_THREAD_UNLOCK(&rcache->lock); - - return OMPI_SUCCESS; + return mca_rcache_vma_tree_insert(vma_rcache, reg, limit); } -int mca_rcache_vma_delete ( - struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* reg, - uint32_t flags - ) { - int rc = OMPI_SUCCESS; - assert(reg->ref_count >= 1); - OPAL_THREAD_LOCK(&rcache->lock); - if(flags & MCA_MPOOL_FLAGS_CACHE) { - assert(reg->ref_count >= 2); - OPAL_THREAD_ADD32((int32_t*)®->ref_count, -1); - rc = mca_rcache_vma_mru_delete((mca_rcache_vma_module_t*)rcache, reg); - } - if(OMPI_SUCCESS != rc) { - OPAL_THREAD_UNLOCK(&rcache->lock); - return rc; - } - reg->flags = 0; - OPAL_THREAD_ADD32((int32_t*)®->ref_count, -1); - rc = mca_rcache_vma_tree_delete((mca_rcache_vma_module_t*)rcache, reg ); - OPAL_THREAD_UNLOCK(&rcache->lock); - return rc; +int mca_rcache_vma_delete(struct mca_rcache_base_module_t* rcache, + mca_mpool_base_registration_t* reg) +{ + mca_rcache_vma_module_t *vma_rcache = (mca_rcache_vma_module_t*)rcache; + return mca_rcache_vma_tree_delete(vma_rcache, reg); } /** * finalize */ -void mca_rcache_vma_finalize( - struct mca_rcache_base_module_t* rcache - ) { - +void mca_rcache_vma_finalize(struct mca_rcache_base_module_t* rcache) +{ } diff --git a/ompi/mca/rcache/vma/rcache_vma.h b/ompi/mca/rcache/vma/rcache_vma.h index 7b89626562..9474944a50 100644 --- a/ompi/mca/rcache/vma/rcache_vma.h +++ b/ompi/mca/rcache/vma/rcache_vma.h @@ -34,57 +34,41 
@@ struct mca_rcache_vma_module_t { mca_rcache_base_module_t base; ompi_rb_tree_t rb_tree; opal_list_t vma_list; - opal_list_t mru_list; - size_t reg_mru_len; - size_t reg_max_mru_size; - size_t reg_cur_mru_size; - + size_t reg_cur_cache_size; }; typedef struct mca_rcache_vma_module_t mca_rcache_vma_module_t; struct mca_rcache_vma_component_t { mca_rcache_base_component_t super; - size_t reg_mru_len; - size_t reg_max_mru_size; }; typedef struct mca_rcache_vma_component_t mca_rcache_vma_component_t; OMPI_DECLSPEC extern mca_rcache_vma_component_t mca_rcache_vma_component; -void mca_rcache_vma_module_init( mca_rcache_vma_module_t* rcache ); +void mca_rcache_vma_module_init(mca_rcache_vma_module_t* rcache); -int mca_rcache_vma_find ( - mca_rcache_base_module_t* rcache, - void* addr, - size_t size, - ompi_pointer_array_t* regs, - uint32_t *cnt - ); +int mca_rcache_vma_find(mca_rcache_base_module_t* rcache, void* addr, + size_t size, mca_mpool_base_registration_t **reg); -int mca_rcache_vma_insert ( - struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* registration, - uint32_t flags - ); +int mca_rcache_vma_find_all(mca_rcache_base_module_t* rcache, void* addr, + size_t size, ompi_pointer_array_t *regs); -int mca_rcache_vma_delete ( - struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* registration, - uint32_t flags - ); +int mca_rcache_vma_insert(struct mca_rcache_base_module_t* rcache, + mca_mpool_base_registration_t* registration, size_t limit); + +int mca_rcache_vma_delete(struct mca_rcache_base_module_t* rcache, + mca_mpool_base_registration_t* registration); /** * init/finalize */ -void mca_rcache_vma_module_init( mca_rcache_vma_module_t* rcache ); +void mca_rcache_vma_module_init(mca_rcache_vma_module_t *rcache); -void mca_rcache_vma_finalize( - struct mca_rcache_base_module_t* - ); +void mca_rcache_vma_finalize(struct mca_rcache_base_module_t*); #endif /* MCA_RCACHE_VMA_H */ diff --git 
a/ompi/mca/rcache/vma/rcache_vma_component.c b/ompi/mca/rcache/vma/rcache_vma_component.c index c26c45c012..3600f41c1f 100644 --- a/ompi/mca/rcache/vma/rcache_vma_component.c +++ b/ompi/mca/rcache/vma/rcache_vma_component.c @@ -44,22 +44,6 @@ mca_rcache_vma_component_t mca_rcache_vma_component = { static int mca_rcache_vma_component_open(void) { - mca_base_param_reg_int(&mca_rcache_vma_component.super.rcache_version, - "mru_len", - "The maximum size IN ENTRIES of the MRU (most recently used) rcache list", - false, - false, - 256, - (int*)&(mca_rcache_vma_component.reg_mru_len)); - - mca_base_param_reg_int(&mca_rcache_vma_component.super.rcache_version, - "mru_size", - "The maximum size IN BYTES of the MRU (most recently used) rcache list", - false, - false, - 1*1024*1024*1024, /* default to 1GB? */ - (int*)&(mca_rcache_vma_component.reg_max_mru_size)); - return OMPI_SUCCESS; } @@ -68,8 +52,6 @@ mca_rcache_base_module_t* mca_rcache_vma_component_init(void) { rcache = (mca_rcache_vma_module_t*) malloc(sizeof(mca_rcache_vma_module_t)); mca_rcache_vma_module_init(rcache); - rcache->reg_mru_len = mca_rcache_vma_component.reg_mru_len; - rcache->reg_max_mru_size = mca_rcache_vma_component.reg_max_mru_size; return &rcache->base; } diff --git a/ompi/mca/rcache/vma/rcache_vma_mru.c b/ompi/mca/rcache/vma/rcache_vma_mru.c deleted file mode 100644 index 98d758653c..0000000000 --- a/ompi/mca/rcache/vma/rcache_vma_mru.c +++ /dev/null @@ -1,98 +0,0 @@ -/** - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * - * Copyright (c) 2006 Voltaire. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * Description of the Registration Cache framework - */ - -#include "opal/mca/mca.h" -#include "rcache_vma_mru.h" -#include "ompi/mca/mpool/mpool.h" - -/* - * initialize the vma mru - */ -int mca_rcache_vma_mru_init(mca_rcache_vma_module_t* rcache){ - OBJ_CONSTRUCT(&rcache->mru_list, opal_list_t); - return OMPI_SUCCESS; -} - -/* - * insert an item in the vma mru - */ -int mca_rcache_vma_mru_insert( - mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t* reg - ) { - mca_mpool_base_registration_t* old_reg; - - if(rcache->reg_mru_len <= rcache->mru_list.opal_list_length) { - /* call deregister - which removes the registration from - * the tree and mru list. memory will be deregistered when - * the reference count goes to zero. - */ - old_reg = (mca_mpool_base_registration_t*) - opal_list_get_first(&rcache->mru_list); - /* we need to retain first, because we only want the registration - removed from the tree and the mru */ - old_reg->mpool->mpool_retain(old_reg->mpool, old_reg); - old_reg->mpool->mpool_deregister(old_reg->mpool, old_reg); - } - - opal_list_append(&rcache->mru_list,(opal_list_item_t*) reg); - - return OMPI_SUCCESS; -} - -/* - * remove an item from the vma mru - */ -int mca_rcache_vma_mru_delete( - mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t *reg - ){ - int rc; - if(NULL == opal_list_remove_item(&rcache->mru_list, - (opal_list_item_t*)reg)) { - rc = OMPI_ERROR; - } else { - rc = OMPI_SUCCESS; - } - return rc; -} - -/* - * touch an item in the mru list - */ -int mca_rcache_vma_mru_touch( - mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t* reg - ){ - int rc; - if(NULL == opal_list_remove_item(&rcache->mru_list, - (opal_list_item_t*)reg)) { - rc = OMPI_ERROR; - } else { - opal_list_append(&rcache->mru_list, (opal_list_item_t*)reg); - rc = OMPI_SUCCESS; - 
} - return rc; -} diff --git a/ompi/mca/rcache/vma/rcache_vma_mru.h b/ompi/mca/rcache/vma/rcache_vma_mru.h deleted file mode 100644 index c6021cda51..0000000000 --- a/ompi/mca/rcache/vma/rcache_vma_mru.h +++ /dev/null @@ -1,62 +0,0 @@ - -/** - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * - * Copyright (c) 2006 Voltaire. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * Description of the Registration Cache framework - */ -#ifndef MCA_RCACHE_VMA_MRU_H -#define MCA_RCACHE_VMA_MRU_H -#include "opal/mca/mca.h" -#include "ompi/info/info.h" -#include "opal/class/opal_list.h" -#include "rcache_vma.h" - - - -/* - * initialize the rb mru - */ -int mca_rcache_vma_mru_init(mca_rcache_vma_module_t* rcache); - -/* - * insert an item in the rb mru - */ -int mca_rcache_vma_mru_insert( - mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t* reg - ); - -/* - * remove an item from the rb mru - */ -int mca_rcache_vma_mru_delete( - mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t* reg - ); - -int mca_rcache_vma_mru_touch( - mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t* reg - ); - -#endif /* MCA_RCACHE_VMA_MRU_H */ - diff --git a/ompi/mca/rcache/vma/rcache_vma_tree.c b/ompi/mca/rcache/vma/rcache_vma_tree.c index eaa4c991b8..8ed2688855 100644 --- a/ompi/mca/rcache/vma/rcache_vma_tree.c +++ b/ompi/mca/rcache/vma/rcache_vma_tree.c @@ -26,6 +26,8 @@ #include "opal/mca/mca.h" #include "rcache_vma_tree.h" +extern 
unsigned int mca_mpool_base_page_size; + OBJ_CLASS_INSTANCE(mca_rcache_vma_reg_list_item_t, opal_list_item_t, NULL, NULL); static void mca_rcache_vma_construct(opal_object_t *object) @@ -130,8 +132,17 @@ static inline int mca_rcache_vma_compare_regs( mca_mpool_base_registration_t *reg1, mca_mpool_base_registration_t *reg2) { + /* persistent registrations are on top */ + if((reg1->flags & MCA_MPOOL_FLAGS_PERSIST) && + !(reg2->flags & MCA_MPOOL_FLAGS_PERSIST)) + return 1; + + if(!(reg1->flags & MCA_MPOOL_FLAGS_PERSIST) && + (reg2->flags & MCA_MPOOL_FLAGS_PERSIST)) + return -1; + if (reg1->bound != reg2->bound) - return (int)(reg1->bound - reg2->bound); + return (int)(reg1->bound - reg2->bound); /* tie breaker */ return (int)((uintptr_t)reg1 - (uintptr_t)reg2); @@ -241,7 +252,7 @@ int mca_rcache_vma_tree_init(mca_rcache_vma_module_t* rcache) { OBJ_CONSTRUCT(&rcache->rb_tree, ompi_rb_tree_t); OBJ_CONSTRUCT(&rcache->vma_list, opal_list_t); - rcache->reg_cur_mru_size = 0; + rcache->reg_cur_cache_size = 0; return ompi_rb_tree_init(&rcache->rb_tree, mca_rcache_vma_tree_node_compare); } @@ -261,23 +272,81 @@ mca_mpool_base_registration_t *mca_rcache_vma_tree_find( item = (mca_rcache_vma_reg_list_item_t*)opal_list_get_first(&vma->reg_list); - if(item->reg->bound >= bound) - return item->reg; + do { + if(item->reg->bound >= bound) + return item->reg; + if(!(item->reg->flags & MCA_MPOOL_FLAGS_PERSIST)) + break; + item = (mca_rcache_vma_reg_list_item_t*)opal_list_get_next(item); + } while(item != + (mca_rcache_vma_reg_list_item_t*)opal_list_get_end(&vma->reg_list)); return NULL; } -static inline int mca_rcache_vma_can_insert( - mca_rcache_vma_module_t *vma_rcache, - uint32_t reg_flags, - size_t nbytes) +static inline bool is_reg_in_array(ompi_pointer_array_t *regs, void *p) { - if(0 == vma_rcache->reg_max_mru_size || - !(reg_flags & MCA_MPOOL_FLAGS_CACHE)) + int i; + + for(i = 0; i < ompi_pointer_array_get_size(regs); i++) { + if(ompi_pointer_array_get_item(regs, i) == p) + 
return true; + } + + return false; +} + +int mca_rcache_vma_tree_find_all( + mca_rcache_vma_module_t *vma_rcache, unsigned char *base, + unsigned char *bound, ompi_pointer_array_t *regs) +{ + int cnt = 0; + + if(opal_list_get_size(&vma_rcache->vma_list) == 0) + return cnt; + + do { + mca_rcache_vma_t *vma; + opal_list_item_t *item; + vma = ompi_rb_tree_find_with(&vma_rcache->rb_tree, base, + mca_rcache_vma_tree_node_compare_closest); + + if(NULL == vma) { + /* base is bigger than any registered memory */ + base = bound + 1; + continue; + } + + if(base < (unsigned char*)vma->start) { + base = (unsigned char*)vma->start; + continue; + } + + for(item = opal_list_get_first(&vma->reg_list); + item != opal_list_get_end(&vma->reg_list); + item = opal_list_get_next(item)) { + mca_rcache_vma_reg_list_item_t *vma_item; + vma_item = (mca_rcache_vma_reg_list_item_t*)item; + if(is_reg_in_array(regs, (void*)vma_item->reg)) { + continue; + } + ompi_pointer_array_add(regs, (void*)vma_item->reg); + cnt++; + } + + base = (unsigned char *)vma->end + 1; + } while(bound >= base); + + return cnt; +} + +static inline int mca_rcache_vma_can_insert( + mca_rcache_vma_module_t *vma_rcache, size_t nbytes, size_t limit) +{ + if(0 == limit) return 1; - if(vma_rcache->reg_cur_mru_size + nbytes <= - vma_rcache->reg_max_mru_size) + if(vma_rcache->reg_cur_cache_size + nbytes <= limit) return 1; return 0; @@ -287,13 +356,11 @@ static inline void mca_rcache_vma_update_byte_count( mca_rcache_vma_module_t* vma_rcache, size_t nbytes) { - vma_rcache->reg_cur_mru_size += nbytes; + vma_rcache->reg_cur_cache_size += nbytes; } -int mca_rcache_vma_tree_insert( - mca_rcache_vma_module_t* vma_rcache, - mca_mpool_base_registration_t* reg - ) +int mca_rcache_vma_tree_insert(mca_rcache_vma_module_t* vma_rcache, + mca_mpool_base_registration_t* reg, size_t limit) { mca_rcache_vma_t *i; uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound; @@ -309,7 +376,7 @@ int mca_rcache_vma_tree_insert( 
if((mca_rcache_vma_t*)opal_list_get_end(&vma_rcache->vma_list) == i) { vma = NULL; - if(mca_rcache_vma_can_insert(vma_rcache, reg->flags, end - begin + 1)) + if(mca_rcache_vma_can_insert(vma_rcache, end - begin + 1, limit)) vma = mca_rcache_vma_new(vma_rcache, begin, end); if(!vma) @@ -323,7 +390,7 @@ int mca_rcache_vma_tree_insert( } else if(i->start > begin) { uintptr_t tend = (i->start <= end)?(i->start - 1):end; vma = NULL; - if(mca_rcache_vma_can_insert(vma_rcache, reg->flags, tend - begin + 1)) + if(mca_rcache_vma_can_insert(vma_rcache, tend - begin + 1, limit)) vma = mca_rcache_vma_new(vma_rcache, begin, tend); if(!vma) diff --git a/ompi/mca/rcache/vma/rcache_vma_tree.h b/ompi/mca/rcache/vma/rcache_vma_tree.h index 06d1ea00f3..4eeeec1793 100644 --- a/ompi/mca/rcache/vma/rcache_vma_tree.h +++ b/ompi/mca/rcache/vma/rcache_vma_tree.h @@ -71,14 +71,18 @@ mca_mpool_base_registration_t* mca_rcache_vma_tree_find( unsigned char* base, unsigned char *bound ); +/** + * Returns all registrations that overlap the given memory region + */ +int mca_rcache_vma_tree_find_all( + mca_rcache_vma_module_t *vma_rcache, unsigned char *base, + unsigned char *bound, ompi_pointer_array_t *regs); /* * insert an item in the vma tree */ -int mca_rcache_vma_tree_insert( - mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t* reg - ); +int mca_rcache_vma_tree_insert(mca_rcache_vma_module_t* rcache, + mca_mpool_base_registration_t* reg, size_t limit); /* * remove an item from the vma tree