From 1e0ea9dd6dab376f48663517a6e91a5ceaaa313e Mon Sep 17 00:00:00 2001 From: Galen Shipman Date: Mon, 23 Jan 2006 22:51:50 +0000 Subject: [PATCH] Major fixes for the RDMA registration cache (leave_pinned). This commit fixes issues with HPL runs on node counts > 4. This commit was SVN r8793. --- ompi/mca/btl/mvapi/btl_mvapi_component.c | 6 - ompi/mca/mpool/base/mpool_base_lookup.c | 22 ++- ompi/mca/mpool/base/mpool_base_mem_cb.c | 72 +++----- ompi/mca/mpool/base/mpool_base_open.c | 17 +- ompi/mca/mpool/mvapi/mpool_mvapi_module.c | 4 +- ompi/mca/pml/ob1/pml_ob1_rdma.c | 202 +++++----------------- ompi/mca/rcache/rb/rcache_rb.c | 54 +++--- ompi/mca/rcache/rb/rcache_rb_tree.c | 14 +- 8 files changed, 137 insertions(+), 254 deletions(-) diff --git a/ompi/mca/btl/mvapi/btl_mvapi_component.c b/ompi/mca/btl/mvapi/btl_mvapi_component.c index d76fb58614..543e745f2f 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_component.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_component.c @@ -312,12 +312,6 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules, mca_btl_base_selected_module_t* ib_selected; opal_list_item_t* item; -#if 0 - /* ugly HACK!! 
*/ - mallopt(M_TRIM_THRESHOLD, -1); - mallopt(M_MMAP_MAX, 0); -#endif - /* initialization */ *num_btl_modules = 0; diff --git a/ompi/mca/mpool/base/mpool_base_lookup.c b/ompi/mca/mpool/base/mpool_base_lookup.c index 2b716a362e..5192e9f0f2 100644 --- a/ompi/mca/mpool/base/mpool_base_lookup.c +++ b/ompi/mca/mpool/base/mpool_base_lookup.c @@ -27,6 +27,12 @@ #include "mca/mpool/base/base.h" #include "mpool_base_mem_cb.h" +#ifdef HAVE_MALLOC_H +#include <malloc.h> +extern int mca_mpool_base_disable_sbrk; +#endif + + extern int mca_mpool_base_use_mem_hooks; mca_mpool_base_component_t* mca_mpool_base_component_lookup(const char* name) @@ -82,10 +88,18 @@ mca_mpool_base_module_t* mca_mpool_base_module_create( sm->mpool_resources = resources; opal_list_append(&mca_mpool_base_modules, (opal_list_item_t*) sm); /* on the very first creation of a module we init the memory callback*/ - if(mca_mpool_base_use_mem_hooks && - opal_list_get_size(&mca_mpool_base_modules) == 1 && - 0 != (OPAL_MEMORY_FREE_SUPPORT & opal_mem_hooks_support_level())) { + if(opal_list_get_size(&mca_mpool_base_modules) == 1) { + if(mca_mpool_base_use_mem_hooks && + 0 != (OPAL_MEMORY_FREE_SUPPORT & opal_mem_hooks_support_level())) { opal_mem_hooks_register_release(mca_mpool_base_mem_cb, NULL); + } + +#ifdef HAVE_MALLOC_H + else if(mca_mpool_base_disable_sbrk) { + mallopt(M_TRIM_THRESHOLD, -1); + mallopt(M_MMAP_MAX, 0); + } +#endif } return module; } @@ -94,7 +108,7 @@ mca_mpool_base_module_t* mca_mpool_base_module_create( mca_mpool_base_module_t* mca_mpool_base_module_lookup(const char* name) { opal_list_item_t* item; - + for (item = opal_list_get_first(&mca_mpool_base_modules); item != opal_list_get_end(&mca_mpool_base_modules); item = opal_list_get_next(item)) { diff --git a/ompi/mca/mpool/base/mpool_base_mem_cb.c b/ompi/mca/mpool/base/mpool_base_mem_cb.c index c164428b00..89c353d4cf 100644 --- a/ompi/mca/mpool/base/mpool_base_mem_cb.c +++ b/ompi/mca/mpool/base/mpool_base_mem_cb.c @@ -38,7 +38,6 @@ void
mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata, mca_mpool_base_registration_t* reg; mca_mpool_base_selected_module_t* current; int rc; - int dereg = 0; opal_list_item_t* item; void* base_addr; void* bound_addr; @@ -55,53 +54,34 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata, current = (mca_mpool_base_selected_module_t*) item; - for( ; base_addr <= bound_addr; - base_addr =(void*) ((unsigned long) base_addr + mca_mpool_base_page_size)) { - - if(NULL != current->mpool_module->mpool_find) { - rc = current->mpool_module->mpool_find( - current->mpool_module, - base_addr, - size, - &regs, - &cnt - ); - if(OMPI_SUCCESS != rc) { - continue; - } - for(i = 0; i < cnt; i++) { - - reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(&regs, i); - if(base_addr < (void*) ((unsigned long) reg->bound - mca_mpool_base_page_size)) { - base_addr = reg->bound - mca_mpool_base_page_size; - } - if(reg->flags & MCA_MPOOL_FLAGS_CACHE) { - assert(reg->ref_count <= 3); - } else if(reg->flags & MCA_MPOOL_FLAGS_PERSIST) { - assert(reg->ref_count <= 2); - } else { - assert(reg->ref_count <= 1); - } -#if 0 - fprintf(stderr, "[%lu,%lu,%lu] mca_mpool_base_mem_cb: base %p bound %p len %lu refcnt %d\n", - ORTE_NAME_ARGS(orte_process_info.my_name), reg->base, reg->bound, - reg->bound-reg->base+1, reg->ref_count); -#endif - current->mpool_module->mpool_deregister(current->mpool_module, reg); - dereg++; - } - ompi_pointer_array_remove_all(&regs); + if(NULL != current->mpool_module->mpool_find) { + rc = current->mpool_module->mpool_find( + current->mpool_module, + base_addr, + size, + &regs, + &cnt + ); + if(OMPI_SUCCESS != rc) { + continue; } - } - } - OBJ_DESTRUCT(&regs); -#if 0 - if(dereg != 0) { - fprintf(stderr, "[%lu,%lu,%lu] mca_mpool_base_mem_cb: addr %p size %lu base %p bound %p\n", - ORTE_NAME_ARGS(orte_process_info.my_name), base, size, - down_align_addr( base, mca_mpool_base_page_size_log), bound_addr); - } + for(i = 0; i < cnt; i++) { + reg =
(mca_mpool_base_registration_t*)ompi_pointer_array_get_item(&regs, i); +#if !defined(NDEBUG) + if(reg->flags & MCA_MPOOL_FLAGS_CACHE) { + assert(reg->ref_count <= 3); + } else if(reg->flags & MCA_MPOOL_FLAGS_PERSIST) { + assert(reg->ref_count <= 2); + } else { + assert(reg->ref_count <= 1); + } #endif + current->mpool_module->mpool_deregister(current->mpool_module, reg); + } + ompi_pointer_array_remove_all(&regs); + } + } + OBJ_DESTRUCT(&regs); } diff --git a/ompi/mca/mpool/base/mpool_base_open.c b/ompi/mca/mpool/base/mpool_base_open.c index 038a6f5f76..5308f8c93a 100644 --- a/ompi/mca/mpool/base/mpool_base_open.c +++ b/ompi/mca/mpool/base/mpool_base_open.c @@ -41,6 +41,11 @@ */ int mca_mpool_base_output = -1; int mca_mpool_base_use_mem_hooks = 0; + +#ifdef HAVE_MALLOC_H +int mca_mpool_base_disable_sbrk = 0; +#endif + uint32_t mca_mpool_base_page_size; uint32_t mca_mpool_base_page_size_log; @@ -72,7 +77,7 @@ int mca_mpool_base_open(void) * check for use_mem_hooks (for diagnostics/testing) * however if leave_pinned is set we force this to be enabled */ - mca_base_param_reg_int_name("mpool_base", + mca_base_param_reg_int_name("mpool", "use_mem_hooks", "use memory hooks for deregistering freed memory", false, @@ -80,6 +85,16 @@ int mca_mpool_base_open(void) 0, &mca_mpool_base_use_mem_hooks); +#ifdef HAVE_MALLOC_H + mca_base_param_reg_int_name("mpool", + "disable_sbrk", + "use mallopt to override calling sbrk (doesn't return memory to OS!)", + false, + false, + 0, + &mca_mpool_base_disable_sbrk); +#endif + /* if(0 == mca_mpool_base_use_mem_hooks) { */ /* int param; */ /* mca_base_param_register_int("mpi", NULL, "leave_pinned", "leave_pinned", 0); */ diff --git a/ompi/mca/mpool/mvapi/mpool_mvapi_module.c b/ompi/mca/mpool/mvapi/mpool_mvapi_module.c index 626acbedb0..aeb72e7012 100644 --- a/ompi/mca/mpool/mvapi/mpool_mvapi_module.c +++ b/ompi/mca/mpool/mvapi/mpool_mvapi_module.c @@ -157,8 +157,8 @@ int mca_mpool_mvapi_deregister(mca_mpool_base_module_t* mpool, {
if(registration->flags & (MCA_MPOOL_FLAGS_CACHE | MCA_MPOOL_FLAGS_PERSIST)) { mpool->rcache->rcache_delete(mpool->rcache, - registration, - registration->flags); + registration, + registration->flags); registration->flags = 0; } return mca_mpool_mvapi_release(mpool, registration); diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c index d57ef79f7d..6d5ba97396 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c @@ -58,7 +58,6 @@ size_t mca_pml_ob1_rdma_btls( mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n); mca_mpool_base_registration_t* fit = NULL; - mca_mpool_base_registration_t* largest = NULL; mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool; uint32_t reg_cnt; size_t r; @@ -78,49 +77,10 @@ size_t mca_pml_ob1_rdma_btls( size, &regs, &reg_cnt); - assert(reg_cnt <= 1); - /* shortcut for one entry - the typical case */ - if(reg_cnt == 1) { - mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, 0); - size_t reg_len = reg->bound - base + 1; - if(reg->flags & MCA_MPOOL_FLAGS_CACHE) { - assert(reg->ref_count >= 3); - } - - /* is the existing registration the required size */ - if(reg->base <= base && reg_len >= size) { - - rdma_btls[num_btls_used].bml_btl = bml_btl; - rdma_btls[num_btls_used].btl_reg = reg; - num_btls_used++; - - /* otherwise if leave_pinned re-register */ - } else if( mca_pml_ob1.leave_pinned ) { - - btl_mpool->mpool_deregister(btl_mpool, reg); - rc = btl_mpool->mpool_register(btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &reg); - if(OMPI_SUCCESS != rc || NULL == reg) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - continue; - } - - rdma_btls[num_btls_used].bml_btl = bml_btl; - rdma_btls[num_btls_used].btl_reg = reg; - num_btls_used++; - - /* existing registration cannot be used */ - } else { - btl_mpool->mpool_release(btl_mpool, reg); - } - continue; - } + /* * find the best fit
when there are multiple registrations - */ + */ for(r = 0; r < reg_cnt; r++) { mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, r); size_t reg_len = reg->bound - base + 1; @@ -130,60 +90,34 @@ size_t mca_pml_ob1_rdma_btls( } if(reg->base <= base && reg_len >= size) { fit = reg; - break; + } else if(mca_pml_ob1.leave_pinned){ + btl_mpool->mpool_deregister(btl_mpool, reg); } else { - if(NULL == largest) - largest = reg; - else if(reg->base <= base && (reg->bound - base) > (largest->bound - base)) { - largest = reg; - } + btl_mpool->mpool_release(btl_mpool, reg); } + } - + + /* if the leave pinned option is set - and there is not an existing * registration that satisfies this request, create one. */ if(NULL == fit && mca_pml_ob1.leave_pinned) { - if (NULL == largest) { - /* register the memory */ - rc = btl_mpool->mpool_register( - btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &fit); - if(ORTE_SUCCESS != rc || NULL == fit) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - continue; - } - - /* a registration exists but is not large enough */ - } else { - - /* simplify cleanup - bump reference count as we decrement again below */ - btl_mpool->mpool_retain(btl_mpool,largest); - btl_mpool->mpool_deregister(btl_mpool, largest); - rc = btl_mpool->mpool_register(btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &fit); - if(ORTE_SUCCESS != rc || NULL == fit) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - continue; - } + /* register the memory */ + rc = btl_mpool->mpool_register( + btl_mpool, + base, + size, + MCA_MPOOL_FLAGS_CACHE, + &fit); + if(ORTE_SUCCESS != rc || NULL == fit) { + opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); + continue; } assert(fit->ref_count == 3); } - /* decrement reference count on all unused entries */ - for(r = 0; r < reg_cnt; r++) { - mca_mpool_base_registration_t* reg =
ompi_pointer_array_get_item(&regs, r); - if(reg != fit) { - btl_mpool->mpool_release(btl_mpool, reg); - } - } - + if(NULL != fit) { rdma_btls[num_btls_used].bml_btl = bml_btl; rdma_btls[num_btls_used].btl_reg = fit; @@ -206,7 +140,6 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration( { ompi_pointer_array_t regs; mca_mpool_base_registration_t* fit = NULL; - mca_mpool_base_registration_t* largest = NULL; mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool; uint32_t reg_cnt; size_t r; @@ -219,7 +152,8 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration( /* check to see if memory is registered */ OBJ_CONSTRUCT(&regs, ompi_pointer_array_t); - + ompi_pointer_array_remove_all(&regs); + /* look through existing registrations */ btl_mpool->mpool_find(btl_mpool, base, @@ -227,95 +161,45 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration( &regs, &reg_cnt); - assert(reg_cnt <= 1); - /* shortcut for one entry - the typical case */ - if(reg_cnt == 1) { - mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, 0); - size_t reg_len = reg->bound - base + 1; - - /* is the existing registration the required size */ - if(reg->base <= base && reg_len >= size) { - return reg; - - /* otherwise if leave_pinned re-register */ - } else if ( mca_pml_ob1.leave_pinned ) { - - btl_mpool->mpool_deregister(btl_mpool, reg); - rc = btl_mpool->mpool_register(btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &reg); - if(OMPI_SUCCESS != rc || NULL == reg) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - } - return reg; - - /* existing registration cannot be used */ - } else { - btl_mpool->mpool_release(btl_mpool, reg); - return NULL; - } - } - + + /* + * find the best fit when there are multiple registrations + */ for(r = 0; r < reg_cnt; r++) { mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, r); size_t reg_len = reg->bound - base + 1; + if(reg->flags & MCA_MPOOL_FLAGS_CACHE) {
assert(reg->ref_count >= 3); } - if(reg->base <= base && reg_len >= size) { fit = reg; - break; + } else if(mca_pml_ob1.leave_pinned){ + btl_mpool->mpool_deregister(btl_mpool, reg); } else { - if(NULL == largest) - largest = reg; - else if(reg->base <= base && (reg->bound - base) > (largest->bound - base)) { - largest = reg; - } + btl_mpool->mpool_release(btl_mpool, reg); } } - + + /* if the leave pinned option is set - and there is not an existing * registration that satisfies this request, create one. */ if(NULL == fit && mca_pml_ob1.leave_pinned) { - if (NULL == largest) { - /* register the memory */ - rc = btl_mpool->mpool_register( - btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &fit); - if(OMPI_SUCCESS != rc || NULL == fit) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - } - /* a registration exists but is not large enough */ - } else { - - btl_mpool->mpool_retain(btl_mpool, largest); - btl_mpool->mpool_deregister(btl_mpool, largest); - rc = btl_mpool->mpool_register(btl_mpool, - base, - size, - MCA_MPOOL_FLAGS_CACHE, - &fit); - if(OMPI_SUCCESS != rc || NULL == fit) { - opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); - } - } - assert(fit->ref_count >= 3); - } - - /* release reference count */ - for(r = 0; r < reg_cnt; r++) { - mca_mpool_base_registration_t *reg = ompi_pointer_array_get_item(&regs, r); - if(reg != fit) { - btl_mpool->mpool_release(btl_mpool, reg); + /* register the memory */ + rc = btl_mpool->mpool_register( + btl_mpool, + base, + size, + MCA_MPOOL_FLAGS_CACHE, + &fit); + if(ORTE_SUCCESS != rc || NULL == fit) { + opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size); + return NULL; } + assert(fit->ref_count == 3); } + OBJ_DESTRUCT(&regs); return fit; } diff --git a/ompi/mca/rcache/rb/rcache_rb.c b/ompi/mca/rcache/rb/rcache_rb.c index cd3a497236..133e165215 100644 --- a/ompi/mca/rcache/rb/rcache_rb.c +++
b/ompi/mca/rcache/rb/rcache_rb.c @@ -48,7 +48,7 @@ int mca_rcache_rb_find ( uint32_t *cnt ){ - int pos, rc = OMPI_SUCCESS; + int rc = OMPI_SUCCESS; mca_rcache_rb_tree_item_t* tree_item = NULL; void* base_addr; void* bound_addr; @@ -60,44 +60,34 @@ int mca_rcache_rb_find ( base_addr = down_align_addr(addr, mca_mpool_base_page_size_log); bound_addr = up_align_addr((void*) ((unsigned long) addr + size - 1), mca_mpool_base_page_size_log); + - for( ; base_addr <= bound_addr; - base_addr =(void*) ((unsigned long) base_addr + mca_mpool_base_page_size)) { + while(base_addr <= bound_addr) { tree_item = mca_rcache_rb_tree_find( (mca_rcache_rb_module_t*) rcache, base_addr ); if(NULL != tree_item) { - break; + ompi_pointer_array_add(regs, (void*) tree_item->reg); + if( tree_item->reg->flags & MCA_MPOOL_FLAGS_CACHE ) { + rc = mca_rcache_rb_mru_touch((mca_rcache_rb_module_t*)rcache, + tree_item->reg); + if(OMPI_SUCCESS != rc) { + OPAL_THREAD_UNLOCK(&rcache->lock); + return OMPI_ERROR; + } + } + OPAL_THREAD_ADD32((int32_t*) &tree_item->reg->ref_count, 1); + (*cnt)++; + assert(tree_item->reg->bound - tree_item->reg->base >= 0); + assert(((void*) tree_item->reg->bound) >= addr); + base_addr = tree_item->reg->bound + 1; + } + else { + base_addr =(void*) ((unsigned long) base_addr + mca_mpool_base_page_size); } } - - if(NULL == tree_item) { - OPAL_THREAD_UNLOCK(&rcache->lock); - return OMPI_ERROR; - } - pos = ompi_pointer_array_add(regs, (void*) tree_item->reg); - if(0 != pos) { - opal_output(0, "error inserting registration in 1st position"); - return OMPI_ERROR; - } - - if(OMPI_SUCCESS != rc) { - OPAL_THREAD_UNLOCK(&rcache->lock); - return rc; - } - - if( tree_item->reg->flags & MCA_MPOOL_FLAGS_CACHE ) { - rc = mca_rcache_rb_mru_touch((mca_rcache_rb_module_t*)rcache, - tree_item->reg); - } - OPAL_THREAD_ADD32((int32_t*) &tree_item->reg->ref_count, 1); OPAL_THREAD_UNLOCK(&rcache->lock); - if(rc == OMPI_SUCCESS) { - *cnt = 1; - } - assert(tree_item->reg->bound - tree_item->reg->base 
>= 0); - assert(((void*) tree_item->reg->bound) >= addr); - return rc; + return OMPI_SUCCESS; } int mca_rcache_rb_insert ( @@ -106,8 +96,10 @@ int mca_rcache_rb_insert ( uint32_t flags ) { int rc = OMPI_SUCCESS; + OPAL_THREAD_LOCK(&rcache->lock); reg->flags = flags; + if(flags & MCA_MPOOL_FLAGS_CACHE) { rc = mca_rcache_rb_mru_insert( (mca_rcache_rb_module_t*) rcache, reg); if(OMPI_SUCCESS != rc) { diff --git a/ompi/mca/rcache/rb/rcache_rb_tree.c b/ompi/mca/rcache/rb/rcache_rb_tree.c index 17b6da3eca..d117b3b996 100644 --- a/ompi/mca/rcache/rb/rcache_rb_tree.c +++ b/ompi/mca/rcache/rb/rcache_rb_tree.c @@ -26,6 +26,7 @@ OBJ_CLASS_INSTANCE(mca_rcache_rb_tree_item_t, opal_list_item_t, NULL, NULL); + int mca_rcache_rb_tree_node_compare(void * key1, void * key2); int mca_rcache_rb_tree_init(mca_rcache_rb_module_t* rcache) { @@ -60,9 +61,9 @@ struct mca_rcache_rb_tree_item_t * mca_rcache_rb_tree_find( key.bound = base; found = (mca_rcache_rb_tree_item_t *) ompi_rb_tree_find(&rcache->rb_tree, &key); - if(found) + if(found) { assert((void*)found->reg->bound >= base); - + } return found; } @@ -108,7 +109,7 @@ int mca_rcache_rb_tree_insert( opal_list_item_t *item; int rc; mca_rcache_rb_tree_item_t* rb_tree_item; - + OMPI_FREE_LIST_GET(&rb_module->rb_tree_item_list, item, rc); if(rc != OMPI_SUCCESS) return rc; @@ -140,14 +141,17 @@ int mca_rcache_rb_tree_delete(mca_rcache_rb_module_t* rb_module, mca_mpool_base_registration_t* reg) { int rc; - mca_rcache_rb_tree_item_t* tree_item; + mca_rcache_rb_tree_item_t *tree_item; tree_item = mca_rcache_rb_tree_find(rb_module, reg->base); if(NULL == tree_item) { return OMPI_ERROR; } + assert(reg == tree_item->reg); rc = ompi_rb_tree_delete(&rb_module->rb_tree, &tree_item->key); - + + OMPI_FREE_LIST_RETURN(&rb_module->rb_tree_item_list, (opal_list_item_t*) tree_item); + return rc; }