1
1

Major fixes for the RDMA registration cache (leave_pinned).

This commit fixes issues with HPL runs on more than 4 nodes.

This commit was SVN r8793.
Этот коммит содержится в:
Galen Shipman 2006-01-23 22:51:50 +00:00
родитель 83cd8fac9d
Коммит 1e0ea9dd6d
8 изменённых файлов: 137 добавлений и 254 удалений

Просмотреть файл

@ -312,12 +312,6 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
mca_btl_base_selected_module_t* ib_selected;
opal_list_item_t* item;
#if 0
/* ugly HACK!! */
mallopt(M_TRIM_THRESHOLD, -1);
mallopt(M_MMAP_MAX, 0);
#endif
/* initialization */
*num_btl_modules = 0;

Просмотреть файл

@ -27,6 +27,12 @@
#include "mca/mpool/base/base.h"
#include "mpool_base_mem_cb.h"
#ifdef HAVE_MALLOC_H
#include <malloc.h>
extern int mca_mpool_base_disable_sbrk;
#endif
extern int mca_mpool_base_use_mem_hooks;
mca_mpool_base_component_t* mca_mpool_base_component_lookup(const char* name)
@ -82,10 +88,18 @@ mca_mpool_base_module_t* mca_mpool_base_module_create(
sm->mpool_resources = resources;
opal_list_append(&mca_mpool_base_modules, (opal_list_item_t*) sm);
/* on the very first creation of a module we init the memory callback*/
if(mca_mpool_base_use_mem_hooks &&
opal_list_get_size(&mca_mpool_base_modules) == 1 &&
0 != (OPAL_MEMORY_FREE_SUPPORT & opal_mem_hooks_support_level())) {
if(opal_list_get_size(&mca_mpool_base_modules) == 1) {
if(mca_mpool_base_use_mem_hooks &&
0 != (OPAL_MEMORY_FREE_SUPPORT & opal_mem_hooks_support_level())) {
opal_mem_hooks_register_release(mca_mpool_base_mem_cb, NULL);
}
#ifdef HAVE_MALLOC_H
else if(mca_mpool_base_disable_sbrk) {
mallopt(M_TRIM_THRESHOLD, -1);
mallopt(M_MMAP_MAX, 0);
}
#endif
}
return module;
}
@ -94,7 +108,7 @@ mca_mpool_base_module_t* mca_mpool_base_module_create(
mca_mpool_base_module_t* mca_mpool_base_module_lookup(const char* name)
{
opal_list_item_t* item;
for (item = opal_list_get_first(&mca_mpool_base_modules);
item != opal_list_get_end(&mca_mpool_base_modules);
item = opal_list_get_next(item)) {

Просмотреть файл

@ -38,7 +38,6 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
mca_mpool_base_registration_t* reg;
mca_mpool_base_selected_module_t* current;
int rc;
int dereg = 0;
opal_list_item_t* item;
void* base_addr;
void* bound_addr;
@ -55,53 +54,34 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
current = (mca_mpool_base_selected_module_t*) item;
for( ; base_addr <= bound_addr;
base_addr =(void*) ((unsigned long) base_addr + mca_mpool_base_page_size)) {
if(NULL != current->mpool_module->mpool_find) {
rc = current->mpool_module->mpool_find(
current->mpool_module,
base_addr,
size,
&regs,
&cnt
);
if(OMPI_SUCCESS != rc) {
continue;
}
for(i = 0; i < cnt; i++) {
reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(&regs, i);
if(base_addr < (void*) ((unsigned long) reg->bound - mca_mpool_base_page_size)) {
base_addr = reg->bound - mca_mpool_base_page_size;
}
if(reg->flags & MCA_MPOOL_FLAGS_CACHE) {
assert(reg->ref_count <= 3);
} else if(reg->flags & MCA_MPOOL_FLAGS_PERSIST) {
assert(reg->ref_count <= 2);
} else {
assert(reg->ref_count <= 1);
}
#if 0
fprintf(stderr, "[%lu,%lu,%lu] mca_mpool_base_mem_cb: base %p bound %p len %lu refcnt %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name), reg->base, reg->bound,
reg->bound-reg->base+1, reg->ref_count);
#endif
current->mpool_module->mpool_deregister(current->mpool_module, reg);
dereg++;
}
ompi_pointer_array_remove_all(&regs);
if(NULL != current->mpool_module->mpool_find) {
rc = current->mpool_module->mpool_find(
current->mpool_module,
base_addr,
size,
&regs,
&cnt
);
if(OMPI_SUCCESS != rc) {
continue;
}
}
}
OBJ_DESTRUCT(&regs);
#if 0
if(dereg != 0) {
fprintf(stderr, "[%lu,%lu,%lu] mca_mpool_base_mem_cb: addr %p size %lu base %p bound %p\n",
ORTE_NAME_ARGS(orte_process_info.my_name), base, size,
down_align_addr( base, mca_mpool_base_page_size_log), bound_addr);
}
for(i = 0; i < cnt; i++) {
reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(&regs, i);
#if !defined(NDEBUG)
if(reg->flags & MCA_MPOOL_FLAGS_CACHE) {
assert(reg->ref_count <= 3);
} else if(reg->flags & MCA_MPOOL_FLAGS_PERSIST) {
assert(reg->ref_count <= 2);
} else {
assert(reg->ref_count <= 1);
}
#endif
current->mpool_module->mpool_deregister(current->mpool_module, reg);
}
ompi_pointer_array_remove_all(&regs);
}
}
OBJ_DESTRUCT(&regs);
}

Просмотреть файл

@ -41,6 +41,11 @@
*/
int mca_mpool_base_output = -1;
int mca_mpool_base_use_mem_hooks = 0;
#ifdef HAVE_MALLOC_H
int mca_mpool_base_disable_sbrk = 0;
#endif
uint32_t mca_mpool_base_page_size;
uint32_t mca_mpool_base_page_size_log;
@ -72,7 +77,7 @@ int mca_mpool_base_open(void)
* check for use_mem_hooks (for diagnostics/testing)
* however if leave_pinned is set we force this to be enabled
*/
mca_base_param_reg_int_name("mpool_base",
mca_base_param_reg_int_name("mpool",
"use_mem_hooks",
"use memory hooks for deregistering freed memory",
false,
@ -80,6 +85,16 @@ int mca_mpool_base_open(void)
0,
&mca_mpool_base_use_mem_hooks);
#ifdef HAVE_MALLOC_H
mca_base_param_reg_int_name("mpool",
"disable_sbrk",
"use mallopt to override calling sbrk (doesn't return memory to OS!)",
false,
false,
0,
&mca_mpool_base_disable_sbrk);
#endif
/* if(0 == mca_mpool_base_use_mem_hooks) { */
/* int param; */
/* mca_base_param_register_int("mpi", NULL, "leave_pinned", "leave_pinned", 0); */

Просмотреть файл

@ -157,8 +157,8 @@ int mca_mpool_mvapi_deregister(mca_mpool_base_module_t* mpool,
{
if(registration->flags & (MCA_MPOOL_FLAGS_CACHE | MCA_MPOOL_FLAGS_PERSIST)) {
mpool->rcache->rcache_delete(mpool->rcache,
registration,
registration->flags);
registration,
registration->flags);
registration->flags = 0;
}
return mca_mpool_mvapi_release(mpool, registration);

Просмотреть файл

@ -58,7 +58,6 @@ size_t mca_pml_ob1_rdma_btls(
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n);
mca_mpool_base_registration_t* fit = NULL;
mca_mpool_base_registration_t* largest = NULL;
mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool;
uint32_t reg_cnt;
size_t r;
@ -78,49 +77,10 @@ size_t mca_pml_ob1_rdma_btls(
size,
&regs,
&reg_cnt);
assert(reg_cnt <= 1);
/* shortcut for one entry - the typical case */
if(reg_cnt == 1) {
mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, 0);
size_t reg_len = reg->bound - base + 1;
if(reg->flags & MCA_MPOOL_FLAGS_CACHE) {
assert(reg->ref_count >= 3);
}
/* is the existing registration the required size */
if(reg->base <= base && reg_len >= size) {
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg;
num_btls_used++;
/* otherwise if leave_pinned re-register */
} else if( mca_pml_ob1.leave_pinned ) {
btl_mpool->mpool_deregister(btl_mpool, reg);
rc = btl_mpool->mpool_register(btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&reg);
if(OMPI_SUCCESS != rc || NULL == reg) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
continue;
}
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg;
num_btls_used++;
/* existing registration cannot be used */
} else {
btl_mpool->mpool_release(btl_mpool, reg);
}
continue;
}
/*
* find the best fit when there are multiple registrations
*/
*/
for(r = 0; r < reg_cnt; r++) {
mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, r);
size_t reg_len = reg->bound - base + 1;
@ -130,60 +90,34 @@ size_t mca_pml_ob1_rdma_btls(
}
if(reg->base <= base && reg_len >= size) {
fit = reg;
break;
} else if(mca_pml_ob1.leave_pinned){
btl_mpool->mpool_deregister(btl_mpool, reg);
} else {
if(NULL == largest)
largest = reg;
else if(reg->base <= base && (reg->bound - base) > (largest->bound - base)) {
largest = reg;
}
btl_mpool->mpool_release(btl_mpool, reg);
}
}
/* if the leave pinned option is set - and there is not an existing
* registration that satisfies this request, create one.
*/
if(NULL == fit && mca_pml_ob1.leave_pinned) {
if (NULL == largest) {
/* register the memory */
rc = btl_mpool->mpool_register(
btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&fit);
if(ORTE_SUCCESS != rc || NULL == fit) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
continue;
}
/* a registration exists but is not large enough */
} else {
/* simplify cleanup - bump reference count as we decrement again below */
btl_mpool->mpool_retain(btl_mpool,largest);
btl_mpool->mpool_deregister(btl_mpool, largest);
rc = btl_mpool->mpool_register(btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&fit);
if(ORTE_SUCCESS != rc || NULL == fit) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
continue;
}
/* register the memory */
rc = btl_mpool->mpool_register(
btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&fit);
if(ORTE_SUCCESS != rc || NULL == fit) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
continue;
}
assert(fit->ref_count == 3);
}
/* decrement reference count on all unused entries */
for(r = 0; r < reg_cnt; r++) {
mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, r);
if(reg != fit) {
btl_mpool->mpool_release(btl_mpool, reg);
}
}
if(NULL != fit) {
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = fit;
@ -206,7 +140,6 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
{
ompi_pointer_array_t regs;
mca_mpool_base_registration_t* fit = NULL;
mca_mpool_base_registration_t* largest = NULL;
mca_mpool_base_module_t* btl_mpool = bml_btl->btl_mpool;
uint32_t reg_cnt;
size_t r;
@ -219,7 +152,8 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
/* check to see if memory is registered */
OBJ_CONSTRUCT(&regs, ompi_pointer_array_t);
ompi_pointer_array_remove_all(&regs);
/* look through existing registrations */
btl_mpool->mpool_find(btl_mpool,
base,
@ -227,95 +161,45 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
&regs,
&reg_cnt);
assert(reg_cnt <= 1);
/* shortcut for one entry - the typical case */
if(reg_cnt == 1) {
mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, 0);
size_t reg_len = reg->bound - base + 1;
/* is the existing registration the required size */
if(reg->base <= base && reg_len >= size) {
return reg;
/* otherwise if leave_pinned re-register */
} else if ( mca_pml_ob1.leave_pinned ) {
btl_mpool->mpool_deregister(btl_mpool, reg);
rc = btl_mpool->mpool_register(btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&reg);
if(OMPI_SUCCESS != rc || NULL == reg) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
}
return reg;
/* existing registration cannot be used */
} else {
btl_mpool->mpool_release(btl_mpool, reg);
return NULL;
}
}
/*
* find the best fit when there are multiple registrations
*/
for(r = 0; r < reg_cnt; r++) {
mca_mpool_base_registration_t* reg = ompi_pointer_array_get_item(&regs, r);
size_t reg_len = reg->bound - base + 1;
if(reg->flags & MCA_MPOOL_FLAGS_CACHE) {
assert(reg->ref_count >= 3);
}
if(reg->base <= base && reg_len >= size) {
fit = reg;
break;
} else if(mca_pml_ob1.leave_pinned){
btl_mpool->mpool_deregister(btl_mpool, reg);
} else {
if(NULL == largest)
largest = reg;
else if(reg->base <= base && (reg->bound - base) > (largest->bound - base)) {
largest = reg;
}
btl_mpool->mpool_release(btl_mpool, reg);
}
}
/* if the leave pinned option is set - and there is not an existing
* registration that satisfies this request, create one.
*/
if(NULL == fit && mca_pml_ob1.leave_pinned) {
if (NULL == largest) {
/* register the memory */
rc = btl_mpool->mpool_register(
btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&fit);
if(OMPI_SUCCESS != rc || NULL == fit) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
}
/* a registration exists but is not large enough */
} else {
btl_mpool->mpool_retain(btl_mpool, largest);
btl_mpool->mpool_deregister(btl_mpool, largest);
rc = btl_mpool->mpool_register(btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&fit);
if(OMPI_SUCCESS != rc || NULL == fit) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
}
}
assert(fit->ref_count >= 3);
}
/* release reference count */
for(r = 0; r < reg_cnt; r++) {
mca_mpool_base_registration_t *reg = ompi_pointer_array_get_item(&regs, r);
if(reg != fit) {
btl_mpool->mpool_release(btl_mpool, reg);
/* register the memory */
rc = btl_mpool->mpool_register(
btl_mpool,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&fit);
if(ORTE_SUCCESS != rc || NULL == fit) {
opal_output(0, "[%s:%d] mpool_register(%p,%lu) failed, \n", __FILE__, __LINE__, base, size);
return NULL;
}
assert(fit->ref_count == 3);
}
OBJ_DESTRUCT(&regs);
return fit;
}

Просмотреть файл

@ -48,7 +48,7 @@ int mca_rcache_rb_find (
uint32_t *cnt
){
int pos, rc = OMPI_SUCCESS;
int rc = OMPI_SUCCESS;
mca_rcache_rb_tree_item_t* tree_item = NULL;
void* base_addr;
void* bound_addr;
@ -60,44 +60,34 @@ int mca_rcache_rb_find (
base_addr = down_align_addr(addr, mca_mpool_base_page_size_log);
bound_addr = up_align_addr((void*) ((unsigned long) addr + size - 1), mca_mpool_base_page_size_log);
for( ; base_addr <= bound_addr;
base_addr =(void*) ((unsigned long) base_addr + mca_mpool_base_page_size)) {
while(base_addr <= bound_addr) {
tree_item = mca_rcache_rb_tree_find( (mca_rcache_rb_module_t*) rcache, base_addr );
if(NULL != tree_item) {
break;
ompi_pointer_array_add(regs, (void*) tree_item->reg);
if( tree_item->reg->flags & MCA_MPOOL_FLAGS_CACHE ) {
rc = mca_rcache_rb_mru_touch((mca_rcache_rb_module_t*)rcache,
tree_item->reg);
if(OMPI_SUCCESS != rc) {
OPAL_THREAD_UNLOCK(&rcache->lock);
return OMPI_ERROR;
}
}
OPAL_THREAD_ADD32((int32_t*) &tree_item->reg->ref_count, 1);
(*cnt)++;
assert(tree_item->reg->bound - tree_item->reg->base >= 0);
assert(((void*) tree_item->reg->bound) >= addr);
base_addr = tree_item->reg->bound + 1;
}
else {
base_addr =(void*) ((unsigned long) base_addr + mca_mpool_base_page_size);
}
}
if(NULL == tree_item) {
OPAL_THREAD_UNLOCK(&rcache->lock);
return OMPI_ERROR;
}
pos = ompi_pointer_array_add(regs, (void*) tree_item->reg);
if(0 != pos) {
opal_output(0, "error inserting registration in 1st position");
return OMPI_ERROR;
}
if(OMPI_SUCCESS != rc) {
OPAL_THREAD_UNLOCK(&rcache->lock);
return rc;
}
if( tree_item->reg->flags & MCA_MPOOL_FLAGS_CACHE ) {
rc = mca_rcache_rb_mru_touch((mca_rcache_rb_module_t*)rcache,
tree_item->reg);
}
OPAL_THREAD_ADD32((int32_t*) &tree_item->reg->ref_count, 1);
OPAL_THREAD_UNLOCK(&rcache->lock);
if(rc == OMPI_SUCCESS) {
*cnt = 1;
}
assert(tree_item->reg->bound - tree_item->reg->base >= 0);
assert(((void*) tree_item->reg->bound) >= addr);
return rc;
return OMPI_SUCCESS;
}
int mca_rcache_rb_insert (
@ -106,8 +96,10 @@ int mca_rcache_rb_insert (
uint32_t flags
) {
int rc = OMPI_SUCCESS;
OPAL_THREAD_LOCK(&rcache->lock);
reg->flags = flags;
if(flags & MCA_MPOOL_FLAGS_CACHE) {
rc = mca_rcache_rb_mru_insert( (mca_rcache_rb_module_t*) rcache, reg);
if(OMPI_SUCCESS != rc) {

Просмотреть файл

@ -26,6 +26,7 @@
OBJ_CLASS_INSTANCE(mca_rcache_rb_tree_item_t, opal_list_item_t, NULL, NULL);
int mca_rcache_rb_tree_node_compare(void * key1, void * key2);
int mca_rcache_rb_tree_init(mca_rcache_rb_module_t* rcache) {
@ -60,9 +61,9 @@ struct mca_rcache_rb_tree_item_t * mca_rcache_rb_tree_find(
key.bound = base;
found = (mca_rcache_rb_tree_item_t *)
ompi_rb_tree_find(&rcache->rb_tree, &key);
if(found)
if(found) {
assert((void*)found->reg->bound >= base);
}
return found;
}
@ -108,7 +109,7 @@ int mca_rcache_rb_tree_insert(
opal_list_item_t *item;
int rc;
mca_rcache_rb_tree_item_t* rb_tree_item;
OMPI_FREE_LIST_GET(&rb_module->rb_tree_item_list, item, rc);
if(rc != OMPI_SUCCESS)
return rc;
@ -140,14 +141,17 @@ int mca_rcache_rb_tree_delete(mca_rcache_rb_module_t* rb_module,
mca_mpool_base_registration_t* reg)
{
int rc;
mca_rcache_rb_tree_item_t* tree_item;
mca_rcache_rb_tree_item_t *tree_item;
tree_item = mca_rcache_rb_tree_find(rb_module,
reg->base);
if(NULL == tree_item) {
return OMPI_ERROR;
}
assert(reg == tree_item->reg);
rc = ompi_rb_tree_delete(&rb_module->rb_tree, &tree_item->key);
OMPI_FREE_LIST_RETURN(&rb_module->rb_tree_item_list, (opal_list_item_t*) tree_item);
return rc;
}