RFC timeout. Retry registration after removing an old registration from the LRU list.
This commit was SVN r25587.
Этот коммит содержится в:
родитель
58938b2f50
Коммит
87b7e85d53
@ -10,6 +10,8 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Voltaire. All rights reserved.
|
* Copyright (c) 2006 Voltaire. All rights reserved.
|
||||||
|
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||||
|
* reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -55,7 +57,7 @@ struct mca_mpool_rdma_module_t {
|
|||||||
mca_mpool_base_module_t super;
|
mca_mpool_base_module_t super;
|
||||||
struct mca_mpool_base_resources_t resources;
|
struct mca_mpool_base_resources_t resources;
|
||||||
ompi_free_list_t reg_list;
|
ompi_free_list_t reg_list;
|
||||||
opal_list_t mru_list;
|
opal_list_t lru_list;
|
||||||
opal_list_t gc_list;
|
opal_list_t gc_list;
|
||||||
uint32_t stat_cache_hit;
|
uint32_t stat_cache_hit;
|
||||||
uint32_t stat_cache_miss;
|
uint32_t stat_cache_miss;
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
@ -13,6 +14,8 @@
|
|||||||
* Copyright (c) 2006 Voltaire. All rights reserved.
|
* Copyright (c) 2006 Voltaire. All rights reserved.
|
||||||
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
|
||||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||||
|
* reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -63,7 +66,7 @@ void mca_mpool_rdma_module_init(mca_mpool_rdma_module_t* mpool)
|
|||||||
OBJ_CLASS(mca_mpool_base_registration_t),
|
OBJ_CLASS(mca_mpool_base_registration_t),
|
||||||
0,opal_cache_line_size,
|
0,opal_cache_line_size,
|
||||||
0, -1, 32, NULL);
|
0, -1, 32, NULL);
|
||||||
OBJ_CONSTRUCT(&mpool->mru_list, opal_list_t);
|
OBJ_CONSTRUCT(&mpool->lru_list, opal_list_t);
|
||||||
OBJ_CONSTRUCT(&mpool->gc_list, opal_list_t);
|
OBJ_CONSTRUCT(&mpool->gc_list, opal_list_t);
|
||||||
mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0;
|
mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0;
|
||||||
mpool->stat_cache_found = mpool->stat_cache_notfound = 0;
|
mpool->stat_cache_found = mpool->stat_cache_notfound = 0;
|
||||||
@ -179,6 +182,40 @@ static int register_cache_bypass(mca_mpool_base_module_t *mpool,
|
|||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Evict one registration from the head of the module's LRU list.
 *
 * Takes the first item off mpool_rdma->lru_list, deletes it from the
 * rcache, and deregisters its memory through dereg_mem(). The rcache
 * lock is released around the dereg_mem() call and re-acquired before
 * returning, so callers are presumably expected to hold that lock on
 * entry -- confirm against the call sites.
 *
 * Returns true when a registration was deregistered and handed back to
 * the reg_list free list (stat_evicted is incremented). Returns false
 * when nothing could be taken from the LRU list or when dereg_mem() did
 * not return OMPI_SUCCESS.
 */
static inline bool mca_mpool_rdma_deregister_lru (mca_mpool_base_module_t *mpool) {
|
||||||
|
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t *) mpool;
|
||||||
|
mca_mpool_base_registration_t *old_reg;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
/* Remove the registration from the cache and list before
|
||||||
|
deregistering the memory */
|
||||||
|
old_reg = (mca_mpool_base_registration_t*)
|
||||||
|
opal_list_remove_first (&mpool_rdma->lru_list);
|
||||||
|
if (NULL == old_reg) { /* no item was removed from the LRU list: nothing to evict */
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
mpool->rcache->rcache_delete(mpool->rcache, old_reg);
|
||||||
|
|
||||||
|
/* Drop the rcache lock while we deregister the memory */
|
||||||
|
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
||||||
|
rc = dereg_mem(mpool, old_reg);
|
||||||
|
OPAL_THREAD_LOCK(&mpool->rcache->lock);
|
||||||
|
|
||||||
|
/* This introduces a potential leak of registrations if
|
||||||
|
the deregistration fails to occur as we no longer have
|
||||||
|
a reference to it. Is this possible? */
|
||||||
|
if (OMPI_SUCCESS != rc) { /* old_reg is already off the LRU list and out of the rcache, and is not returned to reg_list */
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Deregistration succeeded: hand the registration object back to the free list */
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
|
||||||
|
(ompi_free_list_item_t*)old_reg);
|
||||||
|
mpool_rdma->stat_evicted++;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* register memory
|
* register memory
|
||||||
*/
|
*/
|
||||||
@ -215,7 +252,7 @@ int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
((*reg)->base == base && (*reg)->bound == bound))) {
|
((*reg)->base == base && (*reg)->bound == bound))) {
|
||||||
if(0 == (*reg)->ref_count &&
|
if(0 == (*reg)->ref_count &&
|
||||||
mca_mpool_rdma_component.leave_pinned) {
|
mca_mpool_rdma_component.leave_pinned) {
|
||||||
opal_list_remove_item(&mpool_rdma->mru_list,
|
opal_list_remove_item(&mpool_rdma->lru_list,
|
||||||
(opal_list_item_t*)(*reg));
|
(opal_list_item_t*)(*reg));
|
||||||
}
|
}
|
||||||
mpool_rdma->stat_cache_hit++;
|
mpool_rdma->stat_cache_hit++;
|
||||||
@ -256,35 +293,10 @@ int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg,
|
while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg,
|
||||||
mca_mpool_rdma_component.rcache_size_limit)) ==
|
mca_mpool_rdma_component.rcache_size_limit)) ==
|
||||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
|
OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
|
||||||
mca_mpool_base_registration_t *old_reg;
|
|
||||||
/* try to remove one unused reg and retry */
|
/* try to remove one unused reg and retry */
|
||||||
old_reg = (mca_mpool_base_registration_t*)
|
if (!mca_mpool_rdma_deregister_lru (mpool)) {
|
||||||
opal_list_get_last(&mpool_rdma->mru_list);
|
|
||||||
if(opal_list_get_end(&mpool_rdma->mru_list) !=
|
|
||||||
(opal_list_item_t*)old_reg) {
|
|
||||||
|
|
||||||
/* Remove the registration from the cache and list before
|
|
||||||
deregistering the memory */
|
|
||||||
mpool->rcache->rcache_delete(mpool->rcache, old_reg);
|
|
||||||
opal_list_remove_item(&mpool_rdma->mru_list,
|
|
||||||
(opal_list_item_t*)old_reg);
|
|
||||||
|
|
||||||
/* Drop the rcache lock while we deregister the memory */
|
|
||||||
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
|
||||||
rc = dereg_mem(mpool, old_reg);
|
|
||||||
OPAL_THREAD_LOCK(&mpool->rcache->lock);
|
|
||||||
|
|
||||||
/* This introduces a potential leak of registrations if
|
|
||||||
the deregistration fails to occur as we no longer have
|
|
||||||
a reference to it. Is this possible? */
|
|
||||||
if(OMPI_SUCCESS == rc) {
|
|
||||||
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
|
|
||||||
(ompi_free_list_item_t*)old_reg);
|
|
||||||
mpool_rdma->stat_evicted++;
|
|
||||||
} else
|
|
||||||
break;
|
|
||||||
} else
|
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(rc != OMPI_SUCCESS) {
|
if(rc != OMPI_SUCCESS) {
|
||||||
@ -293,8 +305,14 @@ int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
|
while (OMPI_ERR_OUT_OF_RESOURCE ==
|
||||||
base, bound - base + 1, rdma_reg);
|
(rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
|
||||||
|
base, bound - base + 1, rdma_reg))) {
|
||||||
|
/* try to remove one unused reg and retry */
|
||||||
|
if (!mca_mpool_rdma_deregister_lru (mpool)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if(rc != OMPI_SUCCESS) {
|
if(rc != OMPI_SUCCESS) {
|
||||||
mpool->rcache->rcache_delete(mpool->rcache, rdma_reg);
|
mpool->rcache->rcache_delete(mpool->rcache, rdma_reg);
|
||||||
@ -358,7 +376,7 @@ int mca_mpool_rdma_find(struct mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
assert(((void*)(*reg)->bound) >= addr);
|
assert(((void*)(*reg)->bound) >= addr);
|
||||||
if(0 == (*reg)->ref_count &&
|
if(0 == (*reg)->ref_count &&
|
||||||
mca_mpool_rdma_component.leave_pinned) {
|
mca_mpool_rdma_component.leave_pinned) {
|
||||||
opal_list_remove_item(&mpool_rdma->mru_list,
|
opal_list_remove_item(&mpool_rdma->lru_list,
|
||||||
(opal_list_item_t*)(*reg));
|
(opal_list_item_t*)(*reg));
|
||||||
}
|
}
|
||||||
mpool_rdma->stat_cache_found++;
|
mpool_rdma->stat_cache_found++;
|
||||||
@ -395,8 +413,8 @@ int mca_mpool_rdma_deregister(struct mca_mpool_base_module_t *mpool,
|
|||||||
if(mca_mpool_rdma_component.leave_pinned && registration_is_cachebale(reg))
|
if(mca_mpool_rdma_component.leave_pinned && registration_is_cachebale(reg))
|
||||||
{
|
{
|
||||||
/* if leave_pinned is set don't deregister memory, but put it
|
/* if leave_pinned is set don't deregister memory, but put it
|
||||||
* on MRU list for future use */
|
* on LRU list for future use */
|
||||||
opal_list_prepend(&mpool_rdma->mru_list, (opal_list_item_t*)reg);
|
opal_list_append(&mpool_rdma->lru_list, (opal_list_item_t*)reg);
|
||||||
} else {
|
} else {
|
||||||
/* Remove from rcache first */
|
/* Remove from rcache first */
|
||||||
if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
|
if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
|
||||||
@ -449,7 +467,7 @@ int mca_mpool_rdma_release_memory(struct mca_mpool_base_module_t *mpool,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_list_remove_item(&mpool_rdma->mru_list,(opal_list_item_t*)reg);
|
opal_list_remove_item(&mpool_rdma->lru_list,(opal_list_item_t*)reg);
|
||||||
opal_list_append(&mpool_rdma->gc_list, (opal_list_item_t*)reg);
|
opal_list_append(&mpool_rdma->gc_list, (opal_list_item_t*)reg);
|
||||||
}
|
}
|
||||||
} while(reg_cnt == RDMA_MPOOL_NREGS);
|
} while(reg_cnt == RDMA_MPOOL_NREGS);
|
||||||
@ -490,7 +508,7 @@ void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
|
|||||||
if(reg->ref_count) {
|
if(reg->ref_count) {
|
||||||
reg->ref_count = 0; /* otherway dereg will fail on assert */
|
reg->ref_count = 0; /* otherway dereg will fail on assert */
|
||||||
} else if (mca_mpool_rdma_component.leave_pinned) {
|
} else if (mca_mpool_rdma_component.leave_pinned) {
|
||||||
opal_list_remove_item(&mpool_rdma->mru_list,
|
opal_list_remove_item(&mpool_rdma->lru_list,
|
||||||
(opal_list_item_t*)reg);
|
(opal_list_item_t*)reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -513,7 +531,7 @@ void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
|
|||||||
}
|
}
|
||||||
} while(reg_cnt == RDMA_MPOOL_NREGS);
|
} while(reg_cnt == RDMA_MPOOL_NREGS);
|
||||||
|
|
||||||
OBJ_DESTRUCT(&mpool_rdma->mru_list);
|
OBJ_DESTRUCT(&mpool_rdma->lru_list);
|
||||||
OBJ_DESTRUCT(&mpool_rdma->gc_list);
|
OBJ_DESTRUCT(&mpool_rdma->gc_list);
|
||||||
OBJ_DESTRUCT(&mpool_rdma->reg_list);
|
OBJ_DESTRUCT(&mpool_rdma->reg_list);
|
||||||
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user