
rcache: fix deadlock in multi-threaded environments

This commit fixes several bugs in the registration cache code:

 - Fix a programming error in the grdma invalidation function that can
   cause an infinite loop if more than 100 registrations are
   associated with a munmapped region. This happens because the
   mca_rcache_base_vma_find_all function returns the same 100
   registrations on each call. This has been fixed by adding an
   iterate function to the vma tree interface (see the sketch below).

 - Always obtain the vma lock when needed. This is required because
   there may be other threads in the system even if
   opal_using_threads() is false. Additionally, since it is safe to do
   so (the vma lock is recursive) the vma interface has been made
   thread safe.

 - Avoid calling free() while holding a lock. This avoids race
   conditions with locks held outside the Open MPI code.

Fixes open-mpi/ompi#1654.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
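
For illustration, here is a minimal standalone sketch of the failure mode
described in the first bullet above and of the iterate-style fix. The types
and function names are simplified stand-ins, not the OMPI data structures:

#include <stddef.h>

/* hypothetical stand-in for mca_rcache_base_registration_t */
typedef struct reg { struct reg *next; int ref_count; } reg_t;

/* find_all-style lookup: copies up to n matching entries into out[],
 * always starting again from the head of the range */
static int find_all (reg_t *head, reg_t **out, int n)
{
    int cnt = 0;
    for (reg_t *r = head ; NULL != r && cnt < n ; r = r->next) {
        out[cnt++] = r;
    }
    return cnt;
}

/* buggy pattern: entries that are still in use are only marked invalid and
 * stay in the tree, so once more than 100 registrations match, find_all()
 * returns the same 100 on every call and the loop never terminates */
static void invalidate_buggy (reg_t *head)
{
    reg_t *regs[100];
    int cnt;
    do {
        cnt = find_all (head, regs, 100);
        for (int i = 0 ; i < cnt ; ++i) {
            if (0 == regs[i]->ref_count) {
                /* safe to delete this entry */
            }
            /* else: marked invalid but left in place */
        }
    } while (100 == cnt);
}

/* fixed pattern: one pass with a callback visits each entry exactly once,
 * even when the callback deletes the entry it was handed */
static void invalidate_iterate (reg_t *head, int (*cb) (reg_t *, void *), void *ctx)
{
    for (reg_t *r = head, *next ; NULL != r ; r = next) {
        next = r->next;   /* save now: cb may delete r */
        if (0 != cb (r, ctx)) {
            break;
        }
    }
}
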
This commit is contained in:
Nathan Hjelm 2016-05-16 15:25:26 -06:00
parent 4e21933a74
commit ab8ed177f5
7 changed files with 346 additions and 203 deletions


@@ -539,6 +539,17 @@ static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep)
ep->fifo = NULL;
}
#if OPAL_BTL_VADER_HAVE_XPMEM
static int mca_btl_vader_endpoint_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
{
mca_rcache_base_vma_module_t *vma_module = (mca_rcache_base_vma_module_t *) ctx;
/* otherwise dereg will fail on assert */
reg->ref_count = 0;
(void) mca_rcache_base_vma_delete (vma_module, reg);
return OPAL_SUCCESS;
}
#endif
static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
{
OBJ_DESTRUCT(&ep->pending_frags);
@@ -548,21 +559,11 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
if (ep->segment_data.xpmem.vma_module) {
/* clean out the registration cache */
const int nregs = 100;
mca_rcache_base_registration_t *regs[nregs];
int reg_cnt;
do {
reg_cnt = mca_rcache_base_vma_find_all (ep->segment_data.xpmem.vma_module,
0, (size_t) -1, regs, nregs);
for (int i = 0 ; i < reg_cnt ; ++i) {
/* otherwise dereg will fail on assert */
regs[i]->ref_count = 0;
OBJ_RELEASE(regs[i]);
}
} while (reg_cnt == nregs);
ep->segment_data.xpmem.vma_module = NULL;
(void) mca_rcache_base_vma_iterate (ep->segment_data.xpmem.vma_module,
NULL, (size_t) -1,
mca_btl_vader_endpoint_rcache_cleanup,
(void *) ep->segment_data.xpmem.vma_module);
OBJ_RELEASE(ep->segment_data.xpmem.vma_module);
}
if (ep->segment_base) {


@@ -14,7 +14,7 @@
* Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
@@ -144,6 +144,14 @@ int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module,
return mca_rcache_base_vma_tree_delete (vma_module, reg);
}
int mca_rcache_base_vma_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx)
{
return mca_rcache_base_vma_tree_iterate (vma_module, base, size, callback_fn, ctx);
}
void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size, char *msg)
{


@@ -13,7 +13,7 @@
*
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
@@ -34,6 +34,7 @@
#include "opal_config.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_rb_tree.h"
#include "opal/class/opal_lifo.h"
BEGIN_C_DECLS
@@ -69,6 +70,26 @@ int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module,
void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size, char *msg);
/**
* Iterate over registrations in the specified range.
*
* @param[in] vma_module vma tree
* @param[in] base base address of region
* @param[in] size size of region
* @param[in] callback_fn function to call for each matching registration handle
* @param[in] ctx callback context
*
* The callback will be made with the vma lock held. This is a recursive lock so
* it is still safe to call any vma functions on this vma_module. Keep in mind it
* is only safe to call mca_rcache_base_vma_delete() on the supplied registration
* from the callback. The iteration will terminate if the callback returns anything
* other than OPAL_SUCCESS.
*/
int mca_rcache_base_vma_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx);
END_C_DECLS
#endif /* MCA_RCACHE_BASE_VMA_H */
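
A usage sketch for the new interface (a hypothetical caller; it mirrors the
vader endpoint cleanup in the first hunk of this commit). drop_reg_cb is an
assumed name, not part of the patch:

/* drop every registration in a range; the callback runs with the (recursive)
 * vma lock held, so mca_rcache_base_vma_delete() on the supplied registration
 * is safe from inside it */
static int drop_reg_cb (struct mca_rcache_base_registration_t *reg, void *ctx)
{
    mca_rcache_base_vma_module_t *vma_module = (mca_rcache_base_vma_module_t *) ctx;
    return mca_rcache_base_vma_delete (vma_module, reg);
}

/* walk the whole address space: NULL base with the maximum size */
(void) mca_rcache_base_vma_iterate (vma_module, NULL, (size_t) -1,
                                    drop_reg_cb, (void *) vma_module);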


@@ -258,9 +258,12 @@ mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_v
mca_rcache_base_vma_item_t *vma;
mca_rcache_base_vma_reg_list_item_t *item;
opal_mutex_lock (&vma_module->vma_lock);
vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
mca_rcache_base_vma_tree_node_compare_search);
if (!vma) {
opal_mutex_unlock (&vma_module->vma_lock);
return NULL;
}
@@ -269,12 +272,18 @@ mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_v
continue;
}
if(item->reg->bound >= bound)
if(item->reg->bound >= bound) {
opal_mutex_unlock (&vma_module->vma_lock);
return item->reg;
if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST))
}
if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST)) {
break;
}
}
opal_mutex_unlock (&vma_module->vma_lock);
return NULL;
}
@@ -299,6 +308,8 @@ int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module,
if(opal_list_get_size(&vma_module->vma_list) == 0)
return cnt;
opal_mutex_lock (&vma_module->vma_lock);
do {
mca_rcache_base_vma_item_t *vma;
mca_rcache_base_vma_reg_list_item_t *vma_item;
@@ -316,39 +327,99 @@ int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module,
}
OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
if ((vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID) ||
if (vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID ||
is_reg_in_array (regs, cnt, vma_item->reg)) {
continue;
}
regs[cnt++] = vma_item->reg;
if (cnt == reg_cnt) {
opal_mutex_unlock (&vma_module->vma_lock);
return cnt; /* no space left in the provided array */
}
}
base = (unsigned char *)vma->end + 1;
} while(bound >= base);
} while (bound >= base);
opal_mutex_unlock (&vma_module->vma_lock);
return cnt;
}
static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
{
return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit);
}
static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_module_t *vma_module,
size_t nbytes)
{
vma_module->reg_cur_cache_size += nbytes;
}
int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module, unsigned char *base,
size_t size, int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx)
{
unsigned char *bound = base + size - 1;
mca_rcache_base_vma_item_t *vma;
int rc = OPAL_SUCCESS;
if (opal_list_get_size(&vma_module->vma_list) == 0) {
/* nothing to do */
return OPAL_SUCCESS;
}
opal_mutex_lock (&vma_module->vma_lock);
do {
mca_rcache_base_vma_reg_list_item_t *vma_item, *next;
vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
mca_rcache_base_vma_tree_node_compare_closest);
if (NULL == vma) {
/* base is bigger than any registered memory */
break;
}
if (base < (unsigned char *) vma->start) {
base = (unsigned char *) vma->start;
continue;
}
base = (unsigned char *)vma->end + 1;
/* all the registrations in the vma may be deleted by the callback so keep a
* reference until we are done with it. */
OBJ_RETAIN(vma);
OPAL_LIST_FOREACH_SAFE(vma_item, next, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
rc = callback_fn (vma_item->reg, ctx);
if (OPAL_SUCCESS != rc) {
break;
}
}
OBJ_RELEASE(vma);
if (OPAL_SUCCESS != rc) {
break;
}
} while (bound >= base);
opal_mutex_unlock (&vma_module->vma_lock);
return rc;
}
static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
{
return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit);
}
int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_registration_t *reg, size_t limit)
{
mca_rcache_base_vma_item_t *i;
uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound;
opal_mutex_lock (&vma_module->vma_lock);
i = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree,
(void *) begin, mca_rcache_base_vma_tree_node_compare_closest);
@@ -373,6 +444,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
opal_list_append(&vma_module->vma_list, &vma->super);
begin = vma->end + 1;
mca_rcache_base_vma_add_reg (vma, reg);
opal_mutex_unlock (&vma_module->vma_lock);
return OPAL_SUCCESS;
}
@@ -434,10 +506,14 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super);
}
opal_mutex_unlock (&vma_module->vma_lock);
return OPAL_SUCCESS;
remove:
mca_rcache_base_vma_tree_delete (vma_module, reg);
opal_mutex_unlock (&vma_module->vma_lock);
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
}
@@ -453,17 +529,23 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_registration_t *reg)
{
mca_rcache_base_vma_item_t *vma;
opal_list_t deleted_vmas;
opal_mutex_lock (&vma_module->vma_lock);
vma = (mca_rcache_base_vma_item_t *)
opal_rb_tree_find_with (&vma_module->rb_tree, reg->base,
mca_rcache_base_vma_tree_node_compare_search);
if (!vma) {
opal_mutex_unlock (&vma_module->vma_lock);
return OPAL_ERROR;
}
OBJ_CONSTRUCT(&deleted_vmas, opal_list_t);
while (vma != (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma_module->vma_list)
&& vma->start <= (uintptr_t) reg->bound) {
&& vma->start <= (uintptr_t) reg->bound) {
mca_rcache_base_vma_remove_reg(vma, reg);
if(opal_list_is_empty(&vma->reg_list)) {
@@ -473,7 +555,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_vma_update_byte_count (vma_module,
vma->start - vma->end - 1);
opal_list_remove_item (&vma_module->vma_list, &vma->super);
OBJ_RELEASE(vma);
opal_list_append (&deleted_vmas, &vma->super);
vma = next;
} else {
int merged;
@@ -491,7 +573,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
prev->end = vma->end;
opal_list_remove_item(&vma_module->vma_list, &vma->super);
opal_rb_tree_delete(&vma_module->rb_tree, vma);
OBJ_RELEASE(vma);
opal_list_append (&deleted_vmas, &vma->super);
vma = prev;
merged = 1;
}
@@ -505,7 +587,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
vma->end = next->end;
opal_list_remove_item(&vma_module->vma_list, &next->super);
opal_rb_tree_delete(&vma_module->rb_tree, next);
OBJ_RELEASE(next);
opal_list_append (&deleted_vmas, &next->super);
merged = 1;
}
} while (merged);
@@ -514,6 +596,11 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
}
}
opal_mutex_unlock (&vma_module->vma_lock);
/* actually free vmas now that the lock has been dropped */
OPAL_LIST_DESTRUCT(&deleted_vmas);
return 0;
}
@@ -558,7 +645,7 @@ void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_modu
OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
reg = vma_item->reg;
opal_output(0, " reg: base=%p, bound=%p, ref_count=%d, flags=0x%x",
reg->base, reg->bound, reg->ref_count, reg->flags);
(void *) reg->base, (void *) reg->bound, reg->ref_count, reg->flags);
}
base = (unsigned char *)vma->end + 1;
} while (bound >= base);


@@ -15,7 +15,7 @@
* Copyright (c) 2009 IBM Corporation. All rights reserved.
*
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@@ -106,4 +106,12 @@ void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_modu
unsigned char *base, size_t size, char *msg);
/*
* Iterate over matching registration handles in the tree.
*/
int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx);
#endif /* MCA_RCACHE_BASE_VMA_TREE_H */


@@ -40,7 +40,7 @@ struct mca_rcache_grdma_cache_t {
opal_list_item_t super;
char *cache_name;
opal_list_t lru_list;
opal_list_t gc_list;
opal_lifo_t gc_lifo;
mca_rcache_base_vma_module_t *vma_module;
};
typedef struct mca_rcache_grdma_cache_t mca_rcache_grdma_cache_t;
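
The switch from an opal_list_t gc_list to an opal_lifo_t gc_lifo backs the
third bullet of the commit message: an invalidation triggered from the memory
hooks (possibly inside free()) can queue a registration without taking a lock,
and the queue is drained on the next registration attempt. A sketch of the two
sides, using the opal_lifo calls that appear later in this diff (gc_defer and
gc_drain are assumed names for illustration):

/* producer side -- runs in the memory hooks, possibly inside free(), so it
 * must neither take a lock nor call free(); compare gc_add() below */
static void gc_defer (mca_rcache_grdma_cache_t *cache, mca_rcache_base_registration_t *reg)
{
    opal_lifo_push_atomic (&cache->gc_lifo, (opal_list_item_t *) reg);
}

/* consumer side -- deregisters deferred registrations outside the hooks;
 * compare do_unregistration_gc() below */
static void gc_drain (mca_rcache_grdma_cache_t *cache)
{
    opal_list_item_t *item;
    while (NULL != (item = opal_lifo_pop_atomic (&cache->gc_lifo))) {
        dereg_mem ((mca_rcache_base_registration_t *) item);
    }
}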


@@ -14,7 +14,7 @@
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
*
@@ -75,15 +75,18 @@ static void mca_rcache_grdma_cache_contructor (mca_rcache_grdma_cache_t *cache)
memset ((void *)((uintptr_t)cache + sizeof (cache->super)), 0, sizeof (*cache) - sizeof (cache->super));
OBJ_CONSTRUCT(&cache->lru_list, opal_list_t);
OBJ_CONSTRUCT(&cache->gc_list, opal_list_t);
OBJ_CONSTRUCT(&cache->gc_lifo, opal_lifo_t);
cache->vma_module = mca_rcache_base_vma_module_alloc ();
}
static void mca_rcache_grdma_cache_destructor (mca_rcache_grdma_cache_t *cache)
{
/* clear the lru before releasing the list */
while (NULL != opal_list_remove_first (&cache->lru_list));
OBJ_DESTRUCT(&cache->lru_list);
OBJ_DESTRUCT(&cache->gc_list);
OBJ_DESTRUCT(&cache->gc_lifo);
if (cache->vma_module) {
OBJ_RELEASE(cache->vma_module);
}
@@ -133,34 +136,36 @@ static inline int dereg_mem(mca_rcache_base_registration_t *reg)
rc = rcache_grdma->resources.deregister_mem (rcache_grdma->resources.reg_data, reg);
if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
opal_free_list_return (&rcache_grdma->reg_list,
(opal_free_list_item_t *) reg);
opal_free_list_return_mt (&rcache_grdma->reg_list,
(opal_free_list_item_t *) reg);
}
OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_rcache_base_framework.framework_output,
"registration %p destroyed", (void *) reg));
return rc;
}
/* This function must be called with the rcache lock held */
static inline void do_unregistration_gc (mca_rcache_base_module_t *rcache)
{
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache;
opal_list_item_t *item;
/* Remove registration from garbage collection list
before deregistering it */
while (NULL !=
(item = opal_list_remove_first(&rcache_grdma->cache->gc_list))) {
dereg_mem((mca_rcache_base_registration_t *) item);
/* Remove registration from garbage collection list before deregistering it */
while (NULL != (item = opal_lifo_pop_atomic (&rcache_grdma->cache->gc_lifo))) {
OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_rcache_base_framework.framework_output,
"deleting stale registration %p", (void *) item));
dereg_mem ((mca_rcache_base_registration_t *) item);
}
}
static inline bool mca_rcache_grdma_evict_lru_local (mca_rcache_grdma_cache_t *cache)
{
mca_rcache_grdma_module_t *rcache_grdma;
mca_rcache_base_registration_t *old_reg;
opal_mutex_lock (&cache->vma_module->vma_lock);
old_reg = (mca_rcache_base_registration_t *)
opal_list_remove_first (&cache->lru_list);
opal_mutex_unlock (&cache->vma_module->vma_lock);
if (NULL == old_reg) {
return false;
}
@@ -179,6 +184,63 @@ static bool mca_rcache_grdma_evict (mca_rcache_base_module_t *rcache)
return mca_rcache_grdma_evict_lru_local (((mca_rcache_grdma_module_t *) rcache)->cache);
}
struct mca_rcache_base_find_args_t {
mca_rcache_base_registration_t *reg;
mca_rcache_grdma_module_t *rcache_grdma;
unsigned char *base;
unsigned char *bound;
int access_flags;
};
typedef struct mca_rcache_base_find_args_t mca_rcache_base_find_args_t;
static int mca_rcache_grdma_check_cached (mca_rcache_base_registration_t *grdma_reg, void *ctx)
{
mca_rcache_base_find_args_t *args = (mca_rcache_base_find_args_t *) ctx;
mca_rcache_grdma_module_t *rcache_grdma = args->rcache_grdma;
if ((grdma_reg->flags & MCA_RCACHE_FLAGS_INVALID) || &rcache_grdma->super != grdma_reg->rcache ||
grdma_reg->base > args->base || grdma_reg->bound < args->bound) {
return 0;
}
if (OPAL_UNLIKELY((args->access_flags & grdma_reg->access_flags) != args->access_flags)) {
args->access_flags |= grdma_reg->access_flags;
if (0 != grdma_reg->ref_count) {
if (!(grdma_reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) {
mca_rcache_base_vma_delete (rcache_grdma->cache->vma_module, grdma_reg);
}
/* mark the registration to go away when it is deregistered */
grdma_reg->flags |= MCA_RCACHE_FLAGS_INVALID | MCA_RCACHE_FLAGS_CACHE_BYPASS;
} else {
if (registration_is_cacheable(grdma_reg)) {
opal_list_remove_item (&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg);
}
dereg_mem (grdma_reg);
}
} else {
if (0 == grdma_reg->ref_count) {
/* Leave pinned must be set for this to still be in the rcache. */
opal_list_remove_item(&rcache_grdma->cache->lru_list,
(opal_list_item_t *) grdma_reg);
}
/* This segment fits fully within an existing segment. */
rcache_grdma->stat_cache_hit++;
int32_t ref_cnt = opal_atomic_add_32 (&grdma_reg->ref_count, 1);
OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_rcache_base_framework.framework_output,
"returning existing registration %p. references %d", (void *) grdma_reg, ref_cnt));
args->reg = grdma_reg;
return 1;
}
/* can't use this registration */
return 0;
}
/*
* register memory
*/
@@ -195,15 +257,11 @@ static int mca_rcache_grdma_register (mca_rcache_base_module_t *rcache, void *ad
unsigned int page_size = opal_getpagesize ();
int rc;
OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock);
*reg = NULL;
/* if cache bypass is requested don't use the cache */
base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;
if (!opal_list_is_empty (&rcache_grdma->cache->gc_list))
do_unregistration_gc(rcache);
#if OPAL_CUDA_GDR_SUPPORT
if (flags & MCA_RCACHE_FLAGS_CUDA_GPU_MEM) {
@@ -216,58 +274,30 @@ static int mca_rcache_grdma_register (mca_rcache_base_module_t *rcache, void *ad
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
do_unregistration_gc (rcache);
/* look through existing regs if no persistent registration was requested.
* Persistent registrations are always registered and placed in the cache */
if(!(bypass_cache || persist)) {
if (!(bypass_cache || persist)) {
mca_rcache_base_find_args_t find_args = {.reg = NULL, .rcache_grdma = rcache_grdma,
.base = base, .bound = bound,
.access_flags = access_flags};
/* check to see if memory is registered */
mca_rcache_base_vma_find (rcache_grdma->cache->vma_module, base, bound - base + 1, &grdma_reg);
if (grdma_reg && !(flags & MCA_RCACHE_FLAGS_INVALID)) {
if (OPAL_UNLIKELY((access_flags & grdma_reg->access_flags) != access_flags)) {
access_flags |= grdma_reg->access_flags;
if (0 != grdma_reg->ref_count) {
if (!(grdma_reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) {
mca_rcache_base_vma_delete (rcache_grdma->cache->vma_module, grdma_reg);
}
/* mark the registration to go away when it is deregistered */
grdma_reg->flags |= MCA_RCACHE_FLAGS_INVALID | MCA_RCACHE_FLAGS_CACHE_BYPASS;
} else {
if (registration_is_cacheable (grdma_reg)) {
/* pull the item out of the lru */
opal_list_remove_item (&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg);
}
(void) dereg_mem (grdma_reg);
}
} else {
*reg = grdma_reg;
if (0 == grdma_reg->ref_count) {
/* Leave pinned must be set for this to still be in the rcache. */
opal_list_remove_item(&rcache_grdma->cache->lru_list,
(opal_list_item_t *) grdma_reg);
}
/* This segment fits fully within an existing segment. */
rcache_grdma->stat_cache_hit++;
grdma_reg->ref_count++;
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
return OPAL_SUCCESS;
}
rc = mca_rcache_base_vma_iterate (rcache_grdma->cache->vma_module, base, size,
mca_rcache_grdma_check_cached, (void *) &find_args);
if (1 == rc) {
*reg = find_args.reg;
return OPAL_SUCCESS;
}
rcache_grdma->stat_cache_miss++;
/* get updated access flags */
access_flags = find_args.access_flags;
/* Unless explicitly requested by the caller always store the
* registration in the rcache. This will speed up the case where
* no leave pinned protocol is in use but the same segment is in
* use in multiple simultaneous transactions. We used to set bypass_cache
* here if !mca_rcache_grdma_component.leave_pinned. */
OPAL_THREAD_ADD32((volatile int32_t *) &rcache_grdma->stat_cache_miss, 1);
}
item = opal_free_list_get (&rcache_grdma->reg_list);
item = opal_free_list_get_mt (&rcache_grdma->reg_list);
if(NULL == item) {
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
return OPAL_ERR_OUT_OF_RESOURCE;
}
grdma_reg = (mca_rcache_base_registration_t*)item;
@@ -277,22 +307,13 @@ static int mca_rcache_grdma_register (mca_rcache_base_module_t *rcache, void *ad
grdma_reg->bound = bound;
grdma_reg->flags = flags;
grdma_reg->access_flags = access_flags;
grdma_reg->ref_count = 1;
#if OPAL_CUDA_GDR_SUPPORT
if (flags & MCA_RCACHE_FLAGS_CUDA_GPU_MEM) {
mca_common_cuda_get_buffer_id(grdma_reg);
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
if (false == bypass_cache) {
rc = mca_rcache_base_vma_insert (rcache_grdma->cache->vma_module, grdma_reg, 0);
if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
opal_free_list_return (&rcache_grdma->reg_list, item);
return rc;
}
}
while (OPAL_ERR_OUT_OF_RESOURCE ==
(rc = rcache_grdma->resources.register_mem(rcache_grdma->resources.reg_data,
base, bound - base + 1, grdma_reg))) {
@@ -303,17 +324,30 @@ static int mca_rcache_grdma_register (mca_rcache_base_module_t *rcache, void *ad
}
if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
if (false == bypass_cache) {
mca_rcache_base_vma_delete (rcache_grdma->cache->vma_module, grdma_reg);
}
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
opal_free_list_return (&rcache_grdma->reg_list, item);
opal_free_list_return_mt (&rcache_grdma->reg_list, item);
return rc;
}
if (false == bypass_cache) {
/* Unless explicitly requested by the caller always store the
* registration in the rcache. This will speed up the case where
* no leave pinned protocol is in use but the same segment is in
* use in multiple simultaneous transactions. We used to set bypass_cache
* here if !mca_rcache_grdma_component.leave_pinned. */
rc = mca_rcache_base_vma_insert (rcache_grdma->cache->vma_module, grdma_reg, 0);
if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
rcache_grdma->resources.deregister_mem (rcache_grdma->resources.reg_data, grdma_reg);
opal_free_list_return_mt (&rcache_grdma->reg_list, item);
return rc;
}
}
OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_rcache_base_framework.framework_output,
"created new registration %p for region {%p, %p} with flags 0x%x",
(void *) grdma_reg, base, bound, grdma_reg->flags));
*reg = grdma_reg;
(*reg)->ref_count++;
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
return OPAL_SUCCESS;
}
@@ -329,7 +363,7 @@ static int mca_rcache_grdma_find (mca_rcache_base_module_t *rcache, void *addr,
base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
bound = OPAL_ALIGN_PTR((intptr_t) addr + size - 1, page_size, unsigned char *);
OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock);
opal_mutex_lock (&rcache_grdma->cache->vma_module->vma_lock);
rc = mca_rcache_base_vma_find (rcache_grdma->cache->vma_module, base, bound - base + 1, reg);
if(NULL != *reg &&
@@ -343,12 +377,12 @@
(opal_list_item_t*)(*reg));
}
rcache_grdma->stat_cache_found++;
(*reg)->ref_count++;
opal_atomic_add_32 (&(*reg)->ref_count, 1);
} else {
rcache_grdma->stat_cache_notfound++;
}
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
opal_mutex_unlock (&rcache_grdma->cache->vma_module->vma_lock);
return rc;
}
@@ -357,59 +391,70 @@ static int mca_rcache_grdma_deregister (mca_rcache_base_module_t *rcache,
mca_rcache_base_registration_t *reg)
{
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache;
int rc = OPAL_SUCCESS;
assert(reg->ref_count > 0);
int32_t ref_count;
int rc;
OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock);
reg->ref_count--;
if(reg->ref_count > 0) {
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
opal_mutex_lock (&rcache_grdma->cache->vma_module->vma_lock);
ref_count = opal_atomic_add_32 (&reg->ref_count, -1);
OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_rcache_base_framework.framework_output,
"returning registration %p, remaining references %d", (void *) reg, ref_count));
assert (ref_count >= 0);
if (ref_count > 0) {
opal_mutex_unlock (&rcache_grdma->cache->vma_module->vma_lock);
return OPAL_SUCCESS;
}
if (registration_is_cacheable(reg)) {
opal_list_append(&rcache_grdma->cache->lru_list, (opal_list_item_t *) reg);
} else {
rc = dereg_mem (reg);
opal_mutex_unlock (&rcache_grdma->cache->vma_module->vma_lock);
return OPAL_SUCCESS;
}
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
rc = dereg_mem (reg);
opal_mutex_unlock (&rcache_grdma->cache->vma_module->vma_lock);
return rc;
}
#define GRDMA_RCACHE_NREGS 100
static int gc_add (mca_rcache_base_registration_t *grdma_reg, void *ctx)
{
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) grdma_reg->rcache;
/* unused */
(void) ctx;
if (grdma_reg->flags & MCA_RCACHE_FLAGS_INVALID) {
/* nothing more to do */
return OPAL_SUCCESS;
}
if (grdma_reg->ref_count) {
/* attempted to remove an active registration */
return OPAL_ERROR;
}
/* This may be called from free() so avoid recursively calling into free by just
* shifting this registration into the garbage collection list. The cleanup will
* be done on the next registration attempt. */
if (registration_is_cacheable (grdma_reg)) {
opal_list_remove_item (&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg);
}
grdma_reg->flags |= MCA_RCACHE_FLAGS_INVALID;
opal_lifo_push_atomic (&rcache_grdma->cache->gc_lifo, (opal_list_item_t *) grdma_reg);
return OPAL_SUCCESS;
}
static int mca_rcache_grdma_invalidate_range (mca_rcache_base_module_t *rcache,
void *base, size_t size)
{
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache;
mca_rcache_base_registration_t *regs[GRDMA_RCACHE_NREGS];
int reg_cnt, i, rc = OPAL_SUCCESS;
OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock);
do {
reg_cnt = mca_rcache_base_vma_find_all (rcache_grdma->cache->vma_module, base,
size, regs, GRDMA_RCACHE_NREGS);
for(i = 0 ; i < reg_cnt ; ++i) {
regs[i]->flags |= MCA_RCACHE_FLAGS_INVALID;
if (regs[i]->ref_count) {
/* memory is being freed, but there are registrations in use that
* cover the memory. This can happen even in a correct program,
* but may also be a user error. We can't tell. Mark the
* registration as invalid. It will not be used any more and
* will be unregistered when its ref_count reaches zero */
rc = OPAL_ERROR; /* tell caller that something was wrong */
} else {
opal_list_remove_item(&rcache_grdma->cache->lru_list,(opal_list_item_t *) regs[i]);
opal_list_append(&rcache_grdma->cache->gc_list, (opal_list_item_t *) regs[i]);
}
}
} while (reg_cnt == GRDMA_RCACHE_NREGS);
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
return rc;
return mca_rcache_base_vma_iterate (rcache_grdma->cache->vma_module, base, size, gc_add, NULL);
}
/* Make sure this registration request is not stale. In other words, ensure
@@ -417,11 +462,10 @@ static int mca_rcache_grdma_invalidate_range (mca_rcache_base_module_t *rcache,
* kick out the registrations and deregister. This function needs to be called
* with the rcache->vma_module->vma_lock held. */
#if OPAL_CUDA_GDR_SUPPORT
static int check_for_cuda_freed_memory (mca_rcache_base_module_t *rcache, void *addr, size_t size)
{
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache;
mca_rcache_base_registration_t *regs[GRDMA_RCACHE_NREGS];
int reg_cnt, i, rc = OPAL_SUCCESS;
mca_rcache_base_registration_t *reg;
mca_rcache_base_vma_find (rcache_grdma->cache->vma_module, addr, size, &reg);
@@ -434,45 +478,35 @@ static int check_for_cuda_freed_memory (mca_rcache_base_module_t *rcache, void *
return OPAL_SUCCESS;
}
/* rcache->vma_module->rcache_dump_range(rcache->rcache, 0, (size_t)-1, "Before free"); */
/* This memory has been freed. Find all registrations and delete */
do {
reg_cnt = mca_rcache_base_vma_find_all (rcache_grdma->cache->vma_module, reg->base,
reg->bound - reg->base + 1, regs,
GRDMA_RCACHE_NREGS);
for(i = 0 ; i < reg_cnt ; ++i) {
regs[i]->flags |= MCA_RCACHE_FLAGS_INVALID;
if (regs[i]->ref_count) {
opal_output(0, "Release FAILED: ref_count=%d, base=%p, bound=%p, size=%d",
regs[i]->ref_count, regs[i]->base, regs[i]->bound,
(int) (regs[i]->bound - regs[i]->base + 1));
/* memory is being freed, but there are registrations in use that
* cover the memory. This can happen even in a correct program,
* but may also be a user error. We can't tell. Mark the
* registration as invalid. It will not be used any more and
* will be unregistered when its ref_count reaches zero */
rc = OPAL_ERROR; /* tell caller that something was wrong */
} else {
opal_list_remove_item(&rcache_grdma->cache->lru_list,(opal_list_item_t *) regs[i]);
/* Now deregister. Do not use gc_list as we need to kick this out now. */
dereg_mem(regs[i]);
}
}
} while(reg_cnt == GRDMA_RCACHE_NREGS);
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
/* rcache->rcache->rcache_dump_range(rcache->rcache, 0, (size_t)-1, "After free");*/
return rc;
/* This memory has been freed. Find all registrations covering it and queue them on
* the garbage-collection lifo by passing gc_add as the callback. This is safe because
* the vma lock is recursive and this is only called from register. */
return mca_rcache_base_vma_iterate (rcache_grdma->cache->vma_module, addr, size, gc_add, NULL);
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
static int iterate_dereg_finalize (mca_rcache_base_registration_t *grdma_reg, void *ctx)
{
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) ctx;
if ((mca_rcache_base_module_t *) rcache_grdma != grdma_reg->rcache) {
return 0;
}
if (registration_is_cacheable (grdma_reg)) {
opal_list_remove_item (&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg);
}
/* set the reference count to 0 otherwise dereg will fail on assert */
grdma_reg->ref_count = 0;
return dereg_mem (grdma_reg);
}
static void mca_rcache_grdma_finalize (mca_rcache_base_module_t *rcache)
{
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t*)rcache;
mca_rcache_base_registration_t *regs[GRDMA_RCACHE_NREGS];
int reg_cnt, i;
/* Statistic */
if (true == mca_rcache_grdma_component.print_stats) {
@@ -484,30 +518,14 @@ static void mca_rcache_grdma_finalize (mca_rcache_base_module_t *rcache)
rcache_grdma->stat_evicted);
}
OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock);
do_unregistration_gc (rcache);
do_unregistration_gc(rcache);
do {
reg_cnt = mca_rcache_base_vma_find_all (rcache_grdma->cache->vma_module, 0, (size_t)-1,
regs, GRDMA_RCACHE_NREGS);
for (i = 0 ; i < reg_cnt ; ++i) {
if (regs[i]->ref_count) {
regs[i]->ref_count = 0; /* otherwise dereg will fail on assert */
} else if (mca_rcache_grdma_component.leave_pinned) {
opal_list_remove_item(&rcache_grdma->cache->lru_list,
(opal_list_item_t *) regs[i]);
}
(void) dereg_mem(regs[i]);
}
} while (reg_cnt == GRDMA_RCACHE_NREGS);
(void) mca_rcache_base_vma_iterate (rcache_grdma->cache->vma_module, NULL, (size_t) -1,
iterate_dereg_finalize, (void *) rcache);
OBJ_RELEASE(rcache_grdma->cache);
OBJ_DESTRUCT(&rcache_grdma->reg_list);
OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock);
/* this rcache was allocated by grdma_init in rcache_grdma_component.c */
free(rcache);