rcache/grdma: fix potential deadlock
This commit fixes a potential deadlock that can occur between the memory hooks and region registation. This deadlock occurs because of a hold and wait error between two mutexes. The first mutex is the VMA lock used to protect internal rcache/grdma structures and the reader/writer lock in the interval tree. In the case of the memory hooks a reader lock is obtained on the interval tree then the VMA lock is obtained to remove the registration from the LRU. In the case of LRU evictions the VMA lock is obtained then the writer lock on the interval tree is obtained. This leads to the deadlock. To fix the issue the code that evicts from the LRU has been updated to only invalidate the registration while the VMA lock is held then remove the registration from the VMA after the lock is released. This should completely eliminate the above deadlock. Signed-off-by: Nathan Hjelm <hjelmn@google.com>
Этот коммит содержится в:
родитель
395e2c9d8f
Коммит
14b6f4931f
@ -19,6 +19,7 @@
|
||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2016 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2020 Google, LLC. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -167,24 +168,56 @@ static inline void do_unregistration_gc (mca_rcache_base_module_t *rcache)
|
||||
dereg_mem ((mca_rcache_base_registration_t *) item);
|
||||
}
|
||||
}
|
||||
|
||||
static inline mca_rcache_base_registration_t *mca_rcache_grdma_remove_lru_head(mca_rcache_grdma_cache_t *cache) {
|
||||
mca_rcache_base_registration_t *old_reg;
|
||||
int32_t old_flags;
|
||||
|
||||
do {
|
||||
opal_mutex_lock (&cache->vma_module->vma_lock);
|
||||
old_reg = (mca_rcache_base_registration_t *) opal_list_remove_first (&cache->lru_list);
|
||||
if (NULL == old_reg) {
|
||||
opal_mutex_unlock (&cache->vma_module->vma_lock);
|
||||
break;
|
||||
}
|
||||
|
||||
do {
|
||||
int32_t new_flags;
|
||||
old_flags = old_reg->flags;
|
||||
/* registration has been selected for removal and is no longer in the LRU. mark it
|
||||
* as such. */
|
||||
new_flags = (old_flags & ~MCA_RCACHE_GRDMA_REG_FLAG_IN_LRU) | MCA_RCACHE_FLAGS_INVALID;
|
||||
if (opal_atomic_compare_exchange_strong_32(&old_reg->flags, &old_flags, new_flags)) {
|
||||
break;
|
||||
}
|
||||
} while (1);
|
||||
opal_mutex_unlock (&cache->vma_module->vma_lock);
|
||||
|
||||
if (old_flags & MCA_RCACHE_FLAGS_INVALID) {
|
||||
/* registration was already invalidated. in this case its fate is being determined
|
||||
* by another thread. */
|
||||
continue;
|
||||
}
|
||||
|
||||
return old_reg;
|
||||
} while (1);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline bool mca_rcache_grdma_evict_lru_local (mca_rcache_grdma_cache_t *cache)
|
||||
{
|
||||
mca_rcache_grdma_module_t *rcache_grdma;
|
||||
mca_rcache_base_registration_t *old_reg;
|
||||
|
||||
opal_mutex_lock (&cache->vma_module->vma_lock);
|
||||
old_reg = (mca_rcache_base_registration_t *)
|
||||
opal_list_remove_first (&cache->lru_list);
|
||||
old_reg = mca_rcache_grdma_remove_lru_head(cache);
|
||||
if (NULL == old_reg) {
|
||||
opal_mutex_unlock (&cache->vma_module->vma_lock);
|
||||
return false;
|
||||
}
|
||||
|
||||
rcache_grdma = (mca_rcache_grdma_module_t *) old_reg->rcache;
|
||||
|
||||
(void) dereg_mem (old_reg);
|
||||
opal_mutex_unlock (&cache->vma_module->vma_lock);
|
||||
|
||||
rcache_grdma->stat_evicted++;
|
||||
|
||||
return true;
|
||||
@ -230,11 +263,9 @@ static inline void mca_rcache_grdma_remove_from_lru (mca_rcache_grdma_module_t *
|
||||
|
||||
/* opal lists are not thread safe at this time so we must lock :'( */
|
||||
opal_mutex_lock (&rcache_grdma->cache->vma_module->vma_lock);
|
||||
|
||||
opal_list_remove_item (&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg);
|
||||
/* clear the LRU flag */
|
||||
grdma_reg->flags &= ~MCA_RCACHE_GRDMA_REG_FLAG_IN_LRU;
|
||||
|
||||
opal_mutex_unlock (&rcache_grdma->cache->vma_module->vma_lock);
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user