1
1
openmpi/opal/mca/mpool/grdma/mpool_grdma_module.c
Nathan Hjelm 59aa93e1b6 opal/mpool: add support for passing access flags to register
This commit adds an access_flags argument to the mpool registration
function. This flag indicates what kind of access is being requested:
local write, remote read, remote write, and remote atomic. The values
of the registration access flags in the btl are tied to the new flags
in the mpool. All mpools have been updated to include the new argument
but only the grdma and udreg mpools have been updated to make use of
the access flags. In both mpools existing registrations are checked
for sufficient access before being returned. If a registration does
not contain sufficient access it is marked as invalid and a new
registration is generated.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
2015-10-05 13:53:55 -06:00

593 lines
21 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
#include "opal_config.h"
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include "opal/align.h"
#include "opal/util/proc.h"
#if OPAL_CUDA_GDR_SUPPORT
#include "opal/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_GDR_SUPPORT */
#include "opal/mca/rcache/rcache.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/mpool/base/base.h"
#include "mpool_grdma.h"
/* A registration may be kept in the cache (LRU) only when the component's
 * leave_pinned option is active and the registration is not flagged as
 * cache-bypassing, persistent, or invalidated. */
static inline bool registration_is_cacheable(mca_mpool_base_registration_t *reg)
{
    const uint32_t uncacheable_flags = MCA_MPOOL_FLAGS_CACHE_BYPASS |
        MCA_MPOOL_FLAGS_PERSIST | MCA_MPOOL_FLAGS_INVALID;

    if (!mca_mpool_grdma_component.leave_pinned) {
        return false;
    }

    return 0 == (reg->flags & uncacheable_flags);
}
#if OPAL_CUDA_GDR_SUPPORT
static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size);
#endif /* OPAL_CUDA_GDR_SUPPORT */
/* Object constructor for mca_mpool_grdma_pool_t ("contructor" (sic) — the
 * name is referenced by OBJ_CLASS_INSTANCE below). Zeroes every member past
 * the opal_list_item_t superclass (the OBJ system only initializes the
 * superclass), sets up the LRU and garbage-collection lists, and creates
 * the registration cache configured for the grdma component. */
static void mca_mpool_grdma_pool_contructor (mca_mpool_grdma_pool_t *pool)
{
    /* zero all members after the superclass in one shot */
    memset ((void *)((uintptr_t)pool + sizeof (pool->super)), 0, sizeof (*pool) - sizeof (pool->super));

    OBJ_CONSTRUCT(&pool->lru_list, opal_list_t);
    OBJ_CONSTRUCT(&pool->gc_list, opal_list_t);

    /* rcache flavor is selected by the component's rcache_name MCA parameter */
    pool->rcache = mca_rcache_base_module_create(mca_mpool_grdma_component.rcache_name);
}
/* Object destructor for mca_mpool_grdma_pool_t: tears down the LRU and
 * garbage-collection lists and frees the pool name.
 * NOTE(review): pool->rcache created by the constructor is not destroyed
 * here — presumably released elsewhere; confirm. */
static void mca_mpool_grdma_pool_destructor (mca_mpool_grdma_pool_t *pool)
{
    OBJ_DESTRUCT(&pool->lru_list);
    OBJ_DESTRUCT(&pool->gc_list);

    /* free(NULL) is a no-op, so an unset name is harmless */
    free (pool->pool_name);
}

OBJ_CLASS_INSTANCE(mca_mpool_grdma_pool_t, opal_list_item_t,
                   mca_mpool_grdma_pool_contructor,
                   mca_mpool_grdma_pool_destructor);
/*
* Initializes the mpool module.
*/
/*
 * Initializes the mpool module.
 *
 * Wires the module's function table to the grdma implementations, attaches
 * the (shared, reference-counted) pool, zeroes the statistics counters, and
 * initializes the free list used to hold registration objects. The element
 * size comes from resources.sizeof_reg so transport-specific registration
 * structs fit.
 */
void mca_mpool_grdma_module_init(mca_mpool_grdma_module_t* mpool, mca_mpool_grdma_pool_t *pool)
{
    /* the pool may be shared between modules; take a reference */
    OBJ_RETAIN(pool);
    mpool->pool = pool;

    mpool->super.mpool_component = &mca_mpool_grdma_component.super;
    mpool->super.mpool_base = NULL; /* no base .. */
    mpool->super.mpool_alloc = mca_mpool_grdma_alloc;
    mpool->super.mpool_realloc = mca_mpool_grdma_realloc;
    mpool->super.mpool_free = mca_mpool_grdma_free;
    mpool->super.mpool_register = mca_mpool_grdma_register;
    mpool->super.mpool_find = mca_mpool_grdma_find;
    mpool->super.mpool_deregister = mca_mpool_grdma_deregister;
    mpool->super.mpool_release_memory = mca_mpool_grdma_release_memory;
    mpool->super.mpool_finalize = mca_mpool_grdma_finalize;
    mpool->super.mpool_ft_event = mca_mpool_grdma_ft_event;
    mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM;
    mpool->super.rcache = pool->rcache;

    /* cache statistics, reported at finalize when print_stats is set */
    mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0;
    mpool->stat_cache_found = mpool->stat_cache_notfound = 0;

    /* free list of registration objects (grows on demand, 32 at a time) */
    OBJ_CONSTRUCT(&mpool->reg_list, opal_free_list_t);
    opal_free_list_init (&mpool->reg_list, mpool->resources.sizeof_reg,
                         opal_cache_line_size,
                         OBJ_CLASS(mca_mpool_base_registration_t),
                         0, opal_cache_line_size, 0, -1, 32, NULL, 0,
                         NULL, NULL, NULL);
}
/*
 * Deregister the memory backing reg and, on success, return the
 * registration object to the owning module's free list.
 *
 * Must be called with the rcache lock held. The lock is dropped around the
 * transport deregistration call and re-acquired before returning, so
 * callers must not assume rcache state is unchanged across this call.
 *
 * NOTE(review): on deregistration failure the registration object is NOT
 * returned to the free list — presumably so it is not reused while the
 * transport still knows about it; confirm.
 */
static inline int dereg_mem(mca_mpool_base_registration_t *reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) reg->mpool;
    int rc;

    /* cache-bypass registrations were never inserted into the rcache */
    if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
        reg->mpool->rcache->rcache_delete(reg->mpool->rcache, reg);

    /* Drop the rcache lock before deregistering the memory */
    OPAL_THREAD_UNLOCK(&reg->mpool->rcache->lock);
    rc = mpool_grdma->resources.deregister_mem(mpool_grdma->resources.reg_data,
                                               reg);
    OPAL_THREAD_LOCK(&reg->mpool->rcache->lock);

    if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
        opal_free_list_return (&mpool_grdma->reg_list,
                               (opal_free_list_item_t *) reg);
    }

    return rc;
}
/**
 * allocate function
 *
 * Allocate at least @size bytes aligned to @align (0 selects the system
 * page size) and register the buffer with this mpool. Returns the usable
 * (aligned) address, or NULL on allocation or registration failure; on
 * success the new registration is stored in *reg and the raw allocation
 * base is recorded in (*reg)->alloc_base so mca_mpool_grdma_free() can
 * release it later.
 */
void* mca_mpool_grdma_alloc(mca_mpool_base_module_t *mpool, size_t size,
                            size_t align, uint32_t flags, mca_mpool_base_registration_t **reg)
{
    void *base_addr, *addr;

    if(0 == align)
        align = mca_mpool_base_page_size;

#if OPAL_CUDA_SUPPORT
    /* CUDA cannot handle registering overlapping regions, so make
     * sure each region is page sized and page aligned. */
    align = mca_mpool_base_page_size;
    size = OPAL_ALIGN(size, mca_mpool_base_page_size, size_t);
#endif

#ifdef HAVE_POSIX_MEMALIGN
    /* posix_memalign returns the error code rather than setting errno */
    if((errno = posix_memalign(&base_addr, align, size)) != 0)
        return NULL;
    addr = base_addr;
#else
    /* no posix_memalign: over-allocate and align by hand; base_addr is
     * kept (in alloc_base) so the original pointer can still be freed */
    base_addr = malloc(size + align);
    if(NULL == base_addr)
        return NULL;
    addr = (void*)OPAL_ALIGN((uintptr_t)base_addr, align, uintptr_t);
#endif

    if(OPAL_SUCCESS != mca_mpool_grdma_register(mpool, addr, size, flags,
                                                MCA_MPOOL_ACCESS_ANY, reg)) {
        free(base_addr);
        return NULL;
    }

    (*reg)->alloc_base = (unsigned char *) base_addr;

    return addr;
}
/* Deregister every registration queued on the pool's garbage-collection
 * list. This function must be called with the rcache lock held. */
static inline void do_unregistration_gc(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_grdma_module_t *grdma_module = (mca_mpool_grdma_module_t *) mpool;
    opal_list_item_t *item;

    /* detach each registration from the GC list before tearing it down */
    for (item = opal_list_remove_first (&grdma_module->pool->gc_list);
         NULL != item;
         item = opal_list_remove_first (&grdma_module->pool->gc_list)) {
        dereg_mem ((mca_mpool_base_registration_t *) item);
    }
}
/* Evict the least-recently-used idle registration from the given pool.
 * Returns true if a registration was evicted, false when the LRU list is
 * empty. Caller must hold the rcache lock. */
static inline bool mca_mpool_grdma_evict_lru_local (mca_mpool_grdma_pool_t *pool)
{
    mca_mpool_base_registration_t *victim;
    mca_mpool_grdma_module_t *owner;

    victim = (mca_mpool_base_registration_t *) opal_list_remove_first (&pool->lru_list);
    if (NULL == victim) {
        /* nothing left to evict */
        return false;
    }

    /* the pool may be shared; credit the eviction to the owning module */
    owner = (mca_mpool_grdma_module_t *) victim->mpool;
    (void) dereg_mem (victim);

    owner->stat_evicted++;

    return true;
}
/* Eviction-message protocol states.
 * NOTE(review): none of these constants are referenced in this file;
 * presumably used by other grdma sources (or leftover from an earlier
 * cross-process eviction scheme) — confirm before removing. */
enum {
    MCA_MPOOL_GRDMA_MSG_EMPTY      = 0,
    MCA_MPOOL_GRDMA_MSG_NEED_DEREG = 1,
    MCA_MPOOL_GRDMA_MSG_BUSY       = 2,
    MCA_MPOOL_GRDMA_MSG_COMPLETE   = 3
};
bool mca_mpool_grdma_evict (struct mca_mpool_base_module_t *mpool)
{
return mca_mpool_grdma_evict_lru_local (((mca_mpool_grdma_module_t *) mpool)->pool);
}
/*
 * register memory
 *
 * Register [addr, addr + size) with the underlying transport. Unless the
 * caller requests cache bypass or a persistent registration, the rcache is
 * searched first and an existing registration with sufficient access flags
 * is reused (ref_count incremented). If a cached registration exists but
 * lacks the requested access, it is invalidated/deregistered and a new
 * registration covering the union of the access flags is created. On
 * transport resource exhaustion, idle cached registrations are evicted and
 * the registration is retried.
 *
 * Returns OPAL_SUCCESS and stores the registration in *reg, or an OPAL
 * error code (*reg stays NULL).
 */
int mca_mpool_grdma_register (mca_mpool_base_module_t *mpool, void *addr,
                              size_t size, uint32_t flags, int32_t access_flags,
                              mca_mpool_base_registration_t **reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    const bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS);
    const bool persist = !!(flags & MCA_MPOOL_FLAGS_PERSIST);
    mca_mpool_base_registration_t *grdma_reg;
    opal_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    *reg = NULL;

    /* if cache bypass is requested don't use the cache */
    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
                                            mca_mpool_base_page_size_log);

    /* flush any deferred deregistrations before touching the cache */
    if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
        do_unregistration_gc(mpool);

#if OPAL_CUDA_GDR_SUPPORT
    if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
        size_t psize;
        mca_common_cuda_get_address_range(&base, &psize, addr);
        bound = base + psize - 1;
        /* Check to see if this memory is in the cache and if it has been freed. If so,
         * this call will boot it out of the cache. */
        check_for_cuda_freed_memory(mpool, base, psize);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(bypass_cache || persist)) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, &grdma_reg);
        /* BUGFIX: test the cached registration's flags, not the caller's
         * request flags. mca_mpool_grdma_release_memory() marks still-in-use
         * registrations MCA_MPOOL_FLAGS_INVALID while leaving them in the
         * rcache, so rcache_find can return an invalidated registration that
         * must not be reused. The old check inspected `flags` (the request),
         * which never carries MCA_MPOOL_FLAGS_INVALID. */
        if (grdma_reg && !(grdma_reg->flags & MCA_MPOOL_FLAGS_INVALID)) {
            if (OPAL_UNLIKELY((access_flags & grdma_reg->access_flags) != access_flags)) {
                /* insufficient access; re-register with the union of the old
                 * and requested access flags */
                access_flags |= grdma_reg->access_flags;

                if (0 != grdma_reg->ref_count) {
                    if (!(grdma_reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) {
                        grdma_reg->mpool->rcache->rcache_delete(grdma_reg->mpool->rcache, grdma_reg);
                    }

                    /* mark the registration to go away when it is deregistered */
                    grdma_reg->flags |= MCA_MPOOL_FLAGS_INVALID | MCA_MPOOL_FLAGS_CACHE_BYPASS;
                } else {
                    if (registration_is_cacheable (grdma_reg)) {
                        /* pull the item out of the lru */
                        opal_list_remove_item (&mpool_grdma->pool->lru_list, (opal_list_item_t *) grdma_reg);
                    }

                    (void) dereg_mem (grdma_reg);
                }
            } else {
                *reg = grdma_reg;
                if (0 == grdma_reg->ref_count) {
                    /* Leave pinned must be set for this to still be in the rcache. */
                    opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                          (opal_list_item_t *) grdma_reg);
                }

                /* This segment fits fully within an existing segment. */
                mpool_grdma->stat_cache_hit++;
                grdma_reg->ref_count++;
                OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
                return OPAL_SUCCESS;
            }
        }

        mpool_grdma->stat_cache_miss++;

        /* Unless explicitly requested by the caller always store the
         * registration in the rcache. This will speed up the case where
         * no leave pinned protocol is in use but the same segment is in
         * use in multiple simultaneous transactions. We used to set bypass_cache
         * here is !mca_mpool_grdma_component.leave_pinned. */
    }

    item = opal_free_list_get (&mpool_grdma->reg_list);
    if(NULL == item) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    grdma_reg = (mca_mpool_base_registration_t*)item;

    grdma_reg->mpool = mpool;
    grdma_reg->base = base;
    grdma_reg->bound = bound;
    grdma_reg->flags = flags;
    grdma_reg->access_flags = access_flags;
#if OPAL_CUDA_GDR_SUPPORT
    if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
        mca_common_cuda_get_buffer_id(grdma_reg);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    if (false == bypass_cache) {
        rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);

        if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            opal_free_list_return (&mpool_grdma->reg_list, item);
            return rc;
        }
    }

    while (OPAL_ERR_OUT_OF_RESOURCE ==
           (rc = mpool_grdma->resources.register_mem(mpool_grdma->resources.reg_data,
                                                     base, bound - base + 1, grdma_reg))) {
        /* try to remove one unused reg and retry */
        if (!mca_mpool_grdma_evict (mpool)) {
            break;
        }
    }

    if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
        if (false == bypass_cache) {
            mpool->rcache->rcache_delete(mpool->rcache, grdma_reg);
        }
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        opal_free_list_return (&mpool_grdma->reg_list, item);
        return rc;
    }

    *reg = grdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return OPAL_SUCCESS;
}
/**
 * realloc function
 *
 * Allocate a new registered buffer of @size bytes, copy the old contents
 * into it, and release the old buffer/registration. On failure NULL is
 * returned and — mirroring realloc() semantics — the old buffer and
 * registration are left intact, with *reg restored to the old registration.
 *
 * Fixes over the previous version:
 *  - new_mem was not checked for NULL, so a failed allocation dereferenced
 *    NULL in memcpy (and *reg was left clobbered).
 *  - the copy length was the full page-aligned registered extent
 *    (bound - base + 1), which could overrun the new buffer when shrinking
 *    and read before/after the caller's data (base is addr aligned DOWN).
 */
void* mca_mpool_grdma_realloc(mca_mpool_base_module_t *mpool, void *addr,
                              size_t size, mca_mpool_base_registration_t **reg)
{
    mca_mpool_base_registration_t *old_reg = *reg;
    size_t copy_size;
    void *new_mem;

    new_mem = mca_mpool_grdma_alloc(mpool, size, 0, old_reg->flags, reg);
    if (NULL == new_mem) {
        /* allocation/registration failed; keep the old buffer valid and
         * visible to the caller */
        *reg = old_reg;
        return NULL;
    }

    /* copy no more than the new size, and no more than the registered
     * bytes at/after addr */
    copy_size = (size_t) (old_reg->bound - (unsigned char *) addr + 1);
    if (copy_size > size) {
        copy_size = size;
    }
    memcpy(new_mem, addr, copy_size);

    mca_mpool_grdma_free(mpool, addr, old_reg);

    return new_mem;
}
/**
 * free function
 *
 * Release a buffer obtained from mca_mpool_grdma_alloc/realloc: drop the
 * caller's reference on the registration, then free the raw allocation.
 * alloc_base (the unaligned malloc/posix_memalign pointer) is captured
 * before deregistering because the registration object may be recycled by
 * mca_mpool_grdma_deregister().
 */
void mca_mpool_grdma_free(mca_mpool_base_module_t *mpool, void *addr,
                          mca_mpool_base_registration_t *registration)
{
    void *alloc_base = registration->alloc_base;
    mca_mpool_grdma_deregister(mpool, registration);
    free(alloc_base);
}
/*
 * Look up an existing registration covering [addr, addr + size) without
 * creating one. A cached registration is accepted when leave_pinned is
 * active, the registration is persistent, or it exactly matches the
 * page-aligned extent; an accepted registration has its ref_count
 * incremented (and is pulled off the LRU if it was idle).
 *
 * NOTE(review): on rejection *reg may still hold the found registration
 * (no reference taken) — callers presumably check rc/flags; confirm.
 */
int mca_mpool_grdma_find(struct mca_mpool_base_module_t *mpool, void *addr,
                         size_t size, mca_mpool_base_registration_t **reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    unsigned char *base, *bound;
    int rc;

    /* search on page-aligned bounds, matching how registrations are stored */
    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
                                            mca_mpool_base_page_size_log);

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    rc = mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, reg);
    if(NULL != *reg &&
       (mca_mpool_grdma_component.leave_pinned ||
        ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
        ((*reg)->base == base && (*reg)->bound == bound))) {
        assert(((void*)(*reg)->bound) >= addr);

        /* an idle leave-pinned registration sits on the LRU; remove it
         * now that it is referenced again */
        if(0 == (*reg)->ref_count &&
           mca_mpool_grdma_component.leave_pinned) {
            opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                  (opal_list_item_t*)(*reg));
        }
        mpool_grdma->stat_cache_found++;
        (*reg)->ref_count++;
    } else {
        mpool_grdma->stat_cache_notfound++;
    }

    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    return rc;
}
/*
 * Drop one reference on a registration. When the count reaches zero the
 * registration is either parked on the pool's LRU (if cacheable under
 * leave_pinned) or deregistered immediately. Returns OPAL_SUCCESS, or the
 * transport's error code from the final deregistration.
 */
int mca_mpool_grdma_deregister(struct mca_mpool_base_module_t *mpool,
                               mca_mpool_base_registration_t *reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool;
    int rc = OPAL_SUCCESS;
    assert(reg->ref_count > 0);

    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    reg->ref_count--;
    if(reg->ref_count > 0) {
        /* still in use elsewhere; nothing more to do */
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OPAL_SUCCESS;
    }

    if (registration_is_cacheable(reg)) {
        /* keep pinned: stash on the LRU for later reuse or eviction */
        opal_list_append(&mpool_grdma->pool->lru_list, (opal_list_item_t *) reg);
    } else {
        rc = dereg_mem (reg);
    }
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return rc;
}
/* batch size for rcache_find_all sweeps */
#define GRDMA_MPOOL_NREGS 100

/*
 * Invalidate all cached registrations overlapping [base, base + size) —
 * called when the underlying memory is being returned to the OS. Idle
 * registrations are moved from the LRU to the garbage-collection list for
 * deferred deregistration; registrations still in use are only marked
 * MCA_MPOOL_FLAGS_INVALID (they remain in the rcache until their last
 * reference is dropped) and OPAL_ERROR is returned to signal the caller.
 */
int mca_mpool_grdma_release_memory(struct mca_mpool_base_module_t *mpool,
                                   void *base, size_t size)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool;
    mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS];
    int reg_cnt, i, rc = OPAL_SUCCESS;

    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    /* sweep in batches of GRDMA_MPOOL_NREGS until fewer than a full batch
     * is returned */
    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, base, size,
                                                 regs, GRDMA_MPOOL_NREGS);

        for(i = 0 ; i < reg_cnt ; ++i) {
            regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID;
            if (regs[i]->ref_count) {
                /* memory is being freed, but there are registration in use that
                 * covers the memory. This can happen even in a correct program,
                 * but may also be an user error. We can't tell. Mark the
                 * registration as invalid. It will not be used any more and
                 * will be unregistered when ref_count will become zero */
                rc = OPAL_ERROR; /* tell caller that something was wrong */
            } else {
                opal_list_remove_item(&mpool_grdma->pool->lru_list,(opal_list_item_t *) regs[i]);
                opal_list_append(&mpool_grdma->pool->gc_list, (opal_list_item_t *) regs[i]);
            }
        }
    } while(reg_cnt == GRDMA_MPOOL_NREGS);

    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    return rc;
}
/* Make sure this registration request is not stale. In other words, ensure
 * that we do not have a cuMemAlloc, cuMemFree, cuMemAlloc state. If we do,
 * kick out the registrations and deregister. This function needs to be called
 * with the mpool->rcache->lock held and returns with it still held.
 *
 * Returns OPAL_SUCCESS when nothing was stale (or nothing was cached), or
 * OPAL_ERROR when a stale registration is still referenced and could only
 * be marked invalid. */
#if OPAL_CUDA_GDR_SUPPORT
static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool;
    mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS];
    int reg_cnt, i, rc = OPAL_SUCCESS;
    mca_mpool_base_registration_t *reg;

    mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg);

    /* nothing cached for this range */
    if (NULL == reg) {
        return OPAL_SUCCESS;
    }

    /* If not previously freed memory, just return 0 */
    if (!(mca_common_cuda_previously_freed_memory(reg))) {
        return OPAL_SUCCESS;
    }

    /* This memory has been freed.  Find all registrations and delete. Sweep
     * in batches until fewer than a full batch is returned. */
    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, reg->base, reg->bound - reg->base + 1,
                                                 regs, GRDMA_MPOOL_NREGS);

        for(i = 0 ; i < reg_cnt ; ++i) {
            regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID;
            if (regs[i]->ref_count) {
                opal_output(0, "Release FAILED: ref_count=%d, base=%p, bound=%p, size=%d",
                            regs[i]->ref_count, regs[i]->base, regs[i]->bound,
                            (int) (regs[i]->bound - regs[i]->base + 1));
                /* memory is being freed, but there are registration in use that
                 * covers the memory. This can happen even in a correct program,
                 * but may also be an user error. We can't tell. Mark the
                 * registration as invalid. It will not be used any more and
                 * will be unregistered when ref_count will become zero */
                rc = OPAL_ERROR; /* tell caller that something was wrong */
            } else {
                opal_list_remove_item(&mpool_grdma->pool->lru_list,(opal_list_item_t *) regs[i]);
                /* Now deregister.  Do not use gc_list as we need to kick this out now. */
                dereg_mem(regs[i]);
            }
        }
    } while(reg_cnt == GRDMA_MPOOL_NREGS);

    /* BUGFIX: removed a stray OPAL_THREAD_UNLOCK(&mpool->rcache->lock) that
     * was here. This function is called from mca_mpool_grdma_register() with
     * the rcache lock held and the caller unlocks it later itself; the two
     * early-return paths above already return with the lock held. Unlocking
     * here led to the caller releasing a lock it no longer owned. */

    return rc;
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
/*
 * Tear down the mpool module: optionally print cache statistics, flush the
 * garbage-collection list, forcibly deregister every remaining registration
 * (zeroing leftover ref_counts so dereg_mem's invariants hold), release the
 * shared pool reference, destroy the free list, and free the module itself.
 */
void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS];
    int reg_cnt, i;

    /* Statistic */
    if (true == mca_mpool_grdma_component.print_stats) {
        opal_output(0, "%s grdma: stats "
                    "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n",
                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                    mpool_grdma->stat_cache_hit, mpool_grdma->stat_cache_miss,
                    mpool_grdma->stat_cache_found, mpool_grdma->stat_cache_notfound,
                    mpool_grdma->stat_evicted);
    }

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    do_unregistration_gc(mpool);

    /* sweep the entire address range in batches */
    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
                                                 regs, GRDMA_MPOOL_NREGS);

        for (i = 0 ; i < reg_cnt ; ++i) {
            if (regs[i]->ref_count) {
                regs[i]->ref_count = 0; /* otherwise dereg will fail on assert */
            } else if (mca_mpool_grdma_component.leave_pinned) {
                /* idle leave-pinned registrations live on the LRU */
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t *) regs[i]);
            }

            (void) dereg_mem(regs[i]);
        }
    } while (reg_cnt == GRDMA_MPOOL_NREGS);

    OBJ_RELEASE(mpool_grdma->pool);

    OBJ_DESTRUCT(&mpool_grdma->reg_list);
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    /* this mpool was allocated by grdma_init in mpool_grdma_component.c */
    free(mpool);
}
/* Fault-tolerance event hook: this mpool takes no action on FT events. */
int mca_mpool_grdma_ft_event(int state) {
    (void) state; /* unused */
    return OPAL_SUCCESS;
}