1
1
openmpi/ompi/mca/mpool/rdma/mpool_rdma_module.c
Gleb Natapov 8bfcfa464a Don't call free(), or library functions that may call free() inside (such as
ibv_dereg_mr() for instance) from ptmalloc callback. Call to free() from the
callback causes deadlock. Notice what should be unregistered inside the callback
and do actual cleanup at the next call to mpool->register().

This commit was SVN r17064.
2008-01-08 08:55:42 +00:00

472 строки
16 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#define OMPI_DISABLE_ENABLE_MEM_DEBUG 1
#include "ompi_config.h"
#include "opal/include/opal/align.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
#include <errno.h>
#include <string.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "ompi/mca/rcache/rcache.h"
#include "ompi/mca/rcache/base/base.h"
#include "ompi/mca/mpool/base/base.h"
/*
* Initializes the mpool module.
*/
void mca_mpool_rdma_module_init(mca_mpool_rdma_module_t* mpool)
{
mpool->super.mpool_component = &mca_mpool_rdma_component.super;
mpool->super.mpool_base = NULL; /* no base .. */
mpool->super.mpool_alloc = mca_mpool_rdma_alloc;
mpool->super.mpool_realloc = mca_mpool_rdma_realloc;
mpool->super.mpool_free = mca_mpool_rdma_free;
mpool->super.mpool_register = mca_mpool_rdma_register;
mpool->super.mpool_find = mca_mpool_rdma_find;
mpool->super.mpool_deregister = mca_mpool_rdma_deregister;
mpool->super.mpool_release_memory = mca_mpool_rdma_release_memory;
mpool->super.mpool_finalize = mca_mpool_rdma_finalize;
mpool->super.rcache =
mca_rcache_base_module_create(mca_mpool_rdma_component.rcache_name);
mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM;
OBJ_CONSTRUCT(&mpool->reg_list, ompi_free_list_t);
ompi_free_list_init_new(&mpool->reg_list, mpool->resources.sizeof_reg,
CACHE_LINE_SIZE,
OBJ_CLASS(mca_mpool_base_registration_t),
0,CACHE_LINE_SIZE,
0, -1, 32, NULL);
OBJ_CONSTRUCT(&mpool->mru_list, opal_list_t);
OBJ_CONSTRUCT(&mpool->gc_list, opal_list_t);
mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0;
mpool->stat_cache_found = mpool->stat_cache_notfound = 0;
}
static inline int dereg_mem(mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t *)mpool;
assert(reg->ref_count == 0);
return mpool_rdma->resources.deregister_mem(mpool_rdma->resources.reg_data,
reg);
}
/**
* allocate function
*/
void* mca_mpool_rdma_alloc(mca_mpool_base_module_t *mpool, size_t size,
size_t align, uint32_t flags, mca_mpool_base_registration_t **reg)
{
void *base_addr, *addr;
if(0 == align)
align = mca_mpool_base_page_size;
#ifdef HAVE_POSIX_MEMALIGN
if((errno = posix_memalign(&base_addr, align, size)) != 0)
return NULL;
addr = base_addr;
#else
base_addr = malloc(size + align);
if(NULL == base_addr)
return NULL;
addr = (void*)OPAL_ALIGN((uintptr_t)base_addr, align, uintptr_t);
#endif
if(OMPI_SUCCESS != mca_mpool_rdma_register(mpool, addr, size, flags, reg)) {
free(base_addr);
return NULL;
}
(*reg)->alloc_base = base_addr;
return addr;
}
static void do_unregistration_gc(struct mca_mpool_base_module_t *mpool)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
mca_mpool_base_registration_t *reg;
do {
reg = (mca_mpool_base_registration_t *)
opal_list_remove_first(&mpool_rdma->gc_list);
dereg_mem(mpool, reg);
mpool->rcache->rcache_delete(mpool->rcache, reg);
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
(ompi_free_list_item_t*)reg);
} while(!opal_list_is_empty(&mpool_rdma->gc_list));
}
static int register_cache_bypass(mca_mpool_base_module_t *mpool,
void *addr, size_t size, uint32_t flags,
mca_mpool_base_registration_t **reg)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
mca_mpool_base_registration_t *rdma_reg;
ompi_free_list_item_t *item;
unsigned char *base, *bound;
int rc;
base = down_align_addr(addr, mca_mpool_base_page_size_log);
bound = up_align_addr( (void*) ((char*) addr + size - 1),
mca_mpool_base_page_size_log);
OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc);
if(OMPI_SUCCESS != rc) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return rc;
}
rdma_reg = (mca_mpool_base_registration_t*)item;
rdma_reg->mpool = mpool;
rdma_reg->base = base;
rdma_reg->bound = bound;
rdma_reg->flags = flags;
rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
base, bound - base + 1, rdma_reg);
if(rc != OMPI_SUCCESS) {
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
return rc;
}
*reg = rdma_reg;
(*reg)->ref_count++;
return OMPI_SUCCESS;
}
/*
* register memory
*/
int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
size_t size, uint32_t flags,
mca_mpool_base_registration_t **reg)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
mca_mpool_base_registration_t *rdma_reg;
ompi_free_list_item_t *item;
unsigned char *base, *bound;
int rc;
/* if cache bypass is requested don't use the cache */
if(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) {
return register_cache_bypass(mpool, addr, size, flags, reg);
}
base = down_align_addr(addr, mca_mpool_base_page_size_log);
bound = up_align_addr((void*)((char*) addr + size - 1),
mca_mpool_base_page_size_log);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
if(!opal_list_is_empty(&mpool_rdma->gc_list))
do_unregistration_gc(mpool);
/* look through existing regs if not persistent registration requested.
* Persistent registration are always registered and placed in the cache */
if(!(flags & MCA_MPOOL_FLAGS_PERSIST)) {
/* check to see if memory is registered */
mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
if(*reg != NULL &&
(mca_mpool_rdma_component.leave_pinned ||
((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
((*reg)->base == base && (*reg)->bound == bound))) {
if(0 == (*reg)->ref_count &&
mca_mpool_rdma_component.leave_pinned) {
opal_list_remove_item(&mpool_rdma->mru_list,
(opal_list_item_t*)(*reg));
}
mpool_rdma->stat_cache_hit++;
(*reg)->ref_count++;
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return MPI_SUCCESS;
}
mpool_rdma->stat_cache_miss++;
*reg = NULL; /* in case previous find found something */
/* If no suitable registration is in cache and leave_pinned isn't
* set and size of registration cache is unlimited don't use the cache.
* This is optimisation in case limit is not set. If limit is set we
* have to put registration into the cache to determine when we hit
* memory registration limit.
* NONE: cache is still used for persistent registrations so previous
* find can find something */
if(!mca_mpool_rdma_component.leave_pinned &&
mca_mpool_rdma_component.rcache_size_limit == 0) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return register_cache_bypass(mpool, addr, size, flags, reg);
}
}
OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc);
if(OMPI_SUCCESS != rc) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return rc;
}
rdma_reg = (mca_mpool_base_registration_t*)item;
rdma_reg->mpool = mpool;
rdma_reg->base = base;
rdma_reg->bound = bound;
rdma_reg->flags = flags;
while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg,
mca_mpool_rdma_component.rcache_size_limit)) ==
OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
mca_mpool_base_registration_t *old_reg;
/* try to remove one unused reg and retry */
old_reg = (mca_mpool_base_registration_t*)
opal_list_get_last(&mpool_rdma->mru_list);
if(opal_list_get_end(&mpool_rdma->mru_list) !=
(opal_list_item_t*)old_reg) {
rc = dereg_mem(mpool, old_reg);
if(MPI_SUCCESS == rc) {
mpool->rcache->rcache_delete(mpool->rcache, old_reg);
opal_list_remove_item(&mpool_rdma->mru_list,
(opal_list_item_t*)old_reg);
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
(ompi_free_list_item_t*)old_reg);
mpool_rdma->stat_evicted++;
} else
break;
} else
break;
}
if(rc != OMPI_SUCCESS) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
return rc;
}
rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
base, bound - base + 1, rdma_reg);
if(rc != OMPI_SUCCESS) {
mpool->rcache->rcache_delete(mpool->rcache, rdma_reg);
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
return rc;
}
*reg = rdma_reg;
(*reg)->ref_count++;
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return OMPI_SUCCESS;
}
/**
* realloc function
*/
void* mca_mpool_rdma_realloc(mca_mpool_base_module_t *mpool, void *addr,
size_t size, mca_mpool_base_registration_t **reg)
{
mca_mpool_base_registration_t *old_reg = *reg;
void *new_mem = mca_mpool_rdma_alloc(mpool, size, 0, old_reg->flags, reg);
memcpy(new_mem, addr, old_reg->bound - old_reg->base + 1);
mca_mpool_rdma_free(mpool, addr, old_reg);
return new_mem;
}
/**
* free function
*/
void mca_mpool_rdma_free(mca_mpool_base_module_t *mpool, void *addr,
mca_mpool_base_registration_t *registration)
{
void *alloc_base = registration->alloc_base;
mca_mpool_rdma_deregister(mpool, registration);
free(alloc_base);
}
int mca_mpool_rdma_find(struct mca_mpool_base_module_t *mpool, void *addr,
size_t size, mca_mpool_base_registration_t **reg)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
int rc;
unsigned char *base, *bound;
base = down_align_addr(addr, mca_mpool_base_page_size_log);
bound = up_align_addr((void*)((char*) addr + size - 1),
mca_mpool_base_page_size_log);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
rc = mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
if(*reg != NULL &&
(mca_mpool_rdma_component.leave_pinned ||
((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
((*reg)->base == base && (*reg)->bound == bound))) {
assert(((void*)(*reg)->bound) >= addr);
if(0 == (*reg)->ref_count &&
mca_mpool_rdma_component.leave_pinned) {
opal_list_remove_item(&mpool_rdma->mru_list,
(opal_list_item_t*)(*reg));
}
mpool_rdma->stat_cache_found++;
(*reg)->ref_count++;
} else {
mpool_rdma->stat_cache_notfound++;
}
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return rc;
}
static inline bool registration_is_cachebale(mca_mpool_base_registration_t *reg)
{
return !(reg->flags &
(MCA_MPOOL_FLAGS_CACHE_BYPASS |
MCA_MPOOL_FLAGS_PERSIST |
MCA_MPOOL_FLAGS_INVALID));
}
int mca_mpool_rdma_deregister(struct mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
int rc = OMPI_SUCCESS;
assert(reg->ref_count > 0);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
reg->ref_count--;
if(reg->ref_count > 0) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return OMPI_SUCCESS;
}
if(mca_mpool_rdma_component.leave_pinned && registration_is_cachebale(reg))
{
/* if leave_pinned is set don't deregister memory, but put it
* on MRU list for future use */
opal_list_prepend(&mpool_rdma->mru_list, (opal_list_item_t*)reg);
} else {
rc = dereg_mem(mpool, reg);
if(OMPI_SUCCESS == rc) {
if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
mpool->rcache->rcache_delete(mpool->rcache, reg);
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
(ompi_free_list_item_t*)reg);
}
}
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return rc;
}
#define RDMA_MPOOL_NREGS 100
int mca_mpool_rdma_release_memory(struct mca_mpool_base_module_t *mpool,
void *base, size_t size)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
mca_mpool_base_registration_t *reg;
mca_mpool_base_registration_t *regs[RDMA_MPOOL_NREGS];
int reg_cnt, i, err = 0;
OPAL_THREAD_LOCK(&mpool->rcache->lock);
do {
reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, base, size,
regs, RDMA_MPOOL_NREGS);
for(i = 0; i < reg_cnt; i++) {
reg = regs[i];
reg->flags |= MCA_MPOOL_FLAGS_INVALID;
if(reg->ref_count) {
/* memory is being freed, but there are registration in use that
* covers the memory. This can happen even in a correct program,
* but may also be an user error. We can't tell. Mark the
* registration as invalid. It will not be used any more and
* will be unregistered when ref_count will become zero */
err++; /* tell caller that something was wrong */
continue;
}
opal_list_remove_item(&mpool_rdma->mru_list,(opal_list_item_t*)reg);
opal_list_append(&mpool_rdma->gc_list, (opal_list_item_t*)reg);
}
} while(reg_cnt == RDMA_MPOOL_NREGS);
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return err ? OMPI_ERROR : OMPI_SUCCESS;
}
void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
mca_mpool_base_registration_t *reg;
mca_mpool_base_registration_t *regs[RDMA_MPOOL_NREGS];
int reg_cnt, i;
/* Statistic */
if(true == mca_mpool_rdma_component.print_stats) {
opal_output(0, "%s rdma: stats "
"(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
mpool_rdma->stat_cache_hit, mpool_rdma->stat_cache_miss,
mpool_rdma->stat_cache_found, mpool_rdma->stat_cache_notfound,
mpool_rdma->stat_evicted);
}
OPAL_THREAD_LOCK(&mpool->rcache->lock);
if(!opal_list_is_empty(&mpool_rdma->gc_list))
do_unregistration_gc(mpool);
do {
reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
regs, RDMA_MPOOL_NREGS);
for(i = 0; i < reg_cnt; i++) {
reg = regs[i];
if(reg->ref_count) {
reg->ref_count = 0; /* otherway dereg will fail on assert */
} else if (mca_mpool_rdma_component.leave_pinned) {
opal_list_remove_item(&mpool_rdma->mru_list,
(opal_list_item_t*)reg);
}
if(dereg_mem(mpool, reg) != OMPI_SUCCESS) {
continue;
}
mpool->rcache->rcache_delete(mpool->rcache, reg);
OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
(ompi_free_list_item_t*)reg);
}
} while(reg_cnt == RDMA_MPOOL_NREGS);
OBJ_DESTRUCT(&mpool_rdma->mru_list);
OBJ_DESTRUCT(&mpool_rdma->gc_list);
OBJ_DESTRUCT(&mpool_rdma->reg_list);
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
}