1
1
openmpi/ompi/mca/mpool/udreg/mpool_udreg_module.c
Nathan Hjelm 24a7e7aa34 Add support for the udreg registration cache and dynamics on XE/XK/XC.
To support the new mpool two changes were made to the mpool infrastructure:

 1) Added an mpool flag to indicate that an mpool does not need the memory
    hooks to use the leave pinned protocols. This flag is checked in the
    mpool lookup.

 2) Add a mpool context to the base registration. This new member is used
    by the udreg mpool to store the udreg context associated with the
    particular registration. The new member will not break the ABI
    compatibility as the new member is only currently used by the udreg
    mpool.

Dynamics support for Cray systems makes use of the global rank provided by
orte to give the ugni library a unique rank for each process. Dynamics
support is not available under direct-launch (srun.)

cmr=v1.7.4

This commit was SVN r29719.
2013-11-18 04:58:37 +00:00

489 строки
15 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
#include "ompi_config.h"
#include "opal/align.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "mpool_udreg.h"
#include <errno.h>
#include <string.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "ompi/mca/mpool/base/base.h"
#include "ompi/runtime/params.h"
#include "opal/include/opal_stdint.h"
#include <fcntl.h>
#include <udreg_pub.h>
#include <sys/mman.h>
static void *mca_mpool_udreg_reg_func (void *addr, uint64_t len, void *reg_context);
static uint32_t mca_mpool_udreg_dereg_func (void *device_data, void *dreg_context);
static void mca_mpool_udreg_hugepage_constructor (mca_mpool_udreg_hugepage_t *huge_page)
{
memset ((char *)huge_page + sizeof(huge_page->super), 0, sizeof (*huge_page) - sizeof (huge_page->super));
OBJ_CONSTRUCT(&huge_page->allocations, opal_list_t);
}
static void mca_mpool_udreg_hugepage_destructor (mca_mpool_udreg_hugepage_t *huge_page)
{
opal_list_item_t *item;
if (huge_page->path) {
free (huge_page->path);
}
while (NULL != (item = opal_list_remove_first (&huge_page->allocations))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&huge_page->allocations);
}
OBJ_CLASS_INSTANCE(mca_mpool_udreg_hugepage_t, opal_list_item_t,
mca_mpool_udreg_hugepage_constructor,
mca_mpool_udreg_hugepage_destructor);
static void mca_mpool_udreg_hugepage_alloc_constructor (mca_mpool_udreg_hugepage_alloc_t *alloc)
{
memset ((char *)alloc + sizeof(alloc->super), 0, sizeof (*alloc) - sizeof (alloc->super));
alloc->fd = -1;
}
static void mca_mpool_udreg_hugepage_alloc_destructor (mca_mpool_udreg_hugepage_alloc_t *alloc)
{
if (NULL != alloc->ptr) {
munmap (alloc->ptr, alloc->size);
}
if (NULL == alloc->path) {
return;
}
free (alloc->path);
}
OBJ_CLASS_INSTANCE(mca_mpool_udreg_hugepage_alloc_t, opal_list_item_t,
mca_mpool_udreg_hugepage_alloc_constructor,
mca_mpool_udreg_hugepage_alloc_destructor);
static mca_mpool_udreg_hugepage_t *udreg_find_matching_pagesize (size_t size) {
mca_mpool_udreg_hugepage_t *huge_table;
opal_list_item_t *item;
for (item = opal_list_get_first (&mca_mpool_udreg_component.huge_pages) ;
item != opal_list_get_end (&mca_mpool_udreg_component.huge_pages) ;
item = opal_list_get_next (item)) {
huge_table = (mca_mpool_udreg_hugepage_t *) item;
if (huge_table->page_size == size) {
return huge_table;
}
}
return NULL;
}
/*
* Initializes the mpool module.
*/
int mca_mpool_udreg_module_init(mca_mpool_udreg_module_t* mpool)
{
struct udreg_cache_attr cache_attr;
int urc;
mpool->super.mpool_component = &mca_mpool_udreg_component.super;
mpool->super.mpool_base = NULL; /* no base .. */
mpool->super.mpool_alloc = mca_mpool_udreg_alloc;
mpool->super.mpool_realloc = mca_mpool_udreg_realloc;
mpool->super.mpool_free = mca_mpool_udreg_free;
mpool->super.mpool_register = mca_mpool_udreg_register;
mpool->super.mpool_find = mca_mpool_udreg_find;
mpool->super.mpool_deregister = mca_mpool_udreg_deregister;
/* This module relies on udreg for notification of memory release */
mpool->super.mpool_release_memory = NULL;
mpool->super.mpool_finalize = mca_mpool_udreg_finalize;
mpool->super.mpool_ft_event = mca_mpool_udreg_ft_event;
mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM | MCA_MPOOL_FLAGS_NO_HOOKS;
if (4096 < mpool->resources.page_size) {
mpool->huge_page = udreg_find_matching_pagesize (mpool->resources.page_size);
} else {
mpool->huge_page = NULL;
}
cache_attr.modes = 0;
/* Create udreg cache */
if (mpool->resources.use_kernel_cache) {
cache_attr.modes |= UDREG_CC_MODE_USE_KERNEL_CACHE;
}
if (mpool->resources.use_evict_w_unreg) {
cache_attr.modes |= UDREG_CC_MODE_USE_EVICT_W_UNREG;
}
if (mca_mpool_udreg_component.leave_pinned) {
cache_attr.modes |= UDREG_CC_MODE_USE_LAZY_DEREG;
}
strncpy (cache_attr.cache_name, mpool->resources.pool_name, UDREG_MAX_CACHENAME_LEN);
cache_attr.max_entries = mpool->resources.max_entries;
cache_attr.debug_mode = 0;
cache_attr.debug_rank = 0;
cache_attr.reg_context = mpool;
cache_attr.dreg_context = mpool;
cache_attr.destructor_context = mpool;
cache_attr.device_reg_func = mca_mpool_udreg_reg_func;
cache_attr.device_dereg_func = mca_mpool_udreg_dereg_func;
cache_attr.destructor_callback = NULL;
/* attempt to create the udreg cache. this will fail if one already exists */
(void) UDREG_CacheCreate (&cache_attr);
urc = UDREG_CacheAccess (mpool->resources.pool_name, (udreg_cache_handle_t *) &mpool->udreg_handle);
if (UDREG_RC_SUCCESS != urc) {
return OMPI_ERROR;
}
OBJ_CONSTRUCT(&mpool->reg_list, ompi_free_list_t);
ompi_free_list_init_new(&mpool->reg_list, mpool->resources.sizeof_reg,
opal_cache_line_size,
OBJ_CLASS(mca_mpool_base_registration_t),
0, opal_cache_line_size, 0, -1, 32, NULL);
return OMPI_SUCCESS;
}
/* udreg callback functions */
static void *mca_mpool_udreg_reg_func (void *addr, uint64_t len, void *reg_context)
{
mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) reg_context;
mca_mpool_base_registration_t *udreg_reg;
ompi_free_list_item_t *item;
int rc;
OMPI_FREE_LIST_GET_MT(&mpool_udreg->reg_list, item);
if (NULL == item) {
return NULL;
}
udreg_reg = (mca_mpool_base_registration_t *) item;
udreg_reg->mpool = reg_context;
udreg_reg->base = addr;
udreg_reg->bound = (void *)((uintptr_t) addr + len);
rc = mpool_udreg->resources.register_mem(mpool_udreg->resources.reg_data,
addr, len, udreg_reg);
if (OMPI_SUCCESS != rc) {
OMPI_FREE_LIST_RETURN_MT(&mpool_udreg->reg_list, item);
mpool_udreg = NULL;
}
return udreg_reg;
}
static uint32_t mca_mpool_udreg_dereg_func (void *device_data, void *dreg_context)
{
mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) dreg_context;
mca_mpool_base_registration_t *udreg_reg = (mca_mpool_base_registration_t *) device_data;
int rc;
rc = mpool_udreg->resources.deregister_mem(mpool_udreg->resources.reg_data, udreg_reg);
if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
OMPI_FREE_LIST_RETURN_MT(&mpool_udreg->reg_list,
(ompi_free_list_item_t *) udreg_reg);
}
/* might be worth printing out a warning if an error occurs here */
return 0;
}
/* */
static int mca_mpool_udreg_alloc_huge (mca_mpool_udreg_module_t *mpool, size_t size,
void **addr, void **base_addr) {
mca_mpool_udreg_hugepage_alloc_t *alloc;
int rc;
alloc = OBJ_NEW(mca_mpool_udreg_hugepage_alloc_t);
alloc->size = size;
rc = asprintf (&alloc->path, "%s/hugepage.openmpi.%d.%d", mpool->huge_page->path,
getpid (), mpool->huge_page->cnt++);
if (0 > rc) {
OBJ_RELEASE(alloc);
return -1;
}
alloc->fd = open (alloc->path, O_RDWR | O_CREAT, 0600);
if (-1 == alloc->fd) {
OBJ_RELEASE(alloc);
return -1;
}
if (0 != ftruncate (alloc->fd, size)) {
close (alloc->fd);
unlink (alloc->path);
OBJ_RELEASE(alloc);
return -1;
}
alloc->ptr = mmap (NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
alloc->fd, 0);
if (NULL == alloc->ptr) {
OBJ_RELEASE(alloc);
return -1;
}
close (alloc->fd);
unlink (alloc->path);
alloc->huge_table = mpool->huge_page;
opal_list_append (&mpool->huge_page->allocations, &alloc->super);
*addr = alloc->ptr;
*base_addr = alloc;
return 0;
}
/**
* allocate function
*/
void* mca_mpool_udreg_alloc(mca_mpool_base_module_t *mpool, size_t size,
size_t align, uint32_t flags, mca_mpool_base_registration_t **reg)
{
mca_mpool_udreg_module_t *udreg_module = (mca_mpool_udreg_module_t *) mpool;
void *base_addr, *addr;
if(0 == align)
align = mca_mpool_base_page_size;
#if OMPI_CUDA_SUPPORT
/* CUDA cannot handle registering overlapping regions, so make
* sure each region is page sized and page aligned. */
align = mca_mpool_base_page_size;
size = OPAL_ALIGN(size, mca_mpool_base_page_size, size_t);
#endif
addr = base_addr = NULL;
if (NULL != udreg_module->huge_page) {
size = OPAL_ALIGN(size, udreg_module->huge_page->page_size, size_t);
mca_mpool_udreg_alloc_huge (udreg_module, size, &addr, &base_addr);
} else {
#ifdef HAVE_POSIX_MEMALIGN
if((errno = posix_memalign(&base_addr, align, size)) != 0)
return NULL;
addr = base_addr;
#else
base_addr = malloc(size + align);
if(NULL == base_addr)
return NULL;
addr = (void*)OPAL_ALIGN((uintptr_t)base_addr, align, uintptr_t);
#endif
}
if(OMPI_SUCCESS != mca_mpool_udreg_register(mpool, addr, size, flags, reg)) {
free(base_addr);
return NULL;
}
(*reg)->alloc_base = (unsigned char *) base_addr;
return addr;
}
bool mca_mpool_udreg_evict (struct mca_mpool_base_module_t *mpool)
{
mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) mpool;
udreg_return_t urc;
urc = UDREG_Evict (mpool_udreg->udreg_handle);
return (UDREG_RC_SUCCESS == urc);
}
/*
* register memory
*/
int mca_mpool_udreg_register(mca_mpool_base_module_t *mpool, void *addr,
size_t size, uint32_t flags,
mca_mpool_base_registration_t **reg)
{
mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) mpool;
mca_mpool_base_registration_t *udreg_reg;
bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS);
udreg_entry_t *udreg_entry;
udreg_return_t urc;
if (false == bypass_cache) {
/* Get a udreg entry for this region */
while (UDREG_RC_ERROR_RESOURCE ==
(urc = UDREG_Register (mpool_udreg->udreg_handle, addr, size, &udreg_entry))) {
/* try to remove one unused reg and retry */
if (!mca_mpool_udreg_evict (mpool)) {
break;
}
}
udreg_reg = (mca_mpool_base_registration_t *) udreg_entry->device_data;
udreg_reg->mpool_context = udreg_entry;
} else {
/* if cache bypass is requested don't use the udreg cache */
while (NULL == (udreg_reg = mca_mpool_udreg_reg_func (addr, size, mpool))) {
/* try to remove one unused reg and retry */
if (!mca_mpool_udreg_evict (mpool)) {
break;
}
}
udreg_reg->mpool_context = NULL;
}
if (NULL == udreg_reg) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
udreg_reg->flags = flags;
*reg = udreg_reg;
(*reg)->ref_count++;
return OMPI_SUCCESS;
}
/**
* realloc function
*/
void* mca_mpool_udreg_realloc(mca_mpool_base_module_t *mpool, void *addr,
size_t size, mca_mpool_base_registration_t **reg)
{
mca_mpool_base_registration_t *old_reg = *reg;
void *new_mem = mca_mpool_udreg_alloc(mpool, size, 0, old_reg->flags, reg);
memcpy(new_mem, addr, old_reg->bound - old_reg->base + 1);
mca_mpool_udreg_free(mpool, addr, old_reg);
return new_mem;
}
static void mca_mpool_udreg_free_huge (mca_mpool_udreg_hugepage_alloc_t *alloc) {
opal_list_remove_item (&alloc->huge_table->allocations, &alloc->super);
OBJ_RELEASE(alloc);
}
/**
* free function
*/
void mca_mpool_udreg_free(mca_mpool_base_module_t *mpool, void *addr,
mca_mpool_base_registration_t *registration)
{
mca_mpool_udreg_module_t *udreg_module = (mca_mpool_udreg_module_t *) mpool;
mca_mpool_udreg_deregister(mpool, registration);
if (udreg_module->huge_page) {
mca_mpool_udreg_free_huge ((mca_mpool_udreg_hugepage_alloc_t *) registration->alloc_base);
} else {
free (registration->alloc_base);
}
}
int mca_mpool_udreg_find(struct mca_mpool_base_module_t *mpool, void *addr,
size_t size, mca_mpool_base_registration_t **reg)
{
*reg = NULL;
return OMPI_ERR_NOT_FOUND;
}
int mca_mpool_udreg_deregister(struct mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg)
{
mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) mpool;
assert(reg->ref_count > 0);
reg->ref_count--;
if (0 == reg->ref_count && reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) {
mca_mpool_udreg_dereg_func (reg, mpool);
} else if (!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) {
UDREG_DecrRefcount (mpool_udreg->udreg_handle, reg->mpool_context);
}
return OMPI_SUCCESS;
}
void mca_mpool_udreg_finalize(struct mca_mpool_base_module_t *mpool)
{
mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t*)mpool;
/* Statistic */
if (true == mca_mpool_udreg_component.print_stats) {
uint64_t hit = 0, miss = 0, evicted = 0;
(void) UDREG_GetStat (mpool_udreg->udreg_handle,
UDREG_STAT_CACHE_HIT, &hit);
(void) UDREG_GetStat (mpool_udreg->udreg_handle,
UDREG_STAT_CACHE_MISS, &miss);
(void) UDREG_GetStat (mpool_udreg->udreg_handle,
UDREG_STAT_CACHE_EVICTED, &evicted);
opal_output(0, "%s udreg: stats (hit/miss/evicted): %" PRIu64 "/%" PRIu64 "/%" PRIu64 "\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hit, miss, evicted);
}
UDREG_CacheRelease (mpool_udreg->udreg_handle);
OBJ_DESTRUCT(&mpool_udreg->reg_list);
}
int mca_mpool_udreg_ft_event(int state) {
return OMPI_SUCCESS;
}