
Two new mpools. They are not used now (and by default, not compiled), but they will be soon. They provide support for GPU buffer transfers within a node.

This commit was SVN r26008.
This commit is contained in:
Rolf vandeVaart 2012-02-22 23:32:36 +00:00
parent 94549d024b
commit c7a0ce2755
12 changed files with 1432 additions and 14 deletions
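Background for readers (not part of the commit itself): both new mpools wrap the CUDA 4.1 IPC interface that common_cuda.c already calls into. The gpusm pool exports a memory/event handle on the sending side, and the rgpusm pool opens (and optionally caches) that handle on the receiving side. A minimal sketch of the underlying driver-API mechanism, assuming two processes on the same node that exchange the opaque 64-byte handle out of band, with hypothetical helper names:

#include <cuda.h>
#include <stddef.h>

/* Sender side: export an IPC handle for a device buffer.  The handle is
 * an opaque 64-byte blob that can be shipped to the peer in any control
 * message (this is roughly what cuda_getmemhandle() does). */
int export_gpu_buffer(CUdeviceptr dbuf, CUipcMemHandle *handle_out)
{
    return (CUDA_SUCCESS == cuIpcGetMemHandle(handle_out, dbuf)) ? 0 : -1;
}

/* Receiver side: map the peer's buffer, copy from it, and unmap (roughly
 * what cuda_openmemhandle()/cuda_closememhandle() do). */
int import_and_copy(CUipcMemHandle handle, CUdeviceptr local_dst, size_t len)
{
    CUdeviceptr remote_src;
    if (CUDA_SUCCESS != cuIpcOpenMemHandle(&remote_src, handle,
                                           CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)) {
        return -1;
    }
    if (CUDA_SUCCESS != cuMemcpyDtoD(local_dst, remote_src, len)) {
        cuIpcCloseMemHandle(remote_src);
        return -1;
    }
    return (CUDA_SUCCESS == cuIpcCloseMemHandle(remote_src)) ? 0 : -1;
}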


@@ -355,7 +355,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
 CUdeviceptr pbase;
 size_t psize;
-mca_mpool_rcuda_reg_t *cuda_reg = (mca_mpool_rcuda_reg_t*)newreg;
+mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg;
 /* We should only be there if this is a CUDA device pointer */
 result = cuPointerGetAttribute(&memType,
@@ -416,7 +416,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
 */
 int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg)
 {
-CUDA_DUMP_EVTHANDLE((10, ((mca_mpool_rcuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
+CUDA_DUMP_EVTHANDLE((10, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
 opal_output_verbose(5, mca_common_cuda_output,
 "CUDA: cuda_ungetmemhandle: base=%p",
 reg_data);
@@ -434,7 +434,7 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
 {
 CUresult result;
 CUipcMemHandle memHandle;
-mca_mpool_rcuda_reg_t *cuda_newreg = (mca_mpool_rcuda_reg_t*)newreg;
+mca_mpool_common_cuda_reg_t *cuda_newreg = (mca_mpool_common_cuda_reg_t*)newreg;
 /* Need to copy into memory handle for call into CUDA library. */
 memcpy(&memHandle, cuda_newreg->memHandle, sizeof(memHandle));
@@ -473,7 +473,7 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
 int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg)
 {
 CUresult result;
-mca_mpool_rcuda_reg_t *cuda_reg = (mca_mpool_rcuda_reg_t*)reg;
+mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)reg;
 result = cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
 if (CUDA_SUCCESS != result) {
@ -526,7 +526,7 @@ void mca_common_cuda_destruct_event(uint64_t *event)
* Put remote event on stream to ensure that the the start of the
* copy does not start until the completion of the event.
*/
void mca_common_wait_stream_synchronize(mca_mpool_rcuda_reg_t *rget_reg)
void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
{
CUipcEventHandle evtHandle;
CUevent event;
@@ -724,8 +724,8 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
 * Need to make sure the handle we are retrieving from the cache is still
 * valid. Compare the cached handle to the one received.
 */
-int mca_common_cuda_memhandle_matches(mca_mpool_rcuda_reg_t *new_reg,
-mca_mpool_rcuda_reg_t *old_reg)
+int mca_common_cuda_memhandle_matches(mca_mpool_common_cuda_reg_t *new_reg,
+mca_mpool_common_cuda_reg_t *old_reg)
 {
 if (0 == memcmp(new_reg->memHandle, old_reg->memHandle, sizeof(new_reg->memHandle))) {


@@ -21,28 +21,30 @@
 #define OMPI_MCA_COMMON_CUDA_H
 #include "ompi/mca/btl/btl.h"
-struct mca_mpool_rcuda_reg_t {
+#define MEMHANDLE_SIZE 8
+#define EVTHANDLE_SIZE 8
+struct mca_mpool_common_cuda_reg_t {
 mca_mpool_base_registration_t base;
-uint64_t memHandle[8];
-uint64_t evtHandle[8];
+uint64_t memHandle[MEMHANDLE_SIZE];
+uint64_t evtHandle[EVTHANDLE_SIZE];
 uint64_t event;
 };
-typedef struct mca_mpool_rcuda_reg_t mca_mpool_rcuda_reg_t;
+typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t;
 OMPI_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg);
 OMPI_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg);
-OMPI_DECLSPEC void mca_common_wait_stream_synchronize(mca_mpool_rcuda_reg_t *rget_reg);
+OMPI_DECLSPEC void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg);
 OMPI_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
 struct mca_btl_base_descriptor_t *, int *done);
 OMPI_DECLSPEC int progress_one_cuda_event(struct mca_btl_base_descriptor_t **);
-OMPI_DECLSPEC int mca_common_cuda_memhandle_matches(mca_mpool_rcuda_reg_t *new_reg,
-mca_mpool_rcuda_reg_t *old_reg);
+OMPI_DECLSPEC int mca_common_cuda_memhandle_matches(mca_mpool_common_cuda_reg_t *new_reg,
+mca_mpool_common_cuda_reg_t *old_reg);
 OMPI_DECLSPEC void mca_common_cuda_construct_event_and_handle(uint64_t **event, void **handle);
 OMPI_DECLSPEC void mca_common_cuda_destruct_event(uint64_t *event);
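A note for readers (not part of the diff): the 8-element uint64_t arrays give 64 bytes of storage, which matches the opaque CUipcMemHandle and CUipcEventHandle types (CU_IPC_HANDLE_SIZE in cuda.h), so the handles can be copied in and out with memcpy() as common_cuda.c does. A hypothetical compile-time check along these lines would make that assumption explicit:

#include <cuda.h>
#include <stdint.h>

/* Hypothetical sanity check (mirrors MEMHANDLE_SIZE/EVTHANDLE_SIZE above):
 * the 8 x uint64_t arrays must be large enough to hold the opaque CUDA
 * IPC handle types that are memcpy()'d into them. */
typedef char memhandle_fits[(8 * sizeof(uint64_t) >= sizeof(CUipcMemHandle)) ? 1 : -1];
typedef char evthandle_fits[(8 * sizeof(uint64_t) >= sizeof(CUipcEventHandle)) ? 1 : -1];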

ompi/mca/mpool/gpusm/Makefile.am (new file)

@@ -0,0 +1,57 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_CPPFLAGS = $(mpool_gpusm_CPPFLAGS)
sources = \
mpool_gpusm_module.c \
mpool_gpusm_component.c
if WANT_INSTALL_HEADERS
ompidir = $(includedir)/openmpi/$(subdir)
ompi_HEADERS = mpool_gpusm.h
endif
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_ompi_mpool_gpusm_DSO
component_noinst =
component_install = mca_mpool_gpusm.la
else
component_noinst = libmca_mpool_gpusm.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_mpool_gpusm_la_SOURCES = $(sources)
mca_mpool_gpusm_la_LDFLAGS = -module -avoid-version
mca_mpool_gpusm_la_LIBADD = $(mpool_gpusm_LIBS)
if MCA_ompi_cuda_support
mca_mpool_gpusm_la_LIBADD += \
$(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
endif
noinst_LTLIBRARIES = $(component_noinst)
libmca_mpool_gpusm_la_SOURCES = $(sources)
libmca_mpool_gpusm_la_LDFLAGS = -module -avoid-version
libmca_mpool_gpusm_la_LIBADD = $(mpool_gpusm_LIBS)

ompi/mca/mpool/gpusm/configure.m4 (new file)

@@ -0,0 +1,25 @@
# -*- shell-script -*-
#
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#
# If CUDA support was requested, then build the CUDA memory pools.
# This code checks the variable CUDA_SUPPORT which was set earlier in
# the configure sequence by the opal_configure_options.m4 code.
#
AC_DEFUN([MCA_ompi_mpool_gpusm_CONFIG],[
AC_CONFIG_FILES([ompi/mca/mpool/gpusm/Makefile])
# Use CUDA_SUPPORT which was filled in by the opal configure code.
AS_IF([test "x$CUDA_SUPPORT_41" = "x1"],
[$1],
[$2])
])dnl

ompi/mca/mpool/gpusm/mpool_gpusm.h (new file)

@@ -0,0 +1,103 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_MPOOL_GPUSM_H
#define MCA_MPOOL_GPUSM_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/mca/mpool/mpool.h"
BEGIN_C_DECLS
#define MEMHANDLE_SIZE 8
#define EVTHANDLE_SIZE 8
struct mca_mpool_gpusm_registration_t {
mca_mpool_base_registration_t base;
uint64_t memHandle[MEMHANDLE_SIZE];
uint64_t evtHandle[EVTHANDLE_SIZE];
uint64_t event;
};
typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t);
struct mca_mpool_gpusm_component_t {
mca_mpool_base_component_t super;
};
typedef struct mca_mpool_gpusm_component_t mca_mpool_gpusm_component_t;
OMPI_DECLSPEC extern mca_mpool_gpusm_component_t mca_mpool_gpusm_component;
struct mca_mpool_base_resources_t {
void *reg_data;
size_t sizeof_reg;
int (*register_mem)(void *base, size_t size, mca_mpool_base_registration_t *newreg,
mca_mpool_base_registration_t *hdrreg);
int (*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg);
};
typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t;
struct mca_mpool_gpusm_module_t {
mca_mpool_base_module_t super;
struct mca_mpool_base_resources_t resources;
ompi_free_list_t reg_list;
}; typedef struct mca_mpool_gpusm_module_t mca_mpool_gpusm_module_t;
/*
* Initializes the mpool module.
*/
void mca_mpool_gpusm_module_init(mca_mpool_gpusm_module_t *mpool);
/**
* register block of memory
*/
int mca_mpool_gpusm_register(mca_mpool_base_module_t* mpool, void *addr,
size_t size, uint32_t flags, mca_mpool_base_registration_t **reg);
/**
* deregister memory
*/
int mca_mpool_gpusm_deregister(mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg);
/**
* find registration for a given block of memory
*/
int mca_mpool_gpusm_find(struct mca_mpool_base_module_t* mpool, void* addr,
size_t size, mca_mpool_base_registration_t **reg);
/**
* finalize mpool
*/
void mca_mpool_gpusm_finalize(struct mca_mpool_base_module_t *mpool);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_mpool_gpusm_ft_event(int state);
END_C_DECLS
#endif
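To make the intended call pattern concrete, here is a hypothetical sending-side sketch (not part of this commit) that registers a GPU buffer through this mpool and pulls out the handles that would be shipped to the peer; send_gpu_handles and the way the handles travel are assumptions:

#include "ompi/mca/mpool/gpusm/mpool_gpusm.h"

/* Hypothetical helper: register a device buffer with the gpusm mpool and
 * expose the IPC handles that a BTL would pack into its control message. */
static int send_gpu_handles(mca_mpool_base_module_t *gpusm, void *gpu_buf, size_t len)
{
    mca_mpool_base_registration_t *reg = NULL;
    mca_mpool_gpusm_registration_t *gpusm_reg;
    int rc;

    /* For this pool, find() and register() both just grab a fresh handle. */
    rc = gpusm->mpool_register(gpusm, gpu_buf, len, 0, &reg);
    if (OMPI_SUCCESS != rc || NULL == reg) {
        return rc;
    }
    gpusm_reg = (mca_mpool_gpusm_registration_t *)reg;

    /* gpusm_reg->memHandle and gpusm_reg->evtHandle are the opaque blobs the
     * remote side opens via the rgpusm mpool; how they are transmitted is
     * up to the caller (e.g. inside a fragment header). */
    (void)gpusm_reg->memHandle;
    (void)gpusm_reg->evtHandle;

    /* Once the transfer is complete, return the registration to the free list. */
    return gpusm->mpool_deregister(gpusm, reg);
}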

ompi/mca/mpool/gpusm/mpool_gpusm_component.c (new file)

@@ -0,0 +1,103 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
#include "ompi_config.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "mpool_gpusm.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
/*
* Local functions
*/
static int gpusm_open(void);
static int gpusm_close(void);
static int gpusm_register(void);
static mca_mpool_base_module_t* gpusm_init(struct mca_mpool_base_resources_t* resources);
mca_mpool_gpusm_component_t mca_mpool_gpusm_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
MCA_MPOOL_BASE_VERSION_2_0_0,
"gpusm", /* MCA component name */
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
gpusm_open, /* component open */
gpusm_close,
NULL,
gpusm_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
gpusm_init
}
};
/**
* Component open/close/init/register functions. Most do not do anything,
* but are kept around as placeholders.
*/
static int gpusm_open(void)
{
return OMPI_SUCCESS;
}
static int gpusm_register(void)
{
return OMPI_SUCCESS;
}
static int gpusm_close(void)
{
return OMPI_SUCCESS;
}
static mca_mpool_base_module_t* gpusm_init(struct mca_mpool_base_resources_t *resources)
{
mca_mpool_gpusm_module_t* mpool_module;
mpool_module =
(mca_mpool_gpusm_module_t*)malloc(sizeof(mca_mpool_gpusm_module_t));
mpool_module->resources = *resources;
mca_mpool_gpusm_module_init(mpool_module);
return &mpool_module->super;
}

ompi/mca/mpool/gpusm/mpool_gpusm_module.c (new file)

@@ -0,0 +1,197 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
*
* This file implements a simple memory pool that is used by the GPU
* buffer on the sending side. It just gets a memory handle and event
* handle that can be sent to the remote side which can then use the
* handles to get access to the memory and the event to determine when
* it can start accessing the memory. There is no caching of the
* memory handles as getting new ones is fast. The event handles are
* cached by the cuda_common code.
*/
#include "ompi_config.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/gpusm/mpool_gpusm.h"
#include "ompi/runtime/params.h"
#include "ompi/mca/common/cuda/common_cuda.h"
/**
* Called when the registration free list is created. An event is created
* for each entry.
*/
static void mca_mpool_gpusm_registration_constructor( mca_mpool_gpusm_registration_t *item )
{
mca_common_cuda_construct_event_and_handle((uint64_t **)&item->event,
(void **)&item->evtHandle);
}
/**
* Called when the program is exiting. This destroys the events.
*/
static void mca_mpool_gpusm_registration_destructor( mca_mpool_gpusm_registration_t *item )
{
mca_common_cuda_destruct_event((uint64_t *)item->event);
}
OBJ_CLASS_INSTANCE(mca_mpool_gpusm_registration_t, mca_mpool_base_registration_t,
mca_mpool_gpusm_registration_constructor,
mca_mpool_gpusm_registration_destructor);
/*
* Initializes the mpool module.
*/
void mca_mpool_gpusm_module_init(mca_mpool_gpusm_module_t* mpool)
{
mpool->super.mpool_component = &mca_mpool_gpusm_component.super;
mpool->super.mpool_base = NULL;
mpool->super.mpool_alloc = NULL;
mpool->super.mpool_realloc = NULL;
mpool->super.mpool_free = NULL;
mpool->super.mpool_register = mca_mpool_gpusm_register;
mpool->super.mpool_find = mca_mpool_gpusm_find;
mpool->super.mpool_deregister = mca_mpool_gpusm_deregister;
mpool->super.mpool_release_memory = NULL;
mpool->super.mpool_finalize = mca_mpool_gpusm_finalize;
mpool->super.mpool_ft_event = mca_mpool_gpusm_ft_event;
mpool->super.rcache = NULL;
mpool->super.flags = 0;
mpool->resources.reg_data = NULL;
mpool->resources.sizeof_reg = sizeof(struct mca_mpool_common_cuda_reg_t);
mpool->resources.register_mem = cuda_getmemhandle;
mpool->resources.deregister_mem = cuda_ungetmemhandle;
OBJ_CONSTRUCT(&mpool->reg_list, ompi_free_list_t);
/* Start with 0 entries in the free list since CUDA may not have
* been initialized when this free list is created and there is
* some CUDA specific activities that need to be done. */
ompi_free_list_init_new(&mpool->reg_list, mpool->resources.sizeof_reg,
opal_cache_line_size,
OBJ_CLASS(mca_mpool_gpusm_registration_t),
0,opal_cache_line_size,
0, -1, 64, NULL);
}
/**
* Just go ahead and get a new registration. The find and register
* functions are the same thing for this memory pool.
*/
int mca_mpool_gpusm_find(mca_mpool_base_module_t *mpool, void *addr,
size_t size,
mca_mpool_base_registration_t **reg)
{
return mca_mpool_gpusm_register(mpool, addr, size, 0, reg);
}
/*
* This is the one function that does all the work. It will call into
* the register function to get the memory handle for the sending
* buffer. There is no need to deregister the memory handle so the
* deregister function is a no-op.
*/
int mca_mpool_gpusm_register(mca_mpool_base_module_t *mpool, void *addr,
size_t size, uint32_t flags,
mca_mpool_base_registration_t **reg)
{
mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t*)mpool;
mca_mpool_base_registration_t *gpusm_reg;
ompi_free_list_item_t *item;
unsigned char *base, *bound;
int rc;
/* In spite of the fact we return an error code, the existing code
* checks the registration for a NULL value rather than looking at
* the return code. So, initialize the registration to NULL in
* case we run into a failure. */
*reg = NULL;
base = addr;
bound = (unsigned char *)addr + size - 1;
OMPI_FREE_LIST_GET(&mpool_gpusm->reg_list, item, rc);
if(OMPI_SUCCESS != rc) {
return rc;
}
gpusm_reg = (mca_mpool_base_registration_t*)item;
gpusm_reg->mpool = mpool;
gpusm_reg->base = base;
gpusm_reg->bound = bound;
gpusm_reg->flags = flags;
rc = mpool_gpusm->resources.register_mem(base, size, gpusm_reg, NULL);
if(rc != OMPI_SUCCESS) {
OMPI_FREE_LIST_RETURN(&mpool_gpusm->reg_list, item);
return rc;
}
*reg = gpusm_reg;
(*reg)->ref_count++;
return OMPI_SUCCESS;
}
/*
* Return the registration to the free list.
*/
int mca_mpool_gpusm_deregister(struct mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg)
{
int rc;
mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t *)mpool;
rc = mpool_gpusm->resources.deregister_mem(mpool, reg);
OMPI_FREE_LIST_RETURN(&mpool_gpusm->reg_list, (ompi_free_list_item_t*)reg);
return OMPI_SUCCESS;
}
/**
* Free up the resources.
*/
void mca_mpool_gpusm_finalize(struct mca_mpool_base_module_t *mpool)
{
ompi_free_list_item_t *item;
mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t *)mpool;
/* Need to run the destructor on each item in the free list explicitly.
* The destruction of the free list only runs the destructor on the
* main free list, not each item. */
while (NULL != (item = (ompi_free_list_item_t *)opal_atomic_lifo_pop(&(mpool_gpusm->reg_list.super)))) {
OBJ_DESTRUCT(item);
}
OBJ_DESTRUCT(&mpool_gpusm->reg_list);
return;
}
int mca_mpool_gpusm_ft_event(int state) {
return OMPI_SUCCESS;
}

ompi/mca/mpool/rgpusm/Makefile.am (new file)

@@ -0,0 +1,57 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_CPPFLAGS = $(mpool_rgpusm_CPPFLAGS)
sources = \
mpool_rgpusm_module.c \
mpool_rgpusm_component.c
if WANT_INSTALL_HEADERS
ompidir = $(includedir)/openmpi/$(subdir)
ompi_HEADERS = mpool_rgpusm.h
endif
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_ompi_mpool_rgpusm_DSO
component_noinst =
component_install = mca_mpool_rgpusm.la
else
component_noinst = libmca_mpool_rgpusm.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_mpool_rgpusm_la_SOURCES = $(sources)
mca_mpool_rgpusm_la_LDFLAGS = -module -avoid-version
mca_mpool_rgpusm_la_LIBADD = $(mpool_rgpusm_LIBS)
if MCA_ompi_cuda_support
mca_mpool_rgpusm_la_LIBADD += \
$(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
endif
noinst_LTLIBRARIES = $(component_noinst)
libmca_mpool_rgpusm_la_SOURCES = $(sources)
libmca_mpool_rgpusm_la_LDFLAGS = -module -avoid-version
libmca_mpool_rgpusm_la_LIBADD = $(mpool_rgpusm_LIBS)

ompi/mca/mpool/rgpusm/configure.m4 (new file)

@@ -0,0 +1,25 @@
# -*- shell-script -*-
#
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#
# If CUDA support was requested, then build the CUDA memory pools.
# This code checks the variable CUDA_SUPPORT which was set earlier in
# the configure sequence by the opal_configure_options.m4 code.
#
AC_DEFUN([MCA_ompi_mpool_rgpusm_CONFIG],[
AC_CONFIG_FILES([ompi/mca/mpool/rgpusm/Makefile])
# Use CUDA_SUPPORT which was filled in by the opal configure code.
AS_IF([test "x$CUDA_SUPPORT_41" = "x1"],
[$1],
[$2])
])dnl

ompi/mca/mpool/rgpusm/mpool_rgpusm.h (new file)

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_MPOOL_RGPUSM_H
#define MCA_MPOOL_RGPUSM_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/mca/mpool/mpool.h"
BEGIN_C_DECLS
struct mca_mpool_rgpusm_component_t {
mca_mpool_base_component_t super;
char* rcache_name;
size_t rcache_size_limit;
bool print_stats;
uint32_t leave_pinned;
int output;
};
typedef struct mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component_t;
OMPI_DECLSPEC extern mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component;
struct mca_mpool_base_resources_t {
void *reg_data;
size_t sizeof_reg;
int (*register_mem)(void *base, size_t size, mca_mpool_base_registration_t *newreg,
mca_mpool_base_registration_t *hdrreg);
int (*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg);
};
typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t;
struct mca_mpool_rgpusm_module_t {
mca_mpool_base_module_t super;
struct mca_mpool_base_resources_t resources;
ompi_free_list_t reg_list;
opal_list_t lru_list;
uint32_t stat_cache_hit;
uint32_t stat_cache_valid;
uint32_t stat_cache_invalid;
uint32_t stat_cache_miss;
uint32_t stat_evicted;
uint32_t stat_cache_found;
uint32_t stat_cache_notfound;
}; typedef struct mca_mpool_rgpusm_module_t mca_mpool_rgpusm_module_t;
/*
* Initializes the mpool module.
*/
void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t *mpool);
/**
* register block of memory
*/
int mca_mpool_rgpusm_register(mca_mpool_base_module_t* mpool, void *addr,
size_t size, uint32_t flags, mca_mpool_base_registration_t **reg);
/**
* deregister memory
*/
int mca_mpool_rgpusm_deregister(mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg);
/**
* free memory allocated by alloc function
*/
void mca_mpool_rgpusm_free(mca_mpool_base_module_t *mpool, void * addr,
mca_mpool_base_registration_t *reg);
/**
* find registration for a given block of memory
*/
int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t* mpool, void* addr,
size_t size, mca_mpool_base_registration_t **reg);
/**
* unregister all registration covering the block of memory
*/
int mca_mpool_rgpusm_release_memory(mca_mpool_base_module_t* mpool, void *base,
size_t size);
/**
* finalize mpool
*/
void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_mpool_rgpusm_ft_event(int state);
END_C_DECLS
#endif

@@ -0,0 +1,142 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
#include "ompi_config.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "mpool_rgpusm.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
/*
* Local functions
*/
static int rgpusm_open(void);
static int rgpusm_close(void);
static int rgpusm_register(void);
static mca_mpool_base_module_t* rgpusm_init(struct mca_mpool_base_resources_t* resources);
mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
MCA_MPOOL_BASE_VERSION_2_0_0,
"rgpusm", /* MCA component name */
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
rgpusm_open, /* component open */
rgpusm_close,
NULL,
rgpusm_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
rgpusm_init
}
};
/**
* component open/close/init function
*/
static int rgpusm_open(void)
{
return OMPI_SUCCESS;
}
static int rgpusm_register(void)
{
int val;
mca_base_param_reg_string(&mca_mpool_rgpusm_component.super.mpool_version,
"rcache_name",
"The name of the registration cache the mpool should use",
false, false, "vma", &mca_mpool_rgpusm_component.rcache_name);
mca_base_param_reg_int(&mca_mpool_rgpusm_component.super.mpool_version,
"rcache_size_limit",
"the maximum size of registration cache in bytes. "
"0 is unlimited (default 0)", false, false, 0, &val);
mca_mpool_rgpusm_component.rcache_size_limit = (size_t)val;
mca_base_param_reg_int(&mca_mpool_rgpusm_component.super.mpool_version,
"leave_pinned",
"Whether to keep memory handles around or release them when done. ",
false, false, 1, &val);
mca_mpool_rgpusm_component.leave_pinned = (size_t)val;
mca_base_param_reg_int(&mca_mpool_rgpusm_component.super.mpool_version,
"print_stats",
"print pool usage statistics at the end of the run",
false, false, 0, &val);
mca_mpool_rgpusm_component.print_stats = val?true:false;
/* Set different levels of verbosity in the rgpusm related code. */
mca_base_param_reg_int(&mca_mpool_rgpusm_component.super.mpool_version,
"verbose",
"Set level of mpool rgpusm verbosity",
false, false, 0, &val);
mca_mpool_rgpusm_component.output = opal_output_open(NULL);
opal_output_set_verbosity(mca_mpool_rgpusm_component.output, val);
return OMPI_SUCCESS;
}
static int rgpusm_close(void)
{
if (NULL != mca_mpool_rgpusm_component.rcache_name) {
free(mca_mpool_rgpusm_component.rcache_name);
}
return OMPI_SUCCESS;
}
static mca_mpool_base_module_t* rgpusm_init(
struct mca_mpool_base_resources_t *resources)
{
mca_mpool_rgpusm_module_t* mpool_module;
mpool_module =
(mca_mpool_rgpusm_module_t*)malloc(sizeof(mca_mpool_rgpusm_module_t));
mpool_module->resources = *resources;
mca_mpool_rgpusm_module_init(mpool_module);
return &mpool_module->super;
}

ompi/mca/mpool/rgpusm/mpool_rgpusm_module.c (new file)

@@ -0,0 +1,590 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
*
* This memory pool is used for getting the memory handle of remote
* GPU memory when using CUDA. Hence, the name is "rgpusm" for "remote
* CUDA" GPU memory. There is a cache that can be used to store the
* remote handles in case they are reused to save on the registration
* cost as that can be expensive, on the order of 100 usecs. The
* cache can also be used just to track how many handles are in use at
* a time. It is best to look at this with the four different
* scenarios that are possible.
* 1. mpool_rgpusm_leave_pinned=0, cache_size=unlimited
* 2. mpool_rgpusm_leave_pinned=0, cache_size=limited
* 3. mpool_rgpusm_leave_pinned=1, cache_size=unlimited (default)
* 4. mpool_rgpusm_leave_pinned=1, cache_size=limited.
*
* Case 1: The cache is unused and remote memory is registered and
* unregistered for each transaction. The amount of outstanding
* registered memory is unlimited.
* Case 2: The cache keeps track of how much memory is registered at a
* time. Since leave pinned is 0, any memory that is registered is in
* use. If the amount to register exceeds the limit, we will error
* out. This could be handled more gracefully, but this is not a
* common way to run, so we will leave as is.
* Case 3: The cache is needed to track current and past transactions.
* However, there is no limit on the number that can be stored.
* Therefore, once memory enters the cache, and gets registered, it
* stays that way forever.
* Case 4: The cache is needed to track current and past transactions.
* In addition, a list of most recently used (but no longer in use)
* registrations is stored so that it can be used to evict
* registrations from the cache. In addition, these registrations are
* deregistered.
*
* I also want to capture how we can run into the case where we do not
* find something in the cache, but when we try to register it, we get
* an error back from the CUDA library saying the memory is in use.
* This can happen in the following scenario. The application mallocs
* a buffer of size 32K. The library loads this in the cache and
* registers it. The application then frees the buffer. It then
* mallocs a buffer of size 64K. This malloc returns the same base
* address as the first 32K allocation. The library searches the
* cache, but since the size is larger than the original allocation it
* does not find the registration. It then attempts to register this.
* The CUDA library returns an error saying it is already mapped. To
* handle this, we return an error of OMPI_ERR_WOULD_BLOCK to the
* memory pool. The memory pool then looks for the registration based
* on the base address and a size of 4. We use the small size to make
* sure that we find the registration. This registration is evicted,
* and we try to register again.
*/
#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
#include "ompi_config.h"
#include "opal/align.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "ompi/mca/mpool/rgpusm/mpool_rgpusm.h"
#include <errno.h>
#include <string.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "ompi/mca/rcache/rcache.h"
#include "ompi/mca/rcache/base/base.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/runtime/params.h"
#include "ompi/mca/common/cuda/common_cuda.h"
/* A hack so that page alignment is disabled in my instantiation of
* the rcache. This needs to be fixed. */
static size_t saved_page_size;
#define SET_PAGE_ALIGNMENT_TO_ZERO() \
saved_page_size = mca_mpool_base_page_size_log; \
mca_mpool_base_page_size_log = 0;
#define RESTORE_PAGE_ALIGNMENT() \
mca_mpool_base_page_size_log = saved_page_size;
static inline bool mca_mpool_rgpusm_deregister_lru (mca_mpool_base_module_t *mpool) {
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *) mpool;
mca_mpool_base_registration_t *old_reg;
int rc;
/* Remove the registration from the cache and list before
deregistering the memory */
old_reg = (mca_mpool_base_registration_t*)
opal_list_remove_first (&mpool_rgpusm->lru_list);
if (NULL == old_reg) {
return false;
}
mpool->rcache->rcache_delete(mpool->rcache, old_reg);
/* Drop the rcache lock while we deregister the memory */
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
assert(old_reg->ref_count == 0);
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
old_reg);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
/* This introduces a potential leak of registrations if
the deregistration fails to occur as we no longer have
a reference to it. Is this possible? */
if (OMPI_SUCCESS != rc) {
return false;
}
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list,
(ompi_free_list_item_t*)old_reg);
mpool_rgpusm->stat_evicted++;
return true;
}
/*
* Initializes the mpool module.
*/
void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t* mpool)
{
mpool->super.mpool_component = &mca_mpool_rgpusm_component.super;
mpool->super.mpool_base = NULL; /* no base .. */
mpool->super.mpool_alloc = NULL;
mpool->super.mpool_realloc = NULL;
mpool->super.mpool_free = mca_mpool_rgpusm_free;
mpool->super.mpool_register = mca_mpool_rgpusm_register;
mpool->super.mpool_find = mca_mpool_rgpusm_find;
mpool->super.mpool_deregister = mca_mpool_rgpusm_deregister;
mpool->super.mpool_release_memory = NULL;
mpool->super.mpool_finalize = mca_mpool_rgpusm_finalize;
mpool->super.mpool_ft_event = mca_mpool_rgpusm_ft_event;
mpool->super.rcache =
mca_rcache_base_module_create(mca_mpool_rgpusm_component.rcache_name);
mpool->super.flags = 0;
mpool->resources.reg_data = NULL;
mpool->resources.sizeof_reg = sizeof(struct mca_mpool_common_cuda_reg_t);
mpool->resources.register_mem = cuda_openmemhandle;
mpool->resources.deregister_mem = cuda_closememhandle;
OBJ_CONSTRUCT(&mpool->reg_list, ompi_free_list_t);
ompi_free_list_init_new(&mpool->reg_list, mpool->resources.sizeof_reg,
opal_cache_line_size,
OBJ_CLASS(mca_mpool_base_registration_t),
0,opal_cache_line_size,
0, -1, 32, NULL);
OBJ_CONSTRUCT(&mpool->lru_list, opal_list_t);
mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0;
mpool->stat_cache_found = mpool->stat_cache_notfound = 0;
mpool->stat_cache_valid = mpool->stat_cache_invalid = 0;
}
/*
* This function opens a handle using the handle that was received
* from the remote memory. It uses the addr and size of the remote
* memory for caching the registration.
*/
int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
size_t size, uint32_t flags,
mca_mpool_base_registration_t **reg)
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
mca_mpool_common_cuda_reg_t *rgpusm_reg;
mca_mpool_common_cuda_reg_t *rget_reg;
ompi_free_list_item_t *item;
int rc;
int mypeer; /* just for debugging */
/* In order to preserve the signature of the mca_mpool_rgpusm_register
* function, we are using the **reg variable to not only get back the
* registration information, but to hand in the memory handle received
* from the remote side. */
rget_reg = (mca_mpool_common_cuda_reg_t *)*reg;
mypeer = flags;
flags = 0;
/* No need to support MCA_MPOOL_FLAGS_CACHE_BYPASS in here. It is not used. */
assert(0 == (flags & MCA_MPOOL_FLAGS_CACHE_BYPASS));
/* This chunk of code handles the case where leave pinned is not
* set and we do not use the cache. This is not typically how we
* will be running. This means that one can have an unlimited
* number of registrations occurring at the same time. Since we
* are not leaving the registrations pinned, the number of
* registrations is unlimited and there is no need for a cache. */
if(!mca_mpool_rgpusm_component.leave_pinned && 0 == mca_mpool_rgpusm_component.rcache_size_limit) {
OMPI_FREE_LIST_GET(&mpool_rgpusm->reg_list, item, rc);
if(OMPI_SUCCESS != rc) {
return rc;
}
rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item;
rgpusm_reg->base.mpool = mpool;
rgpusm_reg->base.base = addr;
rgpusm_reg->base.bound = (unsigned char *)addr + size - 1;
rgpusm_reg->base.flags = flags;
/* Copy the memory handle received into the registration */
memcpy(rgpusm_reg->memHandle, rget_reg->memHandle, sizeof(rget_reg->memHandle));
/* The rget_reg registration is holding the memory handle needed
* to register the remote memory. This was received from the remote
* process. A pointer to the memory is returned in the alloc_base field. */
rc = mpool_rgpusm->resources.register_mem(addr, size,
(mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
/* This error should not happen with no cache in use. */
assert(OMPI_ERR_WOULD_BLOCK != rc);
if(rc != OMPI_SUCCESS) {
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
return rc;
}
rgpusm_reg->base.ref_count++;
*reg = (mca_mpool_base_registration_t *)rgpusm_reg;
return OMPI_SUCCESS;
}
/* Check to see if memory is registered and stored in the cache. */
OPAL_THREAD_LOCK(&mpool->rcache->lock);
SET_PAGE_ALIGNMENT_TO_ZERO();
mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
RESTORE_PAGE_ALIGNMENT();
/* If *reg is not NULL, we have a registration. Let us see if the
* memory handle matches the one we were looking for. If not, the
* registration is invalid and needs to be removed. This happens
* if memory was allocated, freed, and allocated again and ends up
* with the same virtual address and within the limits of the
* previous registration. The memory handle check will catch that
* scenario as the handles have unique serial numbers. */
if (*reg != NULL) {
mpool_rgpusm->stat_cache_hit++;
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"Found addr=%p, size=%d (base=%p,size=%d)in cache",
addr, (int)size, (*reg)->base,
(int)((*reg)->bound - (*reg)->base));
if (mca_common_cuda_memhandle_matches((mca_mpool_common_cuda_reg_t *)*reg, rget_reg)) {
/* Registration matches what was requested. All is good. */
mpool_rgpusm->stat_cache_valid++;
} else {
/* This is an old registration. Need to boot it. */
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"Mismatched Handle: Evicting addr=%p, size=%d in cache",
addr, (int)size);
/* The ref_count has to be zero as this memory cannot possibly
* be in use. Assert on that just to make sure. */
assert(0 == (*reg)->ref_count);
if (mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)(*reg));
}
/* Bump the reference count to keep things copacetic in deregister */
(*reg)->ref_count++;
/* Invalidate the registration so it will get booted out. */
(*reg)->flags |= MCA_MPOOL_FLAGS_INVALID;
mca_mpool_rgpusm_deregister(mpool, *reg);
*reg = NULL;
mpool_rgpusm->stat_cache_invalid++;
}
} else {
/* Nothing was found in the cache. */
mpool_rgpusm->stat_cache_miss++;
}
/* If we have a registration here, then we know it is valid. */
if (*reg != NULL) {
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
mypeer, addr, (int)size);
/* When using leave pinned, we keep an LRU list. */
if ((0 == (*reg)->ref_count) && mca_mpool_rgpusm_component.leave_pinned) {
opal_output_verbose(20, mca_mpool_rgpusm_component.output,
"POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
mypeer, addr, (int)size);
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)(*reg));
}
(*reg)->ref_count++;
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
"Found entry in cache addr=%p, size=%d", addr, (int)size);
return OMPI_SUCCESS;
}
/* If we are here, then we did not find a registration, or it was invalid,
* so this is a new one, and we are going to use the cache. */
assert(NULL == *reg);
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"New registration ep=%d, addr=%p, size=%d in cache",
mypeer, addr, (int)size);
OMPI_FREE_LIST_GET(&mpool_rgpusm->reg_list, item, rc);
if(OMPI_SUCCESS != rc) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return rc;
}
rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item;
rgpusm_reg->base.mpool = mpool;
rgpusm_reg->base.base = addr;
rgpusm_reg->base.bound = (unsigned char *)addr + size - 1;
rgpusm_reg->base.flags = flags;
/* Need the memory handle saved in the registration */
memcpy(rgpusm_reg->memHandle, rget_reg->memHandle, sizeof(rget_reg->memHandle));
/* Actually register the memory, which opens the memory handle.
* Need to do this prior to putting in the cache as the base and
* bound values may be changed by the registration. The memory
* associated with the handle comes back in the alloc_base
* value. */
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
/* There is a chance we can get the OMPI_ERR_WOULD_BLOCK from the
* CUDA codes attempt to register the memory. The case that this
* can happen is as follows. A block of memory is registered.
* Then the sending side frees the memory. The sending side then
* cuMemAllocs memory again and gets the same base
* address. However, it cuMemAllocs a block that is larger than
* the one in the cache. The cache will return that memory is not
* registered and call into CUDA to register it. However, that
* will fail with CUDA_ERROR_ALREADY_MAPPED. Therefore we need to
* boot that previous allocation out and deregister it first.
*/
if (OMPI_ERR_WOULD_BLOCK == rc) {
mca_mpool_base_registration_t *oldreg;
SET_PAGE_ALIGNMENT_TO_ZERO();
/* Need to make sure it is at least 4 bytes in size. This will
* ensure we get the hit in the cache. */
mpool->rcache->rcache_find(mpool->rcache, addr, 4, &oldreg);
RESTORE_PAGE_ALIGNMENT();
/* The ref_count has to be zero as this memory cannot possibly
* be in use. Assert on that just to make sure. */
assert(0 == oldreg->ref_count);
if (mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)oldreg);
}
/* Bump the reference count to keep things copacetic in deregister */
oldreg->ref_count++;
/* Invalidate the registration so it will get booted out. */
oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
mca_mpool_rgpusm_deregister(mpool, oldreg);
mpool_rgpusm->stat_evicted++;
/* And try again. This only needs to be attempted one other time. */
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
}
if(rc != OMPI_SUCCESS) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
return rc;
}
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
"About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
SET_PAGE_ALIGNMENT_TO_ZERO();
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
mca_mpool_rgpusm_component.rcache_size_limit)) ==
OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
opal_output(-1, "No room in the cache - boot one out");
if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
break;
}
}
RESTORE_PAGE_ALIGNMENT();
if(rc != OMPI_SUCCESS) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
/* We cannot recover from this. We can be here if the size of the cache
* is smaller than the amount of memory we are trying to register in a single
* transfer. In that case, rc is MPI_ERR_OUT_OF_RESOURCES, but everything is
* stuck at that point. Therefore, just error out completely.
*/
return OMPI_ERROR;
}
rgpusm_reg->base.ref_count++;
*reg = (mca_mpool_base_registration_t *)rgpusm_reg;
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
/* Cleanup any vmas that we have deferred deletion on */
mpool->rcache->rcache_clean(mpool->rcache);
return OMPI_SUCCESS;
}
/**
* free function
*/
void mca_mpool_rgpusm_free(mca_mpool_base_module_t *mpool, void *addr,
mca_mpool_base_registration_t *registration)
{
void *alloc_base = registration->alloc_base;
mca_mpool_rgpusm_deregister(mpool, registration);
free(alloc_base);
}
int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t *mpool, void *addr,
size_t size, mca_mpool_base_registration_t **reg)
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
int rc;
unsigned char *base, *bound;
base = addr;
bound = base + size - 1; /* To keep cache hits working correctly */
OPAL_THREAD_LOCK(&mpool->rcache->lock);
opal_output(-1, "Looking for addr=%p, size=%d", addr, (int)size);
SET_PAGE_ALIGNMENT_TO_ZERO();
rc = mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
RESTORE_PAGE_ALIGNMENT();
if(*reg != NULL && mca_mpool_rgpusm_component.leave_pinned) {
if(0 == (*reg)->ref_count && mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list, (opal_list_item_t*)(*reg));
}
mpool_rgpusm->stat_cache_found++;
(*reg)->ref_count++;
} else {
mpool_rgpusm->stat_cache_notfound++;
}
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return rc;
}
static inline bool registration_is_cachebale(mca_mpool_base_registration_t *reg)
{
return !(reg->flags &
(MCA_MPOOL_FLAGS_CACHE_BYPASS |
MCA_MPOOL_FLAGS_INVALID));
}
int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg)
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
int rc = OMPI_SUCCESS;
assert(reg->ref_count > 0);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
reg->ref_count--;
opal_output(-1, "Deregister: reg->ref_count=%d", (int)reg->ref_count);
if(reg->ref_count > 0) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return OMPI_SUCCESS;
}
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg))
{
/* if leave_pinned is set don't deregister memory, but put it
* on LRU list for future use */
opal_list_prepend(&mpool_rgpusm->lru_list, (opal_list_item_t*)reg);
} else {
/* Remove from rcache first */
if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
mpool->rcache->rcache_delete(mpool->rcache, reg);
/* Drop the rcache lock before deregistring the memory */
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *)mpool;
assert(reg->ref_count == 0);
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
reg);
}
OPAL_THREAD_LOCK(&mpool->rcache->lock);
if(OMPI_SUCCESS == rc) {
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list,
(ompi_free_list_item_t*)reg);
}
}
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
/* Cleanup any vmas that we have deferred deletion on */
mpool->rcache->rcache_clean(mpool->rcache);
return rc;
}
#define RGPUSM_MPOOL_NREGS 100
void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool)
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
mca_mpool_base_registration_t *reg;
mca_mpool_base_registration_t *regs[RGPUSM_MPOOL_NREGS];
int reg_cnt, i;
int rc;
/* Statistic */
if(true == mca_mpool_rgpusm_component.print_stats) {
opal_output(0, "%s rgpusm: stats "
"(hit/valid/invalid/miss/evicted): %d/%d/%d/%d/%d\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
mpool_rgpusm->stat_cache_hit, mpool_rgpusm->stat_cache_valid,
mpool_rgpusm->stat_cache_invalid, mpool_rgpusm->stat_cache_miss,
mpool_rgpusm->stat_evicted);
}
OPAL_THREAD_LOCK(&mpool->rcache->lock);
do {
reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
regs, RGPUSM_MPOOL_NREGS);
opal_output(-1, "Registration size at finalize = %d", reg_cnt);
for(i = 0; i < reg_cnt; i++) {
reg = regs[i];
if(reg->ref_count) {
reg->ref_count = 0; /* otherwise dereg will fail on assert */
} else if (mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)reg);
}
/* Remove from rcache first */
mpool->rcache->rcache_delete(mpool->rcache, reg);
/* Drop lock before deregistering memory */
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
assert(reg->ref_count == 0);
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
reg);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
if(rc != OMPI_SUCCESS) {
/* Potentially lose track of registrations
do we have to put it back? */
continue;
}
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list,
(ompi_free_list_item_t*)reg);
}
} while(reg_cnt == RGPUSM_MPOOL_NREGS);
OBJ_DESTRUCT(&mpool_rgpusm->lru_list);
OBJ_DESTRUCT(&mpool_rgpusm->reg_list);
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
/* Cleanup any vmas that we have deferred deletion on */
mpool->rcache->rcache_clean(mpool->rcache);
}
int mca_mpool_rgpusm_ft_event(int state) {
return OMPI_SUCCESS;
}
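For completeness (not part of the commit): a hypothetical receiving-side sketch of how a caller would drive this mpool. As documented in mca_mpool_rgpusm_register(), the **reg argument is also an input: it carries the registration holding the remote memory handle in, and comes back pointing at the opened local registration. The helper name open_remote_gpu_memory is an assumption:

#include "ompi/mca/mpool/rgpusm/mpool_rgpusm.h"
#include "ompi/mca/common/cuda/common_cuda.h"

/* Hypothetical helper: open the peer's GPU memory through the rgpusm
 * mpool.  rget_reg holds the memHandle/evtHandle received from the remote
 * process; on success *local_reg describes the locally opened mapping
 * (the mapped pointer is returned in (*local_reg)->alloc_base). */
static int open_remote_gpu_memory(mca_mpool_base_module_t *rgpusm,
                                  void *remote_addr, size_t len,
                                  mca_mpool_common_cuda_reg_t *rget_reg,
                                  mca_mpool_base_registration_t **local_reg)
{
    /* The **reg argument doubles as input: pass the handle-bearing
     * registration in, get the cached/opened registration back. */
    *local_reg = (mca_mpool_base_registration_t *)rget_reg;
    return rgpusm->mpool_register(rgpusm, remote_addr, len,
                                  0 /* flags slot carries the peer id, debug only */,
                                  local_reg);
}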