Fix initialization and cleanup code for CUDA-aware code. Eliminates all resource leaks.
This commit was SVN r32512.
Этот коммит содержится в:
родитель
e974bec57e
Коммит
c53c981506
@ -222,6 +222,10 @@ static int btl_openib_component_open(void)
|
||||
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
|
||||
mca_btl_openib_component.memory_registration_verbose = -1;
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_common_cuda_stage_one_init();
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -279,6 +283,10 @@ static int btl_openib_component_close(void)
|
||||
/* close memory registration debugging output */
|
||||
opal_output_close (mca_btl_openib_component.memory_registration_verbose);
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_common_cuda_fini();
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -10,6 +10,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -49,6 +50,10 @@ mca_btl_sm_la_SOURCES = $(libmca_btl_sm_la_sources)
|
||||
mca_btl_sm_la_LDFLAGS = -module -avoid-version
|
||||
# Libraries the sm BTL DSO always links against.
mca_btl_sm_la_LIBADD = \
    $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/libmca_common_sm.la
# When CUDA support is enabled, append (+=) the common CUDA library.
# A plain '=' here would redefine the variable -- dropping
# libmca_common_sm.la from the link line -- and Automake rejects a
# conditional redefinition of a variable already defined unconditionally.
if OPAL_cuda_support
mca_btl_sm_la_LIBADD += \
    $(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/libmca_common_cuda.la
endif
|
||||
mca_btl_sm_la_CPPFLAGS = $(btl_sm_CPPFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
|
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -60,6 +60,9 @@
|
||||
#include "btl_sm.h"
|
||||
#include "btl_sm_frag.h"
|
||||
#include "btl_sm_fifo.h"
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
#include "opal/mca/common/cuda/common_cuda.h"
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
static int mca_btl_sm_component_open(void);
|
||||
static int mca_btl_sm_component_close(void);
|
||||
@ -355,6 +358,10 @@ static int mca_btl_sm_component_close(void)
|
||||
#endif
|
||||
|
||||
CLEANUP:
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_common_cuda_fini();
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
/* return */
|
||||
return return_value;
|
||||
@ -878,6 +885,10 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_common_cuda_stage_one_init();
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
return btls;
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
|
@ -288,6 +288,10 @@ static int mca_btl_smcuda_component_close(void)
|
||||
|
||||
CLEANUP:
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_common_cuda_fini();
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
/* return */
|
||||
return return_value;
|
||||
}
|
||||
@ -931,6 +935,7 @@ mca_btl_smcuda_component_init(int *num_btls,
|
||||
/* Register a smcuda control function to help setup IPC support */
|
||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
|
||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
|
||||
mca_common_cuda_stage_one_init();
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
return btls;
|
||||
|
@ -15,7 +15,7 @@
|
||||
* Copyright (c) 2009 Oak Ridge National Laboratory
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -381,6 +381,10 @@ static int mca_btl_tcp_component_close(void)
|
||||
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
|
||||
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_common_cuda_fini();
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -104,7 +104,7 @@ struct cudaFunctionTable {
|
||||
typedef struct cudaFunctionTable cudaFunctionTable_t;
|
||||
cudaFunctionTable_t cuFunc;
|
||||
|
||||
static bool stage_one_init_complete = false;
|
||||
static int stage_one_init_ref_count = 0;
|
||||
static bool stage_three_init_complete = false;
|
||||
static bool common_cuda_initialized = false;
|
||||
static int mca_common_cuda_verbose;
|
||||
@ -155,15 +155,15 @@ static int mca_common_cuda_cumemcpy_timing;
|
||||
|
||||
/* Array of CUDA events to be queried for IPC stream, sending side and
|
||||
* receiving side. */
|
||||
CUevent *cuda_event_ipc_array;
|
||||
CUevent *cuda_event_dtoh_array;
|
||||
CUevent *cuda_event_htod_array;
|
||||
CUevent *cuda_event_ipc_array = NULL;
|
||||
CUevent *cuda_event_dtoh_array = NULL;
|
||||
CUevent *cuda_event_htod_array = NULL;
|
||||
|
||||
/* Array of fragments currently being moved by cuda async non-blocking
|
||||
* operations */
|
||||
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array;
|
||||
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array;
|
||||
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array;
|
||||
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL;
|
||||
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL;
|
||||
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL;
|
||||
|
||||
/* First free/available location in cuda_event_status_array */
|
||||
int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;
|
||||
@ -181,7 +181,7 @@ static int cuda_event_dtoh_most = 0;
|
||||
static int cuda_event_htod_most = 0;
|
||||
|
||||
/* Handle to libcuda.so */
|
||||
opal_lt_dlhandle libcuda_handle;
|
||||
opal_lt_dlhandle libcuda_handle = NULL;
|
||||
|
||||
#define CUDA_COMMON_TIMING 0
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
@ -211,8 +211,9 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
|
||||
|
||||
/**
|
||||
* This is the first stage of initialization. This function is
|
||||
* triggered when there are memory registration requests from various
|
||||
* BTLs. This function will register some mca variables and then open
|
||||
* called explicitly by any BTLs that can support CUDA-aware.
|
||||
* It is called during the component open phase of initialization.
|
||||
* This function will register some mca variables and then open
|
||||
* and load the symbols needed from the CUDA driver library. Look for
|
||||
* the SONAME of the library which is libcuda.so.1. In most cases,
|
||||
* this will result in the library found. However, there are some
|
||||
@ -232,10 +233,14 @@ int mca_common_cuda_stage_one_init(void)
|
||||
int errsize;
|
||||
bool stage_one_init_passed = false;
|
||||
|
||||
if (true == stage_one_init_complete) {
|
||||
return 0;
|
||||
stage_one_init_ref_count++;
|
||||
if (stage_one_init_ref_count > 1) {
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: stage_one_init_ref_count is now %d, no need to init",
|
||||
stage_one_init_ref_count);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
stage_one_init_complete = true;
|
||||
|
||||
OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
|
||||
@ -313,6 +318,10 @@ int mca_common_cuda_stage_one_init(void)
|
||||
mca_common_cuda_output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
|
||||
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: stage_one_init_ref_count is now %d, initializing",
|
||||
stage_one_init_ref_count);
|
||||
|
||||
/* First check if the support is enabled. In the case that the user has
|
||||
* turned it off, we do not need to continue with any CUDA specific
|
||||
* initialization. Do this after MCA parameter registration. */
|
||||
@ -351,7 +360,7 @@ int mca_common_cuda_stage_one_init(void)
|
||||
* paths from the system. For the second loop, set /usr/lib64 to
|
||||
* the search path and try again. This is done to handle the case
|
||||
* where we have both 32 and 64 bit libcuda.so libraries installed.
|
||||
* Even when running in 64-bit mode, the /usr/lib direcotry
|
||||
* Even when running in 64-bit mode, the /usr/lib directory
|
||||
* is searched first and we may find a 32-bit libcuda.so.1 library.
|
||||
* Loading of this library will fail as libtool does not handle having
|
||||
* the wrong ABI in the search path (unlike ld or ld.so). Note that
|
||||
@ -515,7 +524,8 @@ static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *fta
|
||||
* This is the last phase of initialization. This is triggered when we examine
|
||||
* a buffer pointer and determine it is a GPU buffer. We then assume the user
|
||||
* has selected their GPU and we can go ahead with all the CUDA related
|
||||
* initializations.
|
||||
* initializations. If we get an error, just return. Cleanup of resources
|
||||
* will happen when fini is called.
|
||||
*/
|
||||
static int mca_common_cuda_stage_three_init(void)
|
||||
{
|
||||
@ -597,8 +607,6 @@ static int mca_common_cuda_stage_three_init(void)
|
||||
#if OPAL_CUDA_SUPPORT_41
|
||||
if (true == mca_common_cuda_enabled) {
|
||||
/* Set up an array to store outstanding IPC async copy events */
|
||||
cuda_event_ipc_array = NULL;
|
||||
cuda_event_ipc_frag_array = NULL;
|
||||
cuda_event_ipc_num_used = 0;
|
||||
cuda_event_ipc_first_avail = 0;
|
||||
cuda_event_ipc_first_used = 0;
|
||||
@ -638,8 +646,6 @@ static int mca_common_cuda_stage_three_init(void)
|
||||
if (true == mca_common_cuda_enabled) {
|
||||
/* Set up an array to store outstanding async dtoh events. Used on the
|
||||
* sending side for asynchronous copies. */
|
||||
cuda_event_dtoh_array = NULL;
|
||||
cuda_event_dtoh_frag_array = NULL;
|
||||
cuda_event_dtoh_num_used = 0;
|
||||
cuda_event_dtoh_first_avail = 0;
|
||||
cuda_event_dtoh_first_used = 0;
|
||||
@ -649,7 +655,7 @@ static int mca_common_cuda_stage_three_init(void)
|
||||
opal_show_help("help-mpi-common-cuda.txt", "No memory",
|
||||
true, OPAL_PROC_MY_HOSTNAME);
|
||||
rc = OPAL_ERROR;
|
||||
goto cleanup_and_error;
|
||||
goto cleanup_and_error;
|
||||
}
|
||||
|
||||
/* Create the events since they can be reused. */
|
||||
@ -676,8 +682,6 @@ static int mca_common_cuda_stage_three_init(void)
|
||||
|
||||
/* Set up an array to store outstanding async htod events. Used on the
|
||||
* receiving side for asynchronous copies. */
|
||||
cuda_event_htod_array = NULL;
|
||||
cuda_event_htod_frag_array = NULL;
|
||||
cuda_event_htod_num_used = 0;
|
||||
cuda_event_htod_first_avail = 0;
|
||||
cuda_event_htod_first_used = 0;
|
||||
@ -784,53 +788,100 @@ static int mca_common_cuda_stage_three_init(void)
|
||||
|
||||
/* If we are here, something went wrong. Cleanup and return an error. */
|
||||
cleanup_and_error:
|
||||
for (i = 0; i < cuda_event_max; i++) {
|
||||
if (NULL != cuda_event_ipc_array[i]) {
|
||||
cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
|
||||
}
|
||||
if (NULL != cuda_event_htod_array[i]) {
|
||||
cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
|
||||
}
|
||||
if (NULL != cuda_event_dtoh_array[i]) {
|
||||
cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
|
||||
}
|
||||
}
|
||||
if (NULL != cuda_event_ipc_array) {
|
||||
free(cuda_event_ipc_array);
|
||||
}
|
||||
if (NULL != cuda_event_htod_array) {
|
||||
free(cuda_event_htod_array);
|
||||
}
|
||||
if (NULL != cuda_event_dtoh_array) {
|
||||
free(cuda_event_dtoh_array);
|
||||
}
|
||||
if (NULL != cuda_event_ipc_frag_array) {
|
||||
free(cuda_event_ipc_frag_array);
|
||||
}
|
||||
if (NULL != cuda_event_htod_frag_array) {
|
||||
free(cuda_event_ipc_frag_array);
|
||||
}
|
||||
if (NULL != cuda_event_dtoh_frag_array) {
|
||||
free(cuda_event_dtoh_frag_array);
|
||||
}
|
||||
if (NULL != ipcStream) {
|
||||
cuFunc.cuStreamDestroy(ipcStream);
|
||||
}
|
||||
if (NULL != dtohStream) {
|
||||
cuFunc.cuStreamDestroy(dtohStream);
|
||||
}
|
||||
if (NULL != htodStream) {
|
||||
cuFunc.cuStreamDestroy(htodStream);
|
||||
}
|
||||
if (NULL != memcpyStream) {
|
||||
cuFunc.cuStreamDestroy(memcpyStream);
|
||||
}
|
||||
opal_atomic_mb(); /* Make sure next statement does not get reordered */
|
||||
stage_three_init_complete = true;
|
||||
OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup all CUDA resources.
|
||||
*
|
||||
* Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm
|
||||
* mpool. Looks like with the memory pool from openib (grdma), the unregistering is
|
||||
* called as the free list is destructed. Not true for the sm mpool. This means we
|
||||
* are currently still leaking some host memory we registered with CUDA.
|
||||
*/
|
||||
void mca_common_cuda_fini(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (0 == stage_one_init_ref_count) {
|
||||
opal_output_verbose(20, mca_common_cuda_output,
|
||||
"CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete",
|
||||
stage_one_init_ref_count);
|
||||
return;
|
||||
}
|
||||
|
||||
if (1 == stage_one_init_ref_count) {
|
||||
opal_output_verbose(20, mca_common_cuda_output,
|
||||
"CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up",
|
||||
stage_one_init_ref_count);
|
||||
|
||||
if (NULL != cuda_event_ipc_array) {
|
||||
for (i = 0; i < cuda_event_max; i++) {
|
||||
if (NULL != cuda_event_ipc_array[i]) {
|
||||
cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
|
||||
}
|
||||
}
|
||||
free(cuda_event_ipc_array);
|
||||
}
|
||||
if (NULL != cuda_event_htod_array) {
|
||||
for (i = 0; i < cuda_event_max; i++) {
|
||||
if (NULL != cuda_event_htod_array[i]) {
|
||||
cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
|
||||
}
|
||||
}
|
||||
free(cuda_event_htod_array);
|
||||
}
|
||||
|
||||
if (NULL != cuda_event_dtoh_array) {
|
||||
for (i = 0; i < cuda_event_max; i++) {
|
||||
if (NULL != cuda_event_dtoh_array[i]) {
|
||||
cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
|
||||
}
|
||||
}
|
||||
free(cuda_event_dtoh_array);
|
||||
}
|
||||
|
||||
if (NULL != cuda_event_ipc_frag_array) {
|
||||
free(cuda_event_ipc_frag_array);
|
||||
}
|
||||
if (NULL != cuda_event_htod_frag_array) {
|
||||
free(cuda_event_htod_frag_array);
|
||||
}
|
||||
if (NULL != cuda_event_dtoh_frag_array) {
|
||||
free(cuda_event_dtoh_frag_array);
|
||||
}
|
||||
if (NULL != ipcStream) {
|
||||
cuFunc.cuStreamDestroy(ipcStream);
|
||||
}
|
||||
if (NULL != dtohStream) {
|
||||
cuFunc.cuStreamDestroy(dtohStream);
|
||||
}
|
||||
if (NULL != htodStream) {
|
||||
cuFunc.cuStreamDestroy(htodStream);
|
||||
}
|
||||
if (NULL != memcpyStream) {
|
||||
cuFunc.cuStreamDestroy(memcpyStream);
|
||||
}
|
||||
OBJ_DESTRUCT(&common_cuda_init_lock);
|
||||
OBJ_DESTRUCT(&common_cuda_htod_lock);
|
||||
OBJ_DESTRUCT(&common_cuda_dtoh_lock);
|
||||
OBJ_DESTRUCT(&common_cuda_ipc_lock);
|
||||
if (NULL != libcuda_handle) {
|
||||
opal_lt_dlclose(libcuda_handle);
|
||||
opal_lt_dlexit();
|
||||
}
|
||||
opal_output_close(mca_common_cuda_output);
|
||||
|
||||
} else {
|
||||
opal_output_verbose(20, mca_common_cuda_output,
|
||||
"CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use",
|
||||
stage_one_init_ref_count);
|
||||
}
|
||||
stage_one_init_ref_count--;
|
||||
}
|
||||
|
||||
/**
|
||||
* Call the CUDA register function so we pin the memory in the CUDA
|
||||
@ -843,17 +894,6 @@ void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
|
||||
if (!opal_cuda_support)
|
||||
return;
|
||||
|
||||
/* Registering memory during BTL initialization will be the first call
|
||||
* into the cuda common code, so this is where we do the first
|
||||
* initialization function. If the first stage fails, then disable
|
||||
* support and return. */
|
||||
if (!stage_one_init_complete) {
|
||||
if (0 != mca_common_cuda_stage_one_init()) {
|
||||
opal_cuda_support = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (!common_cuda_initialized) {
|
||||
OPAL_THREAD_LOCK(&common_cuda_init_lock);
|
||||
if (!common_cuda_initialized) {
|
||||
|
@ -74,6 +74,7 @@ OPAL_DECLSPEC int mca_common_cuda_get_device(int *devicenum);
|
||||
OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
|
||||
OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void);
|
||||
OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
|
||||
OPAL_DECLSPEC void mca_common_cuda_fini(void);
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
|
||||
OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user