1
1

Fix initialization and cleanup code for CUDA-aware code. Eliminates all resource leaks.

This commit was SVN r32512.
This commit is contained in:
Rolf vandeVaart 2014-08-12 19:41:46 +00:00
parent e974bec57e
commit c53c981506
7 changed files with 150 additions and 76 deletions

View file

@ -222,6 +222,10 @@ static int btl_openib_component_open(void)
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
mca_btl_openib_component.memory_registration_verbose = -1;
#if OPAL_CUDA_SUPPORT
mca_common_cuda_stage_one_init();
#endif /* OPAL_CUDA_SUPPORT */
return OPAL_SUCCESS;
}
@ -279,6 +283,10 @@ static int btl_openib_component_close(void)
/* close memory registration debugging output */
opal_output_close (mca_btl_openib_component.memory_registration_verbose);
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */
return rc;
}

View file

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -49,6 +50,10 @@ mca_btl_sm_la_SOURCES = $(libmca_btl_sm_la_sources)
mca_btl_sm_la_LDFLAGS = -module -avoid-version
mca_btl_sm_la_LIBADD = \
$(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/libmca_common_sm.la
if OPAL_cuda_support
mca_btl_sm_la_LIBADD = \
$(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/libmca_common_cuda.la
endif
mca_btl_sm_la_CPPFLAGS = $(btl_sm_CPPFLAGS)
noinst_LTLIBRARIES = $(component_noinst)

View file

@ -14,7 +14,7 @@
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
@ -60,6 +60,9 @@
#include "btl_sm.h"
#include "btl_sm_frag.h"
#include "btl_sm_fifo.h"
#if OPAL_CUDA_SUPPORT
#include "opal/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_SUPPORT */
static int mca_btl_sm_component_open(void);
static int mca_btl_sm_component_close(void);
@ -356,6 +359,10 @@ static int mca_btl_sm_component_close(void)
CLEANUP:
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */
/* return */
return return_value;
}
@ -878,6 +885,10 @@ mca_btl_sm_component_init(int *num_btls,
}
#endif /* OPAL_BTL_SM_HAVE_CMA */
#if OPAL_CUDA_SUPPORT
mca_common_cuda_stage_one_init();
#endif /* OPAL_CUDA_SUPPORT */
return btls;
#if OPAL_BTL_SM_HAVE_KNEM

View file

@ -288,6 +288,10 @@ static int mca_btl_smcuda_component_close(void)
CLEANUP:
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */
/* return */
return return_value;
}
@ -931,6 +935,7 @@ mca_btl_smcuda_component_init(int *num_btls,
/* Register a smcuda control function to help setup IPC support */
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
mca_common_cuda_stage_one_init();
#endif /* OPAL_CUDA_SUPPORT */
return btls;

View file

@ -15,7 +15,7 @@
* Copyright (c) 2009 Oak Ridge National Laboratory
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -381,6 +381,10 @@ static int mca_btl_tcp_component_close(void)
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */
return OPAL_SUCCESS;
}

View file

@ -104,7 +104,7 @@ struct cudaFunctionTable {
typedef struct cudaFunctionTable cudaFunctionTable_t;
cudaFunctionTable_t cuFunc;
static bool stage_one_init_complete = false;
static int stage_one_init_ref_count = 0;
static bool stage_three_init_complete = false;
static bool common_cuda_initialized = false;
static int mca_common_cuda_verbose;
@ -155,15 +155,15 @@ static int mca_common_cuda_cumemcpy_timing;
/* Array of CUDA events to be queried for IPC stream, sending side and
* receiving side. */
CUevent *cuda_event_ipc_array;
CUevent *cuda_event_dtoh_array;
CUevent *cuda_event_htod_array;
CUevent *cuda_event_ipc_array = NULL;
CUevent *cuda_event_dtoh_array = NULL;
CUevent *cuda_event_htod_array = NULL;
/* Array of fragments currently being moved by cuda async non-blocking
* operations */
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array;
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array;
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array;
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL;
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL;
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL;
/* First free/available location in cuda_event_status_array */
int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;
@ -181,7 +181,7 @@ static int cuda_event_dtoh_most = 0;
static int cuda_event_htod_most = 0;
/* Handle to libcuda.so */
opal_lt_dlhandle libcuda_handle;
opal_lt_dlhandle libcuda_handle = NULL;
#define CUDA_COMMON_TIMING 0
#if OPAL_ENABLE_DEBUG
@ -211,8 +211,9 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
/**
* This is the first stage of initialization. This function is
* triggered when there are memory registration requests from various
* BTLs. This function will register some mca variables and then open
* called explicitly by any BTLs that can support CUDA-aware.
* It is called during the component open phase of initialization.
* This function will register some mca variables and then open
* and load the symbols needed from the CUDA driver library. Look for
* the SONAME of the library which is libcuda.so.1. In most cases,
* this will result in the library found. However, there are some
@ -232,10 +233,14 @@ int mca_common_cuda_stage_one_init(void)
int errsize;
bool stage_one_init_passed = false;
if (true == stage_one_init_complete) {
return 0;
stage_one_init_ref_count++;
if (stage_one_init_ref_count > 1) {
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: stage_one_init_ref_count is now %d, no need to init",
stage_one_init_ref_count);
return OPAL_SUCCESS;
}
stage_one_init_complete = true;
OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
@ -313,6 +318,10 @@ int mca_common_cuda_stage_one_init(void)
mca_common_cuda_output = opal_output_open(NULL);
opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: stage_one_init_ref_count is now %d, initializing",
stage_one_init_ref_count);
/* First check if the support is enabled. In the case that the user has
* turned it off, we do not need to continue with any CUDA specific
* initialization. Do this after MCA parameter registration. */
@ -351,7 +360,7 @@ int mca_common_cuda_stage_one_init(void)
* paths from the system. For the second loop, set /usr/lib64 to
* the search path and try again. This is done to handle the case
* where we have both 32 and 64 bit libcuda.so libraries installed.
* Even when running in 64-bit mode, the /usr/lib direcotry
* Even when running in 64-bit mode, the /usr/lib directory
* is searched first and we may find a 32-bit libcuda.so.1 library.
* Loading of this library will fail as libtool does not handle having
* the wrong ABI in the search path (unlike ld or ld.so). Note that
@ -515,7 +524,8 @@ static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *fta
* This is the last phase of initialization. This is triggered when we examine
* a buffer pointer and determine it is a GPU buffer. We then assume the user
* has selected their GPU and we can go ahead with all the CUDA related
* initializations.
* initializations. If we get an error, just return. Cleanup of resources
* will happen when fini is called.
*/
static int mca_common_cuda_stage_three_init(void)
{
@ -597,8 +607,6 @@ static int mca_common_cuda_stage_three_init(void)
#if OPAL_CUDA_SUPPORT_41
if (true == mca_common_cuda_enabled) {
/* Set up an array to store outstanding IPC async copy events */
cuda_event_ipc_array = NULL;
cuda_event_ipc_frag_array = NULL;
cuda_event_ipc_num_used = 0;
cuda_event_ipc_first_avail = 0;
cuda_event_ipc_first_used = 0;
@ -638,8 +646,6 @@ static int mca_common_cuda_stage_three_init(void)
if (true == mca_common_cuda_enabled) {
/* Set up an array to store outstanding async dtoh events. Used on the
* sending side for asynchronous copies. */
cuda_event_dtoh_array = NULL;
cuda_event_dtoh_frag_array = NULL;
cuda_event_dtoh_num_used = 0;
cuda_event_dtoh_first_avail = 0;
cuda_event_dtoh_first_used = 0;
@ -649,7 +655,7 @@ static int mca_common_cuda_stage_three_init(void)
opal_show_help("help-mpi-common-cuda.txt", "No memory",
true, OPAL_PROC_MY_HOSTNAME);
rc = OPAL_ERROR;
goto cleanup_and_error;
goto cleanup_and_error;
}
/* Create the events since they can be reused. */
@ -676,8 +682,6 @@ static int mca_common_cuda_stage_three_init(void)
/* Set up an array to store outstanding async htod events. Used on the
* receiving side for asynchronous copies. */
cuda_event_htod_array = NULL;
cuda_event_htod_frag_array = NULL;
cuda_event_htod_num_used = 0;
cuda_event_htod_first_avail = 0;
cuda_event_htod_first_used = 0;
@ -784,53 +788,100 @@ static int mca_common_cuda_stage_three_init(void)
/* If we are here, something went wrong. Cleanup and return an error. */
cleanup_and_error:
for (i = 0; i < cuda_event_max; i++) {
if (NULL != cuda_event_ipc_array[i]) {
cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
}
if (NULL != cuda_event_htod_array[i]) {
cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
}
if (NULL != cuda_event_dtoh_array[i]) {
cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
}
}
if (NULL != cuda_event_ipc_array) {
free(cuda_event_ipc_array);
}
if (NULL != cuda_event_htod_array) {
free(cuda_event_htod_array);
}
if (NULL != cuda_event_dtoh_array) {
free(cuda_event_dtoh_array);
}
if (NULL != cuda_event_ipc_frag_array) {
free(cuda_event_ipc_frag_array);
}
if (NULL != cuda_event_htod_frag_array) {
free(cuda_event_ipc_frag_array);
}
if (NULL != cuda_event_dtoh_frag_array) {
free(cuda_event_dtoh_frag_array);
}
if (NULL != ipcStream) {
cuFunc.cuStreamDestroy(ipcStream);
}
if (NULL != dtohStream) {
cuFunc.cuStreamDestroy(dtohStream);
}
if (NULL != htodStream) {
cuFunc.cuStreamDestroy(htodStream);
}
if (NULL != memcpyStream) {
cuFunc.cuStreamDestroy(memcpyStream);
}
opal_atomic_mb(); /* Make sure next statement does not get reordered */
stage_three_init_complete = true;
OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
return rc;
}
/**
* Cleanup all CUDA resources.
*
* Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm
* mpool. Looks like with the memory pool from openib (grdma), the unregistering is
* called as the free list is destructed. Not true for the sm mpool. This means we
* are currently still leaking some host memory we registered with CUDA.
*/
void mca_common_cuda_fini(void)
{
int i;
/* Nothing to do: stage one init never ran, or fini has already
* balanced every init call. */
if (0 == stage_one_init_ref_count) {
opal_output_verbose(20, mca_common_cuda_output,
"CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete",
stage_one_init_ref_count);
return;
}
/* Last remaining reference: actually tear everything down. */
if (1 == stage_one_init_ref_count) {
opal_output_verbose(20, mca_common_cuda_output,
"CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up",
stage_one_init_ref_count);
/* Destroy outstanding events and free their tracking arrays.
* Each array is NULL unless the later (stage three) init
* allocated it, so guard every dereference. */
if (NULL != cuda_event_ipc_array) {
for (i = 0; i < cuda_event_max; i++) {
if (NULL != cuda_event_ipc_array[i]) {
cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
}
}
free(cuda_event_ipc_array);
}
if (NULL != cuda_event_htod_array) {
for (i = 0; i < cuda_event_max; i++) {
if (NULL != cuda_event_htod_array[i]) {
cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
}
}
free(cuda_event_htod_array);
}
if (NULL != cuda_event_dtoh_array) {
for (i = 0; i < cuda_event_max; i++) {
if (NULL != cuda_event_dtoh_array[i]) {
cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
}
}
free(cuda_event_dtoh_array);
}
/* Fragment arrays hold only borrowed descriptor pointers, so a
* plain free() of each array is sufficient. */
if (NULL != cuda_event_ipc_frag_array) {
free(cuda_event_ipc_frag_array);
}
if (NULL != cuda_event_htod_frag_array) {
free(cuda_event_htod_frag_array);
}
if (NULL != cuda_event_dtoh_frag_array) {
free(cuda_event_dtoh_frag_array);
}
/* Destroy the CUDA streams (also created during later-stage init;
* NULL if that init never ran or failed early). */
if (NULL != ipcStream) {
cuFunc.cuStreamDestroy(ipcStream);
}
if (NULL != dtohStream) {
cuFunc.cuStreamDestroy(dtohStream);
}
if (NULL != htodStream) {
cuFunc.cuStreamDestroy(htodStream);
}
if (NULL != memcpyStream) {
cuFunc.cuStreamDestroy(memcpyStream);
}
/* Destruct the mutexes constructed in stage one init. */
OBJ_DESTRUCT(&common_cuda_init_lock);
OBJ_DESTRUCT(&common_cuda_htod_lock);
OBJ_DESTRUCT(&common_cuda_dtoh_lock);
OBJ_DESTRUCT(&common_cuda_ipc_lock);
/* Close the dynamically loaded libcuda handle and shut down the
* libtool dl layer that opened it. */
if (NULL != libcuda_handle) {
opal_lt_dlclose(libcuda_handle);
opal_lt_dlexit();
}
/* Close the verbose-output stream last, after all of the verbose
* messages above have been emitted. */
opal_output_close(mca_common_cuda_output);
} else {
/* Other components still hold references; leave everything alive. */
opal_output_verbose(20, mca_common_cuda_output,
"CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use",
stage_one_init_ref_count);
}
/* Balance the increment performed by mca_common_cuda_stage_one_init(). */
stage_one_init_ref_count--;
}
/**
* Call the CUDA register function so we pin the memory in the CUDA
@ -843,17 +894,6 @@ void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
if (!opal_cuda_support)
return;
/* Registering memory during BTL initialization will be the first call
* into the cuda common code, so this is where we do the first
* initialization function. If the first stage fails, then disable
* support and return. */
if (!stage_one_init_complete) {
if (0 != mca_common_cuda_stage_one_init()) {
opal_cuda_support = 0;
return;
}
}
if (!common_cuda_initialized) {
OPAL_THREAD_LOCK(&common_cuda_init_lock);
if (!common_cuda_initialized) {

View file

@ -74,6 +74,7 @@ OPAL_DECLSPEC int mca_common_cuda_get_device(int *devicenum);
OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void);
OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
OPAL_DECLSPEC void mca_common_cuda_fini(void);
#if OPAL_CUDA_GDR_SUPPORT
OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);