Fix initialization and cleanup code for CUDA-aware code. Eliminates all resource leaks.
This commit was SVN r32512.
Этот коммит содержится в:
родитель
e974bec57e
Коммит
c53c981506
@ -222,6 +222,10 @@ static int btl_openib_component_open(void)
|
|||||||
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
|
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
|
||||||
mca_btl_openib_component.memory_registration_verbose = -1;
|
mca_btl_openib_component.memory_registration_verbose = -1;
|
||||||
|
|
||||||
|
#if OPAL_CUDA_SUPPORT
|
||||||
|
mca_common_cuda_stage_one_init();
|
||||||
|
#endif /* OPAL_CUDA_SUPPORT */
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -279,6 +283,10 @@ static int btl_openib_component_close(void)
|
|||||||
/* close memory registration debugging output */
|
/* close memory registration debugging output */
|
||||||
opal_output_close (mca_btl_openib_component.memory_registration_verbose);
|
opal_output_close (mca_btl_openib_component.memory_registration_verbose);
|
||||||
|
|
||||||
|
#if OPAL_CUDA_SUPPORT
|
||||||
|
mca_common_cuda_fini();
|
||||||
|
#endif /* OPAL_CUDA_SUPPORT */
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||||
|
# Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -49,6 +50,10 @@ mca_btl_sm_la_SOURCES = $(libmca_btl_sm_la_sources)
|
|||||||
mca_btl_sm_la_LDFLAGS = -module -avoid-version
|
mca_btl_sm_la_LDFLAGS = -module -avoid-version
|
||||||
mca_btl_sm_la_LIBADD = \
|
mca_btl_sm_la_LIBADD = \
|
||||||
$(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/libmca_common_sm.la
|
$(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/libmca_common_sm.la
|
||||||
|
# When CUDA support is enabled, link the common CUDA library in addition to
# (not instead of) the common sm library listed in the unconditional
# definition of mca_btl_sm_la_LIBADD; hence "+=" rather than "=".
if OPAL_cuda_support
mca_btl_sm_la_LIBADD += \
    $(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/libmca_common_cuda.la
endif
|
||||||
mca_btl_sm_la_CPPFLAGS = $(btl_sm_CPPFLAGS)
|
mca_btl_sm_la_CPPFLAGS = $(btl_sm_CPPFLAGS)
|
||||||
|
|
||||||
noinst_LTLIBRARIES = $(component_noinst)
|
noinst_LTLIBRARIES = $(component_noinst)
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
|
||||||
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
|
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -60,6 +60,9 @@
|
|||||||
#include "btl_sm.h"
|
#include "btl_sm.h"
|
||||||
#include "btl_sm_frag.h"
|
#include "btl_sm_frag.h"
|
||||||
#include "btl_sm_fifo.h"
|
#include "btl_sm_fifo.h"
|
||||||
|
#if OPAL_CUDA_SUPPORT
|
||||||
|
#include "opal/mca/common/cuda/common_cuda.h"
|
||||||
|
#endif /* OPAL_CUDA_SUPPORT */
|
||||||
|
|
||||||
static int mca_btl_sm_component_open(void);
|
static int mca_btl_sm_component_open(void);
|
||||||
static int mca_btl_sm_component_close(void);
|
static int mca_btl_sm_component_close(void);
|
||||||
@ -355,6 +358,10 @@ static int mca_btl_sm_component_close(void)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
CLEANUP:
|
CLEANUP:
|
||||||
|
|
||||||
|
#if OPAL_CUDA_SUPPORT
|
||||||
|
mca_common_cuda_fini();
|
||||||
|
#endif /* OPAL_CUDA_SUPPORT */
|
||||||
|
|
||||||
/* return */
|
/* return */
|
||||||
return return_value;
|
return return_value;
|
||||||
@ -878,6 +885,10 @@ mca_btl_sm_component_init(int *num_btls,
|
|||||||
}
|
}
|
||||||
#endif /* OPAL_BTL_SM_HAVE_CMA */
|
#endif /* OPAL_BTL_SM_HAVE_CMA */
|
||||||
|
|
||||||
|
#if OPAL_CUDA_SUPPORT
|
||||||
|
mca_common_cuda_stage_one_init();
|
||||||
|
#endif /* OPAL_CUDA_SUPPORT */
|
||||||
|
|
||||||
return btls;
|
return btls;
|
||||||
|
|
||||||
#if OPAL_BTL_SM_HAVE_KNEM
|
#if OPAL_BTL_SM_HAVE_KNEM
|
||||||
|
@ -288,6 +288,10 @@ static int mca_btl_smcuda_component_close(void)
|
|||||||
|
|
||||||
CLEANUP:
|
CLEANUP:
|
||||||
|
|
||||||
|
#if OPAL_CUDA_SUPPORT
|
||||||
|
mca_common_cuda_fini();
|
||||||
|
#endif /* OPAL_CUDA_SUPPORT */
|
||||||
|
|
||||||
/* return */
|
/* return */
|
||||||
return return_value;
|
return return_value;
|
||||||
}
|
}
|
||||||
@ -931,6 +935,7 @@ mca_btl_smcuda_component_init(int *num_btls,
|
|||||||
/* Register a smcuda control function to help setup IPC support */
|
/* Register a smcuda control function to help setup IPC support */
|
||||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
|
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
|
||||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
|
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
|
||||||
|
mca_common_cuda_stage_one_init();
|
||||||
#endif /* OPAL_CUDA_SUPPORT */
|
#endif /* OPAL_CUDA_SUPPORT */
|
||||||
|
|
||||||
return btls;
|
return btls;
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
* Copyright (c) 2009 Oak Ridge National Laboratory
|
* Copyright (c) 2009 Oak Ridge National Laboratory
|
||||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
* Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -381,6 +381,10 @@ static int mca_btl_tcp_component_close(void)
|
|||||||
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
|
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
|
||||||
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
|
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
|
||||||
|
|
||||||
|
#if OPAL_CUDA_SUPPORT
|
||||||
|
mca_common_cuda_fini();
|
||||||
|
#endif /* OPAL_CUDA_SUPPORT */
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -104,7 +104,7 @@ struct cudaFunctionTable {
|
|||||||
typedef struct cudaFunctionTable cudaFunctionTable_t;
|
typedef struct cudaFunctionTable cudaFunctionTable_t;
|
||||||
cudaFunctionTable_t cuFunc;
|
cudaFunctionTable_t cuFunc;
|
||||||
|
|
||||||
static bool stage_one_init_complete = false;
|
static int stage_one_init_ref_count = 0;
|
||||||
static bool stage_three_init_complete = false;
|
static bool stage_three_init_complete = false;
|
||||||
static bool common_cuda_initialized = false;
|
static bool common_cuda_initialized = false;
|
||||||
static int mca_common_cuda_verbose;
|
static int mca_common_cuda_verbose;
|
||||||
@ -155,15 +155,15 @@ static int mca_common_cuda_cumemcpy_timing;
|
|||||||
|
|
||||||
/* Array of CUDA events to be queried for IPC stream, sending side and
|
/* Array of CUDA events to be queried for IPC stream, sending side and
|
||||||
* receiving side. */
|
* receiving side. */
|
||||||
CUevent *cuda_event_ipc_array;
|
CUevent *cuda_event_ipc_array = NULL;
|
||||||
CUevent *cuda_event_dtoh_array;
|
CUevent *cuda_event_dtoh_array = NULL;
|
||||||
CUevent *cuda_event_htod_array;
|
CUevent *cuda_event_htod_array = NULL;
|
||||||
|
|
||||||
/* Array of fragments currently being moved by cuda async non-blocking
|
/* Array of fragments currently being moved by cuda async non-blocking
|
||||||
* operations */
|
* operations */
|
||||||
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array;
|
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL;
|
||||||
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array;
|
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL;
|
||||||
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array;
|
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL;
|
||||||
|
|
||||||
/* First free/available location in cuda_event_status_array */
|
/* First free/available location in cuda_event_status_array */
|
||||||
int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;
|
int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;
|
||||||
@ -181,7 +181,7 @@ static int cuda_event_dtoh_most = 0;
|
|||||||
static int cuda_event_htod_most = 0;
|
static int cuda_event_htod_most = 0;
|
||||||
|
|
||||||
/* Handle to libcuda.so */
|
/* Handle to libcuda.so */
|
||||||
opal_lt_dlhandle libcuda_handle;
|
opal_lt_dlhandle libcuda_handle = NULL;
|
||||||
|
|
||||||
#define CUDA_COMMON_TIMING 0
|
#define CUDA_COMMON_TIMING 0
|
||||||
#if OPAL_ENABLE_DEBUG
|
#if OPAL_ENABLE_DEBUG
|
||||||
@ -211,8 +211,9 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the first stage of initialization. This function is
|
* This is the first stage of initialization. This function is
|
||||||
* triggered when there are memory registration requests from various
|
* called explicitly by every BTL that supports CUDA-aware operation.
|
||||||
* BTLs. This function will register some mca variables and then open
|
* It is called while the BTL component is being opened or initialized.
|
||||||
|
* This function will register some mca variables and then open
|
||||||
* and load the symbols needed from the CUDA driver library. Look for
|
* and load the symbols needed from the CUDA driver library. Look for
|
||||||
* the SONAME of the library which is libcuda.so.1. In most cases,
|
* the SONAME of the library which is libcuda.so.1. In most cases,
|
||||||
* this will result in the library found. However, there are some
|
* this will result in the library found. However, there are some
|
||||||
@ -232,10 +233,14 @@ int mca_common_cuda_stage_one_init(void)
|
|||||||
int errsize;
|
int errsize;
|
||||||
bool stage_one_init_passed = false;
|
bool stage_one_init_passed = false;
|
||||||
|
|
||||||
if (true == stage_one_init_complete) {
|
stage_one_init_ref_count++;
|
||||||
return 0;
|
if (stage_one_init_ref_count > 1) {
|
||||||
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
|
"CUDA: stage_one_init_ref_count is now %d, no need to init",
|
||||||
|
stage_one_init_ref_count);
|
||||||
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
stage_one_init_complete = true;
|
|
||||||
OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
|
OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
|
||||||
OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
|
OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
|
||||||
OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
|
OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
|
||||||
@ -313,6 +318,10 @@ int mca_common_cuda_stage_one_init(void)
|
|||||||
mca_common_cuda_output = opal_output_open(NULL);
|
mca_common_cuda_output = opal_output_open(NULL);
|
||||||
opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
|
opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
|
||||||
|
|
||||||
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
|
"CUDA: stage_one_init_ref_count is now %d, initializing",
|
||||||
|
stage_one_init_ref_count);
|
||||||
|
|
||||||
/* First check if the support is enabled. In the case that the user has
|
/* First check if the support is enabled. In the case that the user has
|
||||||
* turned it off, we do not need to continue with any CUDA specific
|
* turned it off, we do not need to continue with any CUDA specific
|
||||||
* initialization. Do this after MCA parameter registration. */
|
* initialization. Do this after MCA parameter registration. */
|
||||||
@ -351,7 +360,7 @@ int mca_common_cuda_stage_one_init(void)
|
|||||||
* paths from the system. For the second loop, set /usr/lib64 to
|
* paths from the system. For the second loop, set /usr/lib64 to
|
||||||
* the search path and try again. This is done to handle the case
|
* the search path and try again. This is done to handle the case
|
||||||
* where we have both 32 and 64 bit libcuda.so libraries installed.
|
* where we have both 32 and 64 bit libcuda.so libraries installed.
|
||||||
* Even when running in 64-bit mode, the /usr/lib direcotry
|
* Even when running in 64-bit mode, the /usr/lib directory
|
||||||
* is searched first and we may find a 32-bit libcuda.so.1 library.
|
* is searched first and we may find a 32-bit libcuda.so.1 library.
|
||||||
* Loading of this library will fail as libtool does not handle having
|
* Loading of this library will fail as libtool does not handle having
|
||||||
* the wrong ABI in the search path (unlike ld or ld.so). Note that
|
* the wrong ABI in the search path (unlike ld or ld.so). Note that
|
||||||
@ -515,7 +524,8 @@ static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *fta
|
|||||||
* This is the last phase of initialization. This is triggered when we examine
|
* This is the last phase of initialization. This is triggered when we examine
|
||||||
* a buffer pointer and determine it is a GPU buffer. We then assume the user
|
* a buffer pointer and determine it is a GPU buffer. We then assume the user
|
||||||
* has selected their GPU and we can go ahead with all the CUDA related
|
* has selected their GPU and we can go ahead with all the CUDA related
|
||||||
* initializations.
|
* initializations. If we get an error, just return. Cleanup of resources
|
||||||
|
* will happen when fini is called.
|
||||||
*/
|
*/
|
||||||
static int mca_common_cuda_stage_three_init(void)
|
static int mca_common_cuda_stage_three_init(void)
|
||||||
{
|
{
|
||||||
@ -597,8 +607,6 @@ static int mca_common_cuda_stage_three_init(void)
|
|||||||
#if OPAL_CUDA_SUPPORT_41
|
#if OPAL_CUDA_SUPPORT_41
|
||||||
if (true == mca_common_cuda_enabled) {
|
if (true == mca_common_cuda_enabled) {
|
||||||
/* Set up an array to store outstanding IPC async copy events */
|
/* Set up an array to store outstanding IPC async copy events */
|
||||||
cuda_event_ipc_array = NULL;
|
|
||||||
cuda_event_ipc_frag_array = NULL;
|
|
||||||
cuda_event_ipc_num_used = 0;
|
cuda_event_ipc_num_used = 0;
|
||||||
cuda_event_ipc_first_avail = 0;
|
cuda_event_ipc_first_avail = 0;
|
||||||
cuda_event_ipc_first_used = 0;
|
cuda_event_ipc_first_used = 0;
|
||||||
@ -638,8 +646,6 @@ static int mca_common_cuda_stage_three_init(void)
|
|||||||
if (true == mca_common_cuda_enabled) {
|
if (true == mca_common_cuda_enabled) {
|
||||||
/* Set up an array to store outstanding async dtoh events. Used on the
|
/* Set up an array to store outstanding async dtoh events. Used on the
|
||||||
* sending side for asynchronous copies. */
|
* sending side for asynchronous copies. */
|
||||||
cuda_event_dtoh_array = NULL;
|
|
||||||
cuda_event_dtoh_frag_array = NULL;
|
|
||||||
cuda_event_dtoh_num_used = 0;
|
cuda_event_dtoh_num_used = 0;
|
||||||
cuda_event_dtoh_first_avail = 0;
|
cuda_event_dtoh_first_avail = 0;
|
||||||
cuda_event_dtoh_first_used = 0;
|
cuda_event_dtoh_first_used = 0;
|
||||||
@ -649,7 +655,7 @@ static int mca_common_cuda_stage_three_init(void)
|
|||||||
opal_show_help("help-mpi-common-cuda.txt", "No memory",
|
opal_show_help("help-mpi-common-cuda.txt", "No memory",
|
||||||
true, OPAL_PROC_MY_HOSTNAME);
|
true, OPAL_PROC_MY_HOSTNAME);
|
||||||
rc = OPAL_ERROR;
|
rc = OPAL_ERROR;
|
||||||
goto cleanup_and_error;
|
goto cleanup_and_error;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Create the events since they can be reused. */
|
/* Create the events since they can be reused. */
|
||||||
@ -676,8 +682,6 @@ static int mca_common_cuda_stage_three_init(void)
|
|||||||
|
|
||||||
/* Set up an array to store outstanding async htod events. Used on the
|
/* Set up an array to store outstanding async htod events. Used on the
|
||||||
* receiving side for asynchronous copies. */
|
* receiving side for asynchronous copies. */
|
||||||
cuda_event_htod_array = NULL;
|
|
||||||
cuda_event_htod_frag_array = NULL;
|
|
||||||
cuda_event_htod_num_used = 0;
|
cuda_event_htod_num_used = 0;
|
||||||
cuda_event_htod_first_avail = 0;
|
cuda_event_htod_first_avail = 0;
|
||||||
cuda_event_htod_first_used = 0;
|
cuda_event_htod_first_used = 0;
|
||||||
@ -784,53 +788,100 @@ static int mca_common_cuda_stage_three_init(void)
|
|||||||
|
|
||||||
/* If we are here, something went wrong. Cleanup and return an error. */
|
/* If we are here, something went wrong. Cleanup and return an error. */
|
||||||
cleanup_and_error:
|
cleanup_and_error:
|
||||||
for (i = 0; i < cuda_event_max; i++) {
|
|
||||||
if (NULL != cuda_event_ipc_array[i]) {
|
|
||||||
cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
|
|
||||||
}
|
|
||||||
if (NULL != cuda_event_htod_array[i]) {
|
|
||||||
cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
|
|
||||||
}
|
|
||||||
if (NULL != cuda_event_dtoh_array[i]) {
|
|
||||||
cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (NULL != cuda_event_ipc_array) {
|
|
||||||
free(cuda_event_ipc_array);
|
|
||||||
}
|
|
||||||
if (NULL != cuda_event_htod_array) {
|
|
||||||
free(cuda_event_htod_array);
|
|
||||||
}
|
|
||||||
if (NULL != cuda_event_dtoh_array) {
|
|
||||||
free(cuda_event_dtoh_array);
|
|
||||||
}
|
|
||||||
if (NULL != cuda_event_ipc_frag_array) {
|
|
||||||
free(cuda_event_ipc_frag_array);
|
|
||||||
}
|
|
||||||
if (NULL != cuda_event_htod_frag_array) {
|
|
||||||
free(cuda_event_ipc_frag_array);
|
|
||||||
}
|
|
||||||
if (NULL != cuda_event_dtoh_frag_array) {
|
|
||||||
free(cuda_event_dtoh_frag_array);
|
|
||||||
}
|
|
||||||
if (NULL != ipcStream) {
|
|
||||||
cuFunc.cuStreamDestroy(ipcStream);
|
|
||||||
}
|
|
||||||
if (NULL != dtohStream) {
|
|
||||||
cuFunc.cuStreamDestroy(dtohStream);
|
|
||||||
}
|
|
||||||
if (NULL != htodStream) {
|
|
||||||
cuFunc.cuStreamDestroy(htodStream);
|
|
||||||
}
|
|
||||||
if (NULL != memcpyStream) {
|
|
||||||
cuFunc.cuStreamDestroy(memcpyStream);
|
|
||||||
}
|
|
||||||
opal_atomic_mb(); /* Make sure next statement does not get reordered */
|
opal_atomic_mb(); /* Make sure next statement does not get reordered */
|
||||||
stage_three_init_complete = true;
|
stage_three_init_complete = true;
|
||||||
OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
|
OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cleanup all CUDA resources.
|
||||||
|
*
|
||||||
|
* Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm
|
||||||
|
* mpool. Looks like with the memory pool from openib (grdma), the unregistering is
|
||||||
|
* called as the free list is destructed. Not true for the sm mpool. This means we
|
||||||
|
* are currently still leaking some host memory we registered with CUDA.
|
||||||
|
*/
|
||||||
|
void mca_common_cuda_fini(void)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
if (0 == stage_one_init_ref_count) {
|
||||||
|
opal_output_verbose(20, mca_common_cuda_output,
|
||||||
|
"CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete",
|
||||||
|
stage_one_init_ref_count);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (1 == stage_one_init_ref_count) {
|
||||||
|
opal_output_verbose(20, mca_common_cuda_output,
|
||||||
|
"CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up",
|
||||||
|
stage_one_init_ref_count);
|
||||||
|
|
||||||
|
if (NULL != cuda_event_ipc_array) {
|
||||||
|
for (i = 0; i < cuda_event_max; i++) {
|
||||||
|
if (NULL != cuda_event_ipc_array[i]) {
|
||||||
|
cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(cuda_event_ipc_array);
|
||||||
|
}
|
||||||
|
if (NULL != cuda_event_htod_array) {
|
||||||
|
for (i = 0; i < cuda_event_max; i++) {
|
||||||
|
if (NULL != cuda_event_htod_array[i]) {
|
||||||
|
cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(cuda_event_htod_array);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NULL != cuda_event_dtoh_array) {
|
||||||
|
for (i = 0; i < cuda_event_max; i++) {
|
||||||
|
if (NULL != cuda_event_dtoh_array[i]) {
|
||||||
|
cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(cuda_event_dtoh_array);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NULL != cuda_event_ipc_frag_array) {
|
||||||
|
free(cuda_event_ipc_frag_array);
|
||||||
|
}
|
||||||
|
if (NULL != cuda_event_htod_frag_array) {
|
||||||
|
free(cuda_event_htod_frag_array);
|
||||||
|
}
|
||||||
|
if (NULL != cuda_event_dtoh_frag_array) {
|
||||||
|
free(cuda_event_dtoh_frag_array);
|
||||||
|
}
|
||||||
|
if (NULL != ipcStream) {
|
||||||
|
cuFunc.cuStreamDestroy(ipcStream);
|
||||||
|
}
|
||||||
|
if (NULL != dtohStream) {
|
||||||
|
cuFunc.cuStreamDestroy(dtohStream);
|
||||||
|
}
|
||||||
|
if (NULL != htodStream) {
|
||||||
|
cuFunc.cuStreamDestroy(htodStream);
|
||||||
|
}
|
||||||
|
if (NULL != memcpyStream) {
|
||||||
|
cuFunc.cuStreamDestroy(memcpyStream);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&common_cuda_init_lock);
|
||||||
|
OBJ_DESTRUCT(&common_cuda_htod_lock);
|
||||||
|
OBJ_DESTRUCT(&common_cuda_dtoh_lock);
|
||||||
|
OBJ_DESTRUCT(&common_cuda_ipc_lock);
|
||||||
|
if (NULL != libcuda_handle) {
|
||||||
|
opal_lt_dlclose(libcuda_handle);
|
||||||
|
opal_lt_dlexit();
|
||||||
|
}
|
||||||
|
opal_output_close(mca_common_cuda_output);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
opal_output_verbose(20, mca_common_cuda_output,
|
||||||
|
"CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use",
|
||||||
|
stage_one_init_ref_count);
|
||||||
|
}
|
||||||
|
stage_one_init_ref_count--;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Call the CUDA register function so we pin the memory in the CUDA
|
* Call the CUDA register function so we pin the memory in the CUDA
|
||||||
@ -843,17 +894,6 @@ void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
|
|||||||
if (!opal_cuda_support)
|
if (!opal_cuda_support)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/* Registering memory during BTL initialization will be the first call
|
|
||||||
* into the cuda common code, so this is where we do the first
|
|
||||||
* initialization function. If the first stage fails, then disable
|
|
||||||
* support and return. */
|
|
||||||
if (!stage_one_init_complete) {
|
|
||||||
if (0 != mca_common_cuda_stage_one_init()) {
|
|
||||||
opal_cuda_support = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!common_cuda_initialized) {
|
if (!common_cuda_initialized) {
|
||||||
OPAL_THREAD_LOCK(&common_cuda_init_lock);
|
OPAL_THREAD_LOCK(&common_cuda_init_lock);
|
||||||
if (!common_cuda_initialized) {
|
if (!common_cuda_initialized) {
|
||||||
|
@ -74,6 +74,7 @@ OPAL_DECLSPEC int mca_common_cuda_get_device(int *devicenum);
|
|||||||
OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
|
OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
|
||||||
OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void);
|
OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void);
|
||||||
OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
|
OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
|
||||||
|
OPAL_DECLSPEC void mca_common_cuda_fini(void);
|
||||||
#if OPAL_CUDA_GDR_SUPPORT
|
#if OPAL_CUDA_GDR_SUPPORT
|
||||||
OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
|
OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
|
||||||
OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
|
OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user