
Add extra check during cleanup to make sure we really should clean up the CUDA resources.

This commit is contained in:
Rolf vandeVaart 2015-03-06 13:12:08 -05:00
parent c7aaeef0cb
commit 237c268a09
2 changed files with 70 additions and 31 deletions
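
The change works as follows: stage-three init host-registers an otherwise unused integer, checkmem; mca_common_cuda_fini later tries to unregister it, and only if that call succeeds (ctx_ok) does it call back into CUDA to destroy events and streams. Below is a minimal standalone sketch of the same canary idea, assuming the plain CUDA driver API rather than Open MPI's dlopen'd cuFunc table; the build line is an assumption.

    /* canary.c -- sketch of the "canary" probe this commit adds.
     * Register a dummy host variable at init; at shutdown, try to
     * unregister it.  If that fails, the context is gone (e.g. the
     * application called cudaDeviceReset), so skip all CUDA cleanup.
     * Assumed build: cc canary.c -lcuda */
    #include <stdio.h>
    #include <cuda.h>

    static int checkmem;    /* never read; exists only to be registered */

    int main(void)
    {
        CUdevice dev;
        CUcontext ctx;
        CUresult res;

        /* Setup; error checks elided for brevity. */
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);

        /* "init": tie the canary to the current context. */
        if (cuMemHostRegister(&checkmem, sizeof(int), 0) != CUDA_SUCCESS) {
            fprintf(stderr, "cuMemHostRegister failed\n");
            return 1;
        }

        cuCtxDestroy(ctx);  /* stand-in for a user-level device reset */

        /* "fini": probe the context before any other CUDA cleanup. */
        res = cuMemHostUnregister(&checkmem);
        if (CUDA_SUCCESS == res) {
            printf("context valid: safe to destroy events/streams\n");
        } else {
            printf("context gone (res=%d): skip CUDA cleanup\n", (int)res);
        }
        return 0;
    }

Per the comment in the diff, cuMemHostUnregister was chosen as the probe because it was the one call that did not itself cause problems when made against a dead context.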

View file

@@ -9,7 +9,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2006 The Regents of the University of California.
  * All rights reserved.
- * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -187,6 +187,12 @@ static int cuda_event_htod_most = 0;
 /* Handle to libcuda.so */
 opal_lt_dlhandle libcuda_handle = NULL;
+/* Unused variable that we register at init time and unregister at fini time.
+ * This is used to detect if user has done a device reset prior to MPI_Finalize.
+ * This is a workaround to avoid SEGVs.
+ */
+static int checkmem;
+
 #define CUDA_COMMON_TIMING 0
 #if OPAL_ENABLE_DEBUG
 /* Some timing support structures. Enable this to help analyze
@@ -785,6 +791,19 @@ static int mca_common_cuda_stage_three_init(void)
         }
     }
+
+    res = cuFunc.cuMemHostRegister(&checkmem, sizeof(int), 0);
+    if (res != CUDA_SUCCESS) {
+        /* If registering the memory fails, print a message and continue.
+         * This is not a fatal error. */
+        opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
+                       true, &checkmem, sizeof(int),
+                       OPAL_PROC_MY_HOSTNAME, res, "checkmem");
+    } else {
+        opal_output_verbose(20, mca_common_cuda_output,
+                            "CUDA: cuMemHostRegister OK on test region");
+    }
+
     opal_output_verbose(30, mca_common_cuda_output,
                         "CUDA: initialized");
     opal_atomic_mb(); /* Make sure next statement does not get reordered */
@@ -811,7 +830,8 @@
  */
 void mca_common_cuda_fini(void)
 {
-    int i;
+    int i, ctx_ok = 0;
+    CUresult res;
 
     if (0 == stage_one_init_ref_count) {
         opal_output_verbose(20, mca_common_cuda_output,
@@ -822,30 +842,49 @@ void mca_common_cuda_fini(void)
     if (1 == stage_one_init_ref_count) {
         opal_output_verbose(20, mca_common_cuda_output,
-                            "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up",
+                            "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up started",
                             stage_one_init_ref_count);
+
+        /* This call is in here to make sure the context is still valid.
+         * This was the one way of checking which did not cause problems
+         * while calling into the CUDA library. This check will detect if
+         * a user has called cudaDeviceReset prior to MPI_Finalize. If so,
+         * then this call will fail and we skip cleaning up CUDA resources. */
+        res = cuFunc.cuMemHostUnregister(&checkmem);
+        if (CUDA_SUCCESS == res) {
+            ctx_ok = 1;
+        }
+        opal_output_verbose(20, mca_common_cuda_output,
+                            "CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d",
+                            res, ctx_ok);
+
         if (NULL != cuda_event_ipc_array) {
-            for (i = 0; i < cuda_event_max; i++) {
-                if (NULL != cuda_event_ipc_array[i]) {
-                    cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
-                }
-            }
+            if (ctx_ok) {
+                for (i = 0; i < cuda_event_max; i++) {
+                    if (NULL != cuda_event_ipc_array[i]) {
+                        cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
+                    }
+                }
+            }
             free(cuda_event_ipc_array);
         }
         if (NULL != cuda_event_htod_array) {
-            for (i = 0; i < cuda_event_max; i++) {
-                if (NULL != cuda_event_htod_array[i]) {
-                    cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
+            if (ctx_ok) {
+                for (i = 0; i < cuda_event_max; i++) {
+                    if (NULL != cuda_event_htod_array[i]) {
+                        cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
+                    }
                 }
             }
             free(cuda_event_htod_array);
         }
         if (NULL != cuda_event_dtoh_array) {
-            for (i = 0; i < cuda_event_max; i++) {
-                if (NULL != cuda_event_dtoh_array[i]) {
-                    cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
+            if (ctx_ok) {
+                for (i = 0; i < cuda_event_max; i++) {
+                    if (NULL != cuda_event_dtoh_array[i]) {
+                        cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
+                    }
                 }
             }
             free(cuda_event_dtoh_array);
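
The three event-array blocks above repeat one pattern: destroy CUDA events only when ctx_ok says the context probe succeeded, but always free the host-side array. A hypothetical condensed form of that pattern (destroy_event_array does not exist in the tree, and the real code calls cuEventDestroy through the cuFunc table):

    #include <stdlib.h>
    #include <cuda.h>

    /* Hypothetical helper condensing the repeated cleanup pattern:
     * only touch CUDA when the context probe succeeded (ctx_ok),
     * but always release the host-side array. */
    static void destroy_event_array(CUevent *arr, int n, int ctx_ok)
    {
        int i;
        if (NULL == arr) {
            return;
        }
        if (ctx_ok) {
            for (i = 0; i < n; i++) {
                if (NULL != arr[i]) {
                    cuEventDestroy(arr[i]);
                }
            }
        }
        free(arr);  /* host memory: safe regardless of context state */
    }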
@@ -860,16 +899,16 @@ void mca_common_cuda_fini(void)
         if (NULL != cuda_event_dtoh_frag_array) {
             free(cuda_event_dtoh_frag_array);
         }
-        if (NULL != ipcStream) {
+        if ((NULL != ipcStream) && ctx_ok) {
             cuFunc.cuStreamDestroy(ipcStream);
         }
-        if (NULL != dtohStream) {
+        if ((NULL != dtohStream) && ctx_ok) {
             cuFunc.cuStreamDestroy(dtohStream);
         }
-        if (NULL != htodStream) {
+        if ((NULL != htodStream) && ctx_ok) {
             cuFunc.cuStreamDestroy(htodStream);
         }
-        if (NULL != memcpyStream) {
+        if ((NULL != memcpyStream) && ctx_ok) {
             cuFunc.cuStreamDestroy(memcpyStream);
         }
         OBJ_DESTRUCT(&common_cuda_init_lock);
@@ -880,6 +919,11 @@ void mca_common_cuda_fini(void)
             opal_lt_dlclose(libcuda_handle);
             opal_lt_dlexit();
         }
+
+        opal_output_verbose(20, mca_common_cuda_output,
+                            "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up all done",
+                            stage_one_init_ref_count);
+
         opal_output_close(mca_common_cuda_output);
     } else {
@@ -959,11 +1003,12 @@ void mca_common_cuda_unregister(void *ptr, char *msg) {
     if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
         res = cuFunc.cuMemHostUnregister(ptr);
         if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
-            /* If unregistering the memory fails, print a message and continue.
-             * This is not a fatal error. */
-            opal_show_help("help-mpi-common-cuda.txt", "cuMemHostUnregister failed",
-                           true, ptr,
-                           OPAL_PROC_MY_HOSTNAME, res, msg);
+            /* If unregistering the memory fails, just continue. This is during
+             * shutdown. Only print when running in verbose mode. */
+            opal_output_verbose(20, mca_common_cuda_output,
+                                "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, mpool=%s",
+                                ptr, res, msg);
         } else {
             opal_output_verbose(20, mca_common_cuda_output,
                                 "CUDA: cuMemHostUnregister OK on mpool %s: "

View file

@@ -1,6 +1,6 @@
 # -*- text -*-
 #
-# Copyright (c) 2011-2014 NVIDIA. All rights reserved.
+# Copyright (c) 2011-2015 NVIDIA. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -48,12 +48,6 @@ The call to cuMemHostRegister(%p, %d, 0) failed.
 cuMemHostRegister return value: %d
 Memory Pool: %s
 #
-[cuMemHostUnregister failed]
-The call to cuMemHostUnregister(%p) failed.
-Host: %s
-cuMemHostUnregister return value: %d
-Memory Pool: %s
-#
 [cuIpcGetMemHandle failed]
 The call to cuIpcGetMemHandle failed. This means the GPU RDMA protocol
 cannot be used.
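
For reference, the [cuMemHostUnregister failed] section removed above was the template behind the opal_show_help call deleted from the C file: the topic string selects the section and the trailing arguments fill its %p/%s/%d placeholders in order. The mapping, as inferred from the two diffs in this commit:

    opal_show_help("help-mpi-common-cuda.txt",   /* file containing the [topic] sections */
                   "cuMemHostUnregister failed", /* section to format and print          */
                   true,                         /* prepend the standard error header    */
                   ptr,                          /* -> %p in "cuMemHostUnregister(%p)"   */
                   OPAL_PROC_MY_HOSTNAME,        /* -> %s in "Host: %s"                  */
                   res,                          /* -> %d return value                   */
                   msg);                         /* -> %s memory pool name               */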