1
1

Make sure context is still around when doing some other cleanup

Этот коммит содержится в:
Rolf vandeVaart 2015-03-24 16:47:29 -04:00
родитель f588ad7af0
Коммит dfb7e00ef5
2 изменённых файлов: 29 добавлений и 18 удалений

Просмотреть файл

@@ -196,6 +196,7 @@ opal_dl_handle_t *libcuda_handle = NULL;
* This is a workaround to avoid SEGVs. * This is a workaround to avoid SEGVs.
*/ */
static int checkmem; static int checkmem;
static int ctx_ok = 1;
#define CUDA_COMMON_TIMING 0 #define CUDA_COMMON_TIMING 0
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
@@ -789,7 +790,7 @@ static int mca_common_cuda_stage_three_init(void)
*/ */
void mca_common_cuda_fini(void) void mca_common_cuda_fini(void)
{ {
int i, ctx_ok = 0; int i;
CUresult res; CUresult res;
if (0 == stage_one_init_ref_count) { if (0 == stage_one_init_ref_count) {
@@ -810,8 +811,8 @@ void mca_common_cuda_fini(void)
* a user has called cudaDeviceReset prior to MPI_Finalize. If so, * a user has called cudaDeviceReset prior to MPI_Finalize. If so,
* then this call will fail and we skip cleaning up CUDA resources. */ * then this call will fail and we skip cleaning up CUDA resources. */
res = cuFunc.cuMemHostUnregister(&checkmem); res = cuFunc.cuMemHostUnregister(&checkmem);
if (CUDA_SUCCESS == res) { if (CUDA_SUCCESS != res) {
ctx_ok = 1; ctx_ok = 0;
} }
opal_output_verbose(20, mca_common_cuda_output, opal_output_verbose(20, mca_common_cuda_output,
"CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d", "CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d",
@@ -1133,16 +1134,22 @@ int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg)
CUresult result; CUresult result;
mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)reg; mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)reg;
result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base); /* Only attempt to close if we have valid context. This can change if a call
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { * to the fini function is made and we discover context is gone. */
opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed", if (ctx_ok) {
true, result, cuda_reg->base.alloc_base); result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
/* We will just continue on and hope things continue to work. */ if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
} else { opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
opal_output_verbose(10, mca_common_cuda_output, true, result, cuda_reg->base.alloc_base);
"CUDA: cuIpcCloseMemHandle passed: base=%p", opal_output(0, "Sleep on %d", getpid());
cuda_reg->base.alloc_base); sleep(20);
CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle")); /* We will just continue on and hope things continue to work. */
} else {
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: cuIpcCloseMemHandle passed: base=%p",
cuda_reg->base.alloc_base);
CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle"));
}
} }
return OPAL_SUCCESS; return OPAL_SUCCESS;
@@ -1172,10 +1179,14 @@ void mca_common_cuda_destruct_event(uint64_t *event)
{ {
CUresult result; CUresult result;
result = cuFunc.cuEventDestroy((CUevent)event); /* Only attempt to destroy if we have valid context. This can change if a call
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { * to the fini function is made and we discover context is gone. */
opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed", if (ctx_ok) {
true, result); result = cuFunc.cuEventDestroy((CUevent)event);
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
true, result);
}
} }
} }

Просмотреть файл

@@ -85,7 +85,7 @@ memory footprint of your application.
[cuIpcCloseMemHandle failed] [cuIpcCloseMemHandle failed]
The call to cuIpcCloseMemHandle failed. This is a warning and the program The call to cuIpcCloseMemHandle failed. This is a warning and the program
will continue to run. will continue to run.
cuIpcOpenMemHandle return value: %d cuIpcCloseMemHandle return value: %d
address: %p address: %p
Check the cuda.h file for what the return value means. Perhaps a reboot Check the cuda.h file for what the return value means. Perhaps a reboot
of the node will clear the problem. of the node will clear the problem.