From 54ab0d1a513f738ee04a772012b4a7cadcc6f49f Mon Sep 17 00:00:00 2001 From: Rolf vandeVaart Date: Thu, 27 Aug 2015 17:15:28 -0400 Subject: [PATCH] Add config code to check for need of workaround. Add runtime way to turn it off just in case --- opal/mca/common/cuda/common_cuda.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 9a7e283f79..8097508945 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -126,6 +126,7 @@ static CUstream ipcStream = NULL; static CUstream dtohStream = NULL; static CUstream htodStream = NULL; static CUstream memcpyStream = NULL; +static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 0 : 1; static opal_mutex_t common_cuda_init_lock; static opal_mutex_t common_cuda_htod_lock; static opal_mutex_t common_cuda_dtoh_lock; @@ -300,6 +301,13 @@ void mca_common_cuda_register_mca_variables(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_common_cuda_cumemcpy_timing); #endif /* OPAL_ENABLE_DEBUG */ + + (void) mca_base_var_register("ompi", "mpi", "common_cuda", "gpu_mem_check_workaround", + "Set to 0 to disable GPU memory check workaround. A user would rarely have to do this.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_common_cuda_gpu_mem_check_workaround); } /** @@ -774,6 +782,9 @@ static int mca_common_cuda_stage_three_init(void) "CUDA: cuMemHostRegister OK on test region"); } + opal_output_verbose(20, mca_common_cuda_output, + "CUDA: the extra gpu memory check is %s", (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on":"off"); + opal_output_verbose(30, mca_common_cuda_output, "CUDA: initialized"); opal_atomic_mb(); /* Make sure next statement does not get reordered */ @@ -1832,7 +1843,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t * made it this far, then the assumption at this point is we have GPU memory. * Unfotunately, this extra call is costing us another 100 ns almost doubling * the cost of this entire function. */ - { + if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) { CUdeviceptr pbase; size_t psize; res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf);