From 54ab0d1a513f738ee04a772012b4a7cadcc6f49f Mon Sep 17 00:00:00 2001
From: Rolf vandeVaart <rvandevaart@nvidia.com>
Date: Thu, 27 Aug 2015 17:15:28 -0400
Subject: [PATCH] Add config code to check for need of workaround. Add runtime
 way to turn it off just in case

---
 opal/mca/common/cuda/common_cuda.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c
index 9a7e283f79..8097508945 100644
--- a/opal/mca/common/cuda/common_cuda.c
+++ b/opal/mca/common/cuda/common_cuda.c
@@ -126,6 +126,7 @@ static CUstream ipcStream = NULL;
 static CUstream dtohStream = NULL;
 static CUstream htodStream = NULL;
 static CUstream memcpyStream = NULL;
+static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 0 : 1;
 static opal_mutex_t common_cuda_init_lock;
 static opal_mutex_t common_cuda_htod_lock;
 static opal_mutex_t common_cuda_dtoh_lock;
@@ -300,6 +301,13 @@ void mca_common_cuda_register_mca_variables(void)
                                  MCA_BASE_VAR_SCOPE_READONLY,
                                  &mca_common_cuda_cumemcpy_timing);
 #endif /* OPAL_ENABLE_DEBUG */
+
+    (void) mca_base_var_register("ompi", "mpi", "common_cuda", "gpu_mem_check_workaround",
+                                 "Set to 0 to disable GPU memory check workaround. A user would rarely have to do this.",
+                                 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                 OPAL_INFO_LVL_9,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &mca_common_cuda_gpu_mem_check_workaround);
 }
 
 /**
@@ -774,6 +782,9 @@ static int mca_common_cuda_stage_three_init(void)
                             "CUDA: cuMemHostRegister OK on test region");
     }
 
+    opal_output_verbose(20, mca_common_cuda_output,
+                        "CUDA: the extra gpu memory check is %s", (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on":"off");
+
     opal_output_verbose(30, mca_common_cuda_output,
                         "CUDA: initialized");
     opal_atomic_mb();  /* Make sure next statement does not get reordered */
@@ -1832,7 +1843,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
      * made it this far, then the assumption at this point is we have GPU memory.
      * Unfotunately, this extra call is costing us another 100 ns almost doubling
      * the cost of this entire function. */
-    {
+    if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) {
         CUdeviceptr pbase;
         size_t psize;
         res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf);