From 7703c964960226b4010bb55b7d998ec1160bd63a Mon Sep 17 00:00:00 2001
From: Rolf vandeVaart <rvandevaart@nvidia.com>
Date: Mon, 20 Jul 2015 17:07:26 -0400
Subject: [PATCH] Add a workaroud for issue in libcuda.so library

---
 opal/mca/common/cuda/common_cuda.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c
index be03f962a3..2c9785e9fc 100644
--- a/opal/mca/common/cuda/common_cuda.c
+++ b/opal/mca/common/cuda/common_cuda.c
@@ -1749,6 +1749,9 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
     void *attrdata[] = {(void *)&memType, (void *)&ctx, (void *)&isManaged};
 
     res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf);
+    OPAL_OUTPUT_VERBOSE((101, mca_common_cuda_output, 
+			"dbuf=%p, memType=%d, ctx=%p, isManaged=%d, res=%d",
+			 (void *)dbuf, (int)memType, (void *)ctx, isManaged, res));
 
     /* Mark unified memory buffers with a flag.  This will allow all unified
      * memory to be forced through host buffers.  Note that this memory can
@@ -1822,6 +1825,26 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
         }
     }
 
+    /* WORKAROUND - They are times when the above code determines a pice of memory
+     * is GPU memory, but it actually is not.  That has been seen on multi-GPU systems
+     * with 6 or 8 GPUs on them. Therefore, we will do this extra check.  Note if we
+     * made it this far, then the assumption at this point is we have GPU memory.
+     * Unfotunately, this extra call is costing us another 100 ns almost doubling
+     * the cost of this entire function. */
+    {
+        CUdeviceptr pbase;
+        size_t psize;
+        res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf);
+        if (CUDA_SUCCESS != res) {
+            opal_output_verbose(5, mca_common_cuda_output, 
+                                "CUDA: cuMemGetAddressRange failed on this pointer: res=%d, buf=%p "
+                                "Overriding check and setting to host pointer. ",
+                              res, (void *)dbuf);
+            /* This cannot be GPU memory if the previous call failed */
+            return 0;
+        }
+    }
+
     /* First access on a device pointer finalizes CUDA support initialization.
      * If initialization fails, disable support. */
     if (!stage_three_init_complete) {
@@ -1829,6 +1852,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
             opal_cuda_support = 0;
         }
     }
+
     return 1;
 }