From f2ff6e03ab90709f5c718ca759dfa84ca8c2ba73 Mon Sep 17 00:00:00 2001
From: Rolf vandeVaart
Date: Thu, 29 Oct 2015 11:24:02 -0400
Subject: [PATCH] Make CUDA 4.1 a requirement for CUDA-aware support.

Remove all related preprocessor conditionals.
---
 config/opal_check_cuda.m4          | 17 ++++++++---------
 ompi/mca/pml/bfo/pml_bfo_cuda.c    |  7 +------
 ompi/mca/pml/ob1/pml_ob1_cuda.c    |  5 -----
 opal/mca/common/cuda/common_cuda.c | 14 --------------
 4 files changed, 9 insertions(+), 34 deletions(-)

diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4
index 7040f5c515..11456ef905 100644
--- a/config/opal_check_cuda.m4
+++ b/config/opal_check_cuda.m4
@@ -16,7 +16,7 @@ dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
 dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
 dnl                    reserved.
 dnl Copyright (c) 2009-2011 Oak Ridge National Labs. All rights reserved.
-dnl Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
+dnl Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
 dnl Copyright (c) 2015 Research Organization for Information Science
 dnl                    and Technology (RIST). All rights reserved.
 dnl
@@ -79,10 +79,13 @@ dnl common framework, and likely configured first). So we have to
 dnl defer this check until later (see the OPAL_CHECK_CUDA_AFTER_OPAL_DL m4
 dnl macro, below). :-(
 
-# If we have CUDA support, check to see if we have CUDA 4.1 support
-AS_IF([test "$opal_check_cuda_happy"="yes"],
-    AC_CHECK_MEMBER([struct CUipcMemHandle_st.reserved], [CUDA_SUPPORT_41=1], [CUDA_SUPPORT_41=0],
-        [#include <$opal_cuda_incdir/cuda.h>]),
+# We require CUDA IPC support which started in CUDA 4.1. Error
+# out if the support is not there.
+AS_IF([test "$opal_check_cuda_happy" = "yes"],
+      [AC_CHECK_MEMBER([struct CUipcMemHandle_st.reserved],
+          [],
+          [AC_MSG_ERROR([Cannot continue because CUDA 4.1 or later is required])],
+          [#include <$opal_cuda_incdir/cuda.h>])],
       [])
 
 # If we have CUDA support, check to see if we have support for SYNC_MEMOPS
@@ -125,10 +128,6 @@ AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
 AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT,
                    [Whether we want cuda device pointer support])
 
-AM_CONDITIONAL([OPAL_cuda_support_41], [test "x$CUDA_SUPPORT_41" = "x1"])
-AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT_41],$CUDA_SUPPORT_41,
-                   [Whether we have CUDA 4.1 support available])
-
 AM_CONDITIONAL([OPAL_cuda_sync_memops], [test "x$CUDA_SYNC_MEMOPS" = "x1"])
 AC_DEFINE_UNQUOTED([OPAL_CUDA_SYNC_MEMOPS],$CUDA_SYNC_MEMOPS,
                    [Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available])

diff --git a/ompi/mca/pml/bfo/pml_bfo_cuda.c b/ompi/mca/pml/bfo/pml_bfo_cuda.c
index 9c593cd691..eb35b226e0 100644
--- a/ompi/mca/pml/bfo/pml_bfo_cuda.c
+++ b/ompi/mca/pml/bfo/pml_bfo_cuda.c
@@ -11,7 +11,7 @@
  * All rights reserved.
  * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
  * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -50,7 +50,6 @@ int mca_pml_bfo_send_request_start_cuda(mca_pml_bfo_send_request_t* sendreq,
                                         mca_bml_base_btl_t* bml_btl,
                                         size_t size) {
     int rc;
-#if OPAL_CUDA_SUPPORT_41
     sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
     if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
         unsigned char *base;
@@ -81,10 +80,6 @@ int mca_pml_bfo_send_request_start_cuda(mca_pml_bfo_send_request_t* sendreq,
         sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
         rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
     }
-#else
-    /* Just do the rendezvous but set initial data to be sent to zero */
-    rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
-#endif /* OPAL_CUDA_SUPPORT_41 */
     return rc;
 }
 
diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c
index a44a8b377c..12ad396363 100644
--- a/ompi/mca/pml/ob1/pml_ob1_cuda.c
+++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c
@@ -56,7 +56,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
                                         mca_bml_base_btl_t* bml_btl,
                                         size_t size) {
     int rc;
-#if OPAL_CUDA_SUPPORT_41
 #if OPAL_CUDA_GDR_SUPPORT
     /* With some BTLs, switch to RNDV from RGET at large messages */
     if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
@@ -95,10 +94,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
         sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
         rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
     }
-#else
-    /* Just do the rendezvous but set initial data to be sent to zero */
-    rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
-#endif /* OPAL_CUDA_SUPPORT_41 */
     return rc;
 }
 
diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c
index bf966747a5..11500b6668 100644
--- a/opal/mca/common/cuda/common_cuda.c
+++ b/opal/mca/common/cuda/common_cuda.c
@@ -88,13 +88,11 @@ struct cudaFunctionTable {
     int (*cuEventDestroy)(CUevent);
     int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int);
     int (*cuMemGetAddressRange)(CUdeviceptr*, size_t*, CUdeviceptr);
-#if OPAL_CUDA_SUPPORT_41
     int (*cuIpcGetEventHandle)(CUipcEventHandle*, CUevent);
    int (*cuIpcOpenEventHandle)(CUevent*, CUipcEventHandle);
     int (*cuIpcOpenMemHandle)(CUdeviceptr*, CUipcMemHandle, unsigned int);
     int (*cuIpcCloseMemHandle)(CUdeviceptr);
     int (*cuIpcGetMemHandle)(CUipcMemHandle*, CUdeviceptr);
-#endif /* OPAL_CUDA_SUPPORT_41 */
     int (*cuCtxGetDevice)(CUdevice *);
     int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice);
     int (*cuDeviceGet)(CUdevice *, int);
@@ -156,7 +154,6 @@ OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t,
                    NULL,
                    NULL);
 
-#if OPAL_CUDA_SUPPORT_41
 static int mca_common_cuda_async = 1;
 static int mca_common_cuda_cumemcpy_async;
 #if OPAL_ENABLE_DEBUG
@@ -223,8 +220,6 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
 #define CUDA_DUMP_EVTHANDLE(a)
 #endif /* OPAL_ENABLE_DEBUG */
 
-#endif /* OPAL_CUDA_SUPPORT_41 */
-
 /* This is a seperate function so we can see these variables with ompi_info and
  * also set them with the tools interface */
 void mca_common_cuda_register_mca_variables(void)
@@ -263,7 +258,6 @@ void mca_common_cuda_register_mca_variables(void)
                                  MCA_BASE_VAR_SCOPE_READONLY,
                                  &mca_common_cuda_warning);
 
-#if OPAL_CUDA_SUPPORT_41
     /* Use this flag to test async vs sync copies */
     mca_common_cuda_async = 1;
     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "memcpy_async",
@@ -280,7 +274,6 @@ void mca_common_cuda_register_mca_variables(void)
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_READONLY,
                                  &cuda_event_max);
-#endif /* OPAL_CUDA_SUPPORT_41 */
 
     /* Use this flag to test cuMemcpyAsync vs cuMemcpy */
     mca_common_cuda_cumemcpy_async = 1;
@@ -465,13 +458,11 @@ int mca_common_cuda_stage_one_init(void)
     OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree);
     OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc);
     OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange);
-#if OPAL_CUDA_SUPPORT_41
     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle);
     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle);
     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle);
     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle);
     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle);
-#endif /* OPAL_CUDA_SUPPORT_41 */
     OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice);
     OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer);
     OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGet);
@@ -595,7 +586,6 @@ static int mca_common_cuda_stage_three_init(void)
         return OPAL_ERROR;
     }
 
-#if OPAL_CUDA_SUPPORT_41
     if (true == mca_common_cuda_enabled) {
         /* Set up an array to store outstanding IPC async copy events */
         cuda_event_ipc_num_used = 0;
@@ -633,7 +623,6 @@ static int mca_common_cuda_stage_three_init(void)
         }
     }
-#endif /* OPAL_CUDA_SUPPORT_41 */
 
     if (true == mca_common_cuda_enabled) {
         /* Set up an array to store outstanding async dtoh events. Used on the
          * sending side for asynchronous copies. */
@@ -1006,7 +995,6 @@ void mca_common_cuda_unregister(void *ptr, char *msg) {
     }
 }
 
-#if OPAL_CUDA_SUPPORT_41
 /*
  * Get the memory handle of a local section of memory that can be sent
  * to the remote size so it can access the memory. This is the
@@ -1739,8 +1727,6 @@ static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
 }
 #endif /* OPAL_ENABLE_DEBUG */
 
-#endif /* OPAL_CUDA_SUPPORT_41 */
-
 /* Routines that get plugged into the opal datatype code */
 static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor)
 {
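Note (not part of the patch): the configure change above makes the CUDA IPC API, introduced in CUDA 4.1, mandatory, and the cuIpc* entries in common_cuda.c are now compiled and loaded unconditionally. As background, here is a minimal standalone sketch of what that API does: a parent process exports a device allocation with cuIpcGetMemHandle and a forked child maps it with cuIpcOpenMemHandle. This is an illustration, not Open MPI code; the file name, the CHECK macro, and the pipe-based handle exchange are invented for the example, and it assumes a Linux system with a device that supports CUDA IPC.

/* ipc_sketch.c: export a device buffer from a parent process and import it
 * in a forked child, passing the CUipcMemHandle through a pipe.
 * Build (illustrative): gcc ipc_sketch.c -lcuda -o ipc_sketch
 */
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

#define CHECK(call)                                              \
    do {                                                         \
        CUresult err = (call);                                   \
        if (CUDA_SUCCESS != err) {                               \
            fprintf(stderr, "%s failed: %d\n", #call, (int)err); \
            exit(1);                                             \
        }                                                        \
    } while (0)

int main(void)
{
    int fds[2];
    if (0 != pipe(fds)) { perror("pipe"); return 1; }

    pid_t pid = fork();
    if (pid < 0) { perror("fork"); return 1; }

    if (0 == pid) {                      /* child: importer */
        CUdevice dev; CUcontext ctx; CUdeviceptr remote;
        CUipcMemHandle handle;
        int value = 0;

        close(fds[1]);
        /* Receive the opaque handle from the parent. */
        if (read(fds[0], &handle, sizeof(handle)) != sizeof(handle)) return 1;

        CHECK(cuInit(0));
        CHECK(cuDeviceGet(&dev, 0));
        CHECK(cuCtxCreate(&ctx, 0, dev));

        /* Map the parent's allocation into this process (CUDA >= 4.1). */
        CHECK(cuIpcOpenMemHandle(&remote, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS));
        CHECK(cuMemcpyDtoH(&value, remote, sizeof(value)));
        printf("child read %d from the parent's GPU buffer\n", value);
        CHECK(cuIpcCloseMemHandle(remote));
        return 0;
    }

    /* parent: exporter */
    CUdevice dev; CUcontext ctx; CUdeviceptr dbuf;
    CUipcMemHandle handle;
    int value = 42;

    close(fds[0]);
    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&dev, 0));
    CHECK(cuCtxCreate(&ctx, 0, dev));
    CHECK(cuMemAlloc(&dbuf, sizeof(value)));
    CHECK(cuMemcpyHtoD(dbuf, &value, sizeof(value)));

    /* Export the allocation; CUipcMemHandle is the opaque struct whose
     * .reserved member the AC_CHECK_MEMBER probe in the m4 change tests. */
    CHECK(cuIpcGetMemHandle(&handle, dbuf));
    if (write(fds[1], &handle, sizeof(handle)) != sizeof(handle)) return 1;

    waitpid(pid, NULL, 0);               /* keep dbuf alive until the child is done */
    CHECK(cuMemFree(dbuf));
    return 0;
}

The handle pushed through the pipe here plays the same role as the one Open MPI sends to a peer rank over its own control channel before the peer calls cuIpcOpenMemHandle.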
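Also for background (again, not part of the patch): mca_common_cuda_stage_one_init() resolves the cuIpc* entry points at run time via OPAL_CUDA_DLSYM, so after this change a missing symbol surfaces as an initialization failure rather than a compiled-out code path. The sketch below is a simplified stand-in for that pattern using plain dlopen/dlsym instead of Open MPI's opal_dl wrapper; the library name libcuda.so.1 and the my_ prefix are assumptions made for the example.

/* dlsym_sketch.c: resolve one of the now-mandatory IPC symbols at run time.
 * Build (illustrative): gcc dlsym_sketch.c -ldl -o dlsym_sketch
 */
#include <cuda.h>
#include <dlfcn.h>
#include <stdio.h>

/* Function-pointer slot, in the spirit of Open MPI's cudaFunctionTable. */
static CUresult (*my_cuIpcGetMemHandle)(CUipcMemHandle *, CUdeviceptr);

int main(void)
{
    void *handle = dlopen("libcuda.so.1", RTLD_LAZY);
    if (NULL == handle) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    /* With CUDA 4.1 required at configure time, a driver library that
     * lacks this symbol is treated as an error, not a fallback case. */
    my_cuIpcGetMemHandle = (CUresult (*)(CUipcMemHandle *, CUdeviceptr))
        dlsym(handle, "cuIpcGetMemHandle");
    if (NULL == my_cuIpcGetMemHandle) {
        fprintf(stderr, "cuIpcGetMemHandle not found (driver older than CUDA 4.1?)\n");
        dlclose(handle);
        return 1;
    }

    printf("cuIpcGetMemHandle resolved successfully\n");
    dlclose(handle);
    return 0;
}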