
Change some CUDA configure code and macro names per review request by jsquyres in ticket #3880.

Functionally, nothing changes.

This commit was SVN r29815.
This commit is contained in:
Rolf vandeVaart 2013-12-06 14:35:10 +00:00
parent 231ebb09c9
commit d556b60b21
12 changed files with 66 additions and 51 deletions
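
For orientation: the rename replaces the single version-based guard with two feature-specific ones. Below is a hypothetical excerpt of the generated opal_config.h after this change; the values of 1 are illustrative only, since the real values are set by the configure probes shown in the first hunk.

/* Hypothetical excerpt of the generated opal_config.h; values come from configure. */
#define OPAL_CUDA_SUPPORT_41  1  /* struct CUipcMemHandle_st has "reserved" (CUDA >= 4.1) */
#define OPAL_CUDA_SYNC_MEMOPS 1  /* CU_POINTER_ATTRIBUTE_SYNC_MEMOPS is declared */
#define OPAL_CUDA_GDR_SUPPORT 1  /* CUDA_VERSION >= 6000, used as a proxy for GPU Direct RDMA */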

View file

@@ -71,19 +71,28 @@ AS_IF([test "$with_cuda" = "no" -o "x$with_cuda" = "x"],
 # If we have CUDA support, check to see if we have CUDA 4.1 support
 AS_IF([test "$opal_check_cuda_happy"="yes"],
 AC_CHECK_HEADER([$opal_cuda_incdir/cuda.h])
 AC_CHECK_MEMBER([struct CUipcMemHandle_st.reserved], [CUDA_SUPPORT_41=1], [CUDA_SUPPORT_41=0],
 [#include <$opal_cuda_incdir/cuda.h>]),
 [])
-# If we have CUDA support, check to see if we have CUDA 6.0 support.
-# Look for new CUDA 6.0 attribute.
+# If we have CUDA support, check to see if we have support for SYNC_MEMOPS
+# which was first introduced in CUDA 6.0.
 AS_IF([test "$opal_check_cuda_happy"="yes"],
 AC_CHECK_HEADER([$opal_cuda_incdir/cuda.h])
-AC_CHECK_DECL([CU_POINTER_ATTRIBUTE_BUFFER_ID], [CUDA_SUPPORT_60=1], [CUDA_SUPPORT_60=0],
+AC_CHECK_DECL([CU_POINTER_ATTRIBUTE_SYNC_MEMOPS], [CUDA_SYNC_MEMOPS=1], [CUDA_SYNC_MEMOPS=0],
 [#include <$opal_cuda_incdir/cuda.h>]),
 [])
+# If we have CUDA support, check to see if we have CUDA 6.0 or later.
+AC_COMPILE_IFELSE(
+[AC_LANG_PROGRAM([[#include <$opal_cuda_incdir/cuda.h>]],
+[[
+#if CUDA_VERSION < 6000
+#error "CUDA_VERSION is less than 6000"
+#endif
+]])],
+[CUDA_VERSION_60_OR_GREATER=1],
+[CUDA_VERSION_60_OR_GREATER=0])
 AC_MSG_CHECKING([if have cuda support])
 if test "$opal_check_cuda_happy" = "yes"; then
 AC_MSG_RESULT([yes (-I$with_cuda)])
@@ -103,8 +112,14 @@ AM_CONDITIONAL([OPAL_cuda_support_41], [test "x$CUDA_SUPPORT_41" = "x1"])
 AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT_41],$CUDA_SUPPORT_41,
 [Whether we have CUDA 4.1 support available])
-AM_CONDITIONAL([OPAL_cuda_support_60], [test "x$CUDA_SUPPORT_60" = "x1"])
-AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT_60],$CUDA_SUPPORT_60,
-[Whether we have CUDA 6.0 support available])
+AM_CONDITIONAL([OPAL_cuda_sync_memops], [test "x$CUDA_SYNC_MEMOPS" = "x1"])
+AC_DEFINE_UNQUOTED([OPAL_CUDA_SYNC_MEMOPS],$CUDA_SYNC_MEMOPS,
+[Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available])
+# There is nothing specific we can check for to see if GPU Direct RDMA is available.
+# Therefore, we check to see whether we have CUDA 6.0 or later.
+AM_CONDITIONAL([OPAL_cuda_gdr_support], [test "x$CUDA_VERSION_60_OR_GREATER" = "x1"])
+AC_DEFINE_UNQUOTED([OPAL_CUDA_GDR_SUPPORT],$CUDA_VERSION_60_OR_GREATER,
+[Whether we have CUDA GDR support available])
 ])
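
The AC_COMPILE_IFELSE probe above can be reproduced standalone. The following C translation unit (assuming cuda.h is on the include path) compiles only against a CUDA 6.0-or-newer toolkit, which is exactly the condition that sets CUDA_VERSION_60_OR_GREATER:

/* Compiles only with a CUDA >= 6.0 toolkit, mirroring the configure probe. */
#include <cuda.h>

#if CUDA_VERSION < 6000
#error "CUDA_VERSION is less than 6000"
#endif

int main(void) { return 0; }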

View file

@@ -76,7 +76,7 @@ int mca_btl_base_param_register(mca_base_component_t *version,
 OPAL_INFO_LVL_4,
 MCA_BASE_VAR_SCOPE_READONLY,
 &module->btl_eager_limit);
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 /* If no CUDA RDMA support, zero them out */
 if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) {
 module->btl_cuda_eager_limit = 0;
@@ -92,7 +92,7 @@ int mca_btl_base_param_register(mca_base_component_t *version,
 OPAL_INFO_LVL_9,
 MCA_BASE_VAR_SCOPE_READONLY,
 &module->btl_cuda_rdma_limit);
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 (void) mca_base_component_var_register(version, "max_send_size", "Maximum size (in bytes) of a single \"phase 2\" fragment of a long message when using the pipeline protocol (must be >= 1)",
 MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,

View file

@@ -843,10 +843,10 @@ struct mca_btl_base_module_t {
 mca_btl_base_module_register_error_fn_t btl_register_error;
 /** fault tolerant even notification */
 mca_btl_base_module_ft_event_fn_t btl_ft_event;
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 size_t btl_cuda_eager_limit; /**< switch from eager to RDMA */
 size_t btl_cuda_rdma_limit; /**< switch from RDMA to rndv pipeline */
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 };
 typedef struct mca_btl_base_module_t mca_btl_base_module_t;
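
A hedged sketch (not code from this commit) of how the two limits are meant to partition message sizes for CUDA buffers, per the field comments above and the ob1 hunks later in this diff:

/* Illustrative only: eager below btl_cuda_eager_limit, RDMA get up to
 * btl_cuda_rdma_limit, rendezvous pipeline beyond that. */
#include <stddef.h>

typedef enum { PROTO_EAGER, PROTO_RDMA_GET, PROTO_RNDV_PIPELINE } cuda_proto_t;

static cuda_proto_t choose_cuda_protocol(size_t msg_size,
                                         size_t cuda_eager_limit,
                                         size_t cuda_rdma_limit)
{
    if (msg_size <= cuda_eager_limit) {
        return PROTO_EAGER;          /* copy through eager buffers */
    } else if (msg_size <= cuda_rdma_limit) {
        return PROTO_RDMA_GET;       /* GPU Direct RDMA (RGET) */
    }
    return PROTO_RNDV_PIPELINE;      /* pipelined rendezvous */
}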

View file

@@ -1219,11 +1219,11 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
 openib_btl = (mca_btl_openib_module_t*)btl;
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) {
 #else
 if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) {
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 /* GMS bloody HACK! */
 if(registration != NULL || max_data > btl->btl_max_send_size) {
 frag = alloc_send_user_frag();
@@ -1382,11 +1382,11 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
 * register the region ourselves
 */
 uint32_t mflags = 0;
-#if OPAL_CUDA_SUPPORT
+#if OPAL_CUDA_GDR_SUPPORT
 if (convertor->flags & CONVERTOR_CUDA) {
 mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
 }
-#endif /* OPAL_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags,
 &registration);
 if(OMPI_SUCCESS != rc || NULL == registration) {

View file

@@ -582,7 +582,7 @@ int btl_openib_register_mca_params(void)
 mca_btl_openib_component.use_message_coalescing = 0;
 /* Indicates if library was built with GPU Direct RDMA support. Not changeable. */
-mca_btl_openib_component.cuda_have_gdr = OPAL_INT_TO_BOOL(OPAL_CUDA_SUPPORT_60);
+mca_btl_openib_component.cuda_have_gdr = OPAL_INT_TO_BOOL(OPAL_CUDA_GDR_SUPPORT);
 (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_cuda_gdr_support",
 "Whether CUDA GPU Direct RDMA support is built into library or not",
 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
@@ -601,7 +601,7 @@ int btl_openib_register_mca_params(void)
 opal_output(0, "GDR support requested but library does not have it built in.");
 return OMPI_ERROR;
 }
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 if (mca_btl_openib_component.cuda_want_gdr) {
 mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
 mca_btl_openib_module.super.btl_cuda_eager_limit = SIZE_MAX; /* magic number - indicates set it to minimum */
@@ -610,7 +610,7 @@ int btl_openib_register_mca_params(void)
 mca_btl_openib_module.super.btl_cuda_eager_limit = 0; /* Turns off any of the GPU Direct RDMA code */
 mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 #endif /* OPAL_CUDA_SUPPORT */
 CHECK(mca_btl_base_param_register(
 &mca_btl_openib_component.super.btl_version,

View file

@@ -91,9 +91,9 @@ struct cudaFunctionTable {
 int (*cuCtxGetDevice)(CUdevice *);
 int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice);
 int (*cuDeviceGet)(CUdevice *, int);
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr);
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 int (*cuCtxSetCurrent)(CUcontext);
 } cudaFunctionTable;
 typedef struct cudaFunctionTable cudaFunctionTable_t;
@@ -452,9 +452,9 @@ int mca_common_cuda_stage_one_init(void)
 OMPI_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice);
 OMPI_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer);
 OMPI_CUDA_DLSYM(libcuda_handle, cuDeviceGet);
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 OMPI_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute);
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 OMPI_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent);
 return 0;
 }
@@ -842,7 +842,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
 cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
 memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle));
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_SYNC_MEMOPS
 /* With CUDA 6.0, we can set an attribute on the memory pointer that will
 * ensure any synchronous copies are completed prior to any other access
 * of the memory region. This means we do not need to record an event
@@ -870,7 +870,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
 true, result, base);
 return OMPI_ERROR;
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_SYNC_MEMOPS */
 return OMPI_SUCCESS;
 }
@@ -994,10 +994,10 @@ void mca_common_cuda_destruct_event(uint64_t *event)
 */
 void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
 {
-#if OPAL_CUDA_SUPPORT_60
-/* No need for any of this with CUDA 6.0 */
+#if OPAL_CUDA_SYNC_MEMOPS
+/* No need for any of this with SYNC_MEMOPS feature */
 return;
-#else /* OPAL_CUDA_SUPPORT_60 */
+#else /* OPAL_CUDA_SYNC_MEMOPS */
 CUipcEventHandle evtHandle;
 CUevent event;
 CUresult result;
@@ -1035,7 +1035,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
 opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
 true, result);
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_SYNC_MEMOPS */
 }
 /*
@@ -1644,7 +1644,7 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base)
 return 0;
 }
-#if OPAL_CUDA_SUPPORT_60 && OMPI_GDR_SUPPORT
+#if OPAL_CUDA_GDR_SUPPORT && OMPI_GDR_SUPPORT
 /* Check to see if the memory was freed between the time it was stored in
 * the registration cache and now. Return true if the memory was previously
 * freed. This is indicated by the BUFFER_ID value in the registration cache
@@ -1707,5 +1707,5 @@ void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg)
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
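
The SYNC_MEMOPS attribute that replaces the IPC-event handshake can be set with the CUDA driver API directly. A minimal sketch, assuming a buffer already allocated with cuMemAlloc() (the code in this commit goes through Open MPI's dlopen()ed function table instead of calling the driver directly):

#include <cuda.h>

/* Force synchronous copies on this allocation to complete before any
 * other access to the region, per the comment in the hunk above. */
static CUresult enable_sync_memops(CUdeviceptr dbuf)
{
    unsigned int enable = 1;
    return cuPointerSetAttribute(&enable,
                                 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                 dbuf);
}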

View file

@@ -75,10 +75,10 @@ OMPI_DECLSPEC int mca_common_cuda_get_device(int *devicenum);
 OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
 OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void);
 OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
-#if OPAL_CUDA_SUPPORT_60 && OMPI_GDR_SUPPORT
+#if OPAL_CUDA_GDR_SUPPORT && OMPI_GDR_SUPPORT
 OMPI_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
 OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 /**
 * Return: 0 if no packing is required for sending (the upper layer
 * can use directly the pointer to the contiguous user

View file

@@ -36,9 +36,9 @@
 #include "opal/align.h"
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 #include "ompi/mca/common/cuda/common_cuda.h"
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 #include "ompi/mca/rcache/rcache.h"
 #include "ompi/mca/rcache/base/base.h"
 #include "ompi/mca/rte/rte.h"
@@ -47,9 +47,9 @@
 #include "ompi/mca/mpool/base/base.h"
 #include "mpool_grdma.h"
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size);
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 static void mca_mpool_grdma_pool_contructor (mca_mpool_grdma_pool_t *pool)
 {
 memset ((void *)((uintptr_t)pool + sizeof (pool->super)), 0, sizeof (*pool) - sizeof (pool->super));
@@ -236,7 +236,7 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
 if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
 do_unregistration_gc(mpool);
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
 size_t psize;
 mca_common_cuda_get_address_range(&base, &psize, addr);
@@ -245,7 +245,7 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
 * this call will boot it out of the cache. */
 check_for_cuda_freed_memory(mpool, base, psize);
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 /* look through existing regs if not persistent registration requested.
 * Persistent registration are always registered and placed in the cache */
@@ -287,11 +287,11 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
 grdma_reg->base = base;
 grdma_reg->bound = bound;
 grdma_reg->flags = flags;
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
 mca_common_cuda_get_buffer_id(grdma_reg);
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 if (false == bypass_cache) {
 rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);
@@ -466,7 +466,7 @@ int mca_mpool_grdma_release_memory(struct mca_mpool_base_module_t *mpool,
 * that we do not have a cuMemAlloc, cuMemFree, cuMemAlloc state. If we do
 * kick out the regisrations and deregister. This function needs to be called
 * with the mpool->rcache->lock held. */
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size)
 {
 mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool;
@@ -515,7 +515,7 @@ static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *add
 return rc;
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool)
 {
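
The freed-memory test that check_for_cuda_freed_memory() relies on compares buffer IDs. A hedged sketch using the driver API directly (the real code goes through mca_common_cuda_previously_freed_memory() and the gpu_bufID cached in the registration):

#include <cuda.h>
#include <stdbool.h>

/* CU_POINTER_ATTRIBUTE_BUFFER_ID changes when an address range is
 * re-allocated, so a mismatch with the cached ID means the original
 * allocation was freed (a cuMemAlloc, cuMemFree, cuMemAlloc cycle). */
static bool buffer_was_freed(CUdeviceptr addr, unsigned long long cached_id)
{
    unsigned long long current_id = 0;
    CUresult res = cuPointerGetAttribute(&current_id,
                                         CU_POINTER_ATTRIBUTE_BUFFER_ID,
                                         addr);
    /* A lookup failure (e.g. pointer no longer valid) also means freed. */
    return (CUDA_SUCCESS != res) || (current_id != cached_id);
}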

View file

@@ -57,9 +57,9 @@ struct mca_mpool_base_registration_t {
 int32_t ref_count;
 uint32_t flags;
 void *mpool_context;
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 unsigned long long gpu_bufID;
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 };
 typedef struct mca_mpool_base_registration_t mca_mpool_base_registration_t;

View file

@@ -366,7 +366,7 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
 rc = OMPI_ERR_BAD_PARAM;
 goto cleanup_and_return;
 }
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 /* If size is SIZE_MAX, then we know we want to set this to the minimum possible
 * value which is the size of the PML header. */
 if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) {
@@ -405,7 +405,7 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
 goto cleanup_and_return;
 }
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 }

View file

@@ -54,13 +54,13 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
 size_t size) {
 int rc;
 #if OPAL_CUDA_SUPPORT_41
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 /* With some BTLs, switch to RNDV from RGET at large messages */
 if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
 (sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
 return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
 if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {

View file

@@ -366,11 +366,11 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq,
 size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_ob1_hdr_t);
 int rc;
-#if OPAL_CUDA_SUPPORT_60
+#if OPAL_CUDA_GDR_SUPPORT
 if (btl->btl_cuda_eager_limit && (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) {
 eager_limit = btl->btl_cuda_eager_limit - sizeof(mca_pml_ob1_hdr_t);
 }
-#endif /* OPAL_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_GDR_SUPPORT */
 if( OPAL_LIKELY(size <= eager_limit) ) {
 switch(sendreq->req_send.req_send_mode) {