1
1

Remove dependency on libcuda.so when building in CUDA-aware support. Dynamically load it if needed.

This commit was SVN r28140.
Этот коммит содержится в:
Rolf vandeVaart 2013-03-01 13:21:52 +00:00
родитель 6a933e7593
Коммит ebe63118ac
6 изменённых файлов: 302 добавлений и 78 удалений

Просмотреть файл

@ -488,7 +488,7 @@ AC_DEFINE_UNQUOTED([OPAL_ENABLE_CRDEBUG], [0],
#
AC_ARG_WITH([cuda],
[AC_HELP_STRING([--with-cuda(=DIR)],
[Build cuda support, optionally adding DIR/include, DIR/lib, and DIR/lib64])])
[Build cuda support, optionally adding DIR/include])])
AC_MSG_CHECKING([if --with-cuda is set])
# CUDA support is off by default. User has to request it.
@ -514,32 +514,6 @@ AS_IF([test "$with_cuda" = "no" -o "x$with_cuda" = "x"],
[opal_check_cuda_happy="yes"
AC_MSG_RESULT([found ($with_cuda/include/cuda.h)])])])])])
# Check for optional libdir setting
AC_ARG_WITH([cuda-libdir],
[AC_HELP_STRING([--with-cuda-libdir=DIR],
[Search for cuda libraries in DIR])])
AC_MSG_CHECKING([if --with-cuda-libdir is set])
# Only check for the extra cuda libdir if we have passed the --with-cuda tests.
AS_IF([test "$opal_check_cuda_happy" = "yes"],
[AS_IF([test "$with_cuda_libdir" != "yes" -a "$with_cuda_libdir" != "no" -a "x$with_cuda_libdir" != "x"],
[AS_IF([test ! -d "$with_cuda_libdir"],
[AC_MSG_RESULT([not found])
AC_MSG_WARN([Directory $with_cuda_libdir not found])
AC_MSG_ERROR([Cannot continue])],
[AS_IF([test "x`ls $with_cuda_libdir/libcuda.* 2> /dev/null`" = "x"],
[AC_MSG_RESULT([not found])
AC_MSG_WARN([Expected file $with_cuda_libdir/libcuda.* not found])
AC_MSG_ERROR([Cannot continue])],
[AC_MSG_RESULT([ok - found directory ($with_cuda_libdir)])])])],
[with_cuda_libdir=/usr/lib64
AS_IF([test "x`ls $with_cuda_libdir/libcuda.* 2> /dev/null`" = "x"],
[AC_MSG_RESULT([not found])
AC_MSG_WARN([Expected file $with_cuda_libdir/libcuda.* not found])
AC_MSG_ERROR([Cannot continue])],
[AC_MSG_RESULT([ok - found directory ($with_cuda_libdir)])])])],
[AC_MSG_RESULT([not applicable since --with-cuda is not set])])
# If we have CUDA support, check to see if we have CUDA 4.1 support
AS_IF([test "$opal_check_cuda_happy"="yes"],
AC_CHECK_MEMBER([struct CUipcMemHandle_st.reserved], [CUDA_SUPPORT_41=1], [CUDA_SUPPORT_41=0],
@ -548,10 +522,9 @@ AS_IF([test "$opal_check_cuda_happy"="yes"],
AC_MSG_CHECKING([if have cuda support])
if test "$opal_check_cuda_happy" = "yes"; then
AC_MSG_RESULT([yes (-I$with_cuda/include -L$with_cuda_libdir -lcuda)])
AC_MSG_RESULT([yes (-I$with_cuda/include)])
CUDA_SUPPORT=1
opal_datatype_cuda_CPPFLAGS="-I$with_cuda/include"
opal_datatype_cuda_LIBS="-L$with_cuda_libdir -lcuda"
AC_SUBST([opal_datatype_cuda_CPPFLAGS])
AC_SUBST([opal_datatype_cuda_LIBS])
else

Просмотреть файл

@ -9,7 +9,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow

Просмотреть файл

@ -34,12 +34,65 @@
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/util/output.h"
#include "opal/util/lt_interface.h"
#include "opal/util/show_help.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/runtime/params.h"
#include "common_cuda.h"
/**
* Since function names can get redefined in cuda.h file, we need to do this
* stringifying to get the latest function name from the header file. For
* example, cuda.h may have something like this:
* #define cuMemFree cuMemFree_v2
* We want to make sure we find cuMemFree_v2, not cuMemFree.
*/
#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)
#define OMPI_CUDA_DLSYM(libhandle, funcName) \
do { \
*(void **)(&cuFunc.funcName) = opal_lt_dlsym(libhandle, STRINGIFY(funcName)); \
if (NULL == cuFunc.funcName) { \
opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true, \
STRINGIFY(funcName)); \
return 1; \
} else { \
opal_output_verbose(15, mca_common_cuda_output, \
"CUDA: successful dlsym of %s", \
STRINGIFY(funcName)); \
} \
} while (0)
/* Structure to hold CUDA function pointers that get dynamically loaded. */
struct cudaFunctionTable {
int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr);
int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t);
int (*cuMemAlloc)(CUdeviceptr *, unsigned int);
int (*cuMemFree)(CUdeviceptr buf);
int (*cuCtxGetCurrent)(void *cuContext);
int (*cuStreamCreate)(CUstream *, int);
int (*cuEventCreate)(CUevent *, int);
int (*cuEventRecord)(CUevent, CUstream);
int (*cuMemHostRegister)(void *, size_t, unsigned int);
int (*cuMemHostUnregister)(void *);
int (*cuEventQuery)(CUevent);
int (*cuEventDestroy)(CUevent);
int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int);
int (*cuMemGetAddressRange)(CUdeviceptr*, size_t*, CUdeviceptr);
#if OMPI_CUDA_SUPPORT_41
int (*cuIpcGetEventHandle)(CUipcEventHandle*, CUevent);
int (*cuIpcOpenEventHandle)(CUevent*, CUipcEventHandle);
int (*cuIpcOpenMemHandle)(CUdeviceptr*, CUipcMemHandle, unsigned int);
int (*cuIpcCloseMemHandle)(CUdeviceptr);
int (*cuIpcGetMemHandle)(CUipcMemHandle*, CUdeviceptr);
#endif /* OMPI_CUDA_SUPPORT_41 */
} cudaFunctionTable;
typedef struct cudaFunctionTable cudaFunctionTable_t;
cudaFunctionTable_t cuFunc;
static bool common_cuda_initialized = false;
static bool common_cuda_init_function_added = false;
static int mca_common_cuda_verbose;
@ -100,6 +153,9 @@ int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used;
/* Size of array holding events */
int cuda_event_max = 200;
/* Handle to libcuda.so */
opal_lt_dlhandle libcuda_handle;
#define CUDA_COMMON_TIMING 0
#if CUDA_COMMON_TIMING
/* Some timing support structures. Enable this to help analyze
@ -112,6 +168,7 @@ static double accum;
static float mydifftime(struct timespec ts_start, struct timespec ts_end);
#endif /* CUDA_COMMON_TIMING */
static int mca_common_cuda_load_libcuda(void);
/* These functions are typically unused in the optimized builds. */
static void cuda_dump_evthandle(int, void *, char *) __opal_attribute_unused__ ;
static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
@ -125,6 +182,12 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
#endif /* OMPI_CUDA_SUPPORT_41 */
/**
* This function is registered with the OPAL CUDA support. In that way,
* we will complete initialization when OPAL detects the first GPU memory
* access. In the case that no GPU memory access happens, then this function
* never gets called.
*/
static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
{
int id, value, i, s;
@ -169,6 +232,13 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
(int) mca_common_cuda_warning, &value);
mca_common_cuda_warning = OPAL_INT_TO_BOOL(value);
/* If we cannot load the libary, then disable support */
if (0 != mca_common_cuda_load_libcuda()) {
common_cuda_initialized = true;
ompi_mpi_cuda_support = 0;
return OMPI_ERROR;
}
#if OMPI_CUDA_SUPPORT_41
/* Use this flag to test async vs sync copies */
id = mca_base_param_reg_int_name("mpi", "common_cuda_memcpy_async",
@ -185,7 +255,7 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
/* Check to see if this process is running in a CUDA context. If
* so, all is good. If not, then disable registration of memory. */
res = cuCtxGetCurrent(&cuContext);
res = cuFunc.cuCtxGetCurrent(&cuContext);
if (CUDA_SUCCESS != res) {
if (mca_common_cuda_warning) {
/* Check for the not initialized error since we can make suggestions to
@ -234,7 +304,7 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
/* Create the events since they can be reused. */
for (i = 0; i < cuda_event_max; i++) {
res = cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != res) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, res);
@ -272,7 +342,7 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
/* Create the events since they can be reused. */
for (i = 0; i < cuda_event_max; i++) {
res = cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING);
res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != res) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, res);
@ -307,7 +377,7 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
/* Create the events since they can be reused. */
for (i = 0; i < cuda_event_max; i++) {
res = cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING);
res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != res) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, res);
@ -331,7 +401,7 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
mem_reg = (common_cuda_mem_regs_t *)
opal_list_remove_first(&common_cuda_memory_registrations);
if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
res = cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0);
res = cuFunc.cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0);
if (res != CUDA_SUCCESS) {
/* If registering the memory fails, print a message and continue.
* This is not a fatal error. */
@ -350,7 +420,7 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
}
/* Create stream for use in ipc asynchronous copies */
res = cuStreamCreate(&ipcStream, 0);
res = cuFunc.cuStreamCreate(&ipcStream, 0);
if (res != CUDA_SUCCESS) {
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, res);
@ -358,7 +428,7 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
}
/* Create stream for use in dtoh asynchronous copies */
res = cuStreamCreate(&dtohStream, 0);
res = cuFunc.cuStreamCreate(&dtohStream, 0);
if (res != CUDA_SUCCESS) {
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, res);
@ -367,7 +437,7 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
}
/* Create stream for use in htod asynchronous copies */
res = cuStreamCreate(&htodStream, 0);
res = cuFunc.cuStreamCreate(&htodStream, 0);
if (res != CUDA_SUCCESS) {
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, res);
@ -381,6 +451,149 @@ static int mca_common_cuda_init(opal_common_cuda_function_table_t *ftable)
return OMPI_SUCCESS;
}
/**
* This function will open and load the symbols needed from the CUDA driver
* library. Any failure will result in a message and we will return 1.
*/
static int mca_common_cuda_load_libcuda(void)
{
opal_lt_dladvise advise;
int retval;
int advise_support = 1;
if (0 != (retval = opal_lt_dlinit())) {
if (OPAL_ERR_NOT_SUPPORTED == retval) {
opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
} else {
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
"opal_lt_dlinit", retval, opal_lt_dlerror());
}
return 1;
}
/* Initialize the lt_dladvise structure. If this does not work, we can
* proceed without the support. Things should still work. */
if (0 != (retval = opal_lt_dladvise_init(&advise))) {
if (OPAL_ERR_NOT_SUPPORTED == retval) {
advise_support = 0;
} else {
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
"opal_lt_dladvise_init", retval, opal_lt_dlerror());
return 1;
}
}
if (advise_support) {
if (0 != (retval = opal_lt_dladvise_global(&advise))) {
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
"opal_lt_dladvise_global", retval, opal_lt_dlerror());
opal_lt_dladvise_destroy(&advise);
return 1;
}
/*
* Try and open libcuda.so and libcuda.so.1. Note that we are not using
* opal_lt_dladvise_ext() as we do not need ltdl to add any suffixes to
* the library names being handed in.
*/
libcuda_handle = opal_lt_dlopenadvise("libcuda.so", advise);
/* If the first open fails, save the error message so that it can be printed
* out if the second open fails as well. If the second open succeeds, then
* we do not care that the first open failed. */
if (NULL == libcuda_handle) {
char *err1;
const char *str1 = opal_lt_dlerror();
if (NULL != str1) {
err1 = strdup(str1);
} else {
err1 = strdup("lt_dlerror() returned NULL.");
}
libcuda_handle = opal_lt_dlopenadvise("libcuda.so.1", advise);
if (NULL == libcuda_handle) {
char *err2;
const char *str2 = opal_lt_dlerror();
if (NULL != str2) {
err2 = strdup(str2);
} else {
err2 = strdup("lt_dlerror() returned NULL.");
}
opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
"libcuda.so", err1, "libcuda.so.1", err2);
free(err1);
free(err2);
opal_lt_dladvise_destroy(&advise);
return 1;
}
free(err1);
}
opal_lt_dladvise_destroy(&advise);
} else {
/* No lt_dladvise support. This should rarely happen. */
/*
* Try and open libcuda.so and libcuda.so.1. Note that we are not using
* opal_lt_dladvise_ext() as we do not need ltdl to add any suffixes to
* the library names being handed in.
*/
libcuda_handle = opal_lt_dlopen("libcuda.so");
/* If the first open fails, save the error message so that it can be printed
* out if the second open fails as well. If the second open succeeds, then
* we do not care that the first open failed. */
if (NULL == libcuda_handle) {
char *err1;
const char *str1 = opal_lt_dlerror();
if (NULL != str1) {
err1 = strdup(str1);
} else {
err1 = strdup("lt_dlerror() returned NULL.");
}
libcuda_handle = opal_lt_dlopen("libcuda.so.1");
if (NULL == libcuda_handle) {
char *err2;
const char *str2 = opal_lt_dlerror();
if (NULL != str2) {
err2 = strdup(str2);
} else {
err2 = strdup("lt_dlerror() returned NULL.");
}
opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
"libcuda.so", err1, "libcuda.so.1", err2);
free(err1);
free(err2);
return 1;
}
free(err1);
}
}
/* Map in the functions that we need */
OMPI_CUDA_DLSYM(libcuda_handle, cuStreamCreate);
OMPI_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent);
OMPI_CUDA_DLSYM(libcuda_handle, cuEventCreate);
OMPI_CUDA_DLSYM(libcuda_handle, cuEventRecord);
OMPI_CUDA_DLSYM(libcuda_handle, cuMemHostRegister);
OMPI_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister);
OMPI_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute);
OMPI_CUDA_DLSYM(libcuda_handle, cuEventQuery);
OMPI_CUDA_DLSYM(libcuda_handle, cuEventDestroy);
OMPI_CUDA_DLSYM(libcuda_handle, cuStreamWaitEvent);
OMPI_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync);
OMPI_CUDA_DLSYM(libcuda_handle, cuMemcpy);
OMPI_CUDA_DLSYM(libcuda_handle, cuMemFree);
OMPI_CUDA_DLSYM(libcuda_handle, cuMemAlloc);
OMPI_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange);
#if OMPI_CUDA_SUPPORT_41
OMPI_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle);
OMPI_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle);
OMPI_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle);
OMPI_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle);
OMPI_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle);
#endif /* OMPI_CUDA_SUPPORT_41 */
return 0;
}
/**
* Call the CUDA register function so we pin the memory in the CUDA
* space.
@ -405,7 +618,7 @@ void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
}
if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
res = cuMemHostRegister(ptr, amount, 0);
res = cuFunc.cuMemHostRegister(ptr, amount, 0);
if (res != CUDA_SUCCESS) {
/* If registering the memory fails, print a message and continue.
* This is not a fatal error. */
@ -444,7 +657,7 @@ void mca_common_cuda_unregister(void *ptr, char *msg) {
}
if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
res = cuMemHostUnregister(ptr);
res = cuFunc.cuMemHostUnregister(ptr);
if (res != CUDA_SUCCESS) {
/* If unregistering the memory fails, print a message and continue.
* This is not a fatal error. */
@ -479,13 +692,13 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg;
/* We should only be there if this is a CUDA device pointer */
result = cuPointerGetAttribute(&memType,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)base);
result = cuFunc.cuPointerGetAttribute(&memType,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)base);
assert(CUDA_SUCCESS == result);
assert(CU_MEMORYTYPE_DEVICE == memType);
/* Get the memory handle so we can send it to the remote process. */
result = cuIpcGetMemHandle(&memHandle, (CUdeviceptr)base);
result = cuFunc.cuIpcGetMemHandle(&memHandle, (CUdeviceptr)base);
CUDA_DUMP_MEMHANDLE((100, &memHandle, "GetMemHandle-After"));
if (CUDA_SUCCESS != result) {
@ -500,7 +713,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
/* Need to get the real base and size of the memory handle. This is
* how the remote side saves the handles in a cache. */
result = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)base);
result = cuFunc.cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)base);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed",
true, result, base);
@ -523,7 +736,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
* Note that this needs to be the NULL stream to make since it is
* unknown what stream any copies into the device memory were done
* with. */
result = cuEventRecord((CUevent)cuda_reg->event, 0);
result = cuFunc.cuEventRecord((CUevent)cuda_reg->event, 0);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result, base);
@ -564,8 +777,8 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
CUDA_DUMP_MEMHANDLE((100, &memHandle, "Before call to cuIpcOpenMemHandle"));
/* Open the memory handle and store it into the registration structure. */
result = cuIpcOpenMemHandle((CUdeviceptr *)&newreg->alloc_base, memHandle,
CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
result = cuFunc.cuIpcOpenMemHandle((CUdeviceptr *)&newreg->alloc_base, memHandle,
CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
/* If there are some stale entries in the cache, they can cause other
* registrations to fail. Let the caller know that so that can attempt
@ -599,7 +812,7 @@ int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg)
CUresult result;
mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)reg;
result = cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
true, result, cuda_reg->base.alloc_base);
@ -618,13 +831,13 @@ void mca_common_cuda_construct_event_and_handle(uint64_t **event, void **handle)
{
CUresult result;
result = cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, result);
}
result = cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
if (CUDA_SUCCESS != result){
opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetEventHandle failed",
true, result);
@ -638,7 +851,7 @@ void mca_common_cuda_destruct_event(uint64_t *event)
{
CUresult result;
result = cuEventDestroy((CUevent)event);
result = cuFunc.cuEventDestroy((CUevent)event);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
true, result);
@ -659,7 +872,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
memcpy(&evtHandle, rget_reg->evtHandle, sizeof(evtHandle));
CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize"));
result = cuIpcOpenEventHandle(&event, evtHandle);
result = cuFunc.cuIpcOpenEventHandle(&event, evtHandle);
if (CUDA_SUCCESS != result){
opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed",
true, result);
@ -670,21 +883,21 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
* it is not used, to make sure we do not short circuit our way
* out of the cuStreamWaitEvent test.
*/
result = cuEventRecord(event, 0);
result = cuFunc.cuEventRecord(event, 0);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
}
/* END of Workaround */
result = cuStreamWaitEvent(0, event, 0);
result = cuFunc.cuStreamWaitEvent(0, event, 0);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuStreamWaitEvent failed",
true, result);
}
/* All done with this event. */
result = cuEventDestroy(event);
result = cuFunc.cuEventDestroy(event);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
true, result);
@ -713,7 +926,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
/* This is the standard way to run. Running with synchronous copies is available
* to measure the advantages of asynchronous copies. */
if (OPAL_LIKELY(mca_common_cuda_async)) {
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, dst, src, amount, result);
@ -723,7 +936,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
"CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
dst, src, (int)amount);
}
result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
@ -741,7 +954,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
*done = 0;
} else {
/* Mimic the async function so they use the same memcpy call. */
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, dst, src, amount, result);
@ -753,7 +966,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
}
/* Record an event, then wait for it to complete with calls to cuEventQuery */
result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
@ -769,7 +982,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
}
cuda_event_ipc_num_used++;
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
true, result);
@ -781,7 +994,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
if (0 == (iter % 10)) {
opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
}
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
true, result);
@ -817,7 +1030,7 @@ int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_
return OMPI_ERR_OUT_OF_RESOURCE;
}
result = cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream);
result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
@ -852,7 +1065,7 @@ int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_
return OMPI_ERR_OUT_OF_RESOURCE;
}
result = cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream);
result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
@ -897,7 +1110,7 @@ int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) {
"CUDA: progress_one_cuda_ipc_event, outstanding_events=%d",
cuda_event_ipc_num_used);
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
/* We found an event that is not ready, so return. */
if (CUDA_ERROR_NOT_READY == result) {
@ -939,7 +1152,7 @@ int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) {
"CUDA: progress_one_cuda_dtoh_event, outstanding_events=%d",
cuda_event_dtoh_num_used);
result = cuEventQuery(cuda_event_dtoh_array[cuda_event_dtoh_first_used]);
result = cuFunc.cuEventQuery(cuda_event_dtoh_array[cuda_event_dtoh_first_used]);
/* We found an event that is not ready, so return. */
if (CUDA_ERROR_NOT_READY == result) {
@ -981,7 +1194,7 @@ int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) {
"CUDA: progress_one_cuda_htod_event, outstanding_events=%d",
cuda_event_htod_num_used);
result = cuEventQuery(cuda_event_htod_array[cuda_event_htod_first_used]);
result = cuFunc.cuEventQuery(cuda_event_htod_array[cuda_event_htod_first_used]);
/* We found an event that is not ready, so return. */
if (CUDA_ERROR_NOT_READY == result) {
@ -1133,8 +1346,8 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
CUmemorytype memType;
CUdeviceptr dbuf = (CUdeviceptr)pUserBuf;
res = cuPointerGetAttribute(&memType,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
res = cuFunc.cuPointerGetAttribute(&memType,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
if (res != CUDA_SUCCESS) {
/* If we cannot determine it is device pointer,
* just assume it is not. */
@ -1151,13 +1364,13 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t size,
opal_convertor_t* convertor)
{
return cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size,
(CUstream)convertor->stream);
return cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size,
(CUstream)convertor->stream);
}
static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size)
{
return cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
return cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
}
static int mca_common_cuda_memmove(void *dest, void *src, size_t size)
@ -1165,19 +1378,19 @@ static int mca_common_cuda_memmove(void *dest, void *src, size_t size)
CUdeviceptr tmp;
int res;
res = cuMemAlloc(&tmp,size);
res = cuMemcpy(tmp, (CUdeviceptr)src, size);
res = cuFunc.cuMemAlloc(&tmp,size);
res = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size);
if(res != CUDA_SUCCESS){
opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
res, (void *)tmp, src, (int)size);
return res;
}
res = cuMemcpy((CUdeviceptr)dest, tmp, size);
res = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size);
if(res != CUDA_SUCCESS){
opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
res, dest, (void *)tmp, (int)size);
return res;
}
cuMemFree(tmp);
cuFunc.cuMemFree(tmp);
return 0;
}

Просмотреть файл

@ -30,9 +30,8 @@ AC_DEFUN([MCA_ompi_common_cuda_CONFIG],[
AC_DEFINE_UNQUOTED([OMPI_CUDA_SUPPORT_41],$CUDA_SUPPORT_41,
[Whether we want support CUDA 4.1 features])
# Copy over the includes and libs needed to build CUDA
# Copy over the includes needed to build CUDA
common_cuda_CPPFLAGS=$opal_datatype_cuda_CPPFLAGS
common_cuda_LIBS=$opal_datatype_cuda_LIBS
AC_SUBST([common_cuda_CPPFLAGS])
AC_SUBST([common_cuda_LIBS])

Просмотреть файл

@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2011-2012 NVIDIA. All rights reserved.
# Copyright (c) 2011-2013 NVIDIA. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -140,3 +140,38 @@ The call to cuStreamCreate failed. This is an unrecoverable error and will
cause the program to abort.
cuStreamCreate return value: %d
Check the cuda.h file for what the return value means.
#
[dlopen disabled]
While trying to load the supporting libcuda.so library, an error was
detected. This error indicates that the Open MPI library was probably
configured with the --disable-dlopen flag. When the library is
configured in this way, CUDA support is disabled because CUDA support
depends on the ability to dynamically open libraries. Reconfigure
without the --disable-dlopen flag to get around this problem.
#
[dladvise disabled]
While trying to initialize the lt_dladvise structure, an error was
detected. This error indicates that the Open MPI library was
configured such that there is no support for the lt_dladvise
structure. This is needed for properly opening the libcuda library.
Look around for the OPAL_HAVE_LTDL_ADVISE macro and ensure that it
is defined as a 1.
#
[unknown ltdl error]
While attempting to load the supporting libcuda.so library, an error
occurred. This really should rarely happen. Please notify the Open
MPI developers.
Function: %s
Return Value: %d
Error string: %s
#
[dlopen failed]
The library attempted to open the supporting CUDA libraries but failed.
Library attempted: %s
Error string: %s
Library attempted: %s
Error string: %s
#
[dlsym failed]
An error occurred while trying to map in the address of a function.
Function Name: %s

Просмотреть файл

@ -66,6 +66,10 @@ mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_pml_ob1_la_SOURCES = $(ob1_sources)
mca_pml_ob1_la_LDFLAGS = -module -avoid-version
#if MCA_ompi_cuda_support
#mca_pml_ob1_la_LIBADD = \
# $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
#endif
noinst_LTLIBRARIES = $(component_noinst)
libmca_pml_ob1_la_SOURCES = $(ob1_sources)