Add some basic support for sending and receiving CUDA device memory. Feature is disabled by default and has no effect on default code paths.

This commit was SVN r24659.
2011-04-28 23:05:55 +00:00 · 2011-04-28 23:05:55 +00:00 · 2634f6401a
--- a/opal/config/opal_configure_options.m4
+++ b/opal/config/opal_configure_options.m4
@ -16,6 +16,7 @@ dnl Copyright (c) 2009      IBM Corporation.  All rights reserved.
 dnl Copyright (c) 2009      Los Alamos National Security, LLC.  All rights
 dnl                         reserved.
 dnl Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
+dnl Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 dnl
 dnl $COPYRIGHT$
 dnl 
@ -564,4 +565,78 @@ fi
 AC_DEFINE_UNQUOTED([OPAL_ENABLE_CRDEBUG], [$ompi_want_prd],
    [Whether we want checkpoint/restart enabled debugging functionality or not])

+#
+# Check to see if user wants CUDA support in datatype and convertor code.
+#
+# Outcomes of the checks below:
+#   --with-cuda absent or =no : opal_check_cuda_happy is set to no
+#   --with-cuda or =yes       : /usr/local/cuda/include/cuda.h must exist;
+#                               with_cuda is then set to /usr/local/cuda
+#   --with-cuda=DIR           : DIR and DIR/include/cuda.h must exist
+# Any failed requirement stops configure with an error.
+#
+# NOTE(review): the help text mentions DIR/lib and DIR/lib64, but nothing
+# in this section references those directories -- confirm whether the help
+# string or the library search is the one that should change.
+#
+AC_ARG_WITH([cuda],
+            [AC_HELP_STRING([--with-cuda(=DIR)],
+            [Build cuda support, optionally adding DIR/include, DIR/lib, and DIR/lib64])])
+AC_MSG_CHECKING([if --with-cuda is set])
+
+# CUDA support is off by default.  User has to request it.
+AS_IF([test "$with_cuda" = "no" -o "x$with_cuda" = "x"],
+      [opal_check_cuda_happy="no"
+       AC_MSG_RESULT([not set (--with-cuda=$with_cuda)])],
+      [AS_IF([test "$with_cuda" = "yes"],
+             [AS_IF([test "x`ls /usr/local/cuda/include/cuda.h 2> /dev/null`" = "x"],
+                    [AC_MSG_RESULT([not found in standard location])
+                     AC_MSG_WARN([Expected file /usr/local/cuda/include/cuda.h not found])
+                     AC_MSG_ERROR([Cannot continue])],
+                    [AC_MSG_RESULT([found])
+                     opal_check_cuda_happy="yes"
+                     with_cuda="/usr/local/cuda"])],
+             [AS_IF([test ! -d "$with_cuda"],
+                    [AC_MSG_RESULT([not found])
+                     AC_MSG_WARN([Directory $with_cuda not found])
+                     AC_MSG_ERROR([Cannot continue])],
+                    [AS_IF([test "x`ls $with_cuda/include/cuda.h 2> /dev/null`" = "x"],
+                           [AC_MSG_RESULT([not found])
+                            AC_MSG_WARN([Expected file $with_cuda/include/cuda.h not found])
+                            AC_MSG_ERROR([Cannot continue])],
+                           [opal_check_cuda_happy="yes"
+                            AC_MSG_RESULT([found ($with_cuda/include/cuda.h)])])])])])
+
+# Check for optional libdir setting
+#
+# An explicit --with-cuda-libdir=DIR must name an existing directory that
+# contains libcuda.*.  When the option is absent, empty, yes or no, the
+# search falls back to /usr/lib64, which must also contain libcuda.*.
+# Either failure stops configure with an error.
+AC_ARG_WITH([cuda-libdir],
+            [AC_HELP_STRING([--with-cuda-libdir=DIR],
+            [Search for cuda libraries in DIR])])
+AC_MSG_CHECKING([if --with-cuda-libdir is set])
+
+# Only check for the extra cuda libdir if we have passed the --with-cuda tests.
+AS_IF([test "$opal_check_cuda_happy" = "yes"],
+      [AS_IF([test "$with_cuda_libdir" != "yes" -a "$with_cuda_libdir" != "no" -a "x$with_cuda_libdir" != "x"],
+             [AS_IF([test ! -d "$with_cuda_libdir"],
+                    [AC_MSG_RESULT([not found])
+                     AC_MSG_WARN([Directory $with_cuda_libdir not found])
+                     AC_MSG_ERROR([Cannot continue])],
+                    [AS_IF([test "x`ls $with_cuda_libdir/libcuda.* 2> /dev/null`" = "x"],
+                           [AC_MSG_RESULT([not found])
+                            AC_MSG_WARN([Expected file $with_cuda_libdir/libcuda.* not found])
+                            AC_MSG_ERROR([Cannot continue])],
+                           [AC_MSG_RESULT([ok - found directory ($with_cuda_libdir)])])])],
+             [with_cuda_libdir=/usr/lib64
+              AS_IF([test "x`ls $with_cuda_libdir/libcuda.* 2> /dev/null`" = "x"],
+                    [AC_MSG_RESULT([not found])
+                     AC_MSG_WARN([Expected file $with_cuda_libdir/libcuda.* not found])
+                     AC_MSG_ERROR([Cannot continue])],
+                    [AC_MSG_RESULT([ok - found directory ($with_cuda_libdir)])])])],
+      [AC_MSG_RESULT([not applicable since --with-cuda is not set])])
+
+# Publish the outcome of the CUDA checks.  CUDA_SUPPORT is set to 1 or 0;
+# it becomes the OPAL_CUDA_SUPPORT preprocessor define and drives the
+# OPAL_cuda_support automake conditional.  When enabled, the include path
+# and the libcuda link flags are substituted as opal_datatype_CPPFLAGS and
+# opal_datatype_LIBS.
+AC_MSG_CHECKING([if have cuda support])
+if test "$opal_check_cuda_happy" = "yes"; then
+    AC_MSG_RESULT([yes (-I$with_cuda/include -L$with_cuda_libdir -lcuda)])
+    CUDA_SUPPORT=1
+    opal_datatype_CPPFLAGS="-I$with_cuda/include"
+    opal_datatype_LIBS="-L$with_cuda_libdir -lcuda"
+    AC_SUBST([opal_datatype_CPPFLAGS])
+    AC_SUBST([opal_datatype_LIBS])
+else
+    AC_MSG_RESULT([no])
+    CUDA_SUPPORT=0
+fi
+
+AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
+AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT,
+                   [Whether we want cuda device pointer support])
+
 ])dnl
--- a/opal/datatype/Makefile.am
+++ b/opal/datatype/Makefile.am
@ -14,6 +14,7 @@
 #                         reserved.
 # Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
 # Copyright (c) 2010 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 # $COPYRIGHT$
 # 
 # Additional copyrights may follow
@ -71,3 +72,11 @@ if WANT_INSTALL_HEADERS
 opaldir = $(includedir)/openmpi/$(subdir)
 opal_HEADERS = $(headers)
 endif
+
+# If we have cuda support, modify file list and flags
+# The CUDA source and header are only built and installed when configure
+# enabled the OPAL_cuda_support conditional.
+# NOTE(review): AM_CPPFLAGS is assigned with "=" rather than "+=", which
+# would replace any value set earlier in this file -- confirm none exists.
+if OPAL_cuda_support
+libdatatype_la_SOURCES += opal_datatype_cuda.c
+headers += opal_datatype_cuda.h
+AM_CPPFLAGS = $(opal_datatype_CPPFLAGS)
+libdatatype_la_LIBADD += $(opal_datatype_LIBS)
+endif
--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@ -11,6 +11,7 @@
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -37,6 +38,11 @@
 #include "opal/datatype/opal_datatype_checksum.h"
 #include "opal/datatype/opal_datatype_prototypes.h"
 #include "opal/datatype/opal_convertor_internal.h"
+#if OPAL_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+/* Copy BLENGTH bytes from SRC to DST through the convertor's cbmemcpy
+ * callback instead of calling memcpy directly.  CONVERTOR is parenthesized
+ * so that any pointer expression can be passed safely. */
+#define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
+    (CONVERTOR)->cbmemcpy( (DST), (SRC), (BLENGTH) )
+#endif

 extern int opal_convertor_create_stack_with_pos_general( opal_convertor_t* convertor,
                                                         int starting_point, const int* sizes );
@ -48,6 +54,9 @@ static void opal_convertor_construct( opal_convertor_t* convertor )
    convertor->partial_length = 0;
    convertor->remoteArch     = opal_local_arch;
    convertor->flags          = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
+#if OPAL_CUDA_SUPPORT
+    convertor->cbmemcpy       = &memcpy;
+#endif
 }


@ -235,7 +244,11 @@ int32_t opal_convertor_pack( opal_convertor_t* pConv,
            if( OPAL_LIKELY(NULL == iov[i].iov_base) )
                iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
            else
+#if OPAL_CUDA_SUPPORT
+                MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
+#else
                MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
+#endif
            pending_length -= iov[i].iov_len;
            base_pointer += iov[i].iov_len;
        }
@ -248,7 +261,11 @@ complete_contiguous_data_pack:
        if( OPAL_LIKELY(NULL == iov[i].iov_base) )
            iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
        else
+#if OPAL_CUDA_SUPPORT
+            MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
+#else
            MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
+#endif
        pConv->bConverted = pConv->local_size;
        *out_size = i + 1;
        pConv->flags |= CONVERTOR_COMPLETED;
@ -282,7 +299,11 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv,
            if( iov[i].iov_len >= pending_length ) {
                goto complete_contiguous_data_unpack;
            }
+#if OPAL_CUDA_SUPPORT
+            MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
+#else
            MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
+#endif
            pending_length -= iov[i].iov_len;
            base_pointer += iov[i].iov_len;
        }
@ -292,7 +313,11 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv,

 complete_contiguous_data_unpack:
        iov[i].iov_len = pending_length;
+#if OPAL_CUDA_SUPPORT
+        MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
+#else
        MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
+#endif
        pConv->bConverted = pConv->local_size;
        *out_size = i + 1;
        pConv->flags |= CONVERTOR_COMPLETED;
@ -519,6 +544,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
    /* Here I should check that the data is not overlapping */

    convertor->flags |= CONVERTOR_RECV;
+#if OPAL_CUDA_SUPPORT
+    mca_cuda_convertor_init(convertor, pUserBuf);
+#endif

    OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );

@ -555,6 +583,9 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
                                         const void* pUserBuf )
 {
    convertor->flags |= CONVERTOR_SEND;
+#if OPAL_CUDA_SUPPORT
+    mca_cuda_convertor_init(convertor, pUserBuf);
+#endif

    OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );

@ -623,6 +654,9 @@ int opal_convertor_clone( const opal_convertor_t* source,
        destination->bConverted = source->bConverted;
        destination->stack_pos  = source->stack_pos;
    }
+#if OPAL_CUDA_SUPPORT
+    destination->cbmemcpy   = source->cbmemcpy;
+#endif
    return OPAL_SUCCESS;
 }

--- a/opal/datatype/opal_convertor.h
+++ b/opal/datatype/opal_convertor.h
@ -34,6 +34,9 @@
 #ifdef HAVE_NET_UIO_H
 #include <net/uio.h>
 #endif
+#if HAVE_STRING_H
+#include <string.h>
+#endif

 #include "opal/constants.h"
 #include "opal/datatype/opal_datatype.h"
@ -51,6 +54,7 @@ BEGIN_C_DECLS
 #define CONVERTOR_HOMOGENEOUS      0x00080000
 #define CONVERTOR_NO_OP            0x00100000
 #define CONVERTOR_WITH_CHECKSUM    0x00200000
+#define CONVERTOR_CUDA             0x00400000
 #define CONVERTOR_TYPE_MASK        0x00FF0000
 #define CONVERTOR_STATE_START      0x01000000
 #define CONVERTOR_STATE_COMPLETE   0x02000000
@ -65,6 +69,7 @@ typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor,
                                            uint32_t* out_size,
                                            size_t* max_data );
 typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata );
+typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n );

 /* The master convertor struct (defined in convertor_internal.h) */
 struct opal_convertor_master_t;
@ -109,6 +114,9 @@ struct opal_convertor_t {
    dt_stack_t                    static_stack[DT_STATIC_STACK_SIZE];  /**< local stack for small datatypes */
    /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */

+#if OPAL_CUDA_SUPPORT
+    memcpy_fct_t                  cbmemcpy;       /**< memcpy or cuMemcpy */
+#endif
    /* size: 248, cachelines: 4, members: 20 */
    /* last cacheline: 56 bytes */
 };
@ -156,6 +164,9 @@ static inline int opal_convertor_cleanup( opal_convertor_t* convertor )
        convertor->pStack     = convertor->static_stack;
        convertor->stack_size = DT_STATIC_STACK_SIZE;
    }
+#if OPAL_CUDA_SUPPORT
+    convertor->cbmemcpy = &memcpy;
+#endif
    convertor->pDesc     = NULL;
    convertor->stack_pos = 0;
    convertor->flags     = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
@ -176,6 +187,9 @@ static inline int32_t opal_convertor_need_buffers( const opal_convertor_t* pConv
 {
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    if (OPAL_UNLIKELY(0 == (pConvertor->flags & CONVERTOR_HOMOGENEOUS))) return 1;
+#endif
+#if OPAL_CUDA_SUPPORT
+    if( pConvertor->flags & CONVERTOR_CUDA ) return 1;
 #endif
    if( pConvertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS ) return 0;
    if( (pConvertor->count == 1) && (pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) return 0;
--- a/opal/datatype/opal_datatype_copy.c
+++ b/opal/datatype/opal_datatype_copy.c
@ -11,6 +11,7 @@
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -71,6 +72,31 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
 #define MEM_OP       MEMMOVE
 #include "opal_datatype_copy.h"

+#if OPAL_CUDA_SUPPORT
+#include "opal_datatype_cuda.h"
+
+/* Re-include the copy template with the CUDA copy routines as MEM_OP.
+ * MEM_OP_NAME selects the prefix of the generated function, giving the
+ * non_overlap_cuda_ and overlap_cuda_ variants of copy_content_same_ddt. */
+#undef MEM_OP_NAME
+#define MEM_OP_NAME non_overlap_cuda
+#undef MEM_OP
+#define MEM_OP opal_cuda_memcpy
+#include "opal_datatype_copy.h"
+
+#undef MEM_OP_NAME
+#define MEM_OP_NAME overlap_cuda
+#undef MEM_OP
+#define MEM_OP opal_cuda_memmove
+#include "opal_datatype_copy.h"
+
+/* Replace the selected copy function with its CUDA variant when the
+ * buffers were detected as device memory.  Every argument is
+ * parenthesized so that arbitrary expressions expand safely. */
+#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function)     \
+    do {                                                            \
+        if (true == (cuda_device_bufs)) {                           \
+            (fct) = (copy_function);                                \
+        }                                                           \
+    } while(0)
+#else
+/* Without CUDA support this is a no-op that still forms one complete
+ * statement together with the caller's trailing semicolon. */
+#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function)     \
+    do { } while(0)
+#endif
+
 int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count,
                                             char* destination_base, char* source_base )
 {
@ -78,6 +104,10 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in
    size_t iov_len_local;
    int32_t (*fct)( const opal_datatype_t*, int32_t, char*, char*);

+#if OPAL_CUDA_SUPPORT
+    bool cuda_device_bufs = opal_cuda_check_bufs(destination_base, source_base);
+#endif
+
    DO_DEBUG( opal_output( 0, "opal_datatype_copy_content_same_ddt( %p, %d, dst %p, src %p )\n",
                           (void*)datatype, count, destination_base, source_base ); );

@ -95,15 +125,18 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in
    extent = (datatype->true_ub - datatype->true_lb) + (count - 1) * (datatype->ub - datatype->lb);

    fct = non_overlap_copy_content_same_ddt;
+    SET_CUDA_COPY_FCT(cuda_device_bufs, fct, non_overlap_cuda_copy_content_same_ddt);
    if( destination_base < source_base ) {
        if( (destination_base + extent) > source_base ) {
            /* memmove */
            fct = overlap_copy_content_same_ddt;
+            SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt);
        }
    } else {
        if( (source_base + extent) > destination_base ) {
            /* memmove */
            fct = overlap_copy_content_same_ddt;
+            SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt);
        }
    }
    return fct( datatype, count, destination_base, source_base );
--- a/opal/datatype/opal_datatype_cuda.c
+++ b/opal/datatype/opal_datatype_cuda.c
@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <errno.h>
+#include <unistd.h>
+#include <cuda.h>
+
+#include "opal/align.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "orte/util/show_help.h"
+#include "opal/datatype/opal_convertor.h"
+#include "opal/datatype/opal_datatype_cuda.h"
+
+static bool initialized = false;
+static int opal_cuda_verbose;
+static int opal_cuda_output = 0;
+static void opal_cuda_support_init(void);
+
+/**
+ * Inspect the user buffer and, when it is not host memory, switch the
+ * convertor over to the CUDA copy routine.
+ *
+ * @param convertor  Convertor being prepared; its cbmemcpy callback and
+ *                   CONVERTOR_CUDA flag are updated for device buffers.
+ * @param pUserBuf   User buffer whose memory type is queried.
+ *
+ * Buffers whose memory type cannot be queried, or that are reported as
+ * host memory, leave the convertor untouched.
+ */
+void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
+{
+    CUmemorytype mem_kind;
+    int rc;
+
+    /* Run the one-time CUDA support setup on first use. */
+    if (false == initialized) {
+        opal_cuda_support_init();
+    }
+
+    rc = cuPointerGetAttribute(&mem_kind, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                               (CUdeviceptr)pUserBuf);
+
+    /* A failed query is treated like host memory: nothing to set up.
+     * mem_kind is only examined when the query succeeded. */
+    if (CUDA_SUCCESS != rc || CU_MEMORYTYPE_HOST == mem_kind) {
+        return;
+    }
+
+    /* Anything left is expected to be a device pointer. */
+    assert(CU_MEMORYTYPE_DEVICE == mem_kind);
+
+    convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;
+    convertor->flags |= CONVERTOR_CUDA;
+}
+
+/**
+ * Report whether either buffer of a copy is CUDA device memory.
+ *
+ * @param dest  Destination pointer to check
+ * @param src   Source pointer to check
+ *
+ * @return true if at least one pointer is reported as device memory,
+ *         false otherwise.  A pointer whose memory type cannot be
+ *         queried is assumed to be host memory.
+ */
+bool opal_cuda_check_bufs(char *dest, char *src)
+{
+    char *candidates[2];
+    CUmemorytype memType;
+    CUresult res;
+    int i;
+
+    /* The destination is checked first, then the source. */
+    candidates[0] = dest;
+    candidates[1] = src;
+
+    for (i = 0; i < 2; i++) {
+        res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                                    (CUdeviceptr)candidates[i]);
+        /* memType is not guaranteed to be written when the query fails,
+         * so it is only trusted after a successful call. */
+        if (CUDA_SUCCESS == res && CU_MEMORYTYPE_DEVICE == memType) {
+            return true;
+        }
+    }
+
+    /* Assuming it is a host pointer for all other situations */
+    return false;
+}
+
+/**
+ * memcpy-style wrapper around cuMemcpy.
+ *
+ * An intermediate function is needed so the return code of cuMemcpy can be
+ * checked.  If the copy fails the process aborts, as there is no recovery
+ * at this point.
+ *
+ * @param dest  Destination address
+ * @param src   Source address
+ * @param size  Number of bytes to copy
+ *
+ * @return dest
+ */
+void *opal_cuda_memcpy(void *dest, void *src, size_t size)
+{
+    CUresult res;
+
+    res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
+    if (CUDA_SUCCESS != res) {
+        /* Print the size as unsigned long: casting it to int would
+         * misreport copies of 2GB or more. */
+        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%lu",
+                    (int)res, dest, src, (unsigned long)size);
+        abort();
+    }
+    return dest;
+}
+
+/**
+ * memmove-style copy for buffers that may overlap.
+ *
+ * The data is staged through a temporary device allocation.  This is not
+ * fast, but it is not often needed.  Any CUDA failure while allocating or
+ * copying aborts the process.
+ *
+ * @param dest  Destination address
+ * @param src   Source address
+ * @param size  Number of bytes to move
+ *
+ * @return dest
+ */
+void *opal_cuda_memmove(void *dest, void *src, size_t size)
+{
+    CUdeviceptr tmp;
+    CUresult res;
+
+    /* Nothing to move; also avoids requesting a zero-byte allocation,
+     * which cuMemAlloc rejects. */
+    if (0 == size) {
+        return dest;
+    }
+
+    /* The allocation must be checked on its own: continuing after a
+     * failure would copy through an uninitialized device pointer. */
+    res = cuMemAlloc(&tmp, size);
+    if (CUDA_SUCCESS != res) {
+        opal_output(0, "CUDA: memmove-Error in cuMemAlloc: res=%d, size=%lu",
+                    (int)res, (unsigned long)size);
+        abort();
+    }
+
+    res = cuMemcpy(tmp, (CUdeviceptr)src, size);
+    if (CUDA_SUCCESS != res) {
+        opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%lu",
+                    (int)res, (void *)tmp, src, (unsigned long)size);
+        abort();
+    }
+
+    res = cuMemcpy((CUdeviceptr)dest, tmp, size);
+    if (CUDA_SUCCESS != res) {
+        opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%lu",
+                    (int)res, dest, (void *)tmp, (unsigned long)size);
+        abort();
+    }
+
+    /* The data has already reached dest; a failure to release the staging
+     * buffer is deliberately not treated as fatal. */
+    (void)cuMemFree(tmp);
+    return dest;
+}
+
+/**
+ * One-time setup for the CUDA support code.
+ *
+ * Registers the opal cuda_verbose parameter, opens an output stream with
+ * that verbosity, and reports in verbose mode whether a CUDA context is
+ * current for this process.  Calls after the first one return immediately.
+ */
+static void opal_cuda_support_init(void)
+{
+    CUcontext cuContext;
+    CUresult res;
+
+    if (initialized) {
+        return;
+    }
+
+    /* Set different levels of verbosity in the cuda related code.  The
+     * value returned by the registration is not used here. */
+    (void)mca_base_param_reg_int_name("opal", "cuda_verbose",
+                                      "Set level of opal cuda verbosity",
+                                      false, false, 0, &opal_cuda_verbose);
+    opal_cuda_output = opal_output_open(NULL);
+    opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);
+
+    /* Check to see if this process is running in a CUDA context.  If so,
+     * all is good.  Currently, just print out a message in verbose mode
+     * to help with debugging. */
+    res = cuCtxGetCurrent(&cuContext);
+    if (CUDA_SUCCESS != res) {
+        opal_output_verbose(10, opal_cuda_output,
+                            "CUDA: cuCtxGetCurrent failed, CUDA device pointers will not work");
+    } else {
+        opal_output_verbose(10, opal_cuda_output,
+                            "CUDA: cuCtxGetCurrent succeeded, CUDA device pointers will work");
+    }
+
+    initialized = true;
+}
--- a/opal/datatype/opal_datatype_cuda.h
+++ b/opal/datatype/opal_datatype_cuda.h
@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef _OPAL_DATATYPE_CUDA_H
+#define _OPAL_DATATYPE_CUDA_H
+
+/* NOTE(review): only cuda.h is included here, so opal_convertor_t, bool
+ * and size_t presumably have to be declared by the including file before
+ * this header -- confirm against the callers. */
+#include "cuda.h"
+
+/* Queries the memory type of pUserBuf and, for buffers that are not host
+ * memory, points the convertor at the CUDA copy routine and sets
+ * CONVERTOR_CUDA. */
+void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
+/* Returns true when dest or src is reported as CUDA device memory. */
+bool opal_cuda_check_bufs(char *dest, char *src);
+/* cuMemcpy wrapper with memcpy-like arguments; aborts on a CUDA error and
+ * returns dest otherwise. */
+void* opal_cuda_memcpy(void * dest, void * src, size_t size);
+/* Copy staged through a temporary device buffer, for use where memmove
+ * semantics are needed; returns dest. */
+void* opal_cuda_memmove(void * dest, void * src, size_t size);
+
+#endif
--- a/opal/datatype/opal_datatype_pack.h
+++ b/opal/datatype/opal_datatype_pack.h
@ -4,6 +4,7 @@
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -21,6 +22,13 @@
 #include <stdint.h>
 #endif

+#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
+/* Make use of existing macro to do CUDA style memcpy: when checksums are
+ * not being computed, route the copy through the convertor's cbmemcpy
+ * callback.  CONVERTOR is parenthesized so that any pointer expression
+ * can be passed safely. */
+#undef MEMCPY_CSUM
+#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
+    (CONVERTOR)->cbmemcpy( (DST), (SRC), (BLENGTH) )
+#endif
+
 static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
                                         dt_elem_desc_t* ELEM,
                                         uint32_t* COUNT,
--- a/opal/datatype/opal_datatype_unpack.c
+++ b/opal/datatype/opal_datatype_unpack.c
@ -11,6 +11,7 @@
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008-2009 Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -318,8 +319,16 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
    memset( temporary, unused_byte, data_length );
    MEMCPY( temporary + start_position, partial_data, (end_position - start_position) );

+#if OPAL_CUDA_SUPPORT
+    /* In the case where the data is being unpacked from device
+     * memory, need to use the special host to device memory copy.
+     * Note this code path was only seen on large receives of
+     * noncontiguous data via buffered sends. */
+    pConvertor->cbmemcpy(saved_data, real_data, data_length );
+#else
    /* Save the content of the user memory */
    MEMCPY( saved_data, real_data, data_length );
+#endif

    /* Then unpack the data into the user memory */
    UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
@ -331,10 +340,25 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
    /* For every occurence of the unused byte move data from the saved
     * buffer back into the user memory.
     */
+#if OPAL_CUDA_SUPPORT
+    /* Need to copy the modified real_data again so we can see which
+     * bytes need to be converted back to their original values.  Note
+     * this code path was only seen on large receives of noncontiguous
+     * data via buffered sends. */
+    {
+        char resaved_data[16];
+        pConvertor->cbmemcpy(resaved_data, real_data, data_length );
+        for( i = 0; i < data_length; i++ ) {
+            if( unused_byte == resaved_data[i] )
+                pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1);
+        }
+    }
+#else
    for( i = 0; i < data_length; i++ ) {
        if( unused_byte == real_data[i] )
            real_data[i] = saved_data[i];
    }
+#endif
    return 0;
 }

--- a/opal/datatype/opal_datatype_unpack.h
+++ b/opal/datatype/opal_datatype_unpack.h
@ -4,6 +4,7 @@
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -21,6 +22,13 @@
 #include <stdint.h>
 #endif

+#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
+/* Make use of existing macro to do CUDA style memcpy: when checksums are
+ * not being computed, route the copy through the convertor's cbmemcpy
+ * callback.  CONVERTOR is parenthesized so that any pointer expression
+ * can be passed safely. */
+#undef MEMCPY_CSUM
+#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
+    (CONVERTOR)->cbmemcpy( (DST), (SRC), (BLENGTH) )
+#endif
+
 #include "opal/datatype/opal_convertor.h"