Add support for CUDA Unified memory. Basically, add a new flag and disable some
optimizations when that flag is detected. Lightly reviewed by bosilca.
This commit is contained in:
parent 52ed5a9bf8
commit f471b09ae9
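In outline, the change is two moves: the GPU-buffer probe gains a convertor argument so it can tag managed (unified) allocations with a new CONVERTOR_CUDA_UNIFIED flag, and the convertor's need-buffers test is widened so tagged buffers take the same staged, host-buffer path as device memory. A condensed sketch assembled from the hunks below (tag_and_route is a hypothetical name; the real control flow is split across the files touched here):

    /* Hypothetical condensation of the commit's two halves. */
    static void tag_and_route(opal_convertor_t *convertor, const void *pUserBuf)
    {
        /* Detection: may set CONVERTOR_CUDA_UNIFIED on the convertor as a side effect. */
        if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
            convertor->flags |= CONVERTOR_CUDA;
        }
        /* Routing: unified memory is treated like device memory, which disables
         * the zero-copy optimizations guarded by opal_convertor_need_buffers(). */
        if (convertor->flags & (CONVERTOR_CUDA | CONVERTOR_CUDA_UNIFIED)) {
            /* stage through host buffers */
        }
    }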
@@ -11,6 +11,7 @@
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2009      Oak Ridge National Labs. All rights reserved.
+ * Copyright (c) 2014      NVIDIA Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -45,11 +46,12 @@ BEGIN_C_DECLS
 #define CONVERTOR_WITH_CHECKSUM   0x00200000
 #define CONVERTOR_CUDA            0x00400000
 #define CONVERTOR_CUDA_ASYNC      0x00800000
-#define CONVERTOR_TYPE_MASK       0x00FF0000
+#define CONVERTOR_TYPE_MASK       0x10FF0000
 #define CONVERTOR_STATE_START     0x01000000
 #define CONVERTOR_STATE_COMPLETE  0x02000000
 #define CONVERTOR_STATE_ALLOC     0x04000000
 #define CONVERTOR_COMPLETED       0x08000000
+#define CONVERTOR_CUDA_UNIFIED    0x10000000

 union dt_elem_desc;
 typedef struct opal_convertor_t opal_convertor_t;
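The mask edit is plain bit arithmetic: the new flag takes bit 0x10000000, which the old CONVERTOR_TYPE_MASK (0x00FF0000) did not cover, so the mask widens to 0x10FF0000 = 0x00FF0000 | 0x10000000. A compile-time check of that relationship (illustrative only; uses C11 _Static_assert, which the tree itself may not):

    _Static_assert((0x00FF0000 | 0x10000000) == 0x10FF0000,
                   "CONVERTOR_TYPE_MASK must cover the CONVERTOR_CUDA_UNIFIED bit");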
@@ -177,7 +179,7 @@ static inline int32_t opal_convertor_need_buffers( const opal_convertor_t* pConvertor )
     if (OPAL_UNLIKELY(0 == (pConvertor->flags & CONVERTOR_HOMOGENEOUS))) return 1;
 #endif
 #if OPAL_CUDA_SUPPORT
-    if( pConvertor->flags & CONVERTOR_CUDA ) return 1;
+    if( pConvertor->flags & (CONVERTOR_CUDA | CONVERTOR_CUDA_UNIFIED)) return 1;
 #endif
     if( pConvertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS ) return 0;
     if( (pConvertor->count == 1) && (pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) return 0;
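This hunk is where the "disable some optimizations" of the commit message happens: opal_convertor_need_buffers() now returns 1 for unified memory as well, so control never reaches the OPAL_DATATYPE_FLAG_NO_GAPS and contiguous shortcuts below it. A hedged sketch of the caller's perspective (both helper names are hypothetical):

    if (opal_convertor_need_buffers(pConvertor)) {
        pack_through_host_buffer(pConvertor);  /* staged copy; taken for CUDA and unified memory */
    } else {
        send_in_place(pConvertor);             /* zero-copy fast path; now skipped for unified memory */
    }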
@@ -57,7 +57,7 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
         return;
     }

-    if (ftable.gpu_is_gpu_buffer(pUserBuf)) {
+    if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
         convertor->flags |= CONVERTOR_CUDA;
     }
 }
@@ -78,7 +78,7 @@ bool opal_cuda_check_bufs(char *dest, char *src)
         return false;
     }

-    if (ftable.gpu_is_gpu_buffer(dest) || ftable.gpu_is_gpu_buffer(src)) {
+    if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) {
         return true;
     } else {
         return false;
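opal_cuda_check_bufs() has no convertor to tag, so it now passes NULL; the NULL != convertor guard added further down makes that safe. Callers that only want a yes/no probe can do the same:

    ftable.gpu_is_gpu_buffer(ptr, NULL);  /* probe only; no CONVERTOR_CUDA_UNIFIED flag is recorded */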
@@ -14,7 +14,7 @@
  * common cuda code is initialized. This removes any dependency on <cuda.h>
  * in the opal cuda datatype code. */
 struct opal_common_cuda_function_table {
-    int (*gpu_is_gpu_buffer)(const void*);
+    int (*gpu_is_gpu_buffer)(const void*, opal_convertor_t*);
     int (*gpu_cu_memcpy_async)(void*, const void*, size_t, opal_convertor_t*);
     int (*gpu_cu_memcpy)(void*, const void*, size_t);
     int (*gpu_memmove)(void*, void*, size_t);
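For context on why both sides change together: this table is populated at initialization with the mca_common_cuda_* implementations declared in the next hunk, so the function-pointer type and the implementation signature must move in lockstep. A sketch of the wiring implied by the names in this diff (the exact init routine may differ):

    ftable.gpu_is_gpu_buffer   = &mca_common_cuda_is_gpu_buffer;
    ftable.gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async;
    ftable.gpu_cu_memcpy       = &mca_common_cuda_cu_memcpy;
    ftable.gpu_memmove         = &mca_common_cuda_memmove;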
@@ -127,7 +127,7 @@ static opal_mutex_t common_cuda_dtoh_lock;
 static opal_mutex_t common_cuda_ipc_lock;

 /* Functions called by opal layer - plugged into opal function table */
-static int mca_common_cuda_is_gpu_buffer(const void*);
+static int mca_common_cuda_is_gpu_buffer(const void*, opal_convertor_t*);
 static int mca_common_cuda_memmove(void*, void*, size_t);
 static int mca_common_cuda_cu_memcpy_async(void*, const void*, size_t, opal_convertor_t*);
 static int mca_common_cuda_cu_memcpy(void*, const void*, size_t);
@@ -1700,7 +1700,7 @@ static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
 #endif /* OPAL_CUDA_SUPPORT_41 */

 /* Routines that get plugged into the opal datatype code */
-static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
+static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor)
 {
     int res;
     CUmemorytype memType = 0;
@@ -1715,6 +1715,15 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
     void *attrdata[] = {(void *)&memType, (void *)&ctx, (void *)&isManaged};

     res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf);
+
+    /* Mark unified memory buffers with a flag. This will allow all unified
+     * memory to be forced through host buffers. Note that this memory can
+     * be either host or device so we need to set this flag prior to that check. */
+    if (1 == isManaged) {
+        if (NULL != convertor) {
+            convertor->flags |= CONVERTOR_CUDA_UNIFIED;
+        }
+    }
     if (res != CUDA_SUCCESS) {
         /* If we cannot determine it is device pointer,
          * just assume it is not. */
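The detection itself leans on the CUDA driver API's cuPointerGetAttributes(), which answers several pointer queries in one call, filling one result slot per requested attribute. A self-contained version of the same query outside Open MPI (error handling reduced to the return code):

    #include <cuda.h>
    #include <stdint.h>

    /* Returns 1 if buf points into CUDA managed (unified) memory, else 0. */
    static int is_unified_memory(const void *buf)
    {
        CUpointer_attribute attributes[3] = {
            CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
            CU_POINTER_ATTRIBUTE_CONTEXT,
            CU_POINTER_ATTRIBUTE_IS_MANAGED,
        };
        CUmemorytype mem_type = 0;
        CUcontext ctx = NULL;
        unsigned int is_managed = 0;
        void *attrdata[3] = { &mem_type, &ctx, &is_managed };

        CUresult res = cuPointerGetAttributes(3, attributes, attrdata,
                                              (CUdeviceptr)(uintptr_t)buf);
        return (CUDA_SUCCESS == res) && (1 == is_managed);
    }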
@@ -1779,15 +1788,6 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
         }
     }

-#if OPAL_CUDA_GET_ATTRIBUTES
-    if (1 == isManaged) {
-        /* Currently cannot support managed memory */
-        opal_output(0, "CUDA: ptr=%p: CUDA-aware Open MPI detected managed memory but there "
-                    "is no support for it. Result will be unpredictable.", pUserBuf);
-        return OPAL_ERROR;
-    }
-#endif /* OPAL_CUDA_GET_ATTRIBUTES */
-
     /* First access on a device pointer finalizes CUDA support initialization.
      * If initialization fails, disable support. */
     if (!stage_three_init_complete) {
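Net effect of this removal: a cudaMallocManaged() buffer no longer trips the "no support" error and can be handed to MPI directly, with the convertor silently staging it through host memory. A minimal usage sketch (CUDA runtime API plus MPI; assumes a CUDA-aware build and at least two ranks):

    #include <mpi.h>
    #include <cuda_runtime.h>

    int main(int argc, char **argv)
    {
        int rank;
        double *buf;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* Unified memory: reachable from both host and device. */
        cudaMallocManaged((void **)&buf, 1024 * sizeof(double), cudaMemAttachGlobal);

        /* With this commit the convertor tags buf CONVERTOR_CUDA_UNIFIED and
         * stages it through host buffers instead of returning OPAL_ERROR. */
        if (0 == rank) {
            MPI_Send(buf, 1024, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
        } else if (1 == rank) {
            MPI_Recv(buf, 1024, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        cudaFree(buf);
        MPI_Finalize();
        return 0;
    }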