
Provide an option to have the synchronous copy be asynchronous with a wait. For
now, this has to be selected at runtime. Also fix up some error messages to
include the node name in them.

This commit was SVN r30396.
This commit is contained in:
Rolf vandeVaart 2014-01-23 15:47:20 +00:00
parent 4e9743ece1
commit 9f3bf4747d
2 changed files with 165 additions and 31 deletions
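The core of the change: when the new cumemcpy_async flag is set, each blocking
cuMemcpy is replaced by an asynchronous copy on a dedicated stream, followed by
an event record and a host-side wait on that event. A minimal sketch of the
pattern (mirroring the diff below; error handling elided, names as in the
commit):

    CUresult res;
    /* queue the copy on the stream reserved for these copies */
    res = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, size, memcpyStream);
    /* mark a completion point on that stream... */
    res = cuFunc.cuEventRecord(memcpyEvent, memcpyStream);
    /* ...and block the host until the copy has finished */
    res = cuFunc.cuEventSynchronize(memcpyEvent);

At runtime the behavior is selected through the new MCA parameter; assuming the
usual project_framework_component naming, that would look something like
"mpirun --mca mpi_common_cuda_cumemcpy_async 1 ./app".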

View file

@@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -95,6 +95,7 @@ struct cudaFunctionTable {
int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr);
#endif /* OPAL_CUDA_GDR_SUPPORT */
int (*cuCtxSetCurrent)(CUcontext);
int (*cuEventSynchronize)(CUevent);
} cudaFunctionTable;
typedef struct cudaFunctionTable cudaFunctionTable_t;
cudaFunctionTable_t cuFunc;
@@ -111,6 +112,8 @@ static opal_list_t common_cuda_memory_registrations;
static CUstream ipcStream;
static CUstream dtohStream;
static CUstream htodStream;
static CUstream memcpyStream;
static CUevent memcpyEvent;
/* Functions called by opal layer - plugged into opal function table */
static int mca_common_cuda_is_gpu_buffer(const void*);
@@ -135,6 +138,8 @@ OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t,
#if OPAL_CUDA_SUPPORT_41
static int mca_common_cuda_async = 1;
static int mca_common_cuda_cumemcpy_async;
static int mca_common_cuda_cumemcpy_timing;
/* Array of CUDA events to be queried for IPC stream, sending side and
* receiving side. */
@@ -167,7 +172,7 @@ static int cuda_event_htod_most = 0;
opal_lt_dlhandle libcuda_handle;
#define CUDA_COMMON_TIMING 0
#if CUDA_COMMON_TIMING
#if CUDA_COMMON_TIMING || OPAL_ENABLE_DEBUG
/* Some timing support structures. Enable this to help analyze
* internal performance issues. */
static struct timespec ts_start;
@@ -292,6 +297,25 @@ int mca_common_cuda_stage_one_init(void)
&cuda_event_max);
#endif /* OPAL_CUDA_SUPPORT_41 */
/* Use this flag to test cuMemcpyAsync vs cuMemcpy */
mca_common_cuda_cumemcpy_async = 0;
(void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_async",
"Set to 0 to force CUDA cuMemcpy instead of cuMemcpyAsync/cuEventRecord/cuEventSynchronize",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_common_cuda_cumemcpy_async);
/* Use this flag to dump out timing of cuMemcpy sync and async */
mca_common_cuda_cumemcpy_timing = 0;
(void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_timing",
"Set to 1 to dump timing of eager copies",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_common_cuda_cumemcpy_timing);
mca_common_cuda_output = opal_output_open(NULL);
opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
@@ -463,6 +487,7 @@ int mca_common_cuda_stage_one_init(void)
OMPI_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute);
#endif /* OPAL_CUDA_GDR_SUPPORT */
OMPI_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent);
OMPI_CUDA_DLSYM(libcuda_handle, cuEventSynchronize);
return 0;
}
@@ -559,7 +584,7 @@ static int mca_common_cuda_stage_three_init(void)
res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != res) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, res);
true, ompi_process_info.nodename, res);
return OMPI_ERROR;
}
}
@@ -597,7 +622,7 @@ static int mca_common_cuda_stage_three_init(void)
res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != res) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, res);
true, ompi_process_info.nodename, res);
return OMPI_ERROR;
}
}
@@ -632,7 +657,7 @@ static int mca_common_cuda_stage_three_init(void)
res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != res) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, res);
true, ompi_process_info.nodename, res);
return OMPI_ERROR;
}
}
@@ -675,7 +700,7 @@ static int mca_common_cuda_stage_three_init(void)
res = cuFunc.cuStreamCreate(&ipcStream, 0);
if (res != CUDA_SUCCESS) {
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, res);
true, ompi_process_info.nodename, res);
return OMPI_ERROR;
}
@@ -683,7 +708,7 @@ static int mca_common_cuda_stage_three_init(void)
res = cuFunc.cuStreamCreate(&dtohStream, 0);
if (res != CUDA_SUCCESS) {
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, res);
true, ompi_process_info.nodename, res);
return OMPI_ERROR;
}
@@ -692,11 +717,26 @@ static int mca_common_cuda_stage_three_init(void)
res = cuFunc.cuStreamCreate(&htodStream, 0);
if (res != CUDA_SUCCESS) {
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, res);
true, ompi_process_info.nodename, res);
return OMPI_ERROR;
}
/* Create stream for use in cuMemcpyAsync synchronous copies */
res = cuFunc.cuStreamCreate(&memcpyStream, 0);
if (res != CUDA_SUCCESS) {
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, ompi_process_info.nodename, res);
return OMPI_ERROR;
}
/* Create event for use in cuMemcpyAsync synchronous copies */
res = cuFunc.cuEventCreate(&memcpyEvent, 0);
if (res != CUDA_SUCCESS) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, ompi_process_info.nodename, res);
return OMPI_ERROR;
}
opal_output_verbose(30, mca_common_cuda_output,
"CUDA: initialized");
common_cuda_initialized = true;
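Only the creation of memcpyStream and memcpyEvent appears in this diff; the
matching teardown in the fini path is not shown. Presumably it releases them
with the driver-API counterparts of the calls above, along these lines
(hypothetical sketch, assuming cuEventDestroy/cuStreamDestroy are also in the
dlopen'ed function table):

    /* hypothetical cleanup mirroring the creation calls above */
    cuFunc.cuEventDestroy(memcpyEvent);
    cuFunc.cuStreamDestroy(memcpyStream);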
@@ -970,7 +1010,7 @@ void mca_common_cuda_construct_event_and_handle(uint64_t **event, void **handle)
result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, result);
true, ompi_process_info.nodename, result);
}
result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
@@ -1026,7 +1066,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
result = cuFunc.cuEventRecord(event, 0);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
true, ompi_process_info.nodename, result);
}
/* END of Workaround */
@@ -1089,7 +1129,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
true, ompi_process_info.nodename, result);
return OMPI_ERROR;
}
cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
@@ -1119,7 +1159,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
true, ompi_process_info.nodename, result);
return OMPI_ERROR;
}
@@ -1192,7 +1232,7 @@ int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_
result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
true, ompi_process_info.nodename, result);
return OMPI_ERROR;
}
cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag;
@@ -1236,7 +1276,7 @@ int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_
result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
true, ompi_process_info.nodename, result);
return OMPI_ERROR;
}
cuda_event_htod_frag_array[cuda_event_htod_first_avail] = frag;
@@ -1480,7 +1520,7 @@ static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
* opal_output(0, "Function took %7.2f usecs\n", accum);
*
*/
#if CUDA_COMMON_TIMING
#if CUDA_COMMON_TIMING || OPAL_ENABLE_DEBUG
static float mydifftime(struct timespec ts_start, struct timespec ts_end) {
float seconds;
float microseconds;
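The hunk cuts off the body of mydifftime(); given the two float locals
declared above and the microsecond figures printed in the timing output, a
plausible completion is:

    static float mydifftime(struct timespec ts_start, struct timespec ts_end) {
        float seconds = (float)(ts_end.tv_sec - ts_start.tv_sec);
        /* convert the nanosecond remainder into microseconds */
        float microseconds = (float)(ts_end.tv_nsec - ts_start.tv_nsec) / 1000.0;
        return seconds * 1000000.0 + microseconds;
    }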
@@ -1581,31 +1621,115 @@ static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t s
(CUstream)convertor->stream);
}
/**
* This function is plugged into various areas where a cuMemcpy would be called.
* This is a synchronous operation that will not return until the copy is complete.
*/
static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size)
{
return cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
CUresult result;
#if OPAL_ENABLE_DEBUG
CUmemorytype memTypeSrc, memTypeDst;
if (mca_common_cuda_cumemcpy_timing) {
/* Nice to know type of source and destination for timing output. Do
* not care about return code as memory type will just be set to 0 */
result = cuFunc.cuPointerGetAttribute(&memTypeDst,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)dest);
result = cuFunc.cuPointerGetAttribute(&memTypeSrc,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)src);
clock_gettime(CLOCK_MONOTONIC, &ts_start);
}
#endif
if (mca_common_cuda_cumemcpy_async) {
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size, memcpyStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, dest, src, size, result);
return OMPI_ERROR;
}
result = cuFunc.cuEventRecord(memcpyEvent, memcpyStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, ompi_process_info.nodename, result);
return OMPI_ERROR;
}
result = cuFunc.cuEventSynchronize(memcpyEvent);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventSynchronize failed",
true, ompi_process_info.nodename, result);
return OMPI_ERROR;
}
} else {
result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed",
true, result);
return OMPI_ERROR;
}
}
#if OPAL_ENABLE_DEBUG
if (mca_common_cuda_cumemcpy_timing) {
clock_gettime(CLOCK_MONOTONIC, &ts_end);
accum = mydifftime(ts_start, ts_end);
if (mca_common_cuda_cumemcpy_async) {
opal_output(0, "cuMemcpyAsync took %7.2f usecs (src=%p (%d), dst=%p (%d))\n",
accum, src, memTypeSrc, dest, memTypeDst);
} else {
opal_output(0, "cuMemcpy took %7.2f usecs (src=%p (%d), dst=%p (%d))\n",
accum, src, memTypeSrc, dest, memTypeDst);
}
}
#endif
return OMPI_SUCCESS;
}
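Why replace one synchronous call with three? Presumably because cuMemcpy runs
on the default stream, whose legacy semantics serialize against the other
blocking streams in the context, while cuMemcpyAsync on the dedicated
memcpyStream plus cuEventSynchronize waits only for that one copy, leaving
ipcStream, dtohStream, and htodStream unaffected. The cumemcpy_timing
parameter registered above exists precisely to measure that trade-off.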
static int mca_common_cuda_memmove(void *dest, void *src, size_t size)
{
CUdeviceptr tmp;
int res;
int result;
res = cuFunc.cuMemAlloc(&tmp,size);
res = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size);
if(res != CUDA_SUCCESS){
opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
res, (void *)tmp, src, (int)size);
return res;
}
res = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size);
if(res != CUDA_SUCCESS){
opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
res, dest, (void *)tmp, (int)size);
return res;
result = cuFunc.cuMemAlloc(&tmp,size);
if (mca_common_cuda_cumemcpy_async) {
result = cuFunc.cuMemcpyAsync(tmp, (CUdeviceptr)src, size, memcpyStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, tmp, src, size, result);
return OMPI_ERROR;
}
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, tmp, size, memcpyStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, dest, tmp, size, result);
return OMPI_ERROR;
}
result = cuFunc.cuEventRecord(memcpyEvent, memcpyStream);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, ompi_process_info.nodename, result);
return OMPI_ERROR;
}
result = cuFunc.cuEventSynchronize(memcpyEvent);
if (CUDA_SUCCESS != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuEventSynchronize failed",
true, ompi_process_info.nodename, result);
return OMPI_ERROR;
}
} else {
result = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size);
if (result != CUDA_SUCCESS) {
opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
result, (void *)tmp, src, (int)size);
return OMPI_ERROR;
}
result = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size);
if (result != CUDA_SUCCESS) {
opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
result, dest, (void *)tmp, (int)size);
return OMPI_ERROR;
}
}
cuFunc.cuMemFree(tmp);
return 0;
return OMPI_SUCCESS;
}
int mca_common_cuda_get_device(int *devicenum)
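The round trip through the freshly allocated tmp buffer is what gives this
routine its memmove semantics: cuMemcpy has memcpy-style semantics, so
overlapping source and destination ranges cannot be relied upon, and the data
is staged through a separate allocation instead. A hypothetical call
exercising the overlapping case:

    CUdeviceptr buf;
    cuFunc.cuMemAlloc(&buf, 1024);
    /* shift the first 512 bytes forward by 256 within one allocation */
    mca_common_cuda_memmove((void *)(buf + 256), (void *)buf, 512);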

View file

@@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2011-2013 NVIDIA. All rights reserved.
# Copyright (c) 2011-2014 NVIDIA. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -96,12 +96,14 @@ Check the cuda.h file for what the return value means.
[cuEventCreate failed]
The call to cuEventCreate failed. This is an unrecoverable error and will
cause the program to abort.
Hostname: %s
cuEventCreate return value: %d
Check the cuda.h file for what the return value means.
#
[cuEventRecord failed]
The call to cuEventRecord failed. This is an unrecoverable error and will
cause the program to abort.
Hostname: %s
cuEventRecord return value: %d
Check the cuda.h file for what the return value means.
#
@@ -138,6 +140,7 @@ Check the cuda.h file for what the return value means.
[cuStreamCreate failed]
The call to cuStreamCreate failed. This is an unrecoverable error and will
cause the program to abort.
Hostname: %s
cuStreamCreate return value: %d
Check the cuda.h file for what the return value means.
#
@@ -182,3 +185,10 @@ continue, but report this error to the Open MPI developers.
Address: %p
Check the cuda.h file for what the return value means.
#
[cuEventSynchronize failed]
The call to cuEventSynchronize failed. This is highly unusual and should
not happen. Please report this error to the Open MPI developers.
Hostname: %s
cuEventSynchronize return value: %d
Check the cuda.h file for what the return value means.
#