Rename a few things for clarity: the async-copy event bookkeeping becomes cuda_event_ipc_*. Add a stream (ipcStream) dedicated to IPC asynchronous copies.
This commit was SVN r26447.
Parent: 5fb48bafda
Commit: f8ace21366
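In short: initialization now creates a dedicated, non-default CUDA stream (ipcStream), the async-copy bookkeeping is renamed from cuda_event_status_* to cuda_event_ipc_*, and the IPC cuMemcpyAsync/cuEventRecord calls are issued on that stream instead of the default stream 0. Below is a minimal standalone sketch of that pattern against the CUDA driver API; the names ipc_stream, ipc_copy_init, launch_ipc_copy and EVENT_MAX are illustrative, not the Open MPI code itself.

/* Sketch only: one dedicated stream for IPC copies plus a pool of reusable
 * events recorded behind each asynchronous copy (names are hypothetical). */
#include <stddef.h>
#include <cuda.h>

#define EVENT_MAX 200                    /* mirrors cuda_event_max in the diff */

static CUstream ipc_stream;              /* plays the role of ipcStream */
static CUevent  ipc_events[EVENT_MAX];   /* reusable events, created up front */

static int ipc_copy_init(void)
{
    int i;
    /* 0 = default flags, as in cuStreamCreate(&ipcStream, 0) in the diff */
    if (CUDA_SUCCESS != cuStreamCreate(&ipc_stream, 0)) {
        return -1;
    }
    for (i = 0; i < EVENT_MAX; i++) {
        if (CUDA_SUCCESS != cuEventCreate(&ipc_events[i], CU_EVENT_DISABLE_TIMING)) {
            return -1;
        }
    }
    return 0;
}

/* Issue one copy on the IPC stream and record an event behind it; the
 * caller later polls that event with cuEventQuery() instead of blocking. */
static int launch_ipc_copy(CUdeviceptr dst, CUdeviceptr src, size_t bytes, int slot)
{
    if (CUDA_SUCCESS != cuMemcpyAsync(dst, src, bytes, ipc_stream)) {
        return -1;
    }
    if (CUDA_SUCCESS != cuEventRecord(ipc_events[slot], ipc_stream)) {
        return -1;
    }
    return 0;
}

Recording the event on the same stream as the copy orders it strictly after that copy, which is what lets completion be detected later with a non-blocking cuEventQuery rather than a synchronizing wait.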
@@ -9,7 +9,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2006 The Regents of the University of California.
  * All rights reserved.
- * Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -47,6 +47,7 @@ static bool mca_common_cuda_enabled = false;
 static bool mca_common_cuda_register_memory = true;
 static bool mca_common_cuda_warning = true;
 static opal_list_t common_cuda_memory_registrations;
+static CUstream ipcStream;

 /* Structure to hold memory registrations that are delayed until first
  * call to send or receive a GPU pointer */
@@ -66,21 +67,21 @@ OBJ_CLASS_INSTANCE( common_cuda_mem_regs_t,
 #if OMPI_CUDA_SUPPORT_41
 static int mca_common_cuda_async = 1;

-/* Array of CUDA events to be queried */
-CUevent *cuda_event_status_array;
+/* Array of CUDA events to be queried for IPC stream */
+CUevent *cuda_event_ipc_array;

 /* Array of fragments currently being moved by cuda async non-blocking
  * operations */
-struct mca_btl_base_descriptor_t **cuda_event_frag_array;
+struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array;

 /* First free/available location in cuda_event_status_array */
-int cuda_event_status_first_avail;
+int cuda_event_ipc_first_avail;

 /* First currently-being used location in the cuda_event_status_array */
-int cuda_event_status_first_used;
+int cuda_event_ipc_first_used;

 /* Number of status items currently in use */
-int cuda_event_status_num_used;
+int cuda_event_ipc_num_used;

 /* Size of array holding events */
 int cuda_event_max = 200;
@@ -195,14 +196,14 @@ static int mca_common_cuda_init(void)
 #if OMPI_CUDA_SUPPORT_41
     if (true == mca_common_cuda_enabled) {
         /* Set up an array to store outstanding async copy events */
-        cuda_event_status_array = NULL;
-        cuda_event_frag_array = NULL;
-        cuda_event_status_num_used = 0;
-        cuda_event_status_first_avail = 0;
-        cuda_event_status_first_used = 0;
+        cuda_event_ipc_array = NULL;
+        cuda_event_ipc_frag_array = NULL;
+        cuda_event_ipc_num_used = 0;
+        cuda_event_ipc_first_avail = 0;
+        cuda_event_ipc_first_used = 0;

-        cuda_event_status_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
-        if (NULL == cuda_event_status_array) {
+        cuda_event_ipc_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
+        if (NULL == cuda_event_ipc_array) {
             orte_show_help("help-mpi-common-cuda.txt", "No memory",
                            true, errno, strerror(errno));
             return OMPI_ERROR;
@@ -210,7 +211,7 @@ static int mca_common_cuda_init(void)

         /* Create the events since they can be reused. */
         for (i = 0; i < cuda_event_max; i++) {
-            res = cuEventCreate(&cuda_event_status_array[i], CU_EVENT_DISABLE_TIMING);
+            res = cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
             if (CUDA_SUCCESS != res) {
                 orte_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
                                true, res);
@@ -220,9 +221,9 @@ static int mca_common_cuda_init(void)

         /* The first available status index is 0. Make an empty frag
            array. */
-        cuda_event_frag_array = (struct mca_btl_base_descriptor_t **)
+        cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **)
             malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
-        if (NULL == cuda_event_frag_array) {
+        if (NULL == cuda_event_ipc_frag_array) {
             orte_show_help("help-mpi-common-cuda.txt", "No memory",
                            true, errno, strerror(errno));
             return OMPI_ERROR;
@@ -254,6 +255,14 @@ static int mca_common_cuda_init(void)
         OBJ_RELEASE(mem_reg);
     }

+    /* Create stream for use in ipc asynchronous copies */
+    res = cuStreamCreate(&ipcStream, 0);
+    if (res != CUDA_SUCCESS) {
+        orte_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
+                       true, res);
+        return OMPI_ERROR;
+    }
+
     opal_output_verbose(30, mca_common_cuda_output,
                         "CUDA: initialized");
     common_cuda_initialized = true;
@@ -395,11 +404,13 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
     cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
     memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle));

-    /* Need to record the event to ensure that any memcopies
-     * into the device memory have completed. The event handle
-     * associated with this event is sent to the remote process
-     * so that it will wait on this event prior to copying data
-     * out of the device memory. */
+    /* Need to record the event to ensure that any memcopies into the
+     * device memory have completed. The event handle associated with
+     * this event is sent to the remote process so that it will wait
+     * on this event prior to copying data out of the device memory.
+     * Note that this needs to be the NULL stream to make since it is
+     * unknown what stream any copies into the device memory were done
+     * with. */
     result = cuEventRecord((CUevent)cuda_reg->event, 0);
     if (CUDA_SUCCESS != result) {
         orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
@@ -581,7 +592,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
     /* First make sure there is room to store the event. If not, then
      * return an error. The error message will tell the user to try and
      * run again, but with a larger array for storing events. */
-    if (cuda_event_status_num_used == cuda_event_max) {
+    if (cuda_event_ipc_num_used == cuda_event_max) {
         orte_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
                        true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
         return OMPI_ERR_OUT_OF_RESOURCE;
@@ -590,7 +601,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
     /* This is the standard way to run. Running with synchronous copies is available
      * to measure the advantages of asynchronous copies. */
     if (OPAL_LIKELY(mca_common_cuda_async)) {
-        result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0);
+        result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
         if (CUDA_SUCCESS != result) {
             orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
                            true, dst, src, amount, result);
@@ -600,25 +611,25 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
                                 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
                                 dst, src, (int)amount);
         }
-        result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0);
+        result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
         if (CUDA_SUCCESS != result) {
             orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
                            true, result);
             return OMPI_ERROR;
         }
-        cuda_event_frag_array[cuda_event_status_first_avail] = frag;
+        cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;

         /* Bump up the first available slot and number used by 1 */
-        cuda_event_status_first_avail++;
-        if (cuda_event_status_first_avail >= cuda_event_max) {
-            cuda_event_status_first_avail = 0;
+        cuda_event_ipc_first_avail++;
+        if (cuda_event_ipc_first_avail >= cuda_event_max) {
+            cuda_event_ipc_first_avail = 0;
         }
-        cuda_event_status_num_used++;
+        cuda_event_ipc_num_used++;

         *done = 0;
     } else {
         /* Mimic the async function so they use the same memcpy call. */
-        result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0);
+        result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
         if (CUDA_SUCCESS != result) {
             orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
                            true, dst, src, amount, result);
@@ -630,23 +641,23 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
         }

         /* Record an event, then wait for it to complete with calls to cuEventQuery */
-        result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0);
+        result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
         if (CUDA_SUCCESS != result) {
             orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
                            true, result);
             return OMPI_ERROR;
         }

-        cuda_event_frag_array[cuda_event_status_first_avail] = frag;
+        cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;

         /* Bump up the first available slot and number used by 1 */
-        cuda_event_status_first_avail++;
-        if (cuda_event_status_first_avail >= cuda_event_max) {
-            cuda_event_status_first_avail = 0;
+        cuda_event_ipc_first_avail++;
+        if (cuda_event_ipc_first_avail >= cuda_event_max) {
+            cuda_event_ipc_first_avail = 0;
         }
-        cuda_event_status_num_used++;
+        cuda_event_ipc_num_used++;

-        result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
+        result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
         if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
             orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
                            true, result);
@@ -658,7 +669,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
             if (0 == (iter % 10)) {
                 opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
             }
-            result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
+            result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
             if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
                 orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
                                true, result);
@@ -667,10 +678,10 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
             iter++;
         }

-        --cuda_event_status_num_used;
-        ++cuda_event_status_first_used;
-        if (cuda_event_status_first_used >= cuda_event_max) {
-            cuda_event_status_first_used = 0;
+        --cuda_event_ipc_num_used;
+        ++cuda_event_ipc_first_used;
+        if (cuda_event_ipc_first_used >= cuda_event_max) {
+            cuda_event_ipc_first_used = 0;
         }
         *done = 1;
     }
@@ -685,12 +696,12 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
 int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
     CUresult result;

-    if (cuda_event_status_num_used > 0) {
+    if (cuda_event_ipc_num_used > 0) {
         opal_output_verbose(20, mca_common_cuda_output,
                             "CUDA: progress_one_cuda_event, outstanding_events=%d",
-                            cuda_event_status_num_used);
+                            cuda_event_ipc_num_used);

-        result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
+        result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);

         /* We found an event that is not ready, so return. */
         if (CUDA_ERROR_NOT_READY == result) {
@@ -705,15 +716,15 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
             return OMPI_ERROR;
         }

-        *frag = cuda_event_frag_array[cuda_event_status_first_used];
+        *frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used];
         opal_output_verbose(10, mca_common_cuda_output,
                             "CUDA: cuEventQuery returned %d", result);

         /* Bump counters, loop around the circular buffer if necessary */
-        --cuda_event_status_num_used;
-        ++cuda_event_status_first_used;
-        if (cuda_event_status_first_used >= cuda_event_max) {
-            cuda_event_status_first_used = 0;
+        --cuda_event_ipc_num_used;
+        ++cuda_event_ipc_first_used;
+        if (cuda_event_ipc_first_used >= cuda_event_max) {
+            cuda_event_ipc_first_used = 0;
         }
         /* A return value of 1 indicates an event completed and a frag was returned */
         return 1;
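The renamed counters above (cuda_event_ipc_first_avail, cuda_event_ipc_first_used, cuda_event_ipc_num_used) implement a circular buffer of outstanding events, and progress_one_cuda_event retires the oldest entry once its event has completed. Below is a rough sketch of that polling side, continuing the hypothetical names from the earlier snippet (again, not the Open MPI source); the remaining hunks after it belong to the help-mpi-common-cuda.txt message file and add the text shown when cuStreamCreate fails.

/* Sketch only: retire the oldest outstanding IPC event, if it is done.
 * Returns 1 when the oldest copy finished, 0 when it is still in flight
 * (or nothing is outstanding), and -1 on a real error. */
static int ipc_num_used   = 0;   /* events currently outstanding */
static int ipc_first_used = 0;   /* index of the oldest outstanding slot */

static int progress_one_ipc_event(void)
{
    CUresult result;

    if (0 == ipc_num_used) {
        return 0;                        /* nothing outstanding */
    }
    result = cuEventQuery(ipc_events[ipc_first_used]);
    if (CUDA_ERROR_NOT_READY == result) {
        return 0;                        /* oldest copy still running */
    }
    if (CUDA_SUCCESS != result) {
        return -1;                       /* genuine failure */
    }
    /* Completed: free the slot and wrap the circular index. */
    --ipc_num_used;
    if (++ipc_first_used >= EVENT_MAX) {
        ipc_first_used = 0;
    }
    return 1;
}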
@@ -1,6 +1,6 @@
 # -*- text -*-
 #
-# Copyright (c) 2011 NVIDIA. All rights reserved.
+# Copyright (c) 2011-2012 NVIDIA. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -135,3 +135,8 @@ cause the program to abort.
 cuEventDestory return value: %d
 Check the cuda.h file for what the return value means.
 #
+[cuStreamCreate failed]
+The call to cuStreamCreate failed. This is a unrecoverable error and will
+cause the program to abort.
+cuStreamCreate return value: %d
+Check the cuda.h file for what the return vale means.