Add a lock and clean up handling of some error conditions.

This commit was SVN r32258.
2014-07-17 19:33:10 +00:00 · 2014-07-17 19:33:10 +00:00 · 947a4e14b4
--- a/ompi/mca/common/cuda/common_cuda.c
+++ b/ompi/mca/common/cuda/common_cuda.c
@ -97,6 +97,7 @@ struct cudaFunctionTable {
    int (*cuCtxSetCurrent)(CUcontext);
    int (*cuEventSynchronize)(CUevent);
    int (*cuStreamSynchronize)(CUstream);
+    int (*cuStreamDestroy)(CUstream);
 } cudaFunctionTable;
 typedef struct cudaFunctionTable cudaFunctionTable_t;
 cudaFunctionTable_t cuFunc;
@ -110,10 +111,11 @@ bool mca_common_cuda_enabled = false;
 static bool mca_common_cuda_register_memory = true;
 static bool mca_common_cuda_warning = false;
 static opal_list_t common_cuda_memory_registrations;
-static CUstream ipcStream;
-static CUstream dtohStream;
-static CUstream htodStream;
-static CUstream memcpyStream;
+static CUstream ipcStream = NULL;
+static CUstream dtohStream = NULL;
+static CUstream htodStream = NULL;
+static CUstream memcpyStream = NULL;
+static opal_mutex_t common_cuda_init_lock;

 /* Functions called by opal layer - plugged into opal function table */
 static int mca_common_cuda_is_gpu_buffer(const void*);
@ -229,6 +231,7 @@ int mca_common_cuda_stage_one_init(void)
        return 0;
    }
    stage_one_init_complete = true;
+    OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);

    /* Set different levels of verbosity in the cuda related code. */
    mca_common_cuda_verbose = 0;
@ -472,6 +475,7 @@ int mca_common_cuda_stage_one_init(void)
    OMPI_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent);
    OMPI_CUDA_DLSYM(libcuda_handle, cuEventSynchronize);
    OMPI_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize);
+    OMPI_CUDA_DLSYM(libcuda_handle, cuStreamDestroy);
    return 0;
 }

@ -506,26 +510,37 @@ static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *fta
 */
 static int mca_common_cuda_stage_three_init(void)
 {
-    int i, s;
+    int i, s, rc;
    CUresult res;
    CUcontext cuContext;
    common_cuda_mem_regs_t *mem_reg;

-    stage_three_init_complete = true;
-
+    OPAL_THREAD_LOCK(&common_cuda_init_lock);
    opal_output_verbose(20, mca_common_cuda_output,
                        "CUDA: entering stage three init");

+    /* Compiled without support or user disabled support */
    if (OPAL_UNLIKELY(!ompi_mpi_cuda_support)) {
        opal_output_verbose(20, mca_common_cuda_output,
                            "CUDA: No mpi cuda support, exiting stage three init");
+        stage_three_init_complete = true;
+        OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
        return OMPI_ERROR;
    }

-    if (OPAL_LIKELY(common_cuda_initialized)) {
-        opal_output_verbose(20, mca_common_cuda_output,
-                            "CUDA: Stage three already complete, exiting stage three init");
-        return OMPI_SUCCESS;
+    /* In case another thread snuck in and completed the initialization */
+    if (true == stage_three_init_complete) {
+        if (common_cuda_initialized) {
+            opal_output_verbose(20, mca_common_cuda_output,
+                                "CUDA: Stage three already complete, exiting stage three init");
+            OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
+            return OMPI_SUCCESS;
+        } else {
+            opal_output_verbose(20, mca_common_cuda_output,
+                                "CUDA: Stage three already complete, failed during the init");
+            OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
+            return OMPI_ERROR;
+        }
    }

    /* Check to see if this process is running in a CUDA context.  If
@ -566,6 +581,7 @@ static int mca_common_cuda_stage_three_init(void)
     * CUDA support.
     */
    if (false == mca_common_cuda_enabled) {
+        OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
        return OMPI_ERROR;
    }

@ -578,11 +594,12 @@ static int mca_common_cuda_stage_three_init(void)
        cuda_event_ipc_first_avail = 0;
        cuda_event_ipc_first_used = 0;

-        cuda_event_ipc_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
+        cuda_event_ipc_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
        if (NULL == cuda_event_ipc_array) {
            opal_show_help("help-mpi-common-cuda.txt", "No memory",
                           true, ompi_process_info.nodename);
-            return OMPI_ERROR;
+            rc = OMPI_ERROR;
+            goto cleanup_and_error;
        }

        /* Create the events since they can be reused. */
@ -591,7 +608,8 @@ static int mca_common_cuda_stage_three_init(void)
            if (CUDA_SUCCESS != res) {
                opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
                               true, ompi_process_info.nodename, res);
-                return OMPI_ERROR;
+                rc = OMPI_ERROR;
+                goto cleanup_and_error;
            }
        }

@ -602,7 +620,8 @@ static int mca_common_cuda_stage_three_init(void)
        if (NULL == cuda_event_ipc_frag_array) {
            opal_show_help("help-mpi-common-cuda.txt", "No memory",
                           true, ompi_process_info.nodename);
-            return OMPI_ERROR;
+            rc = OMPI_ERROR;
+            goto cleanup_and_error;
        }
    }

@ -616,11 +635,12 @@ static int mca_common_cuda_stage_three_init(void)
        cuda_event_dtoh_first_avail = 0;
        cuda_event_dtoh_first_used = 0;

-        cuda_event_dtoh_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
+        cuda_event_dtoh_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
        if (NULL == cuda_event_dtoh_array) {
            opal_show_help("help-mpi-common-cuda.txt", "No memory",
                           true, ompi_process_info.nodename);
-            return OMPI_ERROR;
+            rc = OMPI_ERROR;
+            goto cleanup_and_error;
        }

        /* Create the events since they can be reused. */
@ -629,7 +649,8 @@ static int mca_common_cuda_stage_three_init(void)
            if (CUDA_SUCCESS != res) {
                opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
                               true, ompi_process_info.nodename, res);
-                return OMPI_ERROR;
+                rc = OMPI_ERROR;
+                goto cleanup_and_error;
            }
        }

@ -640,7 +661,8 @@ static int mca_common_cuda_stage_three_init(void)
        if (NULL == cuda_event_dtoh_frag_array) {
            opal_show_help("help-mpi-common-cuda.txt", "No memory",
                           true, ompi_process_info.nodename);
-            return OMPI_ERROR;
+            rc = OMPI_ERROR;
+            goto cleanup_and_error;
        }

        /* Set up an array to store outstanding async htod events.  Used on the
@ -651,11 +673,12 @@ static int mca_common_cuda_stage_three_init(void)
        cuda_event_htod_first_avail = 0;
        cuda_event_htod_first_used = 0;

-        cuda_event_htod_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
+        cuda_event_htod_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
        if (NULL == cuda_event_htod_array) {
            opal_show_help("help-mpi-common-cuda.txt", "No memory",
                           true, ompi_process_info.nodename);
-            return OMPI_ERROR;
+            rc = OMPI_ERROR;
+            goto cleanup_and_error;
        }

        /* Create the events since they can be reused. */
@ -664,7 +687,8 @@ static int mca_common_cuda_stage_three_init(void)
            if (CUDA_SUCCESS != res) {
                opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
                               true, ompi_process_info.nodename, res);
-                return OMPI_ERROR;
+                rc = OMPI_ERROR;
+                goto cleanup_and_error;
            }
        }

@ -675,7 +699,8 @@ static int mca_common_cuda_stage_three_init(void)
        if (NULL == cuda_event_htod_frag_array) {
            opal_show_help("help-mpi-common-cuda.txt", "No memory",
                           true, ompi_process_info.nodename);
-            return OMPI_ERROR;
+            rc = OMPI_ERROR;
+            goto cleanup_and_error;
        }
    }

@ -707,7 +732,8 @@ static int mca_common_cuda_stage_three_init(void)
    if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
        opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
                       true, ompi_process_info.nodename, res);
-        return OMPI_ERROR;
+        rc = OMPI_ERROR;
+        goto cleanup_and_error;
    }

    /* Create stream for use in dtoh asynchronous copies */
@ -715,8 +741,8 @@ static int mca_common_cuda_stage_three_init(void)
    if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
        opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
                       true, ompi_process_info.nodename, res);
-        return OMPI_ERROR;
-
+        rc = OMPI_ERROR;
+        goto cleanup_and_error;
    }

    /* Create stream for use in htod asynchronous copies */
@ -724,8 +750,8 @@ static int mca_common_cuda_stage_three_init(void)
    if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
        opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
                       true, ompi_process_info.nodename, res);
-        return OMPI_ERROR;
-
+        rc = OMPI_ERROR;
+        goto cleanup_and_error;
    }

    if (mca_common_cuda_cumemcpy_async) {
@ -734,14 +760,64 @@ static int mca_common_cuda_stage_three_init(void)
        if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
            opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
                           true, ompi_process_info.nodename, res);
-            return OMPI_ERROR;
+            rc = OMPI_ERROR;
+            goto cleanup_and_error;
        }
    }

    opal_output_verbose(30, mca_common_cuda_output,
                        "CUDA: initialized");
    common_cuda_initialized = true;
+    stage_three_init_complete = true;
+    OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
    return OMPI_SUCCESS;
+
+    /* If we are here, something went wrong.  Cleanup and return an error. */
+ cleanup_and_error:
+    for (i = 0; i < cuda_event_max; i++) {
+        if (NULL != cuda_event_ipc_array[i]) {
+            cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
+        }
+        if (NULL != cuda_event_htod_array[i]) {
+            cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
+        }
+        if (NULL != cuda_event_dtoh_array[i]) {
+            cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
+        }
+    }
+    if (NULL != cuda_event_ipc_array) {
+        free(cuda_event_ipc_array);
+    }
+    if (NULL != cuda_event_htod_array) {
+        free(cuda_event_htod_array);
+    }
+    if (NULL != cuda_event_dtoh_array) {
+        free(cuda_event_dtoh_array);
+    }
+    if (NULL != cuda_event_ipc_frag_array) {
+        free(cuda_event_ipc_frag_array);
+    }
+    if (NULL != cuda_event_htod_frag_array) {
+        free(cuda_event_ipc_frag_array);
+    }
+    if (NULL != cuda_event_dtoh_frag_array) {
+        free(cuda_event_dtoh_frag_array);
+    }
+    if (NULL != ipcStream) {
+        cuFunc.cuStreamDestroy(ipcStream);
+    }
+    if (NULL != dtohStream) {
+        cuFunc.cuStreamDestroy(dtohStream);
+    }
+    if (NULL != htodStream) {
+        cuFunc.cuStreamDestroy(htodStream);
+    }
+    if (NULL != memcpyStream) {
+        cuFunc.cuStreamDestroy(memcpyStream);
+    }
+    stage_three_init_complete = true;
+    OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
+    return rc;
 }