1
1

Fix initialization and cleanup code for CUDA-aware code. Eliminates all resource leaks.

This commit was SVN r32512.
This commit is contained in:
Rolf vandeVaart 2014-08-12 19:41:46 +00:00
parent e974bec57e
commit c53c981506
7 changed files with 150 additions and 76 deletions

View file

@ -222,6 +222,10 @@ static int btl_openib_component_open(void)
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
mca_btl_openib_component.memory_registration_verbose = -1;
#if OPAL_CUDA_SUPPORT
mca_common_cuda_stage_one_init();
#endif /* OPAL_CUDA_SUPPORT */
return OPAL_SUCCESS;
}
@ -279,6 +283,10 @@ static int btl_openib_component_close(void)
/* close memory registration debugging output */
opal_output_close (mca_btl_openib_component.memory_registration_verbose);
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */
return rc;
}

View file

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -49,6 +50,10 @@ mca_btl_sm_la_SOURCES = $(libmca_btl_sm_la_sources)
mca_btl_sm_la_LDFLAGS = -module -avoid-version
mca_btl_sm_la_LIBADD = \
$(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/libmca_common_sm.la
if OPAL_cuda_support
mca_btl_sm_la_LIBADD = \
$(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/libmca_common_cuda.la
endif
mca_btl_sm_la_CPPFLAGS = $(btl_sm_CPPFLAGS)
noinst_LTLIBRARIES = $(component_noinst)

View file

@ -14,7 +14,7 @@
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
@ -60,6 +60,9 @@
#include "btl_sm.h"
#include "btl_sm_frag.h"
#include "btl_sm_fifo.h"
#if OPAL_CUDA_SUPPORT
#include "opal/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_SUPPORT */
static int mca_btl_sm_component_open(void);
static int mca_btl_sm_component_close(void);
@ -356,6 +359,10 @@ static int mca_btl_sm_component_close(void)
CLEANUP:
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */
/* return */
return return_value;
}
@ -878,6 +885,10 @@ mca_btl_sm_component_init(int *num_btls,
}
#endif /* OPAL_BTL_SM_HAVE_CMA */
#if OPAL_CUDA_SUPPORT
mca_common_cuda_stage_one_init();
#endif /* OPAL_CUDA_SUPPORT */
return btls;
#if OPAL_BTL_SM_HAVE_KNEM

View file

@ -288,6 +288,10 @@ static int mca_btl_smcuda_component_close(void)
CLEANUP:
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */
/* return */
return return_value;
}
@ -931,6 +935,7 @@ mca_btl_smcuda_component_init(int *num_btls,
/* Register a smcuda control function to help setup IPC support */
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
mca_common_cuda_stage_one_init();
#endif /* OPAL_CUDA_SUPPORT */
return btls;

View file

@ -15,7 +15,7 @@
* Copyright (c) 2009 Oak Ridge National Laboratory
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -381,6 +381,10 @@ static int mca_btl_tcp_component_close(void)
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */
return OPAL_SUCCESS;
}

View file

@ -104,7 +104,7 @@ struct cudaFunctionTable {
typedef struct cudaFunctionTable cudaFunctionTable_t;
cudaFunctionTable_t cuFunc;
static bool stage_one_init_complete = false;
static int stage_one_init_ref_count = 0;
static bool stage_three_init_complete = false;
static bool common_cuda_initialized = false;
static int mca_common_cuda_verbose;
@ -155,15 +155,15 @@ static int mca_common_cuda_cumemcpy_timing;
/* Array of CUDA events to be queried for IPC stream, sending side and
* receiving side. */
CUevent *cuda_event_ipc_array;
CUevent *cuda_event_dtoh_array;
CUevent *cuda_event_htod_array;
CUevent *cuda_event_ipc_array = NULL;
CUevent *cuda_event_dtoh_array = NULL;
CUevent *cuda_event_htod_array = NULL;
/* Array of fragments currently being moved by cuda async non-blocking
* operations */
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array;
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array;
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array;
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL;
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL;
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL;
/* First free/available location in cuda_event_status_array */
int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;
@ -181,7 +181,7 @@ static int cuda_event_dtoh_most = 0;
static int cuda_event_htod_most = 0;
/* Handle to libcuda.so */
opal_lt_dlhandle libcuda_handle;
opal_lt_dlhandle libcuda_handle = NULL;
#define CUDA_COMMON_TIMING 0
#if OPAL_ENABLE_DEBUG
@ -211,8 +211,9 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
/**
* This is the first stage of initialization. This function is
* triggered when there are memory registration requests from various
* BTLs. This function will register some mca variables and then open
* called explicitly by any BTLs that can support CUDA-aware.
* It is called during the component open phase of initialization.
* This function will register some mca variables and then open
* and load the symbols needed from the CUDA driver library. Look for
* the SONAME of the library which is libcuda.so.1. In most cases,
* this will result in the library found. However, there are some
@ -232,10 +233,14 @@ int mca_common_cuda_stage_one_init(void)
int errsize;
bool stage_one_init_passed = false;
if (true == stage_one_init_complete) {
return 0;
stage_one_init_ref_count++;
if (stage_one_init_ref_count > 1) {
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: stage_one_init_ref_count is now %d, no need to init",
stage_one_init_ref_count);
return OPAL_SUCCESS;
}
stage_one_init_complete = true;
OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
@ -313,6 +318,10 @@ int mca_common_cuda_stage_one_init(void)
mca_common_cuda_output = opal_output_open(NULL);
opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: stage_one_init_ref_count is now %d, initializing",
stage_one_init_ref_count);
/* First check if the support is enabled. In the case that the user has
* turned it off, we do not need to continue with any CUDA specific
* initialization. Do this after MCA parameter registration. */
@ -351,7 +360,7 @@ int mca_common_cuda_stage_one_init(void)
* paths from the system. For the second loop, set /usr/lib64 to
* the search path and try again. This is done to handle the case
* where we have both 32 and 64 bit libcuda.so libraries installed.
* Even when running in 64-bit mode, the /usr/lib direcotry
* Even when running in 64-bit mode, the /usr/lib directory
* is searched first and we may find a 32-bit libcuda.so.1 library.
* Loading of this library will fail as libtool does not handle having
* the wrong ABI in the search path (unlike ld or ld.so). Note that
@ -515,7 +524,8 @@ static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *fta
* This is the last phase of initialization. This is triggered when we examine
* a buffer pointer and determine it is a GPU buffer. We then assume the user
* has selected their GPU and we can go ahead with all the CUDA related
* initializations.
* initializations. If we get an error, just return. Cleanup of resources
* will happen when fini is called.
*/
static int mca_common_cuda_stage_three_init(void)
{
@ -597,8 +607,6 @@ static int mca_common_cuda_stage_three_init(void)
#if OPAL_CUDA_SUPPORT_41
if (true == mca_common_cuda_enabled) {
/* Set up an array to store outstanding IPC async copy events */
cuda_event_ipc_array = NULL;
cuda_event_ipc_frag_array = NULL;
cuda_event_ipc_num_used = 0;
cuda_event_ipc_first_avail = 0;
cuda_event_ipc_first_used = 0;
@ -638,8 +646,6 @@ static int mca_common_cuda_stage_three_init(void)
if (true == mca_common_cuda_enabled) {
/* Set up an array to store outstanding async dtoh events. Used on the
* sending side for asynchronous copies. */
cuda_event_dtoh_array = NULL;
cuda_event_dtoh_frag_array = NULL;
cuda_event_dtoh_num_used = 0;
cuda_event_dtoh_first_avail = 0;
cuda_event_dtoh_first_used = 0;
@ -649,7 +655,7 @@ static int mca_common_cuda_stage_three_init(void)
opal_show_help("help-mpi-common-cuda.txt", "No memory",
true, OPAL_PROC_MY_HOSTNAME);
rc = OPAL_ERROR;
goto cleanup_and_error;
goto cleanup_and_error;
}
/* Create the events since they can be reused. */
@ -676,8 +682,6 @@ static int mca_common_cuda_stage_three_init(void)
/* Set up an array to store outstanding async htod events. Used on the
* receiving side for asynchronous copies. */
cuda_event_htod_array = NULL;
cuda_event_htod_frag_array = NULL;
cuda_event_htod_num_used = 0;
cuda_event_htod_first_avail = 0;
cuda_event_htod_first_used = 0;
@ -784,53 +788,100 @@ static int mca_common_cuda_stage_three_init(void)
/* If we are here, something went wrong. Cleanup and return an error. */
cleanup_and_error:
for (i = 0; i < cuda_event_max; i++) {
if (NULL != cuda_event_ipc_array[i]) {
cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
}
if (NULL != cuda_event_htod_array[i]) {
cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
}
if (NULL != cuda_event_dtoh_array[i]) {
cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
}
}
if (NULL != cuda_event_ipc_array) {
free(cuda_event_ipc_array);
}
if (NULL != cuda_event_htod_array) {
free(cuda_event_htod_array);
}
if (NULL != cuda_event_dtoh_array) {
free(cuda_event_dtoh_array);
}
if (NULL != cuda_event_ipc_frag_array) {
free(cuda_event_ipc_frag_array);
}
if (NULL != cuda_event_htod_frag_array) {
free(cuda_event_ipc_frag_array);
}
if (NULL != cuda_event_dtoh_frag_array) {
free(cuda_event_dtoh_frag_array);
}
if (NULL != ipcStream) {
cuFunc.cuStreamDestroy(ipcStream);
}
if (NULL != dtohStream) {
cuFunc.cuStreamDestroy(dtohStream);
}
if (NULL != htodStream) {
cuFunc.cuStreamDestroy(htodStream);
}
if (NULL != memcpyStream) {
cuFunc.cuStreamDestroy(memcpyStream);
}
opal_atomic_mb(); /* Make sure next statement does not get reordered */
stage_three_init_complete = true;
OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
return rc;
}
/**
* Cleanup all CUDA resources.
*
* Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm
* mpool. Looks like with the memory pool from openib (grdma), the unregistering is
* called as the free list is destructed. Not true for the sm mpool. This means we
* are currently still leaking some host memory we registered with CUDA.
*/
void mca_common_cuda_fini(void)
{
int i;
/* Nothing to do: stage one init never ran, or fini has already
* balanced every init call. */
if (0 == stage_one_init_ref_count) {
opal_output_verbose(20, mca_common_cuda_output,
"CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete",
stage_one_init_ref_count);
return;
}
/* Last remaining reference: actually tear everything down. */
if (1 == stage_one_init_ref_count) {
opal_output_verbose(20, mca_common_cuda_output,
"CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up",
stage_one_init_ref_count);
/* Destroy outstanding events and free their tracking arrays.
* Each array is NULL unless the later (stage three) init
* allocated it, so guard every dereference. */
if (NULL != cuda_event_ipc_array) {
for (i = 0; i < cuda_event_max; i++) {
if (NULL != cuda_event_ipc_array[i]) {
cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
}
}
free(cuda_event_ipc_array);
}
if (NULL != cuda_event_htod_array) {
for (i = 0; i < cuda_event_max; i++) {
if (NULL != cuda_event_htod_array[i]) {
cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
}
}
free(cuda_event_htod_array);
}
if (NULL != cuda_event_dtoh_array) {
for (i = 0; i < cuda_event_max; i++) {
if (NULL != cuda_event_dtoh_array[i]) {
cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
}
}
free(cuda_event_dtoh_array);
}
/* Fragment arrays hold only borrowed descriptor pointers, so a
* plain free() of each array is sufficient. */
if (NULL != cuda_event_ipc_frag_array) {
free(cuda_event_ipc_frag_array);
}
if (NULL != cuda_event_htod_frag_array) {
free(cuda_event_htod_frag_array);
}
if (NULL != cuda_event_dtoh_frag_array) {
free(cuda_event_dtoh_frag_array);
}
/* Destroy the CUDA streams (also created during later-stage init;
* NULL if that init never ran or failed early). */
if (NULL != ipcStream) {
cuFunc.cuStreamDestroy(ipcStream);
}
if (NULL != dtohStream) {
cuFunc.cuStreamDestroy(dtohStream);
}
if (NULL != htodStream) {
cuFunc.cuStreamDestroy(htodStream);
}
if (NULL != memcpyStream) {
cuFunc.cuStreamDestroy(memcpyStream);
}
/* Destruct the mutexes constructed in stage one init. */
OBJ_DESTRUCT(&common_cuda_init_lock);
OBJ_DESTRUCT(&common_cuda_htod_lock);
OBJ_DESTRUCT(&common_cuda_dtoh_lock);
OBJ_DESTRUCT(&common_cuda_ipc_lock);
/* Close the dynamically loaded libcuda handle and shut down the
* libtool dl layer that opened it. */
if (NULL != libcuda_handle) {
opal_lt_dlclose(libcuda_handle);
opal_lt_dlexit();
}
/* Close the verbose-output stream last, after all of the verbose
* messages above have been emitted. */
opal_output_close(mca_common_cuda_output);
} else {
/* Other components still hold references; leave everything alive. */
opal_output_verbose(20, mca_common_cuda_output,
"CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use",
stage_one_init_ref_count);
}
/* Balance the increment performed by mca_common_cuda_stage_one_init(). */
stage_one_init_ref_count--;
}
/**
* Call the CUDA register function so we pin the memory in the CUDA
@ -843,17 +894,6 @@ void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
if (!opal_cuda_support)
return;
/* Registering memory during BTL initialization will be the first call
* into the cuda common code, so this is where we do the first
* initialization function. If the first stage fails, then disable
* support and return. */
if (!stage_one_init_complete) {
if (0 != mca_common_cuda_stage_one_init()) {
opal_cuda_support = 0;
return;
}
}
if (!common_cuda_initialized) {
OPAL_THREAD_LOCK(&common_cuda_init_lock);
if (!common_cuda_initialized) {

View file

@ -74,6 +74,7 @@ OPAL_DECLSPEC int mca_common_cuda_get_device(int *devicenum);
OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void);
OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
OPAL_DECLSPEC void mca_common_cuda_fini(void);
#if OPAL_CUDA_GDR_SUPPORT
OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);