cuda: convert to opal_dl interface
Этот коммит содержится в:
родитель
c683500a29
Коммит
1995f6beba
@ -10,7 +10,7 @@ dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
dnl University of Stuttgart. All rights reserved.
|
||||
dnl Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
dnl All rights reserved.
|
||||
dnl Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
|
||||
dnl Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
|
||||
dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
|
||||
@ -72,10 +72,12 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"],
|
||||
opal_cuda_incdir="$with_cuda/include"
|
||||
AC_MSG_RESULT([found ($opal_cuda_incdir/cuda.h)])])])])])
|
||||
|
||||
# We cannot have CUDA support without dlopen support. Check for that and
|
||||
# error out if the user has also set --disable-dlopen.
|
||||
AS_IF([test "$enable_dlopen" = "no" && test "$opal_check_cuda_happy" = "yes"],
|
||||
[AC_MSG_ERROR([--with-cuda cannot be used with --disable-dlopen. Remove one of them and reconfigure.])])
|
||||
dnl We cannot have CUDA support without dlopen support. HOWEVER, at
|
||||
dnl this point in configure, we can't know whether the DL framework
|
||||
dnl has been configured or not yet (it likely hasn't, since CUDA is a
|
||||
dnl common framework, and likely configured first). So we have to
|
||||
dnl defer this check until later (see the OPAL_CHECK_CUDA_AFTER_OPAL_DL m4
|
||||
dnl macro, below). :-(
|
||||
|
||||
# If we have CUDA support, check to see if we have CUDA 4.1 support
|
||||
AS_IF([test "$opal_check_cuda_happy"="yes"],
|
||||
@ -142,3 +144,21 @@ AC_DEFINE_UNQUOTED([OPAL_CUDA_GDR_SUPPORT],$CUDA_VERSION_60_OR_GREATER,
|
||||
[Whether we have CUDA GDR support available])
|
||||
|
||||
])
|
||||
|
||||
dnl
|
||||
dnl CUDA support requires DL support (it dynamically opens the CUDA
|
||||
dnl library at run time). But we do not check for OPAL DL support
|
||||
dnl until after the initial OPAL_CHECK_CUDA is called.  So put the
|
||||
dnl CUDA+DL check in a separate macro that can be called after the DL MCA
|
||||
dnl framework checks in the top-level configure.ac.
|
||||
dnl
|
||||
AC_DEFUN([OPAL_CHECK_CUDA_AFTER_OPAL_DL],[
|
||||
|
||||
# We cannot have CUDA support without OPAL DL support. Error out
|
||||
# if the user wants CUDA but we do not have OPAL DL support.
|
||||
AS_IF([test $OPAL_HAVE_DL_SUPPORT -eq 0 && \
|
||||
test "$opal_check_cuda_happy" = "yes"],
|
||||
[AC_MSG_WARN([--with-cuda was specified, but dlopen support is disabled.])
|
||||
AC_MSG_WARN([You must reconfigure Open MPI with dlopen ("dl") support.])
|
||||
AC_MSG_ERROR([Cannot continue.])])
|
||||
])
|
||||
|
12
configure.ac
12
configure.ac
@ -1157,6 +1157,18 @@ m4_ifdef([project_ompi], [OMPI_REQUIRE_ENDPOINT_TAG_FINI])
|
||||
# checkpoint results
|
||||
AC_CACHE_SAVE
|
||||
|
||||
##################################
|
||||
# CUDA: part two
|
||||
##################################
|
||||
|
||||
# This is somewhat gross to have a configure check for a common MCA
|
||||
# component outside of the normal MCA checks, but this check must come
|
||||
# after the opal DL MCA checks have been done.  Someday this could perhaps
|
||||
# be done better by having some kind of "run this check at the end of
|
||||
# all other MCA checks" hook...?
|
||||
|
||||
OPAL_CHECK_CUDA_AFTER_OPAL_DL
|
||||
|
||||
##################################
|
||||
# MPI Extended Interfaces
|
||||
##################################
|
||||
|
@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -33,13 +34,13 @@
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/datatype/opal_datatype_cuda.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/lt_interface.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "opal/mca/timer/base/base.h"
|
||||
#include "opal/mca/dl/base/base.h"
|
||||
|
||||
#include "common_cuda.h"
|
||||
|
||||
@ -55,12 +56,15 @@
|
||||
|
||||
#define OPAL_CUDA_DLSYM(libhandle, funcName) \
|
||||
do { \
|
||||
*(void **)(&cuFunc.funcName) = opal_lt_dlsym(libhandle, STRINGIFY(funcName)); \
|
||||
if (NULL == cuFunc.funcName) { \
|
||||
char *err_msg; \
|
||||
void *ptr; \
|
||||
if (OPAL_SUCCESS != \
|
||||
opal_dl_lookup(libhandle, STRINGIFY(funcName), &ptr, &err_msg)) { \
|
||||
opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true, \
|
||||
STRINGIFY(funcName), opal_lt_dlerror()); \
|
||||
STRINGIFY(funcName), err_msg); \
|
||||
return 1; \
|
||||
} else { \
|
||||
*(void **)(&cuFunc.funcName) = ptr; \
|
||||
opal_output_verbose(15, mca_common_cuda_output, \
|
||||
"CUDA: successful dlsym of %s", \
|
||||
STRINGIFY(funcName)); \
|
||||
@ -185,7 +189,7 @@ static int cuda_event_dtoh_most = 0;
|
||||
static int cuda_event_htod_most = 0;
|
||||
|
||||
/* Handle to libcuda.so */
|
||||
opal_lt_dlhandle libcuda_handle = NULL;
|
||||
opal_dl_handle_t *libcuda_handle = NULL;
|
||||
|
||||
/* Unused variable that we register at init time and unregister at fini time.
|
||||
* This is used to detect if user has done a device reset prior to MPI_Finalize.
|
||||
@ -233,9 +237,7 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
|
||||
*/
|
||||
int mca_common_cuda_stage_one_init(void)
|
||||
{
|
||||
opal_lt_dladvise advise;
|
||||
int retval, i, j;
|
||||
int advise_support = 1;
|
||||
char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL};
|
||||
char *searchpaths[] = {"", "/usr/lib64", NULL};
|
||||
char **errmsgs = NULL;
|
||||
@ -339,120 +341,76 @@ int mca_common_cuda_stage_one_init(void)
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (0 != (retval = opal_lt_dlinit())) {
|
||||
if (OPAL_ERR_NOT_SUPPORTED == retval) {
|
||||
opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
|
||||
} else {
|
||||
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
|
||||
"opal_lt_dlinit", retval, opal_lt_dlerror());
|
||||
}
|
||||
if (!OPAL_HAVE_DL_SUPPORT) {
|
||||
opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Initialize the lt_dladvise structure. If this does not work, we can
|
||||
* proceed without the support. Things should still work. */
|
||||
if (0 != (retval = opal_lt_dladvise_init(&advise))) {
|
||||
if (OPAL_ERR_NOT_SUPPORTED == retval) {
|
||||
advise_support = 0;
|
||||
} else {
|
||||
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
|
||||
"opal_lt_dladvise_init", retval, opal_lt_dlerror());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now walk through all the potential names libcuda and find one
|
||||
* that works. If it does, all is good. If not, print out all
|
||||
* the messages about why things failed. This code was careful
|
||||
* to try and save away all error messages if the loading ultimately
|
||||
* failed to help with debugging.
|
||||
* failed to help with debugging.
|
||||
*
|
||||
* NOTE: On the first loop we just utilize the default loading
|
||||
* paths from the system. For the second loop, set /usr/lib64 to
|
||||
* the search path and try again. This is done to handle the case
|
||||
* where we have both 32 and 64 bit libcuda.so libraries installed.
|
||||
* Even when running in 64-bit mode, the /usr/lib directory
|
||||
* is searched first and we may find a 32-bit libcuda.so.1 library.
|
||||
* Loading of this library will fail as libtool does not handle having
|
||||
* the wrong ABI in the search path (unlike ld or ld.so). Note that
|
||||
* we only set this search path after the original search. This is
|
||||
* so that LD_LIBRARY_PATH and run path settings are respected.
|
||||
* Setting this search path overrides them (rather than being appended). */
|
||||
if (advise_support) {
|
||||
if (0 != (retval = opal_lt_dladvise_global(&advise))) {
|
||||
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
|
||||
"opal_lt_dladvise_global", retval, opal_lt_dlerror());
|
||||
opal_lt_dladvise_destroy(&advise);
|
||||
return 1;
|
||||
}
|
||||
j = 0;
|
||||
while (searchpaths[j] != NULL) {
|
||||
/* Set explicit search path if entry is not empty string */
|
||||
if (strcmp("", searchpaths[j])) {
|
||||
opal_lt_dlsetsearchpath(searchpaths[j]);
|
||||
}
|
||||
i = 0;
|
||||
while (cudalibs[i] != NULL) {
|
||||
const char *str;
|
||||
libcuda_handle = opal_lt_dlopenadvise(cudalibs[i], advise);
|
||||
if (NULL == libcuda_handle) {
|
||||
str = opal_lt_dlerror();
|
||||
if (NULL != str) {
|
||||
opal_argv_append(&errsize, &errmsgs, str);
|
||||
} else {
|
||||
opal_argv_append(&errsize, &errmsgs, "lt_dlerror() returned NULL.");
|
||||
}
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: Library open error: %s",
|
||||
errmsgs[errsize-1]);
|
||||
} else {
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: Library successfully opened %s",
|
||||
cudalibs[i]);
|
||||
stage_one_init_passed = true;
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
if (true == stage_one_init_passed) break; /* Break out of outer loop */
|
||||
j++;
|
||||
}
|
||||
opal_lt_dladvise_destroy(&advise);
|
||||
} else {
|
||||
j = 0;
|
||||
/* No lt_dladvise support. This should rarely happen. */
|
||||
while (searchpaths[j] != NULL) {
|
||||
/* Set explicit search path if entry is not empty string */
|
||||
if (strcmp("", searchpaths[j])) {
|
||||
opal_lt_dlsetsearchpath(searchpaths[j]);
|
||||
}
|
||||
i = 0;
|
||||
while (cudalibs[i] != NULL) {
|
||||
const char *str;
|
||||
libcuda_handle = opal_lt_dlopen(cudalibs[i]);
|
||||
if (NULL == libcuda_handle) {
|
||||
str = opal_lt_dlerror();
|
||||
if (NULL != str) {
|
||||
opal_argv_append(&errsize, &errmsgs, str);
|
||||
} else {
|
||||
opal_argv_append(&errsize, &errmsgs, "lt_dlerror() returned NULL.");
|
||||
}
|
||||
* where we have both 32 and 64 bit libcuda.so libraries
|
||||
* installed. Even when running in 64-bit mode, the /usr/lib
|
||||
* directory is searched first and we may find a 32-bit
|
||||
* libcuda.so.1 library. Loading of this library will fail as the
|
||||
* OPAL DL framework does not handle having the wrong ABI in the
|
||||
* search path (unlike ld or ld.so). Note that we only set this
|
||||
* search path after the original search. This is so that
|
||||
* LD_LIBRARY_PATH and run path settings are respected. Setting
|
||||
* this search path overrides them (rather than being
|
||||
* appended). */
|
||||
j = 0;
|
||||
while (searchpaths[j] != NULL) {
|
||||
while (cudalibs[i] != NULL) {
|
||||
char *filename;
|
||||
char *str;
|
||||
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: Library open error: %s",
|
||||
errmsgs[errsize-1]);
|
||||
|
||||
} else {
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: Library successfully opened %s",
|
||||
cudalibs[i]);
|
||||
stage_one_init_passed = true;
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
/* If there's a non-empty search path, prepend it
|
||||
to the library filename */
|
||||
if (strlen(searchpaths[j]) > 0) {
|
||||
asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]);
|
||||
} else {
|
||||
filename = strdup(cudalibs[i]);
|
||||
}
|
||||
if (true == stage_one_init_passed) break; /* Break out of outer loop */
|
||||
j++;
|
||||
if (NULL == filename) {
|
||||
opal_show_help("help-mpi-common-cuda.txt", "No memory",
|
||||
true, OPAL_PROC_MY_HOSTNAME);
|
||||
return 1;
|
||||
}
|
||||
|
||||
retval = opal_dl_open(filename, false, false,
|
||||
&libcuda_handle, &str);
|
||||
if (OPAL_SUCCESS != retval || NULL == libcuda_handle) {
|
||||
if (NULL != str) {
|
||||
opal_argv_append(&errsize, &errmsgs, str);
|
||||
} else {
|
||||
opal_argv_append(&errsize, &errmsgs,
|
||||
"opal_dl_open() returned NULL.");
|
||||
}
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: Library open error: %s",
|
||||
errmsgs[errsize-1]);
|
||||
} else {
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: Library successfully opened %s",
|
||||
cudalibs[i]);
|
||||
stage_one_init_passed = true;
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
|
||||
free(filename);
|
||||
}
|
||||
if (true == stage_one_init_passed) {
|
||||
break; /* Break out of outer loop */
|
||||
}
|
||||
j++;
|
||||
}
|
||||
|
||||
if (true != stage_one_init_passed) {
|
||||
@ -916,8 +874,7 @@ void mca_common_cuda_fini(void)
|
||||
OBJ_DESTRUCT(&common_cuda_dtoh_lock);
|
||||
OBJ_DESTRUCT(&common_cuda_ipc_lock);
|
||||
if (NULL != libcuda_handle) {
|
||||
opal_lt_dlclose(libcuda_handle);
|
||||
opal_lt_dlexit();
|
||||
opal_dl_close(libcuda_handle);
|
||||
}
|
||||
|
||||
opal_output_verbose(20, mca_common_cuda_output,
|
||||
|
@ -1,10 +1,11 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2011-2015 NVIDIA. All rights reserved.
|
||||
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[cuCtxGetCurrent failed not initialized]
|
||||
@ -152,14 +153,6 @@ Open MPI was compiled without dynamic library support (e.g., with the
|
||||
|
||||
If you need CUDA support, reconfigure Open MPI with dynamic library support enabled.
|
||||
#
|
||||
[unknown ltdl error]
|
||||
While attempting to load the supporting libcuda.so library, an error
|
||||
occurred. This really should rarely happen. Please notify the Open
|
||||
MPI developers.
|
||||
Function: %s
|
||||
Return Value: %d
|
||||
Error string: %s
|
||||
#
|
||||
[dlopen failed]
|
||||
The library attempted to open the following supporting CUDA libraries,
|
||||
but each of them failed. CUDA-aware support is disabled.
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user