1
1

cuda: convert to opal_dl interface

Этот коммит содержится в:
Jeff Squyres 2015-02-19 13:59:44 -08:00
родитель c683500a29
Коммит 1995f6beba
4 изменённых файлов: 107 добавлений и 125 удалений

Просмотреть файл

@ -10,7 +10,7 @@ dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
dnl University of Stuttgart. All rights reserved. dnl University of Stuttgart. All rights reserved.
dnl Copyright (c) 2004-2005 The Regents of the University of California. dnl Copyright (c) 2004-2005 The Regents of the University of California.
dnl All rights reserved. dnl All rights reserved.
dnl Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. dnl Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
dnl Copyright (c) 2009 IBM Corporation. All rights reserved. dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
@ -72,10 +72,12 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"],
opal_cuda_incdir="$with_cuda/include" opal_cuda_incdir="$with_cuda/include"
AC_MSG_RESULT([found ($opal_cuda_incdir/cuda.h)])])])])]) AC_MSG_RESULT([found ($opal_cuda_incdir/cuda.h)])])])])])
# We cannot have CUDA support without dlopen support. Check for that and dnl We cannot have CUDA support without dlopen support. HOWEVER, at
# error out if the user has also set --disable-dlopen. dnl this point in configure, we can't know whether the DL framework
AS_IF([test "$enable_dlopen" = "no" && test "$opal_check_cuda_happy" = "yes"], dnl has been configured or not yet (it likely hasn't, since CUDA is a
[AC_MSG_ERROR([--with-cuda cannot be used with --disable-dlopen. Remove one of them and reconfigure.])]) dnl common framework, and likely configured first). So we have to
dnl defer this check until later (see the OPAL_CHECK_CUDA_AFTER_OPAL_DL m4
dnl macro, below). :-(
# If we have CUDA support, check to see if we have CUDA 4.1 support # If we have CUDA support, check to see if we have CUDA 4.1 support
AS_IF([test "$opal_check_cuda_happy"="yes"], AS_IF([test "$opal_check_cuda_happy"="yes"],
@ -142,3 +144,21 @@ AC_DEFINE_UNQUOTED([OPAL_CUDA_GDR_SUPPORT],$CUDA_VERSION_60_OR_GREATER,
[Whether we have CUDA GDR support available]) [Whether we have CUDA GDR support available])
]) ])
dnl
dnl CUDA support requires DL support (it dynamically opens the CUDA
dnl library at run time). But we do not check for OPAL DL support
dnl until after the initial OPAL_CHECK_CUDA is called. So put the
dnl CUDA+DL check in a separate macro that can be called after the DL MCA
dnl framework checks in the top-level configure.ac.
dnl
AC_DEFUN([OPAL_CHECK_CUDA_AFTER_OPAL_DL],[
    # We cannot have CUDA support without OPAL DL support.  Error out
    # if the user wants CUDA but we do not have OPAL DL support.
    #
    # OPAL_HAVE_DL_SUPPORT is quoted and compared as a string so that
    # an empty or unset value cannot produce a malformed "test"
    # invocation (which would print a shell error and silently skip
    # this check).
    AS_IF([test "$OPAL_HAVE_DL_SUPPORT" = "0" && \
           test "$opal_check_cuda_happy" = "yes"],
          [AC_MSG_WARN([--with-cuda was specified, but dlopen support is disabled.])
           AC_MSG_WARN([You must reconfigure Open MPI with dlopen ("dl") support.])
           AC_MSG_ERROR([Cannot continue.])])
])

Просмотреть файл

@ -1157,6 +1157,18 @@ m4_ifdef([project_ompi], [OMPI_REQUIRE_ENDPOINT_TAG_FINI])
# checkpoint results # checkpoint results
AC_CACHE_SAVE AC_CACHE_SAVE
##################################
# CUDA: part two
##################################
# This is somewhat gross to have a configure check for a common MCA
# component outside of the normal MCA checks, but this check must come
# after the opal DL MCA checks have been done. Someday this could perhaps
# be done better by having some kind of "run this check at the end of
# all other MCA checks" hook...?
OPAL_CHECK_CUDA_AFTER_OPAL_DL
################################## ##################################
# MPI Extended Interfaces # MPI Extended Interfaces
################################## ##################################

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2006 The Regents of the University of California. * Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -33,13 +34,13 @@
#include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h" #include "opal/datatype/opal_datatype_cuda.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/lt_interface.h"
#include "opal/util/show_help.h" #include "opal/util/show_help.h"
#include "opal/util/proc.h" #include "opal/util/proc.h"
#include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/base/base.h"
#include "opal/runtime/opal_params.h" #include "opal/runtime/opal_params.h"
#include "opal/mca/timer/base/base.h" #include "opal/mca/timer/base/base.h"
#include "opal/mca/dl/base/base.h"
#include "common_cuda.h" #include "common_cuda.h"
@ -55,12 +56,15 @@
#define OPAL_CUDA_DLSYM(libhandle, funcName) \ #define OPAL_CUDA_DLSYM(libhandle, funcName) \
do { \ do { \
*(void **)(&cuFunc.funcName) = opal_lt_dlsym(libhandle, STRINGIFY(funcName)); \ char *err_msg; \
if (NULL == cuFunc.funcName) { \ void *ptr; \
if (OPAL_SUCCESS != \
opal_dl_lookup(libhandle, STRINGIFY(funcName), &ptr, &err_msg)) { \
opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true, \ opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true, \
STRINGIFY(funcName), opal_lt_dlerror()); \ STRINGIFY(funcName), err_msg); \
return 1; \ return 1; \
} else { \ } else { \
*(void **)(&cuFunc.funcName) = ptr; \
opal_output_verbose(15, mca_common_cuda_output, \ opal_output_verbose(15, mca_common_cuda_output, \
"CUDA: successful dlsym of %s", \ "CUDA: successful dlsym of %s", \
STRINGIFY(funcName)); \ STRINGIFY(funcName)); \
@ -185,7 +189,7 @@ static int cuda_event_dtoh_most = 0;
static int cuda_event_htod_most = 0; static int cuda_event_htod_most = 0;
/* Handle to libcuda.so */ /* Handle to libcuda.so */
opal_lt_dlhandle libcuda_handle = NULL; opal_dl_handle_t *libcuda_handle = NULL;
/* Unused variable that we register at init time and unregister at fini time. /* Unused variable that we register at init time and unregister at fini time.
* This is used to detect if user has done a device reset prior to MPI_Finalize. * This is used to detect if user has done a device reset prior to MPI_Finalize.
@ -233,9 +237,7 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
*/ */
int mca_common_cuda_stage_one_init(void) int mca_common_cuda_stage_one_init(void)
{ {
opal_lt_dladvise advise;
int retval, i, j; int retval, i, j;
int advise_support = 1;
char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL}; char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL};
char *searchpaths[] = {"", "/usr/lib64", NULL}; char *searchpaths[] = {"", "/usr/lib64", NULL};
char **errmsgs = NULL; char **errmsgs = NULL;
@ -339,120 +341,76 @@ int mca_common_cuda_stage_one_init(void)
return 1; return 1;
} }
if (0 != (retval = opal_lt_dlinit())) { if (!OPAL_HAVE_DL_SUPPORT) {
if (OPAL_ERR_NOT_SUPPORTED == retval) { opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
} else {
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
"opal_lt_dlinit", retval, opal_lt_dlerror());
}
return 1; return 1;
} }
/* Initialize the lt_dladvise structure. If this does not work, we can
* proceed without the support. Things should still work. */
if (0 != (retval = opal_lt_dladvise_init(&advise))) {
if (OPAL_ERR_NOT_SUPPORTED == retval) {
advise_support = 0;
} else {
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
"opal_lt_dladvise_init", retval, opal_lt_dlerror());
return 1;
}
}
/* Now walk through all the potential names libcuda and find one /* Now walk through all the potential names libcuda and find one
* that works. If it does, all is good. If not, print out all * that works. If it does, all is good. If not, print out all
* the messages about why things failed. This code was careful * the messages about why things failed. This code was careful
* to try and save away all error messages if the loading ultimately * to try and save away all error messages if the loading ultimately
* failed to help with debugging. * failed to help with debugging.
*
* NOTE: On the first loop we just utilize the default loading * NOTE: On the first loop we just utilize the default loading
* paths from the system. For the second loop, set /usr/lib64 to * paths from the system. For the second loop, set /usr/lib64 to
* the search path and try again. This is done to handle the case * the search path and try again. This is done to handle the case
* where we have both 32 and 64 bit libcuda.so libraries installed. * where we have both 32 and 64 bit libcuda.so libraries
* Even when running in 64-bit mode, the /usr/lib directory * installed. Even when running in 64-bit mode, the /usr/lib
* is searched first and we may find a 32-bit libcuda.so.1 library. * directory is searched first and we may find a 32-bit
* Loading of this library will fail as libtool does not handle having * libcuda.so.1 library. Loading of this library will fail as the
* the wrong ABI in the search path (unlike ld or ld.so). Note that * OPAL DL framework does not handle having the wrong ABI in the
* we only set this search path after the original search. This is * search path (unlike ld or ld.so). Note that we only set this
* so that LD_LIBRARY_PATH and run path settings are respected. * search path after the original search. This is so that
* Setting this search path overrides them (rather than being appended). */ * LD_LIBRARY_PATH and run path settings are respected. Setting
if (advise_support) { * this search path overrides them (rather than being
if (0 != (retval = opal_lt_dladvise_global(&advise))) { * appended). */
opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true, j = 0;
"opal_lt_dladvise_global", retval, opal_lt_dlerror()); while (searchpaths[j] != NULL) {
opal_lt_dladvise_destroy(&advise); while (cudalibs[i] != NULL) {
return 1; char *filename;
} char *str;
j = 0;
while (searchpaths[j] != NULL) {
/* Set explicit search path if entry is not empty string */
if (strcmp("", searchpaths[j])) {
opal_lt_dlsetsearchpath(searchpaths[j]);
}
i = 0;
while (cudalibs[i] != NULL) {
const char *str;
libcuda_handle = opal_lt_dlopenadvise(cudalibs[i], advise);
if (NULL == libcuda_handle) {
str = opal_lt_dlerror();
if (NULL != str) {
opal_argv_append(&errsize, &errmsgs, str);
} else {
opal_argv_append(&errsize, &errmsgs, "lt_dlerror() returned NULL.");
}
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: Library open error: %s",
errmsgs[errsize-1]);
} else {
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: Library successfully opened %s",
cudalibs[i]);
stage_one_init_passed = true;
break;
}
i++;
}
if (true == stage_one_init_passed) break; /* Break out of outer loop */
j++;
}
opal_lt_dladvise_destroy(&advise);
} else {
j = 0;
/* No lt_dladvise support. This should rarely happen. */
while (searchpaths[j] != NULL) {
/* Set explicit search path if entry is not empty string */
if (strcmp("", searchpaths[j])) {
opal_lt_dlsetsearchpath(searchpaths[j]);
}
i = 0;
while (cudalibs[i] != NULL) {
const char *str;
libcuda_handle = opal_lt_dlopen(cudalibs[i]);
if (NULL == libcuda_handle) {
str = opal_lt_dlerror();
if (NULL != str) {
opal_argv_append(&errsize, &errmsgs, str);
} else {
opal_argv_append(&errsize, &errmsgs, "lt_dlerror() returned NULL.");
}
opal_output_verbose(10, mca_common_cuda_output, /* If there's a non-empty search path, prepend it
"CUDA: Library open error: %s", to the library filename */
errmsgs[errsize-1]); if (strlen(searchpaths[j]) > 0) {
asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]);
} else { } else {
opal_output_verbose(10, mca_common_cuda_output, filename = strdup(cudalibs[i]);
"CUDA: Library successfully opened %s",
cudalibs[i]);
stage_one_init_passed = true;
break;
}
i++;
} }
if (true == stage_one_init_passed) break; /* Break out of outer loop */ if (NULL == filename) {
j++; opal_show_help("help-mpi-common-cuda.txt", "No memory",
true, OPAL_PROC_MY_HOSTNAME);
return 1;
}
retval = opal_dl_open(filename, false, false,
&libcuda_handle, &str);
if (OPAL_SUCCESS != retval || NULL == libcuda_handle) {
if (NULL != str) {
opal_argv_append(&errsize, &errmsgs, str);
} else {
opal_argv_append(&errsize, &errmsgs,
"opal_dl_open() returned NULL.");
}
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: Library open error: %s",
errmsgs[errsize-1]);
} else {
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: Library successfully opened %s",
cudalibs[i]);
stage_one_init_passed = true;
break;
}
i++;
free(filename);
} }
if (true == stage_one_init_passed) {
break; /* Break out of outer loop */
}
j++;
} }
if (true != stage_one_init_passed) { if (true != stage_one_init_passed) {
@ -916,8 +874,7 @@ void mca_common_cuda_fini(void)
OBJ_DESTRUCT(&common_cuda_dtoh_lock); OBJ_DESTRUCT(&common_cuda_dtoh_lock);
OBJ_DESTRUCT(&common_cuda_ipc_lock); OBJ_DESTRUCT(&common_cuda_ipc_lock);
if (NULL != libcuda_handle) { if (NULL != libcuda_handle) {
opal_lt_dlclose(libcuda_handle); opal_dl_close(libcuda_handle);
opal_lt_dlexit();
} }
opal_output_verbose(20, mca_common_cuda_output, opal_output_verbose(20, mca_common_cuda_output,

Просмотреть файл

@ -1,10 +1,11 @@
# -*- text -*- # -*- text -*-
# #
# Copyright (c) 2011-2015 NVIDIA. All rights reserved. # Copyright (c) 2011-2015 NVIDIA. All rights reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
# #
# $HEADER$ # $HEADER$
# #
[cuCtxGetCurrent failed not initialized] [cuCtxGetCurrent failed not initialized]
@ -152,14 +153,6 @@ Open MPI was compiled without dynamic library support (e.g., with the
If you need CUDA support, reconfigure Open MPI with dynamic library support enabled. If you need CUDA support, reconfigure Open MPI with dynamic library support enabled.
# #
[unknown ltdl error]
While attempting to load the supporting libcuda.so library, an error
occurred. This really should rarely happen. Please notify the Open
MPI developers.
Function: %s
Return Value: %d
Error string: %s
#
[dlopen failed] [dlopen failed]
The library attempted to open the following supporting CUDA libraries, The library attempted to open the following supporting CUDA libraries,
but each of them failed. CUDA-aware support is disabled. but each of them failed. CUDA-aware support is disabled.