Merge pull request #7174 from jsquyres/pr/ofi-mtl-fi-version-bump
mtl/ofi: increase the FI_VERSION requested to 1.5 and make sure to check for FI_LOCAL_COMM
Этот коммит содержится в:
Коммит
887400c878
@ -1,6 +1,6 @@
|
|||||||
dnl -*- shell-script -*-
|
dnl -*- shell-script -*-
|
||||||
dnl
|
dnl
|
||||||
dnl Copyright (c) 2015-2019 Cisco Systems, Inc. All rights reserved.
|
dnl Copyright (c) 2015-2020 Cisco Systems, Inc. All rights reserved.
|
||||||
dnl Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights
|
dnl Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights
|
||||||
dnl reserved.
|
dnl reserved.
|
||||||
dnl $COPYRIGHT$
|
dnl $COPYRIGHT$
|
||||||
@ -10,6 +10,45 @@ dnl
|
|||||||
dnl $HEADER$
|
dnl $HEADER$
|
||||||
dnl
|
dnl
|
||||||
|
|
||||||
|
dnl
dnl OPAL_CHECK_OFI_VERSION_GE
dnl
dnl Check that the OFI API version number is >= a specific value.
dnl
dnl The check is a compile-time test: <rdma/fabric.h> is included and
dnl the FI_MAJOR_VERSION / FI_MINOR_VERSION macros it provides are
dnl compared against the requested version with FI_VERSION_LT().  If
dnl the header does not define FI_MAJOR_VERSION, the check fails.
dnl
dnl The OFI preprocessor flags in $opal_ofi_CPPFLAGS are prepended to
dnl CPPFLAGS only for the duration of the compile test; CPPFLAGS is
dnl restored before either action is run, so the actions see (and may
dnl modify) the caller's own CPPFLAGS.
dnl
dnl $1: version number to compare, in the form of "major,minor"
dnl     (without quotes) -- i.e., a single token representing the
dnl     arguments to FI_VERSION()
dnl $2: action if OFI API version is >= $1
dnl $3: action if OFI API version is < $1
AC_DEFUN([OPAL_CHECK_OFI_VERSION_GE],[
    OPAL_VAR_SCOPE_PUSH([opal_ofi_ver_ge_save_CPPFLAGS opal_ofi_ver_ge_happy])

    AC_MSG_CHECKING([if OFI API version number is >= $1])

    dnl Prepend the OFI flags rather than replacing CPPFLAGS so that
    dnl preprocessor flags supplied by the user stay in effect.
    opal_ofi_ver_ge_save_CPPFLAGS=$CPPFLAGS
    CPPFLAGS="$opal_ofi_CPPFLAGS $CPPFLAGS"

    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <rdma/fabric.h>]],
[[
#if !defined(FI_MAJOR_VERSION)
#error "we cannot check the version -- sad panda"
#elif FI_VERSION_LT(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION($1))
#error "version is too low -- nopes"
#endif
]])],
                      [opal_ofi_ver_ge_happy=1],
                      [opal_ofi_ver_ge_happy=0])

    dnl Restore CPPFLAGS before the caller-provided actions run so the
    dnl temporary override never leaks into them.
    CPPFLAGS=$opal_ofi_ver_ge_save_CPPFLAGS

    AS_IF([test $opal_ofi_ver_ge_happy -eq 1],
          [AC_MSG_RESULT([yes])
           $2],
          [AC_MSG_RESULT([no])
           $3])

    OPAL_VAR_SCOPE_POP
])dnl
|
||||||
|
|
||||||
dnl
|
dnl
|
||||||
dnl _OPAL_CHECK_OFI
|
dnl _OPAL_CHECK_OFI
|
||||||
dnl --------------------------------------------------------
|
dnl --------------------------------------------------------
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||||
#
|
#
|
||||||
# Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved
|
# Copyright (c) 2014-2020 Cisco Systems, Inc. All rights reserved
|
||||||
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||||
# reserved.
|
# reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
@ -28,6 +28,12 @@ AC_DEFUN([MCA_ompi_mtl_ofi_CONFIG],[
|
|||||||
# Check for OFI
|
# Check for OFI
|
||||||
OPAL_CHECK_OFI
|
OPAL_CHECK_OFI
|
||||||
|
|
||||||
|
# The OFI MTL requires at least OFI libfabric v1.5.
|
||||||
|
AS_IF([test "$opal_ofi_happy" = "yes"],
|
||||||
|
[OPAL_CHECK_OFI_VERSION_GE([1,5],
|
||||||
|
[],
|
||||||
|
[opal_ofi_happy=no])])
|
||||||
|
|
||||||
AS_IF([test "$opal_ofi_happy" = "yes"],
|
AS_IF([test "$opal_ofi_happy" = "yes"],
|
||||||
[$1],
|
[$1],
|
||||||
[$2])
|
[$2])
|
||||||
|
@ -341,21 +341,12 @@ is_in_list(char **list, char *item)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static struct fi_info*
|
static struct fi_info*
|
||||||
select_ofi_provider(struct fi_info *providers)
|
select_ofi_provider(struct fi_info *providers,
|
||||||
|
char **include_list, char **exclude_list)
|
||||||
{
|
{
|
||||||
char **include_list = NULL;
|
|
||||||
char **exclude_list = NULL;
|
|
||||||
struct fi_info *prov = providers;
|
struct fi_info *prov = providers;
|
||||||
|
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
if (NULL != include_list) {
|
||||||
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
|
|
||||||
__FILE__, __LINE__, prov_include);
|
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
|
||||||
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
|
|
||||||
__FILE__, __LINE__, prov_exclude);
|
|
||||||
|
|
||||||
if (NULL != prov_include) {
|
|
||||||
include_list = opal_argv_split(prov_include, ',');
|
|
||||||
while ((NULL != prov) &&
|
while ((NULL != prov) &&
|
||||||
(!is_in_list(include_list, prov->fabric_attr->prov_name))) {
|
(!is_in_list(include_list, prov->fabric_attr->prov_name))) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
@ -364,8 +355,7 @@ select_ofi_provider(struct fi_info *providers)
|
|||||||
prov->fabric_attr->prov_name);
|
prov->fabric_attr->prov_name);
|
||||||
prov = prov->next;
|
prov = prov->next;
|
||||||
}
|
}
|
||||||
} else if (NULL != prov_exclude) {
|
} else if (NULL != exclude_list) {
|
||||||
exclude_list = opal_argv_split(prov_exclude, ',');
|
|
||||||
while ((NULL != prov) &&
|
while ((NULL != prov) &&
|
||||||
(is_in_list(exclude_list, prov->fabric_attr->prov_name))) {
|
(is_in_list(exclude_list, prov->fabric_attr->prov_name))) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
@ -376,9 +366,6 @@ select_ofi_provider(struct fi_info *providers)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_argv_free(include_list);
|
|
||||||
opal_argv_free(exclude_list);
|
|
||||||
|
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
"%s:%d: mtl:ofi:prov: %s\n",
|
"%s:%d: mtl:ofi:prov: %s\n",
|
||||||
__FILE__, __LINE__,
|
__FILE__, __LINE__,
|
||||||
@ -621,7 +608,9 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
int ret, fi_version;
|
int ret, fi_version;
|
||||||
int num_local_ranks, sep_support_in_provider, max_ofi_ctxts;
|
int num_local_ranks, sep_support_in_provider, max_ofi_ctxts;
|
||||||
int ofi_tag_leading_zeros, ofi_tag_bits_for_cid;
|
int ofi_tag_leading_zeros, ofi_tag_bits_for_cid;
|
||||||
struct fi_info *hints;
|
char **include_list = NULL;
|
||||||
|
char **exclude_list = NULL;
|
||||||
|
struct fi_info *hints, *hints_dup = NULL;
|
||||||
struct fi_info *providers = NULL;
|
struct fi_info *providers = NULL;
|
||||||
struct fi_info *prov = NULL;
|
struct fi_info *prov = NULL;
|
||||||
struct fi_info *prov_cq_data = NULL;
|
struct fi_info *prov_cq_data = NULL;
|
||||||
@ -630,6 +619,19 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
int universe_size;
|
int universe_size;
|
||||||
char *univ_size_str;
|
char *univ_size_str;
|
||||||
|
|
||||||
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
|
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
|
||||||
|
__FILE__, __LINE__, prov_include);
|
||||||
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
|
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
|
||||||
|
__FILE__, __LINE__, prov_exclude);
|
||||||
|
|
||||||
|
if (NULL != prov_include) {
|
||||||
|
include_list = opal_argv_split(prov_include, ',');
|
||||||
|
} else if (NULL != prov_exclude) {
|
||||||
|
exclude_list = opal_argv_split(prov_exclude, ',');
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hints to filter providers
|
* Hints to filter providers
|
||||||
* See man fi_getinfo for a list of all filters
|
* See man fi_getinfo for a list of all filters
|
||||||
@ -647,9 +649,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
__FILE__, __LINE__);
|
__FILE__, __LINE__);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
/* Make sure to get an RDM provider that can do the tagged matching
|
||||||
|
interface and local communication and remote communication. */
|
||||||
hints->mode = FI_CONTEXT;
|
hints->mode = FI_CONTEXT;
|
||||||
hints->ep_attr->type = FI_EP_RDM; /* Reliable datagram */
|
hints->ep_attr->type = FI_EP_RDM;
|
||||||
hints->caps = FI_TAGGED; /* Tag matching interface */
|
hints->caps = FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM;
|
||||||
hints->tx_attr->msg_order = FI_ORDER_SAS;
|
hints->tx_attr->msg_order = FI_ORDER_SAS;
|
||||||
hints->rx_attr->msg_order = FI_ORDER_SAS;
|
hints->rx_attr->msg_order = FI_ORDER_SAS;
|
||||||
hints->rx_attr->op_flags = FI_COMPLETION;
|
hints->rx_attr->op_flags = FI_COMPLETION;
|
||||||
@ -697,8 +701,59 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
* FI_VERSION provides binary backward and forward compatibility support
|
* FI_VERSION provides binary backward and forward compatibility support
|
||||||
* Specify the version of OFI is coded to, the provider will select struct
|
* Specify the version of OFI is coded to, the provider will select struct
|
||||||
* layouts that are compatible with this version.
|
* layouts that are compatible with this version.
|
||||||
|
*
|
||||||
|
* Note: API version 1.5 is the first version that supports
|
||||||
|
* FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need
|
||||||
|
* that checking -- e.g., some providers are suitable for RXD or
|
||||||
|
* RXM, but can't provide local communication).
|
||||||
*/
|
*/
|
||||||
fi_version = FI_VERSION(1, 0);
|
fi_version = FI_VERSION(1, 5);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The EFA provider in Libfabric versions prior to 1.10 contains a bug
|
||||||
|
* where the FI_LOCAL_COMM and FI_REMOTE_COMM capabilities are not
|
||||||
|
* advertised. However, we know that this provider supports both local and
|
||||||
|
* remote communication. We must exclude these capability bits in order to
|
||||||
|
* select EFA when we are using a version of Libfabric with this bug.
|
||||||
|
*
|
||||||
|
* Call fi_getinfo() without those capabilities and specifically ask for
|
||||||
|
* the EFA provider. This is safe to do as EFA is only supported on Amazon
|
||||||
|
* EC2 and EC2 only supports EFA and TCP-based networks. We'll also skip
|
||||||
|
* this logic if the user specifies an include list without EFA or adds EFA
|
||||||
|
* to the exclude list.
|
||||||
|
*/
|
||||||
|
if ((include_list && is_in_list(include_list, "efa")) ||
|
||||||
|
(exclude_list && !is_in_list(exclude_list, "efa"))) {
|
||||||
|
hints_dup = fi_dupinfo(hints);
|
||||||
|
hints_dup->caps &= ~(FI_LOCAL_COMM | FI_REMOTE_COMM);
|
||||||
|
hints_dup->fabric_attr->prov_name = strdup("efa");
|
||||||
|
|
||||||
|
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, &providers);
|
||||||
|
|
||||||
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
|
"%s:%d: EFA specific fi_getinfo(): %s\n",
|
||||||
|
__FILE__, __LINE__, fi_strerror(-ret));
|
||||||
|
|
||||||
|
if (FI_ENODATA == -ret) {
|
||||||
|
/**
|
||||||
|
* EFA is not available so fall through to call fi_getinfo() again
|
||||||
|
* with the local/remote capabilities set.
|
||||||
|
*/
|
||||||
|
fi_freeinfo(hints_dup);
|
||||||
|
hints_dup = NULL;
|
||||||
|
} else if (0 != ret) {
|
||||||
|
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||||
|
"fi_getinfo",
|
||||||
|
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||||
|
fi_strerror(-ret), -ret);
|
||||||
|
goto error;
|
||||||
|
} else {
|
||||||
|
fi_freeinfo(hints);
|
||||||
|
hints = hints_dup;
|
||||||
|
hints_dup = NULL;
|
||||||
|
goto select_prov;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* fi_getinfo: returns information about fabric services for reaching a
|
* fi_getinfo: returns information about fabric services for reaching a
|
||||||
@ -711,6 +766,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
0ULL, /* Optional flag */
|
0ULL, /* Optional flag */
|
||||||
hints, /* In: Hints to filter providers */
|
hints, /* In: Hints to filter providers */
|
||||||
&providers); /* Out: List of matching providers */
|
&providers); /* Out: List of matching providers */
|
||||||
|
|
||||||
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
|
"%s:%d: fi_getinfo(): %s\n",
|
||||||
|
__FILE__, __LINE__, fi_strerror(-ret));
|
||||||
|
|
||||||
if (FI_ENODATA == -ret) {
|
if (FI_ENODATA == -ret) {
|
||||||
// It is not an error if no information is returned.
|
// It is not an error if no information is returned.
|
||||||
goto error;
|
goto error;
|
||||||
@ -722,10 +782,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
select_prov:
|
||||||
/**
|
/**
|
||||||
* Select a provider from the list returned by fi_getinfo().
|
* Select a provider from the list returned by fi_getinfo().
|
||||||
*/
|
*/
|
||||||
prov = select_ofi_provider(providers);
|
prov = select_ofi_provider(providers, include_list, exclude_list);
|
||||||
if (!prov) {
|
if (!prov) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
"%s:%d: select_ofi_provider: no provider found\n",
|
"%s:%d: select_ofi_provider: no provider found\n",
|
||||||
@ -733,6 +794,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opal_argv_free(include_list);
|
||||||
|
include_list = NULL;
|
||||||
|
opal_argv_free(exclude_list);
|
||||||
|
exclude_list = NULL;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Select the format of the OFI tag
|
* Select the format of the OFI tag
|
||||||
*/
|
*/
|
||||||
@ -1006,6 +1072,12 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
return &ompi_mtl_ofi.base;
|
return &ompi_mtl_ofi.base;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
|
if (include_list) {
|
||||||
|
opal_argv_free(include_list);
|
||||||
|
}
|
||||||
|
if (exclude_list) {
|
||||||
|
opal_argv_free(exclude_list);
|
||||||
|
}
|
||||||
if (providers) {
|
if (providers) {
|
||||||
(void) fi_freeinfo(providers);
|
(void) fi_freeinfo(providers);
|
||||||
}
|
}
|
||||||
@ -1015,6 +1087,9 @@ error:
|
|||||||
if (hints) {
|
if (hints) {
|
||||||
(void) fi_freeinfo(hints);
|
(void) fi_freeinfo(hints);
|
||||||
}
|
}
|
||||||
|
if (hints_dup) {
|
||||||
|
(void) fi_freeinfo(hints_dup);
|
||||||
|
}
|
||||||
if (ompi_mtl_ofi.sep) {
|
if (ompi_mtl_ofi.sep) {
|
||||||
(void) fi_close((fid_t)ompi_mtl_ofi.sep);
|
(void) fi_close((fid_t)ompi_mtl_ofi.sep);
|
||||||
}
|
}
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||||
# reserved.
|
# reserved.
|
||||||
# Copyright (c) 2010-2019 Cisco Systems, Inc. All rights reserved
|
# Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved
|
||||||
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||||
# reserved.
|
# reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
@ -100,25 +100,11 @@ AC_DEFUN([_OPAL_BTL_USNIC_DO_CONFIG],[
|
|||||||
OPAL_CHECK_OFI
|
OPAL_CHECK_OFI
|
||||||
opal_btl_usnic_happy=$opal_ofi_happy])
|
opal_btl_usnic_happy=$opal_ofi_happy])
|
||||||
|
|
||||||
# The usnic BTL requires at least OFI libfabric v1.1 (there was a
|
# The usnic BTL requires at least OFI libfabric v1.3.
|
||||||
# critical bug in libfabric v1.0).
|
|
||||||
AS_IF([test "$opal_btl_usnic_happy" = "yes"],
|
AS_IF([test "$opal_btl_usnic_happy" = "yes"],
|
||||||
[AC_MSG_CHECKING([whether OFI libfabric is >= v1.1])
|
[OPAL_CHECK_OFI_VERSION_GE([1,3],
|
||||||
opal_btl_usnic_CPPFLAGS_save=$CPPFLAGS
|
[],
|
||||||
CPPFLAGS="$opal_ofi_CPPFLAGS $CPPFLAGS"
|
[opal_btl_usnic_happy=no])])
|
||||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <rdma/fabric.h>]],
|
|
||||||
[[
|
|
||||||
#if !defined(FI_MAJOR_VERSION)
|
|
||||||
#error your version of OFI libfabric is too old
|
|
||||||
#elif FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION) < FI_VERSION(1, 1)
|
|
||||||
#error your version of OFI libfabric is too old
|
|
||||||
#endif
|
|
||||||
]])],
|
|
||||||
[opal_btl_usnic_happy=yes],
|
|
||||||
[opal_btl_usnic_happy=no])
|
|
||||||
AC_MSG_RESULT([$opal_btl_usnic_happy])
|
|
||||||
CPPFLAGS=$opal_btl_usnic_CPPFLAGS_save
|
|
||||||
])
|
|
||||||
|
|
||||||
# Make sure we can find the OFI libfabric usnic extensions header
|
# Make sure we can find the OFI libfabric usnic extensions header
|
||||||
AS_IF([test "$opal_btl_usnic_happy" = "yes" ],
|
AS_IF([test "$opal_btl_usnic_happy" = "yes" ],
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user