diff --git a/config/opal_check_ofi.m4 b/config/opal_check_ofi.m4
index 480d1d5c7e..ae90ad1dc5 100644
--- a/config/opal_check_ofi.m4
+++ b/config/opal_check_ofi.m4
@@ -133,6 +133,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
                        [$opal_check_fi_info_pci],
                        [check if pci data is available in ofi])
 
+    AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
+                   [],
+                   [],
+                   [#include <pmix.h>])
+
     CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
     LDFLAGS=$opal_check_ofi_save_LDFLAGS
     LIBS=$opal_check_ofi_save_LIBS
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c
index b52bacb74e..16a5900adc 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi_component.c
+++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c
@@ -15,6 +15,7 @@
  * $HEADER$
  */
 
+#include "opal_config.h"
 #include "mtl_ofi.h"
 #include "opal/util/argv.h"
 #include "opal/util/printf.h"
@@ -337,7 +338,7 @@ select_ofi_provider(struct fi_info *providers,
                         __FILE__, __LINE__,
                         (prov ? prov->fabric_attr->prov_name : "none"));
 
-    /* The initial fi_getinfo() call will return a list of providers
+    /** The initial provider selection will return a list of providers
      * available for this process. once a provider is selected from the
      * list, we will cycle through the remaining list to identify NICs
      * serviced by this provider, and try to pick one on the same NUMA
@@ -350,9 +351,13 @@ select_ofi_provider(struct fi_info *providers,
      * attributes for the same NIC. The initial provider attributes
      * are used to ensure that all NICs we return provide the same
      * capabilities as the inital one.
+     *
+     * We use package rank to select between NICs of equal distance
+     * if we cannot calculate a package_rank, we fall back to using the
+     * process id.
      */
     if (NULL != prov) {
-        prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
+        prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info);
         opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                             "%s:%d: mtl:ofi:provider: %s\n",
                             __FILE__, __LINE__,
@@ -1170,6 +1175,3 @@ finalize_err:
 
     return OMPI_ERROR;
 }
-
-
-
diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c
index c92a6f83e9..654e744f5b 100644
--- a/opal/mca/btl/ofi/btl_ofi_component.c
+++ b/opal/mca/btl/ofi/btl_ofi_component.c
@@ -391,7 +391,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
          * are used to ensure that all NICs we return provide the same
          * capabilities as the inital one.
          */
-        selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
+        selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info);
         rc = mca_btl_ofi_init_device(selected_info);
         if (OPAL_SUCCESS == rc) {
             info = selected_info;
diff --git a/opal/mca/common/ofi/Makefile.am b/opal/mca/common/ofi/Makefile.am
index 486acdf8b9..b2f22a7176 100644
--- a/opal/mca/common/ofi/Makefile.am
+++ b/opal/mca/common/ofi/Makefile.am
@@ -31,6 +31,8 @@
 
 AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
 
+dist_opaldata_DATA = help-common-ofi.txt
+
 # Header files
 
 headers = \
diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c
index 314e2b16af..2422a59dfb 100644
--- a/opal/mca/common/ofi/common_ofi.c
+++ b/opal/mca/common/ofi/common_ofi.c
@@ -16,6 +16,7 @@
 #include
 #include
 
+#include "opal_config.h"
 #include "common_ofi.h"
 #include "opal_config.h"
 #include "opal/constants.h"
@@ -23,6 +24,8 @@
 #include "opal/mca/base/mca_base_var.h"
 #include "opal/mca/base/mca_base_framework.h"
 #include "opal/mca/hwloc/base/base.h"
+#include "opal/mca/pmix/base/base.h"
+#include "opal/util/show_help.h"
 
 OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
     .prov_include = NULL,
@@ -281,6 +284,79
@@ count_providers(struct fi_info* provider_list)
     return num_provider;
 }
 
+/* Calculate the current process package rank.
+ * @param (IN) process_info   struct opal_process_info_t information
+ *                            about the current process. used to get
+ *                            cpuset, myprocid.rank, and my_local_rank.
+ *
+ * @param (OUT) uint32_t      package rank or myprocid.rank
+ *
+ * If successful, returns PMIX_PACKAGE_RANK, or an
+ * equivalent calculated package rank.
+ * otherwise falls back to using opal_process_info.myprocid.rank
+ * this can affect performance, but is unlikely to happen.
+ */
+static uint32_t get_package_rank(opal_process_info_t process_info)
+{
+    int i;
+    uint16_t relative_locality, *package_rank_ptr;
+    uint16_t current_package_rank = 0, my_package_rank = 0;
+    bool found = false;
+    opal_process_name_t pname;
+    opal_status_t rc;
+    char **peers = NULL;
+    char *local_peers = NULL, *locality_string = NULL;
+
+    pname.jobid = OPAL_PROC_MY_NAME.jobid;
+    pname.vpid = OPAL_VPID_WILDCARD;
+
+#if HAVE_DECL_PMIX_PACKAGE_RANK
+    // Try to get the PACKAGE_RANK from PMIx
+    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PACKAGE_RANK,
+                                   &pname, &package_rank_ptr, PMIX_UINT16);
+    if (PMIX_SUCCESS == rc) {
+        return (uint32_t)*package_rank_ptr;
+    }
+#endif
+
+    // Get the local peers; peers stays NULL if they are unavailable
+    OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS,
+                          &pname, &local_peers, PMIX_STRING);
+    if (PMIX_SUCCESS == rc && NULL != local_peers) {
+        peers = opal_argv_split(local_peers, ',');
+        free(local_peers);
+    }
+
+    // Our package rank is the number of on-package peers listed before us
+    for (i = 0; NULL != peers && NULL != peers[i]; i++) {
+        pname.vpid = strtoul(peers[i], NULL, 10);
+        locality_string = NULL;
+        OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
+                                       &pname, &locality_string, PMIX_STRING);
+        if (PMIX_SUCCESS != rc || NULL == locality_string) {
+            // Locality is required for every peer, so give up on ranking
+            found = false;
+            break;
+        }
+        relative_locality = opal_hwloc_compute_relative_locality(process_info.cpuset, locality_string);
+        free(locality_string);
+        if (relative_locality & OPAL_PROC_ON_SOCKET) {
+            if (i == (int)process_info.my_local_rank) {
+                my_package_rank = current_package_rank;
+                found = true;
+            }
+            current_package_rank++;
+        }
+    }
+    opal_argv_free(peers);
+    if (!found) {
+        // We can't find package_rank, fall back to procid
+        opal_show_help("help-common-ofi.txt", "package_rank failed", true);
+        return (uint32_t)process_info.myprocid.rank;
+    }
+    return (uint32_t)my_package_rank;
+}
+
 /* Selects a NIC based on hardware locality between process cpuset and device BDF.
  *
  * Initializes opal_hwloc_topology to access hardware topology if not previously
@@ -318,11 +394,13 @@ count_providers(struct fi_info* provider_list)
  *                      selection. This provider is returned if the
  *                      NIC selection fails.
  *
- * @param local_index (IN) int The local rank of the process. Used to
+ * @param package_rank (IN) uint32_t The rank of the process. Used to
  *                      select one valid NIC if there is a case
  *                      where more than one can be selected. This
  *                      could occur when more than one provider
  *                      shares the same cpuset as the process.
+ *                      This could either be a package_rank if one is
+ *                      successfully calculated, or the process id.
  *
  * @param provider (OUT) struct fi_info* object with the selected
  *                      provider if the selection succeeds
@@ -335,7 +413,7 @@ count_providers(struct fi_info* provider_list)
  * balance across available NICs.
*/
 struct fi_info*
-opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
+opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info)
 {
     struct fi_info *provider = provider_list, *current_provider = provider_list;
     struct fi_info **provider_table;
@@ -343,6 +421,7 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
     struct fi_pci_attr pci;
 #endif
     int ret;
+    uint32_t package_rank = 0;
     unsigned int num_provider = 0, provider_limit = 0;
     bool provider_found = false, cpusets_match = false;
 
@@ -399,8 +478,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
     }
 
     /* Select provider from local rank % number of providers */
-    if (num_provider > 0) {
-        provider = provider_table[local_index % num_provider];
+    if (num_provider >= 2) {
+        // If there are multiple NICs "close" to the process, try to calculate package_rank
+        package_rank = get_package_rank(process_info);
+        provider = provider_table[package_rank % num_provider];
+    } else if (num_provider == 1) {
+        provider = provider_table[num_provider - 1];
     }
 
 #if OPAL_OFI_PCI_DATA_AVAILABLE
@@ -412,8 +495,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
 
 #if OPAL_ENABLE_DEBUG
     opal_output_verbose(1, opal_common_ofi.output,
-                        "local rank: %d device: %s cpusets match: %s\n",
-                        local_index, provider->domain_attr->name,
+                        "package rank: %u device: %s cpusets match: %s\n",
+                        package_rank, provider->domain_attr->name,
                         cpusets_match ? "true" : "false");
 #endif
 
diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h
index 06ac691132..3fcfc0601e 100644
--- a/opal/mca/common/ofi/common_ofi.h
+++ b/opal/mca/common/ofi/common_ofi.h
@@ -19,6 +19,7 @@
 #include "opal_config.h"
 #include "opal/mca/base/mca_base_var.h"
 #include "opal/mca/base/mca_base_framework.h"
+#include "opal/util/proc.h"
 #include
 
 BEGIN_C_DECLS
@@ -36,8 +37,7 @@ extern opal_common_ofi_module_t opal_common_ofi;
 OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
 OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
 OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
-OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
-                                                                  char *framework_name);
+
 /*
  * @param list (IN)     List of strings corresponding to lower providers.
  * @param item (IN)     Single string corresponding to a provider.
@@ -56,6 +56,6 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
 
 END_C_DECLS
 
-struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
+struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info);
 
 #endif /* OPAL_MCA_COMMON_OFI_H */
diff --git a/opal/mca/common/ofi/help-common-ofi.txt b/opal/mca/common/ofi/help-common-ofi.txt
new file mode 100644
index 0000000000..4ea109a0b3
--- /dev/null
+++ b/opal/mca/common/ofi/help-common-ofi.txt
@@ -0,0 +1,14 @@
+# -*- text -*-
+#
+# Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+[package_rank failed]
+Open MPI's OFI driver detected multiple equidistant NICs from the current process,
+but had insufficient information to ensure MPI processes fairly pick a NIC for use.
+This may negatively impact performance. A more modern PMIx server is necessary to
+resolve this issue.