From 8017f1280137dee4d5fc7dac0a5c627e72e48058 Mon Sep 17 00:00:00 2001 From: Nikola Dancejic Date: Thu, 22 Oct 2020 19:18:28 -0700 Subject: [PATCH] Using package_rank to select between NIC of equal distance from the process. If PMIX_PACKAGE_RANK is available, uses this value to select between multiple NICs of equal distance from the current process. If this value is not available, try to calculate it by getting the locality string from each local process and assign a package_rank. If everything fails, fall back to using process_id.rank to select the NIC. This last case is not ideal, but has a small chance of occurring, and causes an output to be displayed to notify that this is occurring. Signed-off-by: Nikola Dancejic --- config/opal_check_ofi.m4 | 5 ++ ompi/mca/mtl/ofi/mtl_ofi_component.c | 12 ++-- opal/mca/btl/ofi/btl_ofi_component.c | 2 +- opal/mca/common/ofi/Makefile.am | 2 + opal/mca/common/ofi/common_ofi.c | 95 +++++++++++++++++++++++-- opal/mca/common/ofi/common_ofi.h | 6 +- opal/mca/common/ofi/help-common-ofi.txt | 14 ++++ 7 files changed, 121 insertions(+), 15 deletions(-) create mode 100644 opal/mca/common/ofi/help-common-ofi.txt diff --git a/config/opal_check_ofi.m4 b/config/opal_check_ofi.m4 index 480d1d5c7e..ae90ad1dc5 100644 --- a/config/opal_check_ofi.m4 +++ b/config/opal_check_ofi.m4 @@ -133,6 +133,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[ [$opal_check_fi_info_pci], [check if pci data is available in ofi]) + AC_CHECK_DECLS([PMIX_PACKAGE_RANK], + [], + [], + [#include ]) + CPPFLAGS=$opal_check_ofi_save_CPPFLAGS LDFLAGS=$opal_check_ofi_save_LDFLAGS LIBS=$opal_check_ofi_save_LIBS diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index b52bacb74e..16a5900adc 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -15,6 +15,7 @@ * $HEADER$ */ +#include "opal_config.h" #include "mtl_ofi.h" #include "opal/util/argv.h" #include "opal/util/printf.h" @@ -337,7 +338,7 @@ 
select_ofi_provider(struct fi_info *providers, __FILE__, __LINE__, (prov ? prov->fabric_attr->prov_name : "none")); - /* The initial fi_getinfo() call will return a list of providers + /** The initial provider selection will return a list of providers * available for this process. once a provider is selected from the * list, we will cycle through the remaining list to identify NICs * serviced by this provider, and try to pick one on the same NUMA @@ -350,9 +351,13 @@ select_ofi_provider(struct fi_info *providers, * attributes for the same NIC. The initial provider attributes * are used to ensure that all NICs we return provide the same * capabilities as the inital one. + * + * We use package rank to select between NICs of equal distance. + * If we cannot calculate a package_rank, we fall back to using the + * process id. */ if (NULL != prov) { - prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank); + prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info); opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: mtl:ofi:provider: %s\n", __FILE__, __LINE__, @@ -1170,6 +1175,3 @@ finalize_err: return OMPI_ERROR; } - - - diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index c92a6f83e9..654e744f5b 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -391,7 +391,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, * are used to ensure that all NICs we return provide the same * capabilities as the inital one. 
*/ - selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank); + selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info); rc = mca_btl_ofi_init_device(selected_info); if (OPAL_SUCCESS == rc) { info = selected_info; diff --git a/opal/mca/common/ofi/Makefile.am b/opal/mca/common/ofi/Makefile.am index 486acdf8b9..b2f22a7176 100644 --- a/opal/mca/common/ofi/Makefile.am +++ b/opal/mca/common/ofi/Makefile.am @@ -31,6 +31,8 @@ AM_CPPFLAGS = $(opal_ofi_CPPFLAGS) +dist_opaldata_DATA = help-common-ofi.txt + # Header files headers = \ diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 314e2b16af..2422a59dfb 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -16,6 +16,7 @@ #include #include +#include "opal_config.h" #include "common_ofi.h" #include "opal_config.h" #include "opal/constants.h" @@ -23,6 +24,8 @@ #include "opal/mca/base/mca_base_var.h" #include "opal/mca/base/mca_base_framework.h" #include "opal/mca/hwloc/base/base.h" +#include "opal/mca/pmix/base/base.h" +#include "opal/util/show_help.h" OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = { .prov_include = NULL, @@ -281,6 +284,79 @@ count_providers(struct fi_info* provider_list) return num_provider; } +/* Calculate the current process package rank. + * @param (IN) process_info struct opal_process_info_t information + * about the current process. used to get + * num_local_peers, myprocid.rank, and + * my_local_rank. + * + * @param (OUT) uint32_t package rank or myprocid.rank + * + * If successful, returns PMIX_PACKAGE_RANK, or an + * equivalent calculated package rank. + * Otherwise falls back to using opal_process_info.myprocid.rank; + * this can affect performance, but is unlikely to happen. 
+ */ +static uint32_t get_package_rank(opal_process_info_t process_info) +{ + int i; + uint16_t relative_locality, *package_rank_ptr; + uint16_t current_package_rank = 0; + uint16_t package_ranks[process_info.num_local_peers]; + opal_process_name_t pname; + opal_status_t rc; + char **peers = NULL; + char *local_peers = NULL; + char *locality_string = NULL; + + pname.jobid = OPAL_PROC_MY_NAME.jobid; + pname.vpid = OPAL_VPID_WILDCARD; + +#if HAVE_DECL_PMIX_PACKAGE_RANK + // Try to get the PACKAGE_RANK from PMIx + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PACKAGE_RANK, + &pname, &package_rank_ptr, PMIX_UINT16); + if (PMIX_SUCCESS == rc) { + return (uint32_t)*package_rank_ptr; + } +#endif + + // Get the local peers + OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS, + &pname, &local_peers, PMIX_STRING); + if (PMIX_SUCCESS != rc || NULL == local_peers) { + // We can't find package_rank, fall back to procid + opal_show_help("help-common-ofi.txt", "package_rank failed", true); + return (uint32_t)process_info.myprocid.rank; + } + peers = opal_argv_split(local_peers, ','); + free(local_peers); + + for (i = 0; NULL != peers[i]; i++) { + pname.vpid = strtoul(peers[i], NULL, 10); + locality_string = NULL; + // Get the LOCALITY_STRING for process[i] + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, + &pname, &locality_string, PMIX_STRING); + if (PMIX_SUCCESS != rc || NULL == locality_string) { + // If we don't have information about locality, fall back to procid + opal_show_help("help-common-ofi.txt", "package_rank failed", true); + return (uint32_t)process_info.myprocid.rank; + } + + // compute relative locality + relative_locality = opal_hwloc_compute_relative_locality(process_info.cpuset, locality_string); + free(locality_string); + + if (relative_locality & OPAL_PROC_ON_SOCKET) { + package_ranks[i] = current_package_rank; + current_package_rank++; + } + } + + return (uint32_t)package_ranks[process_info.my_local_rank]; +} + /* Selects a NIC based on hardware locality 
between process cpuset and device BDF. * * Initializes opal_hwloc_topology to access hardware topology if not previously @@ -318,11 +394,13 @@ count_providers(struct fi_info* provider_list) * selection. This provider is returned if the * NIC selection fails. * - * @param local_index (IN) int The local rank of the process. Used to + * @param package_rank (IN) uint32_t The rank of the process. Used to * select one valid NIC if there is a case * where more than one can be selected. This * could occur when more than one provider * shares the same cpuset as the process. + * This could either be a package_rank if one is + * successfully calculated, or the process id. * * @param provider (OUT) struct fi_info* object with the selected * provider if the selection succeeds @@ -335,7 +413,7 @@ count_providers(struct fi_info* provider_list) * balance across available NICs. */ struct fi_info* -opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index) +opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info) { struct fi_info *provider = provider_list, *current_provider = provider_list; struct fi_info **provider_table; @@ -343,6 +421,7 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind struct fi_pci_attr pci; #endif int ret; + uint32_t package_rank; unsigned int num_provider = 0, provider_limit = 0; bool provider_found = false, cpusets_match = false; @@ -399,8 +478,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind } /* Select provider from local rank % number of providers */ - if (num_provider > 0) { - provider = provider_table[local_index % num_provider]; + if (num_provider >= 2) { + // If there are multiple NICs "close" to the process, try to calculate package_rank + package_rank = get_package_rank(process_info); + provider = provider_table[package_rank % num_provider]; + } else if (num_provider == 1) { + provider = 
provider_table[num_provider - 1]; } #if OPAL_OFI_PCI_DATA_AVAILABLE @@ -412,8 +495,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind #if OPAL_ENABLE_DEBUG opal_output_verbose(1, opal_common_ofi.output, - "local rank: %d device: %s cpusets match: %s\n", - local_index, provider->domain_attr->name, + "package rank: %d device: %s cpusets match: %s\n", + package_rank, provider->domain_attr->name, cpusets_match ? "true" : "false"); #endif diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 06ac691132..3fcfc0601e 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -19,6 +19,7 @@ #include "opal_config.h" #include "opal/mca/base/mca_base_var.h" #include "opal/mca/base/mca_base_framework.h" +#include "opal/util/proc.h" #include BEGIN_C_DECLS @@ -36,8 +37,7 @@ extern opal_common_ofi_module_t opal_common_ofi; OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component); OPAL_DECLSPEC void opal_common_ofi_mca_register(void); OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void); -OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers, - char *framework_name); + /* * @param list (IN) List of strings corresponding to lower providers. * @param item (IN) Single string corresponding to a provider. 
@@ -56,6 +56,6 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); END_C_DECLS -struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank); +struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info); #endif /* OPAL_MCA_COMMON_OFI_H */ diff --git a/opal/mca/common/ofi/help-common-ofi.txt b/opal/mca/common/ofi/help-common-ofi.txt new file mode 100644 index 0000000000..4ea109a0b3 --- /dev/null +++ b/opal/mca/common/ofi/help-common-ofi.txt @@ -0,0 +1,14 @@ +# -*- text -*- +# +# Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +[package_rank failed] +Open MPI's OFI driver detected multiple equidistant NICs from the current process, +but had insufficient information to ensure MPI processes fairly pick a NIC for use. +This may negatively impact performance. A more modern PMIx server is necessary to +resolve this issue.