1
1

v4.1.x: Using package_rank to select between NICs of equal distance from the process.

If PMIX_PACKAGE_RANK is available, use this value to select between multiple
NICs of equal distance from the current process. If this value is not
available, try to calculate it by getting the locality string from each local
process and assigning a package_rank. If everything fails, fall back to using
process_id.rank to select the NIC. This last case is not ideal, but has a small
chance of occurring, and causes an output to be displayed to notify that this is
occurring.

Some of the information in master branch is not available for the multi-NIC
patch, such as myprocinfo.rank. This info is used to select between multiple
NIC of equal distance to the process. This adapts the previous commit to work
with the v4.1.x branch.

Signed-off-by: Nikola Dancejic <dancejic@amazon.com>
(cherry picked from commit 8017f12801)
This commit is contained in:
Nikola Dancejic 2020-10-22 19:18:28 -07:00
parent 74a743fc21
commit 3f863aab8a
7 changed files with 132 additions and 15 deletions

View File

@ -133,6 +133,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
[$opal_check_fi_info_pci],
[check if pci data is available in ofi])
AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
[],
[],
[#include <pmix.h>])
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
LDFLAGS=$opal_check_ofi_save_LDFLAGS
LIBS=$opal_check_ofi_save_LIBS

View File

@ -15,6 +15,7 @@
* $HEADER$
*/
#include "opal_config.h"
#include "mtl_ofi.h"
#include "opal/util/argv.h"
#include "opal/util/printf.h"
@ -337,7 +338,7 @@ select_ofi_provider(struct fi_info *providers,
__FILE__, __LINE__,
(prov ? prov->fabric_attr->prov_name : "none"));
/* The initial fi_getinfo() call will return a list of providers
/** The initial provider selection will return a list of providers
* available for this process. once a provider is selected from the
* list, we will cycle through the remaining list to identify NICs
* serviced by this provider, and try to pick one on the same NUMA
@ -350,9 +351,16 @@ select_ofi_provider(struct fi_info *providers,
* attributes for the same NIC. The initial provider attributes
* are used to ensure that all NICs we return provide the same
* capabilities as the initial one.
*
* We use package rank to select between NICs of equal distance.
* If we cannot calculate a package_rank, we fall back to using the
* process id.
*/
if (NULL != prov) {
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.num_local_peers,
ompi_process_info.my_local_rank,
ompi_process_info.cpuset,
ompi_process_info.pid);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider: %s\n",
__FILE__, __LINE__,
@ -1170,6 +1178,3 @@ finalize_err:
return OMPI_ERROR;
}

View File

@ -391,7 +391,10 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
* are used to ensure that all NICs we return provide the same
* capabilities as the initial one.
*/
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.num_local_peers,
opal_process_info.my_local_rank,
opal_process_info.cpuset,
opal_process_info.my_local_rank);
rc = mca_btl_ofi_init_device(selected_info);
if (OPAL_SUCCESS == rc) {
info = selected_info;

View File

@ -31,6 +31,8 @@
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
dist_opaldata_DATA = help-common-ofi.txt
# Header files
headers = \

View File

@ -16,6 +16,7 @@
#include <errno.h>
#include <unistd.h>
#include "opal_config.h"
#include "common_ofi.h"
#include "opal_config.h"
#include "opal/constants.h"
@ -23,6 +24,8 @@
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/base/mca_base_framework.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/util/show_help.h"
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
.prov_include = NULL,
@ -281,6 +284,80 @@ count_providers(struct fi_info* provider_list)
return num_provider;
}
/* Calculate the current process's package rank.
 *
 * @param num_local_peers (IN) int32_t   Local peer count supplied by the caller.
 *                                       Kept for interface compatibility; the
 *                                       peer list returned by the modex is
 *                                       walked directly, so nothing is sized
 *                                       from this value.
 *
 * @param my_local_rank (IN) uint16_t    Position of the calling process in the
 *                                       OPAL_PMIX_LOCAL_PEERS list.
 *
 * @param cpuset (IN) char*              Locality of the calling process; it is
 *                                       compared against each peer's locality
 *                                       string.
 *
 * @param pid (IN) uint32_t              Fallback identifier returned when no
 *                                       package rank can be determined.
 *
 * @return uint32_t                      PMIX_PACKAGE_RANK if it is available,
 *                                       otherwise an equivalent value computed
 *                                       here: the number of local peers that
 *                                       share a package with this process and
 *                                       precede it in the local peer list.
 *                                       If neither can be obtained, the
 *                                       "package_rank failed" help message is
 *                                       shown and pid is returned. This can
 *                                       affect performance, but is unlikely to
 *                                       happen.
 */
static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank, char *cpuset, uint32_t pid)
{
    int i;
    uint16_t relative_locality;
    uint16_t current_package_rank = 0;
    uint16_t my_package_rank = 0;
    bool have_package_rank = false;
    opal_process_name_t pname;
    opal_status_t rc;
    char **peers = NULL;
    char *local_peers = NULL;
    char *locality_string = NULL;

    /* Only the entry at my_local_rank is ever needed, so no per-peer array
     * (and therefore no buffer sized from num_local_peers) is required. */
    (void) num_local_peers;

    pname.jobid = OPAL_PROC_MY_NAME.jobid;
    pname.vpid = OPAL_VPID_WILDCARD;

#if HAVE_DECL_PMIX_PACKAGE_RANK
    /* Point at local storage so the pointer handed to the modex macro is
     * never read or written while uninitialized. */
    uint16_t pmix_package_rank = 0;
    uint16_t *package_rank_ptr = &pmix_package_rank;

    // Try to get the PACKAGE_RANK from PMIx
    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_PACKAGE_RANK,
                                   &pname, &package_rank_ptr, OPAL_UINT16);
    if (OPAL_SUCCESS == rc && NULL != package_rank_ptr) {
        return (uint32_t)*package_rank_ptr;
    }
#endif

    // Get the local peers
    OPAL_MODEX_RECV_VALUE(rc, OPAL_PMIX_LOCAL_PEERS,
                          &pname, &local_peers, OPAL_STRING);
    if (OPAL_SUCCESS != rc || NULL == local_peers) {
        // We can't find package_rank, fall back to the caller-supplied id
        goto fallback;
    }
    peers = opal_argv_split(local_peers, ',');
    free(local_peers);
    if (NULL == peers) {
        // Empty peer list or allocation failure: nothing to rank against
        goto fallback;
    }

    /* Every peer is visited, even after our own entry has been found, so
     * that a missing locality string for any peer still triggers the
     * fallback exactly as before. */
    for (i = 0; NULL != peers[i]; i++) {
        pname.vpid = strtoul(peers[i], NULL, 10);
        locality_string = NULL;
        // Get the LOCALITY_STRING for process[i]
        OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
                                       &pname, &locality_string, OPAL_STRING);
        if (OPAL_SUCCESS != rc || NULL == locality_string) {
            // If we don't have information about locality, fall back
            goto fallback;
        }

        // compute relative locality
        relative_locality = opal_hwloc_compute_relative_locality(cpuset, locality_string);
        free(locality_string);

        if (relative_locality & OPAL_PROC_ON_SOCKET) {
            if (i == (int) my_local_rank) {
                my_package_rank = current_package_rank;
                have_package_rank = true;
            }
            current_package_rank++;
        }
    }

    if (!have_package_rank) {
        /* my_local_rank is past the end of the peer list, or our own entry
         * was not reported as sharing a package with cpuset. */
        goto fallback;
    }

    opal_argv_free(peers);
    return (uint32_t) my_package_rank;

fallback:
    opal_show_help("help-common-ofi.txt", "package_rank failed", true);
    if (NULL != peers) {
        opal_argv_free(peers);
    }
    return pid;
}
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
*
* Initializes opal_hwloc_topology to access hardware topology if not previously
@ -318,11 +395,13 @@ count_providers(struct fi_info* provider_list)
* selection. This provider is returned if the
* NIC selection fails.
*
* @param local_index (IN) int The local rank of the process. Used to
* @param package_rank (IN) uint32_t The rank of the process. Used to
* select one valid NIC if there is a case
* where more than one can be selected. This
* could occur when more than one provider
* shares the same cpuset as the process.
* This could either be a package_rank if one is
* successfully calculated, or the process id.
*
* @param provider (OUT) struct fi_info* object with the selected
* provider if the selection succeeds
@ -335,7 +414,8 @@ count_providers(struct fi_info* provider_list)
* balance across available NICs.
*/
struct fi_info*
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int32_t num_local_peers,
uint16_t my_local_rank, char *cpuset, uint32_t pid)
{
struct fi_info *provider = provider_list, *current_provider = provider_list;
struct fi_info **provider_table;
@ -343,6 +423,7 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
struct fi_pci_attr pci;
#endif
int ret;
uint32_t package_rank;
unsigned int num_provider = 0, provider_limit = 0;
bool provider_found = false, cpusets_match = false;
@ -399,8 +480,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
}
/* Select provider from local rank % number of providers */
if (num_provider > 0) {
provider = provider_table[local_index % num_provider];
if (num_provider >= 2) {
// If there are multiple NICs "close" to the process, try to calculate package_rank
package_rank = get_package_rank(num_local_peers, my_local_rank, cpuset, pid);
provider = provider_table[package_rank % num_provider];
} else if (num_provider == 1) {
provider = provider_table[num_provider - 1];
}
#if OPAL_OFI_PCI_DATA_AVAILABLE
@ -412,8 +497,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
#if OPAL_ENABLE_DEBUG
opal_output_verbose(1, opal_common_ofi.output,
"local rank: %d device: %s cpusets match: %s\n",
local_index, provider->domain_attr->name,
"package rank: %d device: %s cpusets match: %s\n",
package_rank, provider->domain_attr->name,
cpusets_match ? "true" : "false");
#endif

View File

@ -19,6 +19,7 @@
#include "opal_config.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/base/mca_base_framework.h"
#include "opal/util/proc.h"
#include <rdma/fabric.h>
BEGIN_C_DECLS
@ -36,8 +37,7 @@ extern opal_common_ofi_module_t opal_common_ofi;
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
char *framework_name);
/*
* @param list (IN) List of strings corresponding to lower providers.
* @param item (IN) Single string corresponding to a provider.
@ -56,6 +56,9 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
END_C_DECLS
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list,
int32_t num_local_peers,
uint16_t my_local_rank,
char *cpuset, uint32_t pid);
#endif /* OPAL_MCA_COMMON_OFI_H */

View File

@ -0,0 +1,14 @@
# -*- text -*-
#
# Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[package_rank failed]
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
but had insufficient information to ensure MPI processes fairly pick a NIC for use.
This may negatively impact performance. A more modern PMIx server is necessary to
resolve this issue.