Merge pull request #8153 from dancejic/multi
Using package_rank to select between NIC of equal distance from the process.
Этот коммит содержится в:
Коммит
e9e5dab8b9
@ -133,6 +133,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
|
||||
[$opal_check_fi_info_pci],
|
||||
[check if pci data is available in ofi])
|
||||
|
||||
AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
|
||||
[],
|
||||
[],
|
||||
[#include <pmix.h>])
|
||||
|
||||
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
|
||||
LDFLAGS=$opal_check_ofi_save_LDFLAGS
|
||||
LIBS=$opal_check_ofi_save_LIBS
|
||||
|
@ -15,6 +15,7 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "opal_config.h"
|
||||
#include "mtl_ofi.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/printf.h"
|
||||
@ -337,7 +338,7 @@ select_ofi_provider(struct fi_info *providers,
|
||||
__FILE__, __LINE__,
|
||||
(prov ? prov->fabric_attr->prov_name : "none"));
|
||||
|
||||
/* The initial fi_getinfo() call will return a list of providers
|
||||
/** The initial provider selection will return a list of providers
|
||||
* available for this process. once a provider is selected from the
|
||||
* list, we will cycle through the remaining list to identify NICs
|
||||
* serviced by this provider, and try to pick one on the same NUMA
|
||||
@ -350,9 +351,13 @@ select_ofi_provider(struct fi_info *providers,
|
||||
* attributes for the same NIC. The initial provider attributes
|
||||
* are used to ensure that all NICs we return provide the same
|
||||
* capabilities as the initial one.
|
||||
*
|
||||
* We use package rank to select between NICs of equal distance
|
||||
* if we cannot calculate a package_rank, we fall back to using the
|
||||
* process id.
|
||||
*/
|
||||
if (NULL != prov) {
|
||||
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
|
||||
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info);
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:provider: %s\n",
|
||||
__FILE__, __LINE__,
|
||||
@ -1170,6 +1175,3 @@ finalize_err:
|
||||
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -391,7 +391,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
|
||||
* are used to ensure that all NICs we return provide the same
|
||||
* capabilities as the initial one.
|
||||
*/
|
||||
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
|
||||
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info);
|
||||
rc = mca_btl_ofi_init_device(selected_info);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
info = selected_info;
|
||||
|
@ -31,6 +31,8 @@
|
||||
|
||||
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
|
||||
|
||||
dist_opaldata_DATA = help-common-ofi.txt
|
||||
|
||||
# Header files
|
||||
|
||||
headers = \
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "opal_config.h"
|
||||
#include "common_ofi.h"
|
||||
#include "opal_config.h"
|
||||
#include "opal/constants.h"
|
||||
@ -23,6 +24,8 @@
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
#include "opal/mca/base/mca_base_framework.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
|
||||
.prov_include = NULL,
|
||||
@ -281,6 +284,79 @@ count_providers(struct fi_info* provider_list)
|
||||
return num_provider;
|
||||
}
|
||||
|
||||
/* Calculate the currrent process package rank.
|
||||
* @param (IN) process_info struct opal_process_info_t information
|
||||
* about the current process. used to get
|
||||
* num_local_peers, myprocid.rank, and
|
||||
* my_local_rank.
|
||||
*
|
||||
* @param (OUT) uint32_t package rank or myprocid.rank
|
||||
*
|
||||
* If successful, returns PMIX_PACKAGE_RANK, or an
|
||||
* equivalent calculated package rank.
|
||||
* otherwise falls back to using opal_process_info.myprocid.rank
|
||||
* this can affect performance, but is unlikely to happen.
|
||||
*/
|
||||
static uint32_t get_package_rank(opal_process_info_t process_info)
|
||||
{
|
||||
int i;
|
||||
uint16_t relative_locality, *package_rank_ptr;
|
||||
uint16_t current_package_rank = 0;
|
||||
uint16_t package_ranks[process_info.num_local_peers];
|
||||
opal_process_name_t pname;
|
||||
opal_status_t rc;
|
||||
char **peers = NULL;
|
||||
char *local_peers = NULL;
|
||||
char *locality_string = NULL;
|
||||
|
||||
pname.jobid = OPAL_PROC_MY_NAME.jobid;
|
||||
pname.vpid = OPAL_VPID_WILDCARD;
|
||||
|
||||
#if HAVE_DECL_PMIX_PACKAGE_RANK
|
||||
// Try to get the PACKAGE_RANK from PMIx
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PACKAGE_RANK,
|
||||
&pname, &package_rank_ptr, PMIX_UINT16);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
return (uint32_t)*package_rank_ptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Get the local peers
|
||||
OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS,
|
||||
&pname, &local_peers, PMIX_STRING);
|
||||
if (PMIX_SUCCESS != rc || NULL == local_peers) {
|
||||
// We can't find package_rank, fall back to procid
|
||||
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
|
||||
return (uint32_t)process_info.myprocid.rank;
|
||||
}
|
||||
peers = opal_argv_split(local_peers, ',');
|
||||
free(local_peers);
|
||||
|
||||
for (i = 0; NULL != peers[i]; i++) {
|
||||
pname.vpid = strtoul(peers[i], NULL, 10);
|
||||
locality_string = NULL;
|
||||
// Get the LOCALITY_STRING for process[i]
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
|
||||
&pname, &locality_string, PMIX_STRING);
|
||||
if (PMIX_SUCCESS != rc || NULL == locality_string) {
|
||||
// If we don't have information about locality, fall back to procid
|
||||
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
|
||||
return (uint32_t)process_info.myprocid.rank;
|
||||
}
|
||||
|
||||
// compute relative locality
|
||||
relative_locality = opal_hwloc_compute_relative_locality(process_info.cpuset, locality_string);
|
||||
free(locality_string);
|
||||
|
||||
if (relative_locality & OPAL_PROC_ON_SOCKET) {
|
||||
package_ranks[i] = current_package_rank;
|
||||
current_package_rank++;
|
||||
}
|
||||
}
|
||||
|
||||
return (uint32_t)package_ranks[process_info.my_local_rank];
|
||||
}
|
||||
|
||||
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
|
||||
*
|
||||
* Initializes opal_hwloc_topology to access hardware topology if not previously
|
||||
@ -318,11 +394,13 @@ count_providers(struct fi_info* provider_list)
|
||||
* selection. This provider is returned if the
|
||||
* NIC selection fails.
|
||||
*
|
||||
* @param local_index (IN) int The local rank of the process. Used to
|
||||
* @param package_rank (IN) uint32_t The rank of the process. Used to
|
||||
* select one valid NIC if there is a case
|
||||
* where more than one can be selected. This
|
||||
* could occur when more than one provider
|
||||
* shares the same cpuset as the process.
|
||||
* This could either be a package_rank if one is
|
||||
* successfully calculated, or the process id.
|
||||
*
|
||||
* @param provider (OUT) struct fi_info* object with the selected
|
||||
* provider if the selection succeeds
|
||||
@ -335,7 +413,7 @@ count_providers(struct fi_info* provider_list)
|
||||
* balance across available NICs.
|
||||
*/
|
||||
struct fi_info*
|
||||
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
|
||||
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info)
|
||||
{
|
||||
struct fi_info *provider = provider_list, *current_provider = provider_list;
|
||||
struct fi_info **provider_table;
|
||||
@ -343,6 +421,7 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
|
||||
struct fi_pci_attr pci;
|
||||
#endif
|
||||
int ret;
|
||||
uint32_t package_rank;
|
||||
unsigned int num_provider = 0, provider_limit = 0;
|
||||
bool provider_found = false, cpusets_match = false;
|
||||
|
||||
@ -399,8 +478,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
|
||||
}
|
||||
|
||||
/* Select provider from local rank % number of providers */
|
||||
if (num_provider > 0) {
|
||||
provider = provider_table[local_index % num_provider];
|
||||
if (num_provider >= 2) {
|
||||
// If there are multiple NICs "close" to the process, try to calculate package_rank
|
||||
package_rank = get_package_rank(process_info);
|
||||
provider = provider_table[package_rank % num_provider];
|
||||
} else if (num_provider == 1) {
|
||||
provider = provider_table[num_provider - 1];
|
||||
}
|
||||
|
||||
#if OPAL_OFI_PCI_DATA_AVAILABLE
|
||||
@ -412,8 +495,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
|
||||
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"local rank: %d device: %s cpusets match: %s\n",
|
||||
local_index, provider->domain_attr->name,
|
||||
"package rank: %d device: %s cpusets match: %s\n",
|
||||
package_rank, provider->domain_attr->name,
|
||||
cpusets_match ? "true" : "false");
|
||||
#endif
|
||||
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include "opal_config.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
#include "opal/mca/base/mca_base_framework.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include <rdma/fabric.h>
|
||||
|
||||
BEGIN_C_DECLS
|
||||
@ -36,8 +37,7 @@ extern opal_common_ofi_module_t opal_common_ofi;
|
||||
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
|
||||
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
|
||||
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
|
||||
OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
|
||||
char *framework_name);
|
||||
|
||||
/*
|
||||
* @param list (IN) List of strings corresponding to lower providers.
|
||||
* @param item (IN) Single string corresponding to a provider.
|
||||
@ -56,6 +56,6 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
|
||||
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info);
|
||||
|
||||
#endif /* OPAL_MCA_COMMON_OFI_H */
|
||||
|
14
opal/mca/common/ofi/help-common-ofi.txt
Обычный файл
14
opal/mca/common/ofi/help-common-ofi.txt
Обычный файл
@ -0,0 +1,14 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[package_rank failed]
|
||||
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
|
||||
but had insufficient information to ensure MPI processes fairly pick a NIC for use.
|
||||
This may negatively impact performance. A more modern PMIx server is necessary to
|
||||
resolve this issue.
|
Загрузка…
Ссылка в новой задаче
Block a user