1
1

Merge pull request #8153 from dancejic/multi

Using package_rank to select between NIC of equal distance from the process.
Этот коммит содержится в:
Jeff Squyres 2020-11-02 15:27:37 -05:00 коммит произвёл GitHub
родитель 5b25a06c7d 8017f12801
Коммит e9e5dab8b9
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
7 изменённых файлов: 121 добавлений и 15 удалений

Просмотреть файл

@ -133,6 +133,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
[$opal_check_fi_info_pci],
[check if pci data is available in ofi])
AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
[],
[],
[#include <pmix.h>])
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
LDFLAGS=$opal_check_ofi_save_LDFLAGS
LIBS=$opal_check_ofi_save_LIBS

Просмотреть файл

@ -15,6 +15,7 @@
* $HEADER$
*/
#include "opal_config.h"
#include "mtl_ofi.h"
#include "opal/util/argv.h"
#include "opal/util/printf.h"
@ -337,7 +338,7 @@ select_ofi_provider(struct fi_info *providers,
__FILE__, __LINE__,
(prov ? prov->fabric_attr->prov_name : "none"));
/* The initial fi_getinfo() call will return a list of providers
/** The initial provider selection will return a list of providers
* available for this process. once a provider is selected from the
* list, we will cycle through the remaining list to identify NICs
* serviced by this provider, and try to pick one on the same NUMA
@ -350,9 +351,13 @@ select_ofi_provider(struct fi_info *providers,
* attributes for the same NIC. The initial provider attributes
* are used to ensure that all NICs we return provide the same
* capabilities as the initial one.
*
* We use the package rank to select between NICs of equal distance.
* If we cannot calculate a package_rank, we fall back to using the
* process id.
*/
if (NULL != prov) {
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider: %s\n",
__FILE__, __LINE__,
@ -1170,6 +1175,3 @@ finalize_err:
return OMPI_ERROR;
}

Просмотреть файл

@ -391,7 +391,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
* are used to ensure that all NICs we return provide the same
* capabilities as the initial one.
*/
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info);
rc = mca_btl_ofi_init_device(selected_info);
if (OPAL_SUCCESS == rc) {
info = selected_info;

Просмотреть файл

@ -31,6 +31,8 @@
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
dist_opaldata_DATA = help-common-ofi.txt
# Header files
headers = \

Просмотреть файл

@ -16,6 +16,7 @@
#include <errno.h>
#include <unistd.h>
#include "opal_config.h"
#include "common_ofi.h"
#include "opal_config.h"
#include "opal/constants.h"
@ -23,6 +24,8 @@
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/base/mca_base_framework.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/util/show_help.h"
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
.prov_include = NULL,
@ -281,6 +284,79 @@ count_providers(struct fi_info* provider_list)
return num_provider;
}
/**
 * Calculate the current process's package rank.
 *
 * @param process_info (IN) opal_process_info_t  Information about the current
 *                          process. This function reads myprocid.rank,
 *                          my_local_rank and cpuset from it.
 *
 * @return uint32_t  The package rank, or myprocid.rank as a fallback.
 *
 * When PMIX_PACKAGE_RANK is declared and the lookup succeeds, that value is
 * returned directly. Otherwise the rank is derived by walking the
 * PMIX_LOCAL_PEERS list: every peer whose locality string reports
 * OPAL_PROC_ON_SOCKET relative to our cpuset is counted, and the count reached
 * at the entry whose list position equals my_local_rank is the package rank.
 * NOTE(review): this relies on the position of a peer in PMIX_LOCAL_PEERS
 * matching its local rank -- confirm against the PMIx server in use.
 *
 * If any lookup fails, or our own entry is not found among the peers on this
 * package, the "package_rank failed" help message is shown and
 * process_info.myprocid.rank is returned instead. This can affect NIC load
 * balancing, but is not expected to happen often.
 *
 * No per-peer array is kept, so the result does not depend on
 * num_local_peers matching the length of the peer list.
 */
static uint32_t get_package_rank(opal_process_info_t process_info)
{
    int i;
    uint16_t relative_locality;
    uint16_t current_package_rank = 0;
    uint16_t my_package_rank = 0;
    bool found = false;
    opal_process_name_t pname;
    opal_status_t rc;
    char **peers = NULL;
    char *local_peers = NULL;
    char *locality_string = NULL;

    pname.jobid = OPAL_PROC_MY_NAME.jobid;
    pname.vpid = OPAL_VPID_WILDCARD;

#if HAVE_DECL_PMIX_PACKAGE_RANK
    {
        uint16_t *package_rank_ptr = NULL;

        // Try to get the PACKAGE_RANK from PMIx
        OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PACKAGE_RANK,
                                       &pname, &package_rank_ptr, PMIX_UINT16);
        if (PMIX_SUCCESS == rc && NULL != package_rank_ptr) {
            return (uint32_t) *package_rank_ptr;
        }
    }
#endif

    // Get the local peers
    OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS,
                          &pname, &local_peers, PMIX_STRING);
    if (PMIX_SUCCESS != rc || NULL == local_peers) {
        goto fallback;
    }
    peers = opal_argv_split(local_peers, ',');
    free(local_peers);
    if (NULL == peers) {
        goto fallback;
    }

    for (i = 0; NULL != peers[i]; i++) {
        pname.vpid = strtoul(peers[i], NULL, 10);
        locality_string = NULL;
        // Get the LOCALITY_STRING for process[i]
        OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
                                       &pname, &locality_string, PMIX_STRING);
        if (PMIX_SUCCESS != rc || NULL == locality_string) {
            // Without locality information for every peer the count is
            // meaningless, so give up and fall back to procid
            goto fallback;
        }

        // compute relative locality
        relative_locality = opal_hwloc_compute_relative_locality(process_info.cpuset,
                                                                 locality_string);
        free(locality_string);

        if (relative_locality & OPAL_PROC_ON_SOCKET) {
            if (i == (int) process_info.my_local_rank) {
                // Number of same-package peers listed before us
                my_package_rank = current_package_rank;
                found = true;
            }
            current_package_rank++;
        }
    }

    if (found) {
        opal_argv_free(peers);
        return (uint32_t) my_package_rank;
    }

fallback:
    // We can't find package_rank, fall back to procid
    if (NULL != peers) {
        opal_argv_free(peers);
    }
    opal_show_help("help-common-ofi.txt", "package_rank failed", true);
    return (uint32_t) process_info.myprocid.rank;
}
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
*
* Initializes opal_hwloc_topology to access hardware topology if not previously
@ -318,11 +394,13 @@ count_providers(struct fi_info* provider_list)
* selection. This provider is returned if the
* NIC selection fails.
*
* @param local_index (IN) int The local rank of the process. Used to
* @param package_rank (IN) uint32_t The rank of the process. Used to
* select one valid NIC if there is a case
* where more than one can be selected. This
* could occur when more than one provider
* shares the same cpuset as the process.
* This could either be a package_rank if one is
* successfully calculated, or the process id.
*
* @param provider (OUT) struct fi_info* object with the selected
* provider if the selection succeeds
@ -335,7 +413,7 @@ count_providers(struct fi_info* provider_list)
* balance across available NICs.
*/
struct fi_info*
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info)
{
struct fi_info *provider = provider_list, *current_provider = provider_list;
struct fi_info **provider_table;
@ -343,6 +421,7 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
struct fi_pci_attr pci;
#endif
int ret;
uint32_t package_rank;
unsigned int num_provider = 0, provider_limit = 0;
bool provider_found = false, cpusets_match = false;
@ -399,8 +478,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
}
/* Select provider from package rank % number of providers */
if (num_provider > 0) {
provider = provider_table[local_index % num_provider];
if (num_provider >= 2) {
// If there are multiple NICs "close" to the process, try to calculate package_rank
package_rank = get_package_rank(process_info);
provider = provider_table[package_rank % num_provider];
} else if (num_provider == 1) {
provider = provider_table[num_provider - 1];
}
#if OPAL_OFI_PCI_DATA_AVAILABLE
@ -412,8 +495,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
#if OPAL_ENABLE_DEBUG
opal_output_verbose(1, opal_common_ofi.output,
"local rank: %d device: %s cpusets match: %s\n",
local_index, provider->domain_attr->name,
"package rank: %d device: %s cpusets match: %s\n",
package_rank, provider->domain_attr->name,
cpusets_match ? "true" : "false");
#endif

Просмотреть файл

@ -19,6 +19,7 @@
#include "opal_config.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/base/mca_base_framework.h"
#include "opal/util/proc.h"
#include <rdma/fabric.h>
BEGIN_C_DECLS
@ -36,8 +37,7 @@ extern opal_common_ofi_module_t opal_common_ofi;
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
char *framework_name);
/*
* @param list (IN) List of strings corresponding to lower providers.
* @param item (IN) Single string corresponding to a provider.
@ -56,6 +56,6 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
END_C_DECLS
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info);
#endif /* OPAL_MCA_COMMON_OFI_H */

14
opal/mca/common/ofi/help-common-ofi.txt Обычный файл
Просмотреть файл

@ -0,0 +1,14 @@
# -*- text -*-
#
# Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[package_rank failed]
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
but had insufficient information to ensure MPI processes fairly pick a NIC for use.
This may negatively impact performance. A more modern PMIx server is necessary to
resolve this issue.