Merge pull request #8153 from dancejic/multi
Using package_rank to select between NIC of equal distance from the process.
Этот коммит содержится в:
Коммит
e9e5dab8b9
@ -133,6 +133,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
|
|||||||
[$opal_check_fi_info_pci],
|
[$opal_check_fi_info_pci],
|
||||||
[check if pci data is available in ofi])
|
[check if pci data is available in ofi])
|
||||||
|
|
||||||
|
AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[#include <pmix.h>])
|
||||||
|
|
||||||
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
|
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
|
||||||
LDFLAGS=$opal_check_ofi_save_LDFLAGS
|
LDFLAGS=$opal_check_ofi_save_LDFLAGS
|
||||||
LIBS=$opal_check_ofi_save_LIBS
|
LIBS=$opal_check_ofi_save_LIBS
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
* $HEADER$
|
* $HEADER$
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "opal_config.h"
|
||||||
#include "mtl_ofi.h"
|
#include "mtl_ofi.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/printf.h"
|
#include "opal/util/printf.h"
|
||||||
@ -337,7 +338,7 @@ select_ofi_provider(struct fi_info *providers,
|
|||||||
__FILE__, __LINE__,
|
__FILE__, __LINE__,
|
||||||
(prov ? prov->fabric_attr->prov_name : "none"));
|
(prov ? prov->fabric_attr->prov_name : "none"));
|
||||||
|
|
||||||
/* The initial fi_getinfo() call will return a list of providers
|
/** The initial provider selection will return a list of providers
|
||||||
* available for this process. once a provider is selected from the
|
* available for this process. once a provider is selected from the
|
||||||
* list, we will cycle through the remaining list to identify NICs
|
* list, we will cycle through the remaining list to identify NICs
|
||||||
* serviced by this provider, and try to pick one on the same NUMA
|
* serviced by this provider, and try to pick one on the same NUMA
|
||||||
@ -350,9 +351,13 @@ select_ofi_provider(struct fi_info *providers,
|
|||||||
* attributes for the same NIC. The initial provider attributes
|
* attributes for the same NIC. The initial provider attributes
|
||||||
* are used to ensure that all NICs we return provide the same
|
* are used to ensure that all NICs we return provide the same
|
||||||
* capabilities as the initial one.
|
* capabilities as the initial one.
|
||||||
|
*
|
||||||
|
* We use package rank to select between NICs of equal distance
|
||||||
|
* if we cannot calculate a package_rank, we fall back to using the
|
||||||
|
* process id.
|
||||||
*/
|
*/
|
||||||
if (NULL != prov) {
|
if (NULL != prov) {
|
||||||
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
|
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info);
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
"%s:%d: mtl:ofi:provider: %s\n",
|
"%s:%d: mtl:ofi:provider: %s\n",
|
||||||
__FILE__, __LINE__,
|
__FILE__, __LINE__,
|
||||||
@ -1170,6 +1175,3 @@ finalize_err:
|
|||||||
|
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -391,7 +391,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
|
|||||||
* are used to ensure that all NICs we return provide the same
|
* are used to ensure that all NICs we return provide the same
|
||||||
* capabilities as the initial one.
|
* capabilities as the initial one.
|
||||||
*/
|
*/
|
||||||
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
|
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info);
|
||||||
rc = mca_btl_ofi_init_device(selected_info);
|
rc = mca_btl_ofi_init_device(selected_info);
|
||||||
if (OPAL_SUCCESS == rc) {
|
if (OPAL_SUCCESS == rc) {
|
||||||
info = selected_info;
|
info = selected_info;
|
||||||
|
@ -31,6 +31,8 @@
|
|||||||
|
|
||||||
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
|
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
|
||||||
|
|
||||||
|
dist_opaldata_DATA = help-common-ofi.txt
|
||||||
|
|
||||||
# Header files
|
# Header files
|
||||||
|
|
||||||
headers = \
|
headers = \
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include "opal_config.h"
|
||||||
#include "common_ofi.h"
|
#include "common_ofi.h"
|
||||||
#include "opal_config.h"
|
#include "opal_config.h"
|
||||||
#include "opal/constants.h"
|
#include "opal/constants.h"
|
||||||
@ -23,6 +24,8 @@
|
|||||||
#include "opal/mca/base/mca_base_var.h"
|
#include "opal/mca/base/mca_base_var.h"
|
||||||
#include "opal/mca/base/mca_base_framework.h"
|
#include "opal/mca/base/mca_base_framework.h"
|
||||||
#include "opal/mca/hwloc/base/base.h"
|
#include "opal/mca/hwloc/base/base.h"
|
||||||
|
#include "opal/mca/pmix/base/base.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
|
||||||
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
|
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
|
||||||
.prov_include = NULL,
|
.prov_include = NULL,
|
||||||
@ -281,6 +284,79 @@ count_providers(struct fi_info* provider_list)
|
|||||||
return num_provider;
|
return num_provider;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Calculate the currrent process package rank.
|
||||||
|
* @param (IN) process_info struct opal_process_info_t information
|
||||||
|
* about the current process. used to get
|
||||||
|
* num_local_peers, myprocid.rank, and
|
||||||
|
* my_local_rank.
|
||||||
|
*
|
||||||
|
* @param (OUT) uint32_t package rank or myprocid.rank
|
||||||
|
*
|
||||||
|
* If successful, returns PMIX_PACKAGE_RANK, or an
|
||||||
|
* equivalent calculated package rank.
|
||||||
|
* otherwise falls back to using opal_process_info.myprocid.rank
|
||||||
|
* this can affect performance, but is unlikely to happen.
|
||||||
|
*/
|
||||||
|
static uint32_t get_package_rank(opal_process_info_t process_info)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
uint16_t relative_locality, *package_rank_ptr;
|
||||||
|
uint16_t current_package_rank = 0;
|
||||||
|
uint16_t package_ranks[process_info.num_local_peers];
|
||||||
|
opal_process_name_t pname;
|
||||||
|
opal_status_t rc;
|
||||||
|
char **peers = NULL;
|
||||||
|
char *local_peers = NULL;
|
||||||
|
char *locality_string = NULL;
|
||||||
|
|
||||||
|
pname.jobid = OPAL_PROC_MY_NAME.jobid;
|
||||||
|
pname.vpid = OPAL_VPID_WILDCARD;
|
||||||
|
|
||||||
|
#if HAVE_DECL_PMIX_PACKAGE_RANK
|
||||||
|
// Try to get the PACKAGE_RANK from PMIx
|
||||||
|
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PACKAGE_RANK,
|
||||||
|
&pname, &package_rank_ptr, PMIX_UINT16);
|
||||||
|
if (PMIX_SUCCESS == rc) {
|
||||||
|
return (uint32_t)*package_rank_ptr;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Get the local peers
|
||||||
|
OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS,
|
||||||
|
&pname, &local_peers, PMIX_STRING);
|
||||||
|
if (PMIX_SUCCESS != rc || NULL == local_peers) {
|
||||||
|
// We can't find package_rank, fall back to procid
|
||||||
|
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
|
||||||
|
return (uint32_t)process_info.myprocid.rank;
|
||||||
|
}
|
||||||
|
peers = opal_argv_split(local_peers, ',');
|
||||||
|
free(local_peers);
|
||||||
|
|
||||||
|
for (i = 0; NULL != peers[i]; i++) {
|
||||||
|
pname.vpid = strtoul(peers[i], NULL, 10);
|
||||||
|
locality_string = NULL;
|
||||||
|
// Get the LOCALITY_STRING for process[i]
|
||||||
|
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
|
||||||
|
&pname, &locality_string, PMIX_STRING);
|
||||||
|
if (PMIX_SUCCESS != rc || NULL == locality_string) {
|
||||||
|
// If we don't have information about locality, fall back to procid
|
||||||
|
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
|
||||||
|
return (uint32_t)process_info.myprocid.rank;
|
||||||
|
}
|
||||||
|
|
||||||
|
// compute relative locality
|
||||||
|
relative_locality = opal_hwloc_compute_relative_locality(process_info.cpuset, locality_string);
|
||||||
|
free(locality_string);
|
||||||
|
|
||||||
|
if (relative_locality & OPAL_PROC_ON_SOCKET) {
|
||||||
|
package_ranks[i] = current_package_rank;
|
||||||
|
current_package_rank++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (uint32_t)package_ranks[process_info.my_local_rank];
|
||||||
|
}
|
||||||
|
|
||||||
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
|
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
|
||||||
*
|
*
|
||||||
* Initializes opal_hwloc_topology to access hardware topology if not previously
|
* Initializes opal_hwloc_topology to access hardware topology if not previously
|
||||||
@ -318,11 +394,13 @@ count_providers(struct fi_info* provider_list)
|
|||||||
* selection. This provider is returned if the
|
* selection. This provider is returned if the
|
||||||
* NIC selection fails.
|
* NIC selection fails.
|
||||||
*
|
*
|
||||||
* @param local_index (IN) int The local rank of the process. Used to
|
* @param process_info (IN) opal_process_info_t Process info; its rank is used to
|
||||||
* select one valid NIC if there is a case
|
* select one valid NIC if there is a case
|
||||||
* where more than one can be selected. This
|
* where more than one can be selected. This
|
||||||
* could occur when more than one provider
|
* could occur when more than one provider
|
||||||
* shares the same cpuset as the process.
|
* shares the same cpuset as the process.
|
||||||
|
* This could either be a package_rank if one is
|
||||||
|
* successfully calculated, or the process id.
|
||||||
*
|
*
|
||||||
* @param provider (OUT) struct fi_info* object with the selected
|
* @param provider (OUT) struct fi_info* object with the selected
|
||||||
* provider if the selection succeeds
|
* provider if the selection succeeds
|
||||||
@ -335,7 +413,7 @@ count_providers(struct fi_info* provider_list)
|
|||||||
* balance across available NICs.
|
* balance across available NICs.
|
||||||
*/
|
*/
|
||||||
struct fi_info*
|
struct fi_info*
|
||||||
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
|
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info)
|
||||||
{
|
{
|
||||||
struct fi_info *provider = provider_list, *current_provider = provider_list;
|
struct fi_info *provider = provider_list, *current_provider = provider_list;
|
||||||
struct fi_info **provider_table;
|
struct fi_info **provider_table;
|
||||||
@ -343,6 +421,7 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
|
|||||||
struct fi_pci_attr pci;
|
struct fi_pci_attr pci;
|
||||||
#endif
|
#endif
|
||||||
int ret;
|
int ret;
|
||||||
|
uint32_t package_rank;
|
||||||
unsigned int num_provider = 0, provider_limit = 0;
|
unsigned int num_provider = 0, provider_limit = 0;
|
||||||
bool provider_found = false, cpusets_match = false;
|
bool provider_found = false, cpusets_match = false;
|
||||||
|
|
||||||
@ -399,8 +478,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Select provider from local rank % number of providers */
|
/* Select provider from package rank % number of providers */
|
||||||
if (num_provider > 0) {
|
if (num_provider >= 2) {
|
||||||
provider = provider_table[local_index % num_provider];
|
// If there are multiple NICs "close" to the process, try to calculate package_rank
|
||||||
|
package_rank = get_package_rank(process_info);
|
||||||
|
provider = provider_table[package_rank % num_provider];
|
||||||
|
} else if (num_provider == 1) {
|
||||||
|
provider = provider_table[num_provider - 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
#if OPAL_OFI_PCI_DATA_AVAILABLE
|
#if OPAL_OFI_PCI_DATA_AVAILABLE
|
||||||
@ -412,8 +495,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
|
|||||||
|
|
||||||
#if OPAL_ENABLE_DEBUG
|
#if OPAL_ENABLE_DEBUG
|
||||||
opal_output_verbose(1, opal_common_ofi.output,
|
opal_output_verbose(1, opal_common_ofi.output,
|
||||||
"local rank: %d device: %s cpusets match: %s\n",
|
"package rank: %d device: %s cpusets match: %s\n",
|
||||||
local_index, provider->domain_attr->name,
|
package_rank, provider->domain_attr->name,
|
||||||
cpusets_match ? "true" : "false");
|
cpusets_match ? "true" : "false");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -19,6 +19,7 @@
|
|||||||
#include "opal_config.h"
|
#include "opal_config.h"
|
||||||
#include "opal/mca/base/mca_base_var.h"
|
#include "opal/mca/base/mca_base_var.h"
|
||||||
#include "opal/mca/base/mca_base_framework.h"
|
#include "opal/mca/base/mca_base_framework.h"
|
||||||
|
#include "opal/util/proc.h"
|
||||||
#include <rdma/fabric.h>
|
#include <rdma/fabric.h>
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
@ -36,8 +37,7 @@ extern opal_common_ofi_module_t opal_common_ofi;
|
|||||||
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
|
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
|
||||||
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
|
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
|
||||||
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
|
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
|
||||||
OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
|
|
||||||
char *framework_name);
|
|
||||||
/*
|
/*
|
||||||
* @param list (IN) List of strings corresponding to lower providers.
|
* @param list (IN) List of strings corresponding to lower providers.
|
||||||
* @param item (IN) Single string corresponding to a provider.
|
* @param item (IN) Single string corresponding to a provider.
|
||||||
@ -56,6 +56,6 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
|
|||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
|
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info);
|
||||||
|
|
||||||
#endif /* OPAL_MCA_COMMON_OFI_H */
|
#endif /* OPAL_MCA_COMMON_OFI_H */
|
||||||
|
14
opal/mca/common/ofi/help-common-ofi.txt
Обычный файл
14
opal/mca/common/ofi/help-common-ofi.txt
Обычный файл
@ -0,0 +1,14 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
[package_rank failed]
|
||||||
|
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
|
||||||
|
but had insufficient information to ensure MPI processes fairly pick a NIC for use.
|
||||||
|
This may negatively impact performance. A more modern PMIx server is necessary to
|
||||||
|
resolve this issue.
|
Загрузка…
Ссылка в новой задаче
Block a user