Merge pull request #7134 from wckzhang/btl_tcp_interface_match
btl tcp: Use reachability and graph solving for global interface matching
Этот коммит содержится в:
Коммит
fc8c7a5869
@ -15,6 +15,8 @@
|
||||
* Copyright (c) 2016-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -101,12 +103,6 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl,
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to make sure that the peer has at least as many interface
|
||||
* addresses exported as we are trying to use. If not, then
|
||||
* don't bind this BTL instance to the proc.
|
||||
*/
|
||||
|
||||
OPAL_THREAD_LOCK(&tcp_proc->proc_lock);
|
||||
|
||||
for (uint32_t j = 0 ; j < (uint32_t)tcp_proc->proc_endpoint_count ; ++j) {
|
||||
|
@ -15,6 +15,8 @@
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -107,6 +109,7 @@ struct mca_btl_tcp_component_t {
|
||||
uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */
|
||||
unsigned int tcp_num_links; /**< number of logical links per physical device */
|
||||
struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */
|
||||
opal_list_t local_ifs; /**< opal list of local opal_if_t interfaces */
|
||||
int tcp_free_list_num; /**< initial size of free lists */
|
||||
int tcp_free_list_max; /**< maximum size of free lists */
|
||||
int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */
|
||||
@ -163,6 +166,9 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component;
|
||||
*/
|
||||
struct mca_btl_tcp_module_t {
|
||||
mca_btl_base_module_t super; /**< base BTL interface */
|
||||
uint32_t btl_index; /**< Local BTL module index, used for vertex
|
||||
data and used as a hash key when
|
||||
solving module matching problem */
|
||||
uint16_t tcp_ifkindex; /** <BTL kernel interface index */
|
||||
struct sockaddr_storage tcp_ifaddr; /**< First address
|
||||
discovered for this
|
||||
|
@ -9,6 +9,9 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -30,37 +33,43 @@
|
||||
#ifdef HAVE_NETINET_IN_H
|
||||
#include <netinet/in.h>
|
||||
#endif
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
/**
|
||||
* Modex address structure.
|
||||
*
|
||||
* One of these structures will be sent for every btl module in use by
|
||||
* the local BTL TCP component.
|
||||
* the local BTL TCP component. This is used to construct an opal_if_t
|
||||
* structure for the reachability component as well as populate the
|
||||
* mca_btl_tcp_addr_t structure on remote procs. These will be used
|
||||
* for interface matching and filling out the mca_btl_base_endpoint_t
|
||||
* structure.
|
||||
*/
|
||||
struct mca_btl_tcp_modex_addr_t {
|
||||
uint8_t addr[16]; /* endpoint address. for addr_family
|
||||
of MCA_BTL_TCP_AF_INET, only the
|
||||
first 4 bytes have meaning. */
|
||||
uint32_t addr_ifkindex; /* endpoint kernel index */
|
||||
uint32_t addr_mask; /* ip mask */
|
||||
uint32_t addr_bandwidth; /* interface bandwidth */
|
||||
uint16_t addr_port; /* endpoint listen port */
|
||||
uint8_t addr_family; /* endpoint address family. Note that
|
||||
this is
|
||||
MCA_BTL_TCP_AF_{INET,INET6}, not
|
||||
the traditional
|
||||
AF_INET/AF_INET6. */
|
||||
uint8_t padding[1]; /* padd out to an 8-byte word */
|
||||
uint8_t padding[1]; /* pad out to an 8-byte word */
|
||||
};
|
||||
typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t;
|
||||
|
||||
_Static_assert(sizeof(struct mca_btl_tcp_modex_addr_t) == 32, "mca_btl_tcp_modex_addr_t");
|
||||
|
||||
/**
|
||||
* Remote peer address structure
|
||||
*
|
||||
* One of these structures will be allocated for every remote endpoint
|
||||
* associated with a remote proc. The data is pulled from the
|
||||
* mca_btl_tcp_modex_addr_t structure, except for the addr_inuse
|
||||
* field, which is local.
|
||||
* mca_btl_tcp_modex_addr_t structure.
|
||||
*/
|
||||
struct mca_btl_tcp_addr_t {
|
||||
union {
|
||||
@ -73,7 +82,6 @@ struct mca_btl_tcp_addr_t {
|
||||
int addr_ifkindex; /**< remote interface index assigned with
|
||||
this address */
|
||||
uint8_t addr_family; /**< AF_INET or AF_INET6 */
|
||||
bool addr_inuse; /**< local meaning only */
|
||||
};
|
||||
typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t;
|
||||
|
||||
|
@ -19,7 +19,8 @@
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -69,6 +70,7 @@
|
||||
#include "opal/util/net.h"
|
||||
#include "opal/util/fd.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/string_copy.h"
|
||||
#include "opal/util/printf.h"
|
||||
#include "opal/constants.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
@ -76,6 +78,7 @@
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "opal/mca/btl/base/btl_base_error.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/mca/reachable/base/base.h"
|
||||
#include "opal/threads/threads.h"
|
||||
|
||||
#include "opal/constants.h"
|
||||
@ -368,6 +371,7 @@ static int mca_btl_tcp_component_open(void)
|
||||
mca_btl_tcp_component.tcp_btls = NULL;
|
||||
|
||||
/* initialize objects */
|
||||
OBJ_CONSTRUCT(&mca_btl_tcp_component.local_ifs, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_proc_table_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
|
||||
@ -477,6 +481,7 @@ static int mca_btl_tcp_component_close(void)
|
||||
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max);
|
||||
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
|
||||
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
|
||||
OBJ_DESTRUCT(&mca_btl_tcp_component.local_ifs);
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_common_cuda_fini();
|
||||
@ -493,8 +498,9 @@ static int mca_btl_tcp_component_close(void)
|
||||
static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
|
||||
{
|
||||
struct mca_btl_tcp_module_t* btl;
|
||||
opal_if_t *copied_interface, *selected_interface;
|
||||
char param[256];
|
||||
int i;
|
||||
int i, if_index;
|
||||
struct sockaddr_storage addr;
|
||||
bool found = false;
|
||||
|
||||
@ -515,18 +521,15 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
|
||||
* 10.1.0.1 as the one that is published in the modex and used for
|
||||
* connection.
|
||||
*/
|
||||
for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) {
|
||||
int ret;
|
||||
|
||||
if (if_kindex != opal_ifindextokindex(i)) {
|
||||
OPAL_LIST_FOREACH(selected_interface, &opal_if_list, opal_if_t) {
|
||||
if (if_kindex != selected_interface->if_kernel_index) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr,
|
||||
sizeof(struct sockaddr_storage));
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
if_index = selected_interface->if_index;
|
||||
|
||||
memcpy((struct sockaddr*)&addr, &selected_interface->if_addr,
|
||||
MIN(sizeof(struct sockaddr_storage), sizeof(selected_interface->if_addr)));
|
||||
|
||||
if (addr.ss_family == AF_INET &&
|
||||
4 != mca_btl_tcp_component.tcp_disable_family) {
|
||||
@ -548,12 +551,19 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
|
||||
btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t));
|
||||
if(NULL == btl)
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
copied_interface = OBJ_NEW(opal_if_t);
|
||||
if (NULL == copied_interface) {
|
||||
free(btl);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
memcpy(btl, &mca_btl_tcp_module, sizeof(mca_btl_tcp_module));
|
||||
OBJ_CONSTRUCT(&btl->tcp_endpoints, opal_list_t);
|
||||
OBJ_CONSTRUCT(&btl->tcp_endpoints_mutex, opal_mutex_t);
|
||||
mca_btl_tcp_component.tcp_btls[mca_btl_tcp_component.tcp_num_btls++] = btl;
|
||||
|
||||
/* initialize the btl */
|
||||
/* This index is used as a key for a hash table used for interface matching. */
|
||||
btl->btl_index = mca_btl_tcp_component.tcp_num_btls - 1;
|
||||
btl->tcp_ifkindex = (uint16_t) if_kindex;
|
||||
#if MCA_BTL_TCP_STATISTICS
|
||||
btl->tcp_bytes_recv = 0;
|
||||
@ -562,6 +572,7 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
|
||||
#endif
|
||||
|
||||
memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage));
|
||||
btl->tcp_ifmask = selected_interface->if_mask;
|
||||
|
||||
/* allow user to specify interface bandwidth */
|
||||
sprintf(param, "bandwidth_%s", if_name);
|
||||
@ -603,6 +614,21 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
|
||||
}
|
||||
}
|
||||
|
||||
/* Add another entry to the local interface list */
|
||||
opal_string_copy(copied_interface->if_name, if_name, OPAL_IF_NAMESIZE);
|
||||
copied_interface->if_index = if_index;
|
||||
copied_interface->if_kernel_index = btl->tcp_ifkindex;
|
||||
copied_interface->af_family = btl->tcp_ifaddr.ss_family;
|
||||
copied_interface->if_flags = selected_interface->if_flags;
|
||||
copied_interface->if_speed = selected_interface->if_speed;
|
||||
memcpy(&copied_interface->if_addr, &btl->tcp_ifaddr, sizeof(struct sockaddr_storage));
|
||||
copied_interface->if_mask = selected_interface->if_mask;
|
||||
copied_interface->if_bandwidth = btl->super.btl_bandwidth;
|
||||
memcpy(&copied_interface->if_mac, &selected_interface->if_mac, sizeof(copied_interface->if_mac));
|
||||
copied_interface->ifmtu = selected_interface->ifmtu;
|
||||
|
||||
opal_list_append(&mca_btl_tcp_component.local_ifs, &(copied_interface->super));
|
||||
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n",
|
||||
(void*)btl, if_name, (int) btl->tcp_ifkindex, i,
|
||||
@ -1188,7 +1214,6 @@ static int mca_btl_tcp_component_exchange(void)
|
||||
memcpy(&addrs[i].addr, &(inaddr6->sin6_addr),
|
||||
sizeof(struct in6_addr));
|
||||
addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port;
|
||||
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
|
||||
addrs[i].addr_family = MCA_BTL_TCP_AF_INET6;
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"btl: tcp: exchange: %d %d IPv6 %s",
|
||||
@ -1202,7 +1227,6 @@ static int mca_btl_tcp_component_exchange(void)
|
||||
memcpy(&addrs[i].addr, &(inaddr->sin_addr),
|
||||
sizeof(struct in_addr));
|
||||
addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
|
||||
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
|
||||
addrs[i].addr_family = MCA_BTL_TCP_AF_INET;
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"btl: tcp: exchange: %d %d IPv4 %s",
|
||||
@ -1212,6 +1236,10 @@ static int mca_btl_tcp_component_exchange(void)
|
||||
BTL_ERROR(("Unexpected address family: %d", addr->sa_family));
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
|
||||
addrs[i].addr_mask = btl->tcp_ifmask;
|
||||
addrs[i].addr_bandwidth = btl->super.btl_bandwidth;
|
||||
}
|
||||
|
||||
OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
|
||||
|
@ -16,8 +16,11 @@
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* Copyright (c) 2013-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -36,6 +39,7 @@
|
||||
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/mca/btl/base/btl_base_error.h"
|
||||
#include "opal/mca/reachable/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/util/argv.h"
|
||||
@ -44,6 +48,8 @@
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/printf.h"
|
||||
#include "opal/util/string_copy.h"
|
||||
#include "opal/util/bipartite_graph.h"
|
||||
|
||||
#include "btl_tcp.h"
|
||||
#include "btl_tcp_proc.h"
|
||||
@ -51,21 +57,6 @@
|
||||
static void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc);
|
||||
static void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc);
|
||||
|
||||
struct mca_btl_tcp_proc_data_t {
|
||||
mca_btl_tcp_interface_t** local_interfaces;
|
||||
opal_hash_table_t local_kindex_to_index;
|
||||
size_t num_local_interfaces, max_local_interfaces;
|
||||
size_t num_peer_interfaces;
|
||||
opal_hash_table_t peer_kindex_to_index;
|
||||
unsigned int *best_assignment;
|
||||
int max_assignment_weight;
|
||||
int max_assignment_cardinality;
|
||||
enum mca_btl_tcp_connection_quality **weights;
|
||||
struct mca_btl_tcp_addr_t ***best_addr;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_tcp_proc_data_t mca_btl_tcp_proc_data_t;
|
||||
|
||||
OBJ_CLASS_INSTANCE( mca_btl_tcp_proc_t,
|
||||
opal_list_item_t,
|
||||
mca_btl_tcp_proc_construct,
|
||||
@ -79,6 +70,8 @@ void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* tcp_proc)
|
||||
tcp_proc->proc_endpoints = NULL;
|
||||
tcp_proc->proc_endpoint_count = 0;
|
||||
OBJ_CONSTRUCT(&tcp_proc->proc_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&tcp_proc->btl_index_to_endpoint, opal_hash_table_t);
|
||||
opal_hash_table_init(&tcp_proc->btl_index_to_endpoint, mca_btl_tcp_component.tcp_num_btls);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -103,9 +96,270 @@ void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc)
|
||||
if(NULL != tcp_proc->proc_addrs) {
|
||||
free(tcp_proc->proc_addrs);
|
||||
}
|
||||
OBJ_DESTRUCT(&tcp_proc->btl_index_to_endpoint);
|
||||
OBJ_DESTRUCT(&tcp_proc->proc_lock);
|
||||
}
|
||||
|
||||
static inline int mca_btl_tcp_proc_is_proc_left(opal_process_name_t a,
|
||||
opal_process_name_t b)
|
||||
{
|
||||
if (a.jobid != b.jobid) {
|
||||
return (a.jobid < b.jobid);
|
||||
} else {
|
||||
return (a.vpid < b.vpid);
|
||||
}
|
||||
}
|
||||
|
||||
#define MCA_BTL_TCP_PROC_LOCAL_VERTEX(index) (index)
|
||||
#define MCA_BTL_TCP_PROC_REMOTE_VERTEX(index) (index + mca_btl_tcp_component.tcp_num_btls)
|
||||
|
||||
/* This function builds a graph to match local and remote interfaces
|
||||
* together. It also populates the remote proc object.
|
||||
*
|
||||
* @param btl_proc (IN) Remote proc information
|
||||
* @param remote_addrs (IN) List of addresses from remote interfaces
|
||||
* @param local_proc_is_left (IN) Boolean indicator. If true, we set local process
|
||||
* interfaces to be on the left side of the graph.
|
||||
* If false, we set remote process interfaces to
|
||||
* be on the left side of the graph.
|
||||
* @param graph_out (OUT) Constructed and populated bipartite interface
|
||||
* graph with vertices as interfaces and negative
|
||||
* reachability weights as costs for the edges.
|
||||
* @return OPAL error code or success
|
||||
*
|
||||
* The vertices of this graph are the local and remote interfaces. Edges in
|
||||
* this graph are connections between the interfaces. Costs are computed as
|
||||
* negative weight which is calculated using the reachability framework.
|
||||
*
|
||||
* In order to mirror inputs on both the local and remote side when solving
|
||||
* interface matching from both sides, we require local_proc_is_left to
|
||||
* indicate whether the local interfaces should be on the left of the graph
|
||||
* or not.
|
||||
*
|
||||
* The remote list and proc_addrs are assembled and populated here so that
|
||||
* we can ensure that the vertex ordering matches the proc_addr ordering.
|
||||
* This allows us to pass the correct pointers to the vertex data for storage.
|
||||
*
|
||||
*/
|
||||
static int mca_btl_tcp_proc_create_interface_graph(mca_btl_tcp_proc_t* btl_proc,
|
||||
mca_btl_tcp_modex_addr_t* remote_addrs,
|
||||
int local_proc_is_left,
|
||||
opal_bp_graph_t **graph_out)
|
||||
{
|
||||
opal_bp_graph_t *graph = NULL;
|
||||
opal_reachable_t *results = NULL;
|
||||
opal_list_t *local_list = &mca_btl_tcp_component.local_ifs;
|
||||
opal_list_t *remote_list;
|
||||
int rc, v_index, x, y, cost, u, v, num_edges = 0;
|
||||
size_t i;
|
||||
|
||||
remote_list = OBJ_NEW(opal_list_t);
|
||||
if (NULL == remote_list) {
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* the modex and proc structures differ slightly, so copy the
|
||||
fields needed in the proc version */
|
||||
for (i = 0 ; i < btl_proc->proc_addr_count ; i++) {
|
||||
/* Construct opal_if_t objects for the remote interfaces */
|
||||
opal_if_t *interface = OBJ_NEW(opal_if_t);
|
||||
if (NULL == interface) {
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) {
|
||||
memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet,
|
||||
remote_addrs[i].addr, sizeof(struct in_addr));
|
||||
btl_proc->proc_addrs[i].addr_family = AF_INET;
|
||||
|
||||
memcpy(&((struct sockaddr_in *)&(interface->if_addr))->sin_addr,
|
||||
remote_addrs[i].addr, sizeof(struct in_addr));
|
||||
((struct sockaddr *)&(interface->if_addr))->sa_family = AF_INET;
|
||||
interface->af_family = AF_INET;
|
||||
} else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) {
|
||||
#if OPAL_ENABLE_IPV6
|
||||
memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet6,
|
||||
remote_addrs[i].addr, sizeof(struct in6_addr));
|
||||
btl_proc->proc_addrs[i].addr_family = AF_INET6;
|
||||
|
||||
memcpy(&((struct sockaddr_in6 *)&(interface->if_addr))->sin6_addr,
|
||||
remote_addrs[i].addr, sizeof(struct in6_addr));
|
||||
((struct sockaddr *)&(interface->if_addr))->sa_family = AF_INET6;
|
||||
interface->af_family = AF_INET6;
|
||||
#else
|
||||
rc = OPAL_ERR_NOT_SUPPORTED;
|
||||
OBJ_RELEASE(interface);
|
||||
goto out;
|
||||
#endif
|
||||
} else {
|
||||
BTL_ERROR(("Unexpected address family %d",
|
||||
(int)remote_addrs[i].addr_family));
|
||||
rc = OPAL_ERR_BAD_PARAM;
|
||||
OBJ_RELEASE(interface);
|
||||
goto out;
|
||||
}
|
||||
|
||||
btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
|
||||
btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;
|
||||
|
||||
interface->if_mask = remote_addrs[i].addr_mask;
|
||||
interface->if_bandwidth = remote_addrs[i].addr_bandwidth;
|
||||
|
||||
opal_list_append(remote_list, &(interface->super));
|
||||
}
|
||||
|
||||
rc = opal_bp_graph_create(NULL, NULL, &graph);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto out;
|
||||
}
|
||||
results = opal_reachable.reachable(local_list, remote_list);
|
||||
if (NULL == results) {
|
||||
rc = OPAL_ERROR;
|
||||
goto err_graph;
|
||||
}
|
||||
|
||||
/* Add vertices for each local node. These will store the btl index */
|
||||
for (x = 0; x < results->num_local; x++) {
|
||||
rc = opal_bp_graph_add_vertex(graph, &mca_btl_tcp_component.tcp_btls[x]->btl_index, &v_index);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto err_graph;
|
||||
}
|
||||
}
|
||||
|
||||
/* Add vertices for each remote node. These will store remote interface information */
|
||||
for (y = 0; y < results->num_remote; y++) {
|
||||
rc = opal_bp_graph_add_vertex(graph, &btl_proc->proc_addrs[y], &v_index);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto err_graph;
|
||||
}
|
||||
}
|
||||
|
||||
/* Add edges */
|
||||
for (x = 0; x < results->num_local; x++) {
|
||||
for (y = 0; y < results->num_remote; y++) {
|
||||
/* The bipartite assignment solver will optimize a graph for
|
||||
* least cost. Since weights vary from 0 as no connection and
|
||||
* higher weights as better connections (multiplied by some other
|
||||
* factors), higher weight is better. Thus, to achieve least cost,
|
||||
* we set cost as negative weight.
|
||||
*/
|
||||
cost = -results->weights[x][y];
|
||||
/* Skip edges with no connections */
|
||||
if (0 == cost) {
|
||||
continue;
|
||||
}
|
||||
if (local_proc_is_left) {
|
||||
u = MCA_BTL_TCP_PROC_LOCAL_VERTEX(x);
|
||||
v = MCA_BTL_TCP_PROC_REMOTE_VERTEX(y);
|
||||
} else {
|
||||
u = MCA_BTL_TCP_PROC_REMOTE_VERTEX(y);
|
||||
v = MCA_BTL_TCP_PROC_LOCAL_VERTEX(x);
|
||||
}
|
||||
rc = opal_bp_graph_add_edge(graph, u, v, cost, 1, NULL);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto err_graph;
|
||||
}
|
||||
num_edges++;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 == num_edges) {
|
||||
BTL_ERROR(("Unable to find reachable pairing between local and remote interfaces"));
|
||||
rc = OPAL_ERR_UNREACH;
|
||||
}
|
||||
|
||||
*graph_out = graph;
|
||||
goto out;
|
||||
|
||||
err_graph:
|
||||
if (NULL != graph) {
|
||||
opal_bp_graph_free(graph);
|
||||
}
|
||||
out:
|
||||
if (NULL != results) {
|
||||
free(results);
|
||||
}
|
||||
if (NULL != remote_list) {
|
||||
OBJ_RELEASE(remote_list);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* We store the matched interface data by using the btl_index as the key and
|
||||
* a pointer to a mca_btl_tcp_addr_t struct.
|
||||
*/
|
||||
static int mca_btl_tcp_proc_store_matched_interfaces(mca_btl_tcp_proc_t *btl_proc,
|
||||
int local_proc_is_left,
|
||||
opal_bp_graph_t *graph,
|
||||
int num_matched, int *matched_edges)
|
||||
{
|
||||
int rc = OPAL_SUCCESS;
|
||||
int i, left, right;
|
||||
uint32_t* local_index;
|
||||
struct mca_btl_tcp_addr_t *remote_addr;
|
||||
|
||||
for (i = 0; i < num_matched; i++) {
|
||||
left = matched_edges[2 * i + 0];
|
||||
right = matched_edges[2 * i + 1];
|
||||
if (local_proc_is_left) {
|
||||
rc = opal_bp_graph_get_vertex_data(graph, left, (void *)&local_index);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto out;
|
||||
}
|
||||
rc = opal_bp_graph_get_vertex_data(graph, right, (void *)&remote_addr);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
rc = opal_bp_graph_get_vertex_data(graph, right, (void *)&local_index);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto out;
|
||||
}
|
||||
rc = opal_bp_graph_get_vertex_data(graph, left, (void *)&remote_addr);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
opal_hash_table_set_value_uint32(&btl_proc->btl_index_to_endpoint, *local_index, (void *)remote_addr);
|
||||
}
|
||||
out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int mca_btl_tcp_proc_handle_modex_addresses(mca_btl_tcp_proc_t* btl_proc,
|
||||
mca_btl_tcp_modex_addr_t* remote_addrs,
|
||||
int local_proc_is_left)
|
||||
{
|
||||
opal_bp_graph_t *graph = NULL;
|
||||
int rc = OPAL_SUCCESS;
|
||||
int num_matched = 0;
|
||||
int *matched_edges = NULL;
|
||||
|
||||
rc = mca_btl_tcp_proc_create_interface_graph(btl_proc, remote_addrs, local_proc_is_left, &graph);
|
||||
if (rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rc = opal_bp_graph_solve_bipartite_assignment(graph, &num_matched, &matched_edges);
|
||||
if (rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rc = mca_btl_tcp_proc_store_matched_interfaces(btl_proc, local_proc_is_left,
|
||||
graph, num_matched, matched_edges);
|
||||
if (rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if (NULL != graph) {
|
||||
opal_bp_graph_free(graph);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a TCP process structure. There is a one-to-one correspondence
|
||||
* between a opal_proc_t and a mca_btl_tcp_proc_t instance. We cache
|
||||
@ -117,9 +371,9 @@ void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc)
|
||||
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
|
||||
{
|
||||
mca_btl_tcp_proc_t* btl_proc;
|
||||
int rc;
|
||||
int rc, local_proc_is_left;
|
||||
mca_btl_tcp_modex_addr_t *remote_addrs = NULL;
|
||||
size_t i, size;
|
||||
size_t size;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
|
||||
rc = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
|
||||
@ -168,34 +422,20 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* the modex and proc structures differ slightly, so copy the
|
||||
fields needed in the proc version */
|
||||
for (i = 0 ; i < btl_proc->proc_addr_count ; i++) {
|
||||
if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) {
|
||||
memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet,
|
||||
remote_addrs[i].addr, sizeof(struct in_addr));
|
||||
btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
|
||||
btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;
|
||||
btl_proc->proc_addrs[i].addr_family = AF_INET;
|
||||
btl_proc->proc_addrs[i].addr_inuse = false;
|
||||
} else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) {
|
||||
#if OPAL_ENABLE_IPV6
|
||||
memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet6,
|
||||
remote_addrs[i].addr, sizeof(struct in6_addr));
|
||||
btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
|
||||
btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;
|
||||
btl_proc->proc_addrs[i].addr_family = AF_INET6;
|
||||
btl_proc->proc_addrs[i].addr_inuse = false;
|
||||
#else
|
||||
rc = OPAL_ERR_NOT_SUPPORTED;
|
||||
goto cleanup;
|
||||
#endif
|
||||
} else {
|
||||
BTL_ERROR(("Unexpected address family %d",
|
||||
(int)remote_addrs[i].addr_family));
|
||||
rc = OPAL_ERR_BAD_PARAM;
|
||||
goto cleanup;
|
||||
}
|
||||
/* When solving for bipartite assignment, a graph with equal weights
|
||||
* can provide different outputs depending on the input parameters.
|
||||
* Thus two processes can construct different interface matchings.
|
||||
* To avoid this case, we put the process with the lower jobid on the
|
||||
* left or if they are equal, we use the lower vpid on the left.
|
||||
*
|
||||
* The concept of mirroring the local and remote sides is borrowed
|
||||
* from the usnic btl implementation of its bipartite assignment solver.
|
||||
*/
|
||||
local_proc_is_left = mca_btl_tcp_proc_is_proc_left(proc->proc_name, opal_proc_local_get()->proc_name);
|
||||
rc = mca_btl_tcp_proc_handle_modex_addresses(btl_proc, remote_addrs, local_proc_is_left);
|
||||
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* allocate space for endpoint array - one for each exported address */
|
||||
@ -230,236 +470,33 @@ cleanup:
|
||||
return btl_proc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void evaluate_assignment(mca_btl_tcp_proc_data_t *proc_data, int *a) {
|
||||
size_t i;
|
||||
unsigned int max_interfaces = proc_data->num_local_interfaces;
|
||||
int assignment_weight = 0;
|
||||
int assignment_cardinality = 0;
|
||||
|
||||
if(max_interfaces < proc_data->num_peer_interfaces) {
|
||||
max_interfaces = proc_data->num_peer_interfaces;
|
||||
}
|
||||
|
||||
for(i = 0; i < max_interfaces; ++i) {
|
||||
if(0 < proc_data->weights[i][a[i]-1]) {
|
||||
++assignment_cardinality;
|
||||
assignment_weight += proc_data->weights[i][a[i]-1];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* check wether current solution beats all previous solutions
|
||||
*/
|
||||
if(assignment_cardinality > proc_data->max_assignment_cardinality
|
||||
|| (assignment_cardinality == proc_data->max_assignment_cardinality
|
||||
&& assignment_weight > proc_data->max_assignment_weight)) {
|
||||
|
||||
for(i = 0; i < max_interfaces; ++i) {
|
||||
proc_data->best_assignment[i] = a[i]-1;
|
||||
}
|
||||
proc_data->max_assignment_weight = assignment_weight;
|
||||
proc_data->max_assignment_cardinality = assignment_cardinality;
|
||||
}
|
||||
}
|
||||
|
||||
static void visit(mca_btl_tcp_proc_data_t *proc_data, int k, int level, int siz, int *a)
|
||||
{
|
||||
level = level+1; a[k] = level;
|
||||
|
||||
if (level == siz) {
|
||||
evaluate_assignment(proc_data, a);
|
||||
} else {
|
||||
int i;
|
||||
for ( i = 0; i < siz; i++)
|
||||
if (a[i] == 0)
|
||||
visit(proc_data, i, level, siz, a);
|
||||
}
|
||||
|
||||
level = level-1; a[k] = 0;
|
||||
}
|
||||
|
||||
|
||||
static void mca_btl_tcp_initialise_interface(mca_btl_tcp_interface_t* tcp_interface,
|
||||
int ifk_index, int index)
|
||||
{
|
||||
tcp_interface->kernel_index = ifk_index;
|
||||
tcp_interface->peer_interface = -1;
|
||||
tcp_interface->ipv4_address = NULL;
|
||||
tcp_interface->ipv6_address = NULL;
|
||||
tcp_interface->index = index;
|
||||
tcp_interface->inuse = 0;
|
||||
}
|
||||
|
||||
static mca_btl_tcp_interface_t** mca_btl_tcp_retrieve_local_interfaces(mca_btl_tcp_proc_data_t *proc_data)
|
||||
{
|
||||
struct sockaddr_storage local_addr;
|
||||
char local_if_name[OPAL_IF_NAMESIZE];
|
||||
char **include, **exclude, **argv;
|
||||
int idx;
|
||||
mca_btl_tcp_interface_t * local_interface;
|
||||
|
||||
assert (NULL == proc_data->local_interfaces);
|
||||
if( NULL != proc_data->local_interfaces )
|
||||
return proc_data->local_interfaces;
|
||||
|
||||
proc_data->max_local_interfaces = MAX_KERNEL_INTERFACES;
|
||||
proc_data->num_local_interfaces = 0;
|
||||
proc_data->local_interfaces = (mca_btl_tcp_interface_t**)calloc( proc_data->max_local_interfaces, sizeof(mca_btl_tcp_interface_t*) );
|
||||
if( NULL == proc_data->local_interfaces )
|
||||
return NULL;
|
||||
|
||||
/* Collect up the list of included and excluded interfaces, if any */
|
||||
include = opal_argv_split(mca_btl_tcp_component.tcp_if_include,',');
|
||||
exclude = opal_argv_split(mca_btl_tcp_component.tcp_if_exclude,',');
|
||||
|
||||
/*
|
||||
* identify all kernel interfaces and the associated addresses of
|
||||
* the local node
|
||||
*/
|
||||
for( idx = opal_ifbegin(); idx >= 0; idx = opal_ifnext (idx) ) {
|
||||
int kindex;
|
||||
uint64_t index;
|
||||
bool skip = false;
|
||||
|
||||
opal_ifindextoaddr (idx, (struct sockaddr*) &local_addr, sizeof (local_addr));
|
||||
opal_ifindextoname (idx, local_if_name, sizeof (local_if_name));
|
||||
|
||||
/* If we were given a list of included interfaces, then check
|
||||
* to see if the current one is a member of this set. If so,
|
||||
* drop down and complete processing. If not, skip it and
|
||||
* continue on to the next one. Note that providing an include
|
||||
* list will override providing an exclude list as the two are
|
||||
* mutually exclusive. This matches how it works in
|
||||
* mca_btl_tcp_component_create_instances() which is the function
|
||||
* that exports the interfaces. */
|
||||
if(NULL != include) {
|
||||
argv = include;
|
||||
skip = true;
|
||||
while(argv && *argv) {
|
||||
/* When comparing included interfaces, we look for exact matches.
|
||||
That is why we are using strcmp() here. */
|
||||
if (0 == strcmp(*argv, local_if_name)) {
|
||||
skip = false;
|
||||
break;
|
||||
}
|
||||
argv++;
|
||||
}
|
||||
} else if (NULL != exclude) {
|
||||
/* If we were given a list of excluded interfaces, then check to see if the
|
||||
* current one is a member of this set. If not, drop down and complete
|
||||
* processing. If so, skip it and continue on to the next one. */
|
||||
argv = exclude;
|
||||
while(argv && *argv) {
|
||||
/* When looking for interfaces to exclude, we only look at
|
||||
* the number of characters equal to what the user provided.
|
||||
* For example, excluding "lo" excludes "lo", "lo0" and
|
||||
* anything that starts with "lo" */
|
||||
if(0 == strncmp(*argv, local_if_name, strlen(*argv))) {
|
||||
skip = true;
|
||||
break;
|
||||
}
|
||||
argv++;
|
||||
}
|
||||
}
|
||||
if (true == skip) {
|
||||
/* This interface is not part of the requested set, so skip it */
|
||||
continue;
|
||||
}
|
||||
|
||||
kindex = opal_ifindextokindex(idx);
|
||||
int rc = opal_hash_table_get_value_uint32(&proc_data->local_kindex_to_index, kindex, (void**) &index);
|
||||
|
||||
/* create entry for this kernel index previously not seen */
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
index = proc_data->num_local_interfaces++;
|
||||
opal_hash_table_set_value_uint32(&proc_data->local_kindex_to_index, kindex, (void*)(uintptr_t) index);
|
||||
|
||||
if( proc_data->num_local_interfaces == proc_data->max_local_interfaces ) {
|
||||
proc_data->max_local_interfaces <<= 1;
|
||||
proc_data->local_interfaces = (mca_btl_tcp_interface_t**)realloc( proc_data->local_interfaces,
|
||||
proc_data->max_local_interfaces * sizeof(mca_btl_tcp_interface_t*) );
|
||||
if( NULL == proc_data->local_interfaces )
|
||||
goto cleanup;
|
||||
}
|
||||
proc_data->local_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t));
|
||||
assert(NULL != proc_data->local_interfaces[index]);
|
||||
mca_btl_tcp_initialise_interface(proc_data->local_interfaces[index], kindex, index);
|
||||
}
|
||||
|
||||
local_interface = proc_data->local_interfaces[index];
|
||||
switch(local_addr.ss_family) {
|
||||
case AF_INET:
|
||||
/* if AF is disabled, skip it completely */
|
||||
if (4 == mca_btl_tcp_component.tcp_disable_family) {
|
||||
continue;
|
||||
}
|
||||
|
||||
local_interface->ipv4_address =
|
||||
(struct sockaddr_storage*) malloc(sizeof(local_addr));
|
||||
memcpy(local_interface->ipv4_address,
|
||||
&local_addr, sizeof(local_addr));
|
||||
opal_ifindextomask(idx,
|
||||
&local_interface->ipv4_netmask,
|
||||
sizeof(int));
|
||||
break;
|
||||
case AF_INET6:
|
||||
/* if AF is disabled, skip it completely */
|
||||
if (6 == mca_btl_tcp_component.tcp_disable_family) {
|
||||
continue;
|
||||
}
|
||||
|
||||
local_interface->ipv6_address
|
||||
= (struct sockaddr_storage*) malloc(sizeof(local_addr));
|
||||
memcpy(local_interface->ipv6_address,
|
||||
&local_addr, sizeof(local_addr));
|
||||
opal_ifindextomask(idx,
|
||||
&local_interface->ipv6_netmask,
|
||||
sizeof(int));
|
||||
break;
|
||||
default:
|
||||
opal_output(0, "unknown address family for tcp: %d\n",
|
||||
local_addr.ss_family);
|
||||
}
|
||||
}
|
||||
cleanup:
|
||||
if (NULL != include) {
|
||||
opal_argv_free(include);
|
||||
}
|
||||
if (NULL != exclude) {
|
||||
opal_argv_free(exclude);
|
||||
}
|
||||
|
||||
return proc_data->local_interfaces;
|
||||
}
|
||||
/*
|
||||
* Note that this routine must be called with the lock on the process
|
||||
* already held. Insert a btl instance into the proc array and assign
|
||||
* it an address.
|
||||
*/
|
||||
int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
|
||||
mca_btl_base_endpoint_t* btl_endpoint )
|
||||
int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t* btl_proc,
|
||||
mca_btl_base_endpoint_t* btl_endpoint)
|
||||
{
|
||||
struct sockaddr_storage endpoint_addr_ss;
|
||||
mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl;
|
||||
const char *proc_hostname;
|
||||
unsigned int perm_size = 0;
|
||||
int rc, *a = NULL;
|
||||
size_t i, j;
|
||||
mca_btl_tcp_interface_t** peer_interfaces = NULL;
|
||||
mca_btl_tcp_proc_data_t _proc_data, *proc_data=&_proc_data;
|
||||
size_t max_peer_interfaces;
|
||||
char str_local[128], str_remote[128];
|
||||
mca_btl_tcp_addr_t *remote_addr;
|
||||
int rc = OPAL_SUCCESS;
|
||||
|
||||
if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
|
||||
return OPAL_ERR_UNREACH;
|
||||
rc = OPAL_ERR_UNREACH;
|
||||
goto out;
|
||||
}
|
||||
|
||||
memset(proc_data, 0, sizeof(mca_btl_tcp_proc_data_t));
|
||||
OBJ_CONSTRUCT(&_proc_data.local_kindex_to_index, opal_hash_table_t);
|
||||
opal_hash_table_init(&_proc_data.local_kindex_to_index, 8);
|
||||
OBJ_CONSTRUCT(&_proc_data.peer_kindex_to_index, opal_hash_table_t);
|
||||
opal_hash_table_init(&_proc_data.peer_kindex_to_index, 8);
|
||||
rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: host %s, process %s UNREACHABLE",
|
||||
proc_hostname,
|
||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
|
||||
goto out;
|
||||
}
|
||||
btl_endpoint->endpoint_addr = remote_addr;
|
||||
|
||||
#ifndef WORDS_BIGENDIAN
|
||||
/* if we are little endian and our peer is not so lucky, then we
|
||||
@ -476,304 +513,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
|
||||
btl_endpoint->endpoint_proc = btl_proc;
|
||||
btl_proc->proc_endpoints[btl_proc->proc_endpoint_count++] = btl_endpoint;
|
||||
|
||||
/* sanity checks */
|
||||
if( NULL == mca_btl_tcp_retrieve_local_interfaces(proc_data) )
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
if( 0 == proc_data->num_local_interfaces ) {
|
||||
return OPAL_ERR_UNREACH;
|
||||
}
|
||||
|
||||
max_peer_interfaces = proc_data->max_local_interfaces;
|
||||
peer_interfaces = (mca_btl_tcp_interface_t**)calloc( max_peer_interfaces, sizeof(mca_btl_tcp_interface_t*) );
|
||||
if (NULL == peer_interfaces) {
|
||||
max_peer_interfaces = 0;
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
goto exit;
|
||||
}
|
||||
proc_data->num_peer_interfaces = 0;
|
||||
|
||||
/*
|
||||
* identify all kernel interfaces and the associated addresses of
|
||||
* the peer
|
||||
*/
|
||||
|
||||
for( i = 0; i < btl_proc->proc_addr_count; i++ ) {
|
||||
|
||||
uint64_t index;
|
||||
|
||||
mca_btl_tcp_addr_t* endpoint_addr = btl_proc->proc_addrs + i;
|
||||
|
||||
mca_btl_tcp_proc_tosocks (endpoint_addr, &endpoint_addr_ss);
|
||||
|
||||
rc = opal_hash_table_get_value_uint32(&proc_data->peer_kindex_to_index, endpoint_addr->addr_ifkindex, (void**) &index);
|
||||
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
index = proc_data->num_peer_interfaces++;
|
||||
opal_hash_table_set_value_uint32(&proc_data->peer_kindex_to_index, endpoint_addr->addr_ifkindex, (void*)(uintptr_t) index);
|
||||
if( proc_data->num_peer_interfaces == max_peer_interfaces ) {
|
||||
max_peer_interfaces <<= 1;
|
||||
peer_interfaces = (mca_btl_tcp_interface_t**)realloc( peer_interfaces,
|
||||
max_peer_interfaces * sizeof(mca_btl_tcp_interface_t*) );
|
||||
if( NULL == peer_interfaces ) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
peer_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t));
|
||||
mca_btl_tcp_initialise_interface(peer_interfaces[index],
|
||||
endpoint_addr->addr_ifkindex, index);
|
||||
}
|
||||
|
||||
/*
|
||||
* in case the peer address has created all intended connections,
|
||||
* mark the complete peer interface as 'not available'
|
||||
*/
|
||||
if(endpoint_addr->addr_inuse >= mca_btl_tcp_component.tcp_num_links) {
|
||||
peer_interfaces[index]->inuse = 1;
|
||||
}
|
||||
|
||||
switch(endpoint_addr_ss.ss_family) {
|
||||
case AF_INET:
|
||||
peer_interfaces[index]->ipv4_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss));
|
||||
peer_interfaces[index]->ipv4_endpoint_addr = endpoint_addr;
|
||||
memcpy(peer_interfaces[index]->ipv4_address,
|
||||
&endpoint_addr_ss, sizeof(endpoint_addr_ss));
|
||||
break;
|
||||
case AF_INET6:
|
||||
peer_interfaces[index]->ipv6_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss));
|
||||
peer_interfaces[index]->ipv6_endpoint_addr = endpoint_addr;
|
||||
memcpy(peer_interfaces[index]->ipv6_address,
|
||||
&endpoint_addr_ss, sizeof(endpoint_addr_ss));
|
||||
break;
|
||||
default:
|
||||
opal_output(0, "unknown address family for tcp: %d\n",
|
||||
endpoint_addr_ss.ss_family);
|
||||
return OPAL_ERR_UNREACH;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* assign weights to each possible pair of interfaces
|
||||
*/
|
||||
|
||||
perm_size = proc_data->num_local_interfaces;
|
||||
if(proc_data->num_peer_interfaces > perm_size) {
|
||||
perm_size = proc_data->num_peer_interfaces;
|
||||
}
|
||||
|
||||
proc_data->weights = (enum mca_btl_tcp_connection_quality**) malloc(perm_size
|
||||
* sizeof(enum mca_btl_tcp_connection_quality*));
|
||||
assert(NULL != proc_data->weights);
|
||||
|
||||
proc_data->best_addr = (mca_btl_tcp_addr_t ***) malloc(perm_size
|
||||
* sizeof(mca_btl_tcp_addr_t **));
|
||||
assert(NULL != proc_data->best_addr);
|
||||
for(i = 0; i < perm_size; ++i) {
|
||||
proc_data->weights[i] = (enum mca_btl_tcp_connection_quality*) calloc(perm_size,
|
||||
sizeof(enum mca_btl_tcp_connection_quality));
|
||||
assert(NULL != proc_data->weights[i]);
|
||||
|
||||
proc_data->best_addr[i] = (mca_btl_tcp_addr_t **) calloc(perm_size,
|
||||
sizeof(mca_btl_tcp_addr_t *));
|
||||
assert(NULL != proc_data->best_addr[i]);
|
||||
}
|
||||
|
||||
|
||||
for( i = 0; i < proc_data->num_local_interfaces; ++i ) {
|
||||
mca_btl_tcp_interface_t* local_interface = proc_data->local_interfaces[i];
|
||||
for( j = 0; j < proc_data->num_peer_interfaces; ++j ) {
|
||||
|
||||
/* initially, assume no connection is possible */
|
||||
proc_data->weights[i][j] = CQ_NO_CONNECTION;
|
||||
|
||||
/* check state of ipv4 address pair */
|
||||
if(NULL != proc_data->local_interfaces[i]->ipv4_address &&
|
||||
NULL != peer_interfaces[j]->ipv4_address) {
|
||||
|
||||
/* Convert the IPv4 addresses into nicely-printable strings for verbose debugging output */
|
||||
inet_ntop(AF_INET, &(((struct sockaddr_in*) proc_data->local_interfaces[i]->ipv4_address))->sin_addr,
|
||||
str_local, sizeof(str_local));
|
||||
inet_ntop(AF_INET, &(((struct sockaddr_in*) peer_interfaces[j]->ipv4_address))->sin_addr,
|
||||
str_remote, sizeof(str_remote));
|
||||
|
||||
if(opal_net_addr_isipv4public((struct sockaddr*) local_interface->ipv4_address) &&
|
||||
opal_net_addr_isipv4public((struct sockaddr*) peer_interfaces[j]->ipv4_address)) {
|
||||
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
|
||||
(struct sockaddr*) peer_interfaces[j]->ipv4_address,
|
||||
local_interface->ipv4_netmask)) {
|
||||
proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV4 PUBLIC SAME NETWORK",
|
||||
str_local, str_remote);
|
||||
} else {
|
||||
proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV4 PUBLIC DIFFERENT NETWORK",
|
||||
str_local, str_remote);
|
||||
}
|
||||
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
|
||||
continue;
|
||||
}
|
||||
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
|
||||
(struct sockaddr*) peer_interfaces[j]->ipv4_address,
|
||||
local_interface->ipv4_netmask)) {
|
||||
proc_data->weights[i][j] = CQ_PRIVATE_SAME_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV4 PRIVATE SAME NETWORK",
|
||||
str_local, str_remote);
|
||||
} else {
|
||||
proc_data->weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV4 PRIVATE DIFFERENT NETWORK",
|
||||
str_local, str_remote);
|
||||
}
|
||||
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check state of ipv6 address pair - ipv6 is always public,
|
||||
* since link-local addresses are skipped in opal_ifinit()
|
||||
*/
|
||||
if(NULL != local_interface->ipv6_address &&
|
||||
NULL != peer_interfaces[j]->ipv6_address) {
|
||||
|
||||
/* Convert the IPv6 addresses into nicely-printable strings for verbose debugging output */
|
||||
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) local_interface->ipv6_address))->sin6_addr,
|
||||
str_local, sizeof(str_local));
|
||||
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) peer_interfaces[j]->ipv6_address))->sin6_addr,
|
||||
str_remote, sizeof(str_remote));
|
||||
|
||||
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv6_address,
|
||||
(struct sockaddr*) peer_interfaces[j]->ipv6_address,
|
||||
local_interface->ipv6_netmask)) {
|
||||
proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV6 PUBLIC SAME NETWORK",
|
||||
str_local, str_remote);
|
||||
} else {
|
||||
proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV6 PUBLIC DIFFERENT NETWORK",
|
||||
str_local, str_remote);
|
||||
}
|
||||
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv6_endpoint_addr;
|
||||
continue;
|
||||
}
|
||||
|
||||
} /* for each peer interface */
|
||||
} /* for each local interface */
|
||||
|
||||
/*
|
||||
* determine the size of the set to permute (max number of
|
||||
* interfaces
|
||||
*/
|
||||
|
||||
proc_data->best_assignment = (unsigned int *) malloc (perm_size * sizeof(int));
|
||||
|
||||
a = (int *) malloc(perm_size * sizeof(int));
|
||||
if (NULL == a) {
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/* Can only find the best set of connections when the number of
|
||||
* interfaces is not too big. When it gets larger, we fall back
|
||||
* to a simpler and faster (and not as optimal) algorithm.
|
||||
* See ticket https://svn.open-mpi.org/trac/ompi/ticket/2031
|
||||
* for more details about this issue. */
|
||||
if (perm_size <= MAX_PERMUTATION_INTERFACES) {
|
||||
memset(a, 0, perm_size * sizeof(int));
|
||||
proc_data->max_assignment_cardinality = -1;
|
||||
proc_data->max_assignment_weight = -1;
|
||||
visit(proc_data, 0, -1, perm_size, a);
|
||||
|
||||
rc = OPAL_ERR_UNREACH;
|
||||
for(i = 0; i < perm_size; ++i) {
|
||||
unsigned int best = proc_data->best_assignment[i];
|
||||
if(best > proc_data->num_peer_interfaces
|
||||
|| proc_data->weights[i][best] == CQ_NO_CONNECTION
|
||||
|| peer_interfaces[best]->inuse
|
||||
|| NULL == peer_interfaces[best]) {
|
||||
continue;
|
||||
}
|
||||
peer_interfaces[best]->inuse++;
|
||||
btl_endpoint->endpoint_addr = proc_data->best_addr[i][best];
|
||||
btl_endpoint->endpoint_addr->addr_inuse = true;
|
||||
rc = OPAL_SUCCESS;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
enum mca_btl_tcp_connection_quality max;
|
||||
int i_max = 0, j_max = 0;
|
||||
/* Find the best connection that is not in use. Save away
|
||||
* the indices of the best location. */
|
||||
max = CQ_NO_CONNECTION;
|
||||
for(i=0; i<proc_data->num_local_interfaces; ++i) {
|
||||
for(j=0; j<proc_data->num_peer_interfaces; ++j) {
|
||||
if (!peer_interfaces[j]->inuse) {
|
||||
if (proc_data->weights[i][j] > max) {
|
||||
max = proc_data->weights[i][j];
|
||||
i_max = i;
|
||||
j_max = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Now see if there is a some type of connection available. */
|
||||
rc = OPAL_ERR_UNREACH;
|
||||
if (CQ_NO_CONNECTION != max) {
|
||||
peer_interfaces[j_max]->inuse++;
|
||||
btl_endpoint->endpoint_addr = proc_data->best_addr[i_max][j_max];
|
||||
btl_endpoint->endpoint_addr->addr_inuse = true;
|
||||
rc = OPAL_SUCCESS;
|
||||
}
|
||||
}
|
||||
if (OPAL_ERR_UNREACH == rc) {
|
||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: host %s, process %s UNREACHABLE",
|
||||
proc_hostname,
|
||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
|
||||
}
|
||||
|
||||
exit:
|
||||
// Ok to always free because proc_data() was memset() to 0 before
|
||||
// any possible return (and free(NULL) is fine).
|
||||
for(i = 0; i < perm_size; ++i) {
|
||||
free(proc_data->weights[i]);
|
||||
free(proc_data->best_addr[i]);
|
||||
}
|
||||
|
||||
for(i = 0; i < proc_data->num_peer_interfaces; ++i) {
|
||||
if(NULL != peer_interfaces[i]->ipv4_address) {
|
||||
free(peer_interfaces[i]->ipv4_address);
|
||||
}
|
||||
if(NULL != peer_interfaces[i]->ipv6_address) {
|
||||
free(peer_interfaces[i]->ipv6_address);
|
||||
}
|
||||
free(peer_interfaces[i]);
|
||||
}
|
||||
free(peer_interfaces);
|
||||
|
||||
for(i = 0; i < proc_data->num_local_interfaces; ++i) {
|
||||
if(NULL != proc_data->local_interfaces[i]->ipv4_address) {
|
||||
free(proc_data->local_interfaces[i]->ipv4_address);
|
||||
}
|
||||
if(NULL != proc_data->local_interfaces[i]->ipv6_address) {
|
||||
free(proc_data->local_interfaces[i]->ipv6_address);
|
||||
}
|
||||
free(proc_data->local_interfaces[i]);
|
||||
}
|
||||
free(proc_data->local_interfaces); proc_data->local_interfaces = NULL;
|
||||
proc_data->max_local_interfaces = 0;
|
||||
|
||||
free(proc_data->weights); proc_data->weights = NULL;
|
||||
free(proc_data->best_addr); proc_data->best_addr = NULL;
|
||||
free(proc_data->best_assignment); proc_data->best_assignment = NULL;
|
||||
|
||||
OBJ_DESTRUCT(&_proc_data.local_kindex_to_index);
|
||||
OBJ_DESTRUCT(&_proc_data.peer_kindex_to_index);
|
||||
|
||||
free(a);
|
||||
|
||||
out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -796,12 +536,6 @@ int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_
|
||||
OBJ_RELEASE(btl_proc);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
/* The endpoint_addr may still be NULL if this endpoint is
|
||||
being removed early in the wireup sequence (e.g., if it
|
||||
is unreachable by all other procs) */
|
||||
if (NULL != btl_endpoint->endpoint_addr) {
|
||||
btl_endpoint->endpoint_addr->addr_inuse = false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
|
||||
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -53,57 +55,15 @@ struct mca_btl_tcp_proc_t {
|
||||
size_t proc_endpoint_count;
|
||||
/**< number of endpoints */
|
||||
|
||||
opal_hash_table_t btl_index_to_endpoint;
|
||||
/**< interface match table, matches btl_index to remote addresses of type mca_btl_tcp_addr_t */
|
||||
|
||||
opal_mutex_t proc_lock;
|
||||
/**< lock to protect against concurrent access to proc state */
|
||||
};
|
||||
typedef struct mca_btl_tcp_proc_t mca_btl_tcp_proc_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_tcp_proc_t);
|
||||
|
||||
/* the highest possible interface kernel index we can handle */
|
||||
#define MAX_KERNEL_INTERFACE_INDEX 65536
|
||||
|
||||
/* the maximum number of kernel interfaces we can handle */
|
||||
#define MAX_KERNEL_INTERFACES 8
|
||||
|
||||
/* The maximum number of interfaces that we can have and use the
|
||||
* recursion code for determining the best set of connections. When
|
||||
* the number is greater than this, we switch to a simpler algorithm
|
||||
* to speed things up. */
|
||||
#define MAX_PERMUTATION_INTERFACES 8
|
||||
|
||||
/*
|
||||
* FIXME: this should probably be part of an ompi list, so we need the
|
||||
* appropriate definitions
|
||||
*/
|
||||
|
||||
struct mca_btl_tcp_interface_t {
|
||||
struct sockaddr_storage* ipv4_address;
|
||||
struct sockaddr_storage* ipv6_address;
|
||||
mca_btl_tcp_addr_t* ipv4_endpoint_addr;
|
||||
mca_btl_tcp_addr_t* ipv6_endpoint_addr;
|
||||
uint32_t ipv4_netmask;
|
||||
uint32_t ipv6_netmask;
|
||||
int kernel_index;
|
||||
int peer_interface;
|
||||
int index;
|
||||
int inuse;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_tcp_interface_t mca_btl_tcp_interface_t;
|
||||
|
||||
/*
|
||||
* describes the quality of a possible connection between a local and
|
||||
* a remote network interface
|
||||
*/
|
||||
enum mca_btl_tcp_connection_quality {
|
||||
CQ_NO_CONNECTION,
|
||||
CQ_PRIVATE_DIFFERENT_NETWORK,
|
||||
CQ_PRIVATE_SAME_NETWORK,
|
||||
CQ_PUBLIC_DIFFERENT_NETWORK,
|
||||
CQ_PUBLIC_SAME_NETWORK
|
||||
};
|
||||
|
||||
|
||||
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc);
|
||||
mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t* name);
|
||||
int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t*, mca_btl_base_endpoint_t*);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user