1
1

Merge pull request #7134 from wckzhang/btl_tcp_interface_match

btl tcp: Use reachability and graph solving for global interface matching
Этот коммит содержится в:
Brian Barrett 2020-01-27 15:38:49 -08:00 коммит произвёл GitHub
родитель 10f6a77640 e958f3cf22
Коммит fc8c7a5869
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 372 добавлений и 640 удалений

Просмотреть файл

@ -15,6 +15,8 @@
* Copyright (c) 2016-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
@ -101,12 +103,6 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl,
continue;
}
/*
* Check to make sure that the peer has at least as many interface
* addresses exported as we are trying to use. If not, then
* don't bind this BTL instance to the proc.
*/
OPAL_THREAD_LOCK(&tcp_proc->proc_lock);
for (uint32_t j = 0 ; j < (uint32_t)tcp_proc->proc_endpoint_count ; ++j) {

Просмотреть файл

@ -15,6 +15,8 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -107,6 +109,7 @@ struct mca_btl_tcp_component_t {
uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */
unsigned int tcp_num_links; /**< number of logical links per physical device */
struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */
opal_list_t local_ifs; /**< opal list of local opal_if_t interfaces */
int tcp_free_list_num; /**< initial size of free lists */
int tcp_free_list_max; /**< maximum size of free lists */
int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */
@ -163,6 +166,9 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component;
*/
struct mca_btl_tcp_module_t {
mca_btl_base_module_t super; /**< base BTL interface */
uint32_t btl_index; /**< Local BTL module index, used for vertex
data and used as a hash key when
solving module matching problem */
uint16_t tcp_ifkindex; /** <BTL kernel interface index */
struct sockaddr_storage tcp_ifaddr; /**< First address
discovered for this

Просмотреть файл

@ -9,6 +9,9 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -30,37 +33,43 @@
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include <assert.h>
/**
* Modex address structure.
*
* One of these structures will be sent for every btl module in use by
* the local BTL TCP component. This is used to construct an opal_if_t
* structure for the reachability component as well as populate the
* mca_btl_tcp_addr_t structure on remote procs. These will be used
* for interface matching and filling out the mca_btl_base_endpoint_t
* structure.
*/
struct mca_btl_tcp_modex_addr_t {
    uint8_t addr[16];        /* endpoint address. for addr_family
                                of MCA_BTL_TCP_AF_INET, only the
                                first 4 bytes have meaning. */
    uint32_t addr_ifkindex;  /* endpoint kernel index */
    uint32_t addr_mask;      /* ip mask */
    uint32_t addr_bandwidth; /* interface bandwidth */
    uint16_t addr_port;      /* endpoint listen port */
    uint8_t addr_family;     /* endpoint address family. Note that
                                this is
                                MCA_BTL_TCP_AF_{INET,INET6}, not
                                the traditional
                                AF_INET/AF_INET6. */
    /* Exactly one padding byte: 16 + 3*4 + 2 + 1 + 1 == 32 bytes, which is
     * the size the static assertion on this structure expects. Declaring
     * the member twice is a redefinition error. */
    uint8_t padding[1];      /* pad out to an 8-byte word */
};
typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t;
_Static_assert(sizeof(struct mca_btl_tcp_modex_addr_t) == 32, "mca_btl_tcp_modex_addr_t");
/**
* Remote peer address structure
*
* One of these structures will be allocated for every remote endpoint
* associated with a remote proc. The data is pulled from the
* mca_btl_tcp_modex_addr_t structure, except for the addr_inuse
* field, which is local.
* mca_btl_tcp_modex_addr_t structure.
*/
struct mca_btl_tcp_addr_t {
union {
@ -73,7 +82,6 @@ struct mca_btl_tcp_addr_t {
int addr_ifkindex; /**< remote interface index assigned with
this address */
uint8_t addr_family; /**< AF_INET or AF_INET6 */
bool addr_inuse; /**< local meaning only */
};
typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t;

Просмотреть файл

@ -19,7 +19,8 @@
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -69,6 +70,7 @@
#include "opal/util/net.h"
#include "opal/util/fd.h"
#include "opal/util/show_help.h"
#include "opal/util/string_copy.h"
#include "opal/util/printf.h"
#include "opal/constants.h"
#include "opal/mca/btl/btl.h"
@ -76,6 +78,7 @@
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/reachable/base/base.h"
#include "opal/threads/threads.h"
#include "opal/constants.h"
@ -368,6 +371,7 @@ static int mca_btl_tcp_component_open(void)
mca_btl_tcp_component.tcp_btls = NULL;
/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_tcp_component.local_ifs, opal_list_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_proc_table_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
@ -477,6 +481,7 @@ static int mca_btl_tcp_component_close(void)
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
OBJ_DESTRUCT(&mca_btl_tcp_component.local_ifs);
#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
@ -493,8 +498,9 @@ static int mca_btl_tcp_component_close(void)
static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
{
struct mca_btl_tcp_module_t* btl;
opal_if_t *copied_interface, *selected_interface;
char param[256];
int i;
int i, if_index;
struct sockaddr_storage addr;
bool found = false;
@ -515,18 +521,15 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
* 10.1.0.1 as the one that is published in the modex and used for
* connection.
*/
for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) {
int ret;
if (if_kindex != opal_ifindextokindex(i)) {
OPAL_LIST_FOREACH(selected_interface, &opal_if_list, opal_if_t) {
if (if_kindex != selected_interface->if_kernel_index) {
continue;
}
ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr,
sizeof(struct sockaddr_storage));
if (OPAL_SUCCESS != ret) {
return ret;
}
if_index = selected_interface->if_index;
memcpy((struct sockaddr*)&addr, &selected_interface->if_addr,
MIN(sizeof(struct sockaddr_storage), sizeof(selected_interface->if_addr)));
if (addr.ss_family == AF_INET &&
4 != mca_btl_tcp_component.tcp_disable_family) {
@ -548,12 +551,19 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t));
if(NULL == btl)
return OPAL_ERR_OUT_OF_RESOURCE;
copied_interface = OBJ_NEW(opal_if_t);
if (NULL == copied_interface) {
free(btl);
return OPAL_ERR_OUT_OF_RESOURCE;
}
memcpy(btl, &mca_btl_tcp_module, sizeof(mca_btl_tcp_module));
OBJ_CONSTRUCT(&btl->tcp_endpoints, opal_list_t);
OBJ_CONSTRUCT(&btl->tcp_endpoints_mutex, opal_mutex_t);
mca_btl_tcp_component.tcp_btls[mca_btl_tcp_component.tcp_num_btls++] = btl;
/* initialize the btl */
/* This index is used as a key for a hash table used for interface matching. */
btl->btl_index = mca_btl_tcp_component.tcp_num_btls - 1;
btl->tcp_ifkindex = (uint16_t) if_kindex;
#if MCA_BTL_TCP_STATISTICS
btl->tcp_bytes_recv = 0;
@ -562,6 +572,7 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
#endif
memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage));
btl->tcp_ifmask = selected_interface->if_mask;
/* allow user to specify interface bandwidth */
sprintf(param, "bandwidth_%s", if_name);
@ -603,6 +614,21 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
}
}
/* Add another entry to the local interface list */
opal_string_copy(copied_interface->if_name, if_name, OPAL_IF_NAMESIZE);
copied_interface->if_index = if_index;
copied_interface->if_kernel_index = btl->tcp_ifkindex;
copied_interface->af_family = btl->tcp_ifaddr.ss_family;
copied_interface->if_flags = selected_interface->if_flags;
copied_interface->if_speed = selected_interface->if_speed;
memcpy(&copied_interface->if_addr, &btl->tcp_ifaddr, sizeof(struct sockaddr_storage));
copied_interface->if_mask = selected_interface->if_mask;
copied_interface->if_bandwidth = btl->super.btl_bandwidth;
memcpy(&copied_interface->if_mac, &selected_interface->if_mac, sizeof(copied_interface->if_mac));
copied_interface->ifmtu = selected_interface->ifmtu;
opal_list_append(&mca_btl_tcp_component.local_ifs, &(copied_interface->super));
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n",
(void*)btl, if_name, (int) btl->tcp_ifkindex, i,
@ -1188,7 +1214,6 @@ static int mca_btl_tcp_component_exchange(void)
memcpy(&addrs[i].addr, &(inaddr6->sin6_addr),
sizeof(struct in6_addr));
addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port;
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_family = MCA_BTL_TCP_AF_INET6;
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl: tcp: exchange: %d %d IPv6 %s",
@ -1202,7 +1227,6 @@ static int mca_btl_tcp_component_exchange(void)
memcpy(&addrs[i].addr, &(inaddr->sin_addr),
sizeof(struct in_addr));
addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_family = MCA_BTL_TCP_AF_INET;
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl: tcp: exchange: %d %d IPv4 %s",
@ -1212,6 +1236,10 @@ static int mca_btl_tcp_component_exchange(void)
BTL_ERROR(("Unexpected address family: %d", addr->sa_family));
return OPAL_ERR_BAD_PARAM;
}
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_mask = btl->tcp_ifmask;
addrs[i].addr_bandwidth = btl->super.btl_bandwidth;
}
OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,

Просмотреть файл

@ -16,8 +16,11 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2013-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -36,6 +39,7 @@
#include "opal/class/opal_hash_table.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/mca/reachable/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/arch.h"
#include "opal/util/argv.h"
@ -44,6 +48,8 @@
#include "opal/util/proc.h"
#include "opal/util/show_help.h"
#include "opal/util/printf.h"
#include "opal/util/string_copy.h"
#include "opal/util/bipartite_graph.h"
#include "btl_tcp.h"
#include "btl_tcp_proc.h"
@ -51,21 +57,6 @@
static void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc);
static void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc);
/*
 * Scratch state used while matching local interfaces against the
 * interfaces exported by one peer.
 */
struct mca_btl_tcp_proc_data_t {
    /* array of local interface descriptors, indexed by the values
     * stored in local_kindex_to_index */
    mca_btl_tcp_interface_t** local_interfaces;
    /* kernel interface index -> position in local_interfaces */
    opal_hash_table_t local_kindex_to_index;
    /* number of entries in use in local_interfaces */
    size_t num_local_interfaces;
    /* number of entries local_interfaces currently has room for */
    size_t max_local_interfaces;
    /* number of distinct peer interfaces discovered */
    size_t num_peer_interfaces;
    /* kernel interface index -> position in the peer interface array */
    opal_hash_table_t peer_kindex_to_index;
    /* best assignment found so far (zero-based column per row) */
    unsigned int *best_assignment;
    /* summed weight of best_assignment */
    int max_assignment_weight;
    /* number of positive-weight pairs in best_assignment */
    int max_assignment_cardinality;
    /* weights[i][j]: connection quality for row i / column j */
    enum mca_btl_tcp_connection_quality **weights;
    /* NOTE(review): presumably the peer address chosen for each
     * (local, peer) pair; its users are not visible here - confirm */
    struct mca_btl_tcp_addr_t ***best_addr;
};
typedef struct mca_btl_tcp_proc_data_t mca_btl_tcp_proc_data_t;
OBJ_CLASS_INSTANCE( mca_btl_tcp_proc_t,
opal_list_item_t,
mca_btl_tcp_proc_construct,
@ -79,6 +70,8 @@ void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* tcp_proc)
tcp_proc->proc_endpoints = NULL;
tcp_proc->proc_endpoint_count = 0;
OBJ_CONSTRUCT(&tcp_proc->proc_lock, opal_mutex_t);
OBJ_CONSTRUCT(&tcp_proc->btl_index_to_endpoint, opal_hash_table_t);
opal_hash_table_init(&tcp_proc->btl_index_to_endpoint, mca_btl_tcp_component.tcp_num_btls);
}
/*
@ -103,9 +96,270 @@ void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc)
if(NULL != tcp_proc->proc_addrs) {
free(tcp_proc->proc_addrs);
}
OBJ_DESTRUCT(&tcp_proc->btl_index_to_endpoint);
OBJ_DESTRUCT(&tcp_proc->proc_lock);
}
/*
 * Strict ordering of two process names: compare the jobids first and
 * fall back to the vpids only when the jobids are equal.
 *
 * Returns non-zero when a orders before b, zero otherwise (including
 * when both names are identical).
 */
static inline int mca_btl_tcp_proc_is_proc_left(opal_process_name_t a,
                                                opal_process_name_t b)
{
    if (a.jobid == b.jobid) {
        /* same job: the vpid breaks the tie */
        return (a.vpid < b.vpid);
    }
    return (a.jobid < b.jobid);
}
/*
 * Vertex numbering used by the interface graph: local interfaces occupy
 * vertices [0, tcp_num_btls), remote interfaces follow immediately after.
 * The argument is parenthesized so that expressions with operators of
 * lower precedence than '+' (shifts, comparisons, ?:) expand correctly.
 */
#define MCA_BTL_TCP_PROC_LOCAL_VERTEX(index) (index)
#define MCA_BTL_TCP_PROC_REMOTE_VERTEX(index) ((index) + mca_btl_tcp_component.tcp_num_btls)
/* This function builds a graph to match local and remote interfaces
 * together. It also populates the remote proc object.
 *
 * @param btl_proc (IN) Remote proc information
 * @param remote_addrs (IN) List of addresses from remote interfaces
 * @param local_proc_is_left (IN) Boolean indicator. If true, we set local process
 * interfaces to be on the left side of the graph.
 * If false, we set remote process interfaces to
 * be on the left side of the graph.
 * @param graph_out (OUT) Constructed and populated bipartite interface
 * graph with vertices as interfaces and negative
 * reachability weights as costs for the edges.
 * @return OPAL error code or success
 *
 * The vertices of this graph are the local and remote interfaces. Edges in
 * this graph are connections between the interfaces. Costs are computed as
 * negative weight which is calculated using the reachability framework.
 *
 * In order to mirror inputs on both the local and remote side when solving
 * interface matching from both sides, we require local_proc_is_left to
 * indicate whether the local interfaces should be on the left of the graph
 * or not.
 *
 * The remote list and proc_addrs are assembled and populated here so that
 * we can ensure that the vertex ordering matches the proc_addr ordering.
 * This allows us to pass the correct pointers to the vertex data for storage.
 *
 * Ownership of the graph:
 *  - on success, and also when no edge could be added (OPAL_ERR_UNREACH),
 *    the graph is handed to the caller through graph_out and the caller
 *    must free it;
 *  - on every other failure the graph (if any) is freed here and
 *    graph_out is left untouched.
 */
static int mca_btl_tcp_proc_create_interface_graph(mca_btl_tcp_proc_t* btl_proc,
                                                   mca_btl_tcp_modex_addr_t* remote_addrs,
                                                   int local_proc_is_left,
                                                   opal_bp_graph_t **graph_out)
{
    opal_bp_graph_t *graph = NULL;
    opal_reachable_t *results = NULL;
    opal_list_t *local_list = &mca_btl_tcp_component.local_ifs;
    opal_list_t *remote_list;
    int rc, v_index, x, y, cost, u, v, num_edges = 0;
    size_t i;

    remote_list = OBJ_NEW(opal_list_t);
    if (NULL == remote_list) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto out;
    }

    /* the modex and proc structures differ slightly, so copy the
       fields needed in the proc version */
    for (i = 0 ; i < btl_proc->proc_addr_count ; i++) {
        /* Construct opal_if_t objects for the remote interfaces */
        opal_if_t *interface = OBJ_NEW(opal_if_t);
        if (NULL == interface) {
            rc = OPAL_ERR_OUT_OF_RESOURCE;
            goto out;
        }

        if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) {
            memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet,
                   remote_addrs[i].addr, sizeof(struct in_addr));
            btl_proc->proc_addrs[i].addr_family = AF_INET;

            memcpy(&((struct sockaddr_in *)&(interface->if_addr))->sin_addr,
                   remote_addrs[i].addr, sizeof(struct in_addr));
            ((struct sockaddr *)&(interface->if_addr))->sa_family = AF_INET;
            interface->af_family = AF_INET;
        } else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) {
#if OPAL_ENABLE_IPV6
            memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet6,
                   remote_addrs[i].addr, sizeof(struct in6_addr));
            btl_proc->proc_addrs[i].addr_family = AF_INET6;

            memcpy(&((struct sockaddr_in6 *)&(interface->if_addr))->sin6_addr,
                   remote_addrs[i].addr, sizeof(struct in6_addr));
            ((struct sockaddr *)&(interface->if_addr))->sa_family = AF_INET6;
            interface->af_family = AF_INET6;
#else
            rc = OPAL_ERR_NOT_SUPPORTED;
            OBJ_RELEASE(interface);
            goto out;
#endif
        } else {
            BTL_ERROR(("Unexpected address family %d",
                       (int)remote_addrs[i].addr_family));
            rc = OPAL_ERR_BAD_PARAM;
            OBJ_RELEASE(interface);
            goto out;
        }

        btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
        btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;

        interface->if_mask = remote_addrs[i].addr_mask;
        interface->if_bandwidth = remote_addrs[i].addr_bandwidth;

        opal_list_append(remote_list, &(interface->super));
    }

    rc = opal_bp_graph_create(NULL, NULL, &graph);
    if (OPAL_SUCCESS != rc) {
        goto out;
    }

    results = opal_reachable.reachable(local_list, remote_list);
    if (NULL == results) {
        rc = OPAL_ERROR;
        goto err_graph;
    }

    /* Add vertices for each local node. These will store the btl index */
    for (x = 0; x < results->num_local; x++) {
        rc = opal_bp_graph_add_vertex(graph, &mca_btl_tcp_component.tcp_btls[x]->btl_index, &v_index);
        if (OPAL_SUCCESS != rc) {
            goto err_graph;
        }
    }

    /* Add vertices for each remote node. These will store remote interface information */
    for (y = 0; y < results->num_remote; y++) {
        rc = opal_bp_graph_add_vertex(graph, &btl_proc->proc_addrs[y], &v_index);
        if (OPAL_SUCCESS != rc) {
            goto err_graph;
        }
    }

    /* Add edges */
    for (x = 0; x < results->num_local; x++) {
        for (y = 0; y < results->num_remote; y++) {
            /* The bipartite assignment solver will optimize a graph for
             * least cost. Since weights vary from 0 as no connection and
             * higher weights as better connections (multiplied by some other
             * factors), higher weight is better. Thus, to achieve least cost,
             * we set cost as negative weight.
             */
            cost = -results->weights[x][y];
            /* Skip edges with no connections */
            if (0 == cost) {
                continue;
            }

            if (local_proc_is_left) {
                u = MCA_BTL_TCP_PROC_LOCAL_VERTEX(x);
                v = MCA_BTL_TCP_PROC_REMOTE_VERTEX(y);
            } else {
                u = MCA_BTL_TCP_PROC_REMOTE_VERTEX(y);
                v = MCA_BTL_TCP_PROC_LOCAL_VERTEX(x);
            }

            rc = opal_bp_graph_add_edge(graph, u, v, cost, 1, NULL);
            if (OPAL_SUCCESS != rc) {
                goto err_graph;
            }
            num_edges++;
        }
    }

    if (0 == num_edges) {
        BTL_ERROR(("Unable to find reachable pairing between local and remote interfaces"));
        rc = OPAL_ERR_UNREACH;
    }

    /* The graph is handed over even for OPAL_ERR_UNREACH; the caller frees it. */
    *graph_out = graph;
    goto out;

err_graph:
    if (NULL != graph) {
        opal_bp_graph_free(graph);
    }
out:
    if (NULL != results) {
        /* The reachability result is an OPAL object: drop our reference so
         * its destructor runs. A plain free() would skip the destructor and
         * leak whatever storage the object owns. */
        OBJ_RELEASE(results);
    }
    if (NULL != remote_list) {
        /* NOTE(review): this releases the list object only. The opal_if_t
         * items appended above are presumably not released by the list
         * destructor - confirm, and if so release them here as well. */
        OBJ_RELEASE(remote_list);
    }
    return rc;
}
/* We store the matched interface data by using the btl_index as the key and
 * a pointer to a mca_btl_tcp_addr_t struct.
 *
 * @param btl_proc (IN/OUT) Remote proc whose btl_index_to_endpoint table
 *                          receives one entry per matched edge
 * @param local_proc_is_left (IN) Non-zero when the local interfaces were
 *                          placed on the left side of the graph
 * @param graph (IN) Graph the vertex data is read from
 * @param num_matched (IN) Number of (left, right) pairs in matched_edges
 * @param matched_edges (IN) Flat array of 2 * num_matched vertex indices
 * @return OPAL_SUCCESS, or the first error returned while reading vertex
 *         data or inserting into the hash table
 */
static int mca_btl_tcp_proc_store_matched_interfaces(mca_btl_tcp_proc_t *btl_proc,
                                                     int local_proc_is_left,
                                                     opal_bp_graph_t *graph,
                                                     int num_matched, int *matched_edges)
{
    int rc = OPAL_SUCCESS;
    int i, left, right;
    int local_vertex, remote_vertex;
    uint32_t* local_index = NULL;
    struct mca_btl_tcp_addr_t *remote_addr = NULL;

    for (i = 0; i < num_matched; i++) {
        left = matched_edges[2 * i + 0];
        right = matched_edges[2 * i + 1];

        /* Which end of the edge is ours depends on how the graph was mirrored. */
        local_vertex = local_proc_is_left ? left : right;
        remote_vertex = local_proc_is_left ? right : left;

        rc = opal_bp_graph_get_vertex_data(graph, local_vertex, (void *)&local_index);
        if (OPAL_SUCCESS != rc) {
            goto out;
        }
        rc = opal_bp_graph_get_vertex_data(graph, remote_vertex, (void *)&remote_addr);
        if (OPAL_SUCCESS != rc) {
            goto out;
        }

        /* Report a failed insertion instead of silently losing the match. */
        rc = opal_hash_table_set_value_uint32(&btl_proc->btl_index_to_endpoint,
                                              *local_index, (void *)remote_addr);
        if (OPAL_SUCCESS != rc) {
            goto out;
        }
    }

out:
    return rc;
}
/*
 * Match the local interfaces against the addresses a peer published in the
 * modex: build the interface graph, solve the bipartite assignment and
 * record the resulting pairs in btl_proc.
 *
 * @param btl_proc (IN/OUT) Remote proc being populated
 * @param remote_addrs (IN) Modex addresses received from the peer
 * @param local_proc_is_left (IN) Non-zero to place the local interfaces on
 *                          the left side of the graph
 * @return OPAL_SUCCESS or the first error reported by one of the steps
 */
static int mca_btl_tcp_proc_handle_modex_addresses(mca_btl_tcp_proc_t* btl_proc,
                                                   mca_btl_tcp_modex_addr_t* remote_addrs,
                                                   int local_proc_is_left)
{
    opal_bp_graph_t *graph = NULL;
    int rc = OPAL_SUCCESS;
    int num_matched = 0;
    int *matched_edges = NULL;

    rc = mca_btl_tcp_proc_create_interface_graph(btl_proc, remote_addrs, local_proc_is_left, &graph);
    if (rc) {
        goto cleanup;
    }

    rc = opal_bp_graph_solve_bipartite_assignment(graph, &num_matched, &matched_edges);
    if (rc) {
        /* matched_edges is deliberately not touched when the solver fails */
        goto cleanup;
    }

    rc = mca_btl_tcp_proc_store_matched_interfaces(btl_proc, local_proc_is_left,
                                                   graph, num_matched, matched_edges);

    /* The edge array returned by a successful solve is ours to release;
     * the stored pairs point at vertex data, not into this array. */
    free(matched_edges);
    matched_edges = NULL;

cleanup:
    if (NULL != graph) {
        opal_bp_graph_free(graph);
    }
    return rc;
}
/*
* Create a TCP process structure. There is a one-to-one correspondence
* between a opal_proc_t and a mca_btl_tcp_proc_t instance. We cache
@ -117,9 +371,9 @@ void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc)
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
{
mca_btl_tcp_proc_t* btl_proc;
int rc;
int rc, local_proc_is_left;
mca_btl_tcp_modex_addr_t *remote_addrs = NULL;
size_t i, size;
size_t size;
OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
rc = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
@ -168,34 +422,20 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
goto cleanup;
}
/* the modex and proc structures differ slightly, so copy the
fields needed in the proc version */
for (i = 0 ; i < btl_proc->proc_addr_count ; i++) {
if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) {
memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet,
remote_addrs[i].addr, sizeof(struct in_addr));
btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;
btl_proc->proc_addrs[i].addr_family = AF_INET;
btl_proc->proc_addrs[i].addr_inuse = false;
} else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) {
#if OPAL_ENABLE_IPV6
memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet6,
remote_addrs[i].addr, sizeof(struct in6_addr));
btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;
btl_proc->proc_addrs[i].addr_family = AF_INET6;
btl_proc->proc_addrs[i].addr_inuse = false;
#else
rc = OPAL_ERR_NOT_SUPPORTED;
goto cleanup;
#endif
} else {
BTL_ERROR(("Unexpected address family %d",
(int)remote_addrs[i].addr_family));
rc = OPAL_ERR_BAD_PARAM;
goto cleanup;
}
/* When solving for bipartite assignment, a graph with equal weights
* can provide different outputs depending on the input parameters.
* Thus two processes can construct different interface matchings.
* To avoid this case, we put the process with the lower jobid on the
* left or if they are equal, we use the lower vpid on the left.
*
* The concept of mirroring the local and remote sides is borrowed
* from the usnic btl implementation of its bipartite assignment solver.
*/
local_proc_is_left = mca_btl_tcp_proc_is_proc_left(proc->proc_name, opal_proc_local_get()->proc_name);
rc = mca_btl_tcp_proc_handle_modex_addresses(btl_proc, remote_addrs, local_proc_is_left);
if (OPAL_SUCCESS != rc) {
goto cleanup;
}
/* allocate space for endpoint array - one for each exported address */
@ -230,236 +470,33 @@ cleanup:
return btl_proc;
}
/*
 * Score one candidate assignment and remember it if it is the best so far.
 *
 * a[i] is the one-based column chosen for row i. A candidate wins when it
 * has more positive-weight pairs than the current best, or the same number
 * of pairs with a larger summed weight.
 */
static void evaluate_assignment(mca_btl_tcp_proc_data_t *proc_data, int *a) {
    unsigned int rows = proc_data->num_local_interfaces;
    int weight_sum = 0;
    int pair_count = 0;
    size_t row;

    /* iterate over the larger of the two interface counts */
    if(proc_data->num_peer_interfaces > rows) {
        rows = proc_data->num_peer_interfaces;
    }

    for(row = 0; row < rows; ++row) {
        if(!(0 < proc_data->weights[row][a[row]-1])) {
            continue;
        }
        pair_count += 1;
        weight_sum += proc_data->weights[row][a[row]-1];
    }

    /* fewer pairs than the best known solution: not an improvement */
    if(pair_count < proc_data->max_assignment_cardinality) {
        return;
    }
    /* same number of pairs but no heavier: not an improvement either */
    if(pair_count == proc_data->max_assignment_cardinality
       && weight_sum <= proc_data->max_assignment_weight) {
        return;
    }

    /* new best: store it zero-based */
    for(row = 0; row < rows; ++row) {
        proc_data->best_assignment[row] = a[row]-1;
    }
    proc_data->max_assignment_cardinality = pair_count;
    proc_data->max_assignment_weight = weight_sum;
}
/*
 * Recursively enumerate assignments: mark slot k with the next level,
 * evaluate the assignment once siz slots are filled, otherwise descend
 * into every slot that is still unmarked (zero). The slot is cleared
 * again before returning so the caller can try other choices.
 */
static void visit(mca_btl_tcp_proc_data_t *proc_data, int k, int level, int siz, int *a)
{
    const int depth = level + 1;

    a[k] = depth;

    if (depth != siz) {
        for (int slot = 0; slot < siz; slot++) {
            if (0 == a[slot]) {
                visit(proc_data, slot, depth, siz, a);
            }
        }
    } else {
        evaluate_assignment(proc_data, a);
    }

    /* backtrack */
    a[k] = 0;
}
/*
 * Reset an interface descriptor: record its kernel index and its position,
 * and clear everything else (no addresses, no peer, not in use).
 */
static void mca_btl_tcp_initialise_interface(mca_btl_tcp_interface_t* tcp_interface,
                                             int ifk_index, int index)
{
    /* identity of the interface */
    tcp_interface->index = index;
    tcp_interface->kernel_index = ifk_index;

    /* no addresses recorded yet */
    tcp_interface->ipv6_address = NULL;
    tcp_interface->ipv4_address = NULL;

    /* not matched to a peer interface and not in use */
    tcp_interface->inuse = 0;
    tcp_interface->peer_interface = -1;
}
/*
 * Build proc_data->local_interfaces: one descriptor per kernel interface
 * that passes the tcp_if_include / tcp_if_exclude filters, carrying a copy
 * of its IPv4 and/or IPv6 address and netmask.
 *
 * @param proc_data (IN/OUT) Matching state; local_interfaces must be NULL
 *                  on entry and local_kindex_to_index must be initialised.
 * @return the populated array, or NULL when an allocation fails. On
 *         failure everything allocated here is released again and the
 *         interface count is reset to zero; entries already inserted into
 *         local_kindex_to_index are left for the owner of that table.
 */
static mca_btl_tcp_interface_t** mca_btl_tcp_retrieve_local_interfaces(mca_btl_tcp_proc_data_t *proc_data)
{
    struct sockaddr_storage local_addr;
    char local_if_name[OPAL_IF_NAMESIZE];
    char **include, **exclude, **argv;
    int idx;
    size_t n;
    mca_btl_tcp_interface_t * local_interface;

    assert (NULL == proc_data->local_interfaces);
    if( NULL != proc_data->local_interfaces )
        return proc_data->local_interfaces;

    proc_data->max_local_interfaces = MAX_KERNEL_INTERFACES;
    proc_data->num_local_interfaces = 0;
    proc_data->local_interfaces = (mca_btl_tcp_interface_t**)calloc( proc_data->max_local_interfaces, sizeof(mca_btl_tcp_interface_t*) );
    if( NULL == proc_data->local_interfaces )
        return NULL;

    /* Collect up the list of included and excluded interfaces, if any */
    include = opal_argv_split(mca_btl_tcp_component.tcp_if_include,',');
    exclude = opal_argv_split(mca_btl_tcp_component.tcp_if_exclude,',');

    /*
     * identify all kernel interfaces and the associated addresses of
     * the local node
     */
    for( idx = opal_ifbegin(); idx >= 0; idx = opal_ifnext (idx) ) {
        int kindex;
        uint64_t index;
        void *stored_index = NULL;
        struct sockaddr_storage *addr_copy;
        bool skip = false;

        opal_ifindextoaddr (idx, (struct sockaddr*) &local_addr, sizeof (local_addr));
        opal_ifindextoname (idx, local_if_name, sizeof (local_if_name));

        /* If we were given a list of included interfaces, then check
         * to see if the current one is a member of this set. If so,
         * drop down and complete processing. If not, skip it and
         * continue on to the next one. Note that providing an include
         * list will override providing an exclude list as the two are
         * mutually exclusive. This matches how it works in
         * mca_btl_tcp_component_create_instances() which is the function
         * that exports the interfaces. */
        if(NULL != include) {
            argv = include;
            skip = true;
            while(argv && *argv) {
                /* When comparing included interfaces, we look for exact matches.
                   That is why we are using strcmp() here. */
                if (0 == strcmp(*argv, local_if_name)) {
                    skip = false;
                    break;
                }
                argv++;
            }
        } else if (NULL != exclude) {
            /* If we were given a list of excluded interfaces, then check to see if the
             * current one is a member of this set. If not, drop down and complete
             * processing. If so, skip it and continue on to the next one. */
            argv = exclude;
            while(argv && *argv) {
                /* When looking for interfaces to exclude, we only look at
                 * the number of characters equal to what the user provided.
                 * For example, excluding "lo" excludes "lo", "lo0" and
                 * anything that starts with "lo" */
                if(0 == strncmp(*argv, local_if_name, strlen(*argv))) {
                    skip = true;
                    break;
                }
                argv++;
            }
        }
        if (true == skip) {
            /* This interface is not part of the requested set, so skip it */
            continue;
        }

        kindex = opal_ifindextokindex(idx);
        /* Read the stored value through a real pointer-sized object; writing
         * a void* into a uint64_t leaves half of it uninitialised where
         * pointers are 32 bits wide. */
        int rc = opal_hash_table_get_value_uint32(&proc_data->local_kindex_to_index, kindex, &stored_index);

        /* create entry for this kernel index previously not seen */
        if (OPAL_SUCCESS != rc) {
            index = proc_data->num_local_interfaces++;
            opal_hash_table_set_value_uint32(&proc_data->local_kindex_to_index, kindex, (void*)(uintptr_t) index);

            if( proc_data->num_local_interfaces == proc_data->max_local_interfaces ) {
                /* Grow through a temporary so the existing array is still
                 * reachable (and can be released) if realloc fails. */
                mca_btl_tcp_interface_t **grown =
                    (mca_btl_tcp_interface_t**)realloc( proc_data->local_interfaces,
                                                         (proc_data->max_local_interfaces << 1) * sizeof(mca_btl_tcp_interface_t*) );
                if( NULL == grown ) {
                    /* slot 'index' was never filled */
                    proc_data->num_local_interfaces = index;
                    goto fail;
                }
                proc_data->local_interfaces = grown;
                proc_data->max_local_interfaces <<= 1;
            }
            proc_data->local_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t));
            if( NULL == proc_data->local_interfaces[index] ) {
                /* slot 'index' was never filled */
                proc_data->num_local_interfaces = index;
                goto fail;
            }
            mca_btl_tcp_initialise_interface(proc_data->local_interfaces[index], kindex, index);
        } else {
            index = (uint64_t)(uintptr_t) stored_index;
        }

        local_interface = proc_data->local_interfaces[index];
        switch(local_addr.ss_family) {
        case AF_INET:
            /* if AF is disabled, skip it completely */
            if (4 == mca_btl_tcp_component.tcp_disable_family) {
                continue;
            }

            addr_copy = (struct sockaddr_storage*) malloc(sizeof(local_addr));
            if (NULL == addr_copy) {
                goto fail;
            }
            memcpy(addr_copy, &local_addr, sizeof(local_addr));
            /* a later address of the same family replaces the earlier one;
             * release the earlier copy instead of leaking it */
            free(local_interface->ipv4_address);
            local_interface->ipv4_address = addr_copy;
            opal_ifindextomask(idx,
                               &local_interface->ipv4_netmask,
                               sizeof(int));
            break;
        case AF_INET6:
            /* if AF is disabled, skip it completely */
            if (6 == mca_btl_tcp_component.tcp_disable_family) {
                continue;
            }

            addr_copy = (struct sockaddr_storage*) malloc(sizeof(local_addr));
            if (NULL == addr_copy) {
                goto fail;
            }
            memcpy(addr_copy, &local_addr, sizeof(local_addr));
            free(local_interface->ipv6_address);
            local_interface->ipv6_address = addr_copy;
            opal_ifindextomask(idx,
                               &local_interface->ipv6_netmask,
                               sizeof(int));
            break;
        default:
            opal_output(0, "unknown address family for tcp: %d\n",
                        local_addr.ss_family);
        }
    }
    goto cleanup;

fail:
    /* Out of memory: undo everything allocated above so the NULL return
     * does not leak the partially built array. */
    for (n = 0; n < proc_data->num_local_interfaces; n++) {
        free(proc_data->local_interfaces[n]->ipv4_address);
        free(proc_data->local_interfaces[n]->ipv6_address);
        free(proc_data->local_interfaces[n]);
    }
    free(proc_data->local_interfaces);
    proc_data->local_interfaces = NULL;
    proc_data->num_local_interfaces = 0;

cleanup:
    if (NULL != include) {
        opal_argv_free(include);
    }
    if (NULL != exclude) {
        opal_argv_free(exclude);
    }

    return proc_data->local_interfaces;
}
/*
 * Insert a btl endpoint into the proc's endpoint array and assign it a
 * remote address.
 *
 * Note that this routine must be called with the lock on the process
 * already held.
 *
 * btl_proc:     proc whose proc_endpoints array receives the endpoint.
 * btl_endpoint: endpoint to insert; its endpoint_addr and endpoint_proc
 *               fields are written here.
 *
 * Returns OPAL_SUCCESS when an address was assigned, OPAL_ERR_UNREACH
 * when the peer cannot be reached (no hostname, no local interfaces,
 * unknown address family, or no connectable interface pair),
 * OPAL_ERR_OUT_OF_RESOURCE on allocation failure, or the error code of
 * the btl_index_to_endpoint lookup.
 *
 * NOTE(review): this listing is a rendered diff whose +/- markers were
 * stripped.  Lines of the removed implementation (per-call interface
 * enumeration, weight matrix, permutation search) are interleaved with
 * lines of the added implementation (a single lookup in
 * btl_index_to_endpoint), which is why the signature and 'rc' appear
 * twice.  A hunk header in the middle of the body also hides unchanged
 * lines, so the text below is not a compilable function as shown.
 */
int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
mca_btl_base_endpoint_t* btl_endpoint )
int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t* btl_proc,
mca_btl_base_endpoint_t* btl_endpoint)
{
struct sockaddr_storage endpoint_addr_ss;
mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl;
const char *proc_hostname;
unsigned int perm_size = 0;
int rc, *a = NULL;
size_t i, j;
mca_btl_tcp_interface_t** peer_interfaces = NULL;
mca_btl_tcp_proc_data_t _proc_data, *proc_data=&_proc_data;
size_t max_peer_interfaces;
char str_local[128], str_remote[128];
mca_btl_tcp_addr_t *remote_addr;
int rc = OPAL_SUCCESS;
/* The hostname is only used for the UNREACHABLE diagnostics below, but
 * a peer without one is reported as unreachable right away. */
if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
return OPAL_ERR_UNREACH;
rc = OPAL_ERR_UNREACH;
goto out;
}
/* Zero the scratch state first so that the cleanup under 'exit' can
 * free every member unconditionally. */
memset(proc_data, 0, sizeof(mca_btl_tcp_proc_data_t));
OBJ_CONSTRUCT(&_proc_data.local_kindex_to_index, opal_hash_table_t);
opal_hash_table_init(&_proc_data.local_kindex_to_index, 8);
OBJ_CONSTRUCT(&_proc_data.peer_kindex_to_index, opal_hash_table_t);
opal_hash_table_init(&_proc_data.peer_kindex_to_index, 8);
/* Fetch the remote address that the interface match table associates
 * with this module's btl_index; a failed lookup is logged and its error
 * code is returned as is. */
rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: host %s, process %s UNREACHABLE",
proc_hostname,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
goto out;
}
btl_endpoint->endpoint_addr = remote_addr;
#ifndef WORDS_BIGENDIAN
/* if we are little endian and our peer is not so lucky, then we
@ -476,304 +513,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
btl_endpoint->endpoint_proc = btl_proc;
btl_proc->proc_endpoints[btl_proc->proc_endpoint_count++] = btl_endpoint;
/* sanity checks */
/* NOTE(review): the two early returns below bypass the cleanup under
 * 'exit', so the hash tables constructed above are not destructed on
 * these paths. */
if( NULL == mca_btl_tcp_retrieve_local_interfaces(proc_data) )
return OPAL_ERR_OUT_OF_RESOURCE;
if( 0 == proc_data->num_local_interfaces ) {
return OPAL_ERR_UNREACH;
}
/* Start the peer interface array at the local array's capacity; it is
 * doubled on demand in the loop below. */
max_peer_interfaces = proc_data->max_local_interfaces;
peer_interfaces = (mca_btl_tcp_interface_t**)calloc( max_peer_interfaces, sizeof(mca_btl_tcp_interface_t*) );
if (NULL == peer_interfaces) {
max_peer_interfaces = 0;
rc = OPAL_ERR_OUT_OF_RESOURCE;
goto exit;
}
proc_data->num_peer_interfaces = 0;
/*
 * identify all kernel interfaces and the associated addresses of
 * the peer
 */
for( i = 0; i < btl_proc->proc_addr_count; i++ ) {
uint64_t index;
mca_btl_tcp_addr_t* endpoint_addr = btl_proc->proc_addrs + i;
mca_btl_tcp_proc_tosocks (endpoint_addr, &endpoint_addr_ss);
/* Addresses are grouped by kernel interface index: the first address
 * seen for a given addr_ifkindex allots the next peer_interfaces slot
 * and records the kernel index -> slot mapping. */
rc = opal_hash_table_get_value_uint32(&proc_data->peer_kindex_to_index, endpoint_addr->addr_ifkindex, (void**) &index);
if (OPAL_SUCCESS != rc) {
index = proc_data->num_peer_interfaces++;
opal_hash_table_set_value_uint32(&proc_data->peer_kindex_to_index, endpoint_addr->addr_ifkindex, (void*)(uintptr_t) index);
/* Double the array as soon as the count reaches the capacity.
 * NOTE(review): the realloc result overwrites peer_interfaces, so the
 * old array is lost on failure, and the return skips 'exit'. */
if( proc_data->num_peer_interfaces == max_peer_interfaces ) {
max_peer_interfaces <<= 1;
peer_interfaces = (mca_btl_tcp_interface_t**)realloc( peer_interfaces,
max_peer_interfaces * sizeof(mca_btl_tcp_interface_t*) );
if( NULL == peer_interfaces ) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
}
/* NOTE(review): the malloc result is used without a NULL check. */
peer_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t));
mca_btl_tcp_initialise_interface(peer_interfaces[index],
endpoint_addr->addr_ifkindex, index);
}
/*
 * in case the peer address has created all intended connections,
 * mark the complete peer interface as 'not available'
 */
if(endpoint_addr->addr_inuse >= mca_btl_tcp_component.tcp_num_links) {
peer_interfaces[index]->inuse = 1;
}
/* Keep a heap copy of the socket address in the slot's per-family
 * field, together with the address entry it was derived from. */
switch(endpoint_addr_ss.ss_family) {
case AF_INET:
peer_interfaces[index]->ipv4_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss));
peer_interfaces[index]->ipv4_endpoint_addr = endpoint_addr;
memcpy(peer_interfaces[index]->ipv4_address,
&endpoint_addr_ss, sizeof(endpoint_addr_ss));
break;
case AF_INET6:
peer_interfaces[index]->ipv6_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss));
peer_interfaces[index]->ipv6_endpoint_addr = endpoint_addr;
memcpy(peer_interfaces[index]->ipv6_address,
&endpoint_addr_ss, sizeof(endpoint_addr_ss));
break;
default:
opal_output(0, "unknown address family for tcp: %d\n",
endpoint_addr_ss.ss_family);
/* NOTE(review): this return also skips the cleanup under 'exit'. */
return OPAL_ERR_UNREACH;
}
}
/*
 * assign weights to each possible pair of interfaces
 */
/* perm_size is the larger of the local and peer interface counts;
 * weights and best_addr are perm_size x perm_size tables. */
perm_size = proc_data->num_local_interfaces;
if(proc_data->num_peer_interfaces > perm_size) {
perm_size = proc_data->num_peer_interfaces;
}
/* NOTE(review): allocation failures in this section are only caught by
 * assert(). */
proc_data->weights = (enum mca_btl_tcp_connection_quality**) malloc(perm_size
* sizeof(enum mca_btl_tcp_connection_quality*));
assert(NULL != proc_data->weights);
proc_data->best_addr = (mca_btl_tcp_addr_t ***) malloc(perm_size
* sizeof(mca_btl_tcp_addr_t **));
assert(NULL != proc_data->best_addr);
for(i = 0; i < perm_size; ++i) {
proc_data->weights[i] = (enum mca_btl_tcp_connection_quality*) calloc(perm_size,
sizeof(enum mca_btl_tcp_connection_quality));
assert(NULL != proc_data->weights[i]);
proc_data->best_addr[i] = (mca_btl_tcp_addr_t **) calloc(perm_size,
sizeof(mca_btl_tcp_addr_t *));
assert(NULL != proc_data->best_addr[i]);
}
/* Classify every (local, peer) interface pair.  When both sides have an
 * IPv4 address that pair decides the weight ('continue' below); IPv6 is
 * only examined otherwise. */
for( i = 0; i < proc_data->num_local_interfaces; ++i ) {
mca_btl_tcp_interface_t* local_interface = proc_data->local_interfaces[i];
for( j = 0; j < proc_data->num_peer_interfaces; ++j ) {
/* initially, assume no connection is possible */
proc_data->weights[i][j] = CQ_NO_CONNECTION;
/* check state of ipv4 address pair */
if(NULL != proc_data->local_interfaces[i]->ipv4_address &&
NULL != peer_interfaces[j]->ipv4_address) {
/* Convert the IPv4 addresses into nicely-printable strings for verbose debugging output */
inet_ntop(AF_INET, &(((struct sockaddr_in*) proc_data->local_interfaces[i]->ipv4_address))->sin_addr,
str_local, sizeof(str_local));
inet_ntop(AF_INET, &(((struct sockaddr_in*) peer_interfaces[j]->ipv4_address))->sin_addr,
str_remote, sizeof(str_remote));
/* Both ends public: PUBLIC_* quality, split by same/different network. */
if(opal_net_addr_isipv4public((struct sockaddr*) local_interface->ipv4_address) &&
opal_net_addr_isipv4public((struct sockaddr*) peer_interfaces[j]->ipv4_address)) {
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
(struct sockaddr*) peer_interfaces[j]->ipv4_address,
local_interface->ipv4_netmask)) {
proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV4 PUBLIC SAME NETWORK",
str_local, str_remote);
} else {
proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV4 PUBLIC DIFFERENT NETWORK",
str_local, str_remote);
}
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
continue;
}
/* At least one end is not public: PRIVATE_* quality. */
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
(struct sockaddr*) peer_interfaces[j]->ipv4_address,
local_interface->ipv4_netmask)) {
proc_data->weights[i][j] = CQ_PRIVATE_SAME_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV4 PRIVATE SAME NETWORK",
str_local, str_remote);
} else {
proc_data->weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV4 PRIVATE DIFFERENT NETWORK",
str_local, str_remote);
}
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
continue;
}
/* check state of ipv6 address pair - ipv6 is always public,
 * since link-local addresses are skipped in opal_ifinit()
 */
if(NULL != local_interface->ipv6_address &&
NULL != peer_interfaces[j]->ipv6_address) {
/* Convert the IPv6 addresses into nicely-printable strings for verbose debugging output */
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) local_interface->ipv6_address))->sin6_addr,
str_local, sizeof(str_local));
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) peer_interfaces[j]->ipv6_address))->sin6_addr,
str_remote, sizeof(str_remote));
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv6_address,
(struct sockaddr*) peer_interfaces[j]->ipv6_address,
local_interface->ipv6_netmask)) {
proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV6 PUBLIC SAME NETWORK",
str_local, str_remote);
} else {
proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV6 PUBLIC DIFFERENT NETWORK",
str_local, str_remote);
}
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv6_endpoint_addr;
continue;
}
} /* for each peer interface */
} /* for each local interface */
/*
 * determine the size of the set to permute (max number of
 * interfaces)
 */
/* NOTE(review): only 'a' is checked for NULL; best_assignment is not. */
proc_data->best_assignment = (unsigned int *) malloc (perm_size * sizeof(int));
a = (int *) malloc(perm_size * sizeof(int));
if (NULL == a) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
goto exit;
}
/* Can only find the best set of connections when the number of
 * interfaces is not too big. When it gets larger, we fall back
 * to a simpler and faster (and not as optimal) algorithm.
 * See ticket https://svn.open-mpi.org/trac/ompi/ticket/2031
 * for more details about this issue. */
if (perm_size <= MAX_PERMUTATION_INTERFACES) {
memset(a, 0, perm_size * sizeof(int));
proc_data->max_assignment_cardinality = -1;
proc_data->max_assignment_weight = -1;
/* visit() is defined elsewhere; presumably it fills best_assignment
 * with the chosen peer interface per local interface - TODO confirm. */
visit(proc_data, 0, -1, perm_size, a);
rc = OPAL_ERR_UNREACH;
/* Use the first assignment whose peer interface is connectable and
 * not yet in use.
 * NOTE(review): 'best > num_peer_interfaces' lets
 * best == num_peer_interfaces through, and peer_interfaces[best] is
 * dereferenced before it is compared with NULL - looks like '>=' and
 * the opposite order were intended; confirm. */
for(i = 0; i < perm_size; ++i) {
unsigned int best = proc_data->best_assignment[i];
if(best > proc_data->num_peer_interfaces
|| proc_data->weights[i][best] == CQ_NO_CONNECTION
|| peer_interfaces[best]->inuse
|| NULL == peer_interfaces[best]) {
continue;
}
peer_interfaces[best]->inuse++;
btl_endpoint->endpoint_addr = proc_data->best_addr[i][best];
btl_endpoint->endpoint_addr->addr_inuse = true;
rc = OPAL_SUCCESS;
break;
}
} else {
enum mca_btl_tcp_connection_quality max;
int i_max = 0, j_max = 0;
/* Find the best connection that is not in use. Save away
 * the indices of the best location. */
max = CQ_NO_CONNECTION;
for(i=0; i<proc_data->num_local_interfaces; ++i) {
for(j=0; j<proc_data->num_peer_interfaces; ++j) {
if (!peer_interfaces[j]->inuse) {
if (proc_data->weights[i][j] > max) {
max = proc_data->weights[i][j];
i_max = i;
j_max = j;
}
}
}
}
/* Now see if there is some type of connection available. */
rc = OPAL_ERR_UNREACH;
if (CQ_NO_CONNECTION != max) {
peer_interfaces[j_max]->inuse++;
btl_endpoint->endpoint_addr = proc_data->best_addr[i_max][j_max];
btl_endpoint->endpoint_addr->addr_inuse = true;
rc = OPAL_SUCCESS;
}
}
if (OPAL_ERR_UNREACH == rc) {
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: host %s, process %s UNREACHABLE",
proc_hostname,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
}
exit:
// Ok to always free because proc_data was memset() to 0 before
// any possible return (and free(NULL) is fine).
for(i = 0; i < perm_size; ++i) {
free(proc_data->weights[i]);
free(proc_data->best_addr[i]);
}
/* Release the per-call copies of the peer interfaces ... */
for(i = 0; i < proc_data->num_peer_interfaces; ++i) {
if(NULL != peer_interfaces[i]->ipv4_address) {
free(peer_interfaces[i]->ipv4_address);
}
if(NULL != peer_interfaces[i]->ipv6_address) {
free(peer_interfaces[i]->ipv6_address);
}
free(peer_interfaces[i]);
}
free(peer_interfaces);
/* ... and of the local interfaces obtained from
 * mca_btl_tcp_retrieve_local_interfaces(). */
for(i = 0; i < proc_data->num_local_interfaces; ++i) {
if(NULL != proc_data->local_interfaces[i]->ipv4_address) {
free(proc_data->local_interfaces[i]->ipv4_address);
}
if(NULL != proc_data->local_interfaces[i]->ipv6_address) {
free(proc_data->local_interfaces[i]->ipv6_address);
}
free(proc_data->local_interfaces[i]);
}
free(proc_data->local_interfaces); proc_data->local_interfaces = NULL;
proc_data->max_local_interfaces = 0;
free(proc_data->weights); proc_data->weights = NULL;
free(proc_data->best_addr); proc_data->best_addr = NULL;
free(proc_data->best_assignment); proc_data->best_assignment = NULL;
OBJ_DESTRUCT(&_proc_data.local_kindex_to_index);
OBJ_DESTRUCT(&_proc_data.peer_kindex_to_index);
free(a);
/* 'out' is the exit used by the paths that never allocated scratch
 * state, so there is nothing to release here. */
out:
return rc;
}
@ -796,12 +536,6 @@ int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_
OBJ_RELEASE(btl_proc);
return OPAL_SUCCESS;
}
/* The endpoint_addr may still be NULL if this endpoint is
being removed early in the wireup sequence (e.g., if it
is unreachable by all other procs) */
if (NULL != btl_endpoint->endpoint_addr) {
btl_endpoint->endpoint_addr->addr_inuse = false;
}
break;
}
}

Просмотреть файл

@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,57 +55,15 @@ struct mca_btl_tcp_proc_t {
size_t proc_endpoint_count;
/**< number of endpoints */
opal_hash_table_t btl_index_to_endpoint;
/**< interface match table, matches btl_index to remote addresses of type mca_btl_tcp_addr_t */
opal_mutex_t proc_lock;
/**< lock to protect against concurrent access to proc state */
};
typedef struct mca_btl_tcp_proc_t mca_btl_tcp_proc_t;
OBJ_CLASS_DECLARATION(mca_btl_tcp_proc_t);
/* The highest kernel interface index we can handle. */
#define MAX_KERNEL_INTERFACE_INDEX 65536
/* The maximum number of kernel interfaces we can handle. */
#define MAX_KERNEL_INTERFACES 8
/* The maximum number of interfaces that we can have and still use the
 * recursion code for determining the best set of connections. When
 * the number is greater than this, we switch to a simpler algorithm
 * to speed things up.  mca_btl_tcp_proc_insert() compares this limit
 * against the larger of the local and peer interface counts. */
#define MAX_PERMUTATION_INTERFACES 8
/*
 * One network interface (local or peer) as seen by the interface
 * matching code in mca_btl_tcp_proc_insert(): at most one IPv4 and one
 * IPv6 socket address, plus bookkeeping.
 *
 * FIXME: this should probably be part of an ompi list, so we need the
 * appropriate definitions
 */
struct mca_btl_tcp_interface_t {
/* Heap copy of the interface's IPv4 socket address; NULL when the
 * interface has none. */
struct sockaddr_storage* ipv4_address;
/* Heap copy of the interface's IPv6 socket address; NULL when the
 * interface has none. */
struct sockaddr_storage* ipv6_address;
/* For peer interfaces: the peer address entry the IPv4 address was
 * derived from. */
mca_btl_tcp_addr_t* ipv4_endpoint_addr;
/* For peer interfaces: the peer address entry the IPv6 address was
 * derived from. */
mca_btl_tcp_addr_t* ipv6_endpoint_addr;
/* Netmask handed to opal_net_samenetwork() for IPv4 comparisons. */
uint32_t ipv4_netmask;
/* Netmask handed to opal_net_samenetwork() for IPv6 comparisons. */
uint32_t ipv6_netmask;
/* Presumably the kernel's interface index (set by
 * mca_btl_tcp_initialise_interface()) - TODO confirm. */
int kernel_index;
/* NOTE(review): not referenced by the code visible here; meaning
 * unconfirmed. */
int peer_interface;
/* Presumably the slot of this entry in its interface array - TODO
 * confirm against mca_btl_tcp_initialise_interface(). */
int index;
/* Non-zero once the interface is considered taken; interfaces with
 * inuse set are skipped when choosing a connection. */
int inuse;
};
typedef struct mca_btl_tcp_interface_t mca_btl_tcp_interface_t;
/*
 * describes the quality of a possible connection between a local and
 * a remote network interface
 *
 * The enumerators are ordered from worst to best: the matching code
 * compares these values with '>' to pick the best pair, so the order
 * must not be changed.
 */
enum mca_btl_tcp_connection_quality {
/* no usable address pair between the two interfaces */
CQ_NO_CONNECTION,
/* at least one address is not public; different networks */
CQ_PRIVATE_DIFFERENT_NETWORK,
/* at least one address is not public; same network */
CQ_PRIVATE_SAME_NETWORK,
/* public addresses on different networks */
CQ_PUBLIC_DIFFERENT_NETWORK,
/* public addresses on the same network */
CQ_PUBLIC_SAME_NETWORK
};
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc);
mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t* name);
int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t*, mca_btl_base_endpoint_t*);