openmpi/opal/mca/btl/tcp/btl_tcp_proc.c
William Zhang e958f3cf22 btl tcp: Use reachability and graph solving for global interface matching
Previously we used a fairly simple algorithm in
mca_btl_tcp_proc_insert() to pair local and remote modules. It solved
the problem point-in-time rather than as a global optimization (where
global means across all modules between two peers). The selection logic
would often fail by pairing interfaces that could not route traffic to
each other. The complexity of the selection logic was Θ(n^n), which was
expensive. Due to this poor scalability, the logic was only used when
the number of interfaces was less than MAX_PERMUTATION_INTERFACES
(default 8). More details can be found in this ticket:
https://svn.open-mpi.org/trac/ompi/ticket/2031 (the complexity
estimates in the ticket do not match what I calculated from the
function). As a fallback, when the interface count surpassed this
threshold, a brute-force O(n^2) double for loop was used to match
interfaces.

This commit solves two problems. First, the point-in-time solution is
turned into a global optimization. Second, the reachability framework
is now used to create a more realistic reachability map: we switched
from IP/netmask comparison to the reachability framework, which
supports route lookup. This helps many corner cases and lets the BTL
benefit from any future development of the reachability framework.

The complexity of the solution implemented in this commit is mainly
derived from the bipartite assignment solver. If the local and remote
peers both have the same number of interfaces (n), the complexity of
matching is O(n^5).
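
(For intuition, here is my own accounting, which assumes the solver's
dominant phase is an Edmonds-Karp-style max-flow pass over the
bipartite graph: with V = O(n) vertices and E = O(n^2) edges,
O(V * E^2) = O(n * (n^2)^2) = O(n^5).)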

With the decrease in complexity to O(n^5), I calculated and tested
that initialization costs would be 5000 microseconds with 30 interfaces
per node (likely close to the maximum realistic number of interfaces we
will encounter). For additional datapoints, I simulated up to 300
interfaces (a very unrealistic number). Up to 150 interfaces, the
matching costs stay under 1 second, climbing to 10 seconds at 300
interfaces. Reflecting on these results, I removed the suboptimal
O(n^2) fallback logic, as it no longer seems necessary.

Data was also gathered comparing how initialization costs scale with
the number of ranks. For a low number of interfaces, the impact of
initialization is negligible. At an interface count of 7-8, the new
code initializes slightly faster; at an interface count of 15, it
initializes slower. In all cases, initialization costs scale linearly
with the number of ranks.

In order to use the reachable function, we populate local and remote
lists of interfaces and convert the interface-matching problem into a
graph problem. We create a bipartite graph with the local and remote
interfaces as vertices, using negative reachability weights as edge
costs. The bipartite assignment solver then generates the matching for
the graph. To ensure that the local and remote processes compute the
same output, we mirror their respective inputs to the graph. Finally,
we store the endpoint matches in a hash table, with the btl_index as
the key and a struct mca_btl_tcp_addr_t* as the value; this is
retrieved at insertion time to set the endpoint address. (A toy sketch
of this flow follows below.)
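
The following minimal, self-contained sketch illustrates the matching
flow in miniature. It is not the OPAL API: the weight matrix, the toy
size N, and the exhaustive permutation search are stand-ins for the
reachability framework and opal_bp_graph_solve_bipartite_assignment(),
which solve the same assignment problem at real sizes.

#include <limits.h>
#include <stdio.h>

#define N 3   /* interfaces per side (toy size) */

int main(void)
{
    /* Hypothetical reachability weights: higher is a better
     * connection, 0 means unreachable.  Edge cost is the negated
     * weight, so minimizing total cost maximizes total weight. */
    int weight[N][N] = { { 10, 0, 3 },
                         {  0, 8, 0 },
                         {  4, 0, 9 } };
    /* All assignments of local interface -> remote interface. */
    static const int perms[6][N] = {
        {0,1,2}, {0,2,1}, {1,0,2}, {1,2,0}, {2,0,1}, {2,1,0}
    };
    int best_cost = INT_MAX, best = -1;

    for (int p = 0; p < 6; p++) {
        int cost = 0, feasible = 1;
        for (int local = 0; local < N; local++) {
            int remote = perms[p][local];
            if (0 == weight[local][remote]) {  /* skip unreachable edges */
                feasible = 0;
                break;
            }
            cost += -weight[local][remote];    /* negative weight as cost */
        }
        if (feasible && cost < best_cost) {
            best_cost = cost;
            best = p;
        }
    }

    if (best < 0) {
        fprintf(stderr, "no reachable pairing between interfaces\n");
        return 1;
    }
    /* Stand-in for the btl_index -> remote address hash table. */
    for (int local = 0; local < N; local++) {
        printf("local interface %d -> remote interface %d\n",
               local, perms[best][local]);
    }
    return 0;
}

Both peers run the same deterministic solve over mirrored inputs, so
they arrive at the same pairing independently.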

Signed-off-by: William Zhang <wilzhang@amazon.com>
2020-01-21 18:24:08 +00:00

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#include "opal/class/opal_hash_table.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/mca/reachable/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/arch.h"
#include "opal/util/argv.h"
#include "opal/util/if.h"
#include "opal/util/net.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"
#include "opal/util/printf.h"
#include "opal/util/string_copy.h"
#include "opal/util/bipartite_graph.h"
#include "btl_tcp.h"
#include "btl_tcp_proc.h"
static void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc);
static void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc);
OBJ_CLASS_INSTANCE( mca_btl_tcp_proc_t,
opal_list_item_t,
mca_btl_tcp_proc_construct,
mca_btl_tcp_proc_destruct );
void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* tcp_proc)
{
tcp_proc->proc_opal = NULL;
tcp_proc->proc_addrs = NULL;
tcp_proc->proc_addr_count = 0;
tcp_proc->proc_endpoints = NULL;
tcp_proc->proc_endpoint_count = 0;
OBJ_CONSTRUCT(&tcp_proc->proc_lock, opal_mutex_t);
OBJ_CONSTRUCT(&tcp_proc->btl_index_to_endpoint, opal_hash_table_t);
opal_hash_table_init(&tcp_proc->btl_index_to_endpoint, mca_btl_tcp_component.tcp_num_btls);
}
/*
* Cleanup tcp proc instance
*/
void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc)
{
if( NULL != tcp_proc->proc_opal ) {
/* remove from list of all proc instances */
OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
opal_proc_table_remove_value(&mca_btl_tcp_component.tcp_procs,
tcp_proc->proc_opal->proc_name);
OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
OBJ_RELEASE(tcp_proc->proc_opal);
tcp_proc->proc_opal = NULL;
}
/* release resources */
if(NULL != tcp_proc->proc_endpoints) {
free(tcp_proc->proc_endpoints);
}
if(NULL != tcp_proc->proc_addrs) {
free(tcp_proc->proc_addrs);
}
OBJ_DESTRUCT(&tcp_proc->btl_index_to_endpoint);
OBJ_DESTRUCT(&tcp_proc->proc_lock);
}
static inline int mca_btl_tcp_proc_is_proc_left(opal_process_name_t a,
opal_process_name_t b)
{
if (a.jobid != b.jobid) {
return (a.jobid < b.jobid);
} else {
return (a.vpid < b.vpid);
}
}
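/* Example (illustrative): for a = {jobid 7, vpid 4} and b = {jobid 7,
* vpid 9}, the jobids tie and the vpids break the tie, so a is "left".
* Because both peers evaluate the same deterministic comparison, they
* agree on which process's interfaces occupy the left side of the
* bipartite graph. */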
#define MCA_BTL_TCP_PROC_LOCAL_VERTEX(index) (index)
#define MCA_BTL_TCP_PROC_REMOTE_VERTEX(index) (index + mca_btl_tcp_component.tcp_num_btls)
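/* Illustrative vertex layout (hypothetical tcp_num_btls == 3 with two
* remote addresses): local BTL indices 0,1,2 map to vertices 0,1,2 and
* remote address indices 0,1 map to vertices 3,4. Keeping the two
* blocks contiguous gives the solver a single vertex namespace while
* the macros preserve the local/remote split. */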
/* This function builds a graph to match local and remote interfaces
* together. It also populates the remote proc object.
*
* @param btl_proc (IN) Remote proc information
* @param remote_addrs (IN) List of addresses from remote interfaces
* @param local_proc_is_left (IN) Boolean indicator. If true, we set local process
* interfaces to be on the left side of the graph.
* If false, we set remote process interfaces to
* be on the left side of the graph.
* @param graph_out (OUT) Constructed and populated bipartite interface
* graph with vertices as interfaces and negative
* reachability weights as costs for the edges.
* @return OPAL error code or success
*
* The vertices of this graph are the local and remote interfaces. Edges in
* this graph are connections between the interfaces. Costs are computed as
* negative weight which is calculated using the reachability framework.
*
* In order to mirror inputs on both the local and remote side when solving
* interface matching from both sides, we require local_proc_is_left to
* indicate whether the local interfaces should be on the left of the graph
* or not.
*
* The remote list and proc_addrs are assembled and populated here so that
* we can ensure that the vertex ordering matches the proc_addr ordering.
* This allows us to pass the correct pointers to the vertex data for storage.
*
*/
static int mca_btl_tcp_proc_create_interface_graph(mca_btl_tcp_proc_t* btl_proc,
mca_btl_tcp_modex_addr_t* remote_addrs,
int local_proc_is_left,
opal_bp_graph_t **graph_out)
{
opal_bp_graph_t *graph = NULL;
opal_reachable_t *results = NULL;
opal_list_t *local_list = &mca_btl_tcp_component.local_ifs;
opal_list_t *remote_list;
int rc, v_index, x, y, cost, u, v, num_edges = 0;
size_t i;
remote_list = OBJ_NEW(opal_list_t);
if (NULL == remote_list) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
goto out;
}
/* the modex and proc structures differ slightly, so copy the
fields needed in the proc version */
for (i = 0 ; i < btl_proc->proc_addr_count ; i++) {
/* Construct opal_if_t objects for the remote interfaces */
opal_if_t *interface = OBJ_NEW(opal_if_t);
if (NULL == interface) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
goto out;
}
if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) {
memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet,
remote_addrs[i].addr, sizeof(struct in_addr));
btl_proc->proc_addrs[i].addr_family = AF_INET;
memcpy(&((struct sockaddr_in *)&(interface->if_addr))->sin_addr,
remote_addrs[i].addr, sizeof(struct in_addr));
((struct sockaddr *)&(interface->if_addr))->sa_family = AF_INET;
interface->af_family = AF_INET;
} else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) {
#if OPAL_ENABLE_IPV6
memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet6,
remote_addrs[i].addr, sizeof(struct in6_addr));
btl_proc->proc_addrs[i].addr_family = AF_INET6;
memcpy(&((struct sockaddr_in6 *)&(interface->if_addr))->sin6_addr,
remote_addrs[i].addr, sizeof(struct in6_addr));
((struct sockaddr *)&(interface->if_addr))->sa_family = AF_INET6;
interface->af_family = AF_INET6;
#else
rc = OPAL_ERR_NOT_SUPPORTED;
OBJ_RELEASE(interface);
goto out;
#endif
} else {
BTL_ERROR(("Unexpected address family %d",
(int)remote_addrs[i].addr_family));
rc = OPAL_ERR_BAD_PARAM;
OBJ_RELEASE(interface);
goto out;
}
btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;
interface->if_mask = remote_addrs[i].addr_mask;
interface->if_bandwidth = remote_addrs[i].addr_bandwidth;
opal_list_append(remote_list, &(interface->super));
}
rc = opal_bp_graph_create(NULL, NULL, &graph);
if (OPAL_SUCCESS != rc) {
goto out;
}
results = opal_reachable.reachable(local_list, remote_list);
if (NULL == results) {
rc = OPAL_ERROR;
goto err_graph;
}
/* Add vertices for each local interface. These will store a pointer to the btl index */
for (x = 0; x < results->num_local; x++) {
rc = opal_bp_graph_add_vertex(graph, &mca_btl_tcp_component.tcp_btls[x]->btl_index, &v_index);
if (OPAL_SUCCESS != rc) {
goto err_graph;
}
}
/* Add vertices for each remote interface. These will store a pointer to the remote address information */
for (y = 0; y < results->num_remote; y++) {
rc = opal_bp_graph_add_vertex(graph, &btl_proc->proc_addrs[y], &v_index);
if (OPAL_SUCCESS != rc) {
goto err_graph;
}
}
/* Add edges */
for (x = 0; x < results->num_local; x++) {
for (y = 0; y < results->num_remote; y++) {
/* The bipartite assignment solver optimizes the graph for least
* cost. Weights start at 0 (no connection) and grow with connection
* quality (scaled by other factors), so a higher weight is better.
* To make the least-cost solution pick the highest total weight, we
* set each edge cost to the negated weight.
*/
cost = -results->weights[x][y];
/* Skip edges with no connections */
if (0 == cost) {
continue;
}
if (local_proc_is_left) {
u = MCA_BTL_TCP_PROC_LOCAL_VERTEX(x);
v = MCA_BTL_TCP_PROC_REMOTE_VERTEX(y);
} else {
u = MCA_BTL_TCP_PROC_REMOTE_VERTEX(y);
v = MCA_BTL_TCP_PROC_LOCAL_VERTEX(x);
}
rc = opal_bp_graph_add_edge(graph, u, v, cost, 1, NULL);
if (OPAL_SUCCESS != rc) {
goto err_graph;
}
num_edges++;
}
}
if (0 == num_edges) {
BTL_ERROR(("Unable to find reachable pairing between local and remote interfaces"));
rc = OPAL_ERR_UNREACH;
}
*graph_out = graph;
goto out;
err_graph:
if (NULL != graph) {
opal_bp_graph_free(graph);
}
out:
if (NULL != results) {
free(results);
}
if (NULL != remote_list) {
OBJ_RELEASE(remote_list);
}
return rc;
}
/* We store the matched interface data by using the btl_index as the key and
* a pointer to a mca_btl_tcp_addr_t struct.
*/
static int mca_btl_tcp_proc_store_matched_interfaces(mca_btl_tcp_proc_t *btl_proc,
int local_proc_is_left,
opal_bp_graph_t *graph,
int num_matched, int *matched_edges)
{
int rc = OPAL_SUCCESS;
int i, left, right;
uint32_t* local_index;
struct mca_btl_tcp_addr_t *remote_addr;
for (i = 0; i < num_matched; i++) {
left = matched_edges[2 * i + 0];
right = matched_edges[2 * i + 1];
if (local_proc_is_left) {
rc = opal_bp_graph_get_vertex_data(graph, left, (void *)&local_index);
if (OPAL_SUCCESS != rc) {
goto out;
}
rc = opal_bp_graph_get_vertex_data(graph, right, (void *)&remote_addr);
if (OPAL_SUCCESS != rc) {
goto out;
}
} else {
rc = opal_bp_graph_get_vertex_data(graph, right, (void *)&local_index);
if (OPAL_SUCCESS != rc) {
goto out;
}
rc = opal_bp_graph_get_vertex_data(graph, left, (void *)&remote_addr);
if (OPAL_SUCCESS != rc) {
goto out;
}
}
opal_hash_table_set_value_uint32(&btl_proc->btl_index_to_endpoint, *local_index, (void *)remote_addr);
}
out:
return rc;
}
static int mca_btl_tcp_proc_handle_modex_addresses(mca_btl_tcp_proc_t* btl_proc,
mca_btl_tcp_modex_addr_t* remote_addrs,
int local_proc_is_left)
{
opal_bp_graph_t *graph = NULL;
int rc = OPAL_SUCCESS;
int num_matched = 0;
int *matched_edges = NULL;
rc = mca_btl_tcp_proc_create_interface_graph(btl_proc, remote_addrs, local_proc_is_left, &graph);
if (rc) {
goto cleanup;
}
rc = opal_bp_graph_solve_bipartite_assignment(graph, &num_matched, &matched_edges);
if (rc) {
goto cleanup;
}
rc = mca_btl_tcp_proc_store_matched_interfaces(btl_proc, local_proc_is_left,
graph, num_matched, matched_edges);
if (rc) {
goto cleanup;
}
cleanup:
if (NULL != graph) {
opal_bp_graph_free(graph);
}
return rc;
}
/*
* Create a TCP process structure. There is a one-to-one correspondence
* between a opal_proc_t and a mca_btl_tcp_proc_t instance. We cache
* additional data (specifically the list of mca_btl_tcp_endpoint_t instances,
* and published addresses) associated w/ a given destination on this
* datastructure.
*/
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
{
mca_btl_tcp_proc_t* btl_proc;
int rc, local_proc_is_left;
mca_btl_tcp_modex_addr_t *remote_addrs = NULL;
size_t size;
OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
rc = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
proc->proc_name, (void**)&btl_proc);
if (OPAL_SUCCESS == rc) {
OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
return btl_proc;
}
/* proc was not found, so create one */
btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
if (NULL == btl_proc) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
/* Retain the proc, but don't store the ref into the btl_proc just yet. This
* provides a way to release the btl_proc in case of failure without having to
* unlock the mutex.
*/
OBJ_RETAIN(proc);
/* lookup tcp parameters exported by this proc */
OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
&proc->proc_name, (uint8_t**)&remote_addrs, &size);
if (OPAL_SUCCESS != rc) {
if (OPAL_ERR_NOT_FOUND != rc) {
BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc));
}
goto cleanup;
}
if (0 != (size % sizeof(mca_btl_tcp_modex_addr_t))) {
BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n",
(unsigned long)size,
(unsigned long)sizeof(mca_btl_tcp_modex_addr_t)));
rc = OPAL_ERROR;
goto cleanup;
}
btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_modex_addr_t);
btl_proc->proc_addrs = malloc(btl_proc->proc_addr_count *
sizeof(mca_btl_tcp_addr_t));
if (NULL == btl_proc->proc_addrs) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
/* When solving for bipartite assignment, a graph with equal weights
* can provide different outputs depending on the input parameters.
* Thus two processes can construct different interface matchings.
* To avoid this case, we put the process with the lower jobid on the
* left; if the jobids are equal, the process with the lower vpid goes
* on the left.
*
* The concept of mirroring the local and remote sides is borrowed
* from the usnic btl implementation of its bipartite assignment solver.
*/
local_proc_is_left = mca_btl_tcp_proc_is_proc_left(proc->proc_name, opal_proc_local_get()->proc_name);
rc = mca_btl_tcp_proc_handle_modex_addresses(btl_proc, remote_addrs, local_proc_is_left);
if (OPAL_SUCCESS != rc) {
goto cleanup;
}
/* allocate space for endpoint array - one for each exported address */
btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc((1 + btl_proc->proc_addr_count) *
sizeof(mca_btl_base_endpoint_t*));
if (NULL == btl_proc->proc_endpoints) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
cleanup:
if (OPAL_SUCCESS == rc) {
btl_proc->proc_opal = proc; /* link with the proc */
/* add to hash table of all proc instances. */
opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
proc->proc_name, btl_proc);
} else {
if (btl_proc) {
OBJ_RELEASE(btl_proc); /* release the local proc */
OBJ_RELEASE(proc); /* and the ref on the OMPI proc */
btl_proc = NULL;
}
}
if (NULL != remote_addrs) {
free(remote_addrs);
}
OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
return btl_proc;
}
/*
* Note that this routine must be called with the lock on the process
* already held. Insert a btl instance into the proc array and assign
* it an address.
*/
int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t* btl_proc,
mca_btl_base_endpoint_t* btl_endpoint)
{
mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl;
const char *proc_hostname;
mca_btl_tcp_addr_t *remote_addr;
int rc = OPAL_SUCCESS;
if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
rc = OPAL_ERR_UNREACH;
goto out;
}
rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: host %s, process %s UNREACHABLE",
proc_hostname,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
goto out;
}
btl_endpoint->endpoint_addr = remote_addr;
#ifndef WORDS_BIGENDIAN
/* if we are little endian and our peer is not so lucky, then we
need to put all information sent to him in big endian (aka
Network Byte Order) and expect all information received to
be in NBO. Since big endian machines always send and receive
in NBO, we don't care so much about that case. */
if (btl_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) {
btl_endpoint->endpoint_nbo = true;
}
#endif
/* insert into endpoint array */
btl_endpoint->endpoint_proc = btl_proc;
btl_proc->proc_endpoints[btl_proc->proc_endpoint_count++] = btl_endpoint;
out:
return rc;
}
/*
* Remove an endpoint from the proc array and indicate the address is
* no longer in use.
*/
int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_t* btl_endpoint)
{
size_t i;
if (NULL != btl_proc) {
OPAL_THREAD_LOCK(&btl_proc->proc_lock);
for(i = 0; i < btl_proc->proc_endpoint_count; i++) {
if(btl_proc->proc_endpoints[i] == btl_endpoint) {
memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1,
(btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*));
if(--btl_proc->proc_endpoint_count == 0) {
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
OBJ_RELEASE(btl_proc);
return OPAL_SUCCESS;
}
break;
}
}
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
}
return OPAL_SUCCESS;
}
/*
* Look for an existing TCP process instance based on the globally unique
* process identifier.
*/
mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t *name)
{
mca_btl_tcp_proc_t* proc = NULL;
OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
*name, (void**)&proc);
OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
if (OPAL_UNLIKELY(NULL == proc)) {
mca_btl_base_endpoint_t *endpoint;
opal_proc_t *opal_proc;
BTL_VERBOSE(("adding tcp proc for peer {%s}",
OPAL_NAME_PRINT(*name)));
opal_proc = opal_proc_for_name (*name);
if (NULL == opal_proc) {
return NULL;
}
/* try adding this proc to each btl, taking the proc from the first endpoint created */
for( uint32_t i = 0; i < mca_btl_tcp_component.tcp_num_btls; ++i ) {
endpoint = NULL;
(void) mca_btl_tcp_add_procs (&mca_btl_tcp_component.tcp_btls[i]->super, 1, &opal_proc,
&endpoint, NULL);
if (NULL != endpoint && NULL == proc) {
/* construct all the endpoints and get the proc */
proc = endpoint->endpoint_proc;
}
}
}
return proc;
}
/*
* loop through all available BTLs for one matching the source address
* of the request.
*/
void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr, int sd)
{
OPAL_THREAD_LOCK(&btl_proc->proc_lock);
int found_match = 0;
mca_btl_base_endpoint_t* match_btl_endpoint;
for( size_t i = 0; i < btl_proc->proc_endpoint_count; i++ ) {
mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
/* We are not here to decide which socket is good and which is not.
* We simply check that this socket fits the endpoint; the real
* decision is left to mca_btl_tcp_endpoint_accept(). */
if( btl_endpoint->endpoint_addr->addr_family != addr->sa_family) {
continue;
}
switch (addr->sa_family) {
case AF_INET:
if( memcmp( &btl_endpoint->endpoint_addr->addr_union.addr_inet,
&(((struct sockaddr_in*)addr)->sin_addr),
sizeof(struct in_addr) ) ) {
char tmp[2][16];
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl: tcp: Match incoming connection from %s %s with locally known IP %s failed (iface %d/%d)!\n",
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
inet_ntop(AF_INET, (void*)&((struct sockaddr_in*)addr)->sin_addr,
tmp[0], 16),
inet_ntop(AF_INET, (void*)(struct in_addr*)&btl_endpoint->endpoint_addr->addr_union.addr_inet,
tmp[1], 16),
(int)i, (int)btl_proc->proc_endpoint_count);
continue;
} else if (btl_endpoint->endpoint_state != MCA_BTL_TCP_CLOSED) {
found_match = 1;
match_btl_endpoint = btl_endpoint;
continue;
}
break;
#if OPAL_ENABLE_IPV6
case AF_INET6:
if( memcmp( &btl_endpoint->endpoint_addr->addr_union.addr_inet,
&(((struct sockaddr_in6*)addr)->sin6_addr),
sizeof(struct in6_addr) ) ) {
char tmp[2][INET6_ADDRSTRLEN];
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl: tcp: Match incoming connection from %s %s with locally known IP %s failed (iface %d/%d)!\n",
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
inet_ntop(AF_INET6, (void*)&((struct sockaddr_in6*)addr)->sin6_addr,
tmp[0], INET6_ADDRSTRLEN),
inet_ntop(AF_INET6, (void*)(struct in6_addr*)&btl_endpoint->endpoint_addr->addr_union.addr_inet,
tmp[1], INET6_ADDRSTRLEN),
(int)i, (int)btl_proc->proc_endpoint_count);
continue;
} else if (btl_endpoint->endpoint_state != MCA_BTL_TCP_CLOSED) {
found_match = 1;
match_btl_endpoint = btl_endpoint;
continue;
}
break;
#endif
default:
;
}
/* Set the state to CONNECTING to ensure that subsequent connections do not attempt to re-use the endpoint in the num_links > 1 case */
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
(void)mca_btl_tcp_endpoint_accept(btl_endpoint, addr, sd);
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
return;
}
/* In this case the connection was inbound to an exported address, but the
* endpoint was not in a CLOSED state. mca_btl_tcp_endpoint_accept() has logic
* to deal with the race condition that likely caused this scenario, so call it here. */
if (found_match) {
(void)mca_btl_tcp_endpoint_accept(match_btl_endpoint, addr, sd);
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
return;
}
/* No further use of this socket. Close it */
CLOSE_THE_SOCKET(sd);
{
char *addr_str = NULL, *tmp;
char ip[128];
ip[sizeof(ip) - 1] = '\0';
for (size_t i = 0; i < btl_proc->proc_endpoint_count; i++) {
mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
if (btl_endpoint->endpoint_addr->addr_family != addr->sa_family) {
continue;
}
inet_ntop(btl_endpoint->endpoint_addr->addr_family,
(void*) &(btl_endpoint->endpoint_addr->addr_union.addr_inet),
ip, sizeof(ip) - 1);
if (NULL == addr_str) {
opal_asprintf(&tmp, "\n\t%s", ip);
} else {
opal_asprintf(&tmp, "%s\n\t%s", addr_str, ip);
free(addr_str);
}
addr_str = tmp;
}
opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection",
true, opal_process_info.nodename,
getpid(),
btl_proc->proc_opal->proc_hostname,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
opal_net_get_hostname((struct sockaddr*)addr),
btl_proc->proc_endpoint_count,
(NULL == addr_str) ? "NONE" : addr_str);
if (NULL != addr_str) {
free(addr_str);
}
}
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
}
/*
* convert internal data structure (mca_btl_tcp_addr_t) to sockaddr_storage
*
*/
bool mca_btl_tcp_proc_tosocks(mca_btl_tcp_addr_t* proc_addr,
struct sockaddr_storage* output)
{
memset(output, 0, sizeof (*output));
switch (proc_addr->addr_family) {
case AF_INET:
output->ss_family = AF_INET;
memcpy(&((struct sockaddr_in*)output)->sin_addr,
&proc_addr->addr_union.addr_inet, sizeof(struct in_addr));
((struct sockaddr_in*)output)->sin_port = proc_addr->addr_port;
break;
#if OPAL_ENABLE_IPV6
case AF_INET6:
{
struct sockaddr_in6* inaddr = (struct sockaddr_in6*)output;
output->ss_family = AF_INET6;
memcpy(&inaddr->sin6_addr, &proc_addr->addr_union.addr_inet6,
sizeof (proc_addr->addr_union.addr_inet6));
inaddr->sin6_port = proc_addr->addr_port;
inaddr->sin6_scope_id = 0;
inaddr->sin6_flowinfo = 0;
}
break;
#endif
default:
opal_output( 0, "mca_btl_tcp_proc: unknown af_family received: %d\n",
proc_addr->addr_family );
return false;
}
return true;
}
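
/* Usage sketch (illustrative; not part of the original file): how a
* connect path might consume mca_btl_tcp_proc_tosocks(). The function
* name and the simplified error handling are assumptions made for the
* example, so the block is guarded out of compilation. */
#if 0
static int example_connect_to_peer(mca_btl_tcp_addr_t *proc_addr)
{
    struct sockaddr_storage ss;
    int sd;

    /* Translate the matched BTL address into a connectable sockaddr. */
    if (!mca_btl_tcp_proc_tosocks(proc_addr, &ss)) {
        return OPAL_ERR_BAD_PARAM;
    }
    sd = socket(ss.ss_family, SOCK_STREAM, 0);
    if (sd < 0) {
        return OPAL_ERR_IN_ERRNO;
    }
    if (connect(sd, (struct sockaddr *)&ss,
                (AF_INET == ss.ss_family) ? sizeof(struct sockaddr_in)
                                          : sizeof(struct sockaddr_in6)) < 0) {
        CLOSE_THE_SOCKET(sd);
        return OPAL_ERR_UNREACH;
    }
    CLOSE_THE_SOCKET(sd);   /* example only: do not leak the fd */
    return OPAL_SUCCESS;
}
#endif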