e958f3cf22
Previously we used a fairly simple algorithm in mca_btl_tcp_proc_insert() to pair local and remote modules. This was a point-in-time solution rather than a global optimization (where global means all modules between two peers). The selection logic would often fail because it paired interfaces that are not routable for traffic. The complexity of the selection logic was Θ(n^n), which was expensive. Due to poor scalability, this logic was only used when the number of interfaces was less than MAX_PERMUTATION_INTERFACES (default 8). More details can be found in this ticket: https://svn.open-mpi.org/trac/ompi/ticket/2031 (the complexity estimates in the ticket do not match what I calculated from the function). As a fallback, when the number of interfaces surpassed this threshold, a brute-force O(n^2) double for loop was used to match interfaces. This commit solves two problems. First, the point-in-time solution is turned into a global optimization solution. Second, the reachability framework is used to create a more realistic reachability map. We switched from using IP/netmask to using the reachability framework, which supports route lookup. This will help many corner cases as well as utilize any future development of the reachability framework. The solution implemented in this commit has a complexity mainly derived from the bipartite assignment solver. If the local and remote peer both have the same number of interfaces (n), the complexity of matching will be O(n^5). With the decrease in complexity to O(n^5), I calculated and tested that initialization costs would be 5000 microseconds with 30 interfaces per node (likely close to the maximum realistic number of interfaces we will encounter). For additional datapoints, up to 300 interfaces (a very unrealistic number) were simulated. Up to 150 interfaces, the matching costs will be less than 1 second, climbing to 10 seconds with 300 interfaces. 
Reflecting on these results, I removed the suboptimal O(n^2) fallback logic, as it no longer seems necessary. Data was gathered comparing the scaling of initialization costs with ranks. For a low number of interfaces, the impact of initialization is negligible. At an interface count of 7-8, the new code has slightly lower initialization costs. At an interface count of 15, the new code has higher initialization costs. However, all initialization costs scale linearly with the number of ranks. In order to use the reachable function, we populate local and remote lists of interfaces. We then convert the interface matching problem into a graph problem. We create a bipartite graph with the local and remote interfaces as vertices and use negative reachability weights as costs. Using the bipartite assignment solver, we generate the matches for the graph. To ensure that both the local and remote processes produce the same output, we mirror their respective inputs for the graphs. Finally, we store the endpoint matches that we created earlier in a hash table. Each match is stored with the btl_index as the key and a struct mca_btl_tcp_addr_t* as the value. It is then retrieved at insertion time to set the endpoint address. Signed-off-by: William Zhang <wilzhang@amazon.com>
76 строки
2.6 KiB
C
76 строки
2.6 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
|
|
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#ifndef MCA_BTL_TCP_PROC_H
|
|
#define MCA_BTL_TCP_PROC_H
|
|
|
|
#include "opal/class/opal_object.h"
|
|
#include "opal/util/proc.h"
|
|
#include "btl_tcp.h"
|
|
#include "btl_tcp_addr.h"
|
|
#include "btl_tcp_endpoint.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/**
 * Represents the state of a remote process and the set of addresses
 * that it exports. Also caches an instance of mca_btl_base_endpoint_t
 * for each BTL instance that attempts to open a connection to the
 * process.
 */
|
|
struct mca_btl_tcp_proc_t {
|
|
opal_list_item_t super;
|
|
/**< list-item base; allows a proc to be placed on a list */
|
|
|
|
opal_proc_t *proc_opal;
|
|
/**< pointer to the corresponding opal_proc_t */
|
|
|
|
struct mca_btl_tcp_addr_t* proc_addrs;
|
|
/**< array of addresses exported by the peer */
|
|
|
|
size_t proc_addr_count;
|
|
/**< number of addresses published by the peer
 *   (presumably the length of proc_addrs -- TODO confirm) */
|
|
|
|
struct mca_btl_base_endpoint_t **proc_endpoints;
|
|
/**< array of endpoints that have been created to access this proc */
|
|
|
|
size_t proc_endpoint_count;
|
|
/**< number of endpoints
 *   (presumably the number of entries in proc_endpoints -- TODO confirm) */
|
|
|
|
opal_hash_table_t btl_index_to_endpoint;
|
|
/**< interface match table: maps a btl_index (key) to the matched remote
 *   address, stored as a struct mca_btl_tcp_addr_t* (value) */
|
|
|
|
opal_mutex_t proc_lock;
|
|
/**< lock to protect against concurrent access to proc state */
|
|
};
|
|
/** Shorthand so the struct can be referred to without the `struct` keyword. */
typedef struct mca_btl_tcp_proc_t mca_btl_tcp_proc_t;
|
|
/**
 * Class declaration for mca_btl_tcp_proc_t.
 * NOTE(review): OBJ_CLASS_DECLARATION presumably comes from
 * opal/class/opal_object.h (included by this header) -- confirm.
 */
OBJ_CLASS_DECLARATION(mca_btl_tcp_proc_t);
|
|
|
|
/**
 * Obtain the mca_btl_tcp_proc_t that corresponds to an opal_proc_t.
 *
 * @param proc  the OPAL process descriptor
 * @return pointer to the TCP BTL proc instance
 *
 * NOTE(review): the implementation is not visible in this header; whether
 * an existing instance may be returned, and what is returned on failure
 * (presumably NULL), should be confirmed against the definition.
 */
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc);
|
|
/**
 * Find the mca_btl_tcp_proc_t associated with a process name.
 *
 * @param name  process name to look up (not modified)
 * @return pointer to the matching TCP BTL proc instance
 *
 * NOTE(review): presumably returns NULL when no proc matches -- confirm
 * against the definition.
 */
mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t* name);
|
|
/**
 * Insert an endpoint into a proc.
 *
 * According to the commit description for this revision, the endpoint's
 * remote address is set at insertion time from the match stored in the
 * proc's btl_index_to_endpoint table.
 *
 * First parameter: the proc to insert into; second: the endpoint to insert.
 *
 * @return int status
 *         NOTE(review): presumably an OPAL_* status code -- confirm against
 *         the definition.
 */
int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t*, mca_btl_base_endpoint_t*);
|
|
/**
 * Remove an endpoint from a proc.
 *
 * First parameter: the proc; second: the endpoint to remove.
 *
 * @return int status
 *         NOTE(review): presumably an OPAL_* status code; whether the proc
 *         itself is released when its last endpoint is removed is not
 *         visible here -- confirm against the definition.
 */
int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t*, mca_btl_base_endpoint_t*);
|
|
/**
 * Handle an incoming connection for a proc.
 *
 * Parameters, in order: the proc, the socket address of the connecting
 * peer, and an int.
 * NOTE(review): the int is presumably the accepted socket descriptor --
 * confirm against the definition.
 */
void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t*, struct sockaddr*, int);
|
|
/**
 * Convert a TCP BTL address into a socket address.
 *
 * Parameters, in order: the mca_btl_tcp_addr_t to convert and the
 * sockaddr_storage that receives the result.
 *
 * @return bool
 *         NOTE(review): presumably true on successful conversion and false
 *         otherwise (e.g. unsupported address family) -- confirm against
 *         the definition.
 */
bool mca_btl_tcp_proc_tosocks(mca_btl_tcp_addr_t*, struct sockaddr_storage*);
|
|
|
|
END_C_DECLS
|
|
#endif
|