usnic: improve interface matching (esp. for UDP)
Prior to this commit, we matched local interfaces to remote interfaces in a
simplistic way when creating endpoints: if any remote interfaces were on the
same subnet as any of our local interfaces, then only those same-subnet pairs
were made (IP-routed remote interfaces were ignored).

This commit introduces a more general scheme that attempts to make the "best"
pairing of local interfaces to remote interfaces. We now cast the problem as a
graph theory problem known as the "Assignment Problem": finding a
maximum-cardinality, minimum-weight bipartite matching. We solve this problem
by reducing the bipartite graph of interface connectivity to a flow network
and then solving for a minimum-cost flow, which is easily converted back into
a matching on the original bipartite graph.

In the new scheme, interfaces on the same subnet are preferred over interfaces
requiring intermediate routing hops, and higher-bandwidth links are preferred
over lower-bandwidth links.

Reviewed-by: Jeff Squyres <jsquyres@cisco.com>
cmr=v1.7.5:ticket=trac:4253

This commit was SVN r30849.

The following Trac tickets were found above:
  Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
Parent: 47148ab3cb
Commit: c40f8879c8
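Before the diff, it may help to see the heart of the scheme in isolation: a
minimal, standalone sketch (not code from this commit; pack_weight and its
inputs are illustrative) of the 64-bit edge-weight encoding that the new
compute_weight() builds, and of the cost handed to the min-cost-flow solver:

    #include <inttypes.h>
    #include <stdio.h>

    /* Pack the pairing priorities into one 64-bit weight, mirroring the
     * encoding documented in compute_weight() in the diff below:
     *   bit 48     : A = 1 iff both interfaces are on the same subnet
     *   bits 40-47 : B = min link speed (Gbps) of the pair, clamped to 255
     *   bit 32     : literal 1, so every reachable edge has nonzero weight
     *   bits 0-31  : C = routing metric (currently always 0)            */
    static uint64_t pack_weight(int same_subnet, uint32_t min_link_speed_gbps)
    {
        if (min_link_speed_gbps > 0xff) {
            min_link_speed_gbps = 0xff;  /* clamp to the 8-bit B field */
        }
        return ((uint64_t)(same_subnet ? 1 : 0) << 48) |
               ((uint64_t)(min_link_speed_gbps & 0xff) << 40) |
               ((uint64_t)0x1 << 32);
    }

    int main(void)
    {
        uint64_t w_same   = pack_weight(1, 10);  /* same subnet, 10 Gbps   */
        uint64_t w_routed = pack_weight(0, 40);  /* routed hop, 40 Gbps    */

        /* 0x00010a0100000000 > 0x0000280100000000: same subnet wins */
        printf("same-subnet preferred: %s\n", w_same > w_routed ? "yes" : "no");

        /* the graph code minimizes cost, so each edge carries cost = -weight */
        printf("edge cost: %" PRId64 "\n", -(int64_t)w_same);
        return 0;
    }

Because the same-subnet bit sits above the link-speed byte, a same-subnet
10 Gbps pair always outranks a routed 40 Gbps pair; swapping that preference
is exactly the TODO recorded in the diff below.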
@@ -40,6 +40,7 @@
 #include "ompi/mca/btl/base/base.h"
 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
 
+#include "btl_usnic_libnl_utils.h"
 #include "btl_usnic_compat.h"
 
 BEGIN_C_DECLS
@@ -188,6 +189,9 @@ typedef struct ompi_btl_usnic_component_t {
 
     /** retrans characteristics */
     int retrans_timeout;
+
+    /** socket used for rtnetlink queries */
+    struct usnic_rtnl_sk *unlsk;
 } ompi_btl_usnic_component_t;
 
 OMPI_MODULE_DECLSPEC extern ompi_btl_usnic_component_t mca_btl_usnic_component;
@@ -31,6 +31,9 @@
 # define opal_show_help orte_show_help
 # define mca_base_var_check_exclusive(S,A,B,C,D,E,F) \
     mca_base_param_check_exclusive_string(A,B,C,D,E,F)
+# define ompi_rte_compare_name_fields(a, b, c) \
+    orte_util_compare_name_fields(a, b, c)
+# define OMPI_RTE_CMP_ALL ORTE_NS_CMP_ALL
 # define ompi_process_info orte_process_info
 # define ompi_rte_hash_name orte_util_hash_name
 # define OMPI_PROC_MY_NAME ORTE_PROC_MY_NAME
@@ -209,11 +209,14 @@ static int usnic_component_close(void)
 
     free(mca_btl_usnic_component.usnic_all_modules);
     free(mca_btl_usnic_component.usnic_active_modules);
 
     if (NULL != mca_btl_usnic_component.vendor_part_ids) {
         free(mca_btl_usnic_component.vendor_part_ids);
         mca_btl_usnic_component.vendor_part_ids = NULL;
     }
+
+    ompi_btl_usnic_rtnl_sk_free(mca_btl_usnic_component.unlsk);
+
 #if OMPI_BTL_USNIC_UNIT_TESTS
     /* clean up the unit test infrastructure */
     ompi_btl_usnic_cleanup_tests();
@@ -475,7 +478,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
     usnic_if_filter_t *filter;
     bool keep_module;
     bool filter_incl = false;
-    int min_distance, num_local_procs;
+    int min_distance, num_local_procs, err;
 
     *num_btl_modules = 0;
 
@@ -528,6 +531,14 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
 
     opal_srand(&ompi_btl_usnic_rand_buff, ((uint32_t) getpid()));
 
+    err = ompi_btl_usnic_rtnl_sk_alloc(&mca_btl_usnic_component.unlsk);
+    if (0 != err) {
+        /* API returns negative errno values */
+        opal_show_help("help-mpi-btl-usnic.txt", "rtnetlink init fail",
+                       true, ompi_process_info.nodename, strerror(-err));
+        return NULL;
+    }
+
     /* Find the ports that we want to use. We do our own interface name
      * filtering below, so don't let the verbs code see our
      * if_include/if_exclude strings */
@@ -1212,6 +1223,7 @@ static int init_module_from_port(ompi_btl_usnic_module_t *module,
             return OMPI_ERROR;
         }
     }
+    module->local_addr.link_speed_mbps = module->super.btl_bandwidth;
     opal_output_verbose(5, USNIC_OUT,
                         "btl:usnic: bandwidth for %s:%d = %u",
                         ibv_get_device_name(module->device),
@@ -72,6 +72,7 @@ typedef struct ompi_btl_usnic_addr_t {
     uint32_t cidrmask;
     uint8_t mac[6];
     int mtu;
+    uint32_t link_speed_mbps;
 } ompi_btl_usnic_addr_t;
 
 struct ompi_btl_usnic_send_segment_t;
@@ -129,6 +129,9 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
         rc = ompi_btl_usnic_create_endpoint(module, usnic_proc,
                                             &usnic_endpoint);
         if (OMPI_SUCCESS != rc) {
+            opal_output_verbose(5, USNIC_OUT,
+                                "btl:usnic:%s: unable to create endpoint for module=%p proc=%p\n",
+                                __func__, (void *)module, (void *)usnic_proc);
             OBJ_RELEASE(usnic_proc);
             continue;
         }
@@ -11,7 +11,7 @@
  * All rights reserved.
  * Copyright (c) 2006 Sandia National Laboratories. All rights
  * reserved.
- * Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -36,14 +36,21 @@
 #include "btl_usnic_endpoint.h"
 #include "btl_usnic_module.h"
 #include "btl_usnic_util.h"
+#include "btl_usnic_graph.h"
 
 /* larger weight values are more desirable (i.e., worth, not cost) */
-enum conn_weight {
-    WEIGHT_UNREACHABLE = -1,
-    WEIGHT_DIFF_NET = 0,
-    WEIGHT_SAME_NET = 1
+enum {
+    WEIGHT_UNREACHABLE = -1
 };
 
+/* Helper macros for "match_modex" and friends for translating between array
+ * indices and vertex IDs.  Module vertices always come first in the graph,
+ * followed by proc (endpoint) vertices. */
+#define PROC_VERTEX(modex_idx) (mca_btl_usnic_component.num_modules + modex_idx)
+#define MODULE_VERTEX(module_idx) (module_idx)
+#define PROC_INDEX(proc_vertex) ((proc_vertex) - mca_btl_usnic_component.num_modules)
+#define MODULE_INDEX(module_vertex) (module_vertex)
+
 static void proc_construct(ompi_btl_usnic_proc_t* proc)
 {
     proc->proc_ompi = 0;
@@ -52,7 +59,8 @@ static void proc_construct(ompi_btl_usnic_proc_t* proc)
     proc->proc_modex_claimed = NULL;
     proc->proc_endpoints = NULL;
     proc->proc_endpoint_count = 0;
-    proc->proc_ep_weights = NULL;
+    proc->proc_ep_match_table = NULL;
+    proc->proc_match_exists = false;
 
     /* add to list of all proc instance */
     opal_list_append(&mca_btl_usnic_component.usnic_procs, &proc->super);
@@ -61,8 +69,6 @@ static void proc_construct(ompi_btl_usnic_proc_t* proc)
 
 static void proc_destruct(ompi_btl_usnic_proc_t* proc)
 {
-    uint32_t i;
-
     /* remove from list of all proc instances */
     opal_list_remove_item(&mca_btl_usnic_component.usnic_procs, &proc->super);
 
@@ -77,13 +83,9 @@ static void proc_destruct(ompi_btl_usnic_proc_t* proc)
         proc->proc_modex_claimed = NULL;
     }
 
-    if (NULL != proc->proc_ep_weights) {
-        for (i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
-            free(proc->proc_ep_weights[i]);
-            proc->proc_ep_weights[i] = NULL;
-        }
-        free(proc->proc_ep_weights);
-        proc->proc_ep_weights = NULL;
+    if (NULL != proc->proc_ep_match_table) {
+        free(proc->proc_ep_match_table);
+        proc->proc_ep_match_table = NULL;
     }
 
     /* Release all endpoints associated with this proc */
@@ -238,13 +240,16 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
 }
 
 /* Compare the addresses of the local interface corresponding to module and the
- * remote interface corresponding to proc_modex_addr. Returns a weight value. */
-static enum conn_weight compute_weight(
+ * remote interface corresponding to proc_modex_addr. Returns a weight value
+ * (higher values indicate more desirable connections). */
+static uint64_t compute_weight(
     ompi_btl_usnic_module_t *module,
     ompi_btl_usnic_addr_t *proc_modex_addr)
 {
     char my_ip_string[INET_ADDRSTRLEN], peer_ip_string[INET_ADDRSTRLEN];
     uint32_t mynet, peernet;
+    int err, metric;
+    uint32_t min_link_speed_gbps;
 
     inet_ntop(AF_INET, &module->if_ipv4_addr,
               my_ip_string, sizeof(my_ip_string));
@@ -257,31 +262,255 @@ static enum conn_weight compute_weight(
                                              module->if_cidrmask);
     peernet = ompi_btl_usnic_get_ipv4_subnet(proc_modex_addr->ipv4_addr,
                                              proc_modex_addr->cidrmask);
 
     opal_output_verbose(5, USNIC_OUT,
                         "btl:usnic:%s: checking my IP address/subnet (%s/%d) vs. peer (%s/%d): %s",
                         __func__, my_ip_string, module->if_cidrmask,
                         peer_ip_string, proc_modex_addr->cidrmask,
                         (mynet == peernet ? "match" : "DO NOT match"));
 
-    if (mynet == peernet) {
-        return WEIGHT_SAME_NET;
-    } else {
-        return WEIGHT_DIFF_NET;
+    if (!mca_btl_usnic_component.use_udp) {
+        if (mynet != peernet) {
+            return WEIGHT_UNREACHABLE;
+        } else {
+            return 1; /* any positive weight is fine */
+        }
     }
+
+    min_link_speed_gbps = MIN(module->super.btl_bandwidth,
+                              proc_modex_addr->link_speed_mbps) / 1000;
+
+    metric = 0;
+    err = ompi_btl_usnic_nl_ip_rt_lookup(mca_btl_usnic_component.unlsk,
+                                         module->if_ipv4_addr,
+                                         proc_modex_addr->ipv4_addr,
+                                         &metric);
+    if (0 != err) {
+        return 0; /* no connectivity */
+    }
+    else {
+        /* Format in binary    MSB                             LSB
+         * most sig. 32-bits:  00000000 0000000A BBBBBBBB 00000001
+         * least sig. 32-bits: CCCCCCCC CCCCCCCC CCCCCCCC CCCCCCCC
+         *
+         * A = 1 iff same subnet
+         * B = min link speed (in Gbps) between iface pair
+         * C = metric from routing table
+         *
+         * That is, this prioritizes interfaces in the same subnet first,
+         * followed by having the same link speed.  The extra literal "1" is in
+         * there to help prioritize over any zero-cost links that might
+         * otherwise make their way into the graph.  It is not strictly
+         * necessary and could be eliminated if the extra byte is needed.
+         *
+         * TODO add an MCA parameter to optionally swap the offsets of A and
+         * B, thereby prioritizing link speed over same subnet reachability.
+         */
+        /* FIXME how can we check that the metric is the same before we have
+         * communication with this host?  Mismatched metrics could cause the
+         * remote peer to make a different pairing decision... */
+        if (min_link_speed_gbps > 0xff) {
+            opal_output_verbose(20, USNIC_OUT, "clamping min_link_speed_gbps=%u to 255",
+                                min_link_speed_gbps);
+            min_link_speed_gbps = 0xff;
+        }
+        return ((uint64_t)(mynet == peernet) << 48) |
+               ((uint64_t)(min_link_speed_gbps & 0xff) << 40) |
+               ((uint64_t)0x1 << 32) |
+               (/*metric=*/0);
+    }
 }
 
+/* Populate the given proc's match table from an array of (u,v) edge pairs.
+ *
+ * (DJG: this unfortunately knows a bit too much about the internals of
+ * "match_modex")
+ */
+static void edge_pairs_to_match_table(
+    ompi_btl_usnic_proc_t *proc,
+    bool proc_is_left,
+    int nme,
+    int *me)
+{
+    int i;
+    int left, right;
+    int module_idx, proc_idx;
+    int num_modules;
+
+    num_modules = (int)mca_btl_usnic_component.num_modules;
+
+    assert(nme >= 0);
+    for (i = 0; i < nme; ++i) {
+        left  = me[2*i+0];
+        right = me[2*i+1];
+
+        if (proc_is_left) {
+            proc_idx = PROC_INDEX(left);
+            module_idx = MODULE_INDEX(right);
+        } else {
+            module_idx = MODULE_INDEX(left);
+            proc_idx = PROC_INDEX(right);
+        }
+        assert(module_idx >= 0 && module_idx < num_modules);
+        assert(proc_idx >= 0 && proc_idx < (int)proc->proc_modex_count);
+        proc->proc_ep_match_table[module_idx] = proc_idx;
+        proc->proc_match_exists = true;
+    }
+
+    /* emit match summary for debugging purposes */
+    for (i = 0; i < num_modules; ++i) {
+        if (-1 != proc->proc_ep_match_table[i]) {
+            opal_output_verbose(5, USNIC_OUT,
+                                "btl:usnic:%s: module[%d] (%p) should claim endpoint[%d] on proc %p",
+                                __func__, i,
+                                (void *)mca_btl_usnic_component.usnic_active_modules[i],
+                                proc->proc_ep_match_table[i], (void *)proc);
+        } else {
+            opal_output_verbose(5, USNIC_OUT,
+                                "btl:usnic:%s: module[%d] (%p) will NOT claim an endpoint on proc %p",
+                                __func__, i,
+                                (void *)mca_btl_usnic_component.usnic_active_modules[i],
+                                (void *)proc);
+        }
+    }
+}
+
+/**
+ * Constructs an interface graph from all local modules and the given proc's
+ * remote interfaces.  The resulting vertices will always have the module
+ * vertices appear before the proc vertices.
+ */
+static int create_proc_module_graph(
+    ompi_btl_usnic_proc_t *proc,
+    bool proc_is_left,
+    ompi_btl_usnic_graph_t **g_out)
+{
+    int err;
+    int i, j;
+    int u, v;
+    int num_modules;
+    ompi_btl_usnic_graph_t *g = NULL;
+
+    if (NULL == g_out) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+    *g_out = NULL;
+
+    num_modules = (int)mca_btl_usnic_component.num_modules;
+
+    /* Construct a bipartite graph with remote interfaces on the one side and
+     * local interfaces (modules) on the other. */
+    err = ompi_btl_usnic_gr_create(NULL, NULL, &g);
+    if (OMPI_SUCCESS != err) {
+        OMPI_ERROR_LOG(err);
+        goto out;
+    }
+
+    /* create vertices for each interface (local and remote) */
+    for (i = 0; i < num_modules; ++i) {
+        int idx = -1;
+        err = ompi_btl_usnic_gr_add_vertex(g,
+                                           mca_btl_usnic_component.usnic_active_modules[i],
+                                           &idx);
+        if (OMPI_SUCCESS != err) {
+            OMPI_ERROR_LOG(err);
+            goto out_free_graph;
+        }
+        assert(idx == MODULE_VERTEX(i));
+    }
+    for (i = 0; i < (int)proc->proc_modex_count; ++i) {
+        int idx = -1;
+        err = ompi_btl_usnic_gr_add_vertex(g, &proc->proc_modex[i], &idx);
+        if (OMPI_SUCCESS != err) {
+            OMPI_ERROR_LOG(err);
+            goto out_free_graph;
+        }
+        assert(idx == (int)PROC_VERTEX(i));
+    }
+
+    /* now add edges between interfaces that can communicate */
+    for (i = 0; i < num_modules; ++i) {
+        for (j = 0; j < (int)proc->proc_modex_count; ++j) {
+            int64_t weight, cost;
+
+            /* assumption: compute_weight returns the same weight on the
+             * remote process with these arguments (effectively) transposed */
+            weight = compute_weight(mca_btl_usnic_component.usnic_active_modules[i],
+                                    &proc->proc_modex[j]);
+
+            opal_output_verbose(20, USNIC_OUT,
+                                "btl:usnic:%s: weight=0x%016" PRIx64 " for edge module[%d] (%p) <--> endpoint[%d] on proc %p",
+                                __func__,
+                                weight, i,
+                                (void *)mca_btl_usnic_component.usnic_active_modules[i],
+                                j, (void *)proc);
+
+            if (WEIGHT_UNREACHABLE == weight) {
+                continue;
+            } else {
+                /* the graph code optimizes for minimum *cost*, but we have
+                 * been computing weights (negative costs) */
+                cost = -weight;
+            }
+            assert(INT64_MAX != cost);
+            assert(INT64_MIN != cost);
+
+            if (proc_is_left) {
+                u = PROC_VERTEX(j);
+                v = MODULE_VERTEX(i);
+            } else {
+                u = MODULE_VERTEX(i);
+                v = PROC_VERTEX(j);
+            }
+            opal_output_verbose(20, USNIC_OUT,
+                                "btl:usnic:%s: adding edge (%d,%d) with cost=%" PRIi64 " for edge module[%d] <--> endpoint[%d]",
+                                __func__, u, v, cost, i, j);
+            err = ompi_btl_usnic_gr_add_edge(g, u, v, cost,
+                                             /*capacity=*/1,
+                                             /*e_data=*/NULL);
+            if (OMPI_SUCCESS != err) {
+                OMPI_ERROR_LOG(err);
+                goto out_free_graph;
+            }
+        }
+    }
+
+    *g_out = g;
+    return OMPI_SUCCESS;
+
+out_free_graph:
+    ompi_btl_usnic_gr_free(g);
+out:
+    return err;
+}
+
 /*
  * For a specific module, see if this proc has matching address/modex
  * info.  If so, create an endpoint and return it.
+ *
+ * Implementation note: This code relies on the order of modules on a local
+ * side matching the order of the modex entries that we send around, otherwise
+ * both sides may not agree on a bidirectional connection.  It also assumes
+ * that add_procs will be invoked on the local modules in that same order, for
+ * the same reason.  If those assumptions do not hold, we will need to
+ * canonicalize this match ordering somehow, probably by (jobid,vpid) pair or
+ * by the interface MAC or IP address.
  */
 static int match_modex(ompi_btl_usnic_module_t *module,
-                       ompi_btl_usnic_proc_t *proc)
+                       ompi_btl_usnic_proc_t *proc,
+                       int *index_out)
 {
-    size_t i, j;
-    int8_t **weights;
+    int err = OMPI_SUCCESS;
+    size_t i;
     uint32_t num_modules;
-    int modex_index = -1;
+    ompi_btl_usnic_graph_t *g = NULL;
+    int nme;
+    int *me;
+    bool proc_is_left;
+
+    if (NULL == index_out) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+    *index_out = -1;
 
     num_modules = mca_btl_usnic_component.num_modules;
 
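The solver returns the matching as a flat array of (u,v) vertex pairs. As a
standalone sketch (the solver output and sizes here are hypothetical, not
taken from the commit), this is how such pairs fold back into the per-module
match table, the way edge_pairs_to_match_table() above does via the
MODULE_INDEX()/PROC_INDEX() macros:

    #include <assert.h>
    #include <stdio.h>

    /* Modules occupy vertices [0, NUM_MODULES); proc interfaces come after,
     * following the MODULE_VERTEX()/PROC_VERTEX() numbering convention. */
    int main(void)
    {
        enum { NUM_MODULES = 2, NUM_PROCS = 2 };
        /* hypothetical solver output: (module 0, proc vertex 2) and
         * (module 1, proc vertex 3), i.e., proc indices 0 and 1 */
        int me[] = { 0, 2, 1, 3 };
        int nme = 2;
        int match_table[NUM_MODULES] = { -1, -1 };  /* -1 == "no pairing" */

        for (int i = 0; i < nme; ++i) {
            int module_idx = me[2 * i + 0];               /* MODULE_INDEX(u) */
            int proc_idx   = me[2 * i + 1] - NUM_MODULES; /* PROC_INDEX(v)   */
            assert(proc_idx >= 0 && proc_idx < NUM_PROCS);
            match_table[module_idx] = proc_idx;
        }
        for (int i = 0; i < NUM_MODULES; ++i) {
            printf("module[%d] -> proc_modex[%d]\n", i, match_table[i]);
        }
        return 0;
    }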
@@ -289,102 +518,99 @@ static int match_modex(ompi_btl_usnic_module_t *module,
                         __func__, (void *)module, (void *)proc,
                         num_modules, (int)proc->proc_modex_count);
 
-    /* We compute an interface match-up weights table once for each
-     * (module,proc) pair and cache it in the proc.  Store per-proc instead of
-     * per-module, since MPI dynamic process routines can add procs but not new
-     * modules. */
-    if (NULL == proc->proc_ep_weights) {
-        proc->proc_ep_weights = malloc(num_modules *
-                                       sizeof(*proc->proc_ep_weights));
-        if (NULL == proc->proc_ep_weights) {
+    /* We compute an interface match-up table once for each (module,proc) pair
+     * and cache it in the proc.  Store per-proc instead of per-module, since
+     * MPI dynamic process routines can add procs but not new modules. */
+    if (NULL == proc->proc_ep_match_table) {
+        proc->proc_ep_match_table = malloc(num_modules *
+                                           sizeof(*proc->proc_ep_match_table));
+        if (NULL == proc->proc_ep_match_table) {
             OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
-            return -1;
+            return OMPI_ERR_OUT_OF_RESOURCE;
         }
-        proc->proc_ep_max_weight = WEIGHT_UNREACHABLE;
 
-        weights = proc->proc_ep_weights;
-
+        /* initialize to "no matches" */
         for (i = 0; i < num_modules; ++i) {
-            weights[i] = malloc(proc->proc_modex_count * sizeof(*weights[i]));
-            if (NULL == weights[i]) {
-                OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
-
-                /* free everything allocated so far */
-                for (j = 0; j < i; ++j) {
-                    free(proc->proc_ep_weights[j]);
-                }
-                free(proc->proc_ep_weights);
-                return -1;
-            }
-
-            /* compute all weights */
-            for (j = 0; j < proc->proc_modex_count; ++j) {
-                weights[i][j] = compute_weight(mca_btl_usnic_component.usnic_active_modules[i],
-                                               &proc->proc_modex[j]);
-                if (!mca_btl_usnic_component.use_udp &&
-                    WEIGHT_DIFF_NET == weights[i][j]) {
-                    /* UDP is required for routability */
-                    weights[i][j] = WEIGHT_UNREACHABLE;
-                }
-                if (weights[i][j] > proc->proc_ep_max_weight) {
-                    proc->proc_ep_max_weight = weights[i][j];
-                }
-            }
+            proc->proc_ep_match_table[i] = -1;
         }
+
+        /* For graphs where all edges are equal (and even for some other
+         * graphs), two peers making matching calculations with "mirror image"
+         * graphs might not end up with the same matching.  Ensure that both
+         * sides are always setting up the exact same graph by always putting
+         * the process with the lower (jobid,vpid) on the "left".
+         */
+        proc_is_left =
+            (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
+                                          &proc->proc_ompi->proc_name,
+                                          &(ompi_proc_local()->proc_name)) < 0);
+
+        err = create_proc_module_graph(proc, proc_is_left, &g);
+        if (OMPI_SUCCESS != err) {
+            goto out_free_table;
+        }
+
+        nme = 0;
+        err = ompi_btl_usnic_solve_bipartite_assignment(g, &nme, &me);
+        if (OMPI_SUCCESS != err) {
+            OMPI_ERROR_LOG(err);
+            goto out_free_graph;
+        }
+
+        edge_pairs_to_match_table(proc, proc_is_left, nme, me);
+
+        err = ompi_btl_usnic_gr_free(g);
+        if (OMPI_SUCCESS != err) {
+            OMPI_ERROR_LOG(err);
+            return err;
+        }
     }
 
-    if (WEIGHT_UNREACHABLE == proc->proc_ep_max_weight) {
+    if (!proc->proc_match_exists) {
         opal_output_verbose(5, USNIC_OUT, "btl:usnic:%s: unable to find any valid interface pairs for proc %s",
                             __func__, OMPI_NAME_PRINT(&proc->proc_ompi->proc_name));
-        return -1;
+        return OMPI_ERR_NOT_FOUND;
     }
 
-    weights = proc->proc_ep_weights;
-
-    /* Each module can claim an address in the proc's modex info that no other
-     * local module is using.  Take the first maximal interface pairing where
-     * the remote interface is not yet claimed.  If unclaimed remote interfaces
-     * remain but their pairings are non-maximal, they will not be used.
-     *
-     * This code relies on the order of modules on a local side matching the
-     * order of the modex entries that we send around, otherwise both sides may
-     * not agree on a bidirectional connection.  It also assumes that add_procs
-     * will be invoked on the local modules in that same order, for the same
-     * reason.  If those assumptions do not hold, we will need to canonicalize
-     * this match ordering somehow, probably by (jobid,vpid) pair or by the
-     * interface MAC or IP address. */
-    for (i = 0; i < num_modules; ++i) {
-        if (mca_btl_usnic_component.usnic_active_modules[i] == module) {
-            for (j = 0; j < proc->proc_modex_count; ++j) {
-                if (!proc->proc_modex_claimed[j] &&
-                    weights[i][j] == proc->proc_ep_max_weight) {
-                    opal_output_verbose(5, USNIC_OUT,
-                                        "module[%d] (%p) claiming endpoint[%d] on proc %p",
-                                        (int)i, (void *)module, (int)j,
-                                        (void *)proc);
-                    modex_index = j;
-                    break;
-                }
+    /* assuming no strange failure cases, this should always be present */
+    if (NULL != proc->proc_ep_match_table && proc->proc_match_exists) {
+        for (i = 0; i < num_modules; ++i) {
+            if (module == mca_btl_usnic_component.usnic_active_modules[i]) {
+                *index_out = proc->proc_ep_match_table[i];
+                break;
             }
         }
     }
 
-    /* If MTU does not match, throw an error */
-    if (modex_index >= 0 &&
-        proc->proc_modex[modex_index].mtu != module->if_mtu) {
+    /* TODO with UDP, do we still want to enforce this restriction or just take
+     * the min of the two MTUs?  Another choice is to disqualify this pairing
+     * before running the matching algorithm on it. */
+    if (*index_out >= 0 &&
+        proc->proc_modex[*index_out].mtu != module->if_mtu) {
         opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
-                       true,
-                       ompi_process_info.nodename,
-                       ibv_get_device_name(module->device),
-                       module->port_num,
-                       module->if_mtu,
-                       (NULL == proc->proc_ompi->proc_hostname) ?
-                       "unknown" : proc->proc_ompi->proc_hostname,
-                       proc->proc_modex[i].mtu);
-        return -1;
+                       true,
+                       ompi_process_info.nodename,
+                       ibv_get_device_name(module->device),
+                       module->port_num,
+                       module->if_mtu,
+                       (NULL == proc->proc_ompi->proc_hostname) ?
+                       "unknown" : proc->proc_ompi->proc_hostname,
+                       proc->proc_modex[*index_out].mtu);
+        *index_out = -1;
+        return OMPI_ERR_UNREACH;
     }
 
-    return modex_index;
+    return (*index_out == -1 ? OMPI_ERR_NOT_FOUND : OMPI_SUCCESS);
+
+out_free_graph:
+    ompi_btl_usnic_gr_free(g);
+out_free_table:
+    free(proc->proc_ep_match_table);
+    proc->proc_ep_match_table = NULL;
+    proc->proc_match_exists = false;
+    return err;
 }
 
 /*
@@ -395,17 +621,18 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
                                ompi_btl_usnic_proc_t *proc,
                                ompi_btl_usnic_endpoint_t **endpoint_o)
 {
+    int err;
     int modex_index;
     struct ibv_ah_attr ah_attr;
    ompi_btl_usnic_endpoint_t *endpoint;
 
     /* look for matching modex info */
-    modex_index = match_modex(module, proc);
-    if (modex_index < 0) {
+    err = match_modex(module, proc, &modex_index);
+    if (OMPI_SUCCESS != err) {
         opal_output_verbose(5, USNIC_OUT,
-                            "btl:usnic:create_endpoint: did not find usnic modex info for peer %s",
+                            "btl:usnic:create_endpoint: did not match usnic modex info for peer %s",
                             OMPI_NAME_PRINT(&proc->proc_ompi->proc_name));
-        return OMPI_ERR_NOT_FOUND;
+        return err;
     }
 
     endpoint = OBJ_NEW(ompi_btl_usnic_endpoint_t);
@@ -415,6 +642,7 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
 
     /* Initalize the endpoint */
     endpoint->endpoint_module = module;
+    assert(modex_index >= 0 && modex_index < (int)proc->proc_modex_count);
     endpoint->endpoint_remote_addr = proc->proc_modex[modex_index];
 
     /* Initialize endpoint sequence number info */
@@ -11,7 +11,7 @@
  * All rights reserved.
  * Copyright (c) 2006 Sandia National Laboratories. All rights
  * reserved.
- * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -58,15 +58,23 @@ typedef struct ompi_btl_usnic_proc_t {
     size_t proc_endpoint_count;
 
     /**
-     * Communication weights between local interfaces (modules) and remote
-     * interfaces (nascent endpoints). proc_ep_weights[i][j] is the weight
-     * between usnic_active_modules[i] and the interface corresponding to
-     * proc->proc_modex[j].
+     * A table giving the chosen pairing between modules and endpoint
+     * addresses.  It has size mca_btl_usnic_component.num_modules.
+     * j=proc_ep_match_table[i] means that
+     * mca_btl_usnic_component.usnic_active_modules[i] should be paired with
+     * proc_modex[j].  If there is no pairing for proc_modex[i] then
+     * proc_ep_match_table[i] will be set to -1
+     *
+     * If matchings have not yet been computed for this proc, the pointer will
+     * be NULL.
      */
-    int8_t **proc_ep_weights;
+    int *proc_ep_match_table;
 
-    /** greatest weight value (not location) found in proc_ep_weights */
-    int8_t proc_ep_max_weight;
+    /**
+     * true iff proc_ep_match_table != NULL and it contains at least one entry
+     * that is not equal to -1.
+     */
+    bool proc_match_exists;
 } ompi_btl_usnic_proc_t;
 
 OBJ_CLASS_DECLARATION(ompi_btl_usnic_proc_t);
@@ -15,6 +15,10 @@
 #include "btl_usnic.h"
 #include "btl_usnic_module.h"
 
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
 /* avoid "defined but not used" warnings */
 static inline int __opal_attribute_always_inline__ usnic_fls(int x)
     __opal_attribute_unused__;
@@ -1,6 +1,6 @@
 # -*- text -*-
 #
-# Copyright (c) 2012-2013 Cisco Systems, Inc. All rights reserved.
+# Copyright (c) 2012-2014 Cisco Systems, Inc. All rights reserved.
 #
 # $COPYRIGHT$
 #
@@ -192,3 +192,9 @@ list of decimal verbs vendor part IDs. This usnic BTL will be ignored
 for this job.
 
   btl_usnic_vendor_part_ids value: %s
+#
+[rtnetlink init fail]
+The usnic BTL failed to initialize the rtnetlink query subsystem.
+
+  Server: %s
+  Error message: %s