c40f8879c8
Prior to this commit we matched local interfaces to remote interfaces in order to create endpoints in a simplistic way. If any remote interfaces were on the same subnet as any of our local interfaces then only local interfaces would be paired (IP-routed remote interfaces would be ignored). This commit introduces a more general scheme which attempts to make the "best" pairing of local interfaces to remote interfaces. We now cast the problem as a graph theory problem known as the "Assignment Problem", or finding a maximum-cardinality, minimum-weight bipartite matching. We solve this problem by reducing the bipartite graph of interface connectivity to a flow network and then solving for a minimum cost flow. This is then easily converted back into a matching on the original bipartite graph. In the new scheme, interfaces on the same subnet are preferred over interfaces requiring intermediate routing hops and higher bandwidth links are preferred over lower bandwidth links. Reviewed-by: Jeff Squyres <jsquyres@cisco.com> cmr=v1.7.5:ticket=trac:4253 This commit was SVN r30849. The following Trac tickets were found above: Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
273 строки
7.8 KiB
C
273 строки
7.8 KiB
C
/*
|
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
|
* reserved.
|
|
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/**
|
|
* @file
|
|
*/
|
|
#ifndef OMPI_BTL_USNIC_MODULE_H
|
|
#define OMPI_BTL_USNIC_MODULE_H
|
|
|
|
#include "opal/class/opal_pointer_array.h"
|
|
|
|
#include "ompi/mca/common/verbs/common_verbs.h"
|
|
|
|
#include "btl_usnic_endpoint.h"
|
|
#include "btl_usnic_stats.h"
|
|
|
|
/*
 * Default limits.
 *
 * These values were obtained from empirical testing on Intel E5-2690
 * machines with Sereno/Lexington cards through an N3546 switch.
 *
 * NOTE(review): judging by the names, the two EAGER_LIMIT values are
 * the defaults for a single usNIC device versus several devices, and
 * all four are presumably byte counts -- confirm against the code that
 * consumes these macros.
 */
#define USNIC_DFLT_EAGER_LIMIT_1DEVICE (150 * 1024)
#define USNIC_DFLT_EAGER_LIMIT_NDEVICES (25 * 1024)
#define USNIC_DFLT_RNDV_EAGER_LIMIT 500
#define USNIC_DFLT_PACK_LAZY_THRESHOLD (16 * 1024)
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/*
 * Forward declarations to avoid include loops.
 *
 * Only the struct tags are introduced here; this header refers to these
 * types exclusively through pointers, so the full definitions are not
 * required at this point.
 */
struct ompi_btl_usnic_send_segment_t;
struct ompi_btl_usnic_recv_segment_t;
|
|
|
|
/*
|
|
* Abstraction of a set of IB queues
|
|
*/
|
|
typedef struct ompi_btl_usnic_channel_t {
|
|
int chan_index;
|
|
|
|
struct ibv_cq *cq;
|
|
|
|
int chan_mtu;
|
|
int chan_rd_num;
|
|
int chan_sd_num;
|
|
|
|
/** available send WQ entries */
|
|
int32_t sd_wqe;
|
|
|
|
/* fastsend enabled if sd_wqe >= fastsend_wqe_thresh */
|
|
int fastsend_wqe_thresh;
|
|
|
|
/* pointer to receive segment whose bookkeeping has been deferred */
|
|
struct ompi_btl_usnic_recv_segment_t *chan_deferred_recv;
|
|
|
|
/** queue pair */
|
|
struct ibv_qp* qp;
|
|
|
|
struct ibv_recv_wr *repost_recv_head;
|
|
|
|
/** receive segments & buffers */
|
|
ompi_free_list_t recv_segs;
|
|
|
|
bool chan_error; /* set when error detected on channel */
|
|
|
|
/* statistics */
|
|
uint32_t num_channel_sends;
|
|
} ompi_btl_usnic_channel_t;
|
|
|
|
/**
|
|
* usNIC verbs BTL interface
|
|
*/
|
|
typedef struct ompi_btl_usnic_module_t {
|
|
mca_btl_base_module_t super;
|
|
|
|
/* Cache for use during component_init to associate a module with
|
|
the ompi_common_verbs_port_item_t that it came from. */
|
|
ompi_common_verbs_port_item_t *port;
|
|
|
|
mca_btl_base_module_error_cb_fn_t pml_error_callback;
|
|
|
|
/* Information about the usNIC verbs device */
|
|
uint8_t port_num;
|
|
struct ibv_device *device;
|
|
struct ibv_context *device_context;
|
|
struct event device_async_event;
|
|
bool device_async_event_active;
|
|
struct ibv_pd *pd;
|
|
int numa_distance; /* hwloc NUMA distance from this process */
|
|
|
|
/* Information about the IP interface corresponding to this USNIC
|
|
interface */
|
|
char if_name[64];
|
|
uint32_t if_ipv4_addr; /* in network byte order */
|
|
uint32_t if_cidrmask; /* X in "/X" CIDR addr fmt, host byte order */
|
|
uint8_t if_mac[6];
|
|
int if_mtu;
|
|
|
|
/** desired send, receive, and completion queue entries (from MCA
|
|
params; cached here on the component because the MCA param
|
|
might == 0, which means "max supported on that device") */
|
|
int sd_num;
|
|
int rd_num;
|
|
int cq_num;
|
|
int prio_sd_num;
|
|
int prio_rd_num;
|
|
|
|
/*
|
|
* Fragments larger than max_frag_payload will be broken up into
|
|
* multiple chunks. The amount that can be held in a single chunk
|
|
* segment is slightly less than what can be held in frag segment due
|
|
* to fragment reassembly info.
|
|
*/
|
|
size_t tiny_mtu;
|
|
size_t max_frag_payload; /* most that fits in a frag segment */
|
|
size_t max_chunk_payload; /* most that can fit in chunk segment */
|
|
size_t max_tiny_payload; /* threshold for using inline send */
|
|
|
|
/** Hash table to keep track of senders */
|
|
opal_hash_table_t senders;
|
|
|
|
/** local address information */
|
|
struct ompi_btl_usnic_addr_t local_addr;
|
|
|
|
/** list of all endpoints */
|
|
opal_list_t all_endpoints;
|
|
|
|
/** array of procs used by this module (can't use a list because a
|
|
proc can be used by multiple modules) */
|
|
opal_pointer_array_t all_procs;
|
|
|
|
/** send fragments & buffers */
|
|
ompi_free_list_t small_send_frags;
|
|
ompi_free_list_t large_send_frags;
|
|
ompi_free_list_t put_dest_frags;
|
|
ompi_free_list_t chunk_segs;
|
|
|
|
/** receive buffer pools */
|
|
int first_pool;
|
|
int last_pool;
|
|
ompi_free_list_t *module_recv_buffers;
|
|
|
|
/** list of endpoints with data to send */
|
|
/* this list uses base endpoint ptr */
|
|
opal_list_t endpoints_with_sends;
|
|
|
|
/** list of send frags that are waiting to be resent (they
|
|
previously deferred because of lack of resources) */
|
|
opal_list_t pending_resend_segs;
|
|
|
|
/** ack segments */
|
|
ompi_free_list_t ack_segs;
|
|
|
|
/** list of endpoints to which we need to send ACKs */
|
|
/* this list uses endpoint->endpoint_ack_li */
|
|
opal_list_t endpoints_that_need_acks;
|
|
|
|
/* abstract queue-pairs into channels */
|
|
ompi_btl_usnic_channel_t mod_channels[USNIC_NUM_CHANNELS];
|
|
|
|
uint32_t qp_max_inline;
|
|
|
|
/* Performance / debugging statistics */
|
|
ompi_btl_usnic_module_stats_t stats;
|
|
} ompi_btl_usnic_module_t;
|
|
|
|
struct ompi_btl_usnic_frag_t;
|
|
extern ompi_btl_usnic_module_t ompi_btl_usnic_module_template;
|
|
|
|
/*
|
|
* Manipulate the "endpoints_that_need_acks" list
|
|
*/
|
|
|
|
/* get first endpoint needing ACK */
|
|
static inline ompi_btl_usnic_endpoint_t *
|
|
ompi_btl_usnic_get_first_endpoint_needing_ack(
|
|
ompi_btl_usnic_module_t *module)
|
|
{
|
|
opal_list_item_t *item;
|
|
ompi_btl_usnic_endpoint_t *endpoint;
|
|
|
|
item = opal_list_get_first(&module->endpoints_that_need_acks);
|
|
if (item != opal_list_get_end(&module->endpoints_that_need_acks)) {
|
|
endpoint = container_of(item, mca_btl_base_endpoint_t, endpoint_ack_li);
|
|
return endpoint;
|
|
} else {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* get next item in chain */
|
|
static inline ompi_btl_usnic_endpoint_t *
|
|
ompi_btl_usnic_get_next_endpoint_needing_ack(
|
|
ompi_btl_usnic_endpoint_t *endpoint)
|
|
{
|
|
opal_list_item_t *item;
|
|
ompi_btl_usnic_module_t *module;
|
|
|
|
module = endpoint->endpoint_module;
|
|
|
|
item = opal_list_get_next(&(endpoint->endpoint_ack_li));
|
|
if (item != opal_list_get_end(&module->endpoints_that_need_acks)) {
|
|
endpoint = container_of(item, mca_btl_base_endpoint_t, endpoint_ack_li);
|
|
return endpoint;
|
|
} else {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
ompi_btl_usnic_remove_from_endpoints_needing_ack(
|
|
ompi_btl_usnic_endpoint_t *endpoint)
|
|
{
|
|
opal_list_remove_item(
|
|
&(endpoint->endpoint_module->endpoints_that_need_acks),
|
|
&endpoint->endpoint_ack_li);
|
|
endpoint->endpoint_ack_needed = false;
|
|
endpoint->endpoint_acktime = 0;
|
|
#if MSGDEBUG1
|
|
opal_output(0, "clear ack_needed on %p\n", (void*)endpoint);
|
|
#endif
|
|
}
|
|
|
|
static inline void
|
|
ompi_btl_usnic_add_to_endpoints_needing_ack(
|
|
ompi_btl_usnic_endpoint_t *endpoint)
|
|
{
|
|
opal_list_append(&(endpoint->endpoint_module->endpoints_that_need_acks),
|
|
&endpoint->endpoint_ack_li);
|
|
endpoint->endpoint_ack_needed = true;
|
|
#if MSGDEBUG1
|
|
opal_output(0, "set ack_needed on %p\n", (void*)endpoint);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Initialize a module
|
|
*/
|
|
int ompi_btl_usnic_module_init(ompi_btl_usnic_module_t* module);
|
|
|
|
|
|
/*
|
|
* Progress pending sends on a module
|
|
*/
|
|
void ompi_btl_usnic_module_progress_sends(ompi_btl_usnic_module_t *module);
|
|
|
|
/* opal_output statistics that are useful for debugging */
|
|
void ompi_btl_usnic_print_stats(
|
|
ompi_btl_usnic_module_t *module,
|
|
const char *prefix,
|
|
bool reset_stats);
|
|
|
|
END_C_DECLS
|
|
#endif
|