c40f8879c8
Prior to this commit we matched local interfaces to remote interfaces in order to create endpoints in a simplistic way. If any remote interfaces were on the same subnet as any of our local interfaces then only local interfaces would be paired (IP-routed remote interfaces would be ignored). This commit introduces a more general scheme which attempts to make the "best" pairing of local interfaces to remote interfaces. We now cast the problem as a graph theory problem known as the "Assignment Problem", or finding a maximum-cardinality, minimum-weight bipartite matching. We solve this problem by reducing the bipartite graph of interface connectivity to a flow network and then solving for a minimum cost flow. This is then easily converted back into a matching on the original bipartite graph. In the new scheme, interfaces on the same subnet are preferred over interfaces requiring intermediate routing hops and higher bandwidth links are preferred over lower bandwidth links. Reviewed-by: Jeff Squyres <jsquyres@cisco.com> cmr=v1.7.5:ticket=trac:4253 This commit was SVN r30849. The following Trac tickets were found above: Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
187 строки
6.5 KiB
C
187 строки
6.5 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
|
* reserved.
|
|
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#ifndef OMPI_BTL_USNIC_ENDPOINT_H
|
|
#define OMPI_BTL_USNIC_ENDPOINT_H
|
|
|
|
#include <infiniband/verbs.h>
|
|
|
|
#include "opal/class/opal_list.h"
|
|
#include "opal/class/opal_hotel.h"
|
|
#include "opal/mca/event/event.h"
|
|
|
|
#include "btl_usnic.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/*
 * Forward declarations to avoid include loops
 */
struct ompi_btl_usnic_module_t;
struct ompi_btl_usnic_send_segment_t;
|
|
|
|
/*
 * Have the window size as a compile-time constant that is a power of
 * two so that we can take advantage of fast bit operations.
 */
#define WINDOW_SIZE 4096

/* WINDOW_SIZE_MOD() masks with (WINDOW_SIZE - 1), which only equals a
   modulo operation when WINDOW_SIZE is a power of two; refuse to compile
   if someone changes the constant to anything else. */
#if (WINDOW_SIZE <= 0) || ((WINDOW_SIZE & (WINDOW_SIZE - 1)) != 0)
#error "WINDOW_SIZE must be a power of two"
#endif

/* Reduce a value modulo WINDOW_SIZE using a bit mask */
#define WINDOW_SIZE_MOD(a) (((a) & (WINDOW_SIZE - 1)))

/* True when SEQ_LT() (defined elsewhere) holds between the next sequence
   number to send and the last ACKed sequence number plus WINDOW_SIZE.
   E is a pointer to an endpoint and is evaluated more than once, so it
   must not have side effects. */
#define WINDOW_OPEN(E) (SEQ_LT((E)->endpoint_next_seq_to_send, \
        ((E)->endpoint_ack_seq_rcvd + WINDOW_SIZE)))

/* True when the last ACKed sequence number is exactly one behind the
   next sequence number to send.  E is evaluated more than once. */
#define WINDOW_EMPTY(E) ((E)->endpoint_ack_seq_rcvd == \
        ((E)->endpoint_next_seq_to_send-1))
|
|
|
|
/*
 * Returns true when an endpoint has nothing left to send: the send
 * window is empty (WINDOW_EMPTY) and the endpoint's fragment send queue
 * is an empty list.  E is a pointer to an endpoint and is evaluated more
 * than once, so it must not have side effects.
 */
#define ENDPOINT_DRAINED(E) (WINDOW_EMPTY(E) && \
        opal_list_is_empty(&(E)->endpoint_frag_send_queue))
|
|
|
|
/*
 * Channel IDs
 *
 * USNIC_NUM_CHANNELS must stay last: it is the number of real channels
 * and is used to size per-channel arrays (e.g. qp_num[] in
 * ompi_btl_usnic_addr_t).
 */
typedef enum ompi_btl_usnic_channel_id_t {
    USNIC_PRIORITY_CHANNEL,
    USNIC_DATA_CHANNEL,
    USNIC_NUM_CHANNELS
} ompi_btl_usnic_channel_id_t;
|
|
|
|
typedef struct ompi_btl_usnic_addr_t {
|
|
ompi_btl_usnic_seq_t isn;
|
|
uint32_t qp_num[USNIC_NUM_CHANNELS];
|
|
union ibv_gid gid;
|
|
uint32_t ipv4_addr;
|
|
uint32_t cidrmask;
|
|
uint8_t mac[6];
|
|
int mtu;
|
|
uint32_t link_speed_mbps;
|
|
} ompi_btl_usnic_addr_t;
|
|
|
|
/* Forward declarations; only pointers to these types are used below */
struct ompi_btl_usnic_send_segment_t;
struct ompi_btl_usnic_proc_t;
|
|
|
|
/*
|
|
* This is a descriptor for an incoming fragment that is broken
|
|
* into chunks. When the first reference to this frag_id is seen,
|
|
* memory is allocated for it. When the last byte arrives, the assembled
|
|
* fragment is passed to the PML.
|
|
*
|
|
* The endpoint structure has space for WINDOW_SIZE/2 simultaneous fragments.
|
|
* This is the largest number of fragments that can possibly be in-flight
|
|
* to us from a particular endpoint because eash chunked fragment will occupy
|
|
* at least two segments, and only WINDOW_SIZE segments can be in flight.
|
|
* OK, so there is an extremely pathological case where we could see
|
|
* (WINDOW_SIZE/2)+1 "in flight" at once, but just dropping that last one
|
|
* and waiting for retrans is just fine in this hypothetical hyper-pathological
|
|
* case, which is what we'll do.
|
|
*/
|
|
#define MAX_ACTIVE_FRAGS (WINDOW_SIZE/2)
|
|
typedef struct ompi_btl_usnic_rx_frag_info_t {
|
|
uint32_t rfi_frag_id; /* ID for this fragment */
|
|
uint32_t rfi_frag_size; /* bytes in this fragment */
|
|
uint32_t rfi_bytes_left; /* bytes remaining to RX in fragment */
|
|
char *rfi_data; /* pointer to assembly area */
|
|
int rfi_data_pool; /* if 0, data malloced, else rx buf pool */
|
|
ompi_free_list_item_t *rfi_fl_elt; /* free list elemement from buf pool
|
|
when rfi_data_pool is nonzero */
|
|
} ompi_btl_usnic_rx_frag_info_t;
|
|
|
|
/**
|
|
* An abstraction that represents a connection to a remote process.
|
|
* An instance of mca_btl_base_endpoint_t is associated with each
|
|
* (btl_usnic_proc_t, btl_usnic_module_t) tuple and address
|
|
* information is exchanged at startup. The usnic BTL is
|
|
* connectionless, so no connection is ever established.
|
|
*/
|
|
typedef struct mca_btl_base_endpoint_t {
|
|
opal_list_item_t super;
|
|
|
|
/** BTL module that created this connection */
|
|
struct ompi_btl_usnic_module_t *endpoint_module;
|
|
|
|
/** proc that owns this endpoint */
|
|
struct ompi_btl_usnic_proc_t *endpoint_proc;
|
|
int endpoint_proc_index; /* index in owning proc's endpoint array */
|
|
|
|
/** True when proc has been deleted, but still have sends that need ACKs */
|
|
bool endpoint_exiting;
|
|
|
|
/** List item for linking into module "all_endpoints" */
|
|
opal_list_item_t endpoint_endpoint_li;
|
|
|
|
/** List item for linking into "need ack" */
|
|
opal_list_item_t endpoint_ack_li;
|
|
|
|
/** Remote address information */
|
|
ompi_btl_usnic_addr_t endpoint_remote_addr;
|
|
|
|
/** Remote address handle */
|
|
struct ibv_ah* endpoint_remote_ah;
|
|
|
|
/** Send-related data */
|
|
bool endpoint_ready_to_send;
|
|
opal_list_t endpoint_frag_send_queue;
|
|
int32_t endpoint_send_credits;
|
|
uint32_t endpoint_next_frag_id;
|
|
|
|
/** Receive-related data */
|
|
struct ompi_btl_usnic_rx_frag_info_t *endpoint_rx_frag_info;
|
|
|
|
/** OPAL hotel to track outstanding stends */
|
|
opal_hotel_t endpoint_hotel;
|
|
|
|
/** Sliding window parameters for this peer */
|
|
/* Values for the current proc to send to this endpoint on the
|
|
peer proc */
|
|
ompi_btl_usnic_seq_t endpoint_next_seq_to_send; /* n_t */
|
|
ompi_btl_usnic_seq_t endpoint_ack_seq_rcvd; /* n_a */
|
|
|
|
struct ompi_btl_usnic_send_segment_t *endpoint_sent_segs[WINDOW_SIZE];
|
|
|
|
/* Values for the current proc to receive from this endpoint on
|
|
the peer proc */
|
|
bool endpoint_ack_needed;
|
|
|
|
/* When we receive a packet that needs an ACK, set this
|
|
* to delay the ACK to allow for piggybacking
|
|
*/
|
|
uint64_t endpoint_acktime;
|
|
|
|
ompi_btl_usnic_seq_t endpoint_next_contig_seq_to_recv; /* n_r */
|
|
ompi_btl_usnic_seq_t endpoint_highest_seq_rcvd; /* n_s */
|
|
|
|
bool endpoint_rcvd_segs[WINDOW_SIZE];
|
|
uint32_t endpoint_rfstart;
|
|
} mca_btl_base_endpoint_t;
|
|
|
|
/* usnic-specific alias for the BTL base endpoint type, plus its class
   declaration via OBJ_CLASS_DECLARATION */
typedef mca_btl_base_endpoint_t ompi_btl_usnic_endpoint_t;
OBJ_CLASS_DECLARATION(ompi_btl_usnic_endpoint_t);
|
|
|
|
/*
|
|
* Flush all pending sends and resends from and endpoint
|
|
*/
|
|
void
|
|
ompi_btl_usnic_flush_endpoint(
|
|
ompi_btl_usnic_endpoint_t *endpoint);
|
|
|
|
END_C_DECLS
|
|
#endif
|