552c9ca5a0
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have express interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK with support from Sandia, developped a version of Open MPI where the entire communication infrastucture has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic. This commit was SVN r32317.
191 строка
6.7 KiB
C
191 строка
6.7 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
|
* reserved.
|
|
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#ifndef OPAL_BTL_USNIC_ENDPOINT_H
|
|
#define OPAL_BTL_USNIC_ENDPOINT_H
|
|
|
|
#include <infiniband/verbs.h>
|
|
|
|
#include "opal/class/opal_list.h"
|
|
#include "opal/class/opal_hotel.h"
|
|
#include "opal/mca/event/event.h"
|
|
|
|
#include "btl_usnic.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/*
|
|
* Forward declarations to avoid include loops
|
|
*/
|
|
struct opal_btl_usnic_module_t;
|
|
struct opal_btl_usnic_send_segment_t;
|
|
|
|
/*
|
|
* Have the window size as a compile-time constant that is a power of
|
|
* two so that we can take advantage of fast bit operations.
|
|
*/
|
|
#define WINDOW_SIZE 4096
|
|
#define WINDOW_SIZE_MOD(a) (((a) & (WINDOW_SIZE - 1)))
|
|
#define WINDOW_OPEN(E) (SEQ_LT((E)->endpoint_next_seq_to_send, \
|
|
((E)->endpoint_ack_seq_rcvd + WINDOW_SIZE)))
|
|
#define WINDOW_EMPTY(E) ((E)->endpoint_ack_seq_rcvd == \
|
|
((E)->endpoint_next_seq_to_send-1))
|
|
|
|
/*
|
|
* Returns true when an endpoint has nothing left to send
|
|
*/
|
|
#define ENDPOINT_DRAINED(E) (WINDOW_EMPTY(E) && \
|
|
opal_list_is_empty(&(E)->endpoint_frag_send_queue))
|
|
|
|
/*
|
|
* Channel IDs
|
|
*/
|
|
typedef enum opal_btl_usnic_channel_id_t {
|
|
USNIC_PRIORITY_CHANNEL,
|
|
USNIC_DATA_CHANNEL,
|
|
USNIC_NUM_CHANNELS
|
|
} opal_btl_usnic_channel_id_t;
|
|
|
|
typedef struct opal_btl_usnic_addr_t {
|
|
union ibv_gid gid;
|
|
uint32_t qp_num[USNIC_NUM_CHANNELS];
|
|
uint32_t ipv4_addr;
|
|
uint32_t cidrmask;
|
|
uint32_t connectivity_udp_port;
|
|
uint32_t link_speed_mbps;
|
|
uint16_t mtu;
|
|
opal_btl_usnic_seq_t isn;
|
|
uint8_t mac[6];
|
|
uint8_t use_udp;
|
|
} opal_btl_usnic_addr_t;
|
|
|
|
struct opal_btl_usnic_send_segment_t;
|
|
struct opal_btl_usnic_proc_t;
|
|
|
|
/*
|
|
* This is a descriptor for an incoming fragment that is broken
|
|
* into chunks. When the first reference to this frag_id is seen,
|
|
* memory is allocated for it. When the last byte arrives, the assembled
|
|
* fragment is passed to the PML.
|
|
*
|
|
* The endpoint structure has space for WINDOW_SIZE/2 simultaneous fragments.
|
|
* This is the largest number of fragments that can possibly be in-flight
|
|
* to us from a particular endpoint because eash chunked fragment will occupy
|
|
* at least two segments, and only WINDOW_SIZE segments can be in flight.
|
|
* OK, so there is an extremely pathological case where we could see
|
|
* (WINDOW_SIZE/2)+1 "in flight" at once, but just dropping that last one
|
|
* and waiting for retrans is just fine in this hypothetical hyper-pathological
|
|
* case, which is what we'll do.
|
|
*/
|
|
#define MAX_ACTIVE_FRAGS (WINDOW_SIZE/2)
|
|
typedef struct opal_btl_usnic_rx_frag_info_t {
|
|
uint32_t rfi_frag_id; /* ID for this fragment */
|
|
uint32_t rfi_frag_size; /* bytes in this fragment */
|
|
uint32_t rfi_bytes_left; /* bytes remaining to RX in fragment */
|
|
char *rfi_data; /* pointer to assembly area */
|
|
int rfi_data_pool; /* if 0, data malloced, else rx buf pool */
|
|
ompi_free_list_item_t *rfi_fl_elt; /* free list elemement from buf pool
|
|
when rfi_data_pool is nonzero */
|
|
} opal_btl_usnic_rx_frag_info_t;
|
|
|
|
/**
|
|
* An abstraction that represents a connection to a remote process.
|
|
* An instance of mca_btl_base_endpoint_t is associated with each
|
|
* (btl_usnic_proc_t, btl_usnic_module_t) tuple and address
|
|
* information is exchanged at startup. The usnic BTL is
|
|
* connectionless, so no connection is ever established.
|
|
*/
|
|
typedef struct mca_btl_base_endpoint_t {
|
|
opal_list_item_t super;
|
|
|
|
/** BTL module that created this connection */
|
|
struct opal_btl_usnic_module_t *endpoint_module;
|
|
|
|
/** proc that owns this endpoint */
|
|
struct opal_btl_usnic_proc_t *endpoint_proc;
|
|
int endpoint_proc_index; /* index in owning proc's endpoint array */
|
|
|
|
/** True when proc has been deleted, but still have sends that need ACKs */
|
|
bool endpoint_exiting;
|
|
|
|
/** List item for linking into module "all_endpoints" */
|
|
opal_list_item_t endpoint_endpoint_li;
|
|
|
|
/** List item for linking into "need ack" */
|
|
opal_list_item_t endpoint_ack_li;
|
|
|
|
/** Remote address information */
|
|
opal_btl_usnic_addr_t endpoint_remote_addr;
|
|
|
|
/** Remote address handle */
|
|
struct ibv_ah* endpoint_remote_ah;
|
|
|
|
/** Send-related data */
|
|
bool endpoint_ready_to_send;
|
|
opal_list_t endpoint_frag_send_queue;
|
|
int32_t endpoint_send_credits;
|
|
uint32_t endpoint_next_frag_id;
|
|
|
|
/** Receive-related data */
|
|
struct opal_btl_usnic_rx_frag_info_t *endpoint_rx_frag_info;
|
|
|
|
/** OPAL hotel to track outstanding stends */
|
|
opal_hotel_t endpoint_hotel;
|
|
|
|
/** Sliding window parameters for this peer */
|
|
/* Values for the current proc to send to this endpoint on the
|
|
peer proc */
|
|
opal_btl_usnic_seq_t endpoint_next_seq_to_send; /* n_t */
|
|
opal_btl_usnic_seq_t endpoint_ack_seq_rcvd; /* n_a */
|
|
|
|
struct opal_btl_usnic_send_segment_t *endpoint_sent_segs[WINDOW_SIZE];
|
|
|
|
/* Values for the current proc to receive from this endpoint on
|
|
the peer proc */
|
|
bool endpoint_ack_needed;
|
|
|
|
/* When we receive a packet that needs an ACK, set this
|
|
* to delay the ACK to allow for piggybacking
|
|
*/
|
|
uint64_t endpoint_acktime;
|
|
|
|
opal_btl_usnic_seq_t endpoint_next_contig_seq_to_recv; /* n_r */
|
|
opal_btl_usnic_seq_t endpoint_highest_seq_rcvd; /* n_s */
|
|
|
|
bool endpoint_rcvd_segs[WINDOW_SIZE];
|
|
uint32_t endpoint_rfstart;
|
|
|
|
bool endpoint_connectivity_checked;
|
|
} mca_btl_base_endpoint_t;
|
|
|
|
typedef mca_btl_base_endpoint_t opal_btl_usnic_endpoint_t;
|
|
OBJ_CLASS_DECLARATION(opal_btl_usnic_endpoint_t);
|
|
|
|
/*
|
|
* Flush all pending sends and resends from and endpoint
|
|
*/
|
|
void
|
|
opal_btl_usnic_flush_endpoint(
|
|
opal_btl_usnic_endpoint_t *endpoint);
|
|
|
|
END_C_DECLS
|
|
#endif
|