1
1
openmpi/opal/mca/btl/usnic/btl_usnic_endpoint.h
Jeff Squyres 40fe575132 usnic: trivial updates (no code/logic changes)
- Add more explanatory comments
- Trivial whitespace / style updates
- Rename opal_btl_usnic_force_retrans() -> opal_btl_usnic_fast_retrans()

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
2017-01-10 10:40:02 -08:00

206 строки
7.0 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2013-2017 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_BTL_USNIC_ENDPOINT_H
#define OPAL_BTL_USNIC_ENDPOINT_H
#include <rdma/fabric.h>
#include "opal/class/opal_list.h"
#include "opal/class/opal_hotel.h"
#include "opal/mca/event/event.h"
#include "btl_usnic.h"
BEGIN_C_DECLS
/*
 * Forward declarations to avoid include loops.  This header only uses
 * pointers to these types, so the incomplete types are sufficient here.
 */
struct opal_btl_usnic_module_t;
struct opal_btl_usnic_send_segment_t;
/*
 * Have the window size as a compile-time constant that is a power of
 * two so that we can take advantage of fast bit operations.
 */
#define WINDOW_SIZE 4096

/*
 * Reduce a value to an index in [0, WINDOW_SIZE).  Relies on
 * WINDOW_SIZE being a power of two.
 */
#define WINDOW_SIZE_MOD(a) ((a) & (WINDOW_SIZE - 1))

/*
 * True when the next sequence number to send on endpoint E is still
 * below (last ACKed sequence number + WINDOW_SIZE), as judged by
 * SEQ_LT (defined outside this header).  E is evaluated more than once.
 */
#define WINDOW_OPEN(E) (SEQ_LT((E)->endpoint_next_seq_to_send, \
                               ((E)->endpoint_ack_seq_rcvd + WINDOW_SIZE)))

/*
 * True when the last ACKed sequence number on endpoint E is the one
 * immediately before the next sequence number to send.  E is evaluated
 * more than once.
 *
 * The difference is cast back to opal_btl_usnic_seq_t on purpose: if
 * that type is narrower than int, the subtraction is carried out in int
 * after integer promotion, so a next-to-send value of 0 would yield -1
 * rather than the wrapped-around maximum sequence number, and the
 * comparison could never succeed at the wrap point.  For sequence types
 * at least as wide as int the cast changes nothing.
 */
#define WINDOW_EMPTY(E) ((E)->endpoint_ack_seq_rcvd == \
        (opal_btl_usnic_seq_t) ((E)->endpoint_next_seq_to_send - 1))
/*
 * Returns true when an endpoint has nothing left to send: its send
 * window is empty (WINDOW_EMPTY) and its endpoint_frag_send_queue list
 * is empty.  The argument E is a pointer to the endpoint and is
 * evaluated more than once, so it must not have side effects.
 */
#define ENDPOINT_DRAINED(E) (WINDOW_EMPTY(E) && \
opal_list_is_empty(&(E)->endpoint_frag_send_queue))
/*
 * Channel IDs.  The enumerators take the default values 0, 1, 2, so
 * USNIC_NUM_CHANNELS is the number of real channels and is used in this
 * header to size per-channel arrays.
 */
typedef enum opal_btl_usnic_channel_id_t {
USNIC_PRIORITY_CHANNEL,
USNIC_DATA_CHANNEL,
/* Not a channel: count of the channels listed above */
USNIC_NUM_CHANNELS
} opal_btl_usnic_channel_id_t;
/*
 * Address information record.  Each endpoint keeps the remote side's
 * copy of this record in its endpoint_remote_modex member.
 *
 * Note the mixed byte orders of the members below.
 */
typedef struct opal_btl_usnic_modex_t {
/* Stored in network order */
uint32_t ipv4_addr;
/* Stored in host order */
/* One port per channel, indexed by opal_btl_usnic_channel_id_t */
uint32_t ports[USNIC_NUM_CHANNELS];
/* Stored in network order */
uint32_t netmask;
/* Stored in host order */
/* NOTE(review): unclear whether "host order" is meant to cover only
   connectivity_udp_port or all of the remaining members -- confirm */
uint32_t connectivity_udp_port;
uint32_t link_speed_mbps;
uint16_t max_msg_size;
/* NOTE(review): presumably the initial sequence number -- confirm */
opal_btl_usnic_seq_t isn;
uint32_t protocol;
} opal_btl_usnic_modex_t;
/*
 * More forward declarations.  NOTE(review): opal_btl_usnic_send_segment_t
 * is already forward-declared near the top of this header; the repeat is
 * harmless but redundant.
 */
struct opal_btl_usnic_send_segment_t;
struct opal_btl_usnic_proc_t;
/*
 * This is a descriptor for an incoming fragment that is broken
 * into chunks.  When the first reference to this frag_id is seen,
 * memory is allocated for it.  When the last byte arrives, the assembled
 * fragment is passed to the PML.
 *
 * The endpoint structure has space for WINDOW_SIZE/2 simultaneous fragments.
 * This is the largest number of fragments that can possibly be in-flight
 * to us from a particular endpoint because each chunked fragment will occupy
 * at least two segments, and only WINDOW_SIZE segments can be in flight.
 * OK, so there is an extremely pathological case where we could see
 * (WINDOW_SIZE/2)+1 "in flight" at once, but just dropping that last one
 * and waiting for retransmission is just fine in this hypothetical
 * hyper-pathological case, which is what we'll do.
 */
#define MAX_ACTIVE_FRAGS (WINDOW_SIZE/2)
/*
 * Reassembly state for one incoming fragment that arrives in chunks.
 *
 * NOTE(review): the member comments below describe where the assembly
 * buffer comes from in three different ways (rfi_data_in_pool true,
 * rfi_data_pool >= 0, rfi_data_pool nonzero).  Confirm against the
 * receive code which condition is authoritative.
 */
typedef struct opal_btl_usnic_rx_frag_info_t {
uint32_t rfi_frag_id; /* ID for this fragment */
uint32_t rfi_frag_size; /* bytes in this fragment */
uint32_t rfi_bytes_left; /* bytes remaining to RX in fragment */
bool rfi_data_in_pool; /* data in data_pool if true, else malloced */
int rfi_data_pool; /* if <0, data malloced, else rx buf pool */
char *rfi_data; /* pointer to assembly area */
opal_free_list_item_t *rfi_fl_elt; /* free list element from buf pool
when rfi_data_pool is nonzero */
} opal_btl_usnic_rx_frag_info_t;
/**
 * An abstraction that represents a connection to a remote process.
 * An instance of mca_btl_base_endpoint_t is associated with each
 * (btl_usnic_proc_t, btl_usnic_module_t) tuple and address
 * information is exchanged at startup.  The usnic BTL is
 * connectionless, so no connection is ever established.
 *
 * The structure embeds two WINDOW_SIZE-element arrays
 * (endpoint_sent_segs and endpoint_rcvd_segs), so each instance is
 * fairly large.
 */
typedef struct mca_btl_base_endpoint_t {
/** Base list item; must stay first so the endpoint can be used as one */
opal_list_item_t super;
/** BTL module that created this connection */
struct opal_btl_usnic_module_t *endpoint_module;
/** proc that owns this endpoint */
struct opal_btl_usnic_proc_t *endpoint_proc;
int endpoint_proc_index; /* index in owning proc's endpoint array */
/** True when proc has been deleted, but still have sends that need ACKs */
bool endpoint_exiting;
/** List item for linking into module "all_endpoints" */
opal_list_item_t endpoint_endpoint_li;
/** List item for linking into "need ack" */
opal_list_item_t endpoint_ack_li;
/** Remote address information */
opal_btl_usnic_modex_t endpoint_remote_modex;
/** Remote address handle.  Need one for each
channel because each remote channel has different dest port */
fi_addr_t endpoint_remote_addrs[USNIC_NUM_CHANNELS];
/** Send-related data */
bool endpoint_ready_to_send;
/** Queue examined by ENDPOINT_DRAINED to decide whether anything is
left to send */
opal_list_t endpoint_frag_send_queue;
int32_t endpoint_send_credits;
uint32_t endpoint_next_frag_id;
/** Receive-related data */
/* NOTE(review): presumably an array of MAX_ACTIVE_FRAGS entries --
   confirm at the allocation site */
struct opal_btl_usnic_rx_frag_info_t *endpoint_rx_frag_info;
/** OPAL hotel to track outstanding sends */
opal_hotel_t endpoint_hotel;
/** Sliding window parameters for this peer */
/* Values for the current proc to send to this endpoint on the
peer proc */
opal_btl_usnic_seq_t endpoint_next_seq_to_send; /* n_t */
opal_btl_usnic_seq_t endpoint_ack_seq_rcvd; /* n_a */
/* Table where sent segments sit while waiting for their ACKs.
When a segment is ACKed, it is removed from this table. */
struct opal_btl_usnic_send_segment_t *endpoint_sent_segs[WINDOW_SIZE];
/* Values for the current proc to receive from this endpoint on
the peer proc */
bool endpoint_ack_needed;
/* When we receive a packet that needs an ACK, set this
 * to delay the ACK to allow for piggybacking
 */
uint64_t endpoint_acktime;
opal_btl_usnic_seq_t endpoint_next_contig_seq_to_recv; /* n_r */
opal_btl_usnic_seq_t endpoint_highest_seq_rcvd; /* n_s */
/* One flag per slot of the receive window */
bool endpoint_rcvd_segs[WINDOW_SIZE];
/* NOTE(review): meaning not evident from this header -- document at
   the point of use */
uint32_t endpoint_rfstart;
bool endpoint_connectivity_checked;
/* NOTE(review): presumably true while endpoint_endpoint_li is linked
   into the module's "all_endpoints" list -- confirm */
bool endpoint_on_all_endpoints;
} mca_btl_base_endpoint_t;
/* usnic-specific name for the same structure type */
typedef mca_btl_base_endpoint_t opal_btl_usnic_endpoint_t;
/* Declare the OPAL class descriptor for endpoint objects */
OBJ_CLASS_DECLARATION(opal_btl_usnic_endpoint_t);
/*
 * Helper struct for the asynchronous creation of the fi_addr array.
 * It pairs an endpoint with one of its channels, i.e., it identifies a
 * single slot of that endpoint's endpoint_remote_addrs[] array.
 */
typedef struct {
opal_btl_usnic_endpoint_t *endpoint;
opal_btl_usnic_channel_id_t channel_id;
} opal_btl_usnic_addr_context_t;
/*
 * Flush all pending sends and resends from an endpoint.
 *
 * @param endpoint  the endpoint whose pending sends and resends are
 *                  flushed
 *
 * The implementation lives outside this header.
 */
void
opal_btl_usnic_flush_endpoint(
opal_btl_usnic_endpoint_t *endpoint);
END_C_DECLS
#endif