1
1
openmpi/ompi/mca/btl/usnic/btl_usnic_frag.h
Jeff Squyres 6710c2ef3f usnic: remove unnecessary header union
Realistically, the usnic BTL doesn't need to know anything about the
underlying transport except for its header length (so that it knows
where the payload begins in a received buffer).  So remove the use of
the specific transport prefix union and just rely on the usnic verbs
extension to tell us what the header length is if we're using the
usNIC/UDP transport, or sizeof(struct ibv_grh) if we're using usNIC/L2
transport.

This commit was SVN r30914.
2014-03-03 21:33:12 +00:00

624 строки
18 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_BTL_USNIC_FRAG_H
#define OMPI_BTL_USNIC_FRAG_H
#define OMPI_BTL_USNIC_FRAG_ALIGN (8)
#include <infiniband/verbs.h>
#include "btl_usnic.h"
#include "btl_usnic_module.h"
BEGIN_C_DECLS
/*
 * Forward declarations to avoid include loops.  Only the struct tag is
 * introduced here, which is enough for declarations in this header that
 * refer to the module through a pointer.
 */
struct ompi_btl_usnic_module_t;
/*
* Some definitions:
* frag - what the upper layer hands us to send, may be large or small
* segment - one packet on the wire
* chunk - when a fragment is too big to fit into one segment, it is
* broken into chunks, each chunk fitting in one segment
*/
/**
 * Fragment types
 * The upper layer may give us very large "fragments" to send, larger than
 * an MTU.  We break fragments into segments for sending, a segment being
 * defined to fit within an MTU.
 */
typedef enum {
    OMPI_BTL_USNIC_FRAG_LARGE_SEND,
    OMPI_BTL_USNIC_FRAG_SMALL_SEND,
    OMPI_BTL_USNIC_FRAG_PUT_DEST
} ompi_btl_usnic_frag_type_t;

/**
 * Translate a fragment type into a short printable name.
 *
 * @param t  fragment type to describe
 * @return   "large", "small" or "put dest" for the three known types and
 *           "unknown" for any other value.  The result is a string
 *           literal; the caller must not modify or free it.
 */
static inline const char *
usnic_frag_type(ompi_btl_usnic_frag_type_t t)
{
    if (OMPI_BTL_USNIC_FRAG_LARGE_SEND == t) {
        return "large";
    }
    if (OMPI_BTL_USNIC_FRAG_SMALL_SEND == t) {
        return "small";
    }
    if (OMPI_BTL_USNIC_FRAG_PUT_DEST == t) {
        return "put dest";
    }
    return "unknown";
}
/**
 * Segment types.  The value is kept in the us_type field of a segment
 * descriptor.
 */
typedef enum {
    OMPI_BTL_USNIC_SEG_ACK,
    OMPI_BTL_USNIC_SEG_FRAG,
    OMPI_BTL_USNIC_SEG_CHUNK,
    OMPI_BTL_USNIC_SEG_RECV
} ompi_btl_usnic_seg_type_t;

/**
 * Translate a segment type into a short printable name.
 *
 * @param t  segment type to describe
 * @return   "ACK", "FRAG", "CHUNK" or "RECV" for the four known types and
 *           "unknown" for any other value.  The result is a string
 *           literal; the caller must not modify or free it.
 */
static inline const char *
usnic_seg_type(ompi_btl_usnic_seg_type_t t)
{
    if (OMPI_BTL_USNIC_SEG_ACK == t) {
        return "ACK";
    }
    if (OMPI_BTL_USNIC_SEG_FRAG == t) {
        return "FRAG";
    }
    if (OMPI_BTL_USNIC_SEG_CHUNK == t) {
        return "CHUNK";
    }
    if (OMPI_BTL_USNIC_SEG_RECV == t) {
        return "RECV";
    }
    return "unknown";
}
/**
 * Memory registration record: the generic mpool registration paired with
 * a verbs memory region pointer.
 */
typedef struct ompi_btl_usnic_reg_t {
/* generic mpool registration; first member of the record */
mca_mpool_base_registration_t base;
/* verbs memory region that goes with this registration */
struct ibv_mr* mr;
} ompi_btl_usnic_reg_t;
/* UDP headers are always 42 bytes long */
#define OMPI_BTL_USNIC_UDP_HDR_SZ (42)
/*
 * Size in bytes of the protocol header that sits in front of the BTL
 * header: the fixed UDP header size when the component's use_udp flag is
 * set, otherwise sizeof(struct ibv_grh).  The component flag is read each
 * time the macro is used, so this is not a compile-time constant.
 */
#define OMPI_BTL_USNIC_PROTO_HDR_SZ \
(mca_btl_usnic_component.use_udp ? \
OMPI_BTL_USNIC_UDP_HDR_SZ : \
sizeof(struct ibv_grh))
/**
 * usnic header type: identifies what kind of payload a BTL header
 * describes.  Stored in the payload_type field of the BTL header.
 * Numbering starts at 1, so 0 is not a valid payload type.
 */
typedef enum {
OMPI_BTL_USNIC_PAYLOAD_TYPE_ACK = 1,
OMPI_BTL_USNIC_PAYLOAD_TYPE_FRAG = 2, /* an entire fragment */
OMPI_BTL_USNIC_PAYLOAD_TYPE_CHUNK = 3 /* one chunk of fragment */
} ompi_btl_usnic_payload_type_t;
/**
 * BTL header that goes after the protocol header.  Since this is not
 * a stream, we can put the fields in whatever order makes the fewest
 * holes.
 *
 * NOTE(review): put_addr is a pointer placed directly after a 16-bit
 * field, so on platforms where pointers need more than 2-byte alignment
 * the compiler will insert padding there.  The layout is presumably part
 * of the on-the-wire format shared by sender and receiver -- confirm
 * before reordering any member.
 */
typedef struct {
/* Hashed RTE process name of the sender */
uint64_t sender;
/* Sliding window sequence number (echoed back in an ACK). */
ompi_btl_usnic_seq_t pkt_seq;
ompi_btl_usnic_seq_t ack_seq; /* for piggy-backing ACKs */
/* payload length (in bytes).  We unfortunately have to include
   this in our header because the L2 layer may artificially inflate
   the length of the packet to meet a minimum size */
uint16_t payload_len;
/* If this is an emulated PUT, store at this address on receiver */
char *put_addr;
/* Type of BTL header (an ompi_btl_usnic_payload_type_t value stored
   in 8 bits) */
uint8_t payload_type;
/* true if there is piggy-backed ACK */
uint8_t ack_present;
/* tag for upper layer */
mca_btl_base_tag_t tag;
} ompi_btl_usnic_btl_header_t;
/**
 * BTL header for a chunk of a fragment.  It starts with the regular BTL
 * header and appends the fields used to place the chunk within its
 * fragment.
 */
typedef struct {
/* common BTL header; first member, so the chunk header begins with the
   same fields as a plain BTL header */
ompi_btl_usnic_btl_header_t ch_hdr;
uint32_t ch_frag_id; /* ID for collecting segments of same frag */
uint32_t ch_frag_size; /* total frag len */
uint32_t ch_frag_offset; /* where in fragment this goes */
} ompi_btl_usnic_btl_chunk_header_t;
/**
 * Descriptor for a common segment.  This is exactly one packet and may
 * be send or receive.  It is embedded as the first member of the send
 * and receive segment descriptors.
 */
typedef struct ompi_btl_usnic_segment_t {
/* free list linkage; kept first so a free list item pointer can be cast
   to the enclosing segment type */
ompi_free_list_item_t us_list;
/* which kind of segment this is */
ompi_btl_usnic_seg_type_t us_type;
/* allow for 2 SG entries */
struct ibv_sge us_sg_entry[2];
/* header for chunked frag is different */
union {
ompi_btl_usnic_btl_header_t *uus_btl_header;
ompi_btl_usnic_btl_chunk_header_t *uus_btl_chunk_header;
} us_hdr;
/* shorthand so users can write seg->us_btl_header and
   seg->us_btl_chunk_header without naming the union */
#define us_btl_header us_hdr.uus_btl_header
#define us_btl_chunk_header us_hdr.uus_btl_chunk_header
/* pointer to the payload, viewable either as raw bytes or as an
   untyped pointer to the upper-layer header */
union {
uint8_t *raw;
void *ompi_header;
} us_payload;
} ompi_btl_usnic_segment_t;
/* forward declaration of the endpoint struct tag */
struct ompi_btl_usnic_endpoint_t;
/**
 * Descriptor for a recv segment.  This is exactly one packet and may
 * be part of a large or small send or may be an ACK
 */
typedef struct ompi_btl_usnic_recv_segment_t {
/* common segment part; first member */
ompi_btl_usnic_segment_t rs_base;
/* BTL descriptor and segment embedded in this receive segment */
mca_btl_base_descriptor_t rs_desc;
mca_btl_base_segment_t rs_segment;
/* receive segments have protocol header prepended */
uint8_t *rs_protocol_header;
/* endpoint associated with this receive segment */
ompi_btl_usnic_endpoint_t *rs_endpoint;
/* verbs recv desc */
struct ibv_recv_wr rs_recv_desc;
} ompi_btl_usnic_recv_segment_t;
/**
 * Descriptor for a send segment.  This is exactly one packet and may
 * be part of a large or small send or may be an ACK
 */
typedef struct ompi_btl_usnic_send_segment_t {
/* common segment part; first member */
ompi_btl_usnic_segment_t ss_base;
/* verbs send desc */
struct ibv_send_wr ss_send_desc;
/* channel upon which send was posted */
ompi_btl_usnic_channel_id_t ss_channel;
/* send fragment this segment belongs to */
struct ompi_btl_usnic_send_frag_t *ss_parent_frag;
int ss_hotel_room; /* current retrans room, or -1 if none */
/* How many times is this frag on a hardware queue? */
uint32_t ss_send_posted;
bool ss_ack_pending; /* true until this segment is ACKed */
} ompi_btl_usnic_send_segment_t;
/* Frag and chunk segments share the send segment layout; the separate
   names exist so that each can have its own class declaration. */
typedef ompi_btl_usnic_send_segment_t ompi_btl_usnic_frag_segment_t;
typedef ompi_btl_usnic_send_segment_t ompi_btl_usnic_chunk_segment_t;
/**
 * Common part of usNIC fragment descriptor.  Embedded as the first member
 * of the send fragment descriptors and used directly as the PUT
 * destination descriptor.
 */
typedef struct ompi_btl_usnic_frag_t {
/* generic BTL descriptor; first member */
mca_btl_base_descriptor_t uf_base;
/* fragment descriptor type */
ompi_btl_usnic_frag_type_t uf_type;
/* utility segments */
mca_btl_base_segment_t uf_src_seg[2];
mca_btl_base_segment_t uf_dst_seg[1];
/* freelist this came from; the fragment is handed back to this list
   when it is returned */
ompi_free_list_t *uf_freelist;
} ompi_btl_usnic_frag_t;
/**
 * Common part of usNIC send fragment descriptor.  Embedded as the first
 * member of both the large and the small send fragment.
 */
typedef struct ompi_btl_usnic_send_frag_t {
/* common fragment part; first member */
ompi_btl_usnic_frag_t sf_base;
/* endpoint this fragment is associated with */
struct mca_btl_base_endpoint_t *sf_endpoint;
size_t sf_size; /* total_fragment size (upper + user payload) */
struct opal_convertor_t sf_convertor; /* copy of original message data if
convertor required */
uint32_t sf_seg_post_cnt; /* total segs currently posted for this frag */
size_t sf_ack_bytes_left; /* bytes remaining to be ACKed */
/* link to another send fragment */
struct ompi_btl_usnic_send_frag_t *sf_next;
} ompi_btl_usnic_send_frag_t;
/**
 * Descriptor for a large fragment
 * Large fragment uses two SG entries - one points to upper layer header,
 * other points to data.
 */
typedef struct ompi_btl_usnic_large_send_frag_t {
/* common send fragment part; first member */
ompi_btl_usnic_send_frag_t lsf_base;
char lsf_ompi_header[64]; /* space for upper layer header */
mca_btl_base_tag_t lsf_tag; /* save tag */
uint32_t lsf_frag_id; /* fragment ID for reassembly */
size_t lsf_cur_offset; /* next byte offset to be enqueued on the
endpoint (incl. any convertor payload) */
size_t lsf_bytes_left; /* bytes remaining to give enqueue on the
endpoint (incl. any convertor payload) */
size_t lsf_pack_bytes_left; /* bytes remaining to be packed into chunk
segments (incl. any convertor payload) */
uint8_t *lsf_cur_ptr; /* current packing pointer */
/* NOTE(review): presumably the index of the SG entry currently being
   consumed and the bytes still unread in it -- confirm against the
   packing code */
int lsf_cur_sge;
size_t lsf_bytes_left_in_sge;
/* released with free() and reset to NULL when the fragment is returned */
uint8_t *lsf_buffer; /* attached storage for usnic_alloc() */
opal_list_t lsf_seg_chain; /* chain of segments for converted data */
bool lsf_pack_on_the_fly; /* true if we are packing on the fly */
} ompi_btl_usnic_large_send_frag_t;
/* Shortcut member macros.  Access uf_src_seg array instead of the descriptor's
 * des_src ptr to save a deref.  Both are written after a large send fragment,
 * e.g. lfrag->lsf_des_src[1] and lfrag->lsf_des_src_cnt. */
#define lsf_des_src lsf_base.sf_base.uf_src_seg
#define lsf_des_src_cnt lsf_base.sf_base.uf_base.des_src_cnt
/**
 * small send fragment
 * Small send will optimistically use 2 SG entries in hopes of performing
 * an inline send, but will convert to a single SG entry if inline cannot
 * be done and data must be copied.
 * First segment will point to registered memory of associated segment to
 * hold BTL and upper layer headers.
 * Second segment will point directly to user data.  If inlining fails, we
 * will copy user data into the registered memory after the upper layer header
 * and convert to a single segment.
 */
typedef struct ompi_btl_usnic_small_send_frag_t {
/* common send fragment part; first member */
ompi_btl_usnic_send_frag_t ssf_base;
/* small fragments have embedded segs */
ompi_btl_usnic_send_segment_t ssf_segment;
} ompi_btl_usnic_small_send_frag_t;
/**
 * descriptor for a put destination.  It needs nothing beyond the common
 * fragment fields, so it is an alias for the common fragment descriptor.
 */
typedef ompi_btl_usnic_frag_t ompi_btl_usnic_put_dest_frag_t;
/**
 * A simple buffer that can be enqueued on an ompi_free_list_t that is intended
 * to be used for fragment reassembly.  Nominally the free list code supports
 * this via the rb_super.ptr field, but that field is only allocated and
 * non-NULL if an mpool is used, and we don't need this reassembly memory to be
 * registered.
 *
 * NOTE(review): buf is declared with length 1 rather than as a C99 "[]"
 * member, so sizeof(ompi_btl_usnic_rx_buf_t) already includes at least one
 * byte of buffer.  Check how the free list element size is computed before
 * changing the declaration.
 */
typedef struct ompi_btl_usnic_rx_buf_t {
/* free list linkage; first member */
ompi_free_list_item_t rb_super;
char buf[1]; /* flexible array member for frag reassembly */
} ompi_btl_usnic_rx_buf_t;
/*
 * Class declarations for the fragment and segment types in this header.
 * NOTE(review): the matching class definitions are presumably in the
 * corresponding .c file -- confirm.
 */
OBJ_CLASS_DECLARATION(ompi_btl_usnic_send_frag_t);
OBJ_CLASS_DECLARATION(ompi_btl_usnic_small_send_frag_t);
OBJ_CLASS_DECLARATION(ompi_btl_usnic_large_send_frag_t);
OBJ_CLASS_DECLARATION(ompi_btl_usnic_put_dest_frag_t);
OBJ_CLASS_DECLARATION(ompi_btl_usnic_segment_t);
OBJ_CLASS_DECLARATION(ompi_btl_usnic_frag_segment_t);
OBJ_CLASS_DECLARATION(ompi_btl_usnic_chunk_segment_t);
OBJ_CLASS_DECLARATION(ompi_btl_usnic_recv_segment_t);
OBJ_CLASS_DECLARATION(ompi_btl_usnic_rx_buf_t);
/* ACK segments share the send segment layout but get their own class */
typedef ompi_btl_usnic_send_segment_t ompi_btl_usnic_ack_segment_t;
OBJ_CLASS_DECLARATION(ompi_btl_usnic_ack_segment_t);
/*
 * Alloc a small send frag from the module's small_send_frags free list.
 *
 * The fragment remembers which free list it came from so that it can be
 * handed back to the same list later.
 *
 * Returns the fragment, or NULL if the free list produced no item.
 */
static inline ompi_btl_usnic_small_send_frag_t *
ompi_btl_usnic_small_send_frag_alloc(ompi_btl_usnic_module_t *module)
{
    ompi_free_list_item_t *list_item;
    ompi_btl_usnic_small_send_frag_t *small;

    OMPI_FREE_LIST_GET_MT(&(module->small_send_frags), list_item);
    if (OPAL_UNLIKELY(NULL == list_item)) {
        return NULL;
    }

    /* the free list item is the first thing in the fragment */
    small = (ompi_btl_usnic_small_send_frag_t *) list_item;
    assert(small);
    assert(OMPI_BTL_USNIC_FRAG_SMALL_SEND == small->ssf_base.sf_base.uf_type);

    /* this belongs in constructor... */
    small->ssf_base.sf_base.uf_freelist = &(module->small_send_frags);

    return small;
}
/*
 * Alloc a large send frag from the module's large_send_frags free list.
 *
 * The fragment remembers which free list it came from so that it can be
 * handed back to the same list later.
 *
 * Returns the fragment, or NULL if the free list produced no item.
 */
static inline ompi_btl_usnic_large_send_frag_t *
ompi_btl_usnic_large_send_frag_alloc(ompi_btl_usnic_module_t *module)
{
    ompi_free_list_item_t *list_item;
    ompi_btl_usnic_large_send_frag_t *large;

    OMPI_FREE_LIST_GET_MT(&(module->large_send_frags), list_item);
    if (OPAL_UNLIKELY(NULL == list_item)) {
        return NULL;
    }

    /* the free list item is the first thing in the fragment */
    large = (ompi_btl_usnic_large_send_frag_t *) list_item;
    assert(large);
    assert(OMPI_BTL_USNIC_FRAG_LARGE_SEND == large->lsf_base.sf_base.uf_type);

    /* this belongs in constructor... */
    large->lsf_base.sf_base.uf_freelist = &(module->large_send_frags);

    return large;
}
/*
 * Alloc a PUT destination frag from the module's put_dest_frags free list.
 *
 * The fragment remembers which free list it came from so that it can be
 * handed back to the same list later.
 *
 * Returns the fragment, or NULL if the free list produced no item.
 */
static inline ompi_btl_usnic_put_dest_frag_t *
ompi_btl_usnic_put_dest_frag_alloc(
    struct ompi_btl_usnic_module_t *module)
{
    ompi_free_list_item_t *list_item;
    ompi_btl_usnic_put_dest_frag_t *dest;

    OMPI_FREE_LIST_GET_MT(&(module->put_dest_frags), list_item);
    if (OPAL_UNLIKELY(NULL == list_item)) {
        return NULL;
    }

    /* the free list item is the first thing in the fragment */
    dest = (ompi_btl_usnic_put_dest_frag_t *) list_item;
    assert(dest);
    assert(OMPI_BTL_USNIC_FRAG_PUT_DEST == dest->uf_type);

    /* this belongs in constructor... */
    dest->uf_freelist = &(module->put_dest_frags);

    return dest;
}
/*
 * A send frag can be returned to the freelist when either of the
 * following is true:
 *
 * 1. upper layer is freeing it (via module.free())
 * 2. Or all of these:
 *    a) it finishes sending all its segments
 *    b) all of its segments have been ACKed
 *    c) it is owned by the BTL
 *
 * This function evaluates case 2 only: it returns true when the
 * descriptor carries the BTL-ownership flag, no bytes are left to be
 * ACKed and no segments are still posted.  The module argument is not
 * consulted.
 */
static inline bool
ompi_btl_usnic_send_frag_ok_to_return(
    ompi_btl_usnic_module_t *module,
    ompi_btl_usnic_send_frag_t *frag)
{
    assert(frag);

    /* (c) the BTL has to own the descriptor */
    if (!OPAL_LIKELY(frag->sf_base.uf_base.des_flags &
                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
        return false;
    }

    /* (b) everything sent must have been ACKed */
    if (0 != frag->sf_ack_bytes_left) {
        return false;
    }

    /* (a) nothing may still be posted */
    return 0 == frag->sf_seg_post_cnt;
}
/*
 * Unconditionally hand a fragment back to the free list it came from
 * (frag->uf_freelist), after resetting the per-use state:
 *
 *  - both source segment lengths are zeroed;
 *  - large send frag: attached storage is freed, pack-on-the-fly is
 *    switched off, and the convertor is cleaned up when there are two
 *    source segments and the second one has a NULL address;
 *  - small send frag: the inline flag is cleared on the embedded
 *    segment's send descriptor.
 *
 * The module argument is not consulted.
 */
static inline void
ompi_btl_usnic_frag_return(
    struct ompi_btl_usnic_module_t *module,
    ompi_btl_usnic_frag_t *frag)
{
#if MSGDEBUG1
    opal_output(0, "freeing frag %p, type %s\n", (void *)frag,
                usnic_frag_type(frag->uf_type));
#endif

    frag->uf_src_seg[0].seg_len = 0;
    frag->uf_src_seg[1].seg_len = 0;

    switch (frag->uf_type) {
    case OMPI_BTL_USNIC_FRAG_LARGE_SEND: {
        ompi_btl_usnic_large_send_frag_t *large =
            (ompi_btl_usnic_large_send_frag_t *) frag;

        /* If this is a large fragment, we need to free any
         * attached storage
         */
        if (NULL != large->lsf_buffer) {
            free(large->lsf_buffer);
            large->lsf_buffer = NULL;
        }
        large->lsf_pack_on_the_fly = false;

        if (2 == large->lsf_des_src_cnt &&
            NULL == large->lsf_des_src[1].seg_addr.pval) {
            opal_convertor_cleanup(&large->lsf_base.sf_convertor);
        }
        break;
    }

    case OMPI_BTL_USNIC_FRAG_SMALL_SEND: {
        ompi_btl_usnic_small_send_frag_t *small =
            (ompi_btl_usnic_small_send_frag_t *) frag;

        /* drop the inline flag before the fragment is recycled */
        small->ssf_segment.ss_send_desc.send_flags &= ~IBV_SEND_INLINE;
        break;
    }

    default:
        /* other fragment types carry no extra state to reset */
        break;
    }

    OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super));
}
/*
 * Return a send frag if it's all done and owned by BTL.  Does nothing
 * when ompi_btl_usnic_send_frag_ok_to_return() says the fragment is not
 * ready yet.
 */
static inline void
ompi_btl_usnic_send_frag_return_cond(
    struct ompi_btl_usnic_module_t *module,
    ompi_btl_usnic_send_frag_t *frag)
{
    if (!ompi_btl_usnic_send_frag_ok_to_return(module, frag)) {
        return;
    }

    ompi_btl_usnic_frag_return(module, &frag->sf_base);
}
/*
 * Return a frag if it's all done and owned by BTL
 * If this is a PUT destination, only condition is that we own it.  If it's
 * a send frag, there are other conditions, so use the specific send frag
 * return checker.
 */
static inline void
ompi_btl_usnic_frag_return_cond(
    struct ompi_btl_usnic_module_t *module,
    ompi_btl_usnic_frag_t *frag)
{
    /* anything that is not a PUT destination is treated as a send frag */
    if (OMPI_BTL_USNIC_FRAG_PUT_DEST != frag->uf_type) {
        ompi_btl_usnic_send_frag_return_cond(module,
                (ompi_btl_usnic_send_frag_t *)frag);
        return;
    }

    /* PUT destination: BTL ownership is the only requirement */
    if (OPAL_LIKELY(frag->uf_base.des_flags &
                    MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
        ompi_btl_usnic_frag_return(module, frag);
    }
}
/*
 * Alloc a chunk segment from the module's chunk_segs free list.
 *
 * The segment is set up for the data channel and its send flags are
 * reset to IBV_SEND_SIGNALED only.
 *
 * Returns the segment, or NULL if the free list produced no item.
 */
static inline ompi_btl_usnic_chunk_segment_t *
ompi_btl_usnic_chunk_segment_alloc(
    ompi_btl_usnic_module_t *module)
{
    ompi_free_list_item_t *list_item;
    ompi_btl_usnic_send_segment_t *chunk;

    OMPI_FREE_LIST_GET_MT(&(module->chunk_segs), list_item);
    if (OPAL_UNLIKELY(NULL == list_item)) {
        return NULL;
    }

    /* the free list item is the first thing in the segment */
    chunk = (ompi_btl_usnic_send_segment_t *) list_item;
    assert(chunk);
    assert(OMPI_BTL_USNIC_SEG_CHUNK == chunk->ss_base.us_type);

    chunk->ss_channel = USNIC_DATA_CHANNEL;
    chunk->ss_send_desc.send_flags = IBV_SEND_SIGNALED;

    return chunk;
}
/*
 * Return a chunk segment to the module's chunk_segs free list.
 */
static inline void
ompi_btl_usnic_chunk_segment_return(
    ompi_btl_usnic_module_t *module,
    ompi_btl_usnic_chunk_segment_t *seg)
{
    ompi_free_list_item_t *list_item;

    assert(seg);
    assert(OMPI_BTL_USNIC_SEG_CHUNK == seg->ss_base.us_type);

    /* the free list linkage lives in the common segment part */
    list_item = &(seg->ss_base.us_list);
    OMPI_FREE_LIST_RETURN_MT(&(module->chunk_segs), list_item);
}
/*
 * Alloc an ACK segment from the module's ack_segs free list.
 *
 * The segment is set up for the priority channel and its send flags are
 * reset to IBV_SEND_SIGNALED only.
 *
 * Returns the segment, or NULL if the free list produced no item.
 */
static inline ompi_btl_usnic_ack_segment_t *
ompi_btl_usnic_ack_segment_alloc(ompi_btl_usnic_module_t *module)
{
    ompi_free_list_item_t *list_item;
    ompi_btl_usnic_send_segment_t *ack_seg;

    OMPI_FREE_LIST_GET_MT(&(module->ack_segs), list_item);
    if (OPAL_UNLIKELY(NULL == list_item)) {
        return NULL;
    }

    /* the free list item is the first thing in the segment */
    ack_seg = (ompi_btl_usnic_ack_segment_t *) list_item;
    assert(ack_seg);
    assert(OMPI_BTL_USNIC_SEG_ACK == ack_seg->ss_base.us_type);

    ack_seg->ss_channel = USNIC_PRIORITY_CHANNEL;
    ack_seg->ss_send_desc.send_flags = IBV_SEND_SIGNALED;

    return ack_seg;
}
/*
 * Return an ACK segment to the module's ack_segs free list.
 */
static inline void
ompi_btl_usnic_ack_segment_return(
    ompi_btl_usnic_module_t *module,
    ompi_btl_usnic_ack_segment_t *ack)
{
    ompi_free_list_item_t *list_item;

    assert(ack);
    assert(OMPI_BTL_USNIC_SEG_ACK == ack->ss_base.us_type);

    /* the free list linkage lives in the common segment part */
    list_item = &(ack->ss_base.us_list);
    OMPI_FREE_LIST_RETURN_MT(&(module->ack_segs), list_item);
}
/* Returns the expected L2 packet size in bytes for the given FRAG recv
 * segment: protocol header + BTL header + the payload_len recorded in
 * the BTL header.  The sum is narrowed to 32 bits on return. */
static inline uint32_t
ompi_btl_usnic_frag_seg_proto_size(ompi_btl_usnic_recv_segment_t *rseg)
{
    ompi_btl_usnic_segment_t *base_seg = &rseg->rs_base;
    size_t total;

    MSGDEBUG1_OUT("us_type=%d\n", base_seg->us_type);
    assert(OMPI_BTL_USNIC_PAYLOAD_TYPE_FRAG ==
           base_seg->us_btl_header->payload_type);

    total = OMPI_BTL_USNIC_PROTO_HDR_SZ;
    total += sizeof(*base_seg->us_btl_header);
    total += base_seg->us_btl_header->payload_len;

    return (uint32_t) total;
}
/* Returns the expected L2 packet size in bytes for the given CHUNK recv
 * segment: protocol header + chunk BTL header + the payload_len recorded
 * in the embedded BTL header.  The sum is narrowed to 32 bits on return. */
static inline uint32_t
ompi_btl_usnic_chunk_seg_proto_size(ompi_btl_usnic_recv_segment_t *rseg)
{
    ompi_btl_usnic_segment_t *base_seg = &rseg->rs_base;
    size_t total;

    assert(OMPI_BTL_USNIC_PAYLOAD_TYPE_CHUNK ==
           base_seg->us_btl_chunk_header->ch_hdr.payload_type);

    total = OMPI_BTL_USNIC_PROTO_HDR_SZ;
    total += sizeof(*base_seg->us_btl_chunk_header);
    total += base_seg->us_btl_chunk_header->ch_hdr.payload_len;

    return (uint32_t) total;
}
END_C_DECLS
#endif