/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef OPAL_BTL_USNIC_FRAG_H #define OPAL_BTL_USNIC_FRAG_H #define OPAL_BTL_USNIC_FRAG_ALIGN (8) #include #include "btl_usnic.h" #include "btl_usnic_module.h" BEGIN_C_DECLS /* * Forward declarations to avoid include loops */ struct opal_btl_usnic_module_t; /* * Some definitions: * frag - what the upper layer hands us to send, may be large or small * segment - one packet on the wire * chunk - when a fragment is too big to fit into one segment, it is * broken into chunks, each chunk fitting in one segment */ /** * Fragment types * The upper layer may give us very large "fragements" to send, larger than * an MTU. We break fragments into segments for sending, a segment being * defined to fit within an MTU. 
*/ typedef enum { OPAL_BTL_USNIC_FRAG_LARGE_SEND, OPAL_BTL_USNIC_FRAG_SMALL_SEND, OPAL_BTL_USNIC_FRAG_PUT_DEST } opal_btl_usnic_frag_type_t; static inline const char * usnic_frag_type(opal_btl_usnic_frag_type_t t) { switch (t) { case OPAL_BTL_USNIC_FRAG_LARGE_SEND: return "large"; case OPAL_BTL_USNIC_FRAG_SMALL_SEND: return "small"; case OPAL_BTL_USNIC_FRAG_PUT_DEST: return "put dest"; default: return "unknown"; } } typedef enum { OPAL_BTL_USNIC_SEG_ACK, OPAL_BTL_USNIC_SEG_FRAG, OPAL_BTL_USNIC_SEG_CHUNK, OPAL_BTL_USNIC_SEG_RECV } opal_btl_usnic_seg_type_t; static inline const char * usnic_seg_type(opal_btl_usnic_seg_type_t t) { switch (t) { case OPAL_BTL_USNIC_SEG_ACK: return "ACK"; case OPAL_BTL_USNIC_SEG_FRAG: return "FRAG"; case OPAL_BTL_USNIC_SEG_CHUNK: return "CHUNK"; case OPAL_BTL_USNIC_SEG_RECV: return "RECV"; default: return "unknown"; } } typedef struct opal_btl_usnic_reg_t { mca_mpool_base_registration_t base; struct ibv_mr* mr; } opal_btl_usnic_reg_t; /* UDP headers are always 42 bytes long */ #define OPAL_BTL_USNIC_UDP_HDR_SZ (42) #define OPAL_BTL_USNIC_PROTO_HDR_SZ \ (mca_btl_usnic_component.use_udp ? \ OPAL_BTL_USNIC_UDP_HDR_SZ : \ sizeof(struct ibv_grh)) /** * usnic header type */ typedef enum { OPAL_BTL_USNIC_PAYLOAD_TYPE_ACK = 1, OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG = 2, /* an entire fragment */ OPAL_BTL_USNIC_PAYLOAD_TYPE_CHUNK = 3 /* one chunk of fragment */ } opal_btl_usnic_payload_type_t; /** * BTL header that goes after the protocol header. Since this is not * a stream, we can put the fields in whatever order make the least * holes. */ typedef struct { /* Hashed RTE process name of the sender */ uint64_t sender; /* Sliding window sequence number (echoed back in an ACK). */ opal_btl_usnic_seq_t pkt_seq; opal_btl_usnic_seq_t ack_seq; /* for piggy-backing ACKs */ /* payload legnth (in bytes). 
We unfortunately have to include this in our header because the L2 layer may artifically inflate the length of the packet to meet a minimum size */ uint16_t payload_len; /* If this is an emulated PUT, store at this address on receiver */ char *put_addr; /* Type of BTL header (see enum, above) */ uint8_t payload_type; /* true if there is piggy-backed ACK */ uint8_t ack_present; /* tag for upper layer */ mca_btl_base_tag_t tag; } opal_btl_usnic_btl_header_t; /** * BTL header for a chunk of a fragment */ typedef struct { opal_btl_usnic_btl_header_t ch_hdr; uint32_t ch_frag_id; /* ID for collecting segments of same frag */ uint32_t ch_frag_size; /* total frag len */ uint32_t ch_frag_offset; /* where in fragment this goes */ } opal_btl_usnic_btl_chunk_header_t; /** * Descriptor for a common segment. This is exactly one packet and may * be send or receive */ typedef struct opal_btl_usnic_segment_t { ompi_free_list_item_t us_list; opal_btl_usnic_seg_type_t us_type; /* allow for 2 SG entries */ struct ibv_sge us_sg_entry[2]; /* header for chunked frag is different */ union { opal_btl_usnic_btl_header_t *uus_btl_header; opal_btl_usnic_btl_chunk_header_t *uus_btl_chunk_header; } us_hdr; #define us_btl_header us_hdr.uus_btl_header #define us_btl_chunk_header us_hdr.uus_btl_chunk_header union { uint8_t *raw; void *ompi_header; } us_payload; } opal_btl_usnic_segment_t; struct opal_btl_usnic_endpoint_t; /** * Descriptor for a recv segment. This is exactly one packet and may * be part of a large or small send or may be an ACK */ typedef struct opal_btl_usnic_recv_segment_t { opal_btl_usnic_segment_t rs_base; mca_btl_base_descriptor_t rs_desc; mca_btl_base_segment_t rs_segment; /* receive segments have protocol header prepended */ uint8_t *rs_protocol_header; opal_btl_usnic_endpoint_t *rs_endpoint; /* verbs recv desc */ struct ibv_recv_wr rs_recv_desc; } opal_btl_usnic_recv_segment_t; /** * Descriptor for a send segment. 
This is exactly one packet and may
 * be part of a large or small send or may be an ACK
 */
typedef struct opal_btl_usnic_send_segment_t {
    /* common segment part */
    opal_btl_usnic_segment_t ss_base;

    /* verbs send desc */
    struct ibv_send_wr ss_send_desc;

    /* channel upon which send was posted */
    opal_btl_usnic_channel_id_t ss_channel;

    /* fragment this segment belongs to */
    struct opal_btl_usnic_send_frag_t *ss_parent_frag;
    int ss_hotel_room;          /* current retrans room, or -1 if none */

    /* How many times is this frag on a hardware queue? */
    uint32_t ss_send_posted;
    bool ss_ack_pending;        /* true until this segment is ACKed */

} opal_btl_usnic_send_segment_t;

/* FRAG and CHUNK segments are both plain send segments; the aliases only
   give each kind its own type name */
typedef opal_btl_usnic_send_segment_t opal_btl_usnic_frag_segment_t;
typedef opal_btl_usnic_send_segment_t opal_btl_usnic_chunk_segment_t;

/**
 * Common part of usNIC fragment descriptor
 */
typedef struct opal_btl_usnic_frag_t {
    mca_btl_base_descriptor_t uf_base;

    /* fragment descriptor type */
    opal_btl_usnic_frag_type_t uf_type;

    /* utility segments */
    mca_btl_base_segment_t uf_local_seg[2];
    mca_btl_base_segment_t uf_remote_seg[1];

    /* freelist this came from */
    ompi_free_list_t *uf_freelist;
} opal_btl_usnic_frag_t;

/**
 * Common part of usNIC send fragment descriptor
 */
typedef struct opal_btl_usnic_send_frag_t {

    opal_btl_usnic_frag_t sf_base;

    /* NOTE(review): presumably the peer this fragment is being sent to --
       confirm against the send path */
    struct mca_btl_base_endpoint_t *sf_endpoint;

    size_t sf_size;            /* total fragment size (upper + user payload) */

    struct opal_convertor_t sf_convertor; /* copy of original message data if
                                             convertor required */

    uint32_t sf_seg_post_cnt;   /* total segs currently posted for this frag */
    size_t sf_ack_bytes_left;   /* bytes remaining to be ACKed */

    /* link to another send fragment (singly-linked chaining) */
    struct opal_btl_usnic_send_frag_t *sf_next;
} opal_btl_usnic_send_frag_t;

/**
 * Descriptor for a large fragment
 * Large fragment uses two SG entries - one points to upper layer header,
 * other points to data.
*/
typedef struct opal_btl_usnic_large_send_frag_t {
    opal_btl_usnic_send_frag_t lsf_base;

    char lsf_ompi_header[64];   /* space for upper layer header */
    mca_btl_base_tag_t lsf_tag; /* save tag */

    uint32_t lsf_frag_id;       /* fragment ID for reassembly */
    size_t lsf_cur_offset;      /* next byte offset to be enqueued on the
                                   endpoint (incl. any convertor payload) */
    size_t lsf_bytes_left;      /* bytes remaining to be enqueued on the
                                   endpoint (incl. any convertor payload) */
    size_t lsf_pack_bytes_left; /* bytes remaining to be packed into chunk
                                   segments (incl. any convertor payload) */
    uint8_t *lsf_cur_ptr;       /* current packing pointer */
    int lsf_cur_sge;            /* NOTE(review): presumably the index of the
                                   SG entry currently being consumed --
                                   confirm against the packing code */
    size_t lsf_bytes_left_in_sge; /* NOTE(review): presumably the bytes not
                                     yet consumed from that SG entry --
                                     confirm */

    uint8_t *lsf_buffer;        /* attached storage for usnic_alloc() */

    opal_list_t lsf_seg_chain;  /* chain of segments for converted data */

    bool lsf_pack_on_the_fly;   /* true if we are packing on the fly */
} opal_btl_usnic_large_send_frag_t;

/* Shortcut member macros.  Access the fragment's embedded uf_local_seg
 * array directly instead of going through the descriptor's segment
 * pointer, to save a deref. */
#define lsf_des_src lsf_base.sf_base.uf_local_seg
#define lsf_des_local_cnt lsf_base.sf_base.uf_base.des_local_count

/**
 * small send fragment
 * Small send will optimistically use 2 SG entries in hopes of performing
 * an inline send, but will convert to a single SG entry if inline cannot
 * be done and data must be copied.
 * First segment will point to registered memory of associated segment to
 * hold BTL and upper layer headers.
 * Second segment will point directly to user data.  If inlining fails, we
 * will copy user data into the registered memory after the upper layer header
 * and convert to a single segment.
*/ typedef struct opal_btl_usnic_small_send_frag_t { opal_btl_usnic_send_frag_t ssf_base; /* small fragments have embedded segs */ opal_btl_usnic_send_segment_t ssf_segment; } opal_btl_usnic_small_send_frag_t; /** * descriptor for a put destination */ typedef opal_btl_usnic_frag_t opal_btl_usnic_put_dest_frag_t; /** * A simple buffer that can be enqueued on an ompi_free_list_t that is intended * to be used for fragment reassembly. Nominally the free list code supports * this via the rb_super.ptr field, but that field is only allocated and * non-NULL if an mpool is used, and we don't need this reassembly memory to be * registered. */ typedef struct opal_btl_usnic_rx_buf_t { ompi_free_list_item_t rb_super; char buf[1]; /* flexible array member for frag reassembly */ } opal_btl_usnic_rx_buf_t; OBJ_CLASS_DECLARATION(opal_btl_usnic_send_frag_t); OBJ_CLASS_DECLARATION(opal_btl_usnic_small_send_frag_t); OBJ_CLASS_DECLARATION(opal_btl_usnic_large_send_frag_t); OBJ_CLASS_DECLARATION(opal_btl_usnic_put_dest_frag_t); OBJ_CLASS_DECLARATION(opal_btl_usnic_segment_t); OBJ_CLASS_DECLARATION(opal_btl_usnic_frag_segment_t); OBJ_CLASS_DECLARATION(opal_btl_usnic_chunk_segment_t); OBJ_CLASS_DECLARATION(opal_btl_usnic_recv_segment_t); OBJ_CLASS_DECLARATION(opal_btl_usnic_rx_buf_t); typedef opal_btl_usnic_send_segment_t opal_btl_usnic_ack_segment_t; OBJ_CLASS_DECLARATION(opal_btl_usnic_ack_segment_t); /* * Alloc a send frag from the send pool */ static inline opal_btl_usnic_small_send_frag_t * opal_btl_usnic_small_send_frag_alloc(opal_btl_usnic_module_t *module) { ompi_free_list_item_t *item; opal_btl_usnic_small_send_frag_t *frag; OMPI_FREE_LIST_GET_MT(&(module->small_send_frags), item); if (OPAL_UNLIKELY(NULL == item)) { return NULL; } frag = (opal_btl_usnic_small_send_frag_t*) item; /* this belongs in constructor... 
*/ frag->ssf_base.sf_base.uf_freelist = &(module->small_send_frags); assert(frag); assert(OPAL_BTL_USNIC_FRAG_SMALL_SEND == frag->ssf_base.sf_base.uf_type); return frag; } static inline opal_btl_usnic_large_send_frag_t * opal_btl_usnic_large_send_frag_alloc(opal_btl_usnic_module_t *module) { ompi_free_list_item_t *item; opal_btl_usnic_large_send_frag_t *frag; OMPI_FREE_LIST_GET_MT(&(module->large_send_frags), item); if (OPAL_UNLIKELY(NULL == item)) { return NULL; } frag = (opal_btl_usnic_large_send_frag_t*) item; /* this belongs in constructor... */ frag->lsf_base.sf_base.uf_freelist = &(module->large_send_frags); assert(frag); assert(OPAL_BTL_USNIC_FRAG_LARGE_SEND == frag->lsf_base.sf_base.uf_type); return frag; } static inline opal_btl_usnic_put_dest_frag_t * opal_btl_usnic_put_dest_frag_alloc( struct opal_btl_usnic_module_t *module) { ompi_free_list_item_t *item; opal_btl_usnic_put_dest_frag_t *frag; OMPI_FREE_LIST_GET_MT(&(module->put_dest_frags), item); if (OPAL_UNLIKELY(NULL == item)) { return NULL; } frag = (opal_btl_usnic_put_dest_frag_t*) item; /* this belongs in constructor... */ frag->uf_freelist = &(module->put_dest_frags); assert(frag); assert(OPAL_BTL_USNIC_FRAG_PUT_DEST == frag->uf_type); return frag; } /* * A send frag can be returned to the freelist when all of the * following are true: * * 1. upper layer is freeing it (via module.free()) * 2. 
Or all of these: * a) it finishes sending all its segments * b) all of its segments have been ACKed * c) it is owned by the BTL */ static inline bool opal_btl_usnic_send_frag_ok_to_return( opal_btl_usnic_module_t *module, opal_btl_usnic_send_frag_t *frag) { assert(frag); if (OPAL_LIKELY(frag->sf_base.uf_base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) && 0 == frag->sf_ack_bytes_left && 0 == frag->sf_seg_post_cnt) { return true; } return false; } static inline void opal_btl_usnic_frag_return( struct opal_btl_usnic_module_t *module, opal_btl_usnic_frag_t *frag) { #if MSGDEBUG1 opal_output(0, "freeing frag %p, type %s\n", (void *)frag, usnic_frag_type(frag->uf_type)); #endif frag->uf_local_seg[0].seg_len = 0; frag->uf_local_seg[1].seg_len = 0; /* If this is a large fragment, we need to free any * attached storage */ if (frag->uf_type == OPAL_BTL_USNIC_FRAG_LARGE_SEND) { opal_btl_usnic_large_send_frag_t *lfrag; lfrag = (opal_btl_usnic_large_send_frag_t *)frag; if (lfrag->lsf_buffer != NULL) { free(lfrag->lsf_buffer); lfrag->lsf_buffer = NULL; } lfrag->lsf_pack_on_the_fly = false; if (2 == lfrag->lsf_des_local_cnt && NULL == lfrag->lsf_des_src[1].seg_addr.pval) { opal_convertor_cleanup(&lfrag->lsf_base.sf_convertor); } } else if (frag->uf_type == OPAL_BTL_USNIC_FRAG_SMALL_SEND) { opal_btl_usnic_small_send_frag_t *sfrag; sfrag = (opal_btl_usnic_small_send_frag_t *)frag; sfrag->ssf_segment.ss_send_desc.send_flags &= ~IBV_SEND_INLINE; } OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super)); } /* * Return a send frag if it's all done and owned by BTL */ static inline void opal_btl_usnic_send_frag_return_cond( struct opal_btl_usnic_module_t *module, opal_btl_usnic_send_frag_t *frag) { if (opal_btl_usnic_send_frag_ok_to_return(module, frag)) { opal_btl_usnic_frag_return(module, &frag->sf_base); } } /* * Return a frag if it's all done and owned by BTL * If this is a PUT destination, only condition is that we own it. 
If it's * a send frag, there are other conditions, so use the specific send frag * return checker. */ static inline void opal_btl_usnic_frag_return_cond( struct opal_btl_usnic_module_t *module, opal_btl_usnic_frag_t *frag) { if (OPAL_BTL_USNIC_FRAG_PUT_DEST == frag->uf_type) { if (OPAL_LIKELY(frag->uf_base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { opal_btl_usnic_frag_return(module, frag); } } else { opal_btl_usnic_send_frag_return_cond(module, (opal_btl_usnic_send_frag_t *)frag); } } static inline opal_btl_usnic_chunk_segment_t * opal_btl_usnic_chunk_segment_alloc( opal_btl_usnic_module_t *module) { ompi_free_list_item_t *item; opal_btl_usnic_send_segment_t *seg; OMPI_FREE_LIST_GET_MT(&(module->chunk_segs), item); if (OPAL_UNLIKELY(NULL == item)) { return NULL; } seg = (opal_btl_usnic_send_segment_t*) item; seg->ss_channel = USNIC_DATA_CHANNEL; seg->ss_send_desc.send_flags = IBV_SEND_SIGNALED; assert(seg); assert(OPAL_BTL_USNIC_SEG_CHUNK == seg->ss_base.us_type); return seg; } static inline void opal_btl_usnic_chunk_segment_return( opal_btl_usnic_module_t *module, opal_btl_usnic_chunk_segment_t *seg) { assert(seg); assert(OPAL_BTL_USNIC_SEG_CHUNK == seg->ss_base.us_type); OMPI_FREE_LIST_RETURN_MT(&(module->chunk_segs), &(seg->ss_base.us_list)); } /* * Alloc an ACK segment */ static inline opal_btl_usnic_ack_segment_t * opal_btl_usnic_ack_segment_alloc(opal_btl_usnic_module_t *module) { ompi_free_list_item_t *item; opal_btl_usnic_send_segment_t *ack; OMPI_FREE_LIST_GET_MT(&(module->ack_segs), item); if (OPAL_UNLIKELY(NULL == item)) { return NULL; } ack = (opal_btl_usnic_ack_segment_t*) item; ack->ss_channel = USNIC_PRIORITY_CHANNEL; ack->ss_send_desc.send_flags = IBV_SEND_SIGNALED; assert(ack); assert(OPAL_BTL_USNIC_SEG_ACK == ack->ss_base.us_type); return ack; } /* * Return an ACK segment */ static inline void opal_btl_usnic_ack_segment_return( opal_btl_usnic_module_t *module, opal_btl_usnic_ack_segment_t *ack) { assert(ack); assert(OPAL_BTL_USNIC_SEG_ACK == 
ack->ss_base.us_type); OMPI_FREE_LIST_RETURN_MT(&(module->ack_segs), &(ack->ss_base.us_list)); } /* returns the expected L2 packet size in bytes for the given FRAG recv * segment, based on the payload_len */ static inline uint32_t opal_btl_usnic_frag_seg_proto_size(opal_btl_usnic_recv_segment_t *rseg) { opal_btl_usnic_segment_t *bseg = &rseg->rs_base; MSGDEBUG1_OUT("us_type=%d\n", bseg->us_type); assert(OPAL_BTL_USNIC_PAYLOAD_TYPE_FRAG == bseg->us_btl_header->payload_type); return (OPAL_BTL_USNIC_PROTO_HDR_SZ + sizeof(*bseg->us_btl_header) + bseg->us_btl_header->payload_len); } /* returns the expected L2 packet size in bytes for the given CHUNK recv * segment, based on the payload_len */ static inline uint32_t opal_btl_usnic_chunk_seg_proto_size(opal_btl_usnic_recv_segment_t *rseg) { opal_btl_usnic_segment_t *bseg = &rseg->rs_base; assert(OPAL_BTL_USNIC_PAYLOAD_TYPE_CHUNK == bseg->us_btl_chunk_header->ch_hdr.payload_type); return (OPAL_BTL_USNIC_PROTO_HDR_SZ + sizeof(*bseg->us_btl_chunk_header) + bseg->us_btl_chunk_header->ch_hdr.payload_len); } END_C_DECLS #endif