/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2013 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. * Copyright (c) 2006-2009 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef MCA_BTL_IB_FRAG_H #define MCA_BTL_IB_FRAG_H #include "opal_config.h" #include "opal/align.h" #include "opal/mca/btl/btl.h" #include BEGIN_C_DECLS struct mca_btl_openib_reg_t; struct mca_btl_openib_header_t { mca_btl_base_tag_t tag; uint8_t cm_seen; uint16_t credits; #if OPAL_OPENIB_PAD_HDR uint8_t padding[4]; #endif }; typedef struct mca_btl_openib_header_t mca_btl_openib_header_t; #define BTL_OPENIB_RDMA_CREDITS_FLAG (1<<15) #define BTL_OPENIB_IS_RDMA_CREDITS(I) ((I)&BTL_OPENIB_RDMA_CREDITS_FLAG) #define BTL_OPENIB_CREDITS(I) ((I)&~BTL_OPENIB_RDMA_CREDITS_FLAG) #define BTL_OPENIB_HEADER_HTON(h) \ do { \ (h).credits = htons((h).credits); \ } while (0) #define BTL_OPENIB_HEADER_NTOH(h) \ do { \ (h).credits = ntohs((h).credits); \ } while (0) typedef struct mca_btl_openib_header_coalesced_t { mca_btl_base_tag_t tag; uint32_t size; uint32_t alloc_size; #if OPAL_OPENIB_PAD_HDR uint8_t padding[4]; #endif } mca_btl_openib_header_coalesced_t; #define BTL_OPENIB_HEADER_COALESCED_NTOH(h) \ do { \ (h).size = ntohl((h).size); \ (h).alloc_size = ntohl((h).alloc_size); \ } while(0) #define BTL_OPENIB_HEADER_COALESCED_HTON(h) \ do { \ (h).size = htonl((h).size); \ (h).alloc_size = htonl((h).alloc_size); \ } while(0) #if OPAL_OPENIB_PAD_HDR /* BTL_OPENIB_FTR_PADDING * This macro is used to keep the pointer to openib footers aligned for * systems like SPARC64 that take a big performance hit when addresses * are not aligned (and by default sigbus instead of coercing the type on * an unaligned address). * * We assure alignment of a packet's structures when OPAL_OPENIB_PAD_HDR * is set to 1. When this is the case then several structures are padded * to assure alignment and the mca_btl_openib_footer_t structure itself * will uses the BTL_OPENIB_FTR_PADDING macro to shift the location of the * pointer to assure proper alignment after the PML Header and data. * For example sending a 1 byte data packet the memory layout without * footer alignment would look something like the following: * * 0x00 : mca_btl_openib_coalesced_header_t (12 bytes + 4 byte pad) * 0x10 : mca_btl_openib_control_header_t (1 byte + 7 byte pad) * 0x18 : mca_btl_openib_header_t (4 bytes + 4 byte pad) * 0x20 : PML Header and data (16 bytes PML + 1 byte data) * 0x29 : mca_btl_openib_footer_t (4 bytes + 4 byte pad) * 0x31 : end of packet * * By applying the BTL_OPENIB_FTR_PADDING() in the progress_one_device * and post_send routines we adjust the pointer to mca_btl_openib_footer_t * from 0x29 to 0x2C thus correctly aligning the start of the * footer pointer. This adjustment will cause the padding field of * mca_btl_openib_footer_t to overlap with the neighboring memory but since * we never use the padding we do not end up inadvertently overwriting * memory that does not belong to the fragment. */ #define BTL_OPENIB_FTR_PADDING(size) \ OPAL_ALIGN_PAD_AMOUNT(size, sizeof(uint64_t)) /* BTL_OPENIB_ALIGN_COALESCE_HDR * This macro is used in btl_openib.c, while creating a coalesce fragment, * to align the coalesce headers. */ #define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) \ OPAL_ALIGN_PTR(ptr, sizeof(uint32_t), unsigned char*) /* BTL_OPENIB_COALESCE_HDR_PADDING * This macro is used in btl_openib_component.c, while parsing an incoming * coalesce fragment, to determine the padding amount used to align the * mca_btl_openib_coalesce_hdr_t. */ #define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) \ OPAL_ALIGN_PAD_AMOUNT(ptr, sizeof(uint32_t)) #else #define BTL_OPENIB_FTR_PADDING(size) 0 #define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) ptr #define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) 0 #endif struct mca_btl_openib_footer_t { #if OPAL_ENABLE_DEBUG uint32_t seq; #endif union { uint32_t size; uint8_t buf[4]; } u; #if OPAL_OPENIB_PAD_HDR #if OPAL_ENABLE_DEBUG /* this footer needs to be of a 8-byte multiple so by adding the * seq field you throw this off and you cannot just remove the * padding because the padding is needed in order to adjust the alignment * and not overwrite other packets. */ uint8_t padding[12]; #else uint8_t padding[8]; #endif #endif }; typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t; #ifdef WORDS_BIGENDIAN #define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr) #else #define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr) \ do { \ uint8_t tmp = (ftr).u.buf[0]; \ (ftr).u.buf[0]=(ftr).u.buf[2]; \ (ftr).u.buf[2]=tmp; \ } while (0) #endif #if OPAL_ENABLE_DEBUG #define BTL_OPENIB_FOOTER_SEQ_HTON(h) ((h).seq = htonl((h).seq)) #define BTL_OPENIB_FOOTER_SEQ_NTOH(h) ((h).seq = ntohl((h).seq)) #else #define BTL_OPENIB_FOOTER_SEQ_HTON(h) #define BTL_OPENIB_FOOTER_SEQ_NTOH(h) #endif #define BTL_OPENIB_FOOTER_HTON(h) \ do { \ BTL_OPENIB_FOOTER_SEQ_HTON(h); \ MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \ } while (0) #define BTL_OPENIB_FOOTER_NTOH(h) \ do { \ BTL_OPENIB_FOOTER_SEQ_NTOH(h); \ MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \ } while (0) #define MCA_BTL_OPENIB_CONTROL_CREDITS 0 #define MCA_BTL_OPENIB_CONTROL_RDMA 1 #define MCA_BTL_OPENIB_CONTROL_COALESCED 2 #define MCA_BTL_OPENIB_CONTROL_CTS 3 #if BTL_OPENIB_FAILOVER_ENABLED #define MCA_BTL_OPENIB_CONTROL_EP_BROKEN 4 #define MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR 5 #endif struct mca_btl_openib_control_header_t { uint8_t type; #if OPAL_OPENIB_PAD_HDR uint8_t padding[7]; #endif }; typedef struct mca_btl_openib_control_header_t mca_btl_openib_control_header_t; struct mca_btl_openib_eager_rdma_header_t { mca_btl_openib_control_header_t control; uint32_t rkey; opal_ptr_t rdma_start; }; typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_header_t; #define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON(h) \ do { \ (h).rkey = htonl((h).rkey); \ (h).rdma_start.lval = hton64((h).rdma_start.lval); \ } while (0) #define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h) \ do { \ (h).rkey = ntohl((h).rkey); \ (h).rdma_start.lval = ntoh64((h).rdma_start.lval); \ } while (0) struct mca_btl_openib_rdma_credits_header_t { mca_btl_openib_control_header_t control; #if OPAL_OPENIB_PAD_HDR uint8_t padding[1]; #endif uint8_t qpn; uint16_t rdma_credits; }; typedef struct mca_btl_openib_rdma_credits_header_t mca_btl_openib_rdma_credits_header_t; #define BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(h) \ do { \ (h).rdma_credits = htons((h).rdma_credits); \ } while (0) #define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h) \ do { \ (h).rdma_credits = ntohs((h).rdma_credits); \ } while (0) #if BTL_OPENIB_FAILOVER_ENABLED struct mca_btl_openib_broken_connection_header_t { mca_btl_openib_control_header_t control; uint32_t lid; uint64_t subnet_id; uint32_t vpid; uint32_t index; /* for eager RDMA only */ }; typedef struct mca_btl_openib_broken_connection_header_t mca_btl_openib_broken_connection_header_t; #define BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON(h) \ do { \ (h).lid = htonl((h).lid); \ (h).subnet_id = hton64((h).subnet_id); \ (h).vpid = htonl((h).vpid); \ (h).index = htonl((h).index); \ } while (0) #define BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH(h) \ do { \ (h).lid = ntohl((h).lid); \ (h).subnet_id = ntoh64((h).subnet_id); \ (h).vpid = ntohl((h).vpid); \ (h).index = ntohl((h).index); \ } while (0) #endif enum mca_btl_openib_frag_type_t { MCA_BTL_OPENIB_FRAG_RECV, MCA_BTL_OPENIB_FRAG_RECV_USER, MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER, MCA_BTL_OPENIB_FRAG_EAGER_RDMA, MCA_BTL_OPENIB_FRAG_CONTROL, MCA_BTL_OPENIB_FRAG_COALESCED }; typedef enum mca_btl_openib_frag_type_t mca_btl_openib_frag_type_t; #define openib_frag_type(f) (to_base_frag(f)->type) /** * IB fragment derived type. */ typedef struct mca_btl_openib_segment_t { mca_btl_base_segment_t base; uint32_t key; } mca_btl_openib_segment_t; /* base openib frag */ typedef struct mca_btl_openib_frag_t { mca_btl_base_descriptor_t base; mca_btl_openib_segment_t segment; mca_btl_openib_frag_type_t type; ompi_free_list_t* list; } mca_btl_openib_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t); #define to_base_frag(f) ((mca_btl_openib_frag_t*)(f)) /* frag used for communication */ typedef struct mca_btl_openib_com_frag_t { mca_btl_openib_frag_t super; struct ibv_sge sg_entry; struct mca_btl_openib_reg_t *registration; struct mca_btl_base_endpoint_t *endpoint; /* number of unsignaled frags sent before this frag. */ uint32_t n_wqes_inflight; } mca_btl_openib_com_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t); #define to_com_frag(f) ((mca_btl_openib_com_frag_t*)(f)) typedef struct mca_btl_openib_out_frag_t { mca_btl_openib_com_frag_t super; struct ibv_send_wr sr_desc; } mca_btl_openib_out_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_out_frag_t); #define to_out_frag(f) ((mca_btl_openib_out_frag_t*)(f)) typedef struct mca_btl_openib_com_frag_t mca_btl_openib_in_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_in_frag_t); #define to_in_frag(f) ((mca_btl_openib_in_frag_t*)(f)) typedef struct mca_btl_openib_send_frag_t { mca_btl_openib_out_frag_t super; mca_btl_openib_header_t *hdr, *chdr; mca_btl_openib_footer_t *ftr; uint8_t qp_idx; uint32_t coalesced_length; opal_list_t coalesced_frags; } mca_btl_openib_send_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t); #define to_send_frag(f) ((mca_btl_openib_send_frag_t*)(f)) typedef struct mca_btl_openib_recv_frag_t { mca_btl_openib_in_frag_t super; mca_btl_openib_header_t *hdr; mca_btl_openib_footer_t *ftr; struct ibv_recv_wr rd_desc; uint8_t qp_idx; } mca_btl_openib_recv_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t); #define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f)) typedef struct mca_btl_openib_put_frag_t { mca_btl_openib_out_frag_t super; struct { mca_btl_base_rdma_completion_fn_t func; mca_btl_base_registration_handle_t *local_handle; void *context; void *data; } cb; } mca_btl_openib_put_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t); #define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f)) typedef struct mca_btl_openib_get_frag_t { mca_btl_openib_in_frag_t super; struct ibv_send_wr sr_desc; struct { mca_btl_base_rdma_completion_fn_t func; mca_btl_base_registration_handle_t *local_handle; void *context; void *data; } cb; } mca_btl_openib_get_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t); #define to_get_frag(f) ((mca_btl_openib_get_frag_t*)(f)) typedef struct mca_btl_openib_send_frag_t mca_btl_openib_send_control_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t); #define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t*)(f)) typedef struct mca_btl_openib_coalesced_frag_t { mca_btl_openib_frag_t super; mca_btl_openib_send_frag_t *send_frag; mca_btl_openib_header_coalesced_t *hdr; bool sent; } mca_btl_openib_coalesced_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t); #define to_coalesced_frag(f) ((mca_btl_openib_coalesced_frag_t*)(f)) /* * Allocate an IB send descriptor * */ static inline mca_btl_openib_send_control_frag_t * alloc_control_frag(mca_btl_openib_module_t *btl) { ompi_free_list_item_t *item; OMPI_FREE_LIST_WAIT_MT(&btl->device->send_free_control, item); return to_send_control_frag(item); } static inline uint8_t frag_size_to_order(mca_btl_openib_module_t* btl, size_t size) { int qp; for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) if(mca_btl_openib_component.qp_infos[qp].size >= size) return qp; return MCA_BTL_NO_ORDER; } static inline mca_btl_openib_com_frag_t *alloc_send_user_frag(void) { ompi_free_list_item_t *item; OMPI_FREE_LIST_GET_MT(&mca_btl_openib_component.send_user_free, item); return to_com_frag(item); } static inline mca_btl_openib_com_frag_t *alloc_recv_user_frag(void) { ompi_free_list_item_t *item; OMPI_FREE_LIST_GET_MT(&mca_btl_openib_component.recv_user_free, item); return to_com_frag(item); } static inline mca_btl_openib_coalesced_frag_t *alloc_coalesced_frag(void) { ompi_free_list_item_t *item; OMPI_FREE_LIST_GET_MT(&mca_btl_openib_component.send_free_coalesced, item); return to_coalesced_frag(item); } #define MCA_BTL_IB_FRAG_RETURN(frag) \ do { \ OMPI_FREE_LIST_RETURN_MT(to_base_frag(frag)->list, \ (ompi_free_list_item_t*)(frag)); \ } while(0); #define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(list) \ while(!opal_list_is_empty(list)){ \ opal_list_item_t *frag_item; \ frag_item = opal_list_remove_first(list); \ MCA_BTL_IB_FRAG_RETURN(frag_item); \ } \ struct mca_btl_openib_module_t; struct mca_btl_openib_frag_init_data_t { uint8_t order; ompi_free_list_t* list; }; typedef struct mca_btl_openib_frag_init_data_t mca_btl_openib_frag_init_data_t; void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx); END_C_DECLS #endif