1
1
openmpi/ompi/mca/btl/openib/btl_openib_frag.h
Jeff Squyres c42ab8ea37 Fixes trac:1210, #1319
Commit from a long-standing Mercurial tree that ended up incorporating a lot of things:

 * A few fixes for CPC interface changes in all the CPCs
 * Attempts (but not yet finished) to fix shutdown problems in the IB CM CPC
 * #1319: add CTS support (i.e., initiator guarantees to send first message; automatically activated for iWARP over the RDMA CM CPC)
   * Some variable and function renamings to make this be generic (e.g., alloc_credit_frag became alloc_control_frag)
   * CPCs no longer post receive buffers; they only post a single receive buffer for the CTS if they use CTS. Instead, the main BTL now posts the main sets of receive buffers. 
   * CPCs allocate a CTS buffer only if they're about to make a connection
 * RDMA CM improvements:
   * Use threaded mode openib fd monitoring to wait for RDMA CM events
   * Synchronize endpoint finalization and disconnection between main thread and service thread to avoid/fix some race conditions
   * Converted several structs to be OBJs so that we can use reference counting to know when to invoke destructors
   * Make some new OBJ's have opal_list_item_t's as their base, thereby eliminating the need for the local list_item_t type
   * Renamed many variables to be internally consistent
   * Centralize the decision in an inline function as to whether this process or the remote process is supposed to be the initiator
   * Add oodles of OPAL_OUTPUT statements for debugging (hard-wired to output stream -1; to be activated by developers if they want/need them) 
   * Use rdma_create_qp() instead of ibv_create_qp()
 * openib fd monitoring improvements:
   * Renamed a bunch of functions and variables to be a little more obvious as to their true function
   * Use pipes to communicate between main thread and service thread
   * Add ability for main thread to invoke a function back on the service thread 
   * Ensure to set initiator_depth and responder_resources properly, by putting max_qp_rd_atom and max_qp_init_rd_atom in the modex (see rdma_connect(3))
   * Ensure to set the source IP address in rdma_resolve() to ensure that we select the correct OpenFabrics source port
   * Make new MCA param: openib_btl_connect_rdmacm_resolve_timeout
 * Other improvements:
   * btl_openib_device_type MCA param: can be "iw" or "ib" or "all" (or "infiniband" or "iwarp")
   * Somewhat improved error handling
   * Bunches of spelling fixes in comments, VERBOSE, and OUTPUT statements
   * Oodles of little coding style fixes
   * Changed shutdown ordering of btl; the device is now an OBJ with ref counting for destruction
   * Added some more show_help error messages
   * Change configury to only build IBCM / RDMACM if we have threads (because we need a progress thread) 

This commit was SVN r19686.

The following Trac tickets were found above:
  Ticket 1210 --> https://svn.open-mpi.org/trac/ompi/ticket/1210
2008-10-06 00:46:02 +00:00

350 строки
11 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_IB_FRAG_H
#define MCA_BTL_IB_FRAG_H
#include "ompi_config.h"
#include <infiniband/verbs.h>
#include "ompi/mca/btl/btl.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Forward declaration: only pointers to the registration type are used in
 * this header (see mca_btl_openib_com_frag_t.registration). */
struct mca_btl_openib_reg_t;
/**
 * Header structure referenced through the hdr/chdr members of the send and
 * receive fragment types declared in this file.
 *
 * Only the credits member is touched by the byte-order macros
 * BTL_OPENIB_HEADER_HTON / BTL_OPENIB_HEADER_NTOH; tag and cm_seen are left
 * as they are.
 */
struct mca_btl_openib_header_t {
mca_btl_base_tag_t tag;
uint8_t cm_seen; /* NOTE(review): semantics not visible in this header - confirm against the endpoint code */
uint16_t credits; /* 16-bit credit word; converted by BTL_OPENIB_HEADER_HTON/NTOH */
};
typedef struct mca_btl_openib_header_t mca_btl_openib_header_t;
/*
 * Helpers for a credit word whose bit 15 is used as a marker.
 *
 *   BTL_OPENIB_RDMA_CREDITS_FLAG   - the marker bit itself (bit 15).
 *   BTL_OPENIB_IS_RDMA_CREDITS(I)  - non-zero (the flag value, not 1) when
 *                                    the marker bit is set in I, else 0.
 *   BTL_OPENIB_CREDITS(I)          - I with the marker bit cleared.
 *
 * NOTE(review): presumably applied to mca_btl_openib_header_t.credits -
 * confirm against the callers.
 */
#define BTL_OPENIB_RDMA_CREDITS_FLAG    (1 << 15)
#define BTL_OPENIB_IS_RDMA_CREDITS(I)   ((I) & BTL_OPENIB_RDMA_CREDITS_FLAG)
#define BTL_OPENIB_CREDITS(I)           ((I) & ~BTL_OPENIB_RDMA_CREDITS_FLAG)
/*
 * Convert the credits member of h from host to network byte order, in
 * place.  h must be the structure itself (an lvalue), not a pointer to it;
 * it is evaluated twice.
 */
#define BTL_OPENIB_HEADER_HTON(h) \
do { \
(h).credits = htons((h).credits); \
} while (0)
/*
 * Inverse of BTL_OPENIB_HEADER_HTON: convert the credits member of h from
 * network to host byte order, in place.
 */
#define BTL_OPENIB_HEADER_NTOH(h) \
do { \
(h).credits = ntohs((h).credits); \
} while (0)
/*
 * Header structure referenced through the hdr member of
 * mca_btl_openib_coalesced_frag_t.
 *
 * NOTE(review): size vs. alloc_size presumably distinguish the payload
 * length from the space reserved for it - confirm against the coalescing
 * code.
 */
typedef struct mca_btl_openib_header_coalesced_t {
mca_btl_base_tag_t tag; /* not touched by the byte-order macros below */
uint32_t size;
uint32_t alloc_size;
} mca_btl_openib_header_coalesced_t;
/*
 * Convert the two 32-bit members (size, alloc_size) of h from network to
 * host byte order, in place.  h is the structure itself, not a pointer.
 */
#define BTL_OPENIB_HEADER_COALESCED_NTOH(h) \
do { \
(h).size = ntohl((h).size); \
(h).alloc_size = ntohl((h).alloc_size); \
} while(0)
/*
 * Convert the two 32-bit members (size, alloc_size) of h from host to
 * network byte order, in place.  h is the structure itself, not a pointer.
 */
#define BTL_OPENIB_HEADER_COALESCED_HTON(h) \
do { \
(h).size = htonl((h).size); \
(h).alloc_size = htonl((h).alloc_size); \
} while(0)
/*
 * Footer structure referenced through the ftr members of the send and
 * receive fragment types declared in this file.
 *
 * The seq member exists only when OMPI_ENABLE_DEBUG is non-zero, so the
 * size of this structure differs between debug and non-debug builds.
 */
struct mca_btl_openib_footer_t {
#if OMPI_ENABLE_DEBUG
uint32_t seq; /* debug builds only; converted by BTL_OPENIB_FOOTER_SEQ_HTON/NTOH */
#endif
union {
uint32_t size; /* the four bytes viewed as one 32-bit word */
uint8_t buf[4]; /* the same four bytes, individually addressable */
} u;
};
typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t;
/*
 * Exchange bytes 0 and 2 of the footer's u.buf array, in place; bytes 1 and
 * 3 are left untouched.  When WORDS_BIGENDIAN is defined the macro expands
 * to nothing.  ftr is the footer structure itself (not a pointer) and is
 * evaluated several times.
 *
 * NOTE(review): exchanging only bytes 0 and 2 reverses the first three
 * bytes and leaves u.buf[3] alone - presumably the size is a 24-bit
 * quantity and u.buf[3] is reserved for another purpose; confirm against
 * the users of the footer before "fixing" this into a full 32-bit swap.
 */
#ifndef WORDS_BIGENDIAN
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)            \
    do {                                                \
        uint8_t first_byte_ = (ftr).u.buf[0];           \
        (ftr).u.buf[0] = (ftr).u.buf[2];                \
        (ftr).u.buf[2] = first_byte_;                   \
    } while (0)
#else
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)
#endif
/*
 * Byte-order conversion of the footer's seq member.  The member only exists
 * when OMPI_ENABLE_DEBUG is non-zero, so in other builds both macros expand
 * to nothing.
 */
#if OMPI_ENABLE_DEBUG
#define BTL_OPENIB_FOOTER_SEQ_HTON(h) ((h).seq = htonl((h).seq))
#define BTL_OPENIB_FOOTER_SEQ_NTOH(h) ((h).seq = ntohl((h).seq))
#else
#define BTL_OPENIB_FOOTER_SEQ_HTON(h)
#define BTL_OPENIB_FOOTER_SEQ_NTOH(h)
#endif
/*
 * Convert a footer from host to network representation, in place: first the
 * (debug-only) seq member, then the size bytes via
 * MCA_BTL_OPENIB_FTR_SIZE_REVERSE.  h is the footer itself, not a pointer.
 */
#define BTL_OPENIB_FOOTER_HTON(h) \
do { \
BTL_OPENIB_FOOTER_SEQ_HTON(h); \
MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \
} while (0)
/*
 * Convert a footer from network to host representation, in place.  Applies
 * the NTOH seq conversion followed by the same size-byte exchange used by
 * BTL_OPENIB_FOOTER_HTON (the exchange is its own inverse).
 */
#define BTL_OPENIB_FOOTER_NTOH(h) \
do { \
BTL_OPENIB_FOOTER_SEQ_NTOH(h); \
MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \
} while (0)
/*
 * Control message type codes.
 * NOTE(review): presumably these are the values stored in
 * mca_btl_openib_control_header_t.type - confirm against the code that
 * builds and parses control messages.
 */
#define MCA_BTL_OPENIB_CONTROL_CREDITS 0
#define MCA_BTL_OPENIB_CONTROL_RDMA 1
#define MCA_BTL_OPENIB_CONTROL_COALESCED 2
#define MCA_BTL_OPENIB_CONTROL_CTS 3
/*
 * One-byte header embedded as the first member of the eager RDMA and RDMA
 * credits control headers declared in this file.
 */
struct mca_btl_openib_control_header_t {
uint8_t type;
};
typedef struct mca_btl_openib_control_header_t mca_btl_openib_control_header_t;
/*
 * Control header carrying an rkey and a start address (rdma_start).
 * NOTE(review): presumably this advertises an eager RDMA region to the
 * peer - confirm against the eager RDMA connect code.
 */
struct mca_btl_openib_eager_rdma_header_t {
mca_btl_openib_control_header_t control;
uint8_t padding[3]; /* 1-byte control header + 3 bytes = 4, ahead of the 32-bit rkey */
uint32_t rkey;
ompi_ptr_t rdma_start;
};
typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_header_t;
/*
 * Convert rkey (32-bit) and rdma_start.lval (via hton64) of h from host to
 * network byte order, in place.  The control byte and padding are not
 * touched.  h is the structure itself, not a pointer.
 */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON(h) \
do { \
(h).rkey = htonl((h).rkey); \
(h).rdma_start.lval = hton64((h).rdma_start.lval); \
} while (0)
/*
 * Inverse of BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON: network to host
 * byte order for rkey and rdma_start.lval, in place.
 */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h) \
do { \
(h).rkey = ntohl((h).rkey); \
(h).rdma_start.lval = ntoh64((h).rdma_start.lval); \
} while (0)
/*
 * Control header carrying a QP number byte (qpn) and a 16-bit RDMA credit
 * count.
 * NOTE(review): qpn is presumably an index into the component's QP table
 * rather than a hardware QP number (it is only 8 bits wide) - confirm.
 */
struct mca_btl_openib_rdma_credits_header_t {
mca_btl_openib_control_header_t control;
uint8_t qpn;
uint16_t rdma_credits; /* converted by BTL_OPENIB_RDMA_CREDITS_HEADER_HTON/NTOH */
};
typedef struct mca_btl_openib_rdma_credits_header_t mca_btl_openib_rdma_credits_header_t;
/*
 * Convert the rdma_credits member of h from host to network byte order, in
 * place.  The single-byte members are not touched.  h is the structure
 * itself, not a pointer.
 */
#define BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(h) \
do { \
(h).rdma_credits = htons((h).rdma_credits); \
} while (0)
/*
 * Inverse of BTL_OPENIB_RDMA_CREDITS_HEADER_HTON: network to host byte
 * order for rdma_credits, in place.
 */
#define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h) \
do { \
(h).rdma_credits = ntohs((h).rdma_credits); \
} while (0)
/*
 * Discriminator stored in mca_btl_openib_frag_t.type that identifies which
 * kind of fragment an object is.
 */
enum mca_btl_openib_frag_type_t {
MCA_BTL_OPENIB_FRAG_RECV,
MCA_BTL_OPENIB_FRAG_RECV_USER,
MCA_BTL_OPENIB_FRAG_SEND,
MCA_BTL_OPENIB_FRAG_SEND_USER,
MCA_BTL_OPENIB_FRAG_EAGER_RDMA,
MCA_BTL_OPENIB_FRAG_CONTROL,
MCA_BTL_OPENIB_FRAG_COALESCED
};
typedef enum mca_btl_openib_frag_type_t mca_btl_openib_frag_type_t;
/* Read the type tag of any fragment by viewing it as the base frag type
 * (to_base_frag is defined further down in this header). */
#define openib_frag_type(f) (to_base_frag(f)->type)
/**
 * Base openib fragment type.
 *
 * Every other fragment structure in this header embeds this one as its
 * first member (directly or through its own "super" member), which is what
 * allows to_base_frag() to reinterpret a pointer to any of them as a
 * pointer to this type.
 */
typedef struct mca_btl_openib_frag_t {
mca_btl_base_descriptor_t base;
mca_btl_base_segment_t segment;
mca_btl_openib_frag_type_t type; /* kind of fragment; read via openib_frag_type() */
ompi_free_list_t* list; /* free list the frag is handed back to by MCA_BTL_IB_FRAG_RETURN */
} mca_btl_openib_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t);
/* View any fragment pointer as a pointer to the base frag type (plain cast,
 * no checking). */
#define to_base_frag(f) ((mca_btl_openib_frag_t*)(f))
/*
 * Fragment used for communication: the base frag plus one verbs
 * scatter/gather entry, a pointer to a memory registration and a pointer to
 * an endpoint.
 */
typedef struct mca_btl_openib_com_frag_t {
mca_btl_openib_frag_t super;
struct ibv_sge sg_entry;
struct mca_btl_openib_reg_t *registration;
struct mca_btl_base_endpoint_t *endpoint;
} mca_btl_openib_com_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t);
/* View a fragment pointer as a com frag pointer (plain cast, no checking). */
#define to_com_frag(f) ((mca_btl_openib_com_frag_t*)(f))
/*
 * Outgoing fragment: a com frag extended with a verbs send work request
 * (sr_desc).
 */
typedef struct mca_btl_openib_out_frag_t {
mca_btl_openib_com_frag_t super;
struct ibv_send_wr sr_desc;
} mca_btl_openib_out_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_out_frag_t);
/* View a fragment pointer as an out frag pointer (plain cast, no checking). */
#define to_out_frag(f) ((mca_btl_openib_out_frag_t*)(f))
/*
 * Incoming fragment: an alias for mca_btl_openib_com_frag_t (same layout, no
 * additional members).
 */
typedef struct mca_btl_openib_com_frag_t mca_btl_openib_in_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_in_frag_t);
/* View a fragment pointer as an in frag pointer (plain cast, no checking). */
#define to_in_frag(f) ((mca_btl_openib_in_frag_t*)(f))
/*
 * Send fragment: an out frag plus pointers to its header(s) and footer, a
 * QP index, and bookkeeping for coalesced fragments.
 *
 * NOTE(review): the distinction between hdr and chdr is not visible in this
 * header - presumably chdr is the header used when the frag carries
 * coalesced messages; confirm against the send path.
 */
typedef struct mca_btl_openib_send_frag_t {
mca_btl_openib_out_frag_t super;
mca_btl_openib_header_t *hdr, *chdr;
mca_btl_openib_footer_t *ftr;
uint8_t qp_idx;
uint32_t coalesced_length;
opal_list_t coalesced_frags;
} mca_btl_openib_send_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t);
/* View a fragment pointer as a send frag pointer (plain cast, no checking). */
#define to_send_frag(f) ((mca_btl_openib_send_frag_t*)(f))
/*
 * Receive fragment: an in frag plus pointers to its header and footer, a
 * verbs receive work request (rd_desc) and a QP index.
 */
typedef struct mca_btl_openib_recv_frag_t {
mca_btl_openib_in_frag_t super;
mca_btl_openib_header_t *hdr;
mca_btl_openib_footer_t *ftr;
struct ibv_recv_wr rd_desc;
uint8_t qp_idx;
} mca_btl_openib_recv_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
/* View a fragment pointer as a recv frag pointer (plain cast, no checking). */
#define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
/*
 * Put fragment: an alias for mca_btl_openib_out_frag_t (same layout, no
 * additional members).
 */
typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
/* View a fragment pointer as a put frag pointer (plain cast, no checking). */
#define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
/*
 * Get fragment: built on the in frag type but, unlike a recv frag, carries a
 * send work request (sr_desc) rather than a receive work request.
 */
typedef struct mca_btl_openib_get_frag_t {
mca_btl_openib_in_frag_t super;
struct ibv_send_wr sr_desc;
} mca_btl_openib_get_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t);
/* View a fragment pointer as a get frag pointer (plain cast, no checking). */
#define to_get_frag(f) ((mca_btl_openib_get_frag_t*)(f))
/*
 * Control send fragment: an alias for mca_btl_openib_send_frag_t (same
 * layout, no additional members).  This is the type handed out by
 * alloc_control_frag().
 */
typedef struct mca_btl_openib_send_frag_t mca_btl_openib_send_control_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t);
/* View a fragment pointer as a send control frag pointer (plain cast, no
 * checking). */
#define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t*)(f))
/*
 * Coalesced fragment: derives directly from the base frag (not from the com
 * frag) and points at a send fragment and at a coalesced header.
 * NOTE(review): presumably send_frag is the send fragment this message has
 * been packed into - confirm against the coalescing code.
 */
typedef struct mca_btl_openib_coalesced_frag_t {
mca_btl_openib_frag_t super;
mca_btl_openib_send_frag_t *send_frag;
mca_btl_openib_header_coalesced_t *hdr;
} mca_btl_openib_coalesced_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t);
/* View a fragment pointer as a coalesced frag pointer (plain cast, no
 * checking). */
#define to_coalesced_frag(f) ((mca_btl_openib_coalesced_frag_t*)(f))
/**
 * Take a control send fragment from the device's send_free_control list.
 *
 * Unlike the other allocators in this header, which use
 * OMPI_FREE_LIST_GET, this one uses OMPI_FREE_LIST_WAIT.
 * NOTE(review): presumably WAIT blocks until an item becomes available -
 * confirm against the free list implementation.
 *
 * The status code produced by the free list macro is not inspected; the
 * caller only sees the item pointer.
 *
 * @param btl  module whose device owns the control free list
 * @return the item obtained from the list, viewed as a send control frag
 */
static inline mca_btl_openib_send_control_frag_t *
alloc_control_frag(mca_btl_openib_module_t *btl)
{
    ompi_free_list_item_t *entry;
    int status;

    OMPI_FREE_LIST_WAIT(&btl->device->send_free_control, entry, status);

    return to_send_control_frag(entry);
}
/**
 * Find the first entry of mca_btl_openib_component.qp_infos whose size is
 * at least the requested size.
 *
 * @param btl   not used by this function
 * @param size  minimum size required
 * @return index of the first matching entry (converted to uint8_t), or
 *         MCA_BTL_NO_ORDER when no entry is large enough
 */
static inline uint8_t frag_size_to_order(mca_btl_openib_module_t* btl,
        size_t size)
{
    int idx = 0;

    while (idx < mca_btl_openib_component.num_qps) {
        if (mca_btl_openib_component.qp_infos[idx].size >= size) {
            return idx;
        }
        ++idx;
    }

    return MCA_BTL_NO_ORDER;
}
/**
 * Take a fragment from the component-wide send_user_free list.
 *
 * The status code produced by OMPI_FREE_LIST_GET is not inspected.
 * NOTE(review): presumably the item is NULL when the list cannot supply
 * one, so callers should check the result - confirm against the free list
 * implementation.
 *
 * @return the item obtained from the list, viewed as a com frag
 */
static inline mca_btl_openib_com_frag_t *alloc_send_user_frag(void)
{
    ompi_free_list_item_t *entry;
    int status;

    OMPI_FREE_LIST_GET(&mca_btl_openib_component.send_user_free, entry, status);

    return to_com_frag(entry);
}
/**
 * Take a fragment from the component-wide recv_user_free list.
 *
 * The status code produced by OMPI_FREE_LIST_GET is not inspected.
 * NOTE(review): presumably the item is NULL when the list cannot supply
 * one, so callers should check the result - confirm against the free list
 * implementation.
 *
 * @return the item obtained from the list, viewed as a com frag
 */
static inline mca_btl_openib_com_frag_t *alloc_recv_user_frag(void)
{
    ompi_free_list_item_t *entry;
    int status;

    OMPI_FREE_LIST_GET(&mca_btl_openib_component.recv_user_free, entry, status);

    return to_com_frag(entry);
}
/**
 * Take a fragment from the component-wide send_free_coalesced list.
 *
 * The status code produced by OMPI_FREE_LIST_GET is not inspected.
 * NOTE(review): presumably the item is NULL when the list cannot supply
 * one, so callers should check the result - confirm against the free list
 * implementation.
 *
 * @return the item obtained from the list, viewed as a coalesced frag
 */
static inline mca_btl_openib_coalesced_frag_t *alloc_coalesced_frag(void)
{
    ompi_free_list_item_t *entry;
    int status;

    OMPI_FREE_LIST_GET(&mca_btl_openib_component.send_free_coalesced, entry, status);

    return to_coalesced_frag(entry);
}
/*
 * Hand a fragment back to the free list recorded in its base frag
 * (to_base_frag(frag)->list).
 *
 * frag may be a pointer to any fragment type; it is evaluated twice, so it
 * must not have side effects.
 *
 * The expansion is a single statement WITHOUT a trailing semicolon, so the
 * caller supplies the semicolon and the macro is safe in unbraced
 * if/else bodies:
 *
 *     if (cond)
 *         MCA_BTL_IB_FRAG_RETURN(frag);
 *     else
 *         ...
 */
#define MCA_BTL_IB_FRAG_RETURN(frag)                                    \
    do {                                                                \
        OMPI_FREE_LIST_RETURN(to_base_frag(frag)->list,                 \
                (ompi_free_list_item_t*)(frag));                        \
    } while (0)
/*
 * Drain a list of pending fragments: remove items from the front of the
 * list until it is empty, handing each one back to its free list with
 * MCA_BTL_IB_FRAG_RETURN.
 *
 * list is evaluated on every iteration, so it must not have side effects.
 * The expansion is a bare while loop; do not use it as the unbraced body of
 * an if that has an else.
 *
 * The last line of the macro must NOT end in a backslash: a trailing
 * continuation would splice whatever follows the macro (here, the forward
 * declaration below) into the macro body.
 */
#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(list)                        \
    while (!opal_list_is_empty(list)) {                                 \
        opal_list_item_t *frag_item;                                    \
        frag_item = opal_list_remove_first(list);                       \
        MCA_BTL_IB_FRAG_RETURN(frag_item);                              \
    }

struct mca_btl_openib_module_t;
/*
 * Pairs an order value with a free list pointer.
 * NOTE(review): presumably this is the context handed to
 * mca_btl_openib_frag_init() through its ctx argument, with list ending up
 * in mca_btl_openib_frag_t.list - confirm against btl_openib_frag.c.
 */
struct mca_btl_openib_frag_init_data_t {
uint8_t order;
ompi_free_list_t* list;
};
typedef struct mca_btl_openib_frag_init_data_t mca_btl_openib_frag_init_data_t;
/*
 * Fragment initialisation hook; defined outside this header.
 *
 * @param item  free list item to initialise
 * @param ctx   opaque context pointer
 *              NOTE(review): presumably a mca_btl_openib_frag_init_data_t* -
 *              confirm against the definition.
 */
void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif