/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef MCA_BTL_IB_ENDPOINT_H #define MCA_BTL_IB_ENDPOINT_H #include "opal/class/opal_list.h" #include "opal/event/event.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/btl/btl.h" #include "btl_openib.h" #include "btl_openib_frag.h" #include "btl_openib_eager_rdma.h" #include #include #include "ompi/mca/btl/base/btl_base_error.h" #include "connect/base.h" BEGIN_C_DECLS struct mca_btl_openib_frag_t; struct mca_btl_openib_proc_modex_t; /** * State of IB endpoint connection. */ typedef enum { /* Defines the state in which this BTL instance * has started the process of connection */ MCA_BTL_IB_CONNECTING, /* Waiting for ack from endpoint */ MCA_BTL_IB_CONNECT_ACK, /*Waiting for final connection ACK from endpoint */ MCA_BTL_IB_WAITING_ACK, /* Connected ... both sender & receiver have * buffers associated with this connection */ MCA_BTL_IB_CONNECTED, /* Connection is closed, there are no resources * associated with this */ MCA_BTL_IB_CLOSED, /* Maximum number of retries have been used. * Report failure on send to upper layer */ MCA_BTL_IB_FAILED } mca_btl_openib_endpoint_state_t; typedef struct mca_btl_openib_rem_qp_info_t { uint32_t rem_qp_num; /* Remote QP number */ uint32_t rem_psn; /* Remote processes port sequence number */ } mca_btl_openib_rem_qp_info_t; typedef struct mca_btl_openib_rem_srq_info_t { /* Remote SRQ number */ uint32_t rem_srq_num; } mca_btl_openib_rem_srq_info_t; typedef struct mca_btl_openib_rem_info_t { /* Local identifier of the remote process */ uint16_t rem_lid; /* subnet id of remote process */ uint64_t rem_subnet_id; /* MTU of remote process */ uint32_t rem_mtu; /* index of remote endpoint in endpoint array */ uint32_t rem_index; /* Remote QPs */ mca_btl_openib_rem_qp_info_t *rem_qps; /* Remote xrc_srq info, used only with XRC connections */ mca_btl_openib_rem_srq_info_t *rem_srqs; } mca_btl_openib_rem_info_t; /** * Agggregates all per peer qp info for an endpoint */ typedef struct mca_btl_openib_endpoint_pp_qp_t { int32_t sd_credits; /**< this rank's view of the credits * available for sending: * this is the credits granted by the * remote peer which has some relation to the * number of receive buffers posted remotely */ int32_t rd_posted; /**< number of descriptors posted to the nic*/ int32_t rd_credits; /**< number of credits to return to peer */ int32_t cm_received; /**< Credit messages received */ int32_t cm_return; /**< how may credits to return */ int32_t cm_sent; /**< Outstanding number of credit messages */ } mca_btl_openib_endpoint_pp_qp_t; /** * Aggregates all srq qp info for an endpoint */ typedef struct mca_btl_openib_endpoint_srq_qp_t { int32_t dummy; } mca_btl_openib_endpoint_srq_qp_t; typedef struct mca_btl_openib_qp_t { struct ibv_qp *lcl_qp; uint32_t lcl_psn; int32_t sd_wqe; /**< number of available send wqe entries */ opal_list_t pending_frags[2]; /**< put fragments here if there is no wqe available */ int users; opal_mutex_t lock; } mca_btl_openib_qp_t; typedef struct mca_btl_openib_endpoint_qp_t { mca_btl_openib_qp_t *qp; opal_list_t pending_frags[2]; /**< put fragment here if there is no credits available */ int32_t rd_credit_send_lock; /**< Lock credit send fragment */ mca_btl_openib_send_control_frag_t *credit_frag; size_t ib_inline_max; /**< max size of inline send*/ union { mca_btl_openib_endpoint_srq_qp_t srq_qp; mca_btl_openib_endpoint_pp_qp_t pp_qp; } u; } mca_btl_openib_endpoint_qp_t; /** * An abstraction that represents a connection to a endpoint process. * An instance of mca_btl_base_endpoint_t is associated w/ each process * and BTL pair at startup. However, connections to the endpoint * are established dynamically on an as-needed basis: */ struct mca_btl_base_endpoint_t { opal_list_item_t super; /** BTL module that created this connection */ struct mca_btl_openib_module_t* endpoint_btl; /** proc structure corresponding to endpoint */ struct mca_btl_openib_proc_t* endpoint_proc; /** local CPC to connect to this endpoint */ ompi_btl_openib_connect_base_module_t *endpoint_local_cpc; /** hook for local CPC to hang endpoint-specific data */ void *endpoint_local_cpc_data; /** pointer to remote CPC's data (essentially its CPC modex message) */ ompi_btl_openib_connect_base_module_data_t *endpoint_remote_cpc_data; /** current state of the connection */ mca_btl_openib_endpoint_state_t endpoint_state; /** number of connection retries attempted */ size_t endpoint_retries; /** timestamp of when the first connection was attempted */ double endpoint_tstamp; /** lock for concurrent access to endpoint state */ opal_mutex_t endpoint_lock; /** list of pending frags due to lazy connection establishment for this endpotint */ opal_list_t pending_lazy_frags; mca_btl_openib_endpoint_qp_t *qps; uint32_t xrc_recv_qp_num; /* in xrc we will use it as recv qp */ uint32_t xrc_recv_psn; /** list of pending rget ops */ opal_list_t pending_get_frags; /** list of pending rput ops */ opal_list_t pending_put_frags; /** number of available get tokens */ int32_t get_tokens; /** subnet id of this endpoint*/ uint64_t subnet_id; /** used only for xrc; pointer to struct that keeps remote port info */ struct ib_address_t *ib_addr; /** number of eager received */ int32_t eager_recv_count; /** info about remote RDMA buffer */ mca_btl_openib_eager_rdma_remote_t eager_rdma_remote; /** info about local RDMA buffer */ mca_btl_openib_eager_rdma_local_t eager_rdma_local; /** index of the endpoint in endpoints array */ int32_t index; /** does the endpoint require network byte ordering? */ bool nbo; /** use eager rdma for this peer? */ bool use_eager_rdma; /** information about the remote port */ mca_btl_openib_rem_info_t rem_info; }; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; typedef mca_btl_base_endpoint_t mca_btl_openib_endpoint_t; OBJ_CLASS_DECLARATION(mca_btl_openib_endpoint_t); static inline int32_t qp_get_wqe(mca_btl_openib_endpoint_t *ep, const int qp) { return OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe, -1); } static inline int32_t qp_put_wqe(mca_btl_openib_endpoint_t *ep, const int qp) { return OPAL_THREAD_ADD32(&ep->qps[qp].qp->sd_wqe, 1); } int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t*, mca_btl_openib_send_frag_t*); int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*, mca_btl_openib_send_frag_t*); void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int); void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*); int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t*); void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t*); void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*, mca_btl_base_endpoint_t*, ompi_btl_openib_connect_base_module_t *local_cpc, struct mca_btl_openib_proc_modex_t *remote_proc_info, ompi_btl_openib_connect_base_module_data_t *remote_cpc_data); /* * Invoke an error on the btl associated with an endpoint. If we * don't have an endpoint, then just use the first one on the * component list of BTLs. */ void *mca_btl_openib_endpoint_invoke_error(void *endpoint); static inline int post_recvs(mca_btl_base_endpoint_t *ep, const int qp, const int num_post) { int i, rc; struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL; mca_btl_openib_module_t *openib_btl = ep->endpoint_btl; if(0 == num_post) return OMPI_SUCCESS; for(i = 0; i < num_post; i++) { int rc; ompi_free_list_item_t* item; OMPI_FREE_LIST_WAIT(&openib_btl->device->qps[qp].recv_free, item, rc); to_base_frag(item)->base.order = qp; to_com_frag(item)->endpoint = ep; if(NULL == wr) wr = wr_list = &to_recv_frag(item)->rd_desc; else wr = wr->next = &to_recv_frag(item)->rd_desc; } wr->next = NULL; rc = ibv_post_recv(ep->qps[qp].qp->lcl_qp, wr_list, &bad_wr); if (0 == rc) return OMPI_SUCCESS; BTL_ERROR(("error %d posting receive on qp %d\n", rc, qp)); return OMPI_ERROR; } static inline int mca_btl_openib_endpoint_post_rr_nolock( mca_btl_base_endpoint_t *ep, const int qp) { int rd_rsv = mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num; int rd_low = mca_btl_openib_component.qp_infos[qp].rd_low; int cqp = mca_btl_openib_component.credits_qp, rc; int cm_received = 0, num_post = 0; assert(BTL_OPENIB_QP_TYPE_PP(qp)); if(ep->qps[qp].u.pp_qp.rd_posted <= rd_low) num_post = rd_num - ep->qps[qp].u.pp_qp.rd_posted; assert(num_post >= 0); if(ep->qps[qp].u.pp_qp.cm_received >= (rd_rsv >> 2)) cm_received = ep->qps[qp].u.pp_qp.cm_received; if((rc = post_recvs(ep, qp, num_post)) != OMPI_SUCCESS) { return rc; } OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.rd_posted, num_post); OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.rd_credits, num_post); /* post buffers for credit management on credit management qp */ if((rc = post_recvs(ep, cqp, cm_received)) != OMPI_SUCCESS) { return rc; } OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.cm_return, cm_received); OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.cm_received, -cm_received); assert(ep->qps[qp].u.pp_qp.rd_credits <= rd_num && ep->qps[qp].u.pp_qp.rd_credits >= 0); return OMPI_SUCCESS; } static inline int mca_btl_openib_endpoint_post_rr( mca_btl_base_endpoint_t *ep, const int qp) { int ret; OPAL_THREAD_LOCK(&ep->endpoint_lock); ret = mca_btl_openib_endpoint_post_rr_nolock(ep, qp); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return ret; } #define BTL_OPENIB_CREDITS_SEND_TRYLOCK(E, Q) \ OPAL_ATOMIC_CMPSET_32(&(E)->qps[(Q)].rd_credit_send_lock, 0, 1) #define BTL_OPENIB_CREDITS_SEND_UNLOCK(E, Q) \ OPAL_ATOMIC_CMPSET_32(&(E)->qps[(Q)].rd_credit_send_lock, 1, 0) static inline bool check_eager_rdma_credits(const mca_btl_openib_endpoint_t *ep) { return (ep->eager_rdma_local.credits > ep->eager_rdma_local.rd_win) ? true : false; } static inline bool check_send_credits(const mca_btl_openib_endpoint_t *ep, const int qp) { if(!BTL_OPENIB_QP_TYPE_PP(qp)) return false; return (ep->qps[qp].u.pp_qp.rd_credits >= mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win) ? true : false; } static inline void send_credits(mca_btl_openib_endpoint_t *ep, int qp) { if(BTL_OPENIB_QP_TYPE_PP(qp)) { if(check_send_credits(ep, qp)) goto try_send; } else { qp = mca_btl_openib_component.credits_qp; } if(!check_eager_rdma_credits(ep)) return; try_send: if(BTL_OPENIB_CREDITS_SEND_TRYLOCK(ep, qp)) mca_btl_openib_endpoint_send_credits(ep, qp); } static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep, mca_btl_base_descriptor_t *des, opal_list_t *pending_list) { int rc = ORTE_ERR_RESOURCE_BUSY; switch(ep->endpoint_state) { case MCA_BTL_IB_CLOSED: rc = ep->endpoint_local_cpc->cbm_start_connect(ep->endpoint_local_cpc, ep); if (OMPI_SUCCESS == rc) { rc = ORTE_ERR_RESOURCE_BUSY; } /* * As long as we expect a message from the peer (in order * to setup the connection) let the event engine pool the * OOB events. Note: we increment it once peer active * connection. */ opal_progress_event_users_increment(); /* fall through */ default: opal_list_append(pending_list, (opal_list_item_t *)des); break; case MCA_BTL_IB_FAILED: rc = OMPI_ERR_UNREACH; break; case MCA_BTL_IB_CONNECTED: rc = OMPI_SUCCESS; break; } return rc; } static inline __opal_attribute_always_inline__ int ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp) { return IBV_SEND_SIGNALED | ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0); } END_C_DECLS #endif