50bae9c603
/tmp/jms-modular-wireup branch): * This commit moves all the openib BTL connection code out of btl_openib_endpoint.c and into a connect "pseudo-component" area, meaning that different schemes for establishing OFA connections can be chosen via function pointer (i.e., MCA parameter) at run-time. * The connect/connect.h file includes comments describing the specific interface for the connect pseudo-component. * Two pseudo-components are in this commit (more can certainly be added). * oob: uses the same old oob/rml scheme for creating OFA connections that we've had forever; this now just puts the logic into this self-contained pseudo-component. * rdma_cm: a currently-empty set of functions (that currently return NOT_IMPLEMENTED) that will someday use the RDMA connection manager to make OFA connections. This commit was SVN r15786.
304 строки
11 KiB
C
304 строки
11 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#ifndef MCA_BTL_IB_ENDPOINT_H
|
|
#define MCA_BTL_IB_ENDPOINT_H
|
|
|
|
#include "opal/class/opal_list.h"
|
|
#include "opal/event/event.h"
|
|
#include "ompi/mca/pml/pml.h"
|
|
#include "ompi/mca/btl/btl.h"
|
|
#include "btl_openib_frag.h"
|
|
#include "btl_openib.h"
|
|
#include "btl_openib_eager_rdma.h"
|
|
#include <errno.h>
|
|
#include <string.h>
|
|
#include "ompi/mca/btl/base/btl_base_error.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
struct mca_btl_openib_frag_t;
|
|
|
|
/**
 * Connection state of an IB endpoint.
 *
 * The enumerators keep their original order, so their numeric values
 * (0 through 5) are unchanged.
 */
typedef enum {
    /* This BTL instance has started the process of connecting */
    MCA_BTL_IB_CONNECTING = 0,

    /* Waiting for an ack from the endpoint */
    MCA_BTL_IB_CONNECT_ACK,

    /* Waiting for the final connection ACK from the endpoint */
    MCA_BTL_IB_WAITING_ACK,

    /* Connected: both sender and receiver have buffers associated
     * with this connection */
    MCA_BTL_IB_CONNECTED,

    /* Connection is closed; no resources are associated with it */
    MCA_BTL_IB_CLOSED,

    /* The maximum number of retries has been used; report failure
     * on send to the upper layer */
    MCA_BTL_IB_FAILED
} mca_btl_openib_endpoint_state_t;
|
|
|
|
/**
 * Per-QP information describing the remote side of a connection.
 */
struct mca_btl_openib_rem_qp_info_t {
    uint32_t rem_qp_num;  /**< remote QP number */
    uint32_t rem_psn;     /**< remote process's port sequence number */
};
typedef struct mca_btl_openib_rem_qp_info_t mca_btl_openib_rem_qp_info_t;
|
|
|
|
struct mca_btl_openib_rem_info_t {
|
|
uint16_t rem_lid;
|
|
/* Local identifier of the remote process */
|
|
uint64_t rem_subnet_id;
|
|
/* subnet id of remote process */
|
|
uint32_t rem_mtu;
|
|
/* MTU of remote process */
|
|
uint32_t rem_index;
|
|
/* index of remote endpoint in endpoint array */
|
|
mca_btl_openib_rem_qp_info_t *rem_qps;
|
|
}; typedef struct mca_btl_openib_rem_info_t mca_btl_openib_rem_info_t;
|
|
|
|
|
|
/**
 * Aggregates all per-peer (PP) QP info for an endpoint.
 */
struct mca_btl_openib_endpoint_pp_qp_t {
    /** This rank's view of the credits available for sending: the
     *  credits granted by the remote peer, which has some relation to
     *  the number of receive buffers posted remotely. */
    int32_t sd_credits;

    int32_t rd_posted;    /**< number of descriptors posted to the NIC */
    int32_t rd_credits;   /**< number of credits to return to the peer */
    int32_t cm_received;  /**< credit messages received */
    int32_t cm_return;    /**< how many credits to return */
    int32_t cm_sent;      /**< outstanding number of credit messages */
};
typedef struct mca_btl_openib_endpoint_pp_qp_t mca_btl_openib_endpoint_pp_qp_t;
|
|
|
|
|
|
/**
 * Aggregates all SRQ QP info for an endpoint.
 */
struct mca_btl_openib_endpoint_srq_qp_t {
    /* The only member of this struct.
     * NOTE(review): the name suggests it is a placeholder -- confirm. */
    int32_t dummy;
};
typedef struct mca_btl_openib_endpoint_srq_qp_t mca_btl_openib_endpoint_srq_qp_t;
|
|
|
|
|
|
struct mca_btl_openib_endpoint_qp_t {
|
|
struct ibv_qp* lcl_qp; /* Local QP (Low and High) */
|
|
struct ibv_qp_attr* lcl_qp_attr;
|
|
/* Local QP attrnibutes (Low and High) */
|
|
uint32_t lcl_psn;
|
|
int32_t sd_wqe; /**< number of available send wqe entries */
|
|
int qp_type;
|
|
opal_list_t pending_frags; /**< put fragments here if there
|
|
is no wqe available or, in
|
|
case of PP QP, if there is
|
|
no credit available */
|
|
int32_t rd_pending_credit_chks; /**< number of outstanding return credit requests */
|
|
struct mca_btl_openib_frag_t *credit_frag;
|
|
union {
|
|
mca_btl_openib_endpoint_srq_qp_t srq_qp;
|
|
mca_btl_openib_endpoint_pp_qp_t pp_qp;
|
|
} u;
|
|
|
|
}; typedef struct mca_btl_openib_endpoint_qp_t mca_btl_openib_endpoint_qp_t;
|
|
|
|
|
|
/**
|
|
* An abstraction that represents a connection to a endpoint process.
|
|
* An instance of mca_btl_base_endpoint_t is associated w/ each process
|
|
* and BTL pair at startup. However, connections to the endpoint
|
|
* are established dynamically on an as-needed basis:
|
|
*/
|
|
|
|
struct mca_btl_base_endpoint_t {
|
|
opal_list_item_t super;
|
|
|
|
struct mca_btl_openib_module_t* endpoint_btl;
|
|
/**< BTL instance that created this connection */
|
|
|
|
struct mca_btl_openib_proc_t* endpoint_proc;
|
|
/**< proc structure corresponding to endpoint */
|
|
|
|
mca_btl_openib_endpoint_state_t endpoint_state;
|
|
/**< current state of the connection */
|
|
|
|
size_t endpoint_retries;
|
|
/**< number of connection retries attempted */
|
|
|
|
double endpoint_tstamp;
|
|
/**< timestamp of when the first connection was attempted */
|
|
|
|
opal_mutex_t endpoint_lock;
|
|
/**< lock for concurrent access to endpoint state */
|
|
|
|
opal_list_t pending_lazy_frags;
|
|
/**< list of pending frags due to lazy connection establishment
|
|
* for this endpotint
|
|
*/
|
|
|
|
mca_btl_openib_endpoint_qp_t * qps;
|
|
|
|
opal_list_t pending_get_frags; /**< list of pending rget ops */
|
|
opal_list_t pending_put_frags; /**< list of pending rput ops */
|
|
|
|
|
|
|
|
|
|
/* Local processes port sequence number (Low and High) */
|
|
|
|
|
|
int32_t get_tokens; /**< number of available get tokens */
|
|
|
|
|
|
uint64_t subnet_id; /**< subnet id of this endpoint*/
|
|
|
|
int32_t eager_recv_count; /**< number of eager received */
|
|
mca_btl_openib_eager_rdma_remote_t eager_rdma_remote;
|
|
/**< info about remote RDMA buffer */
|
|
mca_btl_openib_eager_rdma_local_t eager_rdma_local;
|
|
/**< info about local RDMA buffer */
|
|
uint32_t index; /**< index of the endpoint in endpoints array */
|
|
|
|
/**< frags for sending explicit high priority credits */
|
|
bool nbo; /**< does the endpoint require network byte ordering? */
|
|
bool use_eager_rdma; /**< use eager rdma for this peer? */
|
|
|
|
mca_btl_openib_rem_info_t rem_info;
|
|
};
|
|
|
|
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;

/* The openib endpoint type is an alias of the base endpoint type. */
typedef mca_btl_base_endpoint_t mca_btl_openib_endpoint_t;

OBJ_CLASS_DECLARATION(mca_btl_openib_endpoint_t);

/*
 * Endpoint functions declared here and defined outside this header.
 * NOTE(review): the descriptions below are inferred from the names
 * and signatures only -- confirm against the definitions.
 */

/* Presumably sends the fragment `frag` to `endpoint`. */
int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* endpoint,
                                 struct mca_btl_openib_frag_t* frag);

/* Presumably sends credits to the endpoint; the meaning of the int
 * argument is not visible here. */
void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int);

void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*);

int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint);

void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint);
|
|
|
|
|
|
|
|
/**
 * Post receive descriptors on one per-peer (PP) QP of an endpoint.
 *
 * While holding the BTL's ib_lock, the function reads the QP's
 * cm_received and rd_posted counters.  Nothing is posted unless
 * cm_received has reached a quarter of the configured rd_rsv, or
 * rd_posted has dropped to rd_low or below.  Otherwise it takes
 * (rd_num - rd_posted) + cm_received fragments from the BTL's
 * per-QP recv_free list and posts each one with ibv_post_recv(),
 * then updates rd_posted/rd_credits and cm_return/cm_received.
 *
 * @param endpoint   endpoint whose QP is refilled
 * @param additional not used by this function
 * @param qp         index of the QP; must be a PP QP (asserted)
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR if ibv_post_recv() fails.
 *
 * NOTE(review): on the error path the function returns before the
 * counters are updated, so receives already posted during this call
 * are not reflected in rd_posted/rd_credits, and the fragment that
 * failed to post is not handed back to the free list here.
 */
static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpoint,
                                                  const int additional,
                                                  const int qp)
{
    mca_btl_openib_module_t *btl = endpoint->endpoint_btl;
    const int reserved = mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
    const int max_posted = mca_btl_openib_component.qp_infos[qp].rd_num;
    int credit_msgs, posted, low_mark;
    int rc;
    int32_t idx, refill, total;
    struct ibv_recv_wr *bad_wr;
    ompi_free_list_t *recv_list;

    (void)additional;

    assert(MCA_BTL_OPENIB_PP_QP == endpoint->qps[qp].qp_type);

    OPAL_THREAD_LOCK(&btl->ib_lock);
    credit_msgs = endpoint->qps[qp].u.pp_qp.cm_received;
    posted = endpoint->qps[qp].u.pp_qp.rd_posted;
    low_mark = mca_btl_openib_component.qp_infos[qp].rd_low;

    /* Neither threshold reached: nothing to post. */
    if (credit_msgs < (reserved >> 2) && posted > low_mark) {
        OPAL_THREAD_UNLOCK(&btl->ib_lock);
        return OMPI_SUCCESS;
    }

    refill = max_posted - posted;
    total = refill + credit_msgs;
    recv_list = &btl->qps[qp].recv_free;

    for (idx = 0; idx < total; idx++) {
        ompi_free_list_item_t *item;
        mca_btl_openib_frag_t *frag;

        OMPI_FREE_LIST_WAIT(recv_list, item, rc);
        frag = (mca_btl_openib_frag_t *)item;
        frag->endpoint = endpoint;
        frag->base.order = qp;

        if (ibv_post_recv(endpoint->qps[qp].lcl_qp,
                          &frag->wr_desc.rd_desc,
                          &bad_wr)) {
            BTL_ERROR(("error posting receive errno says %s\n",
                       strerror(errno)));
            OPAL_THREAD_UNLOCK(&btl->ib_lock);
            return OMPI_ERROR;
        }
    }

    /* The first `refill` receives count as newly posted descriptors. */
    if (refill > 0) {
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, refill);
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, refill);
    }

    /* The remaining receives move cm_received over to cm_return. */
    if (credit_msgs > 0) {
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return,
                          credit_msgs);
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received,
                          -credit_msgs);
    }

    assert(endpoint->qps[qp].u.pp_qp.rd_credits <= max_posted);
    assert(endpoint->qps[qp].u.pp_qp.rd_credits >= 0);

    OPAL_THREAD_UNLOCK(&btl->ib_lock);
    return OMPI_SUCCESS;
}
|
|
|
|
/**
 * Post receive descriptors on every per-peer (PP) QP of an endpoint.
 *
 * Walks all configured QPs and calls mca_btl_openib_endpoint_post_rr()
 * for each one whose configured type is MCA_BTL_OPENIB_PP_QP.  Every
 * PP QP is attempted even if an earlier one fails.
 *
 * @param endpoint   endpoint whose PP QPs are refilled
 * @param additional passed through unchanged to
 *                   mca_btl_openib_endpoint_post_rr()
 *
 * @return OMPI_SUCCESS if every call succeeded; otherwise the return
 *         code of the first call that failed.  (Previously the
 *         per-QP return codes were discarded and OMPI_SUCCESS was
 *         always returned, hiding posting failures from callers.)
 */
static inline int mca_btl_openib_endpoint_post_rr_all(mca_btl_base_endpoint_t *endpoint,
                                                      const int additional)
{
    int qp;
    int status = OMPI_SUCCESS;

    for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
        int rc;

        if (MCA_BTL_OPENIB_PP_QP != mca_btl_openib_component.qp_infos[qp].type) {
            continue;
        }

        rc = mca_btl_openib_endpoint_post_rr(endpoint, additional, qp);

        /* Remember only the first failure, but keep servicing the
         * remaining QPs. */
        if (OMPI_SUCCESS != rc && OMPI_SUCCESS == status) {
            status = rc;
        }
    }

    return status;
}
|
|
|
|
static inline int btl_openib_check_send_credits(
|
|
mca_btl_openib_endpoint_t *endpoint, const int qp)
|
|
{
|
|
if(BTL_OPENIB_EAGER_RDMA_QP(qp)) {
|
|
if(endpoint->eager_rdma_local.credits > endpoint->eager_rdma_local.rd_win) {
|
|
return OPAL_THREAD_ADD32(&endpoint->qps[qp].rd_pending_credit_chks, 1) == 1;
|
|
}
|
|
}
|
|
|
|
if(MCA_BTL_OPENIB_PP_QP != mca_btl_openib_component.qp_infos[qp].type)
|
|
return 0;
|
|
|
|
if(endpoint->qps[qp].u.pp_qp.rd_credits >=
|
|
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win) {
|
|
return OPAL_THREAD_ADD32(&endpoint->qps[qp].rd_pending_credit_chks, 1) == 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
END_C_DECLS
|
|
|
|
#endif
|