The project includes the following components and frameworks:
 - ML Collective component
 - NETPATTERNS and COMMPATTERNS common components
 - BCOL framework
 - SBGP framework

Note: by default the ML collective component is disabled. To enable the new
collectives the user should raise the priority of the ML component
(coll_ml_priority).

=============================================
Primary Contributors (in alphabetical order):

Ishai Rabinovich (Mellanox)
Joshua S. Ladd (ORNL / Mellanox)
Manjunath Gorentla Venkata (ORNL)
Mike Dubman (Mellanox)
Noam Bloch (Mellanox)
Pavel (Pasha) Shamis (ORNL / Mellanox)
Richard Graham (ORNL / Mellanox)
Vasily Filipov (Mellanox)

This commit was SVN r27078.
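For example, the priority can be raised on the command line through the usual
MCA parameter mechanism (the value 90 below is only illustrative):

    mpirun --mca coll_ml_priority 90 -np 4 ./my_mpi_app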
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory.  All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_ENDPOINT_H
#define MCA_BCOL_IBOFFLOAD_ENDPOINT_H

#include "ompi_config.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"

#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h"

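/* Map a connection group to the IB port this endpoint uses; ports are
 * numbered from 1, hence the "- 1" in the _IDX variant, which yields a
 * 0-based array index. */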
#define BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep)     (ep)->ibnet_proc->use_port[(cgroup)->index]
#define BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep) (BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) - 1)

BEGIN_C_DECLS

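/* Per-QP state kept on each endpoint: the connection-manager QP handle,
 * the maximum inline-data size, counters of free send/recv WQEs used to
 * throttle posting, and the list of preposted receive fragments. */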
struct mca_bcol_iboffload_endpoint_qp_t {
    struct ompi_common_ofacm_base_qp_t *qp;
    size_t ib_inline_max;
    int32_t sd_wqe;              /* Number of available send WQE entries */
    int32_t rd_wqe;              /* Number of available recv WQE entries */
    opal_list_t preposted_frags; /* List of preposted frags */
    /* opal_mutex_t lock; */     /* Do I need lock here ? */
};
typedef struct mca_bcol_iboffload_endpoint_qp_t mca_bcol_iboffload_endpoint_qp_t;

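/* Indices into the per-endpoint recv_cq[] array below; each traffic class
 * (small messages, sync, large messages) gets its own receive CQ. */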
enum {
    IBOFFLOAD_CQ_SMALL_MESSAGES = 0,
    IBOFFLOAD_CQ_SYNC,
    IBOFFLOAD_CQ_LARGE_MESSAGES,
    IBOFFLOAD_CQ_LAST
};

/* Endpoint object */
struct mca_bcol_iboffload_endpoint_t {
    opal_list_item_t super;

    /** BCOL module that created this connection */
    mca_bcol_iboffload_module_t *iboffload_module;

    /** proc structure corresponding to this endpoint */
    mca_sbgp_ibnet_proc_t *ibnet_proc;

    /** lock for concurrent access to endpoint state */
    opal_mutex_t endpoint_lock;

    /** Pending frag list */
    opal_list_t pending_frags;

    /** QPs information */
    mca_bcol_iboffload_endpoint_qp_t *qps;

    /** endpoint index in the endpoints array */
    int32_t index;

    /** CQs for the receive queues on this endpoint */
    struct ibv_cq *recv_cq[IBOFFLOAD_CQ_LAST];

    /** QP configuration information */
    ompi_common_ofacm_base_qp_config_t qp_config;

    /** CPC context */
    ompi_common_ofacm_base_local_connection_context_t *cpc_context;

    /** cached pointer to remote info */
    ompi_common_ofacm_base_remote_connection_context_t *remote_info;

    /** cached pointer to the CPC */
    ompi_common_ofacm_base_module_t *endpoint_cpc;

    /** Used for zero-size RDMA with immediate data in some
        collectives, for example in barrier. */
    mca_bcol_iboffload_rdma_info_t remote_zero_rdma_addr;
    mca_bcol_iboffload_rem_rdma_block_t remote_rdma_block;

    /** Pointer to the device; by the time the destructor runs the
        iboffload module may no longer exist, so the device is cached here. */
    struct mca_bcol_iboffload_device_t *device;

    bool need_toset_remote_rdma_info;

    mca_bcol_iboffload_rdma_info_t remote_rdma_info[MAX_REMOTE_RDMA_INFO];
};
typedef struct mca_bcol_iboffload_endpoint_t mca_bcol_iboffload_endpoint_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_endpoint_t);

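/*
 * Endpoints are OPAL objects. A minimal lifecycle sketch (illustrative only;
 * the real setup is done by mca_bcol_iboffloads_create_endpoints and
 * mca_bcol_iboffload_endpoint_init declared below):
 *
 *     mca_bcol_iboffload_endpoint_t *ep =
 *         OBJ_NEW(mca_bcol_iboffload_endpoint_t);
 *     ...
 *     OBJ_RELEASE(ep);
 */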
/* Function declaration */
int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t *ep);

static inline __opal_attribute_always_inline__
int check_endpoint_state(mca_bcol_iboffload_endpoint_t *ep,
                         mca_bcol_base_descriptor_t *des,
                         opal_list_t *pending_list)
{
    int rc = OMPI_ERR_RESOURCE_BUSY;

    OPAL_THREAD_LOCK(&ep->cpc_context->context_lock);
    /* This adds one more indirection on the critical path. Need to think
     * about the best way to avoid it. */
    switch (ep->cpc_context->state) {
        case MCA_COMMON_OFACM_CLOSED:
            rc = ep->endpoint_cpc->cbm_start_connect(ep->cpc_context);
            if (OMPI_SUCCESS == rc) {
                rc = OMPI_ERR_RESOURCE_BUSY;
            }
            /*
             * As long as we expect a message from the peer (in order
             * to set up the connection) let the event engine poll the
             * OOB events. Note: we increment this only once per active
             * peer connection.
             */
            opal_progress_event_users_increment();
            /* fall through */
        default:
            /* opal_list_append(pending_list, (opal_list_item_t *)des); */ /* Vasily: will be uncommented later */
            break;
        case MCA_COMMON_OFACM_FAILED:
            rc = OMPI_ERR_UNREACH;
            break;
        case MCA_COMMON_OFACM_CONNECTED:
            rc = OMPI_SUCCESS;
            break;
    }

    OPAL_THREAD_UNLOCK(&ep->cpc_context->context_lock);
    return rc;
}

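/*
 * Usage sketch (illustrative, not part of this header): a send path would
 * typically gate posting on the connection state and keep the descriptor
 * queued while the CPC is still bringing the connection up, e.g.
 *
 *     rc = check_endpoint_state(ep, des, &ep->pending_frags);
 *     if (OMPI_ERR_RESOURCE_BUSY == rc) {
 *         return OMPI_SUCCESS;          // connection in progress, retry later
 *     } else if (OMPI_SUCCESS != rc) {
 *         return rc;                    // e.g. OMPI_ERR_UNREACH
 *     }
 *     // connected: safe to post work requests on ep->qps[...]
 */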
int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup,
                                         mca_bcol_iboffload_module_t *module);

int mca_bcol_iboffload_endpoint_post_recvs(void *context);

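/* Dispatch a recv prepost to the per-QP-type callback registered in the
 * component's qp_infos[] table; QP types without a callback are a no-op. */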
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_prepost_recv(
                mca_bcol_iboffload_endpoint_t *endpoint,
                int qp_index, int num_to_prepost)
{
    mca_bcol_iboffload_prepost_qps_fn_t prepost_recv =
        mca_bcol_iboffload_component.qp_infos[qp_index].prepost_recv;

    if (NULL != prepost_recv) {
        return prepost_recv(endpoint, qp_index, num_to_prepost);
    }

    return OMPI_SUCCESS;
}

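/*
 * Usage sketch (illustrative): after a preposted fragment has been consumed,
 * a progress path would typically top the receive queue back up with
 *
 *     (void) mca_bcol_iboffload_prepost_recv(endpoint, qp_index, 1);
 */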
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_post_ml_scatter_recv_frag(
                int qp_index, uint32_t dest_rank,
                int nitems, struct iovec *buff_iovec,
                uint32_t lkey,
                struct ibv_sge *sg_entries,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_module_t *iboffload)
{
    int ret, start_wr_index;
    struct ibv_recv_wr *recv_wr, *recv_bad;
    int i;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank];

    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
    mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d",
                          (void *) endpoint, qp_index));

    /* make sure that we do not overrun the number of rd_wqe */
    if (0 >= endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d",
                               endpoint->qps[qp_index].rd_wqe));

        return 0;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* Calculate the start index in the array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - 1;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, "
                           "start index of WRs - %d", (void *) endpoint,
                            qp_index, start_wr_index));

    for (i = 0; i < nitems; i++) {
        sg_entries[i].length = buff_iovec[i].iov_len;
        sg_entries[i].addr = (uint64_t) (uintptr_t) buff_iovec[i].iov_base;
        sg_entries[i].lkey = lkey;

        IBOFFLOAD_VERBOSE(10, ("Recv SGE list item %d, length %d, address %p",
                    i, sg_entries[i].length, (void *) (uintptr_t) sg_entries[i].addr));

        IBOFFLOAD_VERBOSE(10, ("Recv SGE list item %d, iovec length %d",
                    i, (int) buff_iovec[i].iov_len));
    }

    recv_wr->num_sge = nitems;
    recv_wr->sg_list = sg_entries;

    /* Set the tail */
    recv_wr->next = NULL;

    /* post the list of recvs */
    ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
    if (OPAL_UNLIKELY(0 != ret)) {
        IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
                         "qp_index - %d.\n",
                         ibv_get_device_name(device->dev.ib_dev),
                         strerror(ret), ret, qp_index));

        /* drop the lock taken above before bailing out */
        OPAL_THREAD_UNLOCK(&recv_wrs->lock);
        return -1;
    }

    /* decrease the number of free recv WQEs */
    --endpoint->qps[qp_index].rd_wqe;

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Return success: "
                           "endpoint %p, qp_index %d, dest_rank %d",
                           (void *) endpoint, qp_index, dest_rank));

    return 1;
}

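/*
 * Usage sketch (illustrative only; buffer names, lengths and the memory
 * region are hypothetical): to scatter an incoming fragment into two user
 * buffers, fill one iovec per destination region, pass an SGE array of the
 * same length together with the lkey of the registered memory, and post:
 *
 *     struct iovec iov[2] = {
 *         { .iov_base = recv_buf_a, .iov_len = len_a },
 *         { .iov_base = recv_buf_b, .iov_len = len_b },
 *     };
 *     struct ibv_sge sges[2];
 *     ret = mca_bcol_iboffload_post_ml_scatter_recv_frag(
 *               qp_index, dest_rank, 2, iov, mr->lkey,
 *               sges, frag, iboffload);
 *     // ret: 1 on success, 0 if no recv WQE was available, -1 on error
 *
 * The single-fragment variant below follows the same pattern but posts the
 * fragment's own sg_entry instead of a caller-supplied SGE list.
 */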
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_prepost_ml_recv_frag(
                int qp_index, uint32_t dest_rank,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_module_t *iboffload)
{
    int ret, start_wr_index;
    struct ibv_recv_wr *recv_wr, *recv_bad;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank];

    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
    mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d",
                          (void *) endpoint, qp_index));

    /* make sure that we do not overrun the number of rd_wqe */
    if (0 >= endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d",
                               endpoint->qps[qp_index].rd_wqe));

        return 0;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* Calculate the start index in the array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - 1;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, "
                           "start index of WRs - %d", (void *) endpoint,
                            qp_index, start_wr_index));

    recv_wr->sg_list = &frag->sg_entry;

    /* Set the tail */
    recv_wr->next = NULL;

    /* post the list of recvs */
    ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
    if (OPAL_UNLIKELY(0 != ret)) {
        IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
                         "qp_index - %d.\n",
                         ibv_get_device_name(device->dev.ib_dev),
                         strerror(ret), ret, qp_index));

        /* drop the lock taken above before bailing out */
        OPAL_THREAD_UNLOCK(&recv_wrs->lock);
        return -1;
    }

    /* decrease the number of free recv WQEs */
    --endpoint->qps[qp_index].rd_wqe;

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Return success: "
                           "endpoint %p, qp_index %d, dest_rank %d",
                           (void *) endpoint, qp_index, dest_rank));

    return 1;
}

static inline __opal_attribute_always_inline__
mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_preposted_recv_frag(
                mca_bcol_iboffload_module_t *iboffload,
                int source, int qp_index)
{
    mca_bcol_iboffload_frag_t *frag;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source];

    frag = mca_bcol_iboffload_component.qp_infos[qp_index].get_preposted_recv(endpoint, qp_index);

    /* do we want to run prepost here? */
    if (OPAL_LIKELY(NULL != frag)) {
        IBOFFLOAD_VERBOSE(10, ("source - %d, qp_index - %d; "
                               "allocating preposted addr %p.\n",
                               source, qp_index,
                               (void *) (uintptr_t) frag->sg_entry.addr));

        frag->next = NULL;
    }

    return frag;
}

END_C_DECLS

#endif /* MCA_BCOL_IBOFFLOAD_ENDPOINT_H */