1
1
openmpi/ompi/mca/bcol/iboffload/bcol_iboffload_endpoint.h
Pavel Shamis b89f8fabc9 Adding Hierarchical Collectives project to the Open MPI trunk.
The project includes following components and frameworks: 
- ML Collective component
- NETPATTERNS and COMMPATTERNS common components
- BCOL framework
- SBGP framework

Note: By default the ML collective component is disabled. In order to enable
new collectives user should bump up the priority of ml component (coll_ml_priority)

=============================================

Primary Contributors (in alphabetical order):

Ishai Rabinovich (Mellanox)
Joshua S. Ladd (ORNL / Mellanox)
Manjunath Gorentla Venkata (ORNL)
Mike Dubman (Mellanox)
Noam Bloch (Mellanox)
Pavel (Pasha) Shamis (ORNL / Mellanox)
Richard Graham (ORNL / Mellanox)
Vasily Filipov (Mellanox)

This commit was SVN r27078.
2012-08-16 19:11:35 +00:00

329 строки
11 KiB
C

/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_ENDPOINT_H
#define MCA_BCOL_IBOFFLOAD_ENDPOINT_H
#include "ompi_config.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h"
#define BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) (ep)->ibnet_proc->use_port[(cgroup)->index]
#define BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep) (BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) - 1)
BEGIN_C_DECLS
struct mca_bcol_iboffload_endpoint_qp_t {
struct ompi_common_ofacm_base_qp_t *qp;
size_t ib_inline_max;
int32_t sd_wqe; /* Number of available send wqe entries */
int32_t rd_wqe; /* Number of available recv wqe entries */
opal_list_t preposted_frags; /* List of preposted frags */
/* opal_mutex_t lock; */ /* Do I need lock here ? */
};
typedef struct mca_bcol_iboffload_endpoint_qp_t mca_bcol_iboffload_endpoint_qp_t;
enum {
IBOFFLOAD_CQ_SMALL_MESSAGES = 0,
IBOFFLOAD_CQ_SYNC,
IBOFFLOAD_CQ_LARGE_MESSAGES,
IBOFFLOAD_CQ_LAST
};
/* Endpoint object */
struct mca_bcol_iboffload_endpoint_t {
opal_list_item_t super;
/** BTL module that created this connection */
mca_bcol_iboffload_module_t *iboffload_module;
/** proc structure corresponding to endpoint */
mca_sbgp_ibnet_proc_t *ibnet_proc;
/** lock for concurrent access to endpoint state */
opal_mutex_t endpoint_lock;
/** Penging frag list */
opal_list_t pending_frags;
/** QPs information */
mca_bcol_iboffload_endpoint_qp_t *qps;
/** endpoint index on array */
int32_t index;
/** CQ for receive queues on this endpoint */
struct ibv_cq *recv_cq[IBOFFLOAD_CQ_LAST];
/** QP configuration information */
ompi_common_ofacm_base_qp_config_t qp_config;
/** cpc context */
ompi_common_ofacm_base_local_connection_context_t *cpc_context;
/** caching pointer to remote info */
ompi_common_ofacm_base_remote_connection_context_t *remote_info;
/** caching pointer to cpc */
ompi_common_ofacm_base_module_t *endpoint_cpc;
/** The struct is used for zero RDMA with immediate
in some collectives, in barrier for example. */
mca_bcol_iboffload_rdma_info_t remote_zero_rdma_addr;
mca_bcol_iboffload_rem_rdma_block_t remote_rdma_block;
/** The pointer to device - In the destruction function
the iboffload module may not exist any more - caching the device */
struct mca_bcol_iboffload_device_t *device;
bool need_toset_remote_rdma_info;
mca_bcol_iboffload_rdma_info_t remote_rdma_info[MAX_REMOTE_RDMA_INFO];
};
typedef struct mca_bcol_iboffload_endpoint_t mca_bcol_iboffload_endpoint_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_endpoint_t);
/* Function declaration */
int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t *ep);
static inline __opal_attribute_always_inline__
int check_endpoint_state(mca_bcol_iboffload_endpoint_t *ep,
mca_bcol_base_descriptor_t *des,
opal_list_t *pending_list)
{
int rc = OMPI_ERR_RESOURCE_BUSY;
OPAL_THREAD_LOCK(&ep->cpc_context->context_lock);
/* Adding here one more redirection in critical path. Need to think
* what is the best way to prevent it */
switch(ep->cpc_context->state) {
case MCA_COMMON_OFACM_CLOSED:
rc = ep->endpoint_cpc->cbm_start_connect(ep->cpc_context);
if (OMPI_SUCCESS == rc) {
rc = OMPI_ERR_RESOURCE_BUSY;
}
/*
* As long as we expect a message from the peer (in order
* to setup the connection) let the event engine pool the
* OOB events. Note: we increment it once peer active
* connection.
*/
opal_progress_event_users_increment();
/* fall through */
default:
/* opal_list_append(pending_list, (opal_list_item_t *)des); */ /* Vasily: will be uncomment later */
break;
case MCA_COMMON_OFACM_FAILED:
rc = OMPI_ERR_UNREACH;
break;
case MCA_COMMON_OFACM_CONNECTED:
rc = OMPI_SUCCESS;
break;
}
OPAL_THREAD_UNLOCK(&ep->cpc_context->context_lock);
return rc;
}
int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup,
mca_bcol_iboffload_module_t *module);
int mca_bcol_iboffload_endpoint_post_recvs(void *context);
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_prepost_recv(
mca_bcol_iboffload_endpoint_t *endpoint,
int qp_index, int num_to_prepost)
{
mca_bcol_iboffload_prepost_qps_fn_t prepost_recv =
mca_bcol_iboffload_component.qp_infos[qp_index].prepost_recv;
if (NULL != prepost_recv) {
return prepost_recv(endpoint, qp_index, num_to_prepost);
}
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_post_ml_scatter_recv_frag(
int qp_index, uint32_t dest_rank,
int nitems, struct iovec *buff_iovec,
uint32_t lkey,
struct ibv_sge *sg_entries,
mca_bcol_iboffload_frag_t *frag,
mca_bcol_iboffload_module_t *iboffload)
{
int ret, start_wr_index;
struct ibv_recv_wr *recv_wr, *recv_bad;
int i;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank];
mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;
IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d",
(void *) endpoint, qp_index));
/* make sure that we do not overrun number of rd_wqe */
if (0 >= endpoint->qps[qp_index].rd_wqe) {
IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d",
endpoint->qps[qp_index].rd_wqe));
return 0;
}
OPAL_THREAD_LOCK(&recv_wrs->lock);
/* Calculate start index in array
* of pre-allocated work requests */
start_wr_index = cm->qp_infos[qp_index].rd_num - 1;
recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];
IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, "
"start index of WRs - %d", (void *) endpoint,
qp_index, start_wr_index));
for (i = 0; i < nitems; i++) {
sg_entries[i].length = buff_iovec[i].iov_len;
sg_entries[i].addr = (uint64_t)buff_iovec[i].iov_base;
sg_entries[i].lkey = lkey;
IBOFFLOAD_VERBOSE(10, ("Recv SGE List item %d , length %d , address %p",
i, sg_entries[i].length, sg_entries[i].addr));
IBOFFLOAD_VERBOSE(10, ("Recv SGE List item %d , iovec length %d",
i, buff_iovec[i].iov_len));
}
recv_wr->num_sge = nitems;
recv_wr->sg_list = sg_entries;
/* Set the tail */
recv_wr->next = NULL;
/* post the list of recvs */
ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
if (OPAL_UNLIKELY(0 != ret)) {
IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
"qp_index - %d.\n",
ibv_get_device_name(device->dev.ib_dev),
strerror(errno), ret, qp_index));
return -1;
}
/* decresing numbers of free recv wqe */
--endpoint->qps[qp_index].rd_wqe;
OPAL_THREAD_UNLOCK(&recv_wrs->lock);
IBOFFLOAD_VERBOSE(10, ("Return success: "
"endpoint %p, qp_index %d, dest_rank %d",
endpoint, qp_index, dest_rank));
return 1;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_prepost_ml_recv_frag(
int qp_index, uint32_t dest_rank,
mca_bcol_iboffload_frag_t *frag,
mca_bcol_iboffload_module_t *iboffload)
{
int ret, start_wr_index;
struct ibv_recv_wr *recv_wr, *recv_bad;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank];
mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;
IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d",
(void *) endpoint, qp_index));
/* make sure that we do not overrun number of rd_wqe */
if (0 >= endpoint->qps[qp_index].rd_wqe) {
IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d",
endpoint->qps[qp_index].rd_wqe));
return 0;
}
OPAL_THREAD_LOCK(&recv_wrs->lock);
/* Calculate start index in array
* of pre-allocated work requests */
start_wr_index = cm->qp_infos[qp_index].rd_num - 1;
recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];
IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, "
"start index of WRs - %d", (void *) endpoint,
qp_index, start_wr_index));
recv_wr->sg_list = &frag->sg_entry;
/* Set the tail */
recv_wr->next = NULL;
/* post the list of recvs */
ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
if (OPAL_UNLIKELY(0 != ret)) {
IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
"qp_index - %d.\n",
ibv_get_device_name(device->dev.ib_dev),
strerror(errno), ret, qp_index));
return -1;
}
/* decresing numbers of free recv wqe */
--endpoint->qps[qp_index].rd_wqe;
OPAL_THREAD_UNLOCK(&recv_wrs->lock);
IBOFFLOAD_VERBOSE(10, ("Return success: "
"endpoint %p, qp_index %d, dest_rank %d",
endpoint, qp_index, dest_rank));
return 1;
}
static inline __opal_attribute_always_inline__
mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_preposted_recv_frag(
mca_bcol_iboffload_module_t *iboffload,
int source, int qp_index)
{
mca_bcol_iboffload_frag_t *frag;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source];
frag = mca_bcol_iboffload_component.qp_infos[qp_index].get_preposted_recv(endpoint, qp_index);
/* do we want to run prepost */
IBOFFLOAD_VERBOSE(10, ("source - %d, qp_index - %d; "
"allocating preposted addr %p.\n",
source, qp_index, (void *) frag->sg_entry.addr));
if (OPAL_LIKELY(NULL != frag)) {
frag->next = NULL;
}
return frag;
}
END_C_DECLS
#endif /* MCA_BCOL_IBOFFLOAD_ENDPOINT_H */