/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory.  All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies.  All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_TASK_H
#define MCA_BCOL_IBOFFLOAD_TASK_H

#include "ompi_config.h"

#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>
#include <infiniband/mqe.h>

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_endpoint.h"
#include "bcol_iboffload_collfrag.h"

#define SENDWR(task) ((task)->element.post.send_wr)

BEGIN_C_DECLS

/* the mca_bcol_ibv_mwr_task_t name was replaced with mca_bcol_iboffload_task_t */
struct mca_bcol_iboffload_task_t {
    ompi_free_list_item_t super;

    /* pointer to the memory descriptor associated with the task */
    mca_bcol_iboffload_frag_t *frag;

    /* pointer to the bcol descriptor; we need it for send tasks
     * only, because we complete them in an asynchronous manner
     */
    mca_bcol_iboffload_collfrag_t *collfrag;

    /* task to be posted */
    struct mqe_task element;

    /* array of ibv_sge structs; in the CALC case,
     * for example, it has two entries
     */
    struct ibv_sge *sg_entries;

    /* sg_entries array length */
    int sg_entries_num;

    /* Each task is a member of some free list;
       if this pointer is NULL, we assume the task
       is a member of the common task list (tasks_free) */
    ompi_free_list_t *task_list;

    /* Pointer to the next task */
    struct mca_bcol_iboffload_task_t *next_task;

    /* pasha: this is a crude workaround for the driver interface;
     * send_wr and recv_wr should be part of mqe_task, not pointers!
     */
    union {
        struct ibv_m_send_wr send_wr;
        struct ibv_recv_wr recv_wr;
    } wr;

    /* In case we decide to post the task to a different QP */
    struct mqe_qp_entry task_mqe_qp_entry;

    /* Pointer to the endpoint for this task */
    mca_bcol_iboffload_endpoint_t *endpoint;
};
typedef struct mca_bcol_iboffload_task_t mca_bcol_iboffload_task_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_task_t);
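
/*
 * Illustrative lifecycle sketch (an editorial addition, not part of the
 * original code): a task is pulled from a free list, filled in, chained via
 * next_task, posted through the MQE interface, and eventually returned to
 * task_list (or to the common tasks_free list when task_list is NULL).
 * Error handling is elided; `cm` stands for &mca_bcol_iboffload_component.
 *
 *     ompi_free_list_item_t *item;
 *     mca_bcol_iboffload_task_t *task;
 *
 *     OMPI_FREE_LIST_GET_MT(&cm->tasks_free, item);
 *     task = (mca_bcol_iboffload_task_t *) item;
 *     task->next_task = NULL;
 *     task->element.wr_id = (uint64_t) (uintptr_t) task;
 *     ...
 *     OMPI_FREE_LIST_RETURN_MT(&cm->tasks_free, item);
 */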

/* calc_tasks_free free list init function */
void
mca_bcol_iboffload_calc_task_init(ompi_free_list_item_t* item, void* ctx);

/* iovec_tasks_free free list init function */
void
mca_bcol_iboffload_iovec_task_init(ompi_free_list_item_t* item, void* ctx);

static inline __opal_attribute_always_inline__ void
mca_bcol_iboffload_return_frag_tolist(
                mca_bcol_iboffload_frag_t *frag,
                ompi_free_list_t *list)
{
    if (NULL != frag) {
        mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
        assert(MCA_BCOL_IBOFFLOAD_NONE_OWNER != frag->type);

        if (MCA_BCOL_IBOFFLOAD_DUMMY_OWNER != frag->type &&
            0 == frag->ref_counter) {
            if (MCA_BCOL_IBOFFLOAD_BCOL_OWNER == frag->type) {
                OMPI_FREE_LIST_RETURN_MT((&(list[frag->qp_index])),
                                         (ompi_free_list_item_t*) frag);
            } else if (MCA_BCOL_IBOFFLOAD_ML_OWNER == frag->type) {
                OMPI_FREE_LIST_RETURN_MT((&(cm->ml_frags_free)),
                                         (ompi_free_list_item_t*) frag);
            }
        }
    }
}
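
/*
 * Editorial note on the routing above (added for clarity): a fragment is
 * returned only once its ref_counter has dropped to zero. BCOL-owned
 * fragments go back to the caller-supplied per-QP free-list array `list`,
 * indexed by frag->qp_index; ML-owned fragments always go back to the
 * component-wide ml_frags_free list; dummy fragments are never returned.
 */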

static inline __opal_attribute_always_inline__ void
mca_bcol_iboffload_return_recv_frags_toendpoint(
                mca_bcol_iboffload_frag_t *frags,
                mca_bcol_iboffload_endpoint_t *ep,
                int qp_index)
{
    mca_bcol_iboffload_frag_t *recv_frag = frags;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    while (NULL != recv_frag) {
        assert(MCA_BCOL_IBOFFLOAD_NONE_OWNER != recv_frag->type);
        if (MCA_BCOL_IBOFFLOAD_ML_OWNER != recv_frag->type) {
            opal_list_prepend(&ep->qps[qp_index].preposted_frags,
                              (opal_list_item_t *) recv_frag);
        } else {
            OMPI_FREE_LIST_RETURN_MT((&(cm->ml_frags_free)),
                                     (ompi_free_list_item_t*) recv_frag);
        }

        recv_frag = recv_frag->next;
    }
}

/* Wait task allocation and initialization */
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_wait_task(mca_bcol_iboffload_module_t *iboffload,
                                 uint32_t source, int num_waits,
                                 mca_bcol_iboffload_frag_t *frags,
                                 int qp_index, struct ibv_qp *qp)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_task_t *task;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source];

    /* get a task item from the common task free list;
       on failure, return the recv fragments to the endpoint */
    OMPI_FREE_LIST_GET_MT(&cm->tasks_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        mca_bcol_iboffload_return_recv_frags_toendpoint(frags, endpoint, qp_index);
        return NULL;
    }

    task = (mca_bcol_iboffload_task_t *) item;
    /* set the pointer to the corresponding recv fragments */
    IBOFFLOAD_SET_FRAGS_ON_TASK(frags, task);

    task->next_task = NULL;
    task->endpoint = endpoint;

    /* set opcode */
    task->element.opcode = MQE_WR_CQE_WAIT;
    task->element.flags = 0; /* any flag would do here; the driver ignores it */
    /* set task id */
    task->element.wr_id = (uint64_t) (uintptr_t) task;
    /* set the CQ to wait on */
    task->element.wait.cq = endpoint->qp_config.init_attr[qp_index].recv_cq;

    /* set the number of completions to wait for */
    task->element.wait.count = num_waits;

    /* set the pointer to the QP */
    if (NULL == qp) { /* NULL means use the MQ's QP */
        task->element.wait.mqe_qp = NULL;
    } else {          /* post the wait to the SQ of this QP */
        task->task_mqe_qp_entry.next = NULL;
        task->task_mqe_qp_entry.qp = qp;

        task->element.wait.mqe_qp = &task->task_mqe_qp_entry;
    }

    IBOFFLOAD_VERBOSE(10, ("Allocating task %p, cq: %p, num waits: %d, qp_index - %d, "
                           "destination %d for comm rank: %d.\n",
                           (void *) task, (void *) task->element.wait.cq,
                           task->element.wait.count, qp_index, source,
                           endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));
    return task;
}
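
/*
 * Usage sketch (an editorial illustration; the variables are assumed to be
 * set up by the caller): wait for two receive completions from rank `source`
 * on this QP's recv CQ, using the MQ's own QP, before later tasks may run.
 *
 *     mca_bcol_iboffload_task_t *wait_task =
 *         mca_bcol_iboffload_get_wait_task(iboffload, source, 2,
 *                                          preposted_frags, qp_index, NULL);
 *     if (OPAL_UNLIKELY(NULL == wait_task)) {
 *         return OMPI_ERR_TEMP_OUT_OF_RESOURCE; // task free list was empty
 *     }
 */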

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_prepare_send_task(
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_endpoint_t *endpoint,
                int qp_index, ompi_free_list_t *task_list,
                mca_bcol_iboffload_collfrag_t *collfrag)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_task_t *task;

    IBOFFLOAD_VERBOSE(10, ("Destination rank - %d, QP index - %d, "
                           "for comm rank - %d\n", endpoint->index, qp_index,
                           endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));

    /* get an item from the free list */
    OMPI_FREE_LIST_GET_MT(task_list, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        return NULL;
    }

    task = (mca_bcol_iboffload_task_t*) item;
    task->endpoint = endpoint;

    ++(collfrag->n_sends);
    task->collfrag = collfrag;

    task->next_task = NULL;
    task->element.wr_id = (uint64_t) (uintptr_t) task;

    task->element.post.qp = endpoint->qps[qp_index].qp->lcl_qp;

    task->element.opcode = MQE_WR_SEND;

    /* define the send work request */
    SENDWR(task) = &(task->wr.send_wr);

    SENDWR(task)->next = NULL;

    SENDWR(task)->wr_id = (uint64_t) (uintptr_t) collfrag;
    IBOFFLOAD_VERBOSE(10, ("coll_frag - %p.\n", collfrag));

    /* Always request immediate execution on sends! */
    task->element.flags = MQE_WR_FLAG_IMM_EXE;

    /* Always signal completion */
    SENDWR(task)->send_flags = IBV_SEND_SIGNALED;

    return task;
}
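
/*
 * Editorial note: every get_*_task helper below builds on
 * mca_bcol_iboffload_prepare_send_task() and differs only in which free
 * list it draws from (tasks_free, iovec_tasks_free, or calc_tasks_free)
 * and in how it fills the scatter/gather list and the work-request opcode
 * (plain send, RDMA write with immediate, or a CALC operation).
 */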

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_send_task(
                mca_bcol_iboffload_module_t *iboffload,
                uint32_t destination, int qp_index,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_collfrag_t *collfrag,
                bool enable_inline)
{
    mca_bcol_iboffload_task_t *task;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[destination];

    IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_get_send_task qp_index %d\n",
                           qp_index));

    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
                                                &cm->tasks_free,
                                                collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    /* no support for multiple frags */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);

    /* We cannot do a 0-byte send, but we can do a 0-byte RDMA write with immediate */
    if (0 == frag->sg_entry.length) {
        SENDWR(task)->imm_data = 0;
        SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;

        SENDWR(task)->wr.rdma.rkey = endpoint->remote_zero_rdma_addr.rkey;
        SENDWR(task)->wr.rdma.remote_addr = endpoint->remote_zero_rdma_addr.addr;
    } else {
        SENDWR(task)->opcode = IBV_WR_SEND;
    }

    /* single sge */
    SENDWR(task)->num_sge = 1;
    SENDWR(task)->sg_list = &(frag->sg_entry);

    /* Use an inline send when possible */
    if (enable_inline &&
        frag->sg_entry.length < cm->max_inline_data) {
        IBOFFLOAD_VERBOSE(10, ("Setting inline for len %d\n", frag->sg_entry.length));
        SENDWR(task)->send_flags |= IBV_SEND_INLINE;
    }

    return task;
}
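
/*
 * Editorial note on the zero-length branch above: as the comment says, this
 * path cannot post a 0-byte send, so an empty payload is expressed as a
 * 0-byte RDMA write with immediate into the peer's dedicated
 * remote_zero_rdma_addr region. The immediate data still consumes a receive
 * WQE and produces a completion on the remote recv CQ, which is exactly
 * what the CQE-wait tasks count.
 */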

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_send_vec_task(
                mca_bcol_iboffload_module_t *iboffload,
                uint32_t destination, int qp_index,
                size_t nitems,
                struct iovec *buff_iovec,
                uint32_t lkey,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_collfrag_t *collfrag,
                bool enable_inline)
{
    mca_bcol_iboffload_task_t *task;
    int i;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[destination];

    IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_get_send_vec_task qp_index %d\n",
                           qp_index));

    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
                                                &iboffload->iovec_tasks_free,
                                                collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    /* no support for multiple frags */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);

    /* plain send; the iovec entries below describe the payload */
    SENDWR(task)->opcode = IBV_WR_SEND;

    assert (task->sg_entries != NULL);

    /* copy the iovec into the task's scatter/gather list */
    for (i = 0; (size_t) i < nitems; ++i) {
        task->sg_entries[i].length = buff_iovec[i].iov_len;
        task->sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base;
        task->sg_entries[i].lkey = lkey;
    }

    /* multiple sge */
    SENDWR(task)->num_sge = nitems;
    SENDWR(task)->sg_list = (task->sg_entries);

    /* Use an inline send when possible
     * (note: this checks the frag length, not the total iovec length) */
    if (enable_inline &&
        frag->sg_entry.length < cm->max_inline_data) {
        IBOFFLOAD_VERBOSE(10, ("Setting inline for len %d\n", frag->sg_entry.length));
        SENDWR(task)->send_flags |= IBV_SEND_INLINE;
    }

    return task;
}
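
/*
 * Illustrative call (editorial; `mr` is assumed to be the ibv_mr covering
 * both buffers, and frag/collfrag are prepared by the caller): gather two
 * disjoint buffers into a single send.
 *
 *     struct iovec iov[2];
 *     iov[0].iov_base = buf_a;  iov[0].iov_len = len_a;
 *     iov[1].iov_base = buf_b;  iov[1].iov_len = len_b;
 *
 *     task = mca_bcol_iboffload_get_send_vec_task(iboffload, dest, qp_index,
 *                                                 2, iov, mr->lkey,
 *                                                 frag, collfrag, false);
 */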

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_vec_task(
                uint32_t destination, size_t offset, size_t nitems,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_module_t *iboffload,
                struct iovec *buff_iovec, uint32_t lkey,
                mca_bcol_iboffload_collfrag_t *collfrag)
{
    int i;
    mca_bcol_iboffload_collreq_t *coll_request = collfrag->coll_full_req;

    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_endpoint_t *endpoint =
                             iboffload->endpoints[destination];

    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint,
                                                coll_request->qp_index,
                                                &iboffload->iovec_tasks_free,
                                                collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    /* no support for multiple frags */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);

    SENDWR(task)->imm_data = 0;
    SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
    SENDWR(task)->wr.rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;

    SENDWR(task)->wr.rdma.remote_addr = (uint64_t) (uintptr_t)
        ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);

    /* copy the iovec into the task's scatter/gather list */
    for (i = 0; (size_t) i < nitems; ++i) {
        task->sg_entries[i].length = buff_iovec[i].iov_len;
        task->sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base;
        task->sg_entries[i].lkey = lkey;
    }

    /* multiple sge */
    SENDWR(task)->num_sge = nitems;
    SENDWR(task)->sg_list = (task->sg_entries);

    IBOFFLOAD_VERBOSE(10, ("The remote offset %ld \n", offset));
    return task;
}

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_task(
                uint32_t destination, size_t offset,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *collfrag)
{
    mca_bcol_iboffload_collreq_t *coll_request = collfrag->coll_full_req;

    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_endpoint_t *endpoint =
                             iboffload->endpoints[destination];

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint,
                                                coll_request->qp_index,
                                                &cm->tasks_free, collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    /* no support for multiple frags */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);

    SENDWR(task)->imm_data = 0;
    SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
    SENDWR(task)->wr.rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;
    /* Pasha: I am really not happy with the way we calculate remote
       addresses here; why don't we use rbuf + offset? */
    SENDWR(task)->wr.rdma.remote_addr = (uint64_t) (uintptr_t)
        ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);
    /* single sge */
    SENDWR(task)->num_sge = 1;
    SENDWR(task)->sg_list = &(frag->sg_entry);

    IBOFFLOAD_VERBOSE(10, ("The remote offset %ld \n", offset));
    return task;
}

/* Pasha: hacked-up version of the calc operation */
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_calc_task(mca_bcol_iboffload_module_t *iboffload,
        uint32_t destination, int qp_index, mca_bcol_iboffload_frag_t *frag,
        struct ibv_sge *l_operand, struct ibv_sge *r_operand,
        mca_bcol_iboffload_collreq_t *coll_request,
        bool enable_inline)
/* Some notes about this function:
 * 1) We assume that the two operands (ibv_sge structs) have the same length.
 * 2) We may reuse the results (ibv_sge structs) of previous
 *    calc operations, so the frag pointer may be NULL.
 */
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_endpoint_t *endpoint =
                             iboffload->endpoints[destination];

    mca_bcol_iboffload_collfrag_t *collfrag =
                            (mca_bcol_iboffload_collfrag_t *)
                                opal_list_get_last(&coll_request->work_requests);

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
                                                &cm->calc_tasks_free, collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    if (NULL != frag) {
        IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
    } else {
        task->frag = NULL;
    }

    task->sg_entries[0] = *l_operand;
    task->sg_entries[1] = *r_operand;

    SENDWR(task)->num_sge = 2;
    SENDWR(task)->sg_list = task->sg_entries;

    SENDWR(task)->opcode = MCA_BCOL_IBOFFLOAD_SEND_CALC;
#if OMPI_HAVE_IBOFFLOAD_CALC_RDMA
    SENDWR(task)->wr.calc_send.data_type = coll_request->actual_ib_dtype;
    SENDWR(task)->wr.calc_send.calc_op = coll_request->actual_ib_op;
#else
    SENDWR(task)->wr.calc.data_type = coll_request->actual_ib_dtype;
    SENDWR(task)->wr.calc.calc_op = coll_request->actual_ib_op;
#endif

    return task;
}
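
/*
 * Editorial note (inferred from the field names above, not from external
 * documentation): the CALC send offloads the reduction arithmetic to the
 * HCA. sg_entries[0] and sg_entries[1] describe the left and right operands
 * (assumed to be of equal length, per the note above), and the device is
 * expected to apply coll_request->actual_ib_op on data of type
 * coll_request->actual_ib_dtype while transmitting the result.
 */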

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_calc_task(mca_bcol_iboffload_module_t *iboffload,
        uint32_t destination, int qp_index, mca_bcol_iboffload_frag_t *frag,
        struct ibv_sge *l_operand, struct ibv_sge *r_operand,
        mca_bcol_iboffload_collreq_t *coll_request,
        size_t offset)
/* Some notes about this function:
 * 1) We assume that the two operands (ibv_sge structs) have the same length.
 * 2) We may reuse the results (ibv_sge structs) of previous
 *    calc operations, so the frag pointer may be NULL.
 */
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_endpoint_t *endpoint =
                             iboffload->endpoints[destination];

    mca_bcol_iboffload_collfrag_t *collfrag =
                            (mca_bcol_iboffload_collfrag_t *)
                                opal_list_get_last(&coll_request->work_requests);

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
                                                &cm->calc_tasks_free, collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    if (NULL != frag) {
        IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
    } else {
        task->frag = NULL;
    }

    task->sg_entries[0] = *l_operand;

    /* Hack - we don't really use it:
    task->sg_entries[1] = *r_operand;
    */
    /* We use only a single entry:
    SENDWR(task)->num_sge = 2;
    */
    SENDWR(task)->num_sge = 1;
    SENDWR(task)->sg_list = task->sg_entries;

#if OMPI_HAVE_IBOFFLOAD_CALC_RDMA
    SENDWR(task)->opcode = IBV_M_WR_CALC_RDMA_WRITE_WITH_IMM;
    SENDWR(task)->wr.calc_rdma.data_type = coll_request->actual_ib_dtype;
    SENDWR(task)->wr.calc_rdma.calc_op = coll_request->actual_ib_op;
    SENDWR(task)->wr.calc_rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;
    SENDWR(task)->wr.calc_rdma.remote_addr = (uint64_t) (uintptr_t)
        ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);
#else
    IBOFFLOAD_ERROR(("Fatal error: RDMA CALC was called, but the driver does not support this operation"));
    return NULL;
#endif

    return task;
}

static inline __opal_attribute_always_inline__
int release_frags_on_task(mca_bcol_iboffload_task_t *task,
                          ompi_free_list_t *list)
{
    int rc, qp_index;

    mca_bcol_iboffload_frag_t *temp_frag = task->frag;
    mca_bcol_iboffload_endpoint_t *endpoint = task->endpoint;

    mca_bcol_iboffload_component_t *cm =
                     &mca_bcol_iboffload_component;

    IBOFFLOAD_VERBOSE(10, ("\nCalling release_frags_on_task"));

    while (NULL != temp_frag) {
        qp_index = temp_frag->qp_index;

        --(temp_frag->ref_counter);

        /* Return credits */
        if (MQE_WR_CQE_WAIT == task->element.opcode) {
            ++(endpoint->qps[qp_index].rd_wqe);

            IBOFFLOAD_VERBOSE(10, ("Return rd_wqe %d pp_win %d",
                                    endpoint->qps[qp_index].rd_wqe,
                                    cm->qp_infos[qp_index].rd_pp_win));

            /* Prepost receives once a full window of credits has accumulated */
            if (endpoint->qps[qp_index].rd_wqe >=
                                 cm->qp_infos[qp_index].rd_pp_win) {
                IBOFFLOAD_VERBOSE(10, ("Prepost to endpoint->index - %d, qp_index - %d", endpoint->index, qp_index));
                rc = mca_bcol_iboffload_prepost_recv(endpoint, qp_index,
                                        endpoint->qps[qp_index].rd_wqe);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                    IBOFFLOAD_ERROR(("QP %d: failed to prepost.\n", qp_index));
                    return OMPI_ERROR;
                }
                /* What happens if we cannot prepost? */
            }
        } else if (MQE_WR_SEND == task->element.opcode) {
            ++(endpoint->qps[qp_index].sd_wqe);

            assert(endpoint->qps[qp_index].sd_wqe <= cm->qp_infos[qp_index].rd_num);

            IBOFFLOAD_VERBOSE(10, ("Return sd_wqe %d, qp_index - %d, endpoint - %p",
                                    endpoint->qps[qp_index].sd_wqe, qp_index, endpoint));
        } else {
            /* We should never get here */
            IBOFFLOAD_ERROR(("Unsupported operation"));

            return OMPI_ERROR;
        }

        mca_bcol_iboffload_return_frag_tolist(temp_frag, list);
        temp_frag = temp_frag->next;
    }

    return OMPI_SUCCESS;
}
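
/*
 * Editorial summary of the credit flow above: completing a CQE-wait task
 * returns receive credits (rd_wqe) to the QP, and once a whole prepost
 * window (rd_pp_win) has accumulated, fresh receive buffers are preposted
 * in one batch; completing a send task simply returns a send slot (sd_wqe).
 * Every fragment hanging off the task is then routed back to its owner's
 * free list via mca_bcol_iboffload_return_frag_tolist().
 */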

END_C_DECLS

#endif /* MCA_BCOL_IBOFFLOAD_TASK_H */