2012-08-16 23:11:35 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
|
|
|
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef MCA_BCOL_IBOFFLOAD_BCAST_H
|
|
|
|
#define MCA_BCOL_IBOFFLOAD_BCAST_H
|
|
|
|
|
|
|
|
#include "ompi_config.h"
|
|
|
|
|
|
|
|
#include "bcol_iboffload.h"
|
|
|
|
#include "bcol_iboffload_frag.h"
|
|
|
|
#include "bcol_iboffload_task.h"
|
|
|
|
#include "bcol_iboffload_collreq.h"
|
|
|
|
#include "bcol_iboffload_collfrag.h"
|
|
|
|
#include "bcol_iboffload_endpoint.h"
|
|
|
|
|
|
|
|
#include "opal/include/opal/types.h"
|
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
|
|
|
|
int mca_bcol_iboffload_small_msg_bcast_progress(
|
|
|
|
bcol_function_args_t *input_args,
|
|
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
int mca_bcol_iboffload_small_msg_bcast_extra_intra(bcol_function_args_t *fn_arguments,
|
|
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments,
|
|
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
int mca_bcol_iboffload_bcast_scatter_allgather_intra(bcol_function_args_t *fn_arguments,
|
|
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
int mca_bcol_iboffload_zero_copy_progress(bcol_function_args_t *fn_arguments,
|
|
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
int mca_bcol_iboffload_bcast_scatter_allgather_extra_intra(bcol_function_args_t *fn_arguments,
|
|
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
int mca_bcol_iboffload_bcast_register(mca_bcol_base_module_t *super);
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__ int
|
|
|
|
mca_bcol_iboffload_recv_rtr_setup(
|
|
|
|
struct mqe_task **last_wait,
|
|
|
|
uint32_t dest_rank,
|
|
|
|
mca_bcol_iboffload_module_t *iboffload,
|
|
|
|
mca_bcol_iboffload_collfrag_t *coll_fragment)
|
|
|
|
{
|
|
|
|
mca_bcol_iboffload_task_t *task;
|
|
|
|
mca_bcol_iboffload_frag_t *fragment;
|
|
|
|
|
|
|
|
/* Wait for RTR message over credit QP */
|
|
|
|
fragment = mca_bcol_iboffload_get_preposted_recv_frag(
|
|
|
|
iboffload, dest_rank,
|
|
|
|
MCA_BCOL_IBOFFLOAD_QP_CREDIT);
|
|
|
|
if (OPAL_UNLIKELY(NULL == fragment)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
task = mca_bcol_iboffload_get_wait_task(
|
|
|
|
iboffload, dest_rank, 1, fragment, MCA_BCOL_IBOFFLOAD_QP_CREDIT,
|
|
|
|
iboffload->endpoints[dest_rank]->qps[MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF].qp->lcl_qp);
|
|
|
|
if (OPAL_UNLIKELY(NULL == task)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__ int
|
|
|
|
mca_bcol_iboffload_send_small_buff_setup(
|
|
|
|
struct mqe_task **last_send,
|
|
|
|
size_t len, uint32_t dest_rank,
|
|
|
|
mca_bcol_iboffload_module_t *iboffload,
|
|
|
|
mca_bcol_iboffload_collfrag_t *coll_fragment)
|
|
|
|
{
|
|
|
|
mca_bcol_iboffload_task_t *task;
|
|
|
|
mca_bcol_iboffload_frag_t *fragment;
|
|
|
|
|
|
|
|
mca_bcol_iboffload_collreq_t *coll_request =
|
|
|
|
coll_fragment->coll_full_req;
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10,("Get ml frag that I will send dest rank %d, len %d, lkey %d",
|
|
|
|
dest_rank, len, iboffload->rdma_block.ib_info.lkey));
|
|
|
|
|
|
|
|
fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank,
|
|
|
|
coll_request->qp_index, len, 0,
|
|
|
|
SBUF, /* this could be problematic */
|
|
|
|
MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML);
|
|
|
|
if (OPAL_UNLIKELY(NULL == fragment)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10,("Get an rdma task for dest %d for packet size %d",
|
|
|
|
dest_rank,len));
|
|
|
|
task = mca_bcol_iboffload_get_rdma_task(
|
|
|
|
dest_rank, 0,
|
|
|
|
fragment, iboffload, coll_fragment);
|
|
|
|
|
|
|
|
if (OPAL_UNLIKELY(NULL == task)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
*last_send = &task->element;
|
|
|
|
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__ int
|
|
|
|
mca_bcol_iboffload_send_large_buff_setup(
|
|
|
|
struct mqe_task **last_send,
|
|
|
|
int buf_index, int offset,
|
|
|
|
size_t len, uint32_t dest_rank,
|
|
|
|
mca_bcol_iboffload_module_t *iboffload,
|
|
|
|
mca_bcol_iboffload_collfrag_t *coll_fragment)
|
|
|
|
{
|
|
|
|
mca_bcol_iboffload_task_t *task;
|
|
|
|
mca_bcol_iboffload_frag_t *fragment;
|
|
|
|
|
|
|
|
mca_bcol_iboffload_collreq_t *coll_request =
|
|
|
|
coll_fragment->coll_full_req;
|
|
|
|
|
|
|
|
fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank,
|
|
|
|
MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
|
|
|
|
len,
|
|
|
|
offset, buf_index, MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML);
|
|
|
|
if (OPAL_UNLIKELY(NULL == fragment)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
task = mca_bcol_iboffload_get_send_task(
|
|
|
|
iboffload, dest_rank,
|
|
|
|
MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
|
|
|
|
fragment, coll_fragment, NO_INLINE);
|
|
|
|
|
|
|
|
if (OPAL_UNLIKELY(NULL == task)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
*last_send = &task->element;
|
|
|
|
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__ int
|
|
|
|
mca_bcol_iboffload_send_rtr_setup(
|
|
|
|
struct mqe_task **last_send,
|
|
|
|
uint32_t dest_rank,
|
|
|
|
mca_bcol_iboffload_module_t *iboffload,
|
|
|
|
mca_bcol_iboffload_collfrag_t *coll_fragment)
|
|
|
|
{
|
|
|
|
mca_bcol_iboffload_task_t *task;
|
|
|
|
mca_bcol_iboffload_frag_t *fragment;
|
|
|
|
|
|
|
|
/* Recv is ready , Send RTR message */
|
|
|
|
fragment = mca_bcol_iboffload_get_send_frag(coll_fragment->coll_full_req,
|
|
|
|
dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT, 0,
|
|
|
|
0, RBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
|
|
|
|
if (OPAL_UNLIKELY(NULL == fragment)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
task = mca_bcol_iboffload_get_send_task(iboffload, dest_rank,
|
|
|
|
MCA_BCOL_IBOFFLOAD_QP_CREDIT,
|
|
|
|
fragment, coll_fragment, INLINE);
|
|
|
|
if (OPAL_UNLIKELY(NULL == task)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("dest_rank - %d. qp index - %d.\n",
|
|
|
|
dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT));
|
|
|
|
|
|
|
|
*last_send = &task->element;
|
|
|
|
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__ int
|
|
|
|
mca_bcol_iboffload_recv_small_preposted_buff_setup(
|
|
|
|
struct mqe_task **last_wait,
|
|
|
|
size_t len, uint32_t dest_rank,
|
|
|
|
int qp_index,
|
|
|
|
int nwaits,
|
|
|
|
mca_bcol_iboffload_module_t *iboffload,
|
|
|
|
mca_bcol_iboffload_collfrag_t *coll_fragment)
|
|
|
|
{
|
|
|
|
mca_bcol_iboffload_task_t *task;
|
|
|
|
mca_bcol_iboffload_frag_t *fragment;
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10,("Get preposted recv from rank %d", dest_rank));
|
|
|
|
|
|
|
|
fragment = mca_bcol_iboffload_get_preposted_recv_frag(
|
|
|
|
iboffload, dest_rank,
|
|
|
|
qp_index);
|
|
|
|
if (OPAL_UNLIKELY(NULL == fragment)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, nwaits,
|
|
|
|
fragment, qp_index, NULL);
|
|
|
|
if (OPAL_UNLIKELY(NULL == task)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
*last_wait = &task->element;
|
|
|
|
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__ int
|
|
|
|
mca_bcol_iboffload_recv_small_buff_setup(
|
|
|
|
struct mqe_task **last_wait,
|
|
|
|
size_t len, uint32_t dest_rank,
|
|
|
|
mca_bcol_iboffload_module_t *iboffload,
|
|
|
|
mca_bcol_iboffload_collfrag_t *coll_fragment)
|
|
|
|
{
|
|
|
|
mca_bcol_iboffload_task_t *task;
|
|
|
|
mca_bcol_iboffload_frag_t *fragment;
|
|
|
|
|
|
|
|
mca_bcol_iboffload_collreq_t *coll_request =
|
|
|
|
coll_fragment->coll_full_req;
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Get preposted recv from rank %d", dest_rank));
|
|
|
|
|
|
|
|
fragment = mca_bcol_iboffload_get_preposted_recv_frag(
|
|
|
|
iboffload, dest_rank,
|
|
|
|
coll_request->qp_index);
|
|
|
|
if (OPAL_UNLIKELY(NULL == fragment)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1,
|
|
|
|
fragment, coll_request->qp_index, NULL);
|
|
|
|
if (OPAL_UNLIKELY(NULL == task)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
*last_wait = &task->element;
|
|
|
|
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__ int
|
|
|
|
mca_bcol_iboffload_recv_large_buff_setup(
|
|
|
|
struct mqe_task **last_wait,
|
|
|
|
int buf_index, int offset,
|
|
|
|
size_t len, uint32_t dest_rank,
|
|
|
|
mca_bcol_iboffload_module_t *iboffload,
|
|
|
|
mca_bcol_iboffload_collfrag_t *coll_fragment)
|
|
|
|
{
|
|
|
|
int num_preposted;
|
|
|
|
|
|
|
|
mca_bcol_iboffload_task_t *task;
|
|
|
|
mca_bcol_iboffload_frag_t *fragment;
|
|
|
|
|
|
|
|
mca_bcol_iboffload_collreq_t *coll_request = coll_fragment->coll_full_req;
|
|
|
|
|
|
|
|
/* Post message to recv queue for large messages */
|
|
|
|
fragment = mca_bcol_iboffload_get_ml_frag(
|
|
|
|
iboffload, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, len,
|
|
|
|
coll_request->buffer_info[buf_index].iboffload_reg->mr->lkey,
|
|
|
|
(uint64_t)((unsigned char *)coll_request->buffer_info[buf_index].buf + offset));
|
|
|
|
if (OPAL_UNLIKELY(NULL == fragment)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
num_preposted = mca_bcol_iboffload_prepost_ml_recv_frag(
|
|
|
|
MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
|
|
|
|
dest_rank, fragment, iboffload);
|
|
|
|
if (0 >= num_preposted) {
|
|
|
|
IBOFFLOAD_ERROR(("Failed to prepost recv fragments "
|
|
|
|
"return code - %d; dest_rank - %d",
|
|
|
|
num_preposted, dest_rank));
|
|
|
|
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1,
|
|
|
|
fragment, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, NULL);
|
|
|
|
if (OPAL_UNLIKELY(NULL == task)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
*last_wait = &task->element;
|
|
|
|
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
|
|
|
|
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__
|
|
|
|
int bcol_iboffload_binomial_root_to_src(int group_root, int my_rank,
|
|
|
|
int pow2_size, int group_size, int *distance)
|
|
|
|
{
|
|
|
|
int root, relative_rank, src,
|
|
|
|
pow2_distance = 0, i;
|
|
|
|
|
|
|
|
if (group_root < pow2_size) {
|
|
|
|
root = group_root;
|
|
|
|
} else {
|
|
|
|
/* the source of the data is extra node,
|
|
|
|
the real root it represented by some rank from
|
|
|
|
pow2 group */
|
|
|
|
root = group_root - pow2_size;
|
|
|
|
/* shortcut for the case when my rank is root for the group */
|
|
|
|
if (my_rank == root) {
|
|
|
|
*distance = -1;
|
|
|
|
return group_root;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
relative_rank = (my_rank - root) < 0 ? my_rank - root + pow2_size :
|
|
|
|
my_rank - root;
|
|
|
|
|
|
|
|
for (i = 1; i < pow2_size; i<<=1, pow2_distance++) {
|
|
|
|
if (relative_rank & i) {
|
|
|
|
src = my_rank ^ i;
|
|
|
|
if (src >= pow2_size)
|
|
|
|
src -= pow2_size;
|
|
|
|
|
|
|
|
*distance = pow2_distance;
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("AAAAA d %d rel %d it %d root %d my %d", *distance, relative_rank, i, root, my_rank));
|
|
|
|
return src;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* error case */
|
|
|
|
*distance = -1;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void bcol_iboffload_setup_binomial_connection(mca_bcol_iboffload_module_t *iboffload)
|
|
|
|
{
|
2013-02-06 01:52:55 +04:00
|
|
|
netpatterns_pair_exchange_node_t *my_exchange_node =
|
2012-08-16 23:11:35 +04:00
|
|
|
&iboffload->recursive_doubling_tree;
|
|
|
|
|
|
|
|
int i, n_exchanges = my_exchange_node->n_exchanges,
|
|
|
|
*exchanges = my_exchange_node->rank_exchanges,
|
|
|
|
n_extra_src = my_exchange_node->n_extra_sources,
|
|
|
|
my_rank = iboffload->ibnet->super.my_index,
|
|
|
|
rank_extra_src = my_exchange_node->rank_extra_source;
|
|
|
|
|
|
|
|
mca_bcol_iboffload_endpoint_t *ep;
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));
|
|
|
|
|
|
|
|
if (0 < n_extra_src) {
|
|
|
|
ep = iboffload->endpoints[rank_extra_src];
|
|
|
|
while (OMPI_SUCCESS !=
|
|
|
|
check_endpoint_state(ep, NULL, NULL)) {
|
|
|
|
opal_progress();
|
|
|
|
}
|
|
|
|
|
|
|
|
#if OPAL_ENABLE_DEBUG
|
|
|
|
{
|
|
|
|
int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
|
|
|
|
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
|
|
|
|
assert(NULL != ep->qps[qp_index].qp->lcl_qp);
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
|
|
|
|
ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Connect to all extra nodes */
|
|
|
|
if (EXTRA_NODE == my_exchange_node->node_type) {
|
|
|
|
for (i = iboffload->power_of_2_ranks;
|
|
|
|
i < iboffload->num_endpoints; ++i) {
|
|
|
|
if (i != my_rank) {
|
|
|
|
ep = iboffload->endpoints[i];
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("subgroup rank %d: Connect to rank %d.\n", my_rank, i));
|
|
|
|
|
|
|
|
while (OMPI_SUCCESS !=
|
|
|
|
check_endpoint_state(ep, NULL, NULL)) {
|
|
|
|
opal_progress();
|
|
|
|
}
|
|
|
|
|
|
|
|
#if OPAL_ENABLE_DEBUG
|
|
|
|
{
|
|
|
|
int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
|
|
|
|
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
|
|
|
|
assert(NULL != ep->qps[qp_index].qp->lcl_qp);
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
|
|
|
|
ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < n_exchanges; ++i) {
|
|
|
|
ep = iboffload->endpoints[exchanges[i]];
|
|
|
|
|
|
|
|
while (OMPI_SUCCESS !=
|
|
|
|
check_endpoint_state(ep, NULL, NULL)) {
|
|
|
|
opal_progress();
|
|
|
|
}
|
|
|
|
|
|
|
|
#if OPAL_ENABLE_DEBUG
|
|
|
|
{
|
|
|
|
int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
|
|
|
|
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
|
|
|
|
assert(NULL != ep->qps[qp_index].qp->lcl_qp);
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
|
|
|
|
ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
/* set the connection status to connected */
|
|
|
|
iboffload->connection_status[RECURSIVE_DOUBLING_TREE_BCAST] = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __opal_attribute_always_inline__
|
|
|
|
int bcol_iboffload_bcast_binomial_gather(mca_bcol_iboffload_module_t *iboffload_module,
|
|
|
|
struct mqe_task **last_send, struct mqe_task **last_wait,
|
|
|
|
mca_bcol_iboffload_collfrag_t *coll_fragment,
|
|
|
|
int count, int base_block_size, int radix_mask_pow)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
int i;
|
|
|
|
int my_group_index = iboffload_module->ibnet->super.my_index;
|
|
|
|
int delta, rdelta;
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("bcol_iboffload_bcast_binomial_gather %d %d",
|
|
|
|
radix_mask_pow, my_group_index));
|
|
|
|
|
|
|
|
/* we assume the iteration #iteration already was completed with probe */
|
|
|
|
for (i = 0; i < iboffload_module->power_of_2; i++) {
|
|
|
|
int pow2 = 1 << i;
|
|
|
|
int peer_index = my_group_index ^ pow2;
|
|
|
|
int slen, rlen,
|
|
|
|
send_offset,
|
|
|
|
recv_offset;
|
|
|
|
|
|
|
|
if (i > radix_mask_pow) {
|
|
|
|
slen = rlen = pow2 * base_block_size;
|
|
|
|
send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
|
|
|
|
recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
|
|
|
|
|
|
|
|
rdelta = count - recv_offset;
|
|
|
|
if (rdelta > 0) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Recv1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
|
|
|
|
pow2, 1 << iboffload_module->power_of_2,
|
|
|
|
recv_offset, rlen, peer_index));
|
|
|
|
|
|
|
|
rc = mca_bcol_iboffload_send_rtr_setup(last_send,
|
|
|
|
peer_index, iboffload_module,
|
|
|
|
coll_fragment);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
delta = count - send_offset;
|
|
|
|
if (delta > 0) {
|
|
|
|
if (delta < slen) {
|
|
|
|
/* recv the tail */
|
|
|
|
slen = delta;
|
|
|
|
}
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Send1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
|
|
|
|
pow2, 1 << iboffload_module->power_of_2,
|
|
|
|
send_offset, slen, peer_index));
|
|
|
|
rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index,
|
|
|
|
iboffload_module, coll_fragment);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rdelta > 0) {
|
|
|
|
if (rdelta < rlen) {
|
|
|
|
/* recv the tail */
|
|
|
|
rlen = rdelta;
|
|
|
|
}
|
|
|
|
|
|
|
|
rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait,
|
|
|
|
SBUF, recv_offset, rlen, peer_index,
|
|
|
|
iboffload_module, coll_fragment);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} else if (i == radix_mask_pow) {
|
|
|
|
/* only receive data */
|
|
|
|
rlen = pow2 * base_block_size;
|
|
|
|
recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
|
|
|
|
delta = count - recv_offset;
|
|
|
|
if (0 >= delta) {
|
|
|
|
/* we have nothing to send, skip the iteration */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (delta < rlen) {
|
|
|
|
/* recv the tail */
|
|
|
|
rlen = delta;
|
|
|
|
}
|
|
|
|
/* receive data from the peer */
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Recv2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
|
|
|
|
pow2,
|
|
|
|
1 << iboffload_module->power_of_2,
|
|
|
|
recv_offset,
|
|
|
|
rlen, peer_index));
|
|
|
|
rc = mca_bcol_iboffload_send_rtr_setup(last_send,
|
|
|
|
peer_index, iboffload_module,
|
|
|
|
coll_fragment);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait,
|
|
|
|
SBUF, recv_offset, rlen, peer_index,
|
|
|
|
iboffload_module, coll_fragment);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
} else if (i < radix_mask_pow) {
|
|
|
|
/* Only send data */
|
|
|
|
slen = pow2 * base_block_size;
|
|
|
|
send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
|
|
|
|
delta = count - send_offset;
|
|
|
|
if (0 >= delta) {
|
|
|
|
/* we have nothing to send, skip the iteration */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (delta < slen) {
|
|
|
|
slen = delta;
|
|
|
|
}
|
|
|
|
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Send2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
|
|
|
|
pow2,
|
|
|
|
1 << iboffload_module->power_of_2,
|
|
|
|
send_offset,
|
|
|
|
slen,
|
|
|
|
peer_index));
|
|
|
|
|
|
|
|
rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index,
|
|
|
|
iboffload_module, coll_fragment);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
|
|
|
IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
END_C_DECLS
|
|
|
|
|
|
|
|
#endif
|