/* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef MCA_BCOL_IBOFFLOAD_BCAST_H #define MCA_BCOL_IBOFFLOAD_BCAST_H #include "ompi_config.h" #include "bcol_iboffload.h" #include "bcol_iboffload_frag.h" #include "bcol_iboffload_task.h" #include "bcol_iboffload_collreq.h" #include "bcol_iboffload_collfrag.h" #include "bcol_iboffload_endpoint.h" #include "opal/include/opal/types.h" BEGIN_C_DECLS int mca_bcol_iboffload_small_msg_bcast_progress( bcol_function_args_t *input_args, struct coll_ml_function_t *const_args); int mca_bcol_iboffload_small_msg_bcast_extra_intra(bcol_function_args_t *fn_arguments, struct coll_ml_function_t *const_args); int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments, struct coll_ml_function_t *const_args); int mca_bcol_iboffload_bcast_scatter_allgather_intra(bcol_function_args_t *fn_arguments, struct coll_ml_function_t *const_args); int mca_bcol_iboffload_zero_copy_progress(bcol_function_args_t *fn_arguments, struct coll_ml_function_t *const_args); int mca_bcol_iboffload_bcast_scatter_allgather_extra_intra(bcol_function_args_t *fn_arguments, struct coll_ml_function_t *const_args); int mca_bcol_iboffload_bcast_register(mca_bcol_base_module_t *super); static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_recv_rtr_setup( struct mqe_task **last_wait, uint32_t dest_rank, mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collfrag_t *coll_fragment) { mca_bcol_iboffload_task_t *task; mca_bcol_iboffload_frag_t *fragment; /* Wait for RTR message over credit QP */ fragment = mca_bcol_iboffload_get_preposted_recv_frag( iboffload, dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT); if (OPAL_UNLIKELY(NULL == fragment)) { IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } task = mca_bcol_iboffload_get_wait_task( iboffload, dest_rank, 1, fragment, MCA_BCOL_IBOFFLOAD_QP_CREDIT, iboffload->endpoints[dest_rank]->qps[MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF].qp->lcl_qp); if (OPAL_UNLIKELY(NULL == task)) { IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); return OMPI_SUCCESS; } static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_send_small_buff_setup( struct mqe_task **last_send, size_t len, uint32_t dest_rank, mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collfrag_t *coll_fragment) { mca_bcol_iboffload_task_t *task; mca_bcol_iboffload_frag_t *fragment; mca_bcol_iboffload_collreq_t *coll_request = coll_fragment->coll_full_req; IBOFFLOAD_VERBOSE(10,("Get ml frag that I will send dest rank %d, len %d, lkey %d", dest_rank, len, iboffload->rdma_block.ib_info.lkey)); fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank, coll_request->qp_index, len, 0, SBUF, /* this could be problematic */ MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML); if (OPAL_UNLIKELY(NULL == fragment)) { IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } IBOFFLOAD_VERBOSE(10,("Get an rdma task for dest %d for packet size %d", dest_rank,len)); task = mca_bcol_iboffload_get_rdma_task( dest_rank, 0, fragment, iboffload, coll_fragment); if (OPAL_UNLIKELY(NULL == task)) { IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } *last_send = &task->element; MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); return OMPI_SUCCESS; } static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_send_large_buff_setup( struct mqe_task **last_send, int buf_index, int offset, size_t len, uint32_t dest_rank, mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collfrag_t *coll_fragment) { mca_bcol_iboffload_task_t *task; mca_bcol_iboffload_frag_t *fragment; mca_bcol_iboffload_collreq_t *coll_request = coll_fragment->coll_full_req; fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, len, offset, buf_index, MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML); if (OPAL_UNLIKELY(NULL == fragment)) { IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } task = mca_bcol_iboffload_get_send_task( iboffload, dest_rank, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, fragment, coll_fragment, NO_INLINE); if (OPAL_UNLIKELY(NULL == task)) { IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } *last_send = &task->element; MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); return OMPI_SUCCESS; } static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_send_rtr_setup( struct mqe_task **last_send, uint32_t dest_rank, mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collfrag_t *coll_fragment) { mca_bcol_iboffload_task_t *task; mca_bcol_iboffload_frag_t *fragment; /* Recv is ready , Send RTR message */ fragment = mca_bcol_iboffload_get_send_frag(coll_fragment->coll_full_req, dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT, 0, 0, RBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); if (OPAL_UNLIKELY(NULL == fragment)) { IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } task = mca_bcol_iboffload_get_send_task(iboffload, dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT, fragment, coll_fragment, INLINE); if (OPAL_UNLIKELY(NULL == task)) { IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } IBOFFLOAD_VERBOSE(10, ("dest_rank - %d. qp index - %d.\n", dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT)); *last_send = &task->element; MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); return OMPI_SUCCESS; } static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_recv_small_preposted_buff_setup( struct mqe_task **last_wait, size_t len, uint32_t dest_rank, int qp_index, int nwaits, mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collfrag_t *coll_fragment) { mca_bcol_iboffload_task_t *task; mca_bcol_iboffload_frag_t *fragment; IBOFFLOAD_VERBOSE(10,("Get preposted recv from rank %d", dest_rank)); fragment = mca_bcol_iboffload_get_preposted_recv_frag( iboffload, dest_rank, qp_index); if (OPAL_UNLIKELY(NULL == fragment)) { IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, nwaits, fragment, qp_index, NULL); if (OPAL_UNLIKELY(NULL == task)) { IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } *last_wait = &task->element; MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); return OMPI_SUCCESS; } static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_recv_small_buff_setup( struct mqe_task **last_wait, size_t len, uint32_t dest_rank, mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collfrag_t *coll_fragment) { mca_bcol_iboffload_task_t *task; mca_bcol_iboffload_frag_t *fragment; mca_bcol_iboffload_collreq_t *coll_request = coll_fragment->coll_full_req; IBOFFLOAD_VERBOSE(10, ("Get preposted recv from rank %d", dest_rank)); fragment = mca_bcol_iboffload_get_preposted_recv_frag( iboffload, dest_rank, coll_request->qp_index); if (OPAL_UNLIKELY(NULL == fragment)) { IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1, fragment, coll_request->qp_index, NULL); if (OPAL_UNLIKELY(NULL == task)) { IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } *last_wait = &task->element; MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); return OMPI_SUCCESS; } static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_recv_large_buff_setup( struct mqe_task **last_wait, int buf_index, int offset, size_t len, uint32_t dest_rank, mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collfrag_t *coll_fragment) { int num_preposted; mca_bcol_iboffload_task_t *task; mca_bcol_iboffload_frag_t *fragment; mca_bcol_iboffload_collreq_t *coll_request = coll_fragment->coll_full_req; /* Post message to recv queue for large messages */ fragment = mca_bcol_iboffload_get_ml_frag( iboffload, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, len, coll_request->buffer_info[buf_index].iboffload_reg->mr->lkey, (uint64_t)((unsigned char *)coll_request->buffer_info[buf_index].buf + offset)); if (OPAL_UNLIKELY(NULL == fragment)) { IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } num_preposted = mca_bcol_iboffload_prepost_ml_recv_frag( MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, dest_rank, fragment, iboffload); if (0 >= num_preposted) { IBOFFLOAD_ERROR(("Failed to prepost recv fragments " "return code - %d; dest_rank - %d", num_preposted, dest_rank)); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1, fragment, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, NULL); if (OPAL_UNLIKELY(NULL == task)) { IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n")); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } *last_wait = &task->element; MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); return OMPI_SUCCESS; } static inline __opal_attribute_always_inline__ int bcol_iboffload_binomial_root_to_src(int group_root, int my_rank, int pow2_size, int group_size, int *distance) { int root, relative_rank, src, pow2_distance = 0, i; if (group_root < pow2_size) { root = group_root; } else { /* the source of the data is extra node, the real root it represented by some rank from pow2 group */ root = group_root - pow2_size; /* shortcut for the case when my rank is root for the group */ if (my_rank == root) { *distance = -1; return group_root; } } relative_rank = (my_rank - root) < 0 ? my_rank - root + pow2_size : my_rank - root; for (i = 1; i < pow2_size; i<<=1, pow2_distance++) { if (relative_rank & i) { src = my_rank ^ i; if (src >= pow2_size) src -= pow2_size; *distance = pow2_distance; IBOFFLOAD_VERBOSE(10, ("AAAAA d %d rel %d it %d root %d my %d", *distance, relative_rank, i, root, my_rank)); return src; } } /* error case */ *distance = -1; return -1; } static inline void bcol_iboffload_setup_binomial_connection(mca_bcol_iboffload_module_t *iboffload) { netpatterns_pair_exchange_node_t *my_exchange_node = &iboffload->recursive_doubling_tree; int i, n_exchanges = my_exchange_node->n_exchanges, *exchanges = my_exchange_node->rank_exchanges, n_extra_src = my_exchange_node->n_extra_sources, my_rank = iboffload->ibnet->super.my_index, rank_extra_src = my_exchange_node->rank_extra_source; mca_bcol_iboffload_endpoint_t *ep; IBOFFLOAD_VERBOSE(10, ("Open connections.\n")); if (0 < n_extra_src) { ep = iboffload->endpoints[rank_extra_src]; while (OMPI_SUCCESS != check_endpoint_state(ep, NULL, NULL)) { opal_progress(); } #if OPAL_ENABLE_DEBUG { int qp_index, num_qps = mca_bcol_iboffload_component.num_qps; for (qp_index = 0; qp_index < num_qps; ++qp_index) { assert(NULL != ep->qps[qp_index].qp->lcl_qp); IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.", ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num)); } } #endif /* Connect to all extra nodes */ if (EXTRA_NODE == my_exchange_node->node_type) { for (i = iboffload->power_of_2_ranks; i < iboffload->num_endpoints; ++i) { if (i != my_rank) { ep = iboffload->endpoints[i]; IBOFFLOAD_VERBOSE(10, ("subgroup rank %d: Connect to rank %d.\n", my_rank, i)); while (OMPI_SUCCESS != check_endpoint_state(ep, NULL, NULL)) { opal_progress(); } #if OPAL_ENABLE_DEBUG { int qp_index, num_qps = mca_bcol_iboffload_component.num_qps; for (qp_index = 0; qp_index < num_qps; ++qp_index) { assert(NULL != ep->qps[qp_index].qp->lcl_qp); IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.", ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num)); } } #endif } } } } for (i = 0; i < n_exchanges; ++i) { ep = iboffload->endpoints[exchanges[i]]; while (OMPI_SUCCESS != check_endpoint_state(ep, NULL, NULL)) { opal_progress(); } #if OPAL_ENABLE_DEBUG { int qp_index, num_qps = mca_bcol_iboffload_component.num_qps; for (qp_index = 0; qp_index < num_qps; ++qp_index) { assert(NULL != ep->qps[qp_index].qp->lcl_qp); IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.", ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num)); } } #endif } /* set the connection status to connected */ iboffload->connection_status[RECURSIVE_DOUBLING_TREE_BCAST] = true; } static inline __opal_attribute_always_inline__ int bcol_iboffload_bcast_binomial_gather(mca_bcol_iboffload_module_t *iboffload_module, struct mqe_task **last_send, struct mqe_task **last_wait, mca_bcol_iboffload_collfrag_t *coll_fragment, int count, int base_block_size, int radix_mask_pow) { int rc; int i; int my_group_index = iboffload_module->ibnet->super.my_index; int delta, rdelta; IBOFFLOAD_VERBOSE(10, ("bcol_iboffload_bcast_binomial_gather %d %d", radix_mask_pow, my_group_index)); /* we assume the iteration #iteration already was completed with probe */ for (i = 0; i < iboffload_module->power_of_2; i++) { int pow2 = 1 << i; int peer_index = my_group_index ^ pow2; int slen, rlen, send_offset, recv_offset; if (i > radix_mask_pow) { slen = rlen = pow2 * base_block_size; send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i)); recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i)); rdelta = count - recv_offset; if (rdelta > 0) { IBOFFLOAD_VERBOSE(10, ("Recv1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d", pow2, 1 << iboffload_module->power_of_2, recv_offset, rlen, peer_index)); rc = mca_bcol_iboffload_send_rtr_setup(last_send, peer_index, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr")); return OMPI_ERROR; } } delta = count - send_offset; if (delta > 0) { if (delta < slen) { /* recv the tail */ slen = delta; } IBOFFLOAD_VERBOSE(10, ("Send1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d", pow2, 1 << iboffload_module->power_of_2, send_offset, slen, peer_index)); rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); return OMPI_ERROR; } rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); return OMPI_ERROR; } } if (rdelta > 0) { if (rdelta < rlen) { /* recv the tail */ rlen = rdelta; } rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait, SBUF, recv_offset, rlen, peer_index, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); return OMPI_ERROR; } } } else if (i == radix_mask_pow) { /* only receive data */ rlen = pow2 * base_block_size; recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i)); delta = count - recv_offset; if (0 >= delta) { /* we have nothing to send, skip the iteration */ continue; } if (delta < rlen) { /* recv the tail */ rlen = delta; } /* receive data from the peer */ IBOFFLOAD_VERBOSE(10, ("Recv2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d", pow2, 1 << iboffload_module->power_of_2, recv_offset, rlen, peer_index)); rc = mca_bcol_iboffload_send_rtr_setup(last_send, peer_index, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr")); return OMPI_ERROR; } rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait, SBUF, recv_offset, rlen, peer_index, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); return OMPI_ERROR; } } else if (i < radix_mask_pow) { /* Only send data */ slen = pow2 * base_block_size; send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i)); delta = count - send_offset; if (0 >= delta) { /* we have nothing to send, skip the iteration */ continue; } if (delta < slen) { slen = delta; } IBOFFLOAD_VERBOSE(10, ("Send2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d", pow2, 1 << iboffload_module->power_of_2, send_offset, slen, peer_index)); rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); return OMPI_ERROR; } rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); return OMPI_ERROR; } } } return OMPI_SUCCESS; } END_C_DECLS #endif