/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory.  All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies.  All rights reserved.
 * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

/* NOTE(review): the header names of the next five #include directives are
 * missing in this copy of the file - restore them from version control. */
#include
#include
#include
#include
#include

#include "opal_stdint.h"

#include "bcol_iboffload.h"
#include "bcol_iboffload_alltoall.h"
#include "bcol_iboffload_bcast.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"

#include "opal/include/opal/types.h"

/**
 * Take a collective request from the component's free list and initialise
 * it, together with its embedded first fragment, for an allgather.
 *
 * @param fn_arguments     Per-call arguments; buffers, offsets, dtype, count,
 *                         buffer index and order info are read from it and
 *                         its bcol_opaque_data is set to the new request.
 * @param iboffload_module Module stored in the request.
 * @param coll_request     [out] Receives the initialised request.
 * @param if_bcol_last     Stored in the request unchanged.
 * @param mq_credits       MQ credit estimate stored in the first fragment.
 * @param progress_fn      Function stored as the request's progress_fn.
 *
 * @return OMPI_SUCCESS, or the error code produced by OMPI_FREE_LIST_WAIT.
 */
static int mca_bcol_iboffload_allgather_init(
                               bcol_function_args_t *fn_arguments,
                               mca_bcol_iboffload_module_t *iboffload_module,
                               mca_bcol_iboffload_collreq_t **coll_request,
                               bool if_bcol_last, int mq_credits,
                               collective_message_progress_function progress_fn)
{
    int rc;

    ompi_free_list_item_t *item;
    mca_bcol_iboffload_collfrag_t *coll_fragment;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    OMPI_FREE_LIST_WAIT(&cm->collreqs_free, item, rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Wait for free list failed.\n"));
        return rc;
    }

    /* setup call request */
    (*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
    (*coll_request)->n_fragments = 0;
    (*coll_request)->n_frags_sent = 0;
    (*coll_request)->n_frag_mpi_complete = 0;
    (*coll_request)->n_frag_net_complete = 0;
    (*coll_request)->if_bcol_last = if_bcol_last;
    (*coll_request)->ml_buffer_index = fn_arguments->buffer_index;
    (*coll_request)->completion_cb_fn = NULL;
    (*coll_request)->buffer_info[SBUF].buf = (void *) (
                          (unsigned char *)fn_arguments->sbuf +
                                           fn_arguments->sbuf_offset);
    (*coll_request)->buffer_info[RBUF].buf = (void *) (
                          (unsigned char *)fn_arguments->rbuf +
                                           fn_arguments->rbuf_offset);
    (*coll_request)->buffer_info[SBUF].offset = fn_arguments->sbuf_offset;
    (*coll_request)->buffer_info[RBUF].offset = fn_arguments->rbuf_offset;

    /* the memory registration handles start out unset; the user-buffer
     * path registers RBUF later */
    (*coll_request)->buffer_info[SBUF].iboffload_reg = NULL;
    (*coll_request)->buffer_info[RBUF].iboffload_reg = NULL;

    (*coll_request)->dtype = fn_arguments->dtype;
    (*coll_request)->count = fn_arguments->count;
    (*coll_request)->module = iboffload_module;
    /* TODO Pasha: we need it for pending queue. Set it later. */
    (*coll_request)->progress_fn = progress_fn;
    /* TODO Pasha: fix it later */
    (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;

    (*coll_request)->order_info = &fn_arguments->order_info;

    coll_fragment = &((*coll_request)->first_collfrag);
    mca_bcol_iboffload_collfrag_init(coll_fragment);

    /* Vasily: open question whether the credit count should instead come
     * from the request's total task count */
    coll_fragment->mq_credits = mq_credits;
    coll_fragment->mq_index = COLL_MQ;

    /* pasha: just set it to zero */
    coll_fragment->last_wait_num = 0;
    coll_fragment->alg = -2; /* used only for debug */

    /* Pasha: we have nothing to unpack */
    coll_fragment->unpack_size = 0;

    /* set pointers for (coll frag) <-> (coll full request) */
    (*coll_request)->user_handle_freed = false;

    fn_arguments->bcol_opaque_data = (void *) (*coll_request);

    /* allgather has no root, so the request's root field is left alone */

    MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS((*coll_request), coll_fragment);

    return OMPI_SUCCESS;
}

#if 1
/**
 * Wait until every endpoint referenced by the module's k-nomial allgather
 * tree is usable, then mark ALLGATHER_KNOMIAL_ALG as connected.
 *
 * The function spins on opal_progress() and only returns once all peers
 * (the first extra/proxy source, if any, and every non-negative entry of
 * rank_exchanges) satisfy their wait condition.
 */
static inline void bcol_iboffload_setup_allgather_endpoints_connection(mca_bcol_iboffload_module_t *iboffload)
{
    int i, j;
    netpatterns_k_exchange_node_t *exchange_node = &iboffload->knomial_allgather_tree;

    mca_bcol_iboffload_endpoint_t *ep;

    IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));

    /* start with extras and proxy connections; the array entry is used
     * directly as the endpoint index (no comm -> ibnet translation) */
    if (exchange_node->n_extra_sources > 0) {
        ep = iboffload->endpoints[exchange_node->rank_extra_sources_array[0]];
        while (OMPI_SUCCESS !=
                check_endpoint_state(ep, NULL, NULL)) {
            opal_progress();
        }
    }

    /* now move through the recursive k-ing exchanges */
    if (NULL != exchange_node->rank_exchanges) {
        for (i = 0; i < exchange_node->log_tree_order; i++) {
            for (j = 0; j < (exchange_node->tree_order - 1); j++) {
                /* negative entries mark peers we do not exchange with */
                if (exchange_node->rank_exchanges[i][j] < 0) {
                    continue;
                }

                ep = iboffload->endpoints[exchange_node->rank_exchanges[i][j]];
                if (iboffload->ibnet->super.my_index < ep->index) {
                    /* lower index side: just wait for the peer's RDMA
                     * address to show up */
                    while (0 == (ep)->remote_zero_rdma_addr.addr) {
                        opal_progress();
                    }
                } else {
                    IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
                    while (OMPI_SUCCESS !=
                            check_endpoint_state(ep, NULL, NULL)) {
                        opal_progress();
                    }
                }
            }
        }
    }

    /* set the connection status to connected */
    iboffload->connection_status[ALLGATHER_KNOMIAL_ALG] = true;
}
#endif

/**
 * Wait until check_endpoint_state() succeeds for every endpoint index in
 * [0, group_size), then mark ALLGATHER_NEIGHBOR_ALG as connected.
 *
 * Spins on opal_progress() while waiting.
 */
static inline void bcol_iboffload_setup_allgather_ring_endpoints_connection(mca_bcol_iboffload_module_t *iboffload)
{
    int i;
    const int group_size = iboffload->ibnet->super.group_size;
    mca_bcol_iboffload_endpoint_t *ep;

    IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));

    /* This should really be algorithm specific and only touch the peers the
     * algorithm talks to. Left connecting to everybody for now because it is
     * unclear whether endpoints are indexed by ibnet ids or by communicator
     * ids. */
    for (i = 0; i < group_size; i++) {
        ep = iboffload->endpoints[i];
        while (OMPI_SUCCESS !=
                check_endpoint_state(ep, NULL, NULL)) {
            opal_progress();
        }
    }

    /* set the connection status to connected */
    /*JSL - change this macro */
    iboffload->connection_status[ALLGATHER_NEIGHBOR_ALG] = true;
}

#if 0
/* allgather neighbor exchange algorithm N/2 communication steps, 2 connections */
static int mca_bcol_iboffload_neighbor_allgather_userbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module,
                                           mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc, src, dst;

    uint32_t pack_len;
    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
    int group_size = iboffload_module->group_size;
    int step, roffset, soffset;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    int even_rank;
    int parity;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

#if 0
    fprintf(stderr,"entering large msg neighbor exchange allgather\n");
#endif
    IBOFFLOAD_VERBOSE(10,("Entering large msg iboffload allgather"));
    if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_NEIGHBOR_ALG])) {
        IBOFFLOAD_VERBOSE(10,("Allgather open new connection "));
        bcol_iboffload_setup_allgather_ring_endpoints_connection(iboffload_module);
    }

    pack_len = coll_request->count * coll_request->dtype->super.size;
    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                pack_len,
                coll_request->count,
                coll_request->dtype->super.size));

    /* register send and receive sides */
    /* send side, only sending pack_len data */

    /* I think that probably I will only register the rbuf */
    /* on receive side I need to register pack_len*group_size data */
    rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[RBUF].buf, pack_len * group_size,
                                           &coll_request->buffer_info[RBUF].iboffload_reg, iboffload_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Cannot register memory: "
                         "addr - %p, %d bytes.\n",
                          coll_request->buffer_info[RBUF].buf, pack_len));
        return OMPI_ERROR;
    }
    coll_request->buffer_info[RBUF].lkey = coll_request->buffer_info[RBUF].iboffload_reg->mr->lkey;

    /* it is estimated mq consumption... */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    coll_fragment->tail_next = &coll_fragment->to_post;

    /* start the neighbor exchange */
    even_rank = !(my_group_index % 2);
    if (even_rank) {
        neighbor[0] = (my_group_index + 1) % group_size;
        neighbor[1] = (my_group_index - 1 + group_size) % group_size;
        recv_data_from[0] = my_group_index;
        recv_data_from[1] = my_group_index;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (my_group_index - 1 + group_size) % group_size;
        neighbor[1] = (my_group_index + 1) % group_size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }

    /* first step is special step, only send one block */
    roffset = neighbor[0]*pack_len;
    soffset = my_group_index*pack_len;
    /* send receive this */
    dst = neighbor[0];
    src = neighbor[0];

    rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                                  src, iboffload_module,
                                  coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
        if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
            goto out_of_resources;
        }
        return OMPI_ERROR;
    }

    rc = mca_bcol_iboffload_recv_rtr_setup(
                                  &last_wait, dst, iboffload_module, coll_fragment);
    /* send the data */
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to"
                    "mca_bcol_iboffload_recv_rtr_setup"));
        if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
            goto out_of_resources;
        }
        return OMPI_ERROR;
    }

    rc = mca_bcol_iboffload_send_large_buff_setup(
                    &last_send, RBUF,
                    coll_request->buffer_info[RBUF].offset +
                    soffset/* offset calc */ ,
                    pack_len, dst,
                    iboffload_module, coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to"
                    "mca_bcol_iboffload_send_large_buff_setup"));
        if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
            goto out_of_resources;
        }
        return OMPI_ERROR;
    }
    /* send is done */

    rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                                  RBUF, coll_request->buffer_info[RBUF].offset +
                                  roffset,
                                  pack_len, src,
                                  iboffload_module, coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
        if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
            goto out_of_resources;
        }
        return OMPI_ERROR;
    }

    /* now for the actual neighbor exchange algorithm */

    /* determine initial send location */
    if(even_rank) {
        send_data_from = my_group_index;
    }else {
        send_data_from = recv_data_from[0];
    }
    for( step = 1; step < (group_size/2); step++) {

        parity = step % 2;
        recv_data_from[parity] =
            (recv_data_from[parity] + offset_at_step[parity] + group_size) % group_size;
        src = neighbor[parity];
        dst = src;

        roffset = recv_data_from[parity] * pack_len;
        soffset = send_data_from * pack_len;

        /* post send rtr and recev rtr together */
        if( 1 == step ){
            rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                    src, iboffload_module,
                    coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
                if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                    goto out_of_resources;
                }
                return OMPI_ERROR;
            }

            rc = mca_bcol_iboffload_recv_rtr_setup(
                    &last_wait, dst, iboffload_module, coll_fragment);
            /* send the data */
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            "mca_bcol_iboffload_recv_rtr_setup"));
                if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                    goto out_of_resources;
                }
                return OMPI_ERROR;
            }
        }

        /* I'm using the hierarchy offset used in the k-nomial allgather */
        /* this won't work...*/
        rc = mca_bcol_iboffload_send_large_buff_setup(
                &last_send, RBUF,
                coll_request->buffer_info[RBUF].offset +
                soffset/* offset calc */ ,
                2 * pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        "mca_bcol_iboffload_send_large_buff_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }
        /* send is done */

        rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                RBUF, coll_request->buffer_info[RBUF].offset +
                roffset,
                2 * pack_len, src,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }

        send_data_from = recv_data_from[parity];
    }

    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    (coll_request)->n_fragments = 1;
    (coll_request)->n_frags_sent = 1;

    assert(NULL != last_wait);
    last_wait->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    assert(MCA_COLL_ML_NO_BUFFER == coll_request->ml_buffer_index);
    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
#endif

#if 0
/* debug connection routine */
static inline void bcol_iboffload_setup_allgather_endpoints_connection(mca_bcol_iboffload_module_t *iboffload)
{
    int i;
    const int group_size = iboffload->ibnet->super.group_size;
    mca_bcol_iboffload_endpoint_t *ep;

    IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));

    /* this is algorithm specific - need to move through the algorithm here basically to set up connections, should be
     *
     */

    /* I'm going to leave this alone for now, because I'm
     * not sure how these endpoints map back to ibnet. Is it mapped to ibnet ids or to communicator ids?
     */
    for (i = 0; i < group_size; i++) {
        ep = iboffload->endpoints[i];
        while (OMPI_SUCCESS !=
                check_endpoint_state(ep, NULL, NULL)) {
            opal_progress();
        }
    }

    /* set the connection status to connected */
    /*JSL - change this macro */
    iboffload->connection_status[ALLGATHER_KNOMIAL_ALG] = true;
}
#endif

/**
 * Build and post the MQ task list of a k-nomial allgather that operates
 * directly on the user's receive buffer (large-message protocol).
 *
 * Steps, in order:
 *   - open the k-nomial connections on first use;
 *   - register pack_len * group_size bytes of RBUF and cache the lkey;
 *   - check the estimated MQ credits;
 *   - extra rank: wait for the proxy's RTR, send the local block, post the
 *     receive of the complete result and finish;
 *   - proxy rank: send an RTR to the extra rank and receive its block;
 *   - for each tree level: RTR sends, RTR receives, data sends, then data
 *     receives for every valid peer of that level;
 *   - proxy rank: send the complete buffer back to the extra rank;
 *   - flag the last wait task for completion signalling and post the list.
 *
 * @param iboffload_module Module owning the tree, endpoints and MQ.
 * @param coll_request     Request prepared by mca_bcol_iboffload_allgather_init().
 *
 * @return BCOL_FN_STARTED when the tasks were posted, or when credits /
 *         resources ran out and the fragment was moved to the pending list;
 *         BCOL_FN_NOT_STARTED when moving to the pending list failed;
 *         OMPI_ERROR on registration or task-setup failure; the return code
 *         of mca_bcol_iboffload_post_mqe_tasks() when posting fails.
 */
static int mca_bcol_iboffload_k_nomial_allgather_userbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module,
                                           mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc, src, dst, comm_dst, comm_src;
    int tree_order, pow_k, i, j;

    uint32_t pack_len;
    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
    int group_size = iboffload_module->group_size;
    int *group_list = iboffload_module->super.sbgp_partner_module->group_list;
    int my_comm_index = group_list[my_group_index];

    netpatterns_k_exchange_node_t *exchange_node = &iboffload_module->knomial_allgather_tree;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    IBOFFLOAD_VERBOSE(10,("Entering large msg iboffload allgather"));
    if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_KNOMIAL_ALG])) {
        IBOFFLOAD_VERBOSE(10,("Allgather open new connection "));
        bcol_iboffload_setup_allgather_endpoints_connection(iboffload_module);
    }

    pack_len = coll_request->count * coll_request->dtype->super.size;
    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                pack_len,
                coll_request->count,
                coll_request->dtype->super.size));

    /* Only the receive buffer is registered: every send below also reads
     * from RBUF. It has to hold one pack_len block per group member. */
    rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[RBUF].buf, pack_len * group_size,
                                           &coll_request->buffer_info[RBUF].iboffload_reg, iboffload_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        /* report the size that was actually requested */
        IBOFFLOAD_ERROR(("Cannot register memory: "
                         "addr - %p, %u bytes.\n",
                          coll_request->buffer_info[RBUF].buf,
                          (unsigned int) (pack_len * group_size)));
        return OMPI_ERROR;
    }
    coll_request->buffer_info[RBUF].lkey = coll_request->buffer_info[RBUF].iboffload_reg->mr->lkey;

    /* it is estimated mq consumption... */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    coll_fragment->tail_next = &coll_fragment->to_post;

    /* start with the extra / proxy phase */
    if (EXTRA_NODE == exchange_node->node_type) {
        /* send pack_len data to proxy */
        comm_dst = exchange_node->rank_extra_sources_array[0];
        /* the rank is used directly as the endpoint index */
        dst = comm_dst;

        /* wait for the proxy's ready-to-receive before sending */
        rc = mca_bcol_iboffload_recv_rtr_setup(
                                  &last_wait, dst, iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to "
                        "mca_bcol_iboffload_recv_rtr_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }

        /* send the local block */
        rc = mca_bcol_iboffload_send_large_buff_setup(
                    &last_send, RBUF, coll_request->buffer_info[RBUF].offset + my_comm_index*pack_len,
                    pack_len, dst,
                    iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to "
                        "mca_bcol_iboffload_send_large_buff_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }
        /* send is done */

        /* post the receive of the complete result */
        comm_src = comm_dst;
        src = dst;

        /* No RTR is sent to the proxy here on purpose: if that RTR bypasses
         * the large message receive on the proxy's side it triggers the
         * start of the recursive k-ing phase prematurely, causing random
         * data corruption. */
        rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                                  RBUF, coll_request->buffer_info[RBUF].offset,
                                  pack_len*group_size, src,
                                  iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }

        goto FINISHED;

    } else if (0 < exchange_node->n_extra_sources) {
        /* am a proxy, receive pack_len data from extra */
        comm_src = exchange_node->rank_extra_sources_array[0];
        src = comm_src;

        rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                                  src, iboffload_module,
                                  coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                                  RBUF,
                                  coll_request->buffer_info[RBUF].offset + pack_len*comm_src,
                                  pack_len, src,
                                  iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }
    }

    /* start recursive k - ing */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;
    for (i = 0; i < pow_k; i++) {

        /* Post ready-to-recv messages - I am here */
        for (j = 0; j < (tree_order - 1); j++) {
            comm_src = exchange_node->rank_exchanges[i][j];
            if (comm_src < 0) {
                continue;
            }
            src = comm_src;

            rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                    src, iboffload_module,
                    coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup"));
                if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                    goto out_of_resources;
                }
                return OMPI_ERROR;
            }
        }

        /* Post receive ready-to-recv message - I can send to you */
        for (j = 0; j < (tree_order - 1); j++) {
            comm_dst = exchange_node->rank_exchanges[i][j];
            /* remember, if we have extra ranks, then we won't participate
             * with at least one peer. Make a check:
             */
            if (comm_dst < 0) {
                continue;
            }
            dst = comm_dst;

            /* post ready-to-receive receive on sender's side */
            rc = mca_bcol_iboffload_recv_rtr_setup(
                    &last_wait, dst, iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to "
                            "mca_bcol_iboffload_recv_rtr_setup"));
                if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                    goto out_of_resources;
                }
                return OMPI_ERROR;
            }
        }

        /* (k-1) sends */
        for (j = 0; j < (tree_order - 1); j++) {
            comm_dst = exchange_node->rank_exchanges[i][j];
            /* remember, if we have extra ranks, then we won't participate
             * with at least one peer. Make a check
             */
            if (comm_dst < 0) {
                continue;
            }
            dst = comm_dst;

            /* offsets and lengths in payload_info are in units of pack_len */
            rc = mca_bcol_iboffload_send_large_buff_setup(
                    &last_send, RBUF,
                    coll_request->buffer_info[RBUF].offset +
                    pack_len*exchange_node->payload_info[i][j].s_offset,
                    exchange_node->payload_info[i][j].s_len*pack_len, dst,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to "
                            "mca_bcol_iboffload_send_large_buff_setup"));
                if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                    goto out_of_resources;
                }
                return OMPI_ERROR;
            }
            /* send is done */
        }

        /* we post receives after all sends in order to achieve concurrent
         * sends as well as assuring blocking until completely receiving
         * all data at level k before starting level k+1 sends
         */
        /* (k-1) receives - these are blocking */
        for (j = 0; j < (tree_order - 1); j++) {
            comm_src = exchange_node->rank_exchanges[i][j];
            if (comm_src < 0) {
                continue;
            }
            src = comm_src;

            rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                    RBUF,
                    coll_request->buffer_info[RBUF].offset +
                    pack_len*exchange_node->payload_info[i][j].r_offset,
                    exchange_node->payload_info[i][j].r_len*pack_len, src,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup"));
                if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                    goto out_of_resources;
                }
                return OMPI_ERROR;
            }
        }
    }

    /* last step, just send it back to the extra if I have one */
    if (0 < exchange_node->n_extra_sources) {
        comm_dst = exchange_node->rank_extra_sources_array[0];
        dst = comm_dst;

        /* no RTR wait is posted here: we are already guaranteed that the
         * extra rank is waiting */
        rc = mca_bcol_iboffload_send_large_buff_setup(
                    &last_send, RBUF, coll_request->buffer_info[RBUF].offset,
                    pack_len*group_size, dst,
                    iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to "
                        "mca_bcol_iboffload_send_large_buff_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }
        /* send is done */
    }

FINISHED:

    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    (coll_request)->n_fragments = 1;
    (coll_request)->n_frags_sent = 1;

    /* the last wait task carries the completion signal; its wr_id is
     * replaced by the fragment pointer and the original is saved */
    assert(NULL != last_wait);
    last_wait->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* this path works on the user buffer, so no ML buffer may be attached */
    assert(MCA_COLL_ML_NO_BUFFER == coll_request->ml_buffer_index);

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}

/* Sum the first n entries of list_connected. */
static inline int bcol_iboffload_allgather_sum_connected(const int *list_connected, int n)
{
    int idx;
    int total = 0;

    for (idx = 0; idx < n; idx++) {
        total += list_connected[idx];
    }

    return total;
}

/**
 * Build and post the MQ task list of a k-nomial allgather that operates on
 * the ML buffer (small-message protocol, RDMA write with immediate).
 *
 * Block sizes and offsets are expressed in units of pack_len and weighted
 * by the module's list_n_connected array: the local contribution is
 * list_connected[my_group_index] blocks located after the blocks of all
 * lower group indices.
 *
 * Steps, in order:
 *   - open the k-nomial connections on first use;
 *   - check the estimated MQ credits;
 *   - extra rank: write the local contribution to the proxy, post the
 *     receive of the complete result and finish;
 *   - proxy rank: post the receive of the extra rank's contribution;
 *   - for each tree level: all RDMA writes, then all receives, for every
 *     valid peer of that level;
 *   - proxy rank: write the complete result back to the extra rank;
 *   - flag the last wait task for completion signalling and post the list.
 *
 * @param iboffload_module Module owning the tree, endpoints and MQ.
 * @param coll_request     Request prepared by mca_bcol_iboffload_allgather_init().
 *
 * @return BCOL_FN_STARTED when the tasks were posted, or when credits /
 *         resources ran out and the fragment was moved to the pending list;
 *         BCOL_FN_NOT_STARTED when moving to the pending list failed;
 *         OMPI_ERROR on task-setup failure; the return code of
 *         mca_bcol_iboffload_post_mqe_tasks() when posting fails.
 */
static int mca_bcol_iboffload_k_nomial_allgather_mlbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module,
                                           mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc, src, dst, comm_dst, comm_src, i, j;
    int tree_order, pow_k, knt;
    uint32_t pack_len;
    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
    int group_size = iboffload_module->group_size;
    netpatterns_k_exchange_node_t *exchange_node =
                                  &iboffload_module->knomial_allgather_tree;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    int *list_connected = iboffload_module->super.list_n_connected;

    IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload allgather"));

    if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_KNOMIAL_ALG])) {
        IBOFFLOAD_VERBOSE(10,("Allgather open new connection "));
        bcol_iboffload_setup_allgather_endpoints_connection(iboffload_module);
    }

    pack_len = coll_request->count * coll_request->dtype->super.size;
    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                pack_len,
                coll_request->count,
                coll_request->dtype->super.size));

    /* it is estimated mq consumption... */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    coll_fragment->tail_next = &coll_fragment->to_post;

    /* hack: propagate the RDMA block's lkey into the request's local
     * buffer descriptor */
    coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey;

    if (EXTRA_NODE == exchange_node->node_type) {
        /* setup the rdma "send" of the local contribution to the proxy rank */
        comm_dst = exchange_node->rank_extra_sources_array[0];
        dst = comm_dst;

        /* my own offset: everything contributed by lower group indices */
        knt = bcol_iboffload_allgather_sum_connected(list_connected, my_group_index);

        rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
                &last_send,
                pack_len*list_connected[my_group_index],
                pack_len*knt /* source offset */,
                pack_len*knt /* destination offset */,
                dst, iboffload_module,
                coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_small_buff_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }
        /* send is done */

        /* setup the rdma "receive" from proxy */
        comm_src = comm_dst;
        src = dst;

        /* the complete result: the contributions of the whole group */
        knt = bcol_iboffload_allgather_sum_connected(list_connected, group_size);

        rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
                pack_len*knt, src,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }

        goto FINISHED;

    } else if (0 < exchange_node->n_extra_sources) {
        /* am a proxy, receive the extra rank's contribution */
        comm_src = exchange_node->rank_extra_sources_array[0];
        src = comm_src;

        rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
                pack_len*list_connected[src], src,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }
    }

    /* start recursive k - ing */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;
    for (i = 0; i < pow_k; i++) {

        /* send phase */
        for (j = 0; j < (tree_order - 1); j++) {
            comm_dst = exchange_node->rank_exchanges[i][j];
            /* remember, if we have extra ranks, then we won't participate
             * with at least one peer. Make a check
             */
            if (comm_dst < 0) {
                continue;
            }
            /* the rank is used directly as the endpoint index */
            dst = comm_dst;

            /* rdma "send" setup; the data lands at the same offset on the
             * peer as it has locally */
            rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
                    &last_send,
                    exchange_node->payload_info[i][j].s_len * pack_len,
                    exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */,
                    exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */,
                    dst, iboffload_module,
                    coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_send_small_buff_setup"));
                if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                    goto out_of_resources;
                }
                return OMPI_ERROR;
            }
            /* send is done */
        }

        /* rdma "recv" phase */
        for (j = 0; j < (tree_order - 1); j++) {
            comm_src = exchange_node->rank_exchanges[i][j];
            /* remember, if we have extra ranks, then we won't participate
             * with at least one peer. Make a check
             */
            if (comm_src < 0) {
                continue;
            }
            src = comm_src;

            rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
                    exchange_node->payload_info[i][j].r_len * pack_len, src,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
                if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                    goto out_of_resources;
                }
                return OMPI_ERROR;
            }
        }
    }

    /* last step, proxies send full data back to the extra ranks */
    if (0 < exchange_node->n_extra_sources) {
        comm_dst = exchange_node->rank_extra_sources_array[0];
        dst = comm_dst;

        knt = bcol_iboffload_allgather_sum_connected(list_connected, group_size);

        rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup(
                &last_send,
                pack_len*knt,
                0 /* source offset */,
                0 /* destination offset */,
                dst, iboffload_module,
                coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_small_buff_setup"));
            if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){
                goto out_of_resources;
            }
            return OMPI_ERROR;
        }
        /* send is done */
    }

FINISHED:

    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    (coll_request)->n_fragments = 1;
    (coll_request)->n_frags_sent = 1;

    /* the last wait task carries the completion signal; its wr_id is
     * replaced by the fragment pointer and the original is saved */
    assert(NULL != last_wait);
    last_wait->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* this path requires an ML buffer to be attached */
    assert(MCA_COLL_ML_NO_BUFFER != coll_request->ml_buffer_index);

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}

#if 0
static int mca_bcol_iboffload_neighbor_allgather_userbuffer_intra(
                                bcol_function_args_t *fn_arguments,
                                struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_iboffload_module_t *iboffload_module =
                    (mca_bcol_iboffload_module_t *)const_args->bcol_module;

    int rc;
    int mq_credits = iboffload_module->group_size * 2 * 2; /* large message protocol consumes
                                                            * twice as many mq credits
                                                            */

    bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args);
    mca_bcol_iboffload_collreq_t *coll_request;

    MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments);

    rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module,
                                           &coll_request, if_bcol_last, mq_credits,
                                           mca_bcol_iboffload_neighbor_allgather_userbuffer_exec);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    rc = coll_request->progress_fn(iboffload_module, coll_request);

    IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra was started [%d]\n", rc));
    return rc;
}
#endif

#if 1
static int mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra(bcol_function_args_t *fn_arguments,
                struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_iboffload_module_t *iboffload_module =
                    (mca_bcol_iboffload_module_t *)const_args->bcol_module;

    int rc;
    int mq_credits = ((iboffload_module->knomial_allgather_tree.tree_order - 1)*
                     iboffload_module->knomial_allgather_tree.log_tree_order + 1) * 2 * 2; /* large message protocol
                                                                                            * consumes
                                                                                            * twice as much
                                                                                            */

    bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args);
    mca_bcol_iboffload_collreq_t *coll_request;

    MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments);

    rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module,
                                           &coll_request, if_bcol_last, mq_credits,
                                           mca_bcol_iboffload_k_nomial_allgather_userbuffer_exec);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    rc = coll_request->progress_fn(iboffload_module, coll_request);

    IBOFFLOAD_VERBOSE(10,
("mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra was started [%d]\n", rc)); return rc; } #endif static int mca_bcol_iboffload_k_nomial_allgather_mlbuffer_intra(bcol_function_args_t *fn_arguments, struct mca_bcol_base_function_t *const_args) { mca_bcol_iboffload_module_t *iboffload_module = (mca_bcol_iboffload_module_t *)const_args->bcol_module; int rc; /* I'll add one for everyone, since nobody wants to feel left out */ int mq_credits = ((iboffload_module->knomial_allgather_tree.tree_order - 1)* iboffload_module->knomial_allgather_tree.log_tree_order + 1) * 2 ; bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); mca_bcol_iboffload_collreq_t *coll_request; MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module, &coll_request, if_bcol_last, mq_credits, mca_bcol_iboffload_k_nomial_allgather_mlbuffer_exec); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { return rc; } rc = coll_request->progress_fn(iboffload_module, coll_request); IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_small_msg_bcast_intra was started [%d]\n", rc)); return rc; } /* these progress engines are shared between alltoall and allgather and exist in both files, * should be moved to a common .h file */ static int mca_bcol_iboffload_collreq_mlbuffer_progress( bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { int i; mca_bcol_iboffload_collreq_t *coll_request = (mca_bcol_iboffload_collreq_t *) input_args->bcol_opaque_data; IBOFFLOAD_VERBOSE(10, ("Run progress (ml buffer).\n")); for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) { if (BCOL_IS_COMPLETED(coll_request)) { coll_request->user_handle_freed = true; if (COLLREQ_IS_DONE(coll_request)) { IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); RELEASE_COLLREQ(coll_request); } IBOFFLOAD_VERBOSE(10, ("Collective finished (ml buffer).\n")); return BCOL_FN_COMPLETE; } } IBOFFLOAD_VERBOSE(10, ("Collective not finished (ml 
buffer).\n")); return BCOL_FN_STARTED; } static int mca_bcol_iboffload_collreq_userbuffer_progress( bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { int i; mca_bcol_iboffload_collreq_t *coll_request = (mca_bcol_iboffload_collreq_t *) input_args->bcol_opaque_data; IBOFFLOAD_VERBOSE(10, ("Run progress (user buffer)\n")); /* Complete the allgather - progress releases full request descriptors */ for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) { if (coll_request->n_frag_mpi_complete == coll_request->n_fragments && coll_request->n_frag_net_complete == coll_request->n_fragments) { IBOFFLOAD_VERBOSE(10, ("Deregister user buff.\n")); if (NULL != coll_request->buffer_info[SBUF].iboffload_reg) { coll_request->module->device->mpool->mpool_deregister( coll_request->module->device->mpool, (mca_mpool_base_registration_t *) coll_request->buffer_info[SBUF].iboffload_reg); coll_request->buffer_info[SBUF].iboffload_reg = NULL; } if (NULL != coll_request->buffer_info[RBUF].iboffload_reg) { coll_request->module->device->mpool->mpool_deregister( coll_request->module->device->mpool, (mca_mpool_base_registration_t *) coll_request->buffer_info[RBUF].iboffload_reg); coll_request->buffer_info[RBUF].iboffload_reg = NULL; } RELEASE_COLLREQ(coll_request); IBOFFLOAD_VERBOSE(10, ("New bcast done !!!")); return BCOL_FN_COMPLETE; } } IBOFFLOAD_VERBOSE(10, ("Collective finished (user buffer).\n")); /* We are not done */ return BCOL_FN_STARTED; } int mca_bcol_iboffload_allgather_register(mca_bcol_base_module_t *super) { mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; IBOFFLOAD_VERBOSE(10, ("Register iboffload Allgather.\n")); comm_attribs.bcoll_type = BCOL_ALLGATHER; comm_attribs.comm_size_min = 0; comm_attribs.comm_size_max = 1024 * 1024; comm_attribs.waiting_semantics = NON_BLOCKING; inv_attribs.bcol_msg_min = 0; inv_attribs.bcol_msg_max = 20000; /* range 1 */ 
inv_attribs.datatype_bitmap = 0xffffffff; inv_attribs.op_types_bitmap = 0xffffffff; comm_attribs.data_src = DATA_SRC_KNOWN; mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, mca_bcol_iboffload_k_nomial_allgather_mlbuffer_intra, mca_bcol_iboffload_collreq_mlbuffer_progress); inv_attribs.bcol_msg_min = 10000000; inv_attribs.bcol_msg_max = 10485760; /* range 4 */ /* zero-copy k-nomial algorithm */ #if 1 mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra, mca_bcol_iboffload_collreq_userbuffer_progress); #endif /* zero-copy neighbor exchange algorithm */ #if 0 mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, mca_bcol_iboffload_neighbor_allgather_userbuffer_intra, mca_bcol_iboffload_collreq_userbuffer_progress); #endif return OMPI_SUCCESS; }