/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory.  All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies.  All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_ptpcoll_allreduce.h"

/*
 * Recursive K-ing allreduce
 */

/*
 * Post the non-blocking pre-exchange between an exchange node and its extra
 * sources.
 *
 * An EXCHANGE_NODE posts one irecv per extra source; the k-th receive lands
 * at data_buffer + (k + 1) * data_size, so the first data_size bytes (the
 * local contribution) are left untouched.  Any other node type posts a single
 * isend of the first data_size bytes to rank_extra_sources_array[0].
 *
 * Every posted request is stored at requests[*active_requests] and
 * *active_requests is incremented accordingly.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if a PML call fails.
 */
static inline int bcol_ptpcoll_allreduce_narray_schedule_extra_node_exchange (mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                                                              netpatterns_k_exchange_node_t *k_node,
                                                                              void *data_buffer, size_t data_size,
                                                                              ompi_request_t **requests,
                                                                              int *active_requests, int tag)
{
    ompi_communicator_t *comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    int peer_comm_rank, k, offset, rc;

    if (EXCHANGE_NODE == k_node->node_type) {
        /* the send data resides in the first part of the buffer */
        for (k = 0, offset = data_size ; k < k_node->n_extra_sources ; ++k, offset += data_size) {
            peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[k]];

            PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p len %zu tag %d",
                                 peer_comm_rank, data_buffer, data_size, tag));
            rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + offset),
                                    data_size, MPI_BYTE, peer_comm_rank, tag, comm,
                                    &requests[*active_requests]));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                return OMPI_ERROR;
            }

            ++(*active_requests);
        }
    } else {
        peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[0]];

        PTPCOLL_VERBOSE(10, ("Send data to %d, addr %p len %zu tag %d",
                             peer_comm_rank, data_buffer, data_size, tag));

        rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer_comm_rank,
                                tag, MCA_PML_BASE_SEND_STANDARD, comm,
                                &(requests[*active_requests])));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to send data"));
            return OMPI_ERROR;
        }

        ++(*active_requests);
    }

    return OMPI_SUCCESS;
}

/*
 * Fold `sources` received contributions into the local one.
 *
 * The k-th contribution is read from data_buffer + (k + 1) * data_size and
 * combined into the start of data_buffer with ompi_op_reduce, where data_size
 * is the value mca_bcol_base_get_buff_length() reports for (data_type, count).
 */
static inline void bcol_ptpcoll_allreduce_narray_reduce (void *data_buffer, struct ompi_datatype_t *data_type,
                                                         int count, struct ompi_op_t *op, int sources)
{
    size_t data_size = mca_bcol_base_get_buff_length(data_type, count);

    for (int k = 0, offset = data_size ; k < sources ; ++k, offset += data_size) {
        ompi_op_reduce(op, (char *) data_buffer + offset, data_buffer, count, data_type);
    }
}

/*
 * Re-entrant progress function for the n-ary recursive k-ing allreduce.
 *
 * The state kept in ml_buf_desc[buffer_index] (iteration, active_requests)
 * tells the function where to resume:
 *   - iteration == -1: the extra-node pre-exchange has not finished yet;
 *   - 0 <= iteration < n_exchanges: recursive k-ing exchange `iteration`;
 *   - iteration == n_exchanges: hand the result to the extra sources.
 *
 * Returns BCOL_FN_COMPLETE when everything finished, BCOL_FN_STARTED when
 * requests are still pending (call again later), or an error code.
 */
static int bcol_ptpcoll_allreduce_narraying_progress (bcol_function_args_t *input_args,
                                                      struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    void *data_buffer = (void *) ( (unsigned char *) input_args->sbuf +
                                   (size_t) input_args->sbuf_offset);
    struct ompi_datatype_t *data_type = input_args->dtype;
    uint32_t buffer_index = input_args->buffer_index;
    struct ompi_op_t *op = input_args->op;
    int count = input_args->count;
    int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int k, rc, peer, group_peer;
    int offset = 0;
    ompi_communicator_t *comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    int k_radix = k_node->tree_order;
    size_t data_size = mca_bcol_base_get_buff_length(data_type, count);
    int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration);

    /* if we are just starting the collective and there are extra sources then schedule the
     * extra node exchange. otherwise check if the exchange is complete. */
    if (-1 == *iteration) {
        if (0 < k_node->n_extra_sources) {
            /* nothing outstanding means the pre-exchange was not posted yet */
            if (!(*active_requests)) {
                rc = bcol_ptpcoll_allreduce_narray_schedule_extra_node_exchange (ptpcoll_module, k_node, data_buffer,
                                                                                 data_size, requests, active_requests,
                                                                                 tag);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                    return rc;
                }
            }

            /* check for extra node exchange completion */
            if (!mca_bcol_ptpcoll_test_all_for_match (active_requests, requests, &rc)) {
                return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc;
            }

            if (EXCHANGE_NODE == k_node->node_type) {
                bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op,
                                                      k_node->n_extra_sources);
            }
        }

        /* start recursive k-ing */
        *iteration = 0;
    }

    /* resume an exchange that was posted by an earlier call but had not completed */
    if (*iteration < k_node->n_exchanges) {
        if (*active_requests) {
            if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) {
                return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc;
            }

            ++(*iteration);
            bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op, k_radix - 1);
        }
    }

    for ( ; *iteration < k_node->n_exchanges ; ++(*iteration)) {
        /* send the current partial result to each of the k - 1 peers of this step */
        for (k = 0; k < k_radix - 1; k++) {
            group_peer = k_node->rank_exchanges[*iteration][k];

            peer = group_list[group_peer];

            PTPCOLL_VERBOSE(10, ("Send data to %d, addr %p len %zu tag %d",
                                 peer, data_buffer, data_size, tag));
            rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer, tag,
                                    MCA_PML_BASE_SEND_STANDARD, comm,
                                    &(requests[*active_requests])));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }

            ++(*active_requests);
        }

        /* receive each peer's contribution behind the local data */
        for (k = 0, offset = data_size ; k < k_radix - 1 ; ++k, offset += data_size) {
            group_peer = k_node->rank_exchanges[*iteration][k];
            peer = group_list[group_peer];

            PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p len %zu tag %d",
                                 peer, data_buffer, data_size, tag));
            rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + offset ),
                                    data_size, MPI_BYTE, peer, tag, comm,
                                    &requests[*active_requests]));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                return OMPI_ERROR;
            }

            ++(*active_requests);
        }

        if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) {
            return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc;
        }

        bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op, k_radix - 1);
    }

    /* ensure extra nodes get the result */
    if (0 < k_node->n_extra_sources) {
        /* nothing outstanding means the final hand-off was not posted yet */
        if (!(*active_requests)) {
            int peer_comm_rank;

            if (EXTRA_NODE == k_node->node_type) {
                peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[0]];

                PTPCOLL_VERBOSE(10, ("EXTRA_NODE: Recv data from %d, addr %p len %zu tag %d",
                                     peer_comm_rank, data_buffer, data_size, tag));
                rc = MCA_PML_CALL(irecv(data_buffer, data_size, MPI_BYTE, peer_comm_rank,
                                        tag, comm, &requests[*active_requests]));
                if( OMPI_SUCCESS != rc ) {
                    PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                    return OMPI_ERROR;
                }

                ++(*active_requests);
            } else {
                for (k = 0; k < k_node->n_extra_sources; k++) {
                    peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[k]];

                    PTPCOLL_VERBOSE(10, ("EXCHANGE_NODE: Send data to %d, addr %p len %zu tag %d",
                                         peer_comm_rank, data_buffer, data_size, tag));
                    rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer_comm_rank,
                                            tag, MCA_PML_BASE_SEND_STANDARD, comm,
                                            &(requests[*active_requests])));
                    if( OMPI_SUCCESS != rc ) {
                        PTPCOLL_VERBOSE(10, ("Failed to send data"));
                        return OMPI_ERROR;
                    }

                    ++(*active_requests);
                }
            }
        }

        if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) {
            return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc;
        }
    }

    return BCOL_FN_COMPLETE;
}

/*
 * Entry point of the n-ary recursive k-ing allreduce: reset the per-buffer
 * state (tag, radix mask, iteration, request count, status) and hand over to
 * the progress function.  Returns whatever the progress function returns.
 */
int bcol_ptpcoll_allreduce_narraying_init(bcol_function_args_t *input_args,
                                          struct mca_bcol_base_function_t *const_args){

    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    uint64_t sequence_number = input_args->sequence_num;
    uint32_t buffer_index = input_args->buffer_index;
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    size_t buffer_size;
    int tag;

    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag;
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1;
    /* start with extra node exchange if needed */
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration = -1;
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests = 0;
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status = PTPCOLL_NOT_STARTED;

    /*
     * ML buffer is segmented into k segments and each of the k segment is used
     * for reductions
     */
    /* This has to be based on ml buffer size. Need to take into account the space used
     * by the headers of other bcol modules. */
    buffer_size = ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX;
    assert(buffer_size >= count * dtype->super.size * ptpcoll_module->k_nomial_radix);
    (void)buffer_size;  /* silence compiler warning */
    (void)dtype;
    (void)count;

    return bcol_ptpcoll_allreduce_narraying_progress (input_args, const_args);
}

/*
 * Return the index of the segment that `peer` owns inside a block that is
 * split into tree_order segments, for a step whose stride is kpow_num ranks.
 */
static inline int compute_seg_index(int peer, int kpow_num, int tree_order) {

    int peer_base, peer_position, peer_base_rank, peer_index;

    peer_base = peer / (kpow_num * tree_order);
    peer_base_rank = peer_base * kpow_num * tree_order ;
    /* guard the modulo: peer_base_rank is 0 for the first group of ranks */
    peer_position = peer_base_rank == 0 ? peer : peer % (peer_base_rank);
    peer_index = peer_position / kpow_num ;

    return peer_index;
}

/*
 * Fill offsets[step][...] for every exchange step in [0, n_exchanges).
 *
 * For each step the table holds:
 *   BLOCK_OFFSET            byte offset of the block handled in this step
 *   BLOCK_COUNT             number of elements in that block
 *   LOCAL_REDUCE_SEG_OFFSET byte offset, relative to the block, of the
 *                           segment this rank reduces
 *   SEG_SIZE                size of one segment in bytes
 *
 * Step 0 covers the whole buffer; each later step works on the segment
 * selected by the previous one, split again into k_radix parts.
 *
 * Always returns 0.
 */
int compute_knomial_allgather_offsets(int group_index, int count, struct ompi_datatype_t *dtype,int k_radix,int n_exchanges,
                                      int **offsets){

    int modulo_group_size;
    size_t seg_count, seg_size, seg_index, seg_offset;
    size_t block_offset, block_count;
    int exchange_step;
    ptrdiff_t lb, extent;

    if (0 >= n_exchanges) {
        PTPCOLL_VERBOSE(10,("Nothing to initialize "));
        return 0;
    }
    modulo_group_size = 1;
    seg_count = count / k_radix;
    ompi_datatype_get_extent(dtype, &lb, &extent);
    seg_size = seg_count * extent;

    seg_index = group_index % k_radix;
    seg_offset = seg_index * seg_size;

    offsets[0][BLOCK_OFFSET] = block_offset = 0;
    offsets[0][BLOCK_COUNT] = block_count = count;
    offsets[0][LOCAL_REDUCE_SEG_OFFSET] = seg_offset;
    offsets[0][SEG_SIZE] = seg_size;

    for(exchange_step = 1; exchange_step < n_exchanges; exchange_step++) {

        /* Previous step's segment is this exchange step's block */
        block_count = seg_count;
        block_offset = seg_offset;

        /* Divide the segment into k parts */
        seg_count = seg_count / k_radix;
        seg_size = seg_count * extent;

        /* Among different segments in block, which segment should I reduce ? */
        /* For allgather phase, I will not send out this segment to peers */
        modulo_group_size *= k_radix;
        seg_index = compute_seg_index(group_index, modulo_group_size, k_radix);

        seg_offset = seg_index * seg_size;

        offsets[exchange_step][BLOCK_OFFSET] = block_offset;
        offsets[exchange_step][LOCAL_REDUCE_SEG_OFFSET] = seg_offset;
        offsets[exchange_step][BLOCK_COUNT] = block_count;
        offsets[exchange_step][SEG_SIZE] = seg_size;

        /* Change to absolute offset */
        seg_offset = block_offset + seg_offset;
    }

    return 0;
}

/*
 * Number of bytes of a segment that lie before padded_offset.
 *
 * The segment starts at block_offset + offset_in_block and is segment_size
 * bytes long.  If its end reaches or passes padded_offset the part at or
 * beyond padded_offset is cut off; the result can then be zero or negative,
 * which callers treat as "nothing to transfer".
 */
static inline int compute_clipped_segment_size(int block_offset, int offset_in_block,
                                               int segment_size, int padded_offset)
{
    int segment_end = block_offset + offset_in_block + segment_size;

    if (segment_end >= padded_offset) {
        return segment_size - (segment_end - padded_offset);
    }

    return segment_size;
}

/* Size in bytes of the part of a segment that actually has to be sent. */
static inline int compute_send_segment_size(int block_offset,
                                            int send_offset, int segment_size, int padded_offset) {
    return compute_clipped_segment_size(block_offset, send_offset, segment_size, padded_offset);
}

/* Size in bytes of the part of a segment that actually has to be received. */
static inline int compute_recv_segment_size(int block_offset,
                                            int recv_offset, int segment_size, int padded_offset) {
    return compute_clipped_segment_size(block_offset, recv_offset, segment_size, padded_offset);
}

/*
 * Poll the outstanding requests until all of them matched or the test helper
 * reports a failure through its rc output.
 *
 * Stopping on a failure matters: the progress function above treats
 * "not matched and rc != OMPI_SUCCESS" as an error, so polling on in that
 * state could spin forever.
 *
 * Returns the last rc reported by the helper (OMPI_SUCCESS on completion).
 */
static inline int bcol_ptpcoll_allreduce_wait_all(int *active_requests, ompi_request_t **requests)
{
    int rc = OMPI_SUCCESS;

    while (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) {
        if (OMPI_SUCCESS != rc) {
            break;
        }
    }

    return rc;
}

/*
 *
 * K-nomial Reduce Scatter
 * Example k=3 n=9
 *
 * | ABCDEFGH |0|
 *
 * Number of Exchange steps = log (basek) n
 * Number of steps in exchange step = k (radix)
 *
 * block_size = Size of data that is reduce in exchange step
 * segment_size = Size of data that is send or received by rank in radix step
 *
 * block_size = segment_size * k
 *
 * my_block_start_addr = Address of the segment in the block where I reference my
 * offsets
 *
 * This is version 1 : Experimenting with decoupling offset calculations
 *
 * The function blocks until every exchange step finished.  Step geometry is
 * read from ptpcoll_module->allgather_offsets, which must have been filled by
 * compute_knomial_allgather_offsets().  The first step reads from sbuf, all
 * later steps read from rbuf; results are always written to rbuf.  Bytes at
 * or beyond padded_start_byte are neither transferred nor reduced.
 *
 * count and relative_group_index are accepted for interface compatibility
 * but are not used.
 *
 * Returns OMPI_SUCCESS, OMPI_ERROR if posting a request fails, or the error
 * reported while waiting for the requests.
 */
int bcol_ptpcoll_allreduce_recursivek_scatter_reduce(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                                     const int buffer_index, void *sbuf,
                                                     void *rbuf,
                                                     struct ompi_op_t *op,
                                                     const int count, struct ompi_datatype_t *dtype,
                                                     const int relative_group_index,
                                                     const int padded_start_byte){
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    void *my_block_start_addr = sbuf, *my_block_addr = NULL;
    int i, k, group_peer, peer ;
    int k_radix = k_node->tree_order;
    int rc = OMPI_SUCCESS;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    void *my_recv_start_addr = rbuf, *my_recv_addr;
    size_t block_offset, reduce_seg_offset, send_offset, recv_offset;
    int seg_size, block_size;
    int block_count, seg_count;
    ptrdiff_t lb, extent;

    (void) count;
    (void) relative_group_index;

    ompi_datatype_get_extent(dtype, &lb, &extent);

    for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
         i < k_node->n_exchanges; i++) {

        block_offset = ptpcoll_module->allgather_offsets[i][BLOCK_OFFSET];
        reduce_seg_offset = ptpcoll_module->allgather_offsets[i][LOCAL_REDUCE_SEG_OFFSET];
        block_count = ptpcoll_module->allgather_offsets[i][BLOCK_COUNT];
        seg_size = ptpcoll_module->allgather_offsets[i][SEG_SIZE];
        block_size = block_count * extent;

        PTPCOLL_VERBOSE(10,("Block offset %zu, reduce_seg_offset %zu, block_count %d seg_size %d",
                            block_offset, reduce_seg_offset, block_count, seg_size));

        seg_count = block_count / k_radix;
        my_block_addr = (void*)((char*)my_block_start_addr + block_offset);
        my_recv_addr = (void*)((char*)my_recv_start_addr + block_offset);

        /* send every peer the segment following ours, wrapping inside the block */
        for (k = 0; k < k_radix - 1; k++) {
            size_t soffset;
            int snd_size = 0;

            group_peer = k_node->rank_exchanges[i][k];
            peer = group_list[group_peer];

            send_offset = reduce_seg_offset + (seg_size * (k + 1));

            if ((int)send_offset + seg_size > block_size) {
                send_offset = send_offset % block_size;
            }

            PTPCOLL_VERBOSE(10, ("Send data to %d,send offset %zu len %d",
                                 peer, send_offset, seg_size));

            soffset = send_offset;
            snd_size =
                compute_send_segment_size((int)block_offset,(int)soffset,(int)seg_size,padded_start_byte);

            if (snd_size > 0) {
                rc = MCA_PML_CALL(isend((void *)((unsigned char *)my_block_addr
                                                 + soffset),
                                        snd_size, MPI_BYTE,
                                        peer, tag, MCA_PML_BASE_SEND_STANDARD, comm,
                                        &(requests[*active_requests])));

                if( OMPI_SUCCESS != rc ) {
                    PTPCOLL_VERBOSE(10, ("Failed to send the segment to %d", peer));
                    return OMPI_ERROR;
                }

                ++(*active_requests);
            }
        }

        /*
         * Receive the segments to tmp addr and then do a reduction
         */
        for (k = 0; k < k_radix - 1; k++) {
            int recv_size=0;

            group_peer = k_node->rank_exchanges[i][k];
            peer = group_list[group_peer];

            recv_offset = reduce_seg_offset + (seg_size * (k+1));

            if ((int)recv_offset + seg_size > block_size) {
                recv_offset = recv_offset % block_size;
            }

            PTPCOLL_VERBOSE(10, ("Receive data to receive buffer at offset %zu\n",
                                 recv_offset));
            /* the incoming data is the peer's copy of our own segment, so the
             * size is derived from reduce_seg_offset, not from recv_offset */
            recv_size = compute_recv_segment_size((int)block_offset,
                                                  (int)reduce_seg_offset, (int)seg_size,
                                                  padded_start_byte);

            if (recv_size > 0 ) {
                rc = MCA_PML_CALL(irecv((void *)((unsigned char *)
                                                 my_recv_addr + recv_offset),
                                        recv_size, MPI_BYTE,
                                        peer, tag, comm, &requests[*active_requests]));

                if( OMPI_SUCCESS != rc ) {
                    PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer));
                    return OMPI_ERROR;
                }

                ++(*active_requests);
            }
        }

        /* block until this step's traffic is done; give up on error */
        rc = bcol_ptpcoll_allreduce_wait_all(active_requests, requests);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }

        /* Do a reduction on received buffers */
        {
            void *src_data_buffer = my_block_addr;
            void *dst_data_buffer = my_recv_addr;
            int reduce_seg_end = (int)(block_offset + reduce_seg_offset) + seg_size;
            int reduce_data_count;

            /* only elements in front of the padding take part in the reduction */
            if (reduce_seg_end >= padded_start_byte) {
                reduce_data_count = (seg_size - (reduce_seg_end - padded_start_byte)) /
                    (int)dtype->super.size;
            } else {
                reduce_data_count = (int)seg_count;
            }

            for (k = 0; k < k_radix - 1; k++) {
                recv_offset = reduce_seg_offset + (seg_size * (k+1));

                if ((int)recv_offset + seg_size > block_size) {
                    recv_offset = recv_offset % block_size;
                }

                if (reduce_data_count > 0) {
                    ompi_3buff_op_reduce(op,
                                         (void*)((unsigned char*)my_recv_addr + recv_offset),
                                         (void*)((unsigned char*)src_data_buffer + reduce_seg_offset),
                                         (void*)((unsigned char*)dst_data_buffer + reduce_seg_offset),
                                         reduce_data_count,dtype);
                }

                /* after the first reduction the partial result lives in the destination */
                src_data_buffer = dst_data_buffer;
            }
        }

        /* After first iteration we have data (to work with) in recv buffer */
        my_block_start_addr = rbuf;
    }

    return rc;
}

/*
 * K-nomial allgather, the second half of the scatter-reduce/allgather
 * allreduce.  It walks the table in ptpcoll_module->allgather_offsets from the
 * last exchange step back to the first.  In every step the rank sends the
 * segment it owns to each of its k - 1 peers and receives their segments
 * into the remaining slots of the block in rbuf.  Bytes at or beyond
 * padded_start_byte are not transferred.
 *
 * The function blocks until every step finished.  sbuf, count and
 * relative_group_index are accepted for interface compatibility but are not
 * used.
 *
 * Returns OMPI_SUCCESS, OMPI_ERROR if posting a request fails, or the error
 * reported while waiting for the requests.
 */
int bcol_ptpcoll_allreduce_knomial_allgather(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                             const int buffer_index,
                                             void *sbuf,void *rbuf, int count, struct ompi_datatype_t *dtype,
                                             const int relative_group_index,
                                             const int padded_start_byte){

    size_t block_offset = 0, send_offset = 0, recv_offset = 0;
    int seg_size=0, block_size=0;
    int i,k;
    void *my_block_start_addr = rbuf, *my_block_addr;
    size_t block_count;
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    int k_radix = k_node->tree_order;
    int peer, group_peer;
    int rc = OMPI_SUCCESS;
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    int exchange_step;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    ptrdiff_t lb, extent;

    (void) sbuf;
    (void) count;
    (void) relative_group_index;

    ompi_datatype_get_extent(dtype, &lb, &extent);

    for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
         i < k_node->n_exchanges; i++) {

        /* the allgather replays the reduce-scatter steps in reverse order */
        exchange_step = k_node->n_exchanges - 1 - i;

        block_offset = ptpcoll_module->allgather_offsets[exchange_step][BLOCK_OFFSET];
        send_offset = ptpcoll_module->allgather_offsets[exchange_step][LOCAL_REDUCE_SEG_OFFSET];
        block_count = ptpcoll_module->allgather_offsets[exchange_step][BLOCK_COUNT];
        seg_size = ptpcoll_module->allgather_offsets[exchange_step][SEG_SIZE];
        block_size = block_count * extent;

        PTPCOLL_VERBOSE(10, ("Send offset %zu block_offset %zu seg_size %d\n",
                             send_offset, block_offset, seg_size));

        my_block_addr = (void*)((unsigned char*)my_block_start_addr + block_offset);

        for (k = 0; k < k_radix - 1; k++) {
            size_t soffset=0;
            int snd_size = 0;

            group_peer = k_node->rank_exchanges[exchange_step][k];
            peer = group_list[group_peer];

            soffset = send_offset;
            snd_size = compute_send_segment_size((int)block_offset,
                                                 (int)soffset,
                                                 (int)seg_size,
                                                 padded_start_byte);
            if (snd_size > 0) {
                rc = MCA_PML_CALL(isend((void *)((unsigned char *)my_block_addr
                                                 + soffset),
                                        snd_size, MPI_BYTE,
                                        peer, tag, MCA_PML_BASE_SEND_STANDARD, comm,
                                        &(requests[*active_requests])));

                if( OMPI_SUCCESS != rc ) {
                    PTPCOLL_VERBOSE(10, ("Failed to send the segment to %d", peer));
                    return OMPI_ERROR;
                }

                ++(*active_requests);
            }

            PTPCOLL_VERBOSE(10, ("Send data to receive buffer at offset %zu to %d\n",
                                 send_offset, peer));
        }

        for (k = 0; k < k_radix - 1; k++) {
            int recv_size=0;

            group_peer = k_node->rank_exchanges[exchange_step][k];
            peer = group_list[group_peer];

            recv_offset = send_offset + (k + 1) * seg_size;

            if ((int)recv_offset + seg_size > block_size){
                recv_offset = recv_offset % block_size;
            }

            PTPCOLL_VERBOSE(10, ("Receive data to receive buffer at offset %zu from %d\n",
                                 recv_offset, peer));

            recv_size = compute_recv_segment_size((int)block_offset,
                                                  (int)recv_offset,
                                                  (int)seg_size,
                                                  padded_start_byte);
            if (recv_size > 0) {
                rc = MCA_PML_CALL(irecv((void *)((unsigned char *)
                                                 my_block_addr + recv_offset),
                                        recv_size, MPI_BYTE,
                                        peer, tag, comm, &requests[*active_requests]));

                if( OMPI_SUCCESS != rc ) {
                    PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer));
                    return OMPI_ERROR;
                }

                ++(*active_requests);
            }
        }

        /* block until this step's traffic is done; give up on error */
        rc = bcol_ptpcoll_allreduce_wait_all(active_requests, requests);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }
    }

    return rc;
}

/*
 * Number of elements that must be appended to `count` so that it can be
 * divided by k_radix in each of the n_exchanges steps.  Returns 0 when no
 * step leaves a remainder; otherwise the distance from count to the next
 * multiple of k_radix^n_exchanges.
 */
static inline int compute_padding_count(int count, int k_radix, int n_exchanges){
    bool fpadding = false;
    size_t dsize;
    int i, pad_count=0, kpow;

    /* is padding required */
    dsize = count;
    kpow = 1;
    for ( i=0; i < n_exchanges; i++) {
        if (dsize % k_radix) {
            fpadding = true;
        }
        dsize /= k_radix;
        kpow *= k_radix;
    }

    if (fpadding) {
        pad_count = count % kpow;
        pad_count = kpow - pad_count;
    }

    return pad_count;
}

/*
 * Blocking allreduce built from a recursive k-ing reduce-scatter followed by
 * a k-nomial allgather.  bcol_ptpcoll_allreduce_init() registers it for
 * modules whose pow_knum equals group_size.
 *
 * Returns BCOL_FN_COMPLETE on success, or the error code of the phase that
 * failed.
 */
int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init(bcol_function_args_t *input_args,
                                                                    struct mca_bcol_base_function_t *const_args){

    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    struct ompi_op_t *op = input_args->op;
    int tag;
    int rc;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    uint64_t sequence_number = input_args->sequence_num;
    uint32_t buffer_index = input_args->buffer_index;
    void *src_buffer = (void *) (
                                 (unsigned char *)input_args->sbuf +
                                 (size_t)input_args->sbuf_offset);
    void *recv_buffer = (void *) (
                                  (unsigned char *)input_args->rbuf +
                                  (size_t)input_args->rbuf_offset);
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration);
    int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int *status = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status);
    ptrdiff_t lb, extent;

    /* Get the knomial tree */
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    int k_radix = k_node->tree_order;
    int n_exchanges = k_node->n_exchanges;
    int padded_start_byte;
    int padding_count = compute_padding_count(count, k_radix, n_exchanges);

    ompi_datatype_get_extent(dtype, &lb, &extent);
    /* first byte behind the real data; everything from here on is padding */
    padded_start_byte = count * extent;

    /* Init for making the functions Re-entrant */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag;
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1;
    *active_requests = 0;
    *status = PTPCOLL_NOT_STARTED;
    *iteration = 0;

    compute_knomial_allgather_offsets(my_group_index,count + padding_count, dtype,k_radix,n_exchanges,
                                      ptpcoll_module->allgather_offsets);

    /* Perform a recursive k'ing reduce scatter */
    rc = bcol_ptpcoll_allreduce_recursivek_scatter_reduce(ptpcoll_module, buffer_index,
                                                          src_buffer, recv_buffer, op, count + padding_count, dtype,
                                                          my_group_index,padded_start_byte);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    /* Perform a recursive k'ing allgather */
    rc = bcol_ptpcoll_allreduce_knomial_allgather(ptpcoll_module,
                                                  buffer_index,
                                                  src_buffer, recv_buffer, count + padding_count, dtype,
                                                  my_group_index, padded_start_byte);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    return BCOL_FN_COMPLETE;
}

/*
 * Blocking pre-step for groups with extra sources.
 *
 * An EXCHANGE_NODE receives the full contribution (count elements) of each of
 * its extra sources into a scratch buffer and reduces it into rbuf; the first
 * reduction reads the local data from sbuf, later ones from rbuf.  Any other
 * node type sends its sbuf to rank_extra_sources_array[0].
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if the scratch buffer cannot be
 * allocated or a PML call fails.
 */
int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                                           int buffer_index,
                                                           void *sbuf,
                                                           void *rbuf,
                                                           struct ompi_op_t *op,
                                                           const int count, struct ompi_datatype_t *dtype){
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    int k, peer ;
    int rc = OMPI_SUCCESS;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    int block_count, block_size;
    char *tmprecv_buffer = NULL, *data_src_buffer, *data_dst_buffer;
    ptrdiff_t lb, extent;

    ompi_datatype_get_extent(dtype, &lb, &extent);

    block_count = count;
    block_size = count * extent;

    data_src_buffer = sbuf;
    data_dst_buffer = rbuf;

    if (EXCHANGE_NODE == k_node->node_type) {
        /* the scratch buffer is only needed on the receiving side */
        if (0 < block_size && 0 < k_node->n_extra_sources) {
            tmprecv_buffer = malloc(block_size);
            if (NULL == tmprecv_buffer) {
                PTPCOLL_VERBOSE(10, ("Failed to allocate the receive buffer"));
                return OMPI_ERROR;
            }
        }

        for (k = 0; k < k_node->n_extra_sources; k++){

            peer = ptpcoll_module->super.sbgp_partner_module->group_list[
                k_node->rank_extra_sources_array[k]];

            rc = MCA_PML_CALL(recv((void *)((unsigned char *)tmprecv_buffer),
                                   block_size, MPI_BYTE,
                                   peer, tag, comm, MPI_STATUS_IGNORE));

            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer));
                rc = OMPI_ERROR;
                goto clean;
            }

            ompi_3buff_op_reduce(op, (void*)((unsigned char*)data_src_buffer),
                                 (void*)((unsigned char*)tmprecv_buffer),
                                 (void*)((unsigned char*)data_dst_buffer),
                                 block_count,dtype);
            /* from now on the partial result lives in the destination buffer */
            data_src_buffer = data_dst_buffer;
        }
    } else {
        peer = ptpcoll_module->super.sbgp_partner_module->group_list[
            k_node->rank_extra_sources_array[0]];

        rc = MCA_PML_CALL(send((void *)((unsigned char *)sbuf),
                               block_size, MPI_BYTE,
                               peer, tag, MCA_PML_BASE_SEND_STANDARD, comm));

        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to send data"));
            rc = OMPI_ERROR;
            goto clean;
        }
    }

clean:
    free(tmprecv_buffer);

    return rc;
}

/*
 * Blocking post-step for groups with extra sources: hand the final result in
 * rbuf (count elements) to the nodes that did not take part in the k-nomial
 * exchange.  An EXTRA_NODE receives it from rank_extra_sources_array[0]; any
 * other node type sends it to each of its extra sources.
 *
 * sbuf is accepted for interface compatibility but is not used.
 *
 * Returns OMPI_SUCCESS, OMPI_ERROR if posting a request fails, or the error
 * reported while waiting for the requests.
 */
int bcol_ptpcoll_allreduce_knomial_allgather_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                                   int buffer_index,
                                                   void *sbuf,
                                                   void *rbuf,
                                                   const int count, struct ompi_datatype_t *dtype){

    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    int k, peer ;
    int rc = OMPI_SUCCESS;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    int block_size;
    ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ptrdiff_t lb, extent;

    (void) sbuf;

    ompi_datatype_get_extent(dtype, &lb, &extent);

    block_size = count * extent;

    if (EXTRA_NODE == k_node->node_type) {
        peer = ptpcoll_module->super.sbgp_partner_module->group_list[
            k_node->rank_extra_sources_array[0]];

        rc = MCA_PML_CALL(irecv((void *)((unsigned char *)rbuf),
                                block_size, MPI_BYTE,
                                peer, tag, comm, &requests[*active_requests]));

        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to receive data"));
            return OMPI_ERROR;
        }

        ++(*active_requests);
    } else {
        for (k = 0; k < k_node->n_extra_sources; k++) {
            peer = ptpcoll_module->super.sbgp_partner_module->group_list[
                k_node->rank_extra_sources_array[k]];

            rc = MCA_PML_CALL(isend((void *)((unsigned char *)rbuf),
                                    block_size, MPI_BYTE,
                                    peer, tag, MCA_PML_BASE_SEND_STANDARD, comm,
                                    &(requests[*active_requests])));

            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }

            ++(*active_requests);
        }
    }

    /* block until the hand-off is done; give up on error */
    return bcol_ptpcoll_allreduce_wait_all(active_requests, requests);
}

/*
 * Blocking reduce-scatter/allgather allreduce for groups that have extra
 * sources.  bcol_ptpcoll_allreduce_init() registers it for modules whose
 * pow_knum differs from group_size.
 *
 * EXCHANGE_NODE: absorb the extra sources' data, run the reduce-scatter and
 * allgather phases, then send the result to the extra sources.
 * EXTRA_NODE: send the local data to the proxy and receive the final result.
 *
 * Returns BCOL_FN_COMPLETE on success, or the error code of the phase that
 * failed.
 */
int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init(bcol_function_args_t *input_args,
                                                                          struct mca_bcol_base_function_t *const_args){

    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    struct ompi_op_t *op = input_args->op;
    int tag;
    int rc;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    uint64_t sequence_number = input_args->sequence_num;
    uint32_t buffer_index = input_args->buffer_index;
    void *src_buffer = (void *) (
                                 (unsigned char *)input_args->sbuf +
                                 (size_t)input_args->sbuf_offset);
    void *recv_buffer = (void *) (
                                  (unsigned char *)input_args->rbuf +
                                  (size_t)input_args->rbuf_offset);
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration);
    int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int *status = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status);
    ptrdiff_t lb, extent;

    /* Get the knomial tree */
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    int k_radix = k_node->tree_order;
    int n_exchanges = k_node->n_exchanges;
    int padded_start_byte;
    int padding_count = compute_padding_count(count, k_radix, n_exchanges);
    void *tmpsrc_buffer = NULL;

    ompi_datatype_get_extent(dtype, &lb, &extent);
    /* first byte behind the real data; everything from here on is padding */
    padded_start_byte = count * extent;

    /* Init for making the functions Re-entrant */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag;
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1;
    *active_requests = 0;
    *status = PTPCOLL_NOT_STARTED;
    *iteration = 0;

    compute_knomial_allgather_offsets(my_group_index,count + padding_count, dtype,k_radix,n_exchanges,
                                      ptpcoll_module->allgather_offsets);

    if (EXCHANGE_NODE == k_node->node_type) {
        rc = bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(ptpcoll_module,
                                                                    buffer_index,
                                                                    src_buffer, recv_buffer, op, count, dtype);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }

        /* once extra data was absorbed the partial result lives in recv_buffer */
        tmpsrc_buffer = src_buffer;
        if ( k_node->n_extra_sources > 0){
            tmpsrc_buffer = recv_buffer;
        }

        rc = bcol_ptpcoll_allreduce_recursivek_scatter_reduce(ptpcoll_module,
                                                              buffer_index,
                                                              tmpsrc_buffer, recv_buffer, op, count + padding_count, dtype,
                                                              my_group_index,padded_start_byte);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }

        rc = bcol_ptpcoll_allreduce_knomial_allgather(ptpcoll_module,
                                                      buffer_index,
                                                      src_buffer, recv_buffer, count + padding_count, dtype,
                                                      my_group_index, padded_start_byte);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }

        rc = bcol_ptpcoll_allreduce_knomial_allgather_extra(ptpcoll_module,
                                                            buffer_index,
                                                            src_buffer, recv_buffer, count, dtype);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }
    } else if (EXTRA_NODE == k_node->node_type) {
        rc = bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(ptpcoll_module,
                                                                    buffer_index,
                                                                    src_buffer, recv_buffer, op, count, dtype);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }

        rc = bcol_ptpcoll_allreduce_knomial_allgather_extra(ptpcoll_module,
                                                            buffer_index,
                                                            src_buffer, recv_buffer, count, dtype);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }
    }

    return BCOL_FN_COMPLETE;
}

/*
 * Register allreduce functions to the BCOL function table,
 * so they can be selected
 *
 * Messages of 0..20000 use the n-ary recursive k-ing algorithm (with a
 * progress function); messages of 10000000..10485760 use the blocking
 * reduce-scatter/allgather algorithm, in its plain variant when pow_knum
 * equals group_size and in the extra-node variant otherwise.
 *
 * Always returns OMPI_SUCCESS.
 */
int bcol_ptpcoll_allreduce_init(mca_bcol_base_module_t *super)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) super;
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_ALLREDUCE;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;

    /* not an accurate attribute, none of these algorithms
     * are non-blocking */
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_ptpcoll_allreduce_narraying_init,
                                 bcol_ptpcoll_allreduce_narraying_progress);

    inv_attribs.bcol_msg_min = 10000000;
    inv_attribs.bcol_msg_max = 10485760; /* range 4 */

    if (ptpcoll_module->pow_knum == ptpcoll_module->group_size) {
        mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                     bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init,
                                     NULL);
    } else {
        mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                     bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init,
                                     NULL);
    }

    return OMPI_SUCCESS;
}