6740813c27
When we are only using local ranks, basesmuma needs to provide an allreduce function for both large and small messages, or else the coll/ml selection logic will fail. In the future this logic should probably be updated to disable just allreduce in coll/ml instead of disabling coll/ml entirely. For now it is correct to say that the basesmuma allreduce works for larger messages. cmr=v1.8:reviewer=manjugv This commit was SVN r31190.
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/constants.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"

#include "opal/include/opal_stdint.h"

#include "ompi/mca/bcol/base/base.h"
#include "bcol_basesmuma.h"

static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args);

int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_ALLREDUCE;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1048576;
    comm_attribs.data_src = DATA_SRC_KNOWN;

    /* Selection logic at the ml level specifies a request for a
     * non-blocking algorithm. However, following what was done at the p2p
     * level, we will specify non-blocking here as well. Beware: these
     * algorithms are blocking and will not make use of the progress
     * engine. */
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000;
    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

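    /* Note: per the commit message above, an allreduce function must be
     * registered for both the small- and the large-message ranges; when only
     * local ranks are in use, the coll/ml selection logic disqualifies the
     * whole component if either range is missing. */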
    /* Set attributes for fanin fanout algorithm */
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_allreduce_intra_fanin_fanout,
                                 bcol_basesmuma_allreduce_intra_fanin_fanout_progress);

    inv_attribs.bcol_msg_min = 10000000;
    inv_attribs.bcol_msg_max = 10485760; /* range 4 */
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_allreduce_intra_fanin_fanout,
                                 bcol_basesmuma_allreduce_intra_fanin_fanout_progress);

    /* Differs only in comm size */
    comm_attribs.data_src = DATA_SRC_UNKNOWN;
    comm_attribs.waiting_semantics = BLOCKING;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 8;

    /* Set attributes for recursive doubling algorithm */
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_allreduce_intra_recursive_doubling,
                                 NULL);

    return OMPI_SUCCESS;
}

/*
 * Small data fanin reduce
 * ML buffers are used for both payload and control structures
 * This function works with the hierarchical allreduce and the
 * progress engine
 */
static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, netpatterns_tree_node_t *my_reduction_node,
                                   int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype,
                                   volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift)
{
    volatile mca_bcol_basesmuma_header_t *child_ctl_pointer;
    int bcol_id = (int) bcol_module->super.bcol_id;
    int64_t sequence_number = my_ctl_pointer->sequence_number;
    int8_t ready_flag = my_ctl_pointer->ready_flag;
    int group_size = bcol_module->colls_no_user_data.size_of_group;

    if (LEAF_NODE != my_reduction_node->my_node_type) {
        volatile char *child_data_pointer;
        volatile void *child_rbuf;

        /* for each child: my_result_data = child_result_data (op) my_source_data */
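        /* *iteration records how many children have already been reduced,
         * so the progress engine can resume this loop where it left off
         * after a BCOL_FN_STARTED return. */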
        for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) {
            int child_rank = my_reduction_node->children_ranks[child] + process_shift;

            if (group_size <= child_rank) {
                child_rank -= group_size;
            }

            child_ctl_pointer = data_buffs[child_rank].ctl_struct;

            if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) {
                *iteration = child;
                return BCOL_FN_STARTED;
            }

            child_data_pointer = data_buffs[child_rank].payload;
            child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id];

            ompi_op_reduce(op, (void *)child_rbuf, (void *)rbuf, count, dtype);
        } /* end child loop */
    }

    if (ROOT_NODE != my_reduction_node->my_node_type) {
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag;
    }

    /* done with this step. move on to fan out */
    *iteration = -1;

    return BCOL_FN_COMPLETE;
}

static int allreduce_fanout (mca_bcol_basesmuma_module_t *bcol_module, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer,
                             volatile void *my_data_pointer, int process_shift, volatile mca_bcol_basesmuma_payload_t *data_buffs,
                             int sequence_number, int group_size, int rbuf_offset, size_t pack_len)
{
    volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
    int bcol_id = (int) bcol_module->super.bcol_id;
    int8_t ready_flag = my_ctl_pointer->ready_flag + 1;
    netpatterns_tree_node_t *my_fanout_read_tree;
    volatile void *parent_data_pointer;
    int my_fanout_parent, my_rank;
    void *parent_rbuf, *rbuf;

    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_rank]);

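    /* Non-root ranks wait for the reduced result to appear in the parent's
     * shared buffer (signaled by ready_flag + 1), then copy it out; the
     * root already holds the final result from the fan-in phase. */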
    if (ROOT_NODE != my_fanout_read_tree->my_node_type) {
        my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
        if (group_size <= my_fanout_parent) {
            my_fanout_parent -= group_size;
        }

        rbuf = (void *)((char *) my_data_pointer + rbuf_offset);

        /*
         * Get parent payload data and control data.
         * Get the pointer to the base address of the parent's payload buffer.
         * Get the parent's control buffer.
         */
        parent_data_pointer = data_buffs[my_fanout_parent].payload;
        parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;

        parent_rbuf = (void *) ((char *) parent_data_pointer + rbuf_offset);

        /* Wait until the parent signals that its data is ready. The order of
         * the conditions checked here is important; checking them in the
         * wrong order can result in a race condition. */
        if (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) {
            return BCOL_FN_STARTED;
        }

        assert (parent_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] == ready_flag);

        /* Copy the result from the parent's shared buffer into a buffer
         * writable by the current rank */
        memcpy ((void *) rbuf, (const void*) parent_rbuf, pack_len);
    }

    if (LEAF_NODE != my_fanout_read_tree->my_node_type) {
        opal_atomic_wmb ();

        /* Signal to children that they may read the data from my shared buffer (bump the ready flag) */
        my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag;
    }

    my_ctl_pointer->starting_flag_value[bcol_id] += 1;

    return BCOL_FN_COMPLETE;
}

static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args)
{
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    int buff_idx = input_args->src_desc->buffer_index;
    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;
    void *data_addr = (void *) input_args->src_desc->data_addr;
    int my_node_index, my_rank, group_size, leading_dim, idx;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    int64_t sequence_number = input_args->sequence_num;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    struct ompi_datatype_t *dtype = input_args->dtype;
    netpatterns_tree_node_t *my_reduction_node;
    struct ompi_op_t *op = input_args->op;
    volatile void *my_data_pointer;
    int count = input_args->count;
    int rc, process_shift;
    ptrdiff_t lb, extent;
    volatile void *rbuf;

    /* get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    /* Align node index to around sbgp root */
    process_shift = input_args->root;
    my_node_index = my_rank - input_args->root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx;
    /* Get control structure and payload buffer */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    my_data_pointer = (volatile char *) data_addr;
    rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset);

    /***************************
     * Fan into root phase
     ***************************/

    my_reduction_node = &(bcol_module->reduction_tree[my_node_index]);
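    /* *iteration == -1 means the fan-in phase already completed on a
     * previous progress call; only re-enter reduce_children otherwise. */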
    if (-1 != *iteration) {
        rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer,
                              dtype, data_buffs, count, op, process_shift);
        if (BCOL_FN_COMPLETE != rc) {
            return rc;
        }
    }

    /* there might be a non-contiguous dtype - so compute the length with get_extent */
    ompi_datatype_get_extent(dtype, &lb, &extent);

    /***************************
     * Fan out from root
     ***************************/

    /* all nodes will have the result after fanout */
    input_args->result_in_rbuf = true;

    /* Signal that you are ready for fanout phase */
    return allreduce_fanout (bcol_module, my_ctl_pointer, my_data_pointer, process_shift, data_buffs,
                             sequence_number, group_size, input_args->rbuf_offset, count * (size_t) extent);
}

/**
 * Shared memory blocking allreduce.
 */
int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    int buff_idx = input_args->src_desc->buffer_index;
    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;
    void *data_addr = (void *) input_args->src_desc->data_addr;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int bcol_id = (int) bcol_module->super.bcol_id;
    int rc, my_rank, leading_dim, idx;
    volatile void *my_data_pointer;
    volatile void *sbuf, *rbuf;
    int8_t ready_flag;

    /* get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx;
    /* Get control structure */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    my_data_pointer = (volatile char *) data_addr;
    rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset);
    sbuf = (volatile void *)((char *) my_data_pointer + input_args->sbuf_offset);

    /* Setup resource recycling */
    /* Set for multiple instances of bcols */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, input_args->sequence_num, bcol_id);

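    /* Seed the receive buffer with the local contribution so the fan-in
     * phase can reduce children's data directly into rbuf. */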
    if (sbuf != rbuf) {
        rc = ompi_datatype_copy_content_same_ddt (dtype, input_args->count, (char *)rbuf,
                                                  (char *)sbuf);
        if (0 != rc) {
            return OMPI_ERROR;
        }
    }

    *iteration = 0;
    my_ctl_pointer->ready_flag = ready_flag;

    return bcol_basesmuma_allreduce_intra_fanin_fanout_progress (input_args, c_input_args);
}

/* this function uses the old bcol private control structures */
int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args,
                                                      mca_bcol_base_function_t *c_input_args)
{
    int my_rank, group_size, my_node_index;
    int pair_rank, exchange, extra_rank, payload_len;
    size_t dt_size;
    int read_offset, write_offset;
    volatile void *my_data_pointer;
    volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL,
        *partner_ctl_pointer = NULL,
        *extra_ctl_pointer = NULL;
    volatile void *my_read_pointer, *my_write_pointer, *partner_read_pointer,
        *extra_rank_readwrite_data_pointer, *extra_rank_read_data_pointer;
    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;

    int8_t ready_flag;
    int sbuf_offset, rbuf_offset, flag_offset;
    int root, count;
    struct ompi_op_t *op;
    int64_t sequence_number = input_args->sequence_num;
    struct ompi_datatype_t *dtype;
    int first_instance = 0;
    int leading_dim, idx;
    int buff_idx;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    netpatterns_pair_exchange_node_t *my_exchange_node;

    /*
     * Get addressing information
     */
    buff_idx = input_args->src_desc->buffer_index;

    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    /*
     * Get SM control structures and payload buffers
     */
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs + idx;
    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /*
     * Get control structure and payload buffer
     */
    my_ctl_pointer = ctl_structs[my_rank];
    if (my_ctl_pointer->sequence_number < sequence_number) {
        first_instance = 1;
    }
    my_data_pointer = data_buffs[my_rank].payload;

    /*
     * Align node index to around sbgp root
     */
    root = input_args->root;
    my_node_index = my_rank - root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    /*
     * Get data from arguments
     */
    sbuf_offset = input_args->sbuf_offset;
    rbuf_offset = input_args->rbuf_offset;
    op = input_args->op;
    count = input_args->count;
    dtype = input_args->dtype;

    /*
     * Get my node for the reduction tree
     */
    my_exchange_node = &(bcol_module->recursive_doubling_tree);

    if (first_instance) {
        my_ctl_pointer->index = 1;
        my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
        my_ctl_pointer->flag = -1;
        /*
        for (i = 0; i < NUM_SIGNAL_FLAGS; i++) {
            my_ctl_pointer->flags[ALLREDUCE_FLAG] = -1;
        }
        */
    } else {
        my_ctl_pointer->index++;
        flag_offset = my_ctl_pointer->starting_flag_value;
    }

    /* signal that I have arrived */
    /* opal_atomic_wmb (); */
    my_ctl_pointer->sequence_number = sequence_number;

    /* If this buffer is used more than once by an sm module in a given
     * collective, we will need to distinguish between instances, so we
     * pick up the right data. */
    ready_flag = flag_offset + sequence_number + 1;

    /*
     * Set up pointers for use during the recursive doubling phase
     */
    read_offset = sbuf_offset;
    write_offset = rbuf_offset;
    my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset);
    my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset);

    /*
     * When there is a non-power-of-2 number of nodes, the extra nodes' data
     * is copied and reduced by partner exchange nodes.
     * Extra nodes: nodes with rank greater than the nearest power of 2.
     * Exchange nodes: nodes with rank less than the nearest power of 2 that
     * partner with extra nodes during the reduction.
     */
    if (0 < my_exchange_node->n_extra_sources) {
        /*
         * Signal extra node that data is ready
         */
        opal_atomic_wmb ();

        my_ctl_pointer->flag = ready_flag;

        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            extra_rank = my_exchange_node->rank_extra_source;
            extra_ctl_pointer = ctl_structs[extra_rank];
            extra_rank_readwrite_data_pointer = (void *) ((char *) data_buffs[extra_rank].payload +
                                                          read_offset);

            /*
             * Wait for data to get ready
             */
            while (!((sequence_number == extra_ctl_pointer->sequence_number) &&
                     (extra_ctl_pointer->flag >= ready_flag))) {
            }

            ompi_op_reduce(op, (void *)extra_rank_readwrite_data_pointer,
                           (void *)my_read_pointer, count, dtype);
        }
    }

    /* --Exchange node that reduces with an extra node--: signal to the extra
     *   node that its data has been read.
     * --Exchange node that doesn't reduce data with an extra node--: this
     *   assignment is used so it can sync with other nodes during the
     *   exchange phase.
     * --Extra node--: it can pass to the next phase.
     */
    ready_flag++;
    /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
    my_ctl_pointer->flag = ready_flag;

    /*
     * Exchange data with all the nodes that are less than max_power_2
     */
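    /* Each of the n_exchanges rounds reduces my read buffer and the
     * partner's read buffer into my write buffer, then swaps the read and
     * write offsets so the next round consumes this round's result. */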
    for (exchange = 0 ; exchange < my_exchange_node->n_exchanges ; exchange++) {
        int tmp = 0;

        /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
        my_ctl_pointer->flag = ready_flag;
        pair_rank = my_exchange_node->rank_exchanges[exchange];
        partner_ctl_pointer = ctl_structs[pair_rank];
        partner_read_pointer = (volatile void *) ((char *)data_buffs[pair_rank].payload + read_offset);

        my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset);
        my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset);

        /*
         * Wait for partner to be ready, so we can read
         */
        /*
        JSL ---- FIX ME !!!!! MAKE ME COMPLIANT WITH NEW BUFFERS
        while (!IS_ALLREDUCE_PEER_READY(partner_ctl_pointer,
                                        ready_flag, sequence_number)) {
        }
        */

        /*
         * Perform reduction operation
         */
        ompi_3buff_op_reduce(op, (void *)my_read_pointer, (void *)partner_read_pointer,
                             (void *)my_write_pointer, count, dtype);

        /*
         * Signal that I am done reading my partner's data
         */
        ready_flag++;
        /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
        my_ctl_pointer->flag = ready_flag;

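        /* Wait until the partner has also finished reading, so its buffer
         * can safely be reused in the next round. */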
        while (ready_flag > partner_ctl_pointer->flag) {
            opal_progress();
        }

        /*
         * Swap read and write offsets
         */
        tmp = read_offset;
        read_offset = write_offset;
        write_offset = tmp;
    }

    /*
     * Copy data in from the "extra" source, if need be
     */
    if (0 < my_exchange_node->n_extra_sources) {

        if (EXTRA_NODE == my_exchange_node->node_type) {
            int extra_rank_read_offset = -1, my_write_offset = -1;

            /* Offset the ready flag to sync with the exchange node, which
             * might be going through exchange phases, unlike the extra node
             */
            ready_flag = ready_flag + my_exchange_node->log_2;

            if (my_exchange_node->log_2 % 2) {
                extra_rank_read_offset = rbuf_offset;
                my_write_offset = rbuf_offset;
            } else {
                extra_rank_read_offset = sbuf_offset;
                my_write_offset = sbuf_offset;
            }

            my_write_pointer = (volatile void*)((char *)my_data_pointer + my_write_offset);
            extra_rank = my_exchange_node->rank_extra_source;
            extra_ctl_pointer = ctl_structs[extra_rank];

            extra_rank_read_data_pointer = (volatile void *) ((char *)data_buffs[extra_rank].payload +
                                                              extra_rank_read_offset);

            /*
             * Wait for the exchange node to be ready
             */
            ompi_datatype_type_size(dtype, &dt_size);
            payload_len = count * dt_size;
#if 0
            fix me JSL !!!!!
            while (!IS_DATA_READY(extra_ctl_pointer, ready_flag, sequence_number)) {
            }
#endif
            memcpy((void *)my_write_pointer, (const void *)
                   extra_rank_read_data_pointer, payload_len);

            ready_flag++;
            /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
            my_ctl_pointer->flag = ready_flag;

        } else {

            /*
             * Signal parent that data is ready
             */
            opal_atomic_wmb ();
            /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
            my_ctl_pointer->flag = ready_flag;

            /* wait until the child is done to move on - this buffer will
             * be reused for the next stripe, so we don't want to move
             * on too quickly.
             */
            extra_rank = my_exchange_node->rank_extra_source;
            extra_ctl_pointer = ctl_structs[extra_rank];
        }
    }

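    /* An even number of exchange rounds leaves the final result in the send
     * buffer, an odd number leaves it in the receive buffer; the low bit of
     * log_2 records which. */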
    input_args->result_in_rbuf = my_exchange_node->log_2 & 1;

    my_ctl_pointer->starting_flag_value += 1;

    return BCOL_FN_COMPLETE;
}