b89f8fabc9
The project includes following components and frameworks: - ML Collective component - NETPATTERNS and COMMPATTERNS common components - BCOL framework - SBGP framework Note: By default the ML collective component is disabled. In order to enable new collectives user should bump up the priority of ml component (coll_ml_priority) ============================================= Primary Contributors (in alphabetical order): Ishai Rabinovich (Mellanox) Joshua S. Ladd (ORNL / Mellanox) Manjunath Gorentla Venkata (ORNL) Mike Dubman (Mellanox) Noam Bloch (Mellanox) Pavel (Pasha) Shamis (ORNL / Mellanox) Richard Graham (ORNL / Mellanox) Vasily Filipov (Mellanox) This commit was SVN r27078.
908 строки
31 KiB
C
908 строки
31 KiB
C
/*
|
|
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
|
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
#include "ompi/constants.h"
|
|
#include "ompi/datatype/ompi_datatype.h"
|
|
#include "ompi/communicator/communicator.h"
|
|
|
|
#include "bcol_basesmuma_utils.h"
|
|
#include "bcol_basesmuma.h"
|
|
|
|
/* debug
|
|
* #include "opal/sys/timer.h"
|
|
*
|
|
* extern uint64_t timers[7];
|
|
* end debug */
|
|
|
|
/* debug */
|
|
#include <unistd.h>
|
|
/* end debug */
|
|
|
|
/* includes shared memory optimization */
|
|
|
|
/*
 * BCOL_BASESMUMA_SM_PROBE - scan a list of candidate senders once.
 *
 * Walks the first n_src entries of src_list.  For each entry the caller's
 * parent_ctl_pointer / parent_data_pointer are pointed at that peer's control
 * struct and payload; the scan stops at the first peer for which
 * IS_DATA_READY() holds, storing the peer in `src` and setting `matched` to 1.
 * If no peer is ready, `src` and `matched` are left as they were, and the
 * parent pointers refer to the last entry examined.
 *
 * The macro relies on these names being in scope at the expansion site:
 * data_buffs, parent_ctl_pointer, parent_data_pointer, ready_flag and
 * sequence_number.  The my_index parameter is not used by the expansion.
 */
#define BCOL_BASESMUMA_SM_PROBE(src_list, n_src, my_index, matched, src)              \
    do {                                                                              \
        int probe_pos_;                                                               \
        for (probe_pos_ = 0; probe_pos_ < n_src; ++probe_pos_) {                      \
            /* look at this candidate's control struct and payload */                 \
            parent_ctl_pointer  = data_buffs[src_list[probe_pos_]].ctl_struct;        \
            parent_data_pointer = data_buffs[src_list[probe_pos_]].payload;           \
            if (IS_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) {     \
                /* first ready peer wins */                                           \
                src     = src_list[probe_pos_];                                       \
                matched = 1;                                                          \
                break;                                                                \
            }                                                                         \
        }                                                                             \
    } while (0)
|
|
|
|
/*
|
|
#define IS_LARGE_DATA_READY(peer, my_flag, my_sequence_number) \
|
|
(((peer)->sequence_number == (my_sequence_number) && \
|
|
(peer)->flags[BCAST_FLAG] >= (my_flag) \
|
|
)? true : false )
|
|
*/
|
|
|
|
/*
|
|
#define IS_KNOWN_ROOT_DATA_READY(peer, my_flag, my_sequence_number) \
|
|
(((peer)->sequence_number == (my_sequence_number) && \
|
|
(peer)->flags[BCAST_FLAG][bcol_id] >= (my_flag) \
|
|
)? true : false )
|
|
*/
|
|
|
|
/*
 * BCOL_BASESMUMA_SM_LARGE_MSG_PROBE - scan candidate senders for a large
 * message, skipping entries that have been marked invalid (-1).
 *
 * For every valid entry among the first n_src of src_list, the caller's
 * parent_ctl_pointer is taken from ctl_structs[] and parent_data_pointer is
 * set to that peer's data_buffs[].ctl_struct address.  The scan stops at the
 * first peer for which IS_PEER_READY() holds: `src` receives the peer,
 * `matched` becomes 1 and the caller's `index` receives the position of the
 * hit inside src_list.
 *
 * Names expected in scope at the expansion site: ctl_structs, data_buffs,
 * parent_ctl_pointer, parent_data_pointer, ready_flag, sequence_number and
 * index.  The my_index parameter is not used by the expansion.
 */
#define BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(src_list, n_src, my_index, matched, src, flag_index, bcol_id) \
    do {                                                                                   \
        int probe_pos_;                                                                    \
        for (probe_pos_ = 0; probe_pos_ < n_src; ++probe_pos_) {                           \
            if (-1 == src_list[probe_pos_]) {                                              \
                /* slot disabled (no such peer, or earlier false positive) */              \
                continue;                                                                  \
            }                                                                              \
            parent_ctl_pointer  = ctl_structs[src_list[probe_pos_]];                       \
            parent_data_pointer = (void *) data_buffs[src_list[probe_pos_]].ctl_struct;    \
            if (IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number,             \
                              flag_index, bcol_id)) {                                      \
                src     = src_list[probe_pos_];                                            \
                matched = 1;                                                               \
                index   = probe_pos_;                                                      \
                break;                                                                     \
            }                                                                              \
        }                                                                                  \
    } while (0)
|
|
|
|
/*
 * K_NOMIAL_DATA_SRC - compute the parent of a rank in a k-nomial tree.
 *
 * radix           tree radix; must be >= 2 (a radix of 1 never terminates,
 *                 a radix of 0 divides by zero)
 * my_group_index  index of the calling rank inside the group
 * group_size      number of ranks in the group
 * group_root      index of the broadcast root inside the group
 * data_src        (out, lvalue) index of the rank to receive from
 * radix_mask      (out, lvalue) radix power at which the parent was found
 *
 * Ranks are first rotated so that the root has relative rank 0.  The mask is
 * grown by powers of `radix` until the relative rank is no longer a multiple
 * of radix * mask; the parent is the relative rank rounded down to that
 * multiple, rotated back and wrapped into [0, group_size).
 *
 * When the caller is the root itself (relative rank 0) no parent exists:
 * data_src is left untouched and radix_mask ends up >= group_size.
 *
 * Every parameter is parenthesised so that expression arguments expand with
 * the intended precedence.  Arguments are still evaluated more than once, so
 * they must be free of side effects.
 */
#define K_NOMIAL_DATA_SRC(radix, my_group_index, group_size, group_root, data_src, radix_mask) \
    do {                                                                                  \
        /* distinctive name: must not collide with a caller's relative_rank */            \
        int knds_rel_rank_ = ((my_group_index) >= (group_root))                           \
                                 ? (my_group_index) - (group_root)                        \
                                 : (my_group_index) - (group_root) + (group_size);        \
        (radix_mask) = 1;                                                                 \
        while ((radix_mask) < (group_size)) {                                             \
            if (knds_rel_rank_ % ((radix) * (radix_mask))) {                              \
                (data_src) = knds_rel_rank_ / ((radix) * (radix_mask)) *                  \
                                 ((radix) * (radix_mask)) + (group_root);                 \
                if ((data_src) >= (group_size)) {                                         \
                    (data_src) -= (group_size);                                           \
                }                                                                         \
                break;                                                                    \
            }                                                                             \
            (radix_mask) *= (radix);                                                      \
        }                                                                                 \
    } while (0)
|
|
|
|
int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args)
|
|
{
|
|
/* local variables */
|
|
mca_bcol_basesmuma_module_t* bcol_module=
|
|
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
|
|
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
|
|
int i, matched = 0;
|
|
int group_size;
|
|
int my_rank;
|
|
int leading_dim,
|
|
buff_idx,
|
|
idx;
|
|
int count = input_args->count;
|
|
struct ompi_datatype_t* dtype = input_args->dtype;
|
|
int64_t sequence_number = input_args->sequence_num;
|
|
int radix =
|
|
mca_bcol_basesmuma_component.k_nomial_radix;
|
|
int radix_mask;
|
|
int16_t data_src = -1;
|
|
|
|
volatile int8_t ready_flag;
|
|
int bcol_id = (int) bcol_module->super.bcol_id;
|
|
volatile mca_bcol_basesmuma_payload_t *data_buffs;
|
|
volatile char* parent_data_pointer;
|
|
volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
|
|
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
|
|
|
|
size_t pack_len = 0,
|
|
dt_size;
|
|
void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr +
|
|
input_args->sbuf_offset);
|
|
|
|
#if 0
|
|
fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
|
|
fflush(stderr);
|
|
#endif
|
|
|
|
|
|
/* we will work only on packed data - so compute the length*/
|
|
BASESMUMA_VERBOSE(3, ("Calling bcol_basesmuma_bcast_k_nomial_knownroot"));
|
|
ompi_datatype_type_size(dtype, &dt_size);
|
|
pack_len = count * dt_size;
|
|
/* Some hierarchical algorithms have data that is accumulated at each step
|
|
* this factor accounts for this
|
|
*/
|
|
pack_len = pack_len*input_args->hier_factor;
|
|
buff_idx = input_args->buffer_index;
|
|
|
|
/* Get addressing information */
|
|
my_rank = bcol_module->super.sbgp_partner_module->my_index;
|
|
group_size = bcol_module->colls_no_user_data.size_of_group;
|
|
leading_dim = bcol_module->colls_no_user_data.size_of_group;
|
|
idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0);
|
|
data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
|
|
bcol_module->colls_with_user_data.data_buffs + idx;
|
|
|
|
/* Set pointer to current proc ctrl region */
|
|
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
|
|
|
|
/* setup resource recycling */
|
|
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
|
|
/* removing dependence on sequence number */
|
|
/* I believe this is resolved now with the signaling flags */
|
|
/*
|
|
ready_temp = 1 + (int8_t) flag_offset + (int8_t) bcol_id;
|
|
if( ready_temp >= my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) {
|
|
ready_flag = ready_temp;
|
|
} else {
|
|
ready_flag = my_ctl_pointer->flags[BCAST_FLAG][bcol_id];
|
|
}
|
|
MB();
|
|
my_ctl_pointer->sequence_number = sequence_number;
|
|
*/
|
|
|
|
|
|
/* non-blocking broadcast algorithm */
|
|
|
|
/* If I am the root, then signal ready flag */
|
|
if(input_args->root_flag) {
|
|
BASESMUMA_VERBOSE(10,("I am the root of the data"));
|
|
/*
|
|
* signal ready flag
|
|
*/
|
|
MB();
|
|
my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
|
|
|
|
/* root is finished */
|
|
goto Release;
|
|
}
|
|
|
|
|
|
/* Calculate source of the data */
|
|
K_NOMIAL_DATA_SRC(radix, my_rank, group_size,
|
|
input_args->root_route->rank, data_src, radix_mask);
|
|
|
|
|
|
parent_ctl_pointer = data_buffs[data_src].ctl_struct;
|
|
parent_data_pointer = data_buffs[data_src].payload;
|
|
|
|
for( i = 0; i < cs->num_to_probe && 0 == matched; i++) {
|
|
|
|
if(IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, BCAST_FLAG, bcol_id)) {
|
|
matched = 1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* If not matched, then hop out and put me on progress list */
|
|
if(0 == matched ) {
|
|
BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match"));
|
|
return BCOL_FN_NOT_STARTED;
|
|
}
|
|
|
|
/* else, we found our root within the group ... */
|
|
BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", data_src));
|
|
|
|
/* copy the data */
|
|
memcpy(data_addr, (void *) parent_data_pointer, pack_len);
|
|
/* set the memory barrier to ensure completion */
|
|
MB();
|
|
/* signal that I am done */
|
|
my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
|
|
|
|
|
|
Release:
|
|
my_ctl_pointer->starting_flag_value[bcol_id]++;
|
|
return BCOL_FN_COMPLETE;
|
|
}
|
|
|
|
|
|
/**
|
|
* Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers.
|
|
* This routine assumes that buf (the input buffer) is a single writer
|
|
* multi reader (SWMR) shared memory buffer owned by the calling rank
|
|
* which is the only rank that can write to this buffers.
|
|
* It is also assumed that the buffers are registered and fragmented
|
|
* at the ML level and that buf is sufficiently large to hold the data.
|
|
*
|
|
*
|
|
* @param buf - SWMR shared buffer within a sbgp that the
|
|
* executing rank can write to.
|
|
* @param count - the number of elements in the shared buffer.
|
|
* @param dtype - the datatype of a shared buffer element.
|
|
* @param root - the index within the sbgp of the root.
|
|
* @param module - basesmuma module.
|
|
*/
|
|
int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args)
|
|
{
|
|
/* local variables */
|
|
mca_bcol_basesmuma_module_t* bcol_module=
|
|
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
|
|
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
|
|
int i;
|
|
int group_size;
|
|
int my_rank;
|
|
int leading_dim, buff_idx, idx;
|
|
int count=input_args->count;
|
|
struct ompi_datatype_t* dtype=input_args->dtype;
|
|
int64_t sequence_number=input_args->sequence_num;
|
|
|
|
|
|
int pow_k_levels;
|
|
int radix = cs->k_nomial_radix;
|
|
int radix_mask;
|
|
int relative_rank;
|
|
int pow_k_group_size;
|
|
|
|
volatile int8_t ready_flag;
|
|
int bcol_id = (int) bcol_module->super.bcol_id;
|
|
volatile mca_bcol_basesmuma_payload_t *data_buffs;
|
|
volatile void* parent_data_pointer;
|
|
volatile void* my_data_pointer;
|
|
|
|
|
|
volatile mca_bcol_basesmuma_header_t *child_ctl_pointer;
|
|
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
|
|
|
|
size_t pack_len = 0, dt_size;
|
|
void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr +
|
|
input_args->sbuf_offset);
|
|
|
|
#if 0
|
|
fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
|
|
fflush(stderr);
|
|
#endif
|
|
|
|
|
|
|
|
/* we will work only on packed data - so compute the length*/
|
|
ompi_datatype_type_size(dtype, &dt_size);
|
|
pack_len=count*dt_size;
|
|
|
|
buff_idx = input_args->buffer_index;
|
|
|
|
/* Get addressing information */
|
|
my_rank = bcol_module->super.sbgp_partner_module->my_index;
|
|
group_size = bcol_module->colls_no_user_data.size_of_group;
|
|
leading_dim=bcol_module->colls_no_user_data.size_of_group;
|
|
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
|
|
|
|
/* get pow_k_levels and pow_k_group_size */
|
|
pow_k_levels = bcol_module->pow_k_levels;
|
|
pow_k_group_size = bcol_module->pow_k;
|
|
|
|
|
|
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
|
|
bcol_module->colls_with_user_data.data_buffs+idx;
|
|
|
|
my_data_pointer = data_buffs[my_rank].payload;
|
|
|
|
/* Set pointer to current proc ctrl region */
|
|
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
|
|
|
|
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
|
|
|
|
/* non-blocking broadcast algorithm */
|
|
|
|
/* If I am the root, then signal ready flag */
|
|
if(input_args->root_flag) {
|
|
|
|
BASESMUMA_VERBOSE(10,("I am the root of the data"));
|
|
/*
|
|
* set the radix_mask */
|
|
radix_mask = pow_k_group_size;
|
|
/* send to children */
|
|
MB();
|
|
BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,
|
|
radix,0,
|
|
my_rank,group_size, ready_flag);
|
|
/* root is finished */
|
|
goto Release;
|
|
}
|
|
|
|
/* If I am not the root, then poll on possible "senders'" control structs */
|
|
for( i = 0; i < cs->num_to_probe; i++) {
|
|
|
|
if( ready_flag == my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) {
|
|
|
|
/* else, we found our root within the group ... */
|
|
parent_data_pointer = data_buffs[my_ctl_pointer->src].payload;
|
|
BASESMUMA_VERBOSE(5,("%d found it from %d \n",my_rank,my_ctl_pointer->src));
|
|
/* memcopy the data */
|
|
memcpy(data_addr, (void *) parent_data_pointer, pack_len);
|
|
/* compute my relative rank */
|
|
relative_rank = (my_rank - my_ctl_pointer->src) < 0 ? my_rank -
|
|
my_ctl_pointer->src + group_size : my_rank - my_ctl_pointer->src;
|
|
|
|
/* compute my radix mask */
|
|
radix_mask = 1;
|
|
while(radix_mask < group_size ){
|
|
if( 0 != relative_rank % (radix*radix_mask)) {
|
|
/* found it */
|
|
break;
|
|
}
|
|
radix_mask *= radix;
|
|
}
|
|
/* go one step back */
|
|
radix_mask /= radix;
|
|
|
|
/* send to children */
|
|
MB();
|
|
BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,
|
|
radix, relative_rank,
|
|
my_rank, group_size, ready_flag);
|
|
/* bail */
|
|
|
|
goto Release;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If not matched, then hop out and put me on progress list */
|
|
BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match"));
|
|
/*fprintf(stderr,"bcol_id %d Not started\n",bcol_id);*/
|
|
return BCOL_FN_NOT_STARTED;
|
|
|
|
|
|
|
|
Release:
|
|
|
|
|
|
my_ctl_pointer->starting_flag_value[bcol_id]++;
|
|
|
|
return BCOL_FN_COMPLETE;
|
|
}
|
|
|
|
|
|
/* non-blocking binary scatter allgather anyroot algorithm for large data
|
|
* broadcast
|
|
*/
|
|
|
|
|
|
#if 0
/*
 * Prototype code for a shared memory scatter/allgather broadcast of large
 * messages.  The signaling scheme works and is meant as a reference for
 * other shared memory scatter/allgather algorithms.  The block is compiled
 * out.
 *
 * The function is re-entrant: my_ctl_pointer->status records the phase
 * (SCATTER, ALLGATHER, EXTRA_RANK) so that a later call resumes where the
 * previous one gave up.
 *
 * NOTE(review): before enabling this code, check that
 *  - IS_LARGE_DATA_READY is defined somewhere; in this file it only appears
 *    inside a comment;
 *  - the BCOL_BASESMUMA_SM_LARGE_MSG_PROBE call below is updated: it passes
 *    five arguments while the macro in this file takes seven.
 */
int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args,
                                                    coll_ml_function_t *c_input_args)
{
    /* local variables */
    int i, j;
    int length;
    int start;
    int my_rank, parent_rank;
    int partner;
    int src = -1;
    int matched = 0;
    int group_size;
    int first_instance = 0;
    int leading_dim, buff_idx, idx;
    int64_t sequence_number = input_args->sequence_num;

    int64_t ready_flag;
    int64_t local_offset;

    int flag_offset;
    int pow_2, pow_2_levels;
    int index = -1;

    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;

    /* The old control structs are used for large messages; otherwise the
     * shared memory optimization would be destroyed. */
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
    mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer;  /* binomial fanout */
    mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer; /* recursive doubling */

    /* for now, the payload buffer is used for a single fragment */
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile void *parent_data_pointer;  /* binomial scatter */
    volatile void *partner_data_pointer; /* recursive doubling */

    uint32_t fragment_size; /* ml buffer size for now */

    /* The entire buffer is transferred, so start at the base address of
     * the ml buffer. */
    void *data_addr = (void *) ((unsigned char *) input_args->src_desc->base_data_addr);

    buff_idx = input_args->src_desc->buffer_index;

    group_size  = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;

    /* largest power of two that is smaller than or equal to the group size */
    pow_2_levels = bcol_module->pow_2_levels;
    pow_2        = bcol_module->pow_2;

    /* the fragment is still just the entire buffer */
    fragment_size = input_args->buffer_size;
    idx     = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
    my_rank = bcol_module->super.sbgp_partner_module->my_index;

    /* control structs and data buffers for this ml buffer */
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs + idx;
    data_buffs = (mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    my_ctl_pointer = ctl_structs[my_rank];

    /* First call for this sequence number?  Then reset my header. */
    first_instance = (my_ctl_pointer->sequence_number < sequence_number) ? 1 : 0;

    if (first_instance) {
        my_ctl_pointer->flag = -1;
        my_ctl_pointer->index = 1;
        my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
    } else {
        my_ctl_pointer->index++;
    }

    /* base value of the flag for this call */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag  = flag_offset + sequence_number + 1;

    my_ctl_pointer->sequence_number = sequence_number;

    /* am I the root */
    if (input_args->root_flag) {
        /* re-entry: already announced, continue with the allgather */
        if (ALLGATHER == my_ctl_pointer->status) {
            goto Allgather;
        }
        BASESMUMA_VERBOSE(10,("I am the root of the data"));

        /* describe what I hold: whole fragment, at offset 0, and how many
         * children I have */
        my_ctl_pointer->offset  = 0;
        my_ctl_pointer->n_sends = pow_2_levels;
        my_ctl_pointer->length  = fragment_size;

        /* these must be visible before my children see the flag raised */
        MB();
        my_ctl_pointer->flag = ready_flag;

        /* A root outside the power-of-two group is done and releases. */
        if (my_rank >= pow_2) {
            goto Release;
        }

        /* A root inside the power-of-two group joins the allgather. */
        my_ctl_pointer->status = ALLGATHER;
        goto Allgather;
    }

    /* what phase am I participating in */
    switch (my_ctl_pointer->status) {
    case SCATTER:
        goto Scatter;

    case ALLGATHER:
        goto Allgather;

    case EXTRA_RANK:
    default:
        /* continue with the extra-rank check below */
        break;
    }

Extra:
    /* am I part of the non-power-of-2 group */
    if (my_rank >= pow_2) {
        /* find parent to copy from */
        parent_rank = my_rank & (pow_2 - 1);
        parent_ctl_pointer = ctl_structs[parent_rank];
        /* start at the base */
        parent_data_pointer = (void *) data_buffs[parent_rank].ctl_struct;

        /* Compute the flag value everyone else reaches once they have
         * completed the algorithm; that is the value to poll on. */
        ready_flag = ready_flag + pow_2_levels;

        /* start to poll */
        for (i = 0; i < cs->num_to_probe; i++) {
            if (IS_LARGE_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) {
                /* copy the data and bail */
                memcpy(data_addr, (void *) parent_data_pointer, fragment_size);
                goto Release;
            }
        }
        my_ctl_pointer->status = EXTRA_RANK;

        /* hop out and put me onto a progress queue */
        return BCOL_FN_NOT_STARTED;
    }

Scatter:

    /* on first entry, compute the list of possible sources */
    if (NULL == my_ctl_pointer->src_ptr) {
        my_ctl_pointer->src_ptr = (int *) malloc(sizeof(int) * (pow_2_levels + 1));

        /* one recursive-doubling partner per level ... */
        for (i = 0; i < pow_2_levels; i++) {
            my_ctl_pointer->src_ptr[i] = my_rank ^ (1 << i);
        }
        /* ... plus one trailing slot (i is one past the last level here)
         * for the extra rank paired with me, if there is one */
        if ((my_rank + pow_2) < group_size) {
            my_ctl_pointer->src_ptr[i] = my_rank + pow_2;
        } else {
            /* no extra rank to worry about */
            my_ctl_pointer->src_ptr[i] = -1;
        }
    }

    /* I am not the root: poll on possible senders' control structs */
    for (i = 0; i < cs->num_to_probe && 0 == matched; i++) {
        /* Shared memory iprobe */
        BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(my_ctl_pointer->src_ptr, pow_2_levels+1,
                                          my_rank, matched, src);
    }

    if (0 == matched) {
        /* not matched: hop out and put me on the progress list */
        BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match"));

        my_ctl_pointer->status = SCATTER;
        return BCOL_FN_NOT_STARTED;

    } else if (src >= pow_2) {
        /* matched from an extra rank: get the whole message from it */
        memcpy((void *) data_addr, (void *) parent_data_pointer,
               parent_ctl_pointer->length);

        /* now I am the pseudo-root in the power-of-two group */
        my_ctl_pointer->offset  = 0;
        my_ctl_pointer->length  = parent_ctl_pointer->length;
        my_ctl_pointer->n_sends = parent_ctl_pointer->n_sends;

        /* set the memory barrier, then fire the ready flag */
        MB();
        my_ctl_pointer->flag   = ready_flag;
        my_ctl_pointer->status = ALLGATHER;

        goto Allgather;
    }

    /* we need to see whether this is really who we are looking for */
    for (i = 0; i < parent_ctl_pointer->n_sends; i++) {
        if (my_rank == (src ^ (1 << i))) {
            /* we found our root within the group ... */
            BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", src));

            /* this is who I've been looking for */
            my_ctl_pointer->n_sends = i;

            if (i > 0) {
                /* compute the size of the chunk to copy */
                length = (parent_ctl_pointer->length) /
                    (1 << (parent_ctl_pointer->n_sends - my_ctl_pointer->n_sends));
                my_ctl_pointer->length = length;
                my_ctl_pointer->offset = parent_ctl_pointer->offset + length;

                /* now we can copy the data */
                memcpy((void *) ((uint64_t) data_addr + my_ctl_pointer->offset),
                       (void *) ((uint64_t) parent_data_pointer +
                                 (uint64_t) parent_ctl_pointer->offset +
                                 (uint64_t) length),
                       (size_t) length);
            } else {
                /* this "trick" takes care of the first level of
                 * recursive doubling */
                length = parent_ctl_pointer->length /
                    (1 << (parent_ctl_pointer->n_sends - 1));
                my_ctl_pointer->length = length;
                my_ctl_pointer->offset = parent_ctl_pointer->offset;

                /* now we can copy the data */
                memcpy((void *) ((uint64_t) data_addr + my_ctl_pointer->offset),
                       (void *) ((uint64_t) parent_data_pointer +
                                 (uint64_t) my_ctl_pointer->offset),
                       (size_t) length);
            }

            /* memory barrier to ensure completion, then signal I am done */
            MB();
            my_ctl_pointer->flag = ready_flag;
            my_ctl_pointer->status = ALLGATHER;

            /* time for the allgather phase */
            goto Allgather;
        }
    }

    /* This is not who we are looking for: mark it as a false positive so
     * we don't poll there again, then go back onto the progress list. */
    my_ctl_pointer->src_ptr[index] = -1;
    my_ctl_pointer->status = SCATTER;
    return BCOL_FN_NOT_STARTED;

Allgather:

    /* zip it back up - the first level has already been taken care of */

    /* needed for the non-blocking conditional */
    matched = 0;

    /* get my local_offset */
    local_offset = my_ctl_pointer->offset;

    /* bump the ready flag */
    ready_flag++;

    /* the first level of zip-up already includes the first level of
     * recursive doubling */
    length = 2 * fragment_size / pow_2;
    start = 1;

    if (my_ctl_pointer->flag >= ready_flag) {
        /* Re-entry: restore the saved state.  The MB() below is skipped on
         * purpose to avoid its cost on every re-entry. */
        ready_flag   = my_ctl_pointer->flag;
        start        = my_ctl_pointer->start;
        local_offset = my_ctl_pointer->offset_zip;
        /* compute the correct length for the level being resumed */
        length = length * (1 << (start - 1));
    } else {
        MB();
        /* I am ready, set the flag */
        my_ctl_pointer->flag = ready_flag;
    }

    for (i = start; i < pow_2_levels; i++) {
        /* get my partner for this level */
        partner = my_rank ^ (1 << i);
        partner_ctl_pointer  = ctl_structs[partner];
        partner_data_pointer = (void *) data_buffs[partner].ctl_struct;

        /* is data ready */
        for (j = 0; j < cs->num_to_probe && matched == 0; j++) {
            if (IS_LARGE_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) {

                assert(partner_ctl_pointer->flag >= ready_flag);
                /* found it */
                matched = 1;

                /* only copy if I sit at a lower level in the tree */
                if (my_ctl_pointer->n_sends <= partner_ctl_pointer->n_sends) {

                    /* local offset is derived from the partner's offset */
                    if (partner_ctl_pointer->offset < my_ctl_pointer->offset) {
                        /* looking "up" the tree: the new chunk sits below
                         * mine and my region now starts there */
                        local_offset -= length;
                        memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset),
                               (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset),
                               length);
                    } else {
                        /* looking "down" the tree: the new chunk sits above
                         * mine */
                        local_offset += length;
                        memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset),
                               (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset),
                               length);
                        /* reset my local offset */
                        local_offset -= length;
                    }
                }

                /* bump the ready flag, ensure completion, and fire the
                 * flag for the next level */
                ready_flag++;
                MB();
                my_ctl_pointer->flag = ready_flag;

                /* double the length */
                length *= 2;
            }
        }

        /* check to see what kind of progress I've made */
        if (0 == matched) {
            /* save state, hop out and try again later */
            my_ctl_pointer->start = i;
            my_ctl_pointer->offset_zip = local_offset;
            /* put in progress queue */
            return BCOL_FN_STARTED;
        }

        /* else, start the next level of recursive doubling */
        matched = 0;
    }

    /* cleanup */
    if (NULL != my_ctl_pointer->src_ptr) {
        free(my_ctl_pointer->src_ptr);
        my_ctl_pointer->src_ptr = NULL;
    }

Release:

    /* Releasing the buffer when this is the last bcol function is
     * currently disabled:
     *
     *   if( IS_LAST_BCOL_FUNC(c_input_args)) {
     *       rc = bcol_basesmuma_free_buff(
     *               &(bcol_module->colls_with_user_data),
     *               sequence_number);
     *   }
     */

    my_ctl_pointer->starting_flag_value++;
    my_ctl_pointer->status = FINISHED;
    return BCOL_FN_COMPLETE;
}
#endif
|
|
|
|
#if 0
/*
 * Unfinished skeleton of a pipelined binomial scatter/allgather broadcast.
 * Only the locals and the outline of the intended steps exist; the function
 * does nothing but return OMPI_SUCCESS.  The block is compiled out.
 */
int mca_bcol_basesmuma_bcast_binomial_scatter_allgather(void *desc)
{
    /* local variables */
    int rc, n_frags_sent;
    uint32_t stripe_number;
    int count, count_processed;
    size_t dt_size;
    uint32_t n_data_segments_to_schedule;
    ompi_datatype_t *dtype;
    message_descriptor_t *message_descriptor;
    mca_bcol_basesmuma_module_t *bcol_module;
    int pipe_depth;

    /* Planned steps, none implemented yet:
     *   1. get the full message descriptor
     *   2. compute the number of fragments to send
     *   3. start to fill the pipeline
     */

    return OMPI_SUCCESS;
}
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|