The project includes the following components and frameworks: - ML Collective component - NETPATTERNS and COMMPATTERNS common components - BCOL framework - SBGP framework Note: By default the ML collective component is disabled. To enable the new collectives, the user should raise the priority of the ml component (coll_ml_priority). ============================================= Primary Contributors (in alphabetical order): Ishai Rabinovich (Mellanox) Joshua S. Ladd (ORNL / Mellanox) Manjunath Gorentla Venkata (ORNL) Mike Dubman (Mellanox) Noam Bloch (Mellanox) Pavel (Pasha) Shamis (ORNL / Mellanox) Richard Graham (ORNL / Mellanox) Vasily Filipov (Mellanox) This commit was SVN r27078.
474 строки
16 KiB
474 строки
16 KiB
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Additional copyrights may follow
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "bcol_basesmuma.h"
#define __TEST_BLOCKING__ 1
#define __TEST_WAIT__ 0
#define __TEST_TEST__ 0
/* debug
* #include "opal/sys/timer.h"
* extern uint64_t timers[7];
* end debug */
/* debug */
/* end debug */
/*
 * Register the broadcast variants provided by this module.
 *
 * The function fills in two attribute records -- comm_attribs (collective
 * type, communicator size range, data-source kind, waiting semantics) and
 * inv_attribs (message size range, datatype and op bitmaps) -- and passes
 * them, together with `super`, to mca_bcol_base_set_attributes() once per
 * combination being registered.
 *
 * super - the base bcol module the attributes are registered on.
 *
 * NOTE(review): in this excerpt every mca_bcol_base_set_attributes() call is
 * cut off after its third argument and the function's braces and return
 * statement are not visible, so which broadcast routine each registration
 * binds to cannot be determined from here.
 */
int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super)
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
/* Communicator-level attributes: broadcast, group sizes 0..64, known data
 * source, non-blocking. */
comm_attribs.bcoll_type = BCOL_BCAST;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 64;
comm_attribs.data_src = DATA_SRC_KNOWN;
comm_attribs.waiting_semantics = NON_BLOCKING;
/* Invocation-level attributes: message sizes 0..20000 with every datatype
 * and op bit set. */
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
/* Register again with the data source marked as unknown. */
comm_attribs.data_src = DATA_SRC_UNKNOWN;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
/* Move to the large-message range 10000000..10485760, still with an unknown
 * data source. */
comm_attribs.data_src = DATA_SRC_UNKNOWN;
inv_attribs.bcol_msg_min = 10000000;
inv_attribs.bcol_msg_max = 10485760; /* range 4 */
/* NOTE(review): the extent of this conditional section is not visible in
 * this excerpt (no matching #else / #endif is shown). */
#ifdef __PORTALS_AVAIL__
comm_attribs.waiting_semantics = BLOCKING;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
comm_attribs.waiting_semantics = NON_BLOCKING;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
/* Back to a known data source for the remaining registrations. */
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
/* NOTE(review): the stray comment terminator on the "else" line below
 * suggests this use_hdl if/else was commented out upstream -- confirm
 * against the complete file. */
if (super->use_hdl) {
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
} else { */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
/* } */
/* includes shared memory optimization */
* Shared memory blocking Broadcast - fanin, for small data buffers.
* This routine assumes that buf (the input buffer) is a single writer
* multi reader (SWMR) shared memory buffer owned by the calling rank
* which is the only rank that can write to this buffer.
* It is also assumed that the buffers are registered and fragmented
* at the ML level and that buf is sufficiently large to hold the data.
* @param buf - SWMR shared buffer within a sbgp that the
* executing rank can write to.
* @param count - the number of elements in the shared buffer.
* @param dtype - the datatype of a shared buffer element.
* @param root - the index within the sbgp of the root.
* @param module - basesmuma module.
/*
 * Shared-memory broadcast over the module's fan-out read tree.
 *
 * Ranks are rotated by `root` so that the root lands on tree index 0.  The
 * root only publishes its ready flag.  Every other rank loops on
 * IS_PEER_READY() for its tree parent and then memcpy()s pack_len bytes
 * from the parent's payload buffer into input_args->src_desc->data_addr.
 * The interior-node path additionally publishes this rank's own ready flag
 * afterwards so that its children can proceed.
 *
 * input_args   - supplies count, dtype, root, sequence_num and src_desc
 *                (data_addr, buffer_index); result_in_rbuf is set to false.
 * c_input_args - supplies bcol_module.
 *
 * Returns rc, which is initialised to OMPI_SUCCESS; the one visible early
 * exit returns OMPI_ERROR.
 *
 * NOTE(review): this excerpt is missing lines (braces, loop bodies, the
 * right-hand side of the data_buffs assignment, preprocessor terminators).
 * In particular pack_len is initialised to 0 and no later assignment is
 * visible before the memcpy() calls -- presumably count * dt_size is
 * computed on a line not shown here; confirm against the complete file.
 */
int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args)
/* local variables */
int group_size, process_shift, my_node_index;
int my_rank;
int rc = OMPI_SUCCESS;
int my_fanout_parent;
int leading_dim, buff_idx, idx;
volatile int8_t ready_flag;
int count=input_args->count;
struct ompi_datatype_t* dtype=input_args->dtype;
int root=input_args->root;
int64_t sequence_number=input_args->sequence_num;
mca_bcol_basesmuma_module_t* bcol_module=
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
int bcol_id = (int) bcol_module->super.bcol_id;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile char* parent_data_pointer;
mca_bcol_basesmuma_header_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
mca_common_netpatterns_tree_node_t* my_fanout_read_tree;
size_t pack_len = 0, dt_size;
/* Destination for received data: this rank's own buffer. */
void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr );
/* NOTE(review): the end of this disabled debug section is not visible in
 * this excerpt. */
#if 0
fprintf(stderr,"Entering sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
/* we will work only on packed data - so compute the length*/
ompi_datatype_type_size(dtype, &dt_size);
buff_idx = input_args->src_desc->buffer_index;
/* Get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
/* Align node index to around sbgp root */
process_shift = root;
/* Rotate so the root becomes index 0; wrap negative results into
 * [0, group_size). */
my_node_index = my_rank - root;
if(0 > my_node_index ) {
my_node_index += group_size;
/* get my node for the bcast tree */
my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
/* Undo the rotation to turn the tree's parent index back into a group
 * rank, wrapping at group_size. */
my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
if(group_size <= my_fanout_parent){
my_fanout_parent -= group_size;
/* Set pointer to current proc ctrl region */
/*my_ctl_pointer = ctl_structs[my_rank]; */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
/* setup resource recycling */
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
* Fan out from root
if(ROOT_NODE == my_fanout_read_tree->my_node_type) {
input_args->result_in_rbuf = false;
/* Root should only signal it is ready */
my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
}else if(LEAF_NODE == my_fanout_read_tree->my_node_type) {
input_args->result_in_rbuf = false;
* Get parent payload data and control data.
* Get the pointer to the base address of the parent's payload buffer.
* Get the parent's control buffer.
parent_data_pointer = data_buffs[my_fanout_parent].payload;
parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;
/* Wait until parent signals that data is ready */
/* The order of conditions checked in this loop is important, as it can
* result in a race condition.
while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){
/* Copy the rank to a shared buffer writable by the current rank */
memcpy(data_addr, (void *)parent_data_pointer, pack_len);
/* NOTE(review): no visible statement changes rc before this check. */
if( 0 != rc ) {
return OMPI_ERROR;
input_args->result_in_rbuf = false;
/* Interior node */
/* Get parent payload data and control data */
parent_data_pointer = data_buffs[my_fanout_parent].payload;
parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;
/* Wait until parent signals that data is ready */
/* The order of conditions checked in this loop is important, as it can
* result in a race condition.
while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){
/* Copy the rank to a shared buffer writable by the current rank */
memcpy(data_addr, (void *)parent_data_pointer,pack_len);
/* Signal to children that they may read the data from my shared buffer */
my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
/* if I am the last instance of a basesmuma function in this collective,
* release the resources */
return rc;
/* zero-copy large message communication methods */
#if 0
/*
 * Broadcast variant that moves the payload through the module's first HDL
 * module (bcol_module->hdl_module[0]) instead of memcpy().
 *
 * As in the copy-based broadcast, ranks are rotated by `root` to locate this
 * rank's node in fanout_read_tree.  A descriptor and a single segment
 * (address input_args->sbuf, length count * dt_size) are allocated, then:
 *   - the root issues hdl_send() once per child;
 *   - a leaf receives from its parent;
 *   - the remaining (interior) path receives from its parent and then issues
 *     hdl_send() once per child.
 *
 * input_args   - supplies count, dtype, root, sequence_num, sbuf, src_desc.
 * c_input_args - supplies bcol_module; also tested by IS_LAST_BCOL_FUNC().
 *
 * NOTE(review): this function follows an `#if 0` in the file, so it appears
 * to be compiled out -- confirm where the matching #endif sits.  The excerpt
 * is also missing lines: the ERROR and Release labels targeted by the gotos,
 * closing braces, #else / #endif lines and several call arguments are not
 * visible, so the success-path return value cannot be determined from here.
 */
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args)
/* local variables */
int group_size, process_shift, my_node_index;
int my_rank, first_instance=0, flag_offset;
int rc = OMPI_SUCCESS;
int my_fanout_parent;
int leading_dim, buff_idx, idx;
volatile int64_t ready_flag;
int count=input_args->count;
struct ompi_datatype_t* dtype=input_args->dtype;
int root=input_args->root;
int64_t sequence_number=input_args->sequence_num;
mca_bcol_basesmuma_module_t* bcol_module=
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
mca_common_netpatterns_tree_node_t* my_fanout_read_tree;
size_t pack_len = 0, dt_size;
void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr);
struct mca_hdl_base_descriptor_t *hdl_desc;
struct mca_hdl_base_segment_t *hdl_seg;
int ret, completed, ridx/*remote rank index*/;
bool status;
volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer= NULL;
volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer= NULL;
volatile mca_bcol_basesmuma_ctl_struct_t *child_ctl_pointer= NULL;
struct mca_hdl_base_module_t* hdl = bcol_module->hdl_module[0];
/* we will work only on packed data - so compute the length*/
ompi_datatype_type_size(dtype, &dt_size);
pack_len = count * dt_size;
buff_idx = input_args->src_desc->buffer_index;
/* Get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
my_ctl_pointer = ctl_structs[my_rank];
/* Align node index to around sbgp root */
process_shift = root;
/* Rotate so the root becomes index 0; wrap negative results into
 * [0, group_size). */
my_node_index = my_rank - root;
if(0 > my_node_index ) {
my_node_index += group_size;
/* get my node for the bcast tree */
my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
/* Undo the rotation to turn the tree's parent index back into a group
 * rank, wrapping at group_size. */
my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
if(group_size <= my_fanout_parent){
my_fanout_parent -= group_size;
/* setup resource recycling */
/* A stored sequence number older than the current one marks the first use
 * of this control structure for this collective. */
if( my_ctl_pointer->sequence_number < sequence_number ) {
first_instance = 1;
if( first_instance ) {
/* Signal arrival */
my_ctl_pointer->flag = -1;
my_ctl_pointer->index = 1;
/* this does not need to use any flag values , so only need to
* set the value for subsequent values that may need this */
my_ctl_pointer->starting_flag_value = 0;
flag_offset = 0;
} else {
/* only one thread at a time will be making progress on this
* collective, so no need to make this atomic */
/* increment the starting flag by one and return */
flag_offset = my_ctl_pointer->starting_flag_value;
/* The flag value for this call combines the starting offset with the
 * sequence number. */
ready_flag = flag_offset + sequence_number + 1;
my_ctl_pointer->sequence_number = sequence_number;
/* NOTE(review): neither allocation below has a visible NULL check or a
 * matching free() in this excerpt -- confirm against the complete file. */
hdl_desc = (mca_hdl_base_descriptor_t *)
malloc (sizeof (mca_hdl_base_descriptor_t) * 1);
/*prepare a hdl data segment*/
hdl_seg = (mca_hdl_base_segment_t*)
malloc ( sizeof (mca_hdl_base_segment_t) * 1);
hdl_seg->seg_addr.pval = input_args->sbuf;
hdl_seg->seg_len = pack_len;
/* Hand the flag value and this rank's control structure to the endpoint. */
hdl->endpoint->ready_flag = ready_flag;
hdl->endpoint->local_ctrl = my_ctl_pointer;
hdl->endpoint->sbgp_contextid =
* Fan out from root
if(ROOT_NODE == my_fanout_read_tree->my_node_type) {
input_args->result_in_rbuf = false;
/* The root describes the segment as a send source. */
hdl_desc->des_src = hdl_seg;
hdl_desc->des_src_cnt = 1;
hdl_desc->isroot = true;
/*As the general semantics, there might be multiple pairs of send/recv
*on the topology tree*/
for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
child_ctl_pointer =
hdl->endpoint->remote_ctrl = child_ctl_pointer;
ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
if (ret != OMPI_SUCCESS) {
BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
goto ERROR;
}else if(LEAF_NODE == my_fanout_read_tree->my_node_type) {
input_args->result_in_rbuf = false;
* Get parent payload data and control data.
* Get the pointer to the base address of the parent's payload buffer.
* Get the parent's control buffer.
parent_ctl_pointer = ctl_structs[my_fanout_parent];
/* A leaf describes the segment as a receive destination. */
hdl_desc->des_dst = hdl_seg;
hdl_desc->des_dst_cnt = 1;
hdl_desc->isroot = false;
hdl->endpoint->remote_ctrl = parent_ctl_pointer;
/* NOTE(review): hdl_recv and hdl_recvi appear back to back here; presumably
 * they were alternatives selected by preprocessor conditionals whose
 * directive lines are not visible in this excerpt -- confirm. */
ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
ret = hdl->hdl_recvi(hdl, hdl->endpoint, NULL, 0, 0, &hdl_desc);
#if __TEST_WAIT__
ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc);
BASESMUMA_VERBOSE(1,("wait on rank %d is done!", my_rank));
if (OMPI_SUCCESS != ret) {
BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
goto ERROR;
status = false;
#if __TEST_TEST__
while (!status) {
hdl->hdl_test(&hdl_desc, &completed, &status);
BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank));
goto Release;
input_args->result_in_rbuf = false;
/* Interior node */
/* Get parent payload data and control data */
parent_ctl_pointer = ctl_structs[my_fanout_parent];
hdl_desc->des_dst = hdl_seg;
hdl_desc->des_dst_cnt = 1;
hdl_desc->isroot = false;
hdl->endpoint->remote_ctrl = parent_ctl_pointer;
ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
if (OMPI_SUCCESS != ret) {
goto ERROR;
if (OMPI_SUCCESS != ret) {
BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
goto ERROR;
/* Signal to children that they may read the data from my shared buffer */
/* The same segment is now described as a send source for the children. */
hdl_desc->des_src = hdl_seg;
hdl_desc->des_src_cnt = 1;
for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
child_ctl_pointer =
hdl->endpoint->remote_ctrl = child_ctl_pointer;
ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
if (ret != OMPI_SUCCESS) {
BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
goto ERROR;
goto Release;
/* if I am the last instance of a basesmuma function in this collective,
* release the resources */
if (IS_LAST_BCOL_FUNC(c_input_args)) {
rc = bcol_basesmuma_free_buff(
my_ctl_pointer->starting_flag_value += 1;
return OMPI_ERROR;