diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast.c index 3b3a6add14..340c0c4c7f 100644 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast.c +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast.c @@ -37,19 +37,19 @@ /* end debug */ int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super) { - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - comm_attribs.bcoll_type = BCOL_BCAST; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1048576; - comm_attribs.data_src = DATA_SRC_KNOWN; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; + comm_attribs.bcoll_type = BCOL_BCAST; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 1048576; + comm_attribs.data_src = DATA_SRC_KNOWN; + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; /* range 1 */ + inv_attribs.datatype_bitmap = 0xffffffff; + inv_attribs.op_types_bitmap = 0xffffffff; mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_basesmuma_bcast_k_nomial_knownroot, @@ -67,49 +67,47 @@ int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super) inv_attribs.bcol_msg_max = 20000; /* range 1 */ mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_bcast_k_nomial_anyroot, - bcol_basesmuma_bcast_k_nomial_anyroot); - - comm_attribs.data_src = DATA_SRC_UNKNOWN; - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ + bcol_basesmuma_bcast_k_nomial_anyroot, + bcol_basesmuma_bcast_k_nomial_anyroot); + + comm_attribs.data_src = DATA_SRC_UNKNOWN; + inv_attribs.bcol_msg_min = 10000000; + inv_attribs.bcol_msg_max = 10485760; /* range 4 */ #ifdef __PORTALS_AVAIL__ - comm_attribs.waiting_semantics = BLOCKING; - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_lmsg_scatter_allgather_portals_bcast, - bcol_basesmuma_lmsg_scatter_allgather_portals_bcast); + comm_attribs.waiting_semantics = BLOCKING; + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_basesmuma_lmsg_scatter_allgather_portals_bcast, + bcol_basesmuma_lmsg_scatter_allgather_portals_bcast); - - comm_attribs.waiting_semantics = NON_BLOCKING; - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast, - bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast); - comm_attribs.data_src = DATA_SRC_KNOWN; - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast, - bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast); + comm_attribs.waiting_semantics = NON_BLOCKING; + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast, + bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast); + + comm_attribs.data_src = DATA_SRC_KNOWN; + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast, + bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast); #else -/* - if (super->use_hdl) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_hdl_zerocopy_bcast, - bcol_basesmuma_hdl_zerocopy_bcast); - } else { */ - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - NULL, - NULL); - /* - bcol_basesmuma_binary_scatter_allgather_segment, - bcol_basesmuma_binary_scatter_allgather_segment); - */ -/* } */ + /* + if (super->use_hdl) { + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_basesmuma_hdl_zerocopy_bcast, + bcol_basesmuma_hdl_zerocopy_bcast); + } else { */ + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, NULL, NULL); + /* + bcol_basesmuma_binary_scatter_allgather_segment, + bcol_basesmuma_binary_scatter_allgather_segment); + */ + /* } */ #endif - return OMPI_SUCCESS; + return OMPI_SUCCESS; } /* includes shared memory optimization */ @@ -118,12 +116,12 @@ int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super) * Shared memory blocking Broadcast - fanin, for small data buffers. * This routine assumes that buf (the input buffer) is a single writer * multi reader (SWMR) shared memory buffer owned by the calling rank - * which is the only rank that can write to this buffers. + * which is the only rank that can write to this buffers. * It is also assumed that the buffers are registered and fragmented * at the ML level and that buf is sufficiently large to hold the data. * * - * @param buf - SWMR shared buffer within a sbgp that the + * @param buf - SWMR shared buffer within a sbgp that the * executing rank can write to. * @param count - the number of elements in the shared buffer. * @param dtype - the datatype of a shared buffer element. @@ -131,15 +129,15 @@ int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super) * @param module - basesmuma module. */ int bcol_basesmuma_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) + mca_bcol_base_function_t *c_input_args) { /* local variables */ int group_size, process_shift, my_node_index; - int my_rank; + int my_rank; int rc = OMPI_SUCCESS; int my_fanout_parent; int leading_dim, buff_idx, idx; - volatile int8_t ready_flag; + volatile int8_t ready_flag; int count=input_args->count; struct ompi_datatype_t* dtype=input_args->dtype; int root=input_args->root; @@ -156,9 +154,9 @@ int bcol_basesmuma_bcast(bcol_function_args_t *input_args, void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr ); -#if 0 +#if 0 fprintf(stderr,"Entering sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); + fflush(stderr); #endif @@ -168,7 +166,7 @@ int bcol_basesmuma_bcast(bcol_function_args_t *input_args, buff_idx = input_args->src_desc->buffer_index; - /* Get addressing information */ + /* Get addressing information */ my_rank = bcol_module->super.sbgp_partner_module->my_index; group_size = bcol_module->colls_no_user_data.size_of_group; leading_dim=bcol_module->colls_no_user_data.size_of_group; @@ -184,20 +182,20 @@ int bcol_basesmuma_bcast(bcol_function_args_t *input_args, } /* get my node for the bcast tree */ - my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]); + my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]); my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift; if(group_size <= my_fanout_parent){ my_fanout_parent -= group_size; - } + } /* Set pointer to current proc ctrl region */ /*my_ctl_pointer = ctl_structs[my_rank]; */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + /* setup resource recycling */ - + BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - + /* * Fan out from root */ @@ -214,23 +212,23 @@ int bcol_basesmuma_bcast(bcol_function_args_t *input_args, * Get the parent's control buffer. */ parent_data_pointer = data_buffs[my_fanout_parent].payload; - parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct; + parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct; /* Wait until parent signals that data is ready */ - /* The order of conditions checked in this loop is important, as it can - * result in a race condition. - */ - while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){ - opal_progress(); - } - - /* Copy the rank to a shared buffer writable by the current rank */ + /* The order of conditions checked in this loop is important, as it can + * result in a race condition. + */ + while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){ + opal_progress(); + } + + /* Copy the rank to a shared buffer writable by the current rank */ memcpy(data_addr, (void *)parent_data_pointer, pack_len); - if( 0 != rc ) { + if( 0 != rc ) { return OMPI_ERROR; } - + }else{ input_args->result_in_rbuf = false; /* Interior node */ @@ -238,19 +236,19 @@ int bcol_basesmuma_bcast(bcol_function_args_t *input_args, /* Get parent payload data and control data */ parent_data_pointer = data_buffs[my_fanout_parent].payload; parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct; - + /* Wait until parent signals that data is ready */ - /* The order of conditions checked in this loop is important, as it can - * result in a race condition. - */ - while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){ - opal_progress(); + /* The order of conditions checked in this loop is important, as it can + * result in a race condition. + */ + while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){ + opal_progress(); } - + /* Copy the rank to a shared buffer writable by the current rank */ memcpy(data_addr, (void *)parent_data_pointer,pack_len); - + /* Signal to children that they may read the data from my shared buffer */ opal_atomic_wmb (); my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; @@ -268,15 +266,15 @@ int bcol_basesmuma_bcast(bcol_function_args_t *input_args, /*zero-copy large massage communication methods*/ #if 0 int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) + mca_bcol_base_function_t *c_input_args) { /* local variables */ int group_size, process_shift, my_node_index; - int my_rank, first_instance=0, flag_offset; + int my_rank, first_instance=0, flag_offset; int rc = OMPI_SUCCESS; int my_fanout_parent; int leading_dim, buff_idx, idx; - volatile int64_t ready_flag; + volatile int64_t ready_flag; int count=input_args->count; struct ompi_datatype_t* dtype=input_args->dtype; int root=input_args->root; @@ -291,7 +289,7 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, struct mca_hdl_base_descriptor_t *hdl_desc; struct mca_hdl_base_segment_t *hdl_seg; - int ret, completed, ridx/*remote rank index*/; + int ret, completed, ridx/*remote rank index*/; bool status; volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer= NULL; @@ -306,7 +304,7 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, buff_idx = input_args->src_desc->buffer_index; - /* Get addressing information */ + /* Get addressing information */ my_rank = bcol_module->super.sbgp_partner_module->my_index; group_size = bcol_module->colls_no_user_data.size_of_group; leading_dim=bcol_module->colls_no_user_data.size_of_group; @@ -323,18 +321,18 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, } /* get my node for the bcast tree */ - my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]); + my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]); my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift; if(group_size <= my_fanout_parent){ my_fanout_parent -= group_size; - } + } /* setup resource recycling */ if( my_ctl_pointer->sequence_number < sequence_number ) { first_instance = 1; } - if( first_instance ) { + if( first_instance ) { /* Signal arrival */ my_ctl_pointer->flag = -1; my_ctl_pointer->index = 1; @@ -354,19 +352,19 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, ready_flag = flag_offset + sequence_number + 1; my_ctl_pointer->sequence_number = sequence_number; - hdl_desc = (mca_hdl_base_descriptor_t *) + hdl_desc = (mca_hdl_base_descriptor_t *) malloc (sizeof (mca_hdl_base_descriptor_t) * 1); - + /*prepare a hdl data segment*/ - hdl_seg = (mca_hdl_base_segment_t*) + hdl_seg = (mca_hdl_base_segment_t*) malloc ( sizeof (mca_hdl_base_segment_t) * 1); hdl_seg->seg_addr.pval = input_args->sbuf; hdl_seg->seg_len = pack_len; - + hdl->endpoint->ready_flag = ready_flag; hdl->endpoint->local_ctrl = my_ctl_pointer; - hdl->endpoint->sbgp_contextid = + hdl->endpoint->sbgp_contextid = bcol_module->super.sbgp_partner_module->group_comm->c_contextid; /* @@ -379,11 +377,11 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, hdl_desc->des_src_cnt = 1; hdl_desc->isroot = true; - /*As the general semantics, there might multiple pairs of send/recv - *on the topology tree*/ + /*As the general semantics, there might multiple pairs of send/recv + *on the topology tree*/ for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) { - child_ctl_pointer = - ctl_structs[my_fanout_read_tree->children_ranks[ridx]]; + child_ctl_pointer = + ctl_structs[my_fanout_read_tree->children_ranks[ridx]]; hdl->endpoint->remote_ctrl = child_ctl_pointer; ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc); if (ret != OMPI_SUCCESS) { @@ -398,10 +396,10 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, * Get the pointer to the base address of the parent's payload buffer. * Get the parent's control buffer. */ - parent_ctl_pointer = ctl_structs[my_fanout_parent]; + parent_ctl_pointer = ctl_structs[my_fanout_parent]; hdl_desc->des_dst = hdl_seg; - hdl_desc->des_dst_cnt = 1; + hdl_desc->des_dst_cnt = 1; hdl_desc->isroot = false; hdl->endpoint->remote_ctrl = parent_ctl_pointer; @@ -412,10 +410,10 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, #endif #if __TEST_WAIT__ - ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc); + ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc); BASESMUMA_VERBOSE(1,("wait on rank %d is done!", my_rank)); #endif - if (OMPI_SUCCESS != ret) { + if (OMPI_SUCCESS != ret) { BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank)); goto exit_ERROR; } @@ -424,42 +422,42 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, #if __TEST_TEST__ while (!status) { hdl->hdl_test(&hdl_desc, &completed, &status); - opal_progress(); + opal_progress(); BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank)); } #endif goto Release; - + }else{ input_args->result_in_rbuf = false; /* Interior node */ /* Get parent payload data and control data */ - parent_ctl_pointer = ctl_structs[my_fanout_parent]; - + parent_ctl_pointer = ctl_structs[my_fanout_parent]; + hdl_desc->des_dst = hdl_seg; - hdl_desc->des_dst_cnt = 1; + hdl_desc->des_dst_cnt = 1; hdl_desc->isroot = false; hdl->endpoint->remote_ctrl = parent_ctl_pointer; ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc); - if (OMPI_SUCCESS != ret) { + if (OMPI_SUCCESS != ret) { goto exit_ERROR; } - if (OMPI_SUCCESS != ret) { + if (OMPI_SUCCESS != ret) { BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank)); goto exit_ERROR; } - + /* Signal to children that they may read the data from my shared buffer */ opal_atomic_wmb (); hdl_desc->des_src = hdl_seg; - hdl_desc->des_src_cnt = 1; + hdl_desc->des_src_cnt = 1; for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) { - child_ctl_pointer = - ctl_structs[my_fanout_read_tree->children_ranks[ridx]]; + child_ctl_pointer = + ctl_structs[my_fanout_read_tree->children_ranks[ridx]]; hdl->endpoint->remote_ctrl = child_ctl_pointer; ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc); @@ -471,19 +469,19 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, goto Release; } -Release: + Release: /* if I am the last instance of a basesmuma function in this collectie, * release the resrouces */ if (IS_LAST_BCOL_FUNC(c_input_args)) { rc = bcol_basesmuma_free_buff( - &(bcol_module->colls_with_user_data), - sequence_number); + &(bcol_module->colls_with_user_data), + sequence_number); } my_ctl_pointer->starting_flag_value += 1; return BCOL_FN_COMPLETE; -exit_ERROR: + exit_ERROR: return OMPI_ERROR; } -#endif +#endif