diff --git a/ompi/mca/coll/sm2/coll_sm2.h b/ompi/mca/coll/sm2/coll_sm2.h index 42f193cb30..d8d3a8248d 100644 --- a/ompi/mca/coll/sm2/coll_sm2.h +++ b/ompi/mca/coll/sm2/coll_sm2.h @@ -106,11 +106,32 @@ BEGIN_C_DECLS /** MCA parameter: order of fan-out read tree */ int order_fanout_read_tree; - /*I MCA paramenter: number of polling loops to run while waiting + /** MCA paramenter: number of polling loops to run while waiting * for children or parent to complete their work */ int n_poll_loops; + /** MCA parameter: message size cutoff for switching between + * short and long protocol + */ + size_t short_message_size; + + /* + * Parameters to control methods used + */ + /** MCA parameter: method to force a given barrier method to be used. + * 0 - FANIN_FAN_OUT_BARRIER_FN + * 1 - RECURSIVE_DOUBLING_BARRIER_FN + */ + int force_barrier; + + /** MCA parameter: method to force a given reduce method to be used. + * 0 - FANIN_FAN_OUT_REDUCE_FN, + * 1 - REDUCE_SCATTER_GATHER_FN, + */ + int force_reduce; + + }; /** @@ -118,6 +139,30 @@ BEGIN_C_DECLS */ typedef struct mca_coll_sm2_component_t mca_coll_sm2_component_t; + /* + * Implemented function index list + */ + + /* barrier */ + enum{ + FANIN_FAN_OUT_BARRIER_FN, + RECURSIVE_DOUBLING_BARRIER_FN, + N_BARRIER_FNS + }; + + /* reduce */ + enum{ + FANIN_REDUCE_FN, + REDUCE_SCATTER_GATHER_FN, + N_REDUCE_FNS + }; + enum{ + SHORT_DATA_FN, + LONG_DATA_FN, + N_REDUCE_FNS_USED + }; + + /* enum for node type */ enum{ ROOT_NODE, @@ -326,7 +371,7 @@ BEGIN_C_DECLS int index_blocking_barrier_memory_bank; /* pointers to blocking memory control regions */ - mca_coll_sm2_nb_request_process_shared_mem_t ***ctl_blocking_barrier; + volatile mca_coll_sm2_nb_request_process_shared_mem_t ***ctl_blocking_barrier; /* description of allocated temp buffers - one struct per * buffer. Each buffer has space "owned" by each process @@ -403,16 +448,22 @@ BEGIN_C_DECLS /* collective tag */ long long collective_tag; - /* debug flag RLG */ - int blocked_on_barrier; - - long long barrier_bank_list[BARRIER_BANK_LIST_SIZE]; - long long barrier_bank_cntr; - /* end debug */ - /* scratch space - one int per process */ int *scratch_space; + /* message size cutoff for switching between short and long + * protocol + */ + size_t short_message_size; + + /* + * function table for variants of a given collective + * function. + */ + mca_coll_base_module_barrier_fn_t barrier_functions[N_BARRIER_FNS]; + mca_coll_base_module_reduce_fn_t list_reduce_functions[N_REDUCE_FNS]; + mca_coll_base_module_reduce_fn_t reduce_functions[N_REDUCE_FNS_USED]; + }; typedef struct mca_coll_sm2_module_t mca_coll_sm2_module_t; @@ -541,6 +592,16 @@ BEGIN_C_DECLS int root, struct ompi_communicator_t *comm, struct mca_coll_base_module_1_1_0_t *module); + int mca_coll_sm2_reduce_intra_reducescatter_gather(void *sbuf, void *rbuf, + int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + struct mca_coll_base_module_1_1_0_t *module); + + int mca_coll_sm2_reduce_intra_fanin(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + struct mca_coll_base_module_1_1_0_t *module); + /** * Shared memory blocking broadcast. */ @@ -555,6 +616,14 @@ BEGIN_C_DECLS int mca_coll_sm2_barrier_intra( struct ompi_communicator_t *comm, struct mca_coll_base_module_1_1_0_t *module); + int mca_coll_sm2_barrier_intra_fanin_fanout( + struct ompi_communicator_t *comm, + struct mca_coll_base_module_1_1_0_t *module); + + int mca_coll_sm2_barrier_intra_recursive_doubling( + struct ompi_communicator_t *comm, + struct mca_coll_base_module_1_1_0_t *module); + END_C_DECLS #endif /* MCA_COLL_SM2_EXPORT_H */ diff --git a/ompi/mca/coll/sm2/coll_sm2_allreduce.c b/ompi/mca/coll/sm2/coll_sm2_allreduce.c index 145dec2565..c468d56cca 100644 --- a/ompi/mca/coll/sm2/coll_sm2_allreduce.c +++ b/ompi/mca/coll/sm2/coll_sm2_allreduce.c @@ -1118,7 +1118,7 @@ int mca_coll_sm2_allreduce_intra_reducescatter_allgather(void *sbuf, void *rbuf, /* local varibles */ int i,rc=OMPI_SUCCESS,n_dts_per_buffer,n_data_segments,stripe_number; int pair_rank,exchange,extra_rank,n_proc_data,tmp; - int starting_proc,n_procs_to_read,base_block_proc,base_read_proc; + int starting_proc; int n_elements_per_proc, n_residual_elements; int cnt_offset,n_copy; pair_exchange_node_t *my_exchange_node; @@ -1129,7 +1129,6 @@ int mca_coll_sm2_allreduce_intra_reducescatter_allgather(void *sbuf, void *rbuf, ptrdiff_t dt_extent; long long tag, base_tag; sm_work_buffer_t *sm_buffer_desc; - volatile char * my_write_pointer; volatile char * extra_rank_write_data_pointer; volatile char * extra_rank_read_data_pointer; volatile char * partner_base_pointer; diff --git a/ompi/mca/coll/sm2/coll_sm2_barrier.c b/ompi/mca/coll/sm2/coll_sm2_barrier.c index 58373c2756..9915acf7f6 100644 --- a/ompi/mca/coll/sm2/coll_sm2_barrier.c +++ b/ompi/mca/coll/sm2/coll_sm2_barrier.c @@ -314,13 +314,12 @@ DONE: /** * Shared memory blocking allreduce. */ -static int mca_coll_sm2_barrier_intra_fanin_fanout( struct ompi_communicator_t *comm, struct mca_coll_base_module_1_1_0_t *module) { /* local variables */ - int rc=OMPI_SUCCESS; + int rc=OMPI_SUCCESS,bar_buff_index; int my_rank, child_rank, child, n_parents, n_children; int my_fanin_parent; int my_fanout_parent; @@ -334,8 +333,6 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( sm_module=(mca_coll_sm2_module_t *) module; - - /* get my node for the reduction tree */ my_rank=ompi_comm_rank(comm); my_reduction_node=&(sm_module->reduction_tree[my_rank]); @@ -351,10 +348,17 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( tag=sm_module->collective_tag; sm_module->collective_tag++; + /* sm_buffer_desc=alloc_sm2_shared_buffer(sm_module); + */ + sm_module->index_blocking_barrier_memory_bank^=1; + bar_buff_index=sm_module->index_blocking_barrier_memory_bank; - /* offset to data segment */ - my_ctl_pointer=sm_buffer_desc->proc_memory[my_rank].control_region; + my_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][my_rank]; + /* + sm_buffer_desc->proc_memory[my_rank].control_region; + */ /*************************** * Fan into root phase @@ -369,11 +373,13 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( child_rank=my_reduction_node->children_ranks[child]; child_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][child_rank]; + /* sm_buffer_desc->proc_memory[child_rank].control_region; + */ /* wait until child flag is set */ while( child_ctl_pointer->flag != tag ) { - /* Note: Actually need to make progress here */ opal_progress(); } @@ -384,7 +390,9 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( /* set memory barriet to make sure data is in main memory before * the completion flgas are set. */ + /* MB(); + */ /* * Signal parent that data is ready @@ -396,7 +404,9 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( /* set memory barriet to make sure data is in main memory before * the completion flgas are set. */ + /* MB(); + */ /* * Signal parent that data is ready @@ -415,14 +425,19 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( /* I am the root - so copy signal children, and then * start reading */ + /* MB(); + */ my_ctl_pointer->flag=-tag; } else if( LEAF_NODE == my_fanout_read_tree->my_node_type ) { parent_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][my_fanout_parent]; + /* sm_buffer_desc->proc_memory[my_fanout_parent].control_region; + */ /* * wait on Parent to signal that data is ready @@ -435,7 +450,10 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( /* interior nodes */ parent_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][my_fanout_parent]; + /* sm_buffer_desc->proc_memory[my_fanout_parent].control_region; + */ /* * wait on Parent to signal that data is ready @@ -447,7 +465,9 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( /* set memory barriet to make sure data is in main memory before * the completion flgas are set. */ + /* MB(); + */ /* signal children that they may read the result data */ my_ctl_pointer->flag=-tag; @@ -455,10 +475,12 @@ int mca_coll_sm2_barrier_intra_fanin_fanout( } /* "free" the shared-memory working buffer */ + /* rc=free_sm2_shared_buffer(sm_module); if( OMPI_SUCCESS != rc ) { goto Error; } + */ /* return */ return rc; @@ -467,6 +489,204 @@ Error: return rc; } +/** + * Shared memory blocking barrier. + */ +int mca_coll_sm2_barrier_intra_recursive_doubling( + struct ompi_communicator_t *comm, + struct mca_coll_base_module_1_1_0_t *module) +{ + /* local variables */ + int rc=OMPI_SUCCESS; + int pair_rank,exchange,extra_rank; + pair_exchange_node_t *my_exchange_node; + int my_rank,bar_buff_index; + long long tag, base_tag; + mca_coll_sm2_nb_request_process_shared_mem_t *my_ctl_pointer; + volatile mca_coll_sm2_nb_request_process_shared_mem_t * + partner_ctl_pointer; + volatile mca_coll_sm2_nb_request_process_shared_mem_t * + extra_ctl_pointer; + mca_coll_sm2_module_t *sm_module; + /* debug + opal_timer_t t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10; + end debug */ + + sm_module=(mca_coll_sm2_module_t *) module; + + /* get my node for the reduction tree */ + my_exchange_node=&(sm_module->recursive_doubling_tree); + my_rank=ompi_comm_rank(comm); + + /* get pointer to barrier strcuture */ + sm_module->index_blocking_barrier_memory_bank^=1; + bar_buff_index=sm_module->index_blocking_barrier_memory_bank; + + + /* get unique set of tags for this stripe. + * Assume only one collective + * per communicator at a given time, so no locking needed + * for atomic update of the tag */ + base_tag=sm_module->collective_tag; + sm_module->collective_tag+=my_exchange_node->n_tags; + + /* get pointers to my work buffers */ + my_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][my_rank]; + /* + sm_buffer_desc->proc_memory[my_rank].control_region; + */ + + /* copy data in from the "extra" source, if need be */ + tag=base_tag; + if(0 < my_exchange_node->n_extra_sources) { + + if ( EXCHANGE_NODE == my_exchange_node->node_type ) { + + extra_rank=my_exchange_node->rank_extra_source; + extra_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][extra_rank]; + /* + sm_buffer_desc->proc_memory[extra_rank].control_region; + */ + + /* wait until remote data is read */ + while( extra_ctl_pointer->flag < tag ) { + opal_progress(); + } + + } else { + + /* + MB(); + */ + + /* + * Signal parent that data is ready + */ + my_ctl_pointer->flag=tag; + + } + } + + /* + MB(); + */ + /* + * Signal parent that data is ready + */ + tag=base_tag+1; + my_ctl_pointer->flag=tag; + + /* loop over data exchanges */ + for(exchange=0 ; exchange < my_exchange_node->n_exchanges ; exchange++) { + + /* debug + t4=opal_sys_timer_get_cycles(); + end debug */ + + /* is the remote data read */ + pair_rank=my_exchange_node->rank_exchanges[exchange]; + partner_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][pair_rank]; + /* + sm_buffer_desc->proc_memory[pair_rank].control_region; + */ + + /* + MB(); + */ + my_ctl_pointer->flag=tag; + + /* wait until remote data is read */ + while( partner_ctl_pointer->flag < tag ) { + opal_progress(); + } + + /* end test */ + + /* signal that I am done reading my peer's data */ + tag++; + + } + + /* copy data in from the "extra" source, if need be */ + if(0 < my_exchange_node->n_extra_sources) { + tag=base_tag+my_exchange_node->n_tags-1; + + if ( EXTRA_NODE == my_exchange_node->node_type ) { + + extra_rank=my_exchange_node->rank_extra_source; + extra_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][extra_rank]; + /* + sm_buffer_desc->proc_memory[extra_rank].control_region; + */ + + /* wait until remote data is read */ + while(! ( extra_ctl_pointer->flag == tag ) ) { + opal_progress(); + } + + + /* signal that I am done */ + my_ctl_pointer->flag=tag; + + } else { + + tag=base_tag+my_exchange_node->n_tags-1; + /* set memory barriet to make sure data is in main memory before + * the completion flgas are set. + */ + + /* + MB(); + */ + + /* + * Signal parent that data is ready + */ + my_ctl_pointer->flag=tag; + + /* wait until child is done to move on - this buffer will + * be reused for the next stripe, so don't want to move + * on too quick. + */ + extra_rank=my_exchange_node->rank_extra_source; + extra_ctl_pointer= + sm_module->ctl_blocking_barrier[bar_buff_index][extra_rank]; + /* + sm_buffer_desc->proc_memory[extra_rank].control_region; + */ + + /* wait until remote data is read */ + while( extra_ctl_pointer->flag < tag ) { + opal_progress(); + } + } + } + + + + /* debug + + t9=opal_sys_timer_get_cycles(); + timers[5]+=(t9-t8); + end debug */ + + + /* "free" the shared-memory working buffer */ + rc=free_sm2_shared_buffer(sm_module); + if( OMPI_SUCCESS != rc ) { + goto Error; + } + + /* return */ + return rc; + +Error: + return rc; +} /** * Shared memory blocking barrier */ @@ -475,8 +695,11 @@ int mca_coll_sm2_barrier_intra( struct ompi_communicator_t *comm, { /* local variables */ int rc; + mca_coll_sm2_module_t *sm_module; - rc= mca_coll_sm2_barrier_intra_fanin_fanout(comm, module); + sm_module=(mca_coll_sm2_module_t *) module; + + rc= sm_module->barrier_functions[0](comm, module); if( OMPI_SUCCESS != rc ) { goto Error; } diff --git a/ompi/mca/coll/sm2/coll_sm2_component.c b/ompi/mca/coll/sm2/coll_sm2_component.c index 1acd4f6b82..a6ddf0e833 100644 --- a/ompi/mca/coll/sm2/coll_sm2_component.c +++ b/ompi/mca/coll/sm2/coll_sm2_component.c @@ -140,7 +140,7 @@ static int sm2_open(void) /* set component priority */ cs->sm2_priority= - mca_coll_sm2_param_register_int("sm2_priority",0); + mca_coll_sm2_param_register_int("sm2_priority",90); /* set control region size (bytes), per proc */ cs->sm2_ctl_size_per_proc= @@ -155,7 +155,7 @@ static int sm2_open(void) /* Min data Segment size (bytes) - per proc */ cs->sm2_data_seg_size= - mca_coll_sm2_param_register_int("sm2_data_seg_size",0); + mca_coll_sm2_param_register_int("sm2_data_seg_size",32768); /* Max data Segment size (bytes) - per proc */ cs->sm2_max_data_seg_size= @@ -194,6 +194,20 @@ static int sm2_open(void) cs->n_poll_loops= mca_coll_sm2_param_register_int("n_poll_loops",4); + /* Size of message for switching between short and long protocol. + * This should probably be the segment size for several algorithms, + * though not all. + */ + cs->short_message_size= + mca_coll_sm2_param_register_int("short_message_size",32768); + + /* collective ops to use */ + cs->force_barrier= + mca_coll_sm2_param_register_int("force_barrier",(-1)); + cs->force_reduce= + mca_coll_sm2_param_register_int("force_reduce",(-1)); + + /* debug */ /* new_sigact.sa_handler=dbg_handler; diff --git a/ompi/mca/coll/sm2/coll_sm2_module.c b/ompi/mca/coll/sm2/coll_sm2_module.c index e01ea8b923..a8013e1d37 100644 --- a/ompi/mca/coll/sm2/coll_sm2_module.c +++ b/ompi/mca/coll/sm2/coll_sm2_module.c @@ -726,6 +726,42 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority) sm_module->super.coll_scatter = NULL; sm_module->super.coll_scatterv = NULL; + /* + * set up specific function to be used + */ + + /* barrier */ + sm_module->barrier_functions[FANIN_FAN_OUT_BARRIER_FN]= + mca_coll_sm2_barrier_intra_fanin_fanout; + sm_module->barrier_functions[RECURSIVE_DOUBLING_BARRIER_FN]= + mca_coll_sm2_barrier_intra_fanin_fanout; + if( ( 0 <= mca_coll_sm2_component.force_barrier ) && + ( N_BARRIER_FNS > mca_coll_sm2_component.force_barrier ) ) { + /* set user specifed function */ + mca_coll_base_module_barrier_fn_t tmp_fn= + sm_module->barrier_functions[mca_coll_sm2_component.force_barrier]; + sm_module->barrier_functions[FANIN_FAN_OUT_BARRIER_FN]=tmp_fn; + sm_module->barrier_functions[RECURSIVE_DOUBLING_BARRIER_FN]=tmp_fn; + } + + /* reduce */ + sm_module->list_reduce_functions[FANIN_REDUCE_FN]= + mca_coll_sm2_reduce_intra_fanin; + sm_module->list_reduce_functions[REDUCE_SCATTER_GATHER_FN]= + mca_coll_sm2_reduce_intra_reducescatter_gather; + sm_module->reduce_functions[SHORT_DATA_FN]= + sm_module->list_reduce_functions[FANIN_REDUCE_FN]; + sm_module->reduce_functions[LONG_DATA_FN]= + sm_module->list_reduce_functions[REDUCE_SCATTER_GATHER_FN]; + if( ( 0 <= mca_coll_sm2_component.force_reduce ) && + ( N_REDUCE_FNS > mca_coll_sm2_component.force_reduce ) ) { + /* set user specifed function */ + mca_coll_base_module_barrier_fn_t tmp_fn= + sm_module->reduce_functions[mca_coll_sm2_component.force_reduce]; + sm_module->reduce_functions[SHORT_DATA_FN]=tmp_fn; + sm_module->reduce_functions[LONG_DATA_FN]=tmp_fn; + } + /* * Some initialization */ @@ -1059,7 +1095,7 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority) sm_module->index_blocking_barrier_memory_bank=0; sm_module->ctl_blocking_barrier= - (mca_coll_sm2_nb_request_process_shared_mem_t ***) + (volatile mca_coll_sm2_nb_request_process_shared_mem_t ***) malloc(2*sizeof(mca_coll_sm2_nb_request_process_shared_mem_t **)); if( NULL == sm_module->ctl_blocking_barrier ) { goto CLEANUP; @@ -1077,11 +1113,6 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority) goto CLEANUP; } - /* debug */ - fprintf(stderr," sizeof(mca_coll_sm2_nb_request_process_shared_mem_t) %lx \n", - sizeof(mca_coll_sm2_nb_request_process_shared_mem_t)); - fflush(stderr); - /* end debug */ for( j= 0 ; j < 2 ; j++ ) { for( i=0 ; i < group_size ; i++ ) { sm_module->ctl_blocking_barrier[j][i]= @@ -1091,18 +1122,12 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority) j*sizeof(mca_coll_sm2_nb_request_process_shared_mem_t)+ i*sm_module->per_proc_size_of_blocking_barrier_region ) ; - /* debug */ - fprintf(stderr," i %d j %d %p base %p pp %lx\n",i,j, - sm_module->ctl_blocking_barrier[j][i], - sm_module->sm_blocking_barrier_region, - sm_module->per_proc_size_of_blocking_barrier_region); - fflush(stderr); - /* end debug */ sm_module->ctl_blocking_barrier[j][i]->flag=0; } } - + /* set the switch-over parameter */ + sm_module->short_message_size=mca_coll_sm2_component.short_message_size; /* touch pages to apply memory affinity - Note: do we really need this or will * the algorithms do this */ diff --git a/ompi/mca/coll/sm2/coll_sm2_reduce.c b/ompi/mca/coll/sm2/coll_sm2_reduce.c index 5de162b1d0..27e969d790 100644 --- a/ompi/mca/coll/sm2/coll_sm2_reduce.c +++ b/ompi/mca/coll/sm2/coll_sm2_reduce.c @@ -20,22 +20,10 @@ extern uint64_t timers[7]; end debug */ -/* debug -#include -extern void debug_module(void); -extern int last_root; -extern int node_type; -extern long long free_buff_free_index; -int last_root; -int node_type; - end debug */ - - /** * Shared memory blocking allreduce. */ -static int mca_coll_sm2_reduce_intra_fanin(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, @@ -400,8 +388,7 @@ Error: /** * Shared memory blocking reduce. */ -static -int mca_coll_sm2_reduce_intra_reducescatter_allgather(void *sbuf, void *rbuf, +int mca_coll_sm2_reduce_intra_reducescatter_gather(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, @@ -492,14 +479,6 @@ int mca_coll_sm2_reduce_intra_reducescatter_allgather(void *sbuf, void *rbuf, sm_module->scratch_space[i]++; } } - /* debug - fprintf(stderr," my_rank %d element list count_this_stripe %d : ",my_rank,count_this_stripe); - for(i=0 ; i < comm_size ; i++ ) { - fprintf(stderr," %d ",sm_module->scratch_space[i]); - } - fprintf(stderr," \n"); - fflush(stderr); - end debug */ /* get unique set of tags for this stripe. * Assume only one collective @@ -745,6 +724,34 @@ int mca_coll_sm2_reduce_intra(void *sbuf, void *rbuf, int count, { /* local variables */ int rc; + mca_coll_sm2_module_t *sm_module; + ptrdiff_t dt_extent; + size_t len_data_buffer; + + sm_module=(mca_coll_sm2_module_t *) module; + + /* get size of data needed - same layout as user data, so that + * we can apply the reudction routines directly on these buffers + */ + rc=ompi_ddt_type_extent(dtype, &dt_extent); + if( OMPI_SUCCESS != rc ) { + goto Error; + } + + len_data_buffer=count*dt_extent; + + if( len_data_buffer <= sm_module->short_message_size) { + rc=sm_module->reduce_functions[SHORT_DATA_FN] + (sbuf, rbuf, count, dtype, op, root, comm, module); + } + else { + rc=sm_module->reduce_functions[LONG_DATA_FN] + (sbuf, rbuf, count, dtype, op, root, comm, module); + } + + if( OMPI_SUCCESS != rc ) { + goto Error; + } #if 0 rc= mca_coll_sm2_reduce_intra_fanin(sbuf, rbuf, count, @@ -752,13 +759,13 @@ int mca_coll_sm2_reduce_intra(void *sbuf, void *rbuf, int count, if( OMPI_SUCCESS != rc ) { goto Error; } -#endif - rc= mca_coll_sm2_reduce_intra_reducescatter_allgather(sbuf, rbuf, count, + rc= mca_coll_sm2_reduce_intra_reducescatter_gather(sbuf, rbuf, count, dtype, op, root, comm, module); if( OMPI_SUCCESS != rc ) { goto Error; } +#endif return OMPI_SUCCESS;