diff --git a/ompi/mca/coll/sm2/coll_sm2_allreduce.c b/ompi/mca/coll/sm2/coll_sm2_allreduce.c index 0f1c590642..974fbac2de 100644 --- a/ompi/mca/coll/sm2/coll_sm2_allreduce.c +++ b/ompi/mca/coll/sm2/coll_sm2_allreduce.c @@ -52,11 +52,6 @@ int mca_coll_sm2_allreduce_intra_fanin_fanout(void *sbuf, void *rbuf, int count, sm_module=(mca_coll_sm2_module_t *) module; - /* get unique tag for this collective - assume only one collective - * per communicator at a given time, so no locking needed - * for atomic update of the tag */ - tag=sm_module->collective_tag; - sm_module->collective_tag++; /* get size of data needed - same layout as user data, so that * we can apply the reudction routines directly on these buffers @@ -95,6 +90,12 @@ int mca_coll_sm2_allreduce_intra_fanin_fanout(void *sbuf, void *rbuf, int count, /* NOTE: starting with a rather synchronous approach */ for( stripe_number=0 ; stripe_number < n_data_segments ; stripe_number++ ) { + /* get unique tag for this stripe - assume only one collective + * per communicator at a given time, so no locking needed + * for atomic update of the tag */ + tag=sm_module->collective_tag; + sm_module->collective_tag++; + sm_buffer_desc=alloc_sm2_shared_buffer(sm_module); /* get number of elements to process in this stripe */ diff --git a/ompi/op/op_predefined.c b/ompi/op/op_predefined.c index 3b935cacac..98201bb6df 100644 --- a/ompi/op/op_predefined.c +++ b/ompi/op/op_predefined.c @@ -680,8 +680,8 @@ LOC_FUNC(minloc, long_double_int, <) * routines, needed for some optimizations. */ #define OP_FUNC_3BUF(name, type_name, type, op) \ - void ompi_mpi_op_three_buff_##name##_##type_name(restrict void *in1, \ - restrict void *in2, restrict void *out, int *count, \ + void ompi_mpi_op_three_buff_##name##_##type_name(void * restrict in1, \ + void * restrict in2, void * restrict out, int *count, \ MPI_Datatype *dtype) \ { \ int i; \ @@ -694,8 +694,8 @@ LOC_FUNC(minloc, long_double_int, <) } #define COMPLEX_OP_FUNC_SUM_3BUF(type_name, type) \ - void ompi_mpi_op_sum_three_buff_##type_name(restrict void *in1, \ - restrict void * in2, restrict void *out, int *count, \ + void ompi_mpi_op_sum_three_buff_##type_name(void * restrict in1, \ + void * restrict in2, void * restrict out, int *count, \ MPI_Datatype *dtype) \ { \ int i; \ @@ -709,8 +709,8 @@ LOC_FUNC(minloc, long_double_int, <) } #define COMPLEX_OP_FUNC_PROD_3BUF(type_name, type) \ - void ompi_mpi_op_prod_three_buff_##type_name(restrict void *in1, \ - restrict void *in2, restrict void *out, int *count, \ + void ompi_mpi_op_prod_three_buff_##type_name(void * restrict in1, \ + void * restrict in2, void * restrict out, int *count, \ MPI_Datatype *dtype) \ { \ int i; \ @@ -734,8 +734,8 @@ LOC_FUNC(minloc, long_double_int, <) * This macro is for (out = op(in1, in2)) */ #define FUNC_FUNC_3BUF(name, type_name, type) \ - void ompi_mpi_op_three_buff_##name##_##type_name(restrict void *in1, \ - restrict void *in2, restrict void *out, int *count, \ + void ompi_mpi_op_three_buff_##name##_##type_name(void * restrict in1, \ + void * restrict in2, void * restrict out, int *count, \ MPI_Datatype *dtype) \ { \ int i; \ @@ -766,9 +766,9 @@ LOC_FUNC(minloc, long_double_int, <) */ #define LOC_FUNC_3BUF(name, type_name, op) \ - void ompi_mpi_op_three_buff_##name##_##type_name(restrict void *in1, \ - restrict void *in2, restrict void *out, int *count, \ - MPI_Datatype *dtype) \ + void ompi_mpi_op_three_buff_##name##_##type_name(void * restrict in1, \ + void * restrict in2, void * restrict out, int *count, \ + MPI_Datatype *dtype) \ { \ int i; \ ompi_op_predefined_##type_name##_t *a1 = (ompi_op_predefined_##type_name##_t*) in1; \