diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index 2250b59c8d..1cda6866c0 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -45,7 +45,23 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_use_dynamic_rules; OMPI_COMP_EXPORT extern int mca_coll_tuned_init_tree_fanout; OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout; +/* forced algorithm choices */ +OMPI_COMP_EXPORT extern int mca_coll_tuned_alltoall_forced_choice; +OMPI_COMP_EXPORT extern int mca_coll_tuned_alltoall_forced_segsize; +OMPI_COMP_EXPORT extern int mca_coll_tuned_alltoall_forced_tree_fanout; +OMPI_COMP_EXPORT extern int mca_coll_tuned_alltoall_forced_chain_fanout; +OMPI_COMP_EXPORT extern int mca_coll_tuned_barrier_forced_choice; + +OMPI_COMP_EXPORT extern int mca_coll_tuned_bcast_forced_choice; +OMPI_COMP_EXPORT extern int mca_coll_tuned_bcast_forced_segsize; +OMPI_COMP_EXPORT extern int mca_coll_tuned_bcast_forced_tree_fanout; +OMPI_COMP_EXPORT extern int mca_coll_tuned_bcast_forced_chain_fanout; + +OMPI_COMP_EXPORT extern int mca_coll_tuned_reduce_forced_choice; +OMPI_COMP_EXPORT extern int mca_coll_tuned_reduce_forced_segsize; +OMPI_COMP_EXPORT extern int mca_coll_tuned_reduce_forced_tree_fanout; +OMPI_COMP_EXPORT extern int mca_coll_tuned_reduce_forced_chain_fanout; /* * coll API functions @@ -146,6 +162,13 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout; void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm); + int mca_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + int mca_coll_tuned_alltoall_intra_check_forced(void); + int mca_coll_tuned_alltoall_intra_query (void); int mca_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount, struct ompi_datatype_t *sdtype, @@ -233,11 +256,14 @@ OMPI_COMP_EXPORT extern int 
mca_coll_tuned_init_chain_fanout; int mca_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm); int mca_coll_tuned_barrier_intra_dec_dynamic( struct ompi_communicator_t *comm); + int mca_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm); + int mca_coll_tuned_barrier_intra_check_forced(void); + int mca_coll_tuned_barrier_intra_query (void); + int mca_coll_tuned_barrier_inter_dec_fixed(struct ompi_communicator_t *comm); int mca_coll_tuned_barrier_inter_dec_dynamic( struct ompi_communicator_t *comm); - int mca_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm); int mca_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm); int mca_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm); @@ -253,6 +279,12 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout; struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm); + int mca_coll_tuned_bcast_intra_do_forced(void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm); + int mca_coll_tuned_bcast_intra_check_forced(void); + int mca_coll_tuned_bcast_intra_query (void); int mca_coll_tuned_bcast_intra_linear(void *buff, int count, struct ompi_datatype_t *datatype, @@ -366,6 +398,12 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout; struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm); + int mca_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm); + int mca_coll_tuned_reduce_intra_check_forced(void); + int mca_coll_tuned_reduce_intra_query (void); int mca_coll_tuned_reduce_intra_chain(void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall.c b/ompi/mca/coll/tuned/coll_tuned_alltoall.c index 78ff48cfee..dcfc0fe9c5 100644 --- 
a/ompi/mca/coll/tuned/coll_tuned_alltoall.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall.c @@ -30,6 +30,7 @@ #include #include +#include int mca_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount, struct ompi_datatype_t *sdtype, @@ -331,6 +332,67 @@ int mca_coll_tuned_alltoall_intra_linear(void *sbuf, int scount, return err; } +/* The following are used by dynamic and forced rules */ + +/* publish details of each algorithm and if its forced/fixed/locked in */ +/* as you add methods/algorithms you must update this and the query/map routines */ + +int mca_coll_tuned_alltoall_intra_check_forced ( ) +{ + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "alltoall_algorithm", + "Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 pairwise, 3: modified bruck, 4: two proc only.", + false, false, mca_coll_tuned_alltoall_forced_choice, + &mca_coll_tuned_alltoall_forced_choice); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "alltoall_algorithm_segmentsize", + "Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", + false, false, mca_coll_tuned_alltoall_forced_segsize, + &mca_coll_tuned_alltoall_forced_segsize); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "alltoall_algorithm_tree_fanout", + "Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", + false, false, + mca_coll_tuned_init_tree_fanout, /* get system wide default */ + &mca_coll_tuned_alltoall_forced_tree_fanout); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "alltoall_algorithm_chain_fanout", + "Fanout for chains used for alltoall algorithms. 
Only has meaning if algorithm is forced and supports chain topo based operation.", + false, false, + mca_coll_tuned_init_chain_fanout, /* get system wide default */ + &mca_coll_tuned_alltoall_forced_chain_fanout); + +return (MPI_SUCCESS); +} +int mca_coll_tuned_alltoall_intra_query ( ) +{ + return (4); /* 4 algorithms available */ +} + + +int mca_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm) +{ +switch (mca_coll_tuned_alltoall_forced_choice) { + case (0): return mca_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + case (1): return mca_coll_tuned_alltoall_intra_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + case (2): return mca_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + case (3): return mca_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + case (4): return mca_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + default: + OPAL_OUTPUT((mca_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", + mca_coll_tuned_alltoall_forced_choice, mca_coll_tuned_alltoall_intra_query())); + return (MPI_ERR_ARG); + } /* switch */ + +} diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision_fixed.c index 7256dbf59a..f2d498fc6a 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision_fixed.c @@ -48,21 +48,39 @@ int mca_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount, int err; int contig; int dsize; + MPI_Aint sext; + long lb; OPAL_OUTPUT((mca_coll_tuned_stream, "mca_coll_tuned_alltoall_intra_dec_fixed")); size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); + /* 
special case */ if (size==2) { return mca_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); } - else { -/* return mca_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); */ - return mca_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + + /* else we need data size for decision function */ + err = ompi_ddt_get_extent (sdtype, &lb, &sext); + if (err != MPI_SUCCESS) { + OPAL_OUTPUT((mca_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank)); + return (err); } -/* return OMPI_ERR_NOT_IMPLEMENTED; */ + dsize = sext * scount * size; /* needed for decision */ + + if (size >= 12 && dsize <= 768) { + return mca_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + } + else if (dsize <= 131072) { +/* not implemented yet.. need to find a 'nice' way to use the basic linear version without duplicating code */ +/* return mca_coll_tuned_alltoall_intra_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); */ + return mca_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + } + else { + return mca_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + } } diff --git a/ompi/mca/coll/tuned/coll_tuned_barrier.c b/ompi/mca/coll/tuned/coll_tuned_barrier.c index 77b3913ee9..4ec4887de2 100644 --- a/ompi/mca/coll/tuned/coll_tuned_barrier.c +++ b/ompi/mca/coll/tuned/coll_tuned_barrier.c @@ -224,6 +224,47 @@ int mca_coll_tuned_barrier_intra_linear(struct ompi_communicator_t *comm) return OMPI_ERR_NOT_IMPLEMENTED; } +/* The following are used by dynamic and forced rules */ + +/* publish details of each algorithm and if its forced/fixed/locked in */ +/* as you add methods/algorithms you must update this and the query/map routines */ + +int mca_coll_tuned_barrier_intra_check_forced ( ) +{ + 
+mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "barrier_algorithm", + "Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: step based bmtree", + false, false, mca_coll_tuned_barrier_forced_choice, + &mca_coll_tuned_barrier_forced_choice); + +return (MPI_SUCCESS); +} +int mca_coll_tuned_barrier_intra_query ( ) +{ + return (4); /* 4 algorithms available */ + /* 2 to do */ +} + + +int mca_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm) +{ +switch (mca_coll_tuned_barrier_forced_choice) { + case (0): return mca_coll_tuned_barrier_intra_dec_fixed (comm); +/* case (1): return mca_coll_tuned_barrier_intra_linear (comm); */ + case (2): return mca_coll_tuned_barrier_intra_doublering (comm); + case (3): return mca_coll_tuned_barrier_intra_recursivedoubling (comm); + case (4): return mca_coll_tuned_barrier_intra_bruck (comm); + case (5): return mca_coll_tuned_barrier_intra_two_procs (comm); +/* case (6): return mca_coll_tuned_barrier_intra_bmtree_step (comm); */ + default: + OPAL_OUTPUT((mca_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", + mca_coll_tuned_barrier_forced_choice, mca_coll_tuned_barrier_intra_query())); + return (MPI_ERR_ARG); + } /* switch */ + +} + diff --git a/ompi/mca/coll/tuned/coll_tuned_bcast.c b/ompi/mca/coll/tuned/coll_tuned_bcast.c index 461c230eb9..a1d38a3306 100644 --- a/ompi/mca/coll/tuned/coll_tuned_bcast.c +++ b/ompi/mca/coll/tuned/coll_tuned_bcast.c @@ -682,3 +682,65 @@ mca_coll_tuned_bcast_intra_bintree ( void* buffer, } +int mca_coll_tuned_bcast_intra_check_forced ( ) +{ + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "bcast_algorithm", + "Which bcast algorithm is used. 
Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: BM tree.", + false, false, mca_coll_tuned_bcast_forced_choice, + &mca_coll_tuned_bcast_forced_choice); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "bcast_algorithm_segmentsize", + "Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", + false, false, mca_coll_tuned_bcast_forced_segsize, + &mca_coll_tuned_bcast_forced_segsize); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "bcast_algorithm_tree_fanout", + "Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", + false, false, + mca_coll_tuned_init_tree_fanout, /* get system wide default */ + &mca_coll_tuned_bcast_forced_tree_fanout); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "bcast_algorithm_chain_fanout", + "Fanout for chains used for bcast algorithms. 
Only has meaning if algorithm is forced and supports chain topo based operation.", + false, false, + mca_coll_tuned_init_chain_fanout, /* get system wide default */ + &mca_coll_tuned_bcast_forced_chain_fanout); + +return (MPI_SUCCESS); +} + + +int mca_coll_tuned_bcast_intra_query ( ) +{ + return (4); /* 4 algorithms available */ + /* 2 left to implement + NEC version */ +} + + +int mca_coll_tuned_bcast_intra_do_forced(void *buf, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm) +{ +switch (mca_coll_tuned_bcast_forced_choice) { + case (0): return mca_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm); +/* case (1): return mca_coll_tuned_bcast_intra_linear (buf, count, dtype, root, comm); */ + case (2): return mca_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm, mca_coll_tuned_bcast_forced_segsize, mca_coll_tuned_bcast_forced_chain_fanout ); + case (3): return mca_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm, mca_coll_tuned_bcast_forced_segsize); + case (4): return mca_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm, mca_coll_tuned_bcast_forced_segsize); + case (5): return mca_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm, mca_coll_tuned_bcast_forced_segsize); +/* case (6): return mca_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm, + * mca_coll_tuned_bcast_forced_segsize); */ + default: + OPAL_OUTPUT((mca_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", + mca_coll_tuned_bcast_forced_choice, mca_coll_tuned_bcast_intra_query())); + return (MPI_ERR_ARG); + } /* switch */ + +} + diff --git a/ompi/mca/coll/tuned/coll_tuned_bcast_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_bcast_decision_fixed.c index 8b8a7d3c10..61c357669d 100644 --- a/ompi/mca/coll/tuned/coll_tuned_bcast_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_bcast_decision_fixed.c @@ -45,21 +45,56 @@ int 
mca_coll_tuned_bcast_intra_dec_fixed(void *buff, int count, int rank; int err; int contig; - int dsize; + int msgsize; + MPI_Aint ext; + long lb; + int segsize = 0; + OPAL_OUTPUT((mca_coll_tuned_stream,"mca_coll_tuned_bcast_intra_dec_fixed")); size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); -/* err = mca_coll_tuned_bcast_intra_linear (buff, count, datatype, root, comm); */ -/* err = mca_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, (0)); */ -/* err = mca_coll_tuned_bcast_intra_chain (buff, count, datatype, root, comm, (0), 1); */ -/* err = mca_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, (8192)); */ - err = mca_coll_tuned_bcast_intra_split_bintree (buff, count, datatype, root, comm, (100)); -/* err = mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, (100)); */ + /* else we need data size for decision function */ + err = ompi_ddt_get_extent (datatype, &lb, &ext); + if (err != MPI_SUCCESS) { + OPAL_OUTPUT((mca_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank)); + return (err); + } + + msgsize = ext * count; /* needed for decision */ + + /* this is based on gige measurements */ + + if ((size < 4)) { + segsize = 0; + return mca_coll_tuned_bcast_intra_linear (buff, count, datatype, root, comm); + } + else if (size == 4) { + if (msgsize < 524288) segsize = 0; + else segsize = 16384; /* segment large messages */ + return mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize); + } + else if (size > 4 && size <= 8 && msgsize < 4096) { + segsize = 0; + return mca_coll_tuned_bcast_intra_linear (buff, count, datatype, root, comm); + } + else if (size > 8 && msgsize >= 32768 && msgsize < 524288) { + segsize = 16384; + return mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize); + } + else if (size > 4 && msgsize >= 524288) { + segsize = 16384; + return mca_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, segsize); + } + else { + 
segsize = 0; + /* once tested can swap this back in */ +/* return mca_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */ + return mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize); + } - return err; } diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index bf76eb5234..c4e10cc4f7 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -46,6 +46,22 @@ int mca_coll_tuned_init_chain_fanout = 4; /* forced alogrithm variables */ int mca_coll_tuned_alltoall_forced_choice = 0; int mca_coll_tuned_alltoall_forced_segsize = 0; +int mca_coll_tuned_alltoall_forced_chain_fanout = 0; +int mca_coll_tuned_alltoall_forced_tree_fanout = 0; + +int mca_coll_tuned_barrier_forced_choice = 0; + +int mca_coll_tuned_bcast_forced_choice = 0; +int mca_coll_tuned_bcast_forced_segsize = 0; +int mca_coll_tuned_bcast_forced_chain_fanout = 0; +int mca_coll_tuned_bcast_forced_tree_fanout = 0; + +int mca_coll_tuned_reduce_forced_choice = 0; +int mca_coll_tuned_reduce_forced_segsize = 0; +int mca_coll_tuned_reduce_forced_chain_fanout = 0; +int mca_coll_tuned_reduce_forced_tree_fanout = 0; + + /* * Local function */ @@ -145,6 +161,20 @@ static int tuned_open(void) mca_coll_tuned_stream = opal_output_open(NULL); } } + + /* now check that the user hasn't overrode any of the decision functions */ + /* the user can do this before every comm dup/create if they like */ + /* this is useful for benchmarking and user knows best tuning */ + + /* intra functions first */ + mca_coll_tuned_alltoall_intra_check_forced(); + mca_coll_tuned_barrier_intra_check_forced(); + mca_coll_tuned_bcast_intra_check_forced(); + mca_coll_tuned_reduce_intra_check_forced(); + + + + OPAL_OUTPUT((mca_coll_tuned_stream, "coll:tuned:component_open: done!")); return OMPI_SUCCESS; diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce.c b/ompi/mca/coll/tuned/coll_tuned_reduce.c index 
d6f2b897a6..0a538130b0 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce.c @@ -327,3 +327,67 @@ int mca_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf, segsize, 1 ); } +/* The following are used by dynamic and forced rules */ + +/* publish details of each algorithm and if its forced/fixed/locked in */ +/* as you add methods/algorithms you must update this and the query/map routines */ + +int mca_coll_tuned_reduce_intra_check_forced ( ) +{ + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "reduce_algorithm", + "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline", + false, false, mca_coll_tuned_reduce_forced_choice, + &mca_coll_tuned_reduce_forced_choice); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "reduce_algorithm_segmentsize", + "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", + false, false, mca_coll_tuned_reduce_forced_segsize, + &mca_coll_tuned_reduce_forced_segsize); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "reduce_algorithm_tree_fanout", + "Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", + false, false, + mca_coll_tuned_init_tree_fanout, /* get system wide default */ + &mca_coll_tuned_reduce_forced_tree_fanout); + +mca_base_param_reg_int(&mca_coll_tuned_component.collm_version, + "reduce_algorithm_chain_fanout", + "Fanout for chains used for reduce algorithms. 
Only has meaning if algorithm is forced and supports chain topo based operation.", + false, false, + mca_coll_tuned_init_chain_fanout, /* get system wide default */ + &mca_coll_tuned_reduce_forced_chain_fanout); + +return (MPI_SUCCESS); +} + + +int mca_coll_tuned_reduce_intra_query ( ) +{ + return (3); /* 3 algorithms available */ +} + + +int mca_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm) +{ +switch (mca_coll_tuned_reduce_forced_choice) { + case (0): return mca_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm); +/* case (1): return mca_coll_tuned_reduce_intra_linear (sbuf, rbuf, count, dtype, op, root, comm); */ + case (2): return mca_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm, + mca_coll_tuned_reduce_forced_segsize, mca_coll_tuned_reduce_forced_chain_fanout); + case (3): return mca_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm, + mca_coll_tuned_reduce_forced_segsize); + default: + OPAL_OUTPUT((mca_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", + mca_coll_tuned_reduce_forced_choice, mca_coll_tuned_reduce_intra_query())); + return (MPI_ERR_ARG); + } /* switch */ + +} + diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision_fixed.c index 2727204ca6..997e878287 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision_fixed.c @@ -46,17 +46,47 @@ int mca_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf, int rank; int err; int contig; - int dsize; + int msgsize; + MPI_Aint ext; + long lb; + int segsize = 0; + int fanout = 0; + OPAL_OUTPUT((mca_coll_tuned_stream, "mca_coll_tuned_reduce_intra_dec_fixed")); size = ompi_comm_size(comm); rank = 
ompi_comm_rank(comm); -/* err = mca_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, (8192)); */ -/* err = mca_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, (8192)); */ - err = mca_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, (8192), 3); + /* need data size for decision function */ + err = ompi_ddt_get_extent (datatype, &lb, &ext); + if (err != MPI_SUCCESS) { + OPAL_OUTPUT((mca_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank)); + return (err); + } + + msgsize = ext * count; /* needed for decision */ + /* for small messages use linear algorithm */ + if (msgsize <= 4096) { + segsize = 0; + fanout = size-1; +/* when linear implemented or taken from basic put here, right now using chain as a linear system */ +/* return mca_coll_tuned_reduce_intra_linear (sendbuf, recvbuf, count, datatype, op, root, comm); */ + return mca_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); + } else if (msgsize <= 65536 ) { + segsize = 32768; + fanout = 8; + return mca_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); + } else if (msgsize < 524288) { + segsize = 1024; + fanout = size/2; +/* later swap this for a binary tree */ +/* fanout = 2; */ + return mca_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); + } else { + segsize = 1024; + return mca_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize); + } - return err; }