diff --git a/ompi/mca/coll/tuned/Makefile.am b/ompi/mca/coll/tuned/Makefile.am index 175bb95e88..87217a478a 100644 --- a/ompi/mca/coll/tuned/Makefile.am +++ b/ompi/mca/coll/tuned/Makefile.am @@ -36,6 +36,7 @@ sources = \ coll_tuned_barrier.c \ coll_tuned_bcast.c \ coll_tuned_reduce.c \ + coll_tuned_gather.c \ coll_tuned_component.c \ coll_tuned_module.c diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index 7544f56a8f..e71ea38aae 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -206,6 +206,11 @@ extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT]; /* Gather */ int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS); int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS); + int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS); + int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize); + int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); + int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS); + int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS); int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS); int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS); @@ -329,6 +334,10 @@ struct mca_coll_base_comm_t { ompi_coll_tree_t *cached_bmtree; int cached_bmtree_root; + /* binomial tree */ + ompi_coll_tree_t *cached_in_order_bmtree; + int cached_in_order_bmtree_root; + /* chained tree (fanout followed by pipelines) */ ompi_coll_tree_t *cached_chain; int cached_chain_root; @@ -387,6 +396,19 @@ do { } \ } while (0) +#define COLL_TUNED_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, ROOT ) \ +do { \ + mca_coll_base_comm_t* coll_comm = (OMPI_COMM)->c_coll_selected_data; \ + if( !( (coll_comm->cached_in_order_bmtree) \ + && (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \ + if( coll_comm->cached_in_order_bmtree ) { /* destroy previous binomial if defined */ \ + 
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \ + } \ + coll_comm->cached_in_order_bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \ + coll_comm->cached_in_order_bmtree_root = (ROOT); \ + } \ +} while (0) + #define COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, ROOT ) \ do { \ mca_coll_base_comm_t* coll_comm = (OMPI_COMM)->c_coll_selected_data; \ diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index f8df6c0de8..4cc993f6d9 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -191,6 +191,7 @@ static int tuned_open(void) ompi_coll_tuned_barrier_intra_check_forced_init(&ompi_coll_tuned_forced_params[BARRIER]); ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]); ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]); + ompi_coll_tuned_gather_intra_check_forced_init(&ompi_coll_tuned_forced_params[GATHER]); } OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_open: done!")); diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c index 27d7de2cc5..a25fc2dec0 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c @@ -301,3 +301,23 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount, comm); } +int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm) +{ + + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_dec_dynamic")); + + if (comm->c_coll_selected_data->user_forced[GATHER].algorithm) { + return ompi_coll_tuned_gather_intra_do_forced (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); + } + + return 
ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); +} diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 18f6eac79f..c10e9e5df1 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -496,3 +496,24 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount, rbuf, rcount, rdtype, comm); #endif /* defined(USE_MPICH2_DECISION) */ } +/* + * gather_intra_dec + * + * Function: - selects gather algorithm to use + * Accepts: - same arguments as MPI_Gather() + * Returns: - MPI_SUCCESS or error code, passed from corresponding + * internal gather function. + */ + +int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm) +{ + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_dec_fixed")); + return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); +} diff --git a/ompi/mca/coll/tuned/coll_tuned_gather.c b/ompi/mca/coll/tuned/coll_tuned_gather.c new file mode 100644 index 0000000000..43e9b32d8b --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned_gather.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/constants.h" +#include "ompi/datatype/datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/op/op.h" +#include "coll_tuned.h" +#include "coll_tuned_topo.h" +#include "coll_tuned_util.h" + + +int +ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm) +{ + int line = -1; + int i; + int rank; + int vrank; + int size; + int total_recv = 0; + char *ptmp = NULL; + char *tempbuf = NULL; + int err; + ompi_coll_tree_t* bmtree; + MPI_Status status; + MPI_Aint sextent, slb, strue_lb, strue_extent; + MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; + + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); + + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_binomial rank %d", rank)); + + /* create the binomial tree */ + COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, root ); + bmtree = comm->c_coll_selected_data->cached_in_order_bmtree; + + ompi_ddt_get_extent(sdtype, &slb, &sextent); + ompi_ddt_get_true_extent(sdtype, &strue_lb, &strue_extent); + ompi_ddt_get_extent(rdtype, &rlb, &rextent); + ompi_ddt_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); + + vrank = (rank - root + size) % size; + + if (rank == root) { + if (0 == root){ + /* root on 0, just use the recv buffer */ + ptmp = rbuf; + if (sbuf != MPI_IN_PLACE) { + err = ompi_ddt_sndrcv(sbuf, scount, sdtype, + ptmp, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } + } else { + /* root is not on 0, allocate temp buffer for recv, + * rotate data at the end */ + tempbuf = malloc(rtrue_extent + (rcount*size - 1) * rextent); + if (NULL == tempbuf) { + err= 
OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; + } + + ptmp = tempbuf - rlb; + if (sbuf != MPI_IN_PLACE) { + /* copy from sbuf to temp buffer */ + err = ompi_ddt_sndrcv(sbuf, scount, sdtype, + ptmp, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } else { + /* copy from rbuf to temp buffer */ + err = ompi_ddt_copy_content_same_ddt(rdtype, rcount, ptmp, + (char *) rbuf + rank*rextent*rcount); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } + } + total_recv = rcount; + } else if (!(vrank % 2)) { + /* other non-leaf nodes, allocate temp buffer for data received from + * children; a binomial tree should need at most half of the total + * elements here, but the buffer is sized for scount*size of them */ + tempbuf = malloc(strue_extent + (scount*size - 1) * sextent); + if (NULL == tempbuf) { + err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; + } + + ptmp = tempbuf - slb; + /* local copy to tempbuf */ + err = ompi_ddt_sndrcv(sbuf, scount, sdtype, + ptmp, scount, sdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + + /* use sdtype,scount as rdtype,rcount since they are ignored on + * non-root procs */ + rdtype = sdtype; + rcount = scount; + rextent = sextent; + total_recv = rcount; + } else { + /* leaf nodes, no temp buffer needed, use sdtype,scount as + * rdtype,rcount since they are ignored on non-root procs */ + ptmp = sbuf; + total_recv = scount; + } + + if (!(vrank % 2)) { + /* all non-leaf nodes recv from children */ + for (i = 0; i < bmtree->tree_nextsize; i++) { + int cur_count = 0; + + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_binomial rank %d recv %d ", + rank, bmtree->tree_next[i])); + + err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, rcount*size-total_recv, rdtype, + bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER, + comm, &status)); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + + /* the number of elements actually received may be smaller than 
the count + * passed to recv when comm_size is not power of 2 */ + MPI_Get_count(&status, rdtype, &cur_count); + total_recv += cur_count; + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_binomial received %d total %d\n", + cur_count, total_recv)); + } + } + + if (rank != root) { + /* all nodes except root send to parents */ + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n", + rank, bmtree->tree_prev, total_recv)); + + err = MCA_PML_CALL(send(ptmp, total_recv, sdtype, + bmtree->tree_prev, + MCA_COLL_BASE_TAG_GATHER, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } + + if (rank == root) { + if (root != 0) { + /* rotate received data on root if root != 0 */ + err = ompi_ddt_copy_content_same_ddt(rdtype, rcount*(size - root), + (char *) rbuf + rextent*root*rcount, ptmp); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + + + err = ompi_ddt_copy_content_same_ddt(rdtype, rcount*root, + rbuf, ptmp + rextent*rcount*(size-root)); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + + free(tempbuf); + } + } else if (!(vrank % 2)) { + /* other non-leaf nodes */ + free(tempbuf); + } + return MPI_SUCCESS; + + err_hndl: + if (NULL != tempbuf) + free(tempbuf); + + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank)); + return err; +} + +/* + * Linear functions are copied from the BASIC coll module + * they do not segment the message and are simple implementations + * but for some small number of nodes and/or small data sizes they + * are just as fast as tuned/tree based segmenting operations + * and as such may be selected by the decision functions + * These are copied into this module due to the way we select modules + * in V1. i.e. in V2 we will handle this differently and so will not + * have to duplicate code. 
+ * JPG following the examples from other coll_tuned implementations. Dec06. + */ + +/* copied function (with appropriate renaming) starts here */ +/* + * gather_intra + * + * Function: - basic gather operation + * Accepts: - same arguments as MPI_Gather() + * Returns: - MPI_SUCCESS or error code + */ +int +ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm) +{ + int i; + int err; + int rank; + int size; + char *ptmp; + MPI_Aint incr; + MPI_Aint extent; + MPI_Aint lb; + + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); + + /* Everyone but root sends data and returns. */ + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_basic_linear rank %d", rank)); + + if (rank != root) { + return MCA_PML_CALL(send(sbuf, scount, sdtype, root, + MCA_COLL_BASE_TAG_GATHER, + MCA_PML_BASE_SEND_STANDARD, comm)); + } + + /* I am the root, loop receiving the data. 
*/ + + ompi_ddt_get_extent(rdtype, &lb, &extent); + incr = extent * rcount; + for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) { + if (i == rank) { + if (MPI_IN_PLACE != sbuf) { + err = ompi_ddt_sndrcv(sbuf, scount, sdtype, + ptmp, rcount, rdtype); + } else { + err = MPI_SUCCESS; + } + } else { + err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, i, + MCA_COLL_BASE_TAG_GATHER, + comm, MPI_STATUS_IGNORE)); + } + if (MPI_SUCCESS != err) { + return err; + } + } + + /* All done */ + + return MPI_SUCCESS; +} + + +/* copied function (with appropriate renaming) ends here */ + +/* The following are used by dynamic and forced rules */ + +/* publish details of each algorithm and if its forced/fixed/locked in */ +/* as you add methods/algorithms you must update this and the query/map + routines */ + +/* this routine is called by the component only */ +/* this makes sure that the mca parameters are set to their initial values + and perms */ +/* module does not call this they call the forced_getvalues routine instead */ + +int +ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) +{ + int rc, max_alg = 2, requested_alg; + + ompi_coll_tuned_forced_max_algorithms[GATHER] = max_alg; + + rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version, + "gather_algorithm_count", + "Number of gather algorithms available", + false, true, max_alg, NULL); + + mca_param_indices->algorithm_param_index + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "gather_algorithm", + "Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.", + false, false, 0, NULL); + mca_base_param_lookup_int(mca_param_indices->algorithm_param_index, + &(requested_alg)); + if( requested_alg > max_alg ) { + if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { + opal_output( 0, "Gather algorithm #%d is not available (range [0..%d]). 
Switching back to ignore(0)\n", + requested_alg, max_alg ); + } + mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0); + } + + mca_param_indices->segsize_param_index + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "gather_algorithm_segmentsize", + "Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.", + false, false, 0, NULL); + + mca_param_indices->tree_fanout_param_index + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "gather_algorithm_tree_fanout", + "Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.", + false, false, + ompi_coll_tuned_init_tree_fanout, /* get system wide default */ + NULL); + + mca_param_indices->chain_fanout_param_index + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "gather_algorithm_chain_fanout", + "Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. 
Currently, available algorithms do not support chain topologies.", + false, false, + ompi_coll_tuned_init_chain_fanout, /* get system wide default */ + NULL); + + return (MPI_SUCCESS); +} + +int +ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm) +{ + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:gather_intra_do_forced selected algorithm %d", + comm->c_coll_selected_data->user_forced[GATHER].algorithm)); + + switch (comm->c_coll_selected_data->user_forced[GATHER].algorithm) { + case (0): + return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); + case (1): + return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); + case (2): + return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); + default: + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", + comm->c_coll_selected_data->user_forced[GATHER].algorithm, + ompi_coll_tuned_forced_max_algorithms[GATHER])); + return (MPI_ERR_ARG); + } /* switch */ +} + +int +ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm, + int algorithm, int faninout, int segsize) +{ + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d", + algorithm, faninout, segsize)); + + switch (algorithm) { + case (0): + return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); + case (1): + return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); + 
case (2): + return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm); + default: + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, + ompi_coll_tuned_forced_max_algorithms[GATHER])); + return (MPI_ERR_ARG); + } /* switch */ +} diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index 866af96bfa..1bb78e3d8d 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -67,10 +67,10 @@ static const mca_coll_base_module_1_0_0_t intra_fixed = { ompi_coll_tuned_barrier_intra_dec_fixed, /* NULL, */ ompi_coll_tuned_bcast_intra_dec_fixed, - /* NULL, */ + /* NULL, */ /* ompi_coll_tuned_exscan_intra_dec_fixed, */ NULL, - /* ompi_coll_tuned_gather_intra_dec_fixed, */ + /* ompi_coll_tuned_gather_intra_dec_fixed, */ NULL, /* ompi_coll_tuned_gatherv_intra_dec_fixed, */ NULL, @@ -113,8 +113,8 @@ static const mca_coll_base_module_1_0_0_t intra_dynamic = { /* NULL, */ /* ompi_coll_tuned_exscan_intra_dec_dynamic, */ NULL, - /* ompi_coll_tuned_gather_intra_dec_dynamic, */ - NULL, + ompi_coll_tuned_gather_intra_dec_dynamic, + /* NULL, */ /* ompi_coll_tuned_gatherv_intra_dec_dynamic, */ NULL, ompi_coll_tuned_reduce_intra_dec_dynamic, @@ -400,6 +400,7 @@ ompi_coll_tuned_module_init(struct ompi_communicator_t *comm) ompi_coll_tuned_forced_getvalues_barrier (ompi_coll_tuned_forced_params[BARRIER], &(data->user_forced[BARRIER])); ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[BCAST], &(data->user_forced[BCAST])); ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[REDUCE], &(data->user_forced[REDUCE])); + ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[GATHER], &(data->user_forced[GATHER])); } @@ -468,6 +469,9 @@ ompi_coll_tuned_module_init(struct ompi_communicator_t *comm) data->cached_bmtree = 
ompi_coll_tuned_topo_build_bmtree (comm, 0); data->cached_bmtree_root = 0; + /* binomial tree */ + data->cached_in_order_bmtree = ompi_coll_tuned_topo_build_in_order_bmtree (comm, 0); + data->cached_in_order_bmtree_root = 0; /* * chains (fanout followed by pipelines) * are more difficuilt as the fan out really really depends on message size [sometimes].. @@ -523,6 +527,9 @@ int ompi_coll_tuned_module_finalize(struct ompi_communicator_t *comm) if (comm->c_coll_selected_data->cached_bmtree) { /* destroy bmtree if defined */ ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_bmtree); } + if (comm->c_coll_selected_data->cached_in_order_bmtree) { /* destroy bmtree if defined */ + ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_in_order_bmtree); + } if (comm->c_coll_selected_data->cached_chain) { /* destroy general chain if defined */ ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_chain); } diff --git a/ompi/mca/coll/tuned/coll_tuned_topo.c b/ompi/mca/coll/tuned/coll_tuned_topo.c index dff6d9a2b5..357f15ee63 100644 --- a/ompi/mca/coll/tuned/coll_tuned_topo.c +++ b/ompi/mca/coll/tuned/coll_tuned_topo.c @@ -307,6 +307,18 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree ) return OMPI_SUCCESS; } +/* + * + * Here are some of the examples of this tree: + * size == 2 size = 4 size = 8 + * 0 0 0 + * / | \ / | \ + * 1 2 1 4 2 1 + * | | |\ + * 3 6 5 3 + * | + * 7 + */ ompi_coll_tree_t* ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, int root ) @@ -375,6 +387,82 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, return bmtree; } +/* + * Constructs in-order binomial tree which can be used for gather/scatter + * operations. 
+ * Returns NULL on allocation failure or when MAXTREEFANOUT children are reached. + * Here are some of the examples of this tree: + * size == 2 size = 4 size = 8 + * 0 0 0 + * / / | / | \ + * 1 1 2 1 2 4 + * | | | \ + * 3 3 5 6 + * | + * 7 + */ +ompi_coll_tree_t* +ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, + int root ) +{ + int childs = 0; + int rank, vrank; + int size; + int mask = 1; + int remote; + ompi_coll_tree_t *bmtree; + int i; + + OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree rt %d", root)); + + /* + * Get size and rank of the process in this communicator + */ + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); + + vrank = (rank - root + size) % size; + + bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); + if (!bmtree) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree PANIC out of memory")); + return NULL; + } + + bmtree->tree_bmtree = 1; + bmtree->tree_root = MPI_UNDEFINED; + bmtree->tree_nextsize = MPI_UNDEFINED; + for(i=0;i<MAXTREEFANOUT;i++) { + bmtree->tree_next[i] = -1; + } + + if (root == rank) { + bmtree->tree_prev = root; + } + + while (mask < size) { + remote = vrank ^ mask; + if (remote < vrank) { + bmtree->tree_prev = (remote + root) % size; + break; + } else if (remote < size) { + bmtree->tree_next[childs] = (remote + root) % size; + childs++; + if (childs==MAXTREEFANOUT) { + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:topo:build_in_order_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs)); + free(bmtree); + return NULL; + } + } + mask <<= 1; + } + bmtree->tree_nextsize = childs; + bmtree->tree_root = root; + + return bmtree; +} + ompi_coll_tree_t* ompi_coll_tuned_topo_build_chain( int fanout, diff --git a/ompi/mca/coll/tuned/coll_tuned_topo.h b/ompi/mca/coll/tuned/coll_tuned_topo.h index a1c0817334..33e1ef0e01 100644 --- a/ompi/mca/coll/tuned/coll_tuned_topo.h +++ b/ompi/mca/coll/tuned/coll_tuned_topo.h @@ -46,7 +46,9 @@ extern "C" { ompi_coll_tree_t* ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, int root ); - + 
ompi_coll_tree_t* + ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, + int root ); ompi_coll_tree_t* ompi_coll_tuned_topo_build_chain( int fanout, struct ompi_communicator_t* com,