From aa019e239e4b408a87f8d2442d0d0476b5363080 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sun, 15 Feb 2015 14:47:27 -0500 Subject: [PATCH] Rename the base header file containing the prototypes of the collective functions. --- ompi/mca/coll/base/Makefile.am | 21 +- ompi/mca/coll/base/base.h | 2 +- ompi/mca/coll/base/coll_base.h | 558 ------------------ ompi/mca/coll/base/coll_base_allgather.c | 383 +++--------- ompi/mca/coll/base/coll_base_allgatherv.c | 255 ++------ ompi/mca/coll/base/coll_base_allreduce.c | 428 +++++--------- ompi/mca/coll/base/coll_base_alltoall.c | 333 +++-------- ompi/mca/coll/base/coll_base_alltoallv.c | 195 +----- ompi/mca/coll/base/coll_base_barrier.c | 242 ++------ ompi/mca/coll/base/coll_base_bcast.c | 450 +++++--------- ompi/mca/coll/base/coll_base_frame.c | 58 +- ompi/mca/coll/base/coll_base_functions.h | 341 +++++++++++ ompi/mca/coll/base/coll_base_gather.c | 265 ++------- ompi/mca/coll/base/coll_base_reduce.c | 468 +++++---------- ompi/mca/coll/base/coll_base_reduce_scatter.c | 284 +++------ ompi/mca/coll/base/coll_base_scatter.c | 199 +------ ompi/mca/coll/base/coll_base_topo.c | 185 +++--- ompi/mca/coll/base/coll_base_topo.h | 27 +- ompi/mca/coll/base/coll_base_util.c | 14 +- ompi/mca/coll/base/coll_base_util.h | 12 +- ompi/mca/coll/coll.h | 3 + 21 files changed, 1400 insertions(+), 3323 deletions(-) delete mode 100644 ompi/mca/coll/base/coll_base.h create mode 100644 ompi/mca/coll/base/coll_base_functions.h diff --git a/ompi/mca/coll/base/Makefile.am b/ompi/mca/coll/base/Makefile.am index ca608693ca..10524b60b9 100644 --- a/ompi/mca/coll/base/Makefile.am +++ b/ompi/mca/coll/base/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2015 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -20,10 +20,25 @@ dist_ompidata_DATA = base/help-mca-coll-base.txt headers += \ base/base.h \ - base/coll_tags.h + base/coll_tags.h \ + base/coll_base_topo.h \ + base/coll_base_util.h libmca_coll_la_SOURCES += \ base/coll_base_comm_select.c \ base/coll_base_comm_unselect.c \ base/coll_base_find_available.c \ - base/coll_base_frame.c + base/coll_base_frame.c \ + base/coll_base_bcast.c \ + base/coll_base_scatter.c \ + base/coll_base_topo.c \ + base/coll_base_allgather.c \ + base/coll_base_allgatherv.c \ + base/coll_base_util.c \ + base/coll_base_allreduce.c \ + base/coll_base_alltoall.c \ + base/coll_base_gather.c \ + base/coll_base_alltoallv.c \ + base/coll_base_reduce.c \ + base/coll_base_barrier.c \ + base/coll_base_reduce_scatter.c diff --git a/ompi/mca/coll/base/base.h b/ompi/mca/coll/base/base.h index 1c9a95c180..3d54de22bf 100644 --- a/ompi/mca/coll/base/base.h +++ b/ompi/mca/coll/base/base.h @@ -87,7 +87,7 @@ int mca_coll_base_find_available(bool enable_progress_threads, * coll component needs to be selected for it. It should be invoked * near the end of the communicator creation process such that * almost everything else is functional on the communicator (e.g., - * point-to-point communication). + * point-to-point communication). * * Note that new communicators may be created as a result of * invoking this function. 
Specifically: this function is called in diff --git a/ompi/mca/coll/base/coll_base.h b/ompi/mca/coll/base/coll_base.h deleted file mode 100644 index f044a60375..0000000000 --- a/ompi/mca/coll/base/coll_base.h +++ /dev/null @@ -1,558 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2009 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_COLL_TUNED_EXPORT_H -#define MCA_COLL_TUNED_EXPORT_H - -#include "ompi_config.h" - -#include "mpi.h" -#include "opal/mca/mca.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/request/request.h" - -/* need to include our own topo prototypes so we can malloc data on the comm correctly */ -#include "coll_tuned_topo.h" - -/* also need the dynamic rule structures */ -#include "coll_tuned_dynamic_rules.h" - -/* some fixed value index vars to simplify certain operations */ -typedef enum COLLTYPE { - ALLGATHER = 0, /* 0 */ - ALLGATHERV, /* 1 */ - ALLREDUCE, /* 2 */ - ALLTOALL, /* 3 */ - ALLTOALLV, /* 4 */ - ALLTOALLW, /* 5 */ - BARRIER, /* 6 */ - BCAST, /* 7 */ - EXSCAN, /* 8 */ - GATHER, /* 9 */ - GATHERV, /* 10 */ - REDUCE, /* 11 */ - REDUCESCATTER, /* 12 */ - SCAN, /* 13 */ - SCATTER, /* 14 */ - SCATTERV, /* 15 */ - COLLCOUNT /* 16 end counter keep it as last element */ -} COLLTYPE_T; - -/* defined arg lists to simply auto inclusion of user overriding decision functions */ -#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct 
ompi_communicator_t *comm, mca_coll_base_module_t *module -#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module -/* end defined arg lists to simply auto inclusion of user overriding decision functions */ - -BEGIN_C_DECLS - -/* these are the same across all modules and are loaded at component query time */ -extern int ompi_coll_tuned_stream; -extern int ompi_coll_tuned_priority; -extern int ompi_coll_tuned_preallocate_memory_comm_size_limit; -extern bool ompi_coll_tuned_use_dynamic_rules; -extern char* ompi_coll_tuned_dynamic_rules_filename; -extern int ompi_coll_tuned_init_tree_fanout; -extern int ompi_coll_tuned_init_chain_fanout; -extern int ompi_coll_tuned_init_max_requests; -extern int ompi_coll_tuned_alltoall_small_msg; -extern int ompi_coll_tuned_alltoall_intermediate_msg; - -/* forced algorithm choices */ -/* this structure is for storing the indexes to the forced algorithm mca params... */ -/* we get these at component query (so that registered values appear in ompi_infoi) */ -struct coll_tuned_force_algorithm_mca_param_indices_t { - int algorithm_param_index; /* which algorithm you want to force */ - int segsize_param_index; /* segsize to use (if supported), 0 = no segmentation */ - int tree_fanout_param_index; /* tree fanout/in to use */ - int chain_fanout_param_index; /* K-chain fanout/in to use */ - int max_requests_param_index; /* Maximum number of outstanding send or recv requests */ -}; -typedef struct coll_tuned_force_algorithm_mca_param_indices_t coll_tuned_force_algorithm_mca_param_indices_t; - - -/* the following type is for storing actual value obtained from the MCA on each tuned module */ -/* via their mca param indices lookup in the component */ -/* this structure is stored once per collective type per communicator... 
*/ -struct coll_tuned_force_algorithm_params_t { - int algorithm; /* which algorithm you want to force */ - int segsize; /* segsize to use (if supported), 0 = no segmentation */ - int tree_fanout; /* tree fanout/in to use */ - int chain_fanout; /* K-chain fanout/in to use */ - int max_requests; /* Maximum number of outstanding send or recv requests */ -}; -typedef struct coll_tuned_force_algorithm_params_t coll_tuned_force_algorithm_params_t; - -/* the indices to the MCA params so that modules can look them up at open / comm create time */ -extern coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT]; -/* the actual max algorithm values (readonly), loaded at component open */ -extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT]; - -/* - * coll API functions - */ - -/* API functions */ - -int ompi_coll_tuned_init_query(bool enable_progress_threads, - bool enable_mpi_threads); - -mca_coll_base_module_t * -ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority); - -/* API functions of decision functions and any implementations */ - -/* - * Note this gets long as we have to have a prototype for each - * MPI collective 4 times.. 2 for the comm type and 2 for each decision - * type. - * we might cut down the decision prototypes by conditional compiling - */ - -/* All Gather */ -int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_do_forced(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize); -int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_recursivedoubling(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_ring(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_neighborexchange(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_basic_linear(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_intra_two_procs(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_inter_dec_fixed(ALLGATHER_ARGS); -int ompi_coll_tuned_allgather_inter_dec_dynamic(ALLGATHER_ARGS); - -/* All GatherV */ -int ompi_coll_tuned_allgatherv_intra_dec_fixed(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_intra_dec_dynamic(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_intra_do_forced(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_intra_do_this(ALLGATHERV_ARGS, int algorithm, int faninout, int segsize); -int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_allgatherv_intra_bruck(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_intra_ring(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_intra_basic_default(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_intra_two_procs(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_inter_dec_fixed(ALLGATHERV_ARGS); -int ompi_coll_tuned_allgatherv_inter_dec_dynamic(ALLGATHERV_ARGS); - -/* All Reduce */ -int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize); -int 
ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_intra_ring(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize); -int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS); -int ompi_coll_tuned_allreduce_inter_dec_dynamic(ALLREDUCE_ARGS); - -/* AlltoAll */ -int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS); -int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS); -int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS); -int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize, int max_requests); -int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS); -int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS); -int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS); -int ompi_coll_tuned_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests); -int ompi_coll_tuned_alltoall_intra_two_procs(ALLTOALL_ARGS); -int ompi_coll_tuned_alltoall_inter_dec_fixed(ALLTOALL_ARGS); -int ompi_coll_tuned_alltoall_inter_dec_dynamic(ALLTOALL_ARGS); - -/* AlltoAllV */ -int ompi_coll_tuned_alltoallv_intra_dec_fixed(ALLTOALLV_ARGS); -int ompi_coll_tuned_alltoallv_intra_dec_dynamic(ALLTOALLV_ARGS); -int ompi_coll_tuned_alltoallv_intra_do_forced(ALLTOALLV_ARGS); -int ompi_coll_tuned_alltoallv_intra_do_this(ALLTOALLV_ARGS, int algorithm); -int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_alltoallv_intra_pairwise(ALLTOALLV_ARGS); -int ompi_coll_tuned_alltoallv_intra_basic_linear(ALLTOALLV_ARGS); -int ompi_coll_tuned_alltoallv_inter_dec_fixed(ALLTOALLV_ARGS); -int ompi_coll_tuned_alltoallv_inter_dec_dynamic(ALLTOALLV_ARGS); - -/* AlltoAllW */ -int ompi_coll_tuned_alltoallw_intra_dec_fixed(ALLTOALLW_ARGS); -int ompi_coll_tuned_alltoallw_intra_dec_dynamic(ALLTOALLW_ARGS); -int ompi_coll_tuned_alltoallw_inter_dec_fixed(ALLTOALLW_ARGS); -int ompi_coll_tuned_alltoallw_inter_dec_dynamic(ALLTOALLW_ARGS); - -/* Barrier */ -int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int faninout, int segsize); -int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS); -int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_two_procs(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_linear(BARRIER_ARGS); -int ompi_coll_tuned_barrier_intra_tree(BARRIER_ARGS); - -/* Bcast */ -int ompi_coll_tuned_bcast_intra_generic( BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree ); -int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS); -int 
ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS); -int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS); -int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize); -int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS); -int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains); -int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize); -int ompi_coll_tuned_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize); -int ompi_coll_tuned_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize); -int ompi_coll_tuned_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize); -int ompi_coll_tuned_bcast_inter_dec_fixed(BCAST_ARGS); -int ompi_coll_tuned_bcast_inter_dec_dynamic(BCAST_ARGS); - -/* Exscan */ -int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS); -int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS); -int ompi_coll_tuned_exscan_inter_dec_fixed(EXSCAN_ARGS); -int ompi_coll_tuned_exscan_inter_dec_dynamic(EXSCAN_ARGS); - -/* Gather */ -int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS); -int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS); -int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS); -int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize); -int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS); -int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS); -int ompi_coll_tuned_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size); -int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS); -int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS); - -/* GatherV */ -int ompi_coll_tuned_gatherv_intra_dec_fixed(GATHERV_ARGS); -int ompi_coll_tuned_gatherv_intra_dec_dynamic(GATHER_ARGS); -int ompi_coll_tuned_gatherv_inter_dec_fixed(GATHER_ARGS); -int ompi_coll_tuned_gatherv_inter_dec_dynamic(GATHER_ARGS); - -/* Reduce */ -int ompi_coll_tuned_reduce_generic( REDUCE_ARGS, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs ); -int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS); -int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS); -int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS); -int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs); -int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS); -int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs ); -int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); -int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); -int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); -int ompi_coll_tuned_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); -int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS); -int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS); - -/* Reduce_scatter */ -int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS); -int 
ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS); -int ompi_coll_tuned_reduce_scatter_intra_do_forced(REDUCESCATTER_ARGS); -int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize); -int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS); -int ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS); -int ompi_coll_tuned_reduce_scatter_intra_ring(REDUCESCATTER_ARGS); - -int ompi_coll_tuned_reduce_scatter_inter_dec_fixed(REDUCESCATTER_ARGS); -int ompi_coll_tuned_reduce_scatter_inter_dec_dynamic(REDUCESCATTER_ARGS); - -/* Scan */ -int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS); -int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS); -int ompi_coll_tuned_scan_inter_dec_fixed(SCAN_ARGS); -int ompi_coll_tuned_scan_inter_dec_dynamic(SCAN_ARGS); - -/* Scatter */ -int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS); -int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS); -int ompi_coll_tuned_scatter_intra_do_forced(SCATTER_ARGS); -int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize); -int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); -int ompi_coll_tuned_scatter_intra_basic_linear(SCATTER_ARGS); -int ompi_coll_tuned_scatter_intra_binomial(SCATTER_ARGS); -int ompi_coll_tuned_scatter_inter_dec_fixed(SCATTER_ARGS); -int ompi_coll_tuned_scatter_inter_dec_dynamic(SCATTER_ARGS); - -/* ScatterV */ -int ompi_coll_tuned_scatterv_intra_dec_fixed(SCATTERV_ARGS); -int ompi_coll_tuned_scatterv_intra_dec_dynamic(SCATTERV_ARGS); -int ompi_coll_tuned_scatterv_inter_dec_fixed(SCATTERV_ARGS); -int ompi_coll_tuned_scatterv_inter_dec_dynamic(SCATTERV_ARGS); - -int mca_coll_tuned_ft_event(int state); - - -/* Utility functions */ - -static inline void ompi_coll_tuned_free_reqs(ompi_request_t **reqs, int count) -{ - int i; - for (i = 0; i < count; ++i) - ompi_request_free(&reqs[i]); -} - -struct mca_coll_tuned_component_t { - /** Base coll component */ - mca_coll_base_component_2_0_0_t super; - - /** MCA parameter: Priority of this component */ - int tuned_priority; - - /** global stuff that I need the component to store */ - - /* MCA parameters first */ - - /* cached decision table stuff (moved from MCW module) */ - ompi_coll_alg_rule_t *all_base_rules; -}; -/** - * Convenience typedef - */ -typedef struct mca_coll_tuned_component_t mca_coll_tuned_component_t; - -/** - * Global component instance - */ -OMPI_MODULE_DECLSPEC extern mca_coll_tuned_component_t mca_coll_tuned_component; - -/* - * Data structure for hanging data off the communicator - * i.e. 
per module instance - */ -struct mca_coll_tuned_comm_t { - /* standard data for requests and PML usage */ - - /* Precreate space for requests - * Note this does not effect basic, - * but if in wrong context can confuse a debugger - * this is controlled by an MCA param - */ - - ompi_request_t **mcct_reqs; - int mcct_num_reqs; - - /* - * tuned topo information caching per communicator - * - * for each communicator we cache the topo information so we can - * reuse without regenerating if we change the root, [or fanout] - * then regenerate and recache this information - */ - - /* general tree with n fan out */ - ompi_coll_tree_t *cached_ntree; - int cached_ntree_root; - int cached_ntree_fanout; - - /* binary tree */ - ompi_coll_tree_t *cached_bintree; - int cached_bintree_root; - - /* binomial tree */ - ompi_coll_tree_t *cached_bmtree; - int cached_bmtree_root; - - /* binomial tree */ - ompi_coll_tree_t *cached_in_order_bmtree; - int cached_in_order_bmtree_root; - - /* chained tree (fanout followed by pipelines) */ - ompi_coll_tree_t *cached_chain; - int cached_chain_root; - int cached_chain_fanout; - - /* pipeline */ - ompi_coll_tree_t *cached_pipeline; - int cached_pipeline_root; - - /* in-order binary tree (root of the in-order binary tree is rank 0) */ - ompi_coll_tree_t *cached_in_order_bintree; - - /* moving to the component */ - ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */ - - /* for forced algorithms we store the information on the module */ - /* previously we only had one shared copy, ops, it really is per comm/module */ - coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT]; -}; -typedef struct mca_coll_tuned_comm_t mca_coll_tuned_comm_t; - -struct mca_coll_tuned_module_t { - mca_coll_base_module_t super; - - mca_coll_tuned_comm_t *tuned_data; -}; -typedef struct mca_coll_tuned_module_t mca_coll_tuned_module_t; -OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t); - -static inline void mca_coll_tuned_free_reqs(ompi_request_t ** reqs, - int count) -{ - int i; - for (i = 0; i < count; ++i) - ompi_request_free(reqs + i); -} - -END_C_DECLS - -#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \ -do { \ - mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \ - if( !( (coll_comm->cached_bintree) \ - && (coll_comm->cached_bintree_root == (ROOT)) ) ) { \ - if( coll_comm->cached_bintree ) { /* destroy previous binomial if defined */ \ - ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bintree) ); \ - } \ - coll_comm->cached_bintree = ompi_coll_tuned_topo_build_tree(2,(OMPI_COMM),(ROOT)); \ - coll_comm->cached_bintree_root = (ROOT); \ - } \ -} while (0) - -#define COLL_TUNED_UPDATE_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \ -do { \ - mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \ - if( !( (coll_comm->cached_bmtree) \ - && (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \ - if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \ - ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \ - } \ - coll_comm->cached_bmtree = ompi_coll_tuned_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \ - coll_comm->cached_bmtree_root = (ROOT); \ - } \ -} while (0) - -#define COLL_TUNED_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \ -do { \ - mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \ - if( !( (coll_comm->cached_in_order_bmtree) \ - && (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \ - if( coll_comm->cached_in_order_bmtree 
) { /* destroy previous binomial if defined */ \ - ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \ - } \ - coll_comm->cached_in_order_bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \ - coll_comm->cached_in_order_bmtree_root = (ROOT); \ - } \ -} while (0) - -#define COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, TUNED_MODULE, ROOT ) \ -do { \ - mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \ - if( !( (coll_comm->cached_pipeline) \ - && (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \ - if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \ - ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \ - } \ - coll_comm->cached_pipeline = ompi_coll_tuned_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \ - coll_comm->cached_pipeline_root = (ROOT); \ - } \ -} while (0) - -#define COLL_TUNED_UPDATE_CHAIN( OMPI_COMM, TUNED_MODULE, ROOT, FANOUT ) \ -do { \ - mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \ - if( !( (coll_comm->cached_chain) \ - && (coll_comm->cached_chain_root == (ROOT)) \ - && (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \ - if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \ - ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_chain) ); \ - } \ - coll_comm->cached_chain = ompi_coll_tuned_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \ - coll_comm->cached_chain_root = (ROOT); \ - coll_comm->cached_chain_fanout = (FANOUT); \ - } \ -} while (0) - -#define COLL_TUNED_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, TUNED_MODULE ) \ -do { \ - mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \ - if( !(coll_comm->cached_in_order_bintree) ) { \ - /* In-order binary tree topology is defined by communicator size */ \ - /* Thus, there is no need to destroy anything */ \ - coll_comm->cached_in_order_bintree = \ - ompi_coll_tuned_topo_build_in_order_bintree((OMPI_COMM)); \ - } \ -} while (0) - -/** - * This macro give a generic way to compute the best count of - * the segment (i.e. the number of complete datatypes that - * can fit in the specified SEGSIZE). Beware, when this macro - * is called, the SEGCOUNT should be initialized to the count as - * expected by the collective call. - */ -#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT) \ - if( ((SEGSIZE) >= (TYPELNG)) && \ - ((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) { \ - size_t residual; \ - (SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG)); \ - residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG); \ - if( residual > ((TYPELNG) >> 1) ) \ - (SEGCOUNT)++; \ - } \ - -/** - * This macro gives a generic wait to compute the well distributed block counts - * when the count and number of blocks are fixed. - * Macro returns "early-block" count, "late-block" count, and "split-index" - * which is the block at which we switch from "early-block" count to - * the "late-block" count. - * count = split_index * early_block_count + - * (block_count - split_index) * late_block_count - * We do not perform ANY error checks - make sure that the input values - * make sense (eg. count > num_blocks). 
- */ -#define COLL_TUNED_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX, \ - EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \ - EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS; \ - SPLIT_INDEX = COUNT % NUM_BLOCKS; \ - if (0 != SPLIT_INDEX) { \ - EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1; \ - } \ - - -#endif /* MCA_COLL_TUNED_EXPORT_H */ - diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c index 4fe5c5eca3..6c90b10fa5 100644 --- a/ompi/mca/coll/base/coll_base_allgather.c +++ b/ompi/mca/coll/base/coll_base_allgather.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -30,31 +30,12 @@ #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" - -/* allgather algorithm variables */ -static int coll_tuned_allgather_algorithm_count = 6; -static int coll_tuned_allgather_forced_algorithm = 0; -static int coll_tuned_allgather_segment_size = 0; -static int coll_tuned_allgather_tree_fanout; -static int coll_tuned_allgather_chain_fanout; - -/* valid values for coll_tuned_allgather_forced_algorithm */ -static mca_base_var_enum_value_t allgather_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "bruck"}, - {3, "recursive_doubling"}, - {4, "ring"}, - {5, "neighbor"}, - {6, "two_proc"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" /* - * ompi_coll_tuned_allgather_intra_bruck + * ompi_coll_base_allgather_intra_bruck * * Function: allgather using O(log(N)) steps. * Accepts: Same arguments as MPI_Allgather @@ -65,7 +46,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = { * in Multiport Message-Passing Systems" * Memory requirements: non-zero ranks require shift buffer to perform final * step in the algorithm. 
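
[Illustrative aside, not part of the patch.] The two helper macros above, COLL_TUNED_COMPUTED_SEGCOUNT and COLL_TUNED_COMPUTE_BLOCKCOUNT, can be exercised in isolation. The following stand-alone sketch rewrites both as functions and runs one worked example each; the sample values (segment size 1000, datatype length 48, count 10 split over 4 blocks) are arbitrary choices, not taken from the patch. The Bruck allgather walk-through resumes below.

/* Stand-alone demo of the two helper macros above, rewritten as
 * functions for readability; values are illustrative only. */
#include <stdio.h>
#include <stddef.h>

/* COLL_TUNED_COMPUTED_SEGCOUNT: how many whole datatypes fit in a
 * segment, rounded up when the leftover exceeds half a datatype.
 * segcount must enter holding the collective's full count. */
static void computed_segcount(size_t segsize, size_t typelng, int *segcount)
{
    if (segsize >= typelng && segsize < typelng * (size_t)*segcount) {
        *segcount = (int)(segsize / typelng);
        if (segsize - (size_t)*segcount * typelng > typelng / 2)
            (*segcount)++;
    }
}

/* COLL_TUNED_COMPUTE_BLOCKCOUNT: split count into num_blocks blocks;
 * the first split blocks carry early elements, the rest carry late. */
static void compute_blockcount(int count, int num_blocks,
                               int *split, int *early, int *late)
{
    *early = *late = count / num_blocks;
    *split = count % num_blocks;
    if (0 != *split)
        (*early)++;
}

int main(void)
{
    int segcount = 100;                 /* the collective's full count */
    computed_segcount(1000, 48, &segcount);
    printf("segcount = %d\n", segcount); /* 1000/48 = 20 rem 40 > 24 -> 21 */

    int split, early, late;
    compute_blockcount(10, 4, &split, &early, &late);
    /* 10 = split*early + (4 - split)*late = 2*3 + 2*2 */
    printf("split=%d early=%d late=%d\n", split, early, late);
    return 0;
}
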
- * + * * Example on 6 nodes: * Initialization: everyone has its own buffer at location 0 in rbuf * This means if user specified MPI_IN_PLACE for sendbuf @@ -84,7 +65,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = { * [2] [3] [4] [5] [0] [1] * [3] [4] [5] [0] [1] [2] * Step 2: send message to (rank - 2^2), receive message from (rank + 2^2) - * message size is "all remaining blocks" + * message size is "all remaining blocks" * # 0 1 2 3 4 5 * [0] [1] [2] [3] [4] [5] * [1] [2] [3] [4] [5] [0] @@ -101,7 +82,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = { * [4] [4] [4] [4] [4] [4] * [5] [5] [5] [5] [5] [5] */ -int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, +int ompi_coll_base_allgather_intra_bruck(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -115,8 +96,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_bruck rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgather_intra_bruck rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -125,7 +106,7 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } /* Initialization step: - - if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of + - if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of receive buffer, else - if rank r != 0, copy r^th block from receive buffer to block 0. */ @@ -140,15 +121,15 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend); if (err < 0) { line = __LINE__; goto err_hndl; } } - + /* Communication step: At every step i, rank r: - doubles the distance - - sends message which starts at begining of rbuf and has size + - sends message which starts at begining of rbuf and has size (blockcount * rcount) to rank (r - distance) - receives message of size blockcount * rcount from rank (r + distance) at location (rbuf + distance * rcount * rext) - - blockcount doubles until last step when only the remaining data is + - blockcount doubles until last step when only the remaining data is exchanged. */ blockcount = 1; @@ -162,14 +143,14 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, if (distance <= (size >> 1)) { blockcount = distance; - } else { + } else { blockcount = size - distance; } /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, blockcount * rcount, rdtype, + err = ompi_coll_base_sendrecv(tmpsend, blockcount * rcount, rdtype, sendto, MCA_COLL_BASE_TAG_ALLGATHER, - tmprecv, blockcount * rcount, rdtype, + tmprecv, blockcount * rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLGATHER, comm, MPI_STATUS_IGNORE, rank); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -178,8 +159,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, /* Finalization step: On all nodes except 0, data needs to be shifted locally: - - create temporary shift buffer, - see discussion in coll_basic_reduce.c about the size and begining + - create temporary shift buffer, + see discussion in coll_basic_reduce.c about the size and begining of temporary buffer. - copy blocks [0 .. 
(size - rank - 1)] from rbuf to shift buffer - move blocks [(size - rank) .. size] from rbuf to begining of rbuf @@ -195,8 +176,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, free_buf = (char*) calloc(((true_extent + ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount - 1) * rext)), sizeof(char)); - if (NULL == free_buf) { - line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; + if (NULL == free_buf) { + line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; } shift_buf = free_buf - true_lb; @@ -207,13 +188,13 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, /* 2. move blocks [(size - rank) .. size] from rbuf to the begining of rbuf */ tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount * rext; - err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount, + err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount, rbuf, tmpsend); if (err < 0) { line = __LINE__; goto err_hndl; } /* 3. copy blocks from shift buffer back to rbuf starting at block [rank]. */ tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; - err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount, + err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount, tmprecv, shift_buf); if (err < 0) { line = __LINE__; goto err_hndl; } @@ -223,13 +204,13 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } /* - * ompi_coll_tuned_allgather_intra_recursivedoubling + * ompi_coll_base_allgather_intra_recursivedoubling * * Function: allgather using O(log(N)) steps. * Accepts: Same arguments as MPI_Allgather @@ -239,29 +220,29 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, * This algorithm is used in MPICH-2 for small- and medium-sized * messages on power-of-two processes. * - * Limitation: Current implementation only works on power-of-two number of - * processes. + * Limitation: Current implementation only works on power-of-two number of + * processes. * In case this algorithm is invoked on non-power-of-two * processes, Bruck algorithm will be invoked. - * + * * Memory requirements: * No additional memory requirements beyond user-supplied buffers. - * + * * Example on 4 nodes: * Initialization: everyone has its own buffer at location rank in rbuf - * # 0 1 2 3 + * # 0 1 2 3 * [0] [ ] [ ] [ ] * [ ] [1] [ ] [ ] * [ ] [ ] [2] [ ] * [ ] [ ] [ ] [3] * Step 0: exchange data with (rank ^ 2^0) - * # 0 1 2 3 + * # 0 1 2 3 * [0] [0] [ ] [ ] * [1] [1] [ ] [ ] * [ ] [ ] [2] [2] * [ ] [ ] [3] [3] * Step 1: exchange data with (rank ^ 2^1) (if you can) - * # 0 1 2 3 + * # 0 1 2 3 * [0] [0] [0] [0] * [1] [1] [1] [1] * [2] [2] [2] [2] @@ -269,12 +250,12 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, * * TODO: Modify the algorithm to work with any number of nodes. * We can modify code to use identical implementation like MPICH-2: - * - using recursive-halving algorithm, at the end of each step, + * - using recursive-halving algorithm, at the end of each step, * determine if there are nodes who did not exchange their data in that * step, and send them appropriate messages. 
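
[Illustrative aside, not part of the patch.] The Bruck schedule documented above — own block rotated to offset 0, log2(P) doubling exchanges, then a local shift — can be checked with a minimal stand-alone simulation before the recursive-doubling implementation below. Plain arrays stand in for each rank's receive buffer, all ranks advance in lock step, and the final rotation has the same effect as the shift-buffer finalization in the patched code; P = 6 matches the example above but is otherwise arbitrary. This sketches the data movement only, it is not MPI code.

/* Stand-alone simulation of the Bruck allgather schedule. */
#include <stdio.h>
#include <string.h>

#define P 6                          /* number of simulated ranks */

int main(void)
{
    int buf[P][P], next[P][P];

    /* Initialization: every rank places its own block at offset 0. */
    for (int r = 0; r < P; r++)
        for (int i = 0; i < P; i++)
            buf[r][i] = (0 == i) ? r : -1;

    /* Communication: at each step, rank r receives the first blockcount
       blocks of rank (r + distance) at offset distance, mirroring the
       send of its own leading blocks to (r - distance). */
    for (int distance = 1; distance < P; distance <<= 1) {
        int blockcount = (distance <= P / 2) ? distance : P - distance;
        memcpy(next, buf, sizeof(buf));
        for (int r = 0; r < P; r++) {
            int recvfrom = (r + distance) % P;
            memcpy(&next[r][distance], &buf[recvfrom][0],
                   blockcount * sizeof(int));
        }
        memcpy(buf, next, sizeof(buf));
    }

    /* Finalization: offset i now holds the block of rank (r + i) % P;
       rotate by r so block i lands at offset i (same effect as the
       shift-buffer finalization in the patched code). */
    for (int r = 0; r < P; r++) {
        int tmp[P];
        for (int i = 0; i < P; i++)
            tmp[(i + r) % P] = buf[r][i];
        memcpy(buf[r], tmp, sizeof(tmp));
    }

    for (int r = 0; r < P; r++) {
        printf("rank %d:", r);
        for (int i = 0; i < P; i++)
            printf(" %d", buf[r][i]);
        printf("\n");
    }
    return 0;
}

Every rank should print the fully gathered sequence 0..5.
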
*/ -int -ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, +int +ompi_coll_base_allgather_intra_recursivedoubling(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -293,21 +274,21 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, pow2size >>=1; /* Current implementation only handles power-of-two number of processes. - If the function was called on non-power-of-two number of processes, + If the function was called on non-power-of-two number of processes, print warning and call bruck allgather algorithm with same parameters. */ if (pow2size != size) { - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm", size)); - return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, + return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); } - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_recursivedoubling rank %d, size %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgather_intra_recursivedoubling rank %d, size %d", rank, size)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); @@ -317,7 +298,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } /* Initialization step: - - if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of + - if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of receive buffer */ if (MPI_IN_PLACE != sbuf) { @@ -326,8 +307,8 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } - + } + /* Communication step: At every step i, rank r: - exchanges message with rank remote = (r ^ 2^i). @@ -347,7 +328,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, } /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, + err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, remote, MCA_COLL_BASE_TAG_ALLGATHER, tmprecv, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, remote, MCA_COLL_BASE_TAG_ALLGATHER, @@ -359,7 +340,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } @@ -367,7 +348,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, /* - * ompi_coll_tuned_allgather_intra_ring + * ompi_coll_base_allgather_intra_ring * * Function: allgather using O(N) steps. * Accepts: Same arguments as MPI_Allgather @@ -379,9 +360,9 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, * (r + 1) containing data from rank (r - i), with wrap arounds. * Memory requirements: * No additional memory requirements. 
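
[Illustrative aside, not part of the patch.] The recursive-doubling exchange pattern just implemented can be simulated the same way: at the step with distance d, rank r owns the d contiguous blocks starting at (r & ~(d - 1)) and swaps that chunk with partner (r ^ d). A minimal sketch, assuming a power-of-two P (here 8) and one int block per rank; the ring implementation follows below.

/* Stand-alone simulation of recursive-doubling allgather. */
#include <stdio.h>
#include <string.h>

#define P 8                          /* must be a power of two */

int main(void)
{
    int buf[P][P], next[P][P];

    for (int r = 0; r < P; r++)
        for (int i = 0; i < P; i++)
            buf[r][i] = (i == r) ? r : -1;   /* own block at offset r */

    /* Before the step at distance d, rank r owns the d contiguous
       blocks starting at (r & ~(d - 1)); it swaps that chunk with
       rank (r ^ d), so afterwards both own 2d contiguous blocks. */
    for (int d = 1; d < P; d <<= 1) {
        memcpy(next, buf, sizeof(buf));
        for (int r = 0; r < P; r++) {
            int remote = r ^ d;
            int chunk  = remote & ~(d - 1);  /* partner's owned chunk */
            memcpy(&next[r][chunk], &buf[remote][chunk], d * sizeof(int));
        }
        memcpy(buf, next, sizeof(buf));
    }

    for (int r = 0; r < P; r++) {
        printf("rank %d:", r);
        for (int i = 0; i < P; i++)
            printf(" %d", buf[r][i]);
        printf("\n");
    }
    return 0;
}
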
- * + * */ -int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount, +int ompi_coll_base_allgather_intra_ring(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -395,8 +376,8 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_ring rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgather_intra_ring rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -413,15 +394,15 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount, tmpsend = (char*) sbuf; err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } - + } + /* Communication step: At every step i: 0 .. (P-1), rank r: - receives message from [(r - 1 + size) % size] containing data from rank [(r - i - 1 + size) % size] - sends message to rank [(r + 1) % size] containing data from rank [(r - i + size) % size] - - sends message which starts at begining of rbuf and has size + - sends message which starts at begining of rbuf and has size */ sendto = (rank + 1) % size; recvfrom = (rank - 1 + size) % size; @@ -434,7 +415,7 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount, tmpsend = (char*)rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)rcount * rext; /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, sendto, + err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, sendto, MCA_COLL_BASE_TAG_ALLGATHER, tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLGATHER, @@ -446,34 +427,34 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount, return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } /* - * ompi_coll_tuned_allgather_intra_neighborexchange + * ompi_coll_base_allgather_intra_neighborexchange * * Function: allgather using N/2 steps (O(N)) * Accepts: Same arguments as MPI_Allgather * Returns: MPI_SUCCESS or error code * * Description: Neighbor Exchange algorithm for allgather. - * Described by Chen et.al. in - * "Performance Evaluation of Allgather Algorithms on + * Described by Chen et.al. in + * "Performance Evaluation of Allgather Algorithms on * Terascale Linux Cluster with Fast Ethernet", - * Proceedings of the Eighth International Conference on + * Proceedings of the Eighth International Conference on * High-Performance Computing inn Asia-Pacific Region * (HPCASIA'05), 2005 - * + * * Rank r exchanges message with one of its neighbors and * forwards the data further in the next step. * * No additional memory requirements. - * + * * Limitations: Algorithm works only on even number of processes. * For odd number of processes we switch to ring algorithm. 
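
[Illustrative aside, not part of the patch.] The ring schedule above — P - 1 steps, each rank forwarding the block it received in the previous step to its right neighbor — admits the same kind of stand-alone check. In-place updates are safe here because the slot read from the left neighbor was filled in an earlier step, never the current one; P = 5 is arbitrary. The neighbor-exchange walk-through continues below.

/* Stand-alone simulation of the ring allgather. */
#include <stdio.h>

#define P 5

int main(void)
{
    int buf[P][P];

    for (int r = 0; r < P; r++)
        for (int i = 0; i < P; i++)
            buf[r][i] = (i == r) ? r : -1;

    /* At step i, rank r receives block (r - i - 1) mod P from rank
       (r - 1) mod P, i.e. the block that neighbor received last step. */
    for (int step = 0; step < P - 1; step++) {
        for (int r = 0; r < P; r++) {
            int recvfrom     = (r - 1 + P) % P;
            int recvdatafrom = (r - step - 1 + 2 * P) % P;
            buf[r][recvdatafrom] = buf[recvfrom][recvdatafrom];
        }
    }

    for (int r = 0; r < P; r++) {
        printf("rank %d:", r);
        for (int i = 0; i < P; i++)
            printf(" %d", buf[r][i]);
        printf("\n");
    }
    return 0;
}
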
- * + * * Example on 6 nodes: * Initial state * # 0 1 2 3 4 5 @@ -508,8 +489,8 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount, * [4] [4] [4] [4] [4] [4] * [5] [5] [5] [5] [5] [5] */ -int -ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount, +int +ompi_coll_base_allgather_intra_neighborexchange(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -525,16 +506,16 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount, rank = ompi_comm_rank(comm); if (size % 2) { - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", size)); - return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, + return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); } - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_neighborexchange rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgather_intra_neighborexchange rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -551,7 +532,7 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount, tmpsend = (char*) sbuf; err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + } /* Determine neighbors, order in which blocks will arrive, etc. */ even_rank = !(rank % 2); @@ -573,15 +554,15 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount, /* Communication loop: - First step is special: exchange a single block with neighbor[0]. - - Rest of the steps: - update recv_data_from according to offset, and + - Rest of the steps: + update recv_data_from according to offset, and exchange two blocks with appropriate neighbor. the send location becomes previous receve location. 
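
[Illustrative aside, not part of the patch; the patched loop body continues below.] The full neighbor-exchange schedule — one single-block exchange, then P/2 - 1 two-block exchanges with alternating left/right partners — can be traced with a stand-alone simulation. The neighbor / recv_data_from / offset_at_step setup mirrors the patched code; P = 6 matches the example above and must be even; this sketches the schedule only.

/* Stand-alone simulation of neighbor-exchange allgather (even P). */
#include <stdio.h>
#include <string.h>

#define P 6                          /* must be even */

int main(void)
{
    int buf[P][P], next[P][P];
    int neighbor[P][2], recv_from[P][2], offset[P][2], send_from[P];

    for (int r = 0; r < P; r++) {
        for (int i = 0; i < P; i++)
            buf[r][i] = (i == r) ? r : -1;
        if (0 == r % 2) {            /* even rank: first partner right */
            neighbor[r][0] = (r + 1) % P;
            neighbor[r][1] = (r - 1 + P) % P;
            recv_from[r][0] = recv_from[r][1] = r;
            offset[r][0] = +2;  offset[r][1] = -2;
        } else {                     /* odd rank: first partner left */
            neighbor[r][0] = (r - 1 + P) % P;
            neighbor[r][1] = (r + 1) % P;
            recv_from[r][0] = recv_from[r][1] = neighbor[r][0];
            offset[r][0] = -2;  offset[r][1] = +2;
        }
    }

    /* Step 0: exchange a single block with neighbor[0]. */
    memcpy(next, buf, sizeof(buf));
    for (int r = 0; r < P; r++) {
        int n = neighbor[r][0];
        next[r][n] = buf[n][n];
        send_from[r] = (0 == r % 2) ? r : recv_from[r][0];
    }
    memcpy(buf, next, sizeof(buf));

    /* Steps 1 .. P/2 - 1: forward the pair of blocks received last
       step, alternating between the two neighbors. */
    for (int i = 1; i < P / 2; i++) {
        int par = i % 2;
        memcpy(next, buf, sizeof(buf));
        for (int r = 0; r < P; r++) {
            int n = neighbor[r][par];
            recv_from[r][par] = (recv_from[r][par] + offset[r][par] + P) % P;
            int dst = recv_from[r][par];   /* where the pair lands */
            int src = send_from[n];        /* what the partner forwards */
            next[r][dst]           = buf[n][src];
            next[r][(dst + 1) % P] = buf[n][(src + 1) % P];
        }
        for (int r = 0; r < P; r++)
            send_from[r] = recv_from[r][par];
        memcpy(buf, next, sizeof(buf));
    }

    for (int r = 0; r < P; r++) {
        printf("rank %d:", r);
        for (int i = 0; i < P; i++)
            printf(" %d", buf[r][i]);
        printf("\n");
    }
    return 0;
}
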
*/ tmprecv = (char*)rbuf + (ptrdiff_t)neighbor[0] * (ptrdiff_t)rcount * rext; tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, neighbor[0], + err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, neighbor[0], MCA_COLL_BASE_TAG_ALLGATHER, tmprecv, rcount, rdtype, neighbor[0], MCA_COLL_BASE_TAG_ALLGATHER, @@ -597,15 +578,15 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount, for (i = 1; i < (size / 2); i++) { const int i_parity = i % 2; - recv_data_from[i_parity] = + recv_data_from[i_parity] = (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size; tmprecv = (char*)rbuf + (ptrdiff_t)recv_data_from[i_parity] * (ptrdiff_t)rcount * rext; tmpsend = (char*)rbuf + (ptrdiff_t)send_data_from * rcount * rext; - + /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype, - neighbor[i_parity], + err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype, + neighbor[i_parity], MCA_COLL_BASE_TAG_ALLGATHER, tmprecv, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype, neighbor[i_parity], @@ -619,13 +600,13 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount, return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } -int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount, +int ompi_coll_base_allgather_intra_two_procs(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -638,8 +619,8 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount, rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_allgather_intra_two_procs rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_allgather_intra_two_procs rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -661,7 +642,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount, } tmprecv = (char*)rbuf + (ptrdiff_t)remote * (ptrdiff_t)rcount * rext; - err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote, + err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote, MCA_COLL_BASE_TAG_ALLGATHER, tmprecv, rcount, rdtype, remote, MCA_COLL_BASE_TAG_ALLGATHER, @@ -670,7 +651,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount, /* Place your data in correct location if necessary */ if (MPI_IN_PLACE != sbuf) { - err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype, + err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype, (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } @@ -678,7 +659,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount, return MPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } @@ -687,13 +668,13 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount, /* * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple 
implementations - * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * but for some small number of nodes and/or small data sizes they + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. in V2 we will handle this differently and so will not * have to duplicate code. - * JPG following the examples from other coll_tuned implementations. Dec06. + * JPG following the examples from other coll_base implementations. Dec06. */ /* copied function (with appropriate renaming) starts here */ @@ -706,10 +687,10 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount, * Returns: - MPI_SUCCESS or error code */ int -ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, +ompi_coll_base_allgather_intra_basic_linear(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, void *rbuf, - int rcount, + int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) @@ -727,7 +708,7 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount, sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount); sdtype = rdtype; scount = rcount; - } + } /* Gather and broadcast. */ @@ -755,183 +736,3 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount, } /* copied function (with appropriate renaming) ends here */ - -/* The following are used by dynamic and forced rules */ - -/* publish details of each algorithm and if its forced/fixed/locked in */ -/* as you add methods/algorithms you must update this and the query/map - routines */ - -/* this routine is called by the component only */ -/* this makes sure that the mca parameters are set to their initial values - and perms */ -/* module does not call this they call the forced_getvalues routine instead */ - -int -ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[ALLGATHER] = coll_tuned_allgather_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm_count", - "Number of allgather algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_allgather_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_allgather_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_allgather_algorithms", allgather_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm", - "Which allallgather algorithm is used. 
Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only.", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allgather_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_allgather_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm_segmentsize", - "Segment size in bytes used by default for allgather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allgather_segment_size); - - coll_tuned_allgather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm_tree_fanout", - "Fanout for n-tree used for allgather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allgather_tree_fanout); - - coll_tuned_allgather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm_chain_fanout", - "Fanout for chains used for allgather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. 
Currently, available algorithms do not support chain topologies.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allgather_chain_fanout); - - return (MPI_SUCCESS); -} - -int ompi_coll_tuned_allgather_intra_do_forced(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_do_forced selected algorithm %d", - data->user_forced[ALLGATHER].algorithm)); - - switch (data->user_forced[ALLGATHER].algorithm) { - case (0): - return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (1): - return ompi_coll_tuned_allgather_intra_basic_linear (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (2): - return ompi_coll_tuned_allgather_intra_bruck (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (3): - return ompi_coll_tuned_allgather_intra_recursivedoubling (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (4): - return ompi_coll_tuned_allgather_intra_ring (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (5): - return ompi_coll_tuned_allgather_intra_neighborexchange (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (6): - return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[ALLGATHER].algorithm, - ompi_coll_tuned_forced_max_algorithms[ALLGATHER])); - return (MPI_ERR_ARG); - } /* switch */ - -} - - -int ompi_coll_tuned_allgather_intra_do_this(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): - return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (1): - return ompi_coll_tuned_allgather_intra_basic_linear(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (2): - return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (3): - return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (4): - return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (5): - return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (6): - return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, - 
ompi_coll_tuned_forced_max_algorithms[ALLGATHER])); - return (MPI_ERR_ARG); - } /* switch */ -} diff --git a/ompi/mca/coll/base/coll_base_allgatherv.c b/ompi/mca/coll/base/coll_base_allgatherv.c index 4edb141c00..b884dc8591 100644 --- a/ompi/mca/coll/base/coll_base_allgatherv.c +++ b/ompi/mca/coll/base/coll_base_allgatherv.c @@ -30,19 +30,12 @@ #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" -/* allgatherv algorithm variables */ -static int coll_tuned_allgatherv_algorithm_count = 5; -static int coll_tuned_allgatherv_forced_algorithm = 0; -static int coll_tuned_allgatherv_segment_size = 0; -static int coll_tuned_allgatherv_tree_fanout; -static int coll_tuned_allgatherv_chain_fanout; - -/* valid values for coll_tuned_allgatherv_forced_algorithm */ -static mca_base_var_enum_value_t allgatherv_algorithms[] = { +/* valid values for coll_base_allgatherv_forced_algorithm */ +mca_base_var_enum_value_t coll_base_allgatherv_algorithms[] = { {0, "ignore"}, {1, "default"}, {2, "bruck"}, @@ -53,7 +46,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = { }; /* - * ompi_coll_tuned_allgatherv_intra_bruck + * ompi_coll_base_allgatherv_intra_bruck * * Function: allgather using O(log(N)) steps. * Accepts: Same arguments as MPI_Allgather @@ -107,7 +100,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = { * [5] [5] [5] [5] [5] [5] [5] * [6] [6] [6] [6] [6] [6] [6] */ -int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount, +int ompi_coll_base_allgatherv_intra_bruck(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdispls, @@ -124,8 +117,8 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_bruck rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgather_intra_bruck rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -198,7 +191,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount, if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(rbuf, 1, new_sdtype, sendto, + err = ompi_coll_base_sendrecv(rbuf, 1, new_sdtype, sendto, MCA_COLL_BASE_TAG_ALLGATHERV, rbuf, 1, new_rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLGATHERV, @@ -217,14 +210,14 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount, err_hndl: if( NULL != new_rcounts ) free(new_rcounts); - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } /* - * ompi_coll_tuned_allgatherv_intra_ring + * ompi_coll_base_allgatherv_intra_ring * * Function: allgatherv using O(N) steps. * Accepts: Same arguments as MPI_Allgatherv @@ -238,7 +231,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount, * No additional memory requirements. 
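For readers following the rename, the ring schedule this comment describes boils down to a short loop. A minimal stand-alone illustration against the public MPI API (the helper name and the use of MPI_Sendrecv are assumptions for the sketch; the patched code uses ompi_coll_base_sendrecv and OMPI-internal datatype calls):

    #include <mpi.h>

    /* Ring allgatherv (sketch): at step k every rank forwards the block it
     * received at step k-1 to its right neighbor and receives the block that
     * originated one rank further to the left. After size-1 steps every rank
     * holds all blocks, and no scratch buffer is required. Assumes the caller
     * has already placed its own contribution at rdispls[rank]. */
    static int ring_allgatherv_sketch(void *rbuf, const int *rcounts,
                                      const int *rdispls, MPI_Datatype rdtype,
                                      MPI_Comm comm)
    {
        int rank, size;
        MPI_Aint lb, ext;

        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);
        MPI_Type_get_extent(rdtype, &lb, &ext);

        const int sendto   = (rank + 1) % size;
        const int recvfrom = (rank + size - 1) % size;

        for (int k = 0; k < size - 1; k++) {
            /* Origin ranks of the outgoing and incoming blocks at step k;
             * at k == 0 the rank sends its own contribution. */
            const int senddatafrom = (rank + size - k) % size;
            const int recvdatafrom = (rank + size - k - 1) % size;
            char *tmpsend = (char *)rbuf + (MPI_Aint)rdispls[senddatafrom] * ext;
            char *tmprecv = (char *)rbuf + (MPI_Aint)rdispls[recvdatafrom] * ext;
            int err = MPI_Sendrecv(tmpsend, rcounts[senddatafrom], rdtype, sendto, 0,
                                   tmprecv, rcounts[recvdatafrom], rdtype, recvfrom, 0,
                                   comm, MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != err) return err;
        }
        return MPI_SUCCESS;
    }

Only blocks already present locally are forwarded, which is why the comment can claim no additional memory requirements.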
* */ -int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount, +int ompi_coll_base_allgatherv_intra_ring(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, @@ -252,8 +245,8 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgatherv_intra_ring rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgatherv_intra_ring rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -292,7 +285,7 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount, tmpsend = (char*)rbuf + rdisps[senddatafrom] * rext; /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, rcounts[senddatafrom], rdtype, + err = ompi_coll_base_sendrecv(tmpsend, rcounts[senddatafrom], rdtype, sendto, MCA_COLL_BASE_TAG_ALLGATHERV, tmprecv, rcounts[recvdatafrom], rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLGATHERV, @@ -304,13 +297,13 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount, return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } /* - * ompi_coll_tuned_allgatherv_intra_neighborexchange + * ompi_coll_base_allgatherv_intra_neighborexchange * * Function: allgatherv using N/2 steps (O(N)) * Accepts: Same arguments as MPI_Allgatherv @@ -368,7 +361,7 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount, * [5] [5] [5] [5] [5] [5] */ int -ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, +ompi_coll_base_allgatherv_intra_neighborexchange(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int *rcounts, int *rdispls, struct ompi_datatype_t *rdtype, @@ -386,17 +379,17 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, rank = ompi_comm_rank(comm); if (size % 2) { - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", size)); - return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype, + return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm, module); } - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgatherv_intra_neighborexchange rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allgatherv_intra_neighborexchange rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -445,7 +438,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, */ tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[neighbor[0]] * rext; tmpsend = (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext; - err = ompi_coll_tuned_sendrecv(tmpsend, rcounts[rank], rdtype, + err = ompi_coll_base_sendrecv(tmpsend, rcounts[rank], rdtype, neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV, tmprecv, rcounts[neighbor[0]], rdtype, neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV, @@ -493,7 +486,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void 
*sbuf, int scount, tmpsend = (char*)rbuf; /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity], + err = ompi_coll_base_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity], MCA_COLL_BASE_TAG_ALLGATHERV, tmprecv, 1, new_rdtype, neighbor[i_parity], MCA_COLL_BASE_TAG_ALLGATHERV, @@ -509,13 +502,13 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } -int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount, +int ompi_coll_base_allgatherv_intra_two_procs(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int *rcounts, int *rdispls, @@ -529,8 +522,8 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount, rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_allgatherv_intra_two_procs rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_allgatherv_intra_two_procs rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -552,7 +545,7 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount, } tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[remote] * rext; - err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote, + err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote, MCA_COLL_BASE_TAG_ALLGATHERV, tmprecv, rcounts[remote], rdtype, remote, MCA_COLL_BASE_TAG_ALLGATHERV, @@ -570,7 +563,7 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount, return MPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } @@ -580,12 +573,12 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount, * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple implementations * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. in V2 we will handle this differently and so will not * have to duplicate code. - * JPG following the examples from other coll_tuned implementations. Dec06. + * JPG following the examples from other coll_base implementations. Dec06. 
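The two-process variant above reduces to a single exchange plus a local placement. A hedged sketch in public MPI terms (helper name illustrative; MPI_IN_PLACE and the packed send/recv datatypes of the real code are omitted):

    #include <mpi.h>

    /* Two-process allgatherv (sketch): exchange with the single remote rank
     * (rank ^ 1), then place the local contribution. Assumes sbuf is not
     * MPI_IN_PLACE; the real code also copes with that case. */
    static int two_procs_allgatherv_sketch(const void *sbuf, int scount,
                                           MPI_Datatype sdtype, void *rbuf,
                                           const int *rcounts, const int *rdispls,
                                           MPI_Datatype rdtype, MPI_Comm comm)
    {
        int rank, err;
        MPI_Aint lb, rext;

        MPI_Comm_rank(comm, &rank);
        MPI_Type_get_extent(rdtype, &lb, &rext);

        const int remote = rank ^ 1;
        char *tmprecv = (char *)rbuf + (MPI_Aint)rdispls[remote] * rext;
        err = MPI_Sendrecv(sbuf, scount, sdtype, remote, 0,
                           tmprecv, rcounts[remote], rdtype, remote, 0,
                           comm, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) return err;

        /* Local placement via a matched send/receive to self keeps the sketch
         * datatype-agnostic; a send to self paired with its own receive inside
         * one MPI_Sendrecv call is legal MPI. */
        char *tmpsend = (char *)rbuf + (MPI_Aint)rdispls[rank] * rext;
        return MPI_Sendrecv(sbuf, scount, sdtype, rank, 1,
                            tmpsend, rcounts[rank], rdtype, rank, 1,
                            comm, MPI_STATUS_IGNORE);
    }
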
*/ /* copied function (with appropriate renaming) starts here */ @@ -599,7 +592,7 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount, * Returns: - MPI_SUCCESS or error code */ int -ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount, +ompi_coll_base_allgatherv_intra_basic_default(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, @@ -619,8 +612,8 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount, * to process with rank 0 (OMPI convention) */ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_allgatherv_intra_basic_default rank %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_allgatherv_intra_basic_default rank %d", rank)); if (MPI_IN_PLACE == sbuf) { @@ -676,177 +669,3 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount, /* copied function (with appropriate renaming) ends here */ -/* The following are used by dynamic and forced rules */ - -/* publish details of each algorithm and if its forced/fixed/locked in */ -/* as you add methods/algorithms you must update this and the query/map - routines */ - -/* this routine is called by the component only */ -/* this makes sure that the mca parameters are set to their initial values - and perms */ -/* module does not call this they call the forced_getvalues routine instead */ - -int -ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[ALLGATHERV] = coll_tuned_allgatherv_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgatherv_algorithm_count", - "Number of allgatherv algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_allgatherv_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_allgatherv_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_allgatherv_algorithms", allgatherv_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgatherv_algorithm", - "Which allallgatherv algorithm is used. Can be locked down to choice of: 0 ignore, 1 default (allgathervv + bcast), 2 bruck, 3 ring, 4 neighbor exchange, 5: two proc only.", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allgatherv_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_allgatherv_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgatherv_algorithm_segmentsize", - "Segment size in bytes used by default for allgatherv algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. 
Currently, available algorithms do not support segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allgatherv_segment_size); - - coll_tuned_allgatherv_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgatherv_algorithm_tree_fanout", - "Fanout for n-tree used for allgatherv algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allgatherv_tree_fanout); - - coll_tuned_allgatherv_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allgatherv_algorithm_chain_fanout", - "Fanout for chains used for allgatherv algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allgatherv_chain_fanout); - - return (MPI_SUCCESS); -} - -int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int *rcounts, - int *rdispls, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgatherv_intra_do_forced selected algorithm %d", - data->user_forced[ALLGATHERV].algorithm)); - - switch (data->user_forced[ALLGATHERV].algorithm) { - case (0): - return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (1): - return ompi_coll_tuned_allgatherv_intra_basic_default (sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (2): - return ompi_coll_tuned_allgatherv_intra_bruck (sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (3): - return ompi_coll_tuned_allgatherv_intra_ring (sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (4): - return ompi_coll_tuned_allgatherv_intra_neighborexchange (sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (5): - return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[ALLGATHERV].algorithm, - ompi_coll_tuned_forced_max_algorithms[ALLGATHERV])); - return (MPI_ERR_ARG); - } /* switch */ - -} - - -int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int *rcounts, - int *rdispls, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, - int segsize) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgatherv_intra_do_this selected algorithm %d topo 
faninout %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): - return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (1): - return ompi_coll_tuned_allgatherv_intra_basic_default(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (2): - return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (3): - return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (4): - return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - case (5): - return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgatherv_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, - ompi_coll_tuned_forced_max_algorithms[ALLGATHERV])); - return (MPI_ERR_ARG); - } /* switch */ -} diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index b67dbee466..54f444b6cf 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -31,41 +31,23 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" - -/* allreduce algorithm variables */ -static int coll_tuned_allreduce_algorithm_count = 5; -static int coll_tuned_allreduce_forced_algorithm = 0; -static int coll_tuned_allreduce_segment_size = 0; -static int coll_tuned_allreduce_tree_fanout; -static int coll_tuned_allreduce_chain_fanout; - -/* valid values for coll_tuned_allreduce_forced_algorithm */ -static mca_base_var_enum_value_t allreduce_algorithms[] = { - {0, "ignore"}, - {1, "basic_linear"}, - {2, "nonoverlapping"}, - {3, "recursive_doubling"}, - {4, "ring"}, - {5, "segmented_ring"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" /* - * ompi_coll_tuned_allreduce_intra_nonoverlapping + * ompi_coll_base_allreduce_intra_nonoverlapping * * This function just calls a reduce followed by a broadcast - * both called functions are tuned but they complete sequentially, + * both called functions are base but they complete sequentially, * i.e. 
no additional overlapping - * meaning if the number of segments used is greater than the topo depth + * meaning if the number of segments used is greater than the topo depth * then once the first segment of data is fully 'reduced' it is not broadcast * while the reduce continues (cost = cost-reduce + cost-bcast + decision x 3) * */ int -ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count, +ompi_coll_base_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, @@ -75,16 +57,16 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_nonoverlapping rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_nonoverlapping rank %d", rank)); /* Reduce to 0 and broadcast. */ if (MPI_IN_PLACE == sbuf) { if (0 == rank) { - err = comm->c_coll.coll_reduce (MPI_IN_PLACE, rbuf, count, dtype, + err = comm->c_coll.coll_reduce (MPI_IN_PLACE, rbuf, count, dtype, op, 0, comm, comm->c_coll.coll_reduce_module); } else { - err = comm->c_coll.coll_reduce (rbuf, NULL, count, dtype, op, 0, + err = comm->c_coll.coll_reduce (rbuf, NULL, count, dtype, op, 0, comm, comm->c_coll.coll_reduce_module); } } else { @@ -100,21 +82,21 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count } /* - * ompi_coll_tuned_allreduce_intra_recursivedoubling + * ompi_coll_base_allreduce_intra_recursivedoubling * * Function: Recursive doubling algorithm for allreduce operation * Accepts: Same as MPI_Allreduce() * Returns: MPI_SUCCESS or error code * - * Description: Implements recursive doubling algorithm for allreduce. - * Original (non-segmented) implementation is used in MPICH-2 + * Description: Implements recursive doubling algorithm for allreduce. + * Original (non-segmented) implementation is used in MPICH-2 * for small and intermediate size messages. - * The algorithm preserves order of operations so it can + * The algorithm preserves order of operations so it can * be used both by commutative and non-commutative operations. * * Example on 7 nodes: * Initial state - * # 0 1 2 3 4 5 6 + * # 0 1 2 3 4 5 6 * [0] [1] [2] [3] [4] [5] [6] * Initial adjustment step for non-power of two nodes. 
* old rank 1 3 5 6 @@ -129,24 +111,24 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [0+1+] [0+1+] - * [2+3+] [2+3+] [2+3+] [2+3+] + * [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] * Final adjustment step for non-power of two nodes - * # 0 1 2 3 4 5 6 + * # 0 1 2 3 4 5 6 * [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] - * [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] + * [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] * */ -int -ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, +int +ompi_coll_base_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; @@ -157,9 +139,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allreduce_intra_recursivedoubling rank %d", rank)); - + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allreduce_intra_recursivedoubling rank %d", rank)); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -194,16 +176,16 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, adjsize >>= 1; /* Handle non-power-of-two case: - - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and + - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and sets new rank to -1. - - Odd ranks less than 2 * extra_ranks receive data from (rank - 1), + - Odd ranks less than 2 * extra_ranks receive data from (rank - 1), apply appropriate operation, and set new rank to rank/2 - Everyone else sets rank to rank - extra_ranks */ extra_ranks = size - adjsize; if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { - ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), + ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } @@ -221,7 +203,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, newrank = rank - extra_ranks; } - /* Communication/Computation loop + /* Communication/Computation loop - Exchange message with remote node. - Perform appropriate operation taking in account order of operations: result = value (op) result @@ -230,14 +212,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, if (newrank < 0) break; /* Determine remote node */ newremote = newrank ^ distance; - remote = (newremote < extra_ranks)? + remote = (newremote < extra_ranks)? 
(newremote * 2 + 1):(newremote + extra_ranks); /* Exchange the data */ ret = MCA_PML_CALL(irecv(tmprecv, count, dtype, remote, MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[0])); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote, + ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1])); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } @@ -258,14 +240,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, } /* Handle non-power-of-two case: - - Odd ranks less than 2 * extra_ranks send result from tmpsend to + - Odd ranks less than 2 * extra_ranks send result from tmpsend to (rank - 1) - Even ranks less than 2 * extra_ranks receive result from (rank + 1) */ if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { ret = MCA_PML_CALL(recv(rbuf, count, dtype, (rank + 1), - MCA_COLL_BASE_TAG_ALLREDUCE, comm, + MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } tmpsend = (char*)rbuf; @@ -287,14 +269,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, return MPI_SUCCESS; error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); if (NULL != inplacebuf) free(inplacebuf); return ret; } /* - * ompi_coll_tuned_allreduce_intra_ring + * ompi_coll_base_allreduce_intra_ring * * Function: Ring algorithm for allreduce operation * Accepts: Same as MPI_Allreduce() @@ -304,9 +286,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, * automatically segmented to segment of size M/N. * Algorithm requires 2*N - 1 steps. * - * Limitations: The algorithm DOES NOT preserve order of operations so it + * Limitations: The algorithm DOES NOT preserve order of operations so it * can be used only for commutative operations. - * In addition, algorithm cannot work if the total count is + * In addition, algorithm cannot work if the total count is * less than size. * Example on 5 nodes: * Initial state @@ -318,7 +300,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, * [04] [14] [24] [34] [44] * * COMPUTATION PHASE - * Step 0: rank r sends block r to rank (r+1) and receives bloc (r-1) + * Step 0: rank r sends block r to rank (r+1) and receives bloc (r-1) * from rank (r-1) [with wraparound]. * # 0 1 2 3 4 * [00] [00+10] [20] [30] [40] @@ -327,7 +309,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, * [03] [13] [23] [33] [33+43] * [44+04] [14] [24] [34] [44] * - * Step 1: rank r sends block (r-1) to rank (r+1) and receives bloc + * Step 1: rank r sends block (r-1) to rank (r+1) and receives bloc * (r-2) from rank (r-1) [with wraparound]. * # 0 1 2 3 4 * [00] [00+10] [01+10+20] [30] [40] @@ -336,7 +318,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, * [33+43+03] [13] [23] [33] [33+43] * [44+04] [44+04+14] [24] [34] [44] * - * Step 2: rank r sends block (r-2) to rank (r+1) and receives bloc + * Step 2: rank r sends block (r-2) to rank (r+1) and receives bloc * (r-2) from rank (r-1) [with wraparound]. 
* # 0 1 2 3 4 * [00] [00+10] [01+10+20] [01+10+20+30] [40] @@ -345,7 +327,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, * [33+43+03] [33+43+03+13] [23] [33] [33+43] * [44+04] [44+04+14] [44+04+14+24] [34] [44] * - * Step 3: rank r sends block (r-3) to rank (r+1) and receives bloc + * Step 3: rank r sends block (r-3) to rank (r+1) and receives bloc * (r-3) from rank (r-1) [with wraparound]. * # 0 1 2 3 4 * [00] [00+10] [01+10+20] [01+10+20+30] [FULL] @@ -353,16 +335,16 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, * [22+32+42+02] [FULL] [22] [22+32] [22+32+42] * [33+43+03] [33+43+03+13] [FULL] [33] [33+43] * [44+04] [44+04+14] [44+04+14+24] [FULL] [44] - * + * * DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1. * */ -int -ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, +int +ompi_coll_base_allreduce_intra_ring(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { int ret, line, rank, size, k, recv_from, send_to, block_count, inbi; int early_segcount, late_segcount, split_rank, max_segcount; @@ -375,9 +357,9 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allreduce_intra_ring rank %d, count %d", rank, count)); - + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allreduce_intra_ring rank %d, count %d", rank, count)); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -389,10 +371,10 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, /* Special case for count less than size - use recursive doubling */ if (count < size) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count)); - return (ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf, + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count)); + return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, - dtype, op, + dtype, op, comm, module)); } @@ -404,14 +386,14 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, ret = ompi_datatype_type_size( dtype, &typelng); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Determine the number of elements per block and corresponding + /* Determine the number of elements per block and corresponding block sizes. The blocks are divided into "early" and "late" ones: - blocks 0 .. (split_rank - 1) are "early" and + blocks 0 .. (split_rank - 1) are "early" and blocks (split_rank) .. (size - 1) are "late". Early blocks are at most 1 element larger than the late ones. 
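The early/late split referenced in this comment is a plain quotient/remainder computation. A hedged reconstruction of what COLL_BASE_COMPUTE_BLOCKCOUNT computes, written as a function for readability (the real implementation is a macro):

    /* Split `count` elements over `num_blocks` blocks so that the first
     * `split_index` blocks ("early") carry one extra element each. */
    static void compute_blockcount(int count, int num_blocks, int *split_index,
                                   int *early_count, int *late_count)
    {
        *late_count  = count / num_blocks;
        *split_index = count % num_blocks;
        *early_count = *late_count + (0 != *split_index ? 1 : 0);
    }
    /* Example: count = 10, num_blocks = 4 yields split_index = 2,
     * early_count = 3, late_count = 2, i.e. block sizes 3, 3, 2, 2. */
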
*/ - COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, + COLL_BASE_COMPUTE_BLOCKCOUNT( count, size, split_rank, early_segcount, late_segcount ); max_segcount = early_segcount; max_real_segsize = true_extent + (max_segcount - 1) * extent; @@ -432,7 +414,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, /* Computation loop */ - /* + /* For each of the remote nodes: - post irecv for block (r-1) - send block (r) @@ -456,8 +438,8 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } /* Send first block (my block) to the neighbor on the right */ - block_offset = ((rank < split_rank)? - ((ptrdiff_t)rank * (ptrdiff_t)early_segcount) : + block_offset = ((rank < split_rank)? + ((ptrdiff_t)rank * (ptrdiff_t)early_segcount) : ((ptrdiff_t)rank * (ptrdiff_t)late_segcount + split_rank)); block_count = ((rank < split_rank)? early_segcount : late_segcount); tmpsend = ((char*)rbuf) + block_offset * extent; @@ -465,21 +447,21 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - + for (k = 2; k < size; k++) { const int prevblock = (rank + size - k + 1) % size; - + inbi = inbi ^ 0x1; - + /* Post irecv for the current block */ ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - + /* Wait on previous block to arrive */ ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - + /* Apply operation on previous block: result goes to rbuf rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] */ @@ -489,7 +471,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, block_count = ((prevblock < split_rank)? early_segcount : late_segcount); tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); - + /* send previous block to send_to */ ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to, MCA_COLL_BASE_TAG_ALLREDUCE, @@ -501,7 +483,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Apply operation on the last block (from neighbor (rank + 1) + /* Apply operation on the last block (from neighbor (rank + 1) rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */ recv_from = (rank + 1) % size; block_offset = ((recv_from < split_rank)? @@ -510,28 +492,28 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, block_count = ((recv_from < split_rank)? early_segcount : late_segcount); tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype); - + /* Distribution loop - variation of ring allgather */ send_to = (rank + 1) % size; recv_from = (rank + size - 1) % size; for (k = 0; k < size - 1; k++) { const int recv_data_from = (rank + size - k) % size; const int send_data_from = (rank + 1 + size - k) % size; - const int send_block_offset = + const int send_block_offset = ((send_data_from < split_rank)? 
((ptrdiff_t)send_data_from * early_segcount) : ((ptrdiff_t)send_data_from * late_segcount + split_rank)); - const int recv_block_offset = + const int recv_block_offset = ((recv_data_from < split_rank)? ((ptrdiff_t)recv_data_from * early_segcount) : ((ptrdiff_t)recv_data_from * late_segcount + split_rank)); - block_count = ((send_data_from < split_rank)? + block_count = ((send_data_from < split_rank)? early_segcount : late_segcount); tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent; tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent; - ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to, + ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to, MCA_COLL_BASE_TAG_ALLREDUCE, tmprecv, max_segcount, dtype, recv_from, MCA_COLL_BASE_TAG_ALLREDUCE, @@ -546,7 +528,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, return MPI_SUCCESS; error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); if (NULL != inbuf[0]) free(inbuf[0]); if (NULL != inbuf[1]) free(inbuf[1]); @@ -554,30 +536,30 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, } /* - * ompi_coll_tuned_allreduce_intra_ring_segmented + * ompi_coll_base_allreduce_intra_ring_segmented * * Function: Pipelined ring algorithm for allreduce operation * Accepts: Same as MPI_Allreduce(), segment size * Returns: MPI_SUCCESS or error code * - * Description: Implements pipelined ring algorithm for allreduce: + * Description: Implements pipelined ring algorithm for allreduce: * user supplies suggested segment size for the pipelining of * reduce operation. - * The segment size determines the number of phases, np, for - * the algorithm execution. - * The message is automatically divided into blocks of + * The segment size determines the number of phases, np, for + * the algorithm execution. + * The message is automatically divided into blocks of * approximately (count / (np * segcount)) elements. - * At the end of reduction phase, allgather like step is + * At the end of reduction phase, allgather like step is * executed. * Algorithm requires (np + 1)*(N - 1) steps. * - * Limitations: The algorithm DOES NOT preserve order of operations so it + * Limitations: The algorithm DOES NOT preserve order of operations so it * can be used only for commutative operations. - * In addition, algorithm cannot work if the total size is + * In addition, algorithm cannot work if the total size is * less than size * segment size. * Example on 3 nodes with 2 phases * Initial state - * # 0 1 2 + * # 0 1 2 * [00a] [10a] [20a] * [00b] [10b] [20b] * [01a] [11a] [21a] @@ -586,9 +568,9 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, * [02b] [12b] [22b] * * COMPUTATION PHASE 0 (a) - * Step 0: rank r sends block ra to rank (r+1) and receives bloc (r-1)a + * Step 0: rank r sends block ra to rank (r+1) and receives bloc (r-1)a * from rank (r-1) [with wraparound]. - * # 0 1 2 + * # 0 1 2 * [00a] [00a+10a] [20a] * [00b] [10b] [20b] * [01a] [11a] [11a+21a] @@ -596,20 +578,20 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, * [22a+02a] [12a] [22a] * [02b] [12b] [22b] * - * Step 1: rank r sends block (r-1)a to rank (r+1) and receives bloc + * Step 1: rank r sends block (r-1)a to rank (r+1) and receives bloc * (r-2)a from rank (r-1) [with wraparound]. 
- * # 0 1 2 + * # 0 1 2 * [00a] [00a+10a] [00a+10a+20a] * [00b] [10b] [20b] * [11a+21a+01a] [11a] [11a+21a] * [01b] [11b] [21b] * [22a+02a] [22a+02a+12a] [22a] - * [02b] [12b] [22b] + * [02b] [12b] [22b] * * COMPUTATION PHASE 1 (b) - * Step 0: rank r sends block rb to rank (r+1) and receives bloc (r-1)b + * Step 0: rank r sends block rb to rank (r+1) and receives bloc (r-1)b * from rank (r-1) [with wraparound]. - * # 0 1 2 + * # 0 1 2 * [00a] [00a+10a] [20a] * [00b] [00b+10b] [20b] * [01a] [11a] [11a+21a] @@ -617,31 +599,31 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, * [22a+02a] [12a] [22a] * [22b+02b] [12b] [22b] * - * Step 1: rank r sends block (r-1)b to rank (r+1) and receives bloc + * Step 1: rank r sends block (r-1)b to rank (r+1) and receives bloc * (r-2)b from rank (r-1) [with wraparound]. - * # 0 1 2 + * # 0 1 2 * [00a] [00a+10a] [00a+10a+20a] * [00b] [10b] [0bb+10b+20b] * [11a+21a+01a] [11a] [11a+21a] * [11b+21b+01b] [11b] [21b] * [22a+02a] [22a+02a+12a] [22a] - * [02b] [22b+01b+12b] [22b] + * [02b] [22b+01b+12b] [22b] + * * - * * DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1 (same as * in regular ring algorithm. * */ -int -ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count, +int +ompi_coll_base_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, - uint32_t segsize) + uint32_t segsize) { int ret, line, rank, size, k, recv_from, send_to; - int early_blockcount, late_blockcount, split_rank; + int early_blockcount, late_blockcount, split_rank; int segcount, max_segcount, num_phases, phase, block_count, inbi; size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; @@ -652,9 +634,9 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count)); - + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:allreduce_intra_ring_segmented rank %d, count %d", rank, count)); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -672,34 +654,34 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count ret = ompi_datatype_type_size( dtype, &typelng); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } segcount = count; - COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount) + COLL_BASE_COMPUTED_SEGCOUNT(segsize, typelng, segcount) /* Special case for count less than size * segcount - use regular ring */ if (count < (size * segcount)) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count)); - return (ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count)); + return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module)); } /* Determine the number of phases of the algorithm */ num_phases = count / (size * segcount); - if ((count % (size * segcount) >= size) && + if ((count % (size * segcount) >= size) && (count % (size * segcount) > ((size * segcount) / 2))) { num_phases++; } - /* Determine the number of 
elements per block and corresponding + /* Determine the number of elements per block and corresponding block sizes. The blocks are divided into "early" and "late" ones: - blocks 0 .. (split_rank - 1) are "early" and + blocks 0 .. (split_rank - 1) are "early" and blocks (split_rank) .. (size - 1) are "late". Early blocks are at most 1 element larger than the late ones. Note, these blocks will be split into num_phases segments, out of the largest one will have max_segcount elements. */ - COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, + COLL_BASE_COMPUTE_BLOCKCOUNT( count, size, split_rank, early_blockcount, late_blockcount ); - COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi, + COLL_BASE_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi, max_segcount, k); max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent; @@ -722,7 +704,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count ptrdiff_t phase_offset; int early_phase_segcount, late_phase_segcount, split_phase, phase_count; - /* + /* For each of the remote nodes: - post irecv for block (r-1) - send block (r) @@ -741,7 +723,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count */ send_to = (rank + 1) % size; recv_from = (rank + size - 1) % size; - + inbi = 0; /* Initialize first receive from the neighbor on the left */ ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, @@ -750,81 +732,81 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count /* Send first block (my block) to the neighbor on the right: - compute my block and phase offset - send data */ - block_offset = ((rank < split_rank)? - ((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) : + block_offset = ((rank < split_rank)? + ((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) : ((ptrdiff_t)rank * (ptrdiff_t)late_blockcount + split_rank)); block_count = ((rank < split_rank)? early_blockcount : late_blockcount); - COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, + COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, early_phase_segcount, late_phase_segcount) phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount)); phase_offset = ((phase < split_phase)? - ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : + ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); tmpsend = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; ret = MCA_PML_CALL(send(tmpsend, phase_count, dtype, send_to, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - + for (k = 2; k < size; k++) { const int prevblock = (rank + size - k + 1) % size; - + inbi = inbi ^ 0x1; - + /* Post irecv for the current block */ ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, - MCA_COLL_BASE_TAG_ALLREDUCE, comm, + MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - + /* Wait on previous block to arrive */ ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - + /* Apply operation on previous block: result goes to rbuf rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] */ block_offset = ((prevblock < split_rank)? 
((ptrdiff_t)prevblock * (ptrdiff_t)early_blockcount) : ((ptrdiff_t)prevblock * (ptrdiff_t)late_blockcount + split_rank)); - block_count = ((prevblock < split_rank)? + block_count = ((prevblock < split_rank)? early_blockcount : late_blockcount); - COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, + COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, early_phase_segcount, late_phase_segcount) phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount)); phase_offset = ((phase < split_phase)? - ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : + ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype); - + /* send previous block to send_to */ ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } } - + /* Wait on the last block to arrive */ ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - - /* Apply operation on the last block (from neighbor (rank + 1) + + /* Apply operation on the last block (from neighbor (rank + 1) rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */ recv_from = (rank + 1) % size; block_offset = ((recv_from < split_rank)? ((ptrdiff_t)recv_from * (ptrdiff_t)early_blockcount) : ((ptrdiff_t)recv_from * (ptrdiff_t)late_blockcount + split_rank)); - block_count = ((recv_from < split_rank)? + block_count = ((recv_from < split_rank)? early_blockcount : late_blockcount); - COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, + COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, early_phase_segcount, late_phase_segcount) phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount)); phase_offset = ((phase < split_phase)? - ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : + ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype); @@ -836,21 +818,21 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count for (k = 0; k < size - 1; k++) { const int recv_data_from = (rank + size - k) % size; const int send_data_from = (rank + 1 + size - k) % size; - const int send_block_offset = + const int send_block_offset = ((send_data_from < split_rank)? ((ptrdiff_t)send_data_from * (ptrdiff_t)early_blockcount) : ((ptrdiff_t)send_data_from * (ptrdiff_t)late_blockcount + split_rank)); - const int recv_block_offset = + const int recv_block_offset = ((recv_data_from < split_rank)? ((ptrdiff_t)recv_data_from * (ptrdiff_t)early_blockcount) : ((ptrdiff_t)recv_data_from * (ptrdiff_t)late_blockcount + split_rank)); - block_count = ((send_data_from < split_rank)? + block_count = ((send_data_from < split_rank)? 
early_blockcount : late_blockcount); tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent; tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent; - ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to, + ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to, MCA_COLL_BASE_TAG_ALLREDUCE, tmprecv, early_blockcount, dtype, recv_from, MCA_COLL_BASE_TAG_ALLREDUCE, @@ -865,7 +847,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count return MPI_SUCCESS; error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); if (NULL != inbuf[0]) free(inbuf[0]); if (NULL != inbuf[1]) free(inbuf[1]); @@ -875,8 +857,8 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count /* * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple implementations - * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * but for some small number of nodes and/or small data sizes they + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. in V2 we will handle this differently and so will not @@ -895,7 +877,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count * Returns: - MPI_SUCCESS or error code */ int -ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count, +ompi_coll_base_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, @@ -905,158 +887,28 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count, rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_basic_linear rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_basic_linear rank %d", rank)); /* Reduce to 0 and broadcast. 
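Both the nonoverlapping allreduce earlier in this file and this basic linear version share the same reduce-then-broadcast shape. In public-API terms the whole function is equivalent to the following sketch (illustrative only; the patched code dispatches to the renamed linear reduce and bcast routines and handles MPI_IN_PLACE explicitly):

    #include <mpi.h>

    /* Allreduce as reduce-to-root-0 followed by broadcast (sketch). MPI_Reduce
     * accepts MPI_IN_PLACE at the root, which mirrors the in-place branch
     * handled just below. */
    static int allreduce_linear_sketch(const void *sbuf, void *rbuf, int count,
                                       MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
    {
        int err = MPI_Reduce(sbuf, rbuf, count, dtype, op, 0, comm);
        if (MPI_SUCCESS != err) return err;
        return MPI_Bcast(rbuf, count, dtype, 0, comm);
    }
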
*/ if (MPI_IN_PLACE == sbuf) { if (0 == rank) { - err = ompi_coll_tuned_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype, + err = ompi_coll_base_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype, op, 0, comm, module); } else { - err = ompi_coll_tuned_reduce_intra_basic_linear(rbuf, NULL, count, dtype, + err = ompi_coll_base_reduce_intra_basic_linear(rbuf, NULL, count, dtype, op, 0, comm, module); } } else { - err = ompi_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype, + err = ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, 0, comm, module); } if (MPI_SUCCESS != err) { return err; } - return ompi_coll_tuned_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module); + return ompi_coll_base_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module); } /* copied function (with appropriate renaming) ends here */ - -/* The following are used by dynamic and forced rules */ - -/* publish details of each algorithm and if its forced/fixed/locked in */ -/* as you add methods/algorithms you must update this and the query/map routines */ - -/* this routine is called by the component only */ -/* this makes sure that the mca parameters are set to their initial values and perms */ -/* module does not call this they call the forced_getvalues routine instead */ - -int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = coll_tuned_allreduce_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allreduce_algorithm_count", - "Number of allreduce algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_allreduce_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_allreduce_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_allreduce_algorithms", allreduce_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allreduce_algorithm", - "Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allreduce_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_allreduce_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allreduce_algorithm_segmentsize", - "Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 
0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allreduce_segment_size); - - coll_tuned_allreduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allreduce_algorithm_tree_fanout", - "Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allreduce_tree_fanout); - - coll_tuned_allreduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "allreduce_algorithm_chain_fanout", - "Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_allreduce_chain_fanout); - - return (MPI_SUCCESS); -} - - -int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d, segment size %d", - data->user_forced[ALLREDUCE].algorithm, - data->user_forced[ALLREDUCE].segsize)); - - switch (data->user_forced[ALLREDUCE].algorithm) { - case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm, module); - case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm, module); - case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm, module); - case (3): return ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, count, dtype, op, comm, module); - case (4): return ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, op, comm, module); - case (5): return ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, count, dtype, op, comm, module, data->user_forced[ALLREDUCE].segsize); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[ALLREDUCE].algorithm, - ompi_coll_tuned_forced_max_algorithms[ALLREDUCE])); - return (MPI_ERR_ARG); - } /* switch */ - -} - - -int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm, module); - case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm, module); - case (2): return 
ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm, module); - case (3): return ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, count, dtype, op, comm, module); - case (4): return ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, op, comm, module); - case (5): return ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, count, dtype, op, comm, module, segsize); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE])); - return (MPI_ERR_ARG); - } /* switch */ - -} - diff --git a/ompi/mca/coll/base/coll_base_alltoall.c b/ompi/mca/coll/base/coll_base_alltoall.c index 3bd1ecfa04..fe71c5345f 100644 --- a/ompi/mca/coll/base/coll_base_alltoall.c +++ b/ompi/mca/coll/base/coll_base_alltoall.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2012 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -30,37 +30,18 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" - -/* alltoall algorithm variables */ -static int coll_tuned_alltoall_algorithm_count = 5; -static int coll_tuned_alltoall_forced_algorithm = 0; -static int coll_tuned_alltoall_segment_size = 0; -static int coll_tuned_alltoall_max_requests; -static int coll_tuned_alltoall_tree_fanout; -static int coll_tuned_alltoall_chain_fanout; - -/* valid values for coll_tuned_alltoall_forced_algorithm */ -static mca_base_var_enum_value_t alltoall_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "pairwise"}, - {3, "modified_bruck"}, - {4, "linear_sync"}, - {5, "two_proc"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" /* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */ static int -mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount, +mca_coll_base_alltoall_intra_basic_inplace(void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; MPI_Request *preq; char *tmp_buffer; @@ -91,7 +72,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount, for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. */ - preq = tuned_module->tuned_data->mcct_reqs; + preq = base_module->base_data->mcct_reqs; if (i == rank) { /* Copy the data into the temporary buffer */ @@ -128,11 +109,8 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount, } /* Wait for the requests to complete */ - err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE); + err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } - - /* Free the requests. 
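*/

/* A hypothetical standalone sketch of the in-place block-exchange pattern
 * implemented above, using MPI_Sendrecv_replace instead of the temporary
 * buffer and request pair the in-tree code manages. Assumes a zero lower
 * bound on rdtype; tag 0 stands in for the real collective tag. */
#include <mpi.h>

static int alltoall_inplace_sketch(void *rbuf, int rcount,
                                   MPI_Datatype rdtype, MPI_Comm comm)
{
    int rank, size, peer, err = MPI_SUCCESS;
    MPI_Aint lb, ext;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    MPI_Type_get_extent(rdtype, &lb, &ext);
    for (peer = 0; peer < size; peer++) {
        if (peer == rank) continue;   /* my own block stays in place */
        /* Swap the block I hold for 'peer' with the block 'peer' holds for me. */
        err = MPI_Sendrecv_replace((char *)rbuf + (MPI_Aint)peer * rcount * ext,
                                   rcount, rdtype, peer, 0, peer, 0,
                                   comm, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) break;
    }
    return err;
}

/* End of sketch.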
*/ - mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2); } } @@ -145,7 +123,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount, return err; } -int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount, +int ompi_coll_base_alltoall_intra_pairwise(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -157,22 +135,22 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount, ptrdiff_t lb, sext, rext; if (MPI_IN_PLACE == sbuf) { - return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, + return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:alltoall_intra_pairwise rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:alltoall_intra_pairwise rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &lb, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } - + /* Perform pairwise exchange - starting from 1 so the local copy is last */ for (step = 1; step < size + 1; step++) { @@ -185,25 +163,25 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount, tmprecv = (char*)rbuf + (ptrdiff_t)recvfrom * rext * (ptrdiff_t)rcount; /* send and receive */ - err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto, + err = ompi_coll_base_sendrecv( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL, - tmprecv, rcount, rdtype, recvfrom, + tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } return MPI_SUCCESS; - + err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } -int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, +int ompi_coll_base_alltoall_intra_bruck(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -216,20 +194,20 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, ptrdiff_t rlb, slb, tlb, sext, rext, tsext; struct ompi_datatype_t *new_ddt; #ifdef blahblah - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; #endif if (MPI_IN_PLACE == sbuf) { - return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, + return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:alltoall_intra_bruck rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:alltoall_intra_bruck rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } @@ -242,14 +220,14 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, #ifdef blahblah - /* try and SAVE memory by using the data segment hung off + /* try 
and SAVE memory by using the data segment hung off the communicator if possible */ - if (data->mcct_num_reqs >= size) { + if (data->mcct_num_reqs >= size) { /* we have enought preallocated for displments and lengths */ displs = (int*) data->mcct_reqs; blen = (int *) (displs + size); weallocated = 0; - } + } else { /* allocate the buffers ourself */ #endif displs = (int *) malloc(size * sizeof(int)); @@ -267,9 +245,9 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, tmpbuf = tmpbuf_free - slb; /* Step 1 - local rotation - shift up by rank */ - err = ompi_datatype_copy_content_same_ddt (sdtype, + err = ompi_datatype_copy_content_same_ddt (sdtype, (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount), - tmpbuf, + tmpbuf, ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext); if (err<0) { line = __LINE__; err = -1; goto err_hndl; @@ -277,7 +255,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, if (rank != 0) { err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount, - tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, + tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, (char*) sbuf); if (err<0) { line = __LINE__; err = -1; goto err_hndl; @@ -294,7 +272,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, /* create indexed datatype */ for (i = 1; i < size; i++) { if (( i & distance) == distance) { - displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount; + displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount; blen[k] = scount; k++; } @@ -307,7 +285,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Sendreceive */ - err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto, + err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto, MCA_COLL_BASE_TAG_ALLTOALL, rbuf, 1, new_ddt, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL, @@ -327,7 +305,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, for (i = 0; i < size; i++) { err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount, - ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), + ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } } @@ -341,8 +319,8 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); if (tmpbuf != NULL) free(tmpbuf_free); if (displs != NULL) free(displs); @@ -352,10 +330,10 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, /* * alltoall_intra_linear_sync - * + * * Function: Linear implementation of alltoall with limited number * of outstanding requests. - * Accepts: Same as MPI_Alltoall(), and the maximum number of + * Accepts: Same as MPI_Alltoall(), and the maximum number of * outstanding requests (actual number is 2 * max, since * we count receive and send requests separately). * Returns: MPI_SUCCESS or error code @@ -367,7 +345,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, * - wait for any request to complete * - replace that request by the new one of the same type. 
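*/

/* A hypothetical standalone sketch of the request-replacement loop described
 * above, reduced to receives only: keep at most 'window' requests in flight
 * and, as each one completes, reuse its slot for the next. The window cap
 * and tag 0 are assumptions of the sketch. */
#include <mpi.h>

#define SKETCH_MAX_WINDOW 64

static int windowed_recvs(char *rbuf, int blockcount, MPI_Datatype dtype,
                          int nblocks, int window, MPI_Comm comm)
{
    MPI_Request reqs[SKETCH_MAX_WINDOW];
    MPI_Aint lb, ext;
    int posted, done = 0, idx, err;

    if (window > nblocks) window = nblocks;
    if (window > SKETCH_MAX_WINDOW) window = SKETCH_MAX_WINDOW;
    MPI_Type_get_extent(dtype, &lb, &ext);

    for (posted = 0; posted < window; posted++) {   /* first batch */
        err = MPI_Irecv(rbuf + (MPI_Aint)posted * blockcount * ext, blockcount,
                        dtype, MPI_ANY_SOURCE, 0, comm, &reqs[posted]);
        if (MPI_SUCCESS != err) return err;
    }
    while (done < nblocks) {
        err = MPI_Waitany(window, reqs, &idx, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) return err;
        done++;
        if (posted < nblocks) {   /* replace the completed slot */
            err = MPI_Irecv(rbuf + (MPI_Aint)posted * blockcount * ext,
                            blockcount, dtype, MPI_ANY_SOURCE, 0, comm,
                            &reqs[idx]);
            if (MPI_SUCCESS != err) return err;
            posted++;
        }
    }
    return MPI_SUCCESS;
}

/* End of sketch; the real linear_sync below interleaves sends and receives
 * the same way.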
*/ -int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, +int ompi_coll_base_alltoall_intra_linear_sync(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -382,7 +360,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, ompi_request_t **reqs = NULL; if (MPI_IN_PLACE == sbuf) { - return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, + return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } @@ -391,8 +369,8 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_alltoall_intra_linear_sync rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_alltoall_intra_linear_sync rank %d", rank)); error = ompi_datatype_get_extent(sdtype, &slb, &sext); if (OMPI_SUCCESS != error) { @@ -423,18 +401,18 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, } /* Initiate send/recv to/from others. */ - total_reqs = (((max_outstanding_reqs > (size - 1)) || + total_reqs = (((max_outstanding_reqs > (size - 1)) || (max_outstanding_reqs <= 0)) ? (size - 1) : (max_outstanding_reqs)); - reqs = (ompi_request_t**) malloc( 2 * total_reqs * + reqs = (ompi_request_t**) malloc( 2 * total_reqs * sizeof(ompi_request_t*)); if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; } - + prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post first batch or ireceive and isend requests */ - for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs; + for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs; ri = (ri + 1) % size, ++nreqs, ++nrreqs) { error = MCA_PML_CALL(irecv @@ -442,7 +420,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } - for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs; + for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs; si = (si + size - 1) % size, ++nreqs, ++nsreqs) { error = MCA_PML_CALL(isend @@ -457,12 +435,12 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, /* Optimization for the case when all requests have been posted */ error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } - + } else { /* As requests complete, replace them with corresponding requests: - - wait for any request to complete, mark the request as + - wait for any request to complete, mark the request as MPI_REQUEST_NULL - - If it was a receive request, replace it with new irecv request + - If it was a receive request, replace it with new irecv request (if any) - if it was a send request, replace it with new isend request (if any) */ @@ -476,10 +454,10 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, ncreqs++; if (completed < total_reqs) { if (nrreqs < (size - 1)) { - error = + error = MCA_PML_CALL(irecv (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, - MCA_COLL_BASE_TAG_ALLTOALL, comm, + MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[completed])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } ++nrreqs; @@ -493,7 +471,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, MCA_PML_BASE_SEND_STANDARD, comm, 
&reqs[completed])); ++nsreqs; - si = (si + size - 1) % size; + si = (si + size - 1) % size; } } } @@ -506,15 +484,15 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, return MPI_SUCCESS; error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error, + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error, rank)); if (NULL != reqs) free(reqs); return error; } -int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, +int ompi_coll_base_alltoall_intra_two_procs(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -526,14 +504,14 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, ptrdiff_t sext, rext, lb; if (MPI_IN_PLACE == sbuf) { - return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, + return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_alltoall_intra_two_procs rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_alltoall_intra_two_procs rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } @@ -548,17 +526,17 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, tmprecv = (char*)rbuf + (ptrdiff_t)remote * rext * (ptrdiff_t)rcount; /* send and receive */ - err = ompi_coll_tuned_sendrecv ( tmpsend, scount, sdtype, remote, + err = ompi_coll_base_sendrecv ( tmpsend, scount, sdtype, remote, MCA_COLL_BASE_TAG_ALLTOALL, - tmprecv, rcount, rdtype, remote, + tmprecv, rcount, rdtype, remote, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank ); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* ddt sendrecv your own data */ - err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount, - (int32_t) scount, sdtype, - (char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount, + err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount, + (int32_t) scount, sdtype, + (char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount, (int32_t) rcount, rdtype); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } @@ -566,7 +544,7 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, return MPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; @@ -577,8 +555,8 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, /* * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple implementations - * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * but for some small number of nodes and/or small data sizes they + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. 
in V2 we will handle this differently and so will not @@ -588,7 +566,7 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, /* copied function (with appropriate renaming) starts here */ -int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount, +int ompi_coll_base_alltoall_intra_basic_linear(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -599,11 +577,11 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount, char *psnd, *prcv; MPI_Aint lb, sndinc, rcvinc; ompi_request_t **req, **sreq, **rreq; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; if (MPI_IN_PLACE == sbuf) { - return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, + return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } @@ -612,8 +590,8 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_alltoall_intra_basic_linear rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_alltoall_intra_basic_linear rank %d", rank)); err = ompi_datatype_get_extent(sdtype, &lb, &sndinc); @@ -654,23 +632,23 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount, /* Post all receives first -- a simple optimization */ - for (nreqs = 0, i = (rank + 1) % size; i != rank; + for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++rreq, ++nreqs) { err = MCA_PML_CALL(irecv_init (prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq)); if (MPI_SUCCESS != err) { - ompi_coll_tuned_free_reqs(req, rreq - req); + ompi_coll_base_free_reqs(req, rreq - req); return err; } } - /* Now post all sends in reverse order + /* Now post all sends in reverse order - We would like to minimize the search time through message queue when messages actually arrive in the order in which they were posted. 
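*/

/* A small hypothetical illustration of the posting order used here: receives
 * ascend from rank+1 while sends descend from rank-1, so the first receive a
 * rank posts (from rank+1) pairs with the first send its neighbour posts
 * (rank+1's first send targets rank). The function name is invented. */
#include <stdio.h>

static void show_post_order(int rank, int size)
{
    int i;

    printf("rank %d posts irecv from:", rank);
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size)
        printf(" %d", i);
    printf("\nrank %d posts isend to:  ", rank);
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size)
        printf(" %d", i);
    printf("\n");
}

/* For size = 4, rank 1 receives from 2 3 0 and sends to 0 3 2; its first
 * receive (from 2) matches rank 2's first send (to 1).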
*/ - for (nreqs = 0, i = (rank + size - 1) % size; i != rank; + for (nreqs = 0, i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size, ++sreq, ++nreqs) { err = MCA_PML_CALL(isend_init @@ -678,7 +656,7 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, sreq)); if (MPI_SUCCESS != err) { - ompi_coll_tuned_free_reqs(req, sreq - req); + ompi_coll_base_free_reqs(req, sreq - req); return err; } } @@ -698,165 +676,10 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount, err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE); /* Free the reqs */ - - ompi_coll_tuned_free_reqs(req, nreqs); + ompi_coll_base_free_reqs(req, nreqs); /* All done */ - return err; } /* copied function (with appropriate renaming) ends here */ - -/* The following are used by dynamic and forced rules */ - -/* publish details of each algorithm and if its forced/fixed/locked in */ -/* as you add methods/algorithms you must update this and the query/map routines */ - -/* this routine is called by the component only */ -/* this makes sure that the mca parameters are set to their initial values and perms */ -/* module does not call this they call the forced_getvalues routine instead */ - -int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t*new_enum; - - ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = coll_tuned_alltoall_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "alltoall_algorithm_count", - "Number of alltoall algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_alltoall_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_alltoall_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_alltoall_algorithms", alltoall_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "alltoall_algorithm", - "Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: linear with sync, 5:two proc only.", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_alltoall_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_alltoall_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "alltoall_algorithm_segmentsize", - "Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_alltoall_segment_size); - - coll_tuned_alltoall_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "alltoall_algorithm_tree_fanout", - "Fanout for n-tree used for alltoall algorithms. 
Only has meaning if algorithm is forced and supports n-tree topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_alltoall_tree_fanout); - - coll_tuned_alltoall_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "alltoall_algorithm_chain_fanout", - "Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_alltoall_chain_fanout); - - coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */ - mca_param_indices->max_requests_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "alltoall_algorithm_max_requests", - "Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_alltoall_max_requests); - if (mca_param_indices->max_requests_param_index < 0) { - return mca_param_indices->max_requests_param_index; - } - - if (coll_tuned_alltoall_max_requests < 0) { - if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { - opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to system level default %d \n", - ompi_coll_tuned_init_max_requests ); - } - coll_tuned_alltoall_max_requests = 0; - } - - return (MPI_SUCCESS); -} - - - -int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d", - data->user_forced[ALLTOALL].algorithm)); - - switch (data->user_forced[ALLTOALL].algorithm) { - case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - case (4): return ompi_coll_tuned_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, data->user_forced[ALLTOALL].max_requests); - case (5): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL])); - return (MPI_ERR_ARG); - } /* switch */ - -} - - -int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int 
faninout, int segsize, - int max_requests) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - case (4): return ompi_coll_tuned_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, max_requests); - case (5): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL])); - return (MPI_ERR_ARG); - } /* switch */ - -} - diff --git a/ompi/mca/coll/base/coll_base_alltoallv.c b/ompi/mca/coll/base/coll_base_alltoallv.c index 412fb4366c..19f71674dd 100644 --- a/ompi/mca/coll/base/coll_base_alltoallv.c +++ b/ompi/mca/coll/base/coll_base_alltoallv.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2012 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -32,29 +32,17 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" - -/* alltoallv algorithm variables */ -static int coll_tuned_alltoallv_algorithm_count = 2; -static int coll_tuned_alltoallv_forced_algorithm = 0; - -/* valid values for coll_tuned_alltoallv_forced_algorithm */ -static mca_base_var_enum_value_t alltoallv_algorithms[] = { - {0, "ignore"}, - {1, "basic_linear"}, - {2, "pairwise"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" static int -mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps, +mca_coll_base_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; MPI_Request *preq; char *tmp_buffer; @@ -90,7 +78,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. 
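*/

/* A hypothetical standalone sketch of the exchange performed below for one
 * (i, j) pair, from one rank's point of view: stage the outgoing block in a
 * scratch buffer, receive the peer's block into the original slot, and send
 * the staged copy. Assumes the block is contiguous and nbytes long; tag 0
 * is arbitrary for the sketch. */
#include <string.h>
#include <mpi.h>

static int swap_one_block(char *myblock, char *scratch, size_t nbytes,
                          int count, MPI_Datatype dtype, int peer,
                          MPI_Comm comm)
{
    MPI_Request reqs[2];
    int err;

    memcpy(scratch, myblock, nbytes);           /* stage the outgoing block */
    err = MPI_Irecv(myblock, count, dtype, peer, 0, comm, &reqs[0]);
    if (MPI_SUCCESS != err) return err;
    err = MPI_Isend(scratch, count, dtype, peer, 0, comm, &reqs[1]);
    if (MPI_SUCCESS != err) return err;
    return MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
}

/* End of sketch.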
*/ - preq = tuned_module->tuned_data->mcct_reqs; + preq = base_module->base_data->mcct_reqs; if (i == rank && rcounts[j]) { /* Copy the data into the temporary buffer */ @@ -127,11 +115,8 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con } /* Wait for the requests to complete */ - err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE); + err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } - - /* Free the requests. */ - mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2); } } @@ -145,7 +130,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con } int -ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps, +ompi_coll_base_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void* rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, @@ -157,15 +142,15 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps, ptrdiff_t sext, rext; if (MPI_IN_PLACE == sbuf) { - return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps, + return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:alltoallv_intra_pairwise rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:alltoallv_intra_pairwise rank %d", rank)); ompi_datatype_type_extent(sdtype, &sext); ompi_datatype_type_extent(rdtype, &rext); @@ -182,34 +167,33 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps, prcv = (char*)rbuf + (ptrdiff_t)rdisps[recvfrom] * rext; /* send and receive */ - err = ompi_coll_tuned_sendrecv( psnd, scounts[sendto], sdtype, sendto, + err = ompi_coll_base_sendrecv( psnd, scounts[sendto], sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALLV, - prcv, rcounts[recvfrom], rdtype, recvfrom, + prcv, rcounts[recvfrom], rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALLV, comm, MPI_STATUS_IGNORE, rank); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } return MPI_SUCCESS; - + err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "%s:%4d\tError occurred %d, rank %2d at step %d", __FILE__, line, + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "%s:%4d\tError occurred %d, rank %2d at step %d", __FILE__, line, err, rank, step)); return err; } -/* +/** * Linear functions are copied from the basic coll module. For * some small number of nodes and/or small data sizes they are just as - * fast as tuned/tree based segmenting operations and as such may be + * fast as base/tree based segmenting operations and as such may be * selected by the decision functions. These are copied into this module * due to the way we select modules in V1. i.e. in V2 we will handle this - * differently and so will not have to duplicate code. - * GEF Oct05 after asking Jeff. + * differently and so will not have to duplicate code. 
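*/

/* A hypothetical MPI-API-level sketch of the linear algorithm that follows:
 * post a nonblocking receive and send for every peer (self included, which
 * MPI permits) and wait for everything at once. Displacements are in units
 * of the datatype extent, as in MPI_Alltoallv; error paths are simplified. */
#include <stdlib.h>
#include <mpi.h>

static int alltoallv_linear_sketch(const void *sbuf, const int scounts[],
                                   const int sdispls[], MPI_Datatype sdtype,
                                   void *rbuf, const int rcounts[],
                                   const int rdispls[], MPI_Datatype rdtype,
                                   MPI_Comm comm)
{
    int i, size, nreqs = 0, err = MPI_SUCCESS;
    MPI_Aint lb, sext, rext;
    MPI_Request *reqs;

    MPI_Comm_size(comm, &size);
    MPI_Type_get_extent(sdtype, &lb, &sext);
    MPI_Type_get_extent(rdtype, &lb, &rext);
    reqs = (MPI_Request *)malloc(2 * size * sizeof(MPI_Request));
    if (NULL == reqs) return MPI_ERR_OTHER;

    for (i = 0; i < size && MPI_SUCCESS == err; i++)   /* receives first */
        err = MPI_Irecv((char *)rbuf + rdispls[i] * rext, rcounts[i], rdtype,
                        i, 0, comm, &reqs[nreqs++]);
    for (i = 0; i < size && MPI_SUCCESS == err; i++)
        err = MPI_Isend((const char *)sbuf + sdispls[i] * sext, scounts[i],
                        sdtype, i, 0, comm, &reqs[nreqs++]);
    if (MPI_SUCCESS == err)
        err = MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
    return err;
}

/* End of sketch; the in-tree version below additionally handles MPI_IN_PLACE
 * and reuses the requests preallocated on the communicator.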
*/ int -ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps, +ompi_coll_base_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, @@ -220,19 +204,19 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis char *psnd, *prcv; ptrdiff_t sext, rext; MPI_Request *preq; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; if (MPI_IN_PLACE == sbuf) { - return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps, + return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:alltoallv_intra_basic_linear rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:alltoallv_intra_basic_linear rank %d", rank)); ompi_datatype_type_extent(sdtype, &sext); ompi_datatype_type_extent(rdtype, &rext); @@ -269,7 +253,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis preq++)); ++nreqs; if (MPI_SUCCESS != err) { - ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); + ompi_coll_base_free_reqs(data->mcct_reqs, nreqs); return err; } } @@ -287,7 +271,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis preq++)); ++nreqs; if (MPI_SUCCESS != err) { - ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); + ompi_coll_base_free_reqs(data->mcct_reqs, nreqs); return err; } } @@ -305,128 +289,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis MPI_STATUSES_IGNORE); /* Free the requests. */ - ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); + ompi_coll_base_free_reqs(data->mcct_reqs, nreqs); return err; } - -/* - * The following are used by dynamic and forced rules. Publish - * details of each algorithm and if its forced/fixed/locked in as you add - * methods/algorithms you must update this and the query/map routines. - * This routine is called by the component only. This makes sure that - * the mca parameters are set to their initial values and perms. - * Module does not call this. They call the forced_getvalues routine - * instead. - */ -int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t - *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[ALLTOALLV] = coll_tuned_alltoallv_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "alltoallv_algorithm_count", - "Number of alltoallv algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_alltoallv_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_alltoallv_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_alltoallv_algorithms", alltoallv_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "alltoallv_algorithm", - "Which alltoallv algorithm is used. 
" - "Can be locked down to choice of: 0 ignore, " - "1 basic linear, 2 pairwise.", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_alltoallv_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - return (MPI_SUCCESS); -} - - - -int ompi_coll_tuned_alltoallv_intra_do_forced(void *sbuf, int *scounts, int *sdisps, - struct ompi_datatype_t *sdtype, - void* rbuf, int *rcounts, int *rdisps, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:alltoallv_intra_do_forced selected algorithm %d", - data->user_forced[ALLTOALLV].algorithm)); - - switch (data->user_forced[ALLTOALLV].algorithm) { - case (0): - return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); - case (1): - return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); - case (2): - return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:alltoallv_intra_do_forced attempt to " - "select algorithm %d when only 0-%d is valid.", - data->user_forced[ALLTOALLV].algorithm, - ompi_coll_tuned_forced_max_algorithms[ALLTOALLV])); - return (MPI_ERR_ARG); - } -} - -/* If the user selects dynamic rules and specifies the algorithm to - * use, then this function is called. */ -int ompi_coll_tuned_alltoallv_intra_do_this(void *sbuf, int *scounts, int *sdisps, - struct ompi_datatype_t *sdtype, - void* rbuf, int *rcounts, int *rdisps, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:alltoallv_intra_do_this selected algorithm %d ", - algorithm)); - - switch (algorithm) { - case (0): - return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); - case (1): - return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); - case (2): - return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:alltoall_intra_do_this attempt to select " - "algorithm %d when only 0-%d is valid.", - algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALLV])); - return (MPI_ERR_ARG); - } -} diff --git a/ompi/mca/coll/base/coll_base_barrier.c b/ompi/mca/coll/base/coll_base_barrier.c index ca9d143f62..15b3f4883d 100644 --- a/ompi/mca/coll/base/coll_base_barrier.c +++ b/ompi/mca/coll/base/coll_base_barrier.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -31,25 +31,9 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" - -/* barrier algorithm variables */ -static int coll_tuned_barrier_algorithm_count = 6; -static int coll_tuned_barrier_forced_algorithm = 0; - -/* valid values for coll_tuned_barrier_forced_algorithm */ -static mca_base_var_enum_value_t barrier_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "double_ring"}, - {3, "recursive_doubling"}, - {4, "bruck"}, - {5, "two_proc"}, - {6, "tree"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" /** * A quick version of the MPI_Sendreceive implemented for the barrier. @@ -57,7 +41,7 @@ static mca_base_var_enum_value_t barrier_algorithms[] = { * signal a two peer synchronization. */ static inline int -ompi_coll_tuned_sendrecv_zero(int dest, int stag, +ompi_coll_base_sendrecv_zero(int dest, int stag, int source, int rtag, MPI_Comm comm) @@ -87,8 +71,8 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag, err_index = 1; } err = statuses[err_index].MPI_ERROR; - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s" - " stage of ompi_coll_tuned_sendrecv_zero\n", + OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s" + " stage of ompi_coll_base_sendrecv_zero\n", __FILE__, line, err, (0 == err_index ? "receive" : "send"))); return err; } @@ -100,21 +84,21 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag, /* Error discovered during the posting of the irecv or isend, * and no status is available. */ - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n", + OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n", __FILE__, line, err)); return err; } /* - * Barrier is ment to be a synchronous operation, as some BTLs can mark - * a request done before its passed to the NIC and progress might not be made - * elsewhere we cannot allow a process to exit the barrier until its last + * Barrier is meant to be a synchronous operation: as some BTLs can mark + * a request done before it is passed to the NIC, and progress might not be made + * elsewhere, we cannot allow a process to exit the barrier until its last * [round of] sends are completed.
* - * It is last round of sends rather than 'last' individual send as each pair of - * peers can use different channels/devices/btls and the receiver of one of + * It is the last round of sends, rather than the 'last' individual send, as each pair of + * peers can use different channels/devices/BTLs, and the receiver of one of these sends might be forced to wait as the sender - * leaves the collective and does not make progress until the next mpi call + * leaves the collective and does not make progress until the next MPI call * */ @@ -124,7 +108,7 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag, * synchronous guarantee made by the last ring of sends, which are synchronous * */ -int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm, +int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, size, err = 0, line = 0, left, right; @@ -132,50 +116,50 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm, rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_barrier_intra_doublering rank %d", rank)); - + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank)); + left = ((rank-1)%size); right = ((rank+1)%size); if (rank > 0) { /* receive message from the left */ - err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, - MCA_COLL_BASE_TAG_BARRIER, comm, + err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, + MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* Send message to the right */ - err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right, - MCA_COLL_BASE_TAG_BARRIER, + err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right, + MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* root needs to receive from the last node */ if (rank == 0) { - err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, - MCA_COLL_BASE_TAG_BARRIER, comm, + err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, + MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* Allow nodes to exit */ if (rank > 0) { /* post Receive from left */ - err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, - MCA_COLL_BASE_TAG_BARRIER, comm, + err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, + MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* send message to the right one */ - err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right, - MCA_COLL_BASE_TAG_BARRIER, + err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right, + MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_SYNCHRONOUS, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } - + /* rank 0 post receive from the last node */ if (rank == 0) { - err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, - MCA_COLL_BASE_TAG_BARRIER, comm, + err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left, + MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } @@ -183,7 +167,7 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm, return MPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", +
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } @@ -193,15 +177,15 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm, * To make synchronous, uses sync sends and sync sendrecvs */ -int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm, +int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, size, adjsize, err, line, mask, remote; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_barrier_intra_recursivedoubling rank %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_barrier_intra_recursivedoubling rank %d", rank)); /* do nearest power of 2 less than size calc */ @@ -213,7 +197,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * if (rank >= adjsize) { /* send message to lower ranked node */ remote = rank - adjsize; - err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, + err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, remote, MCA_COLL_BASE_TAG_BARRIER, comm); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} @@ -222,7 +206,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * /* receive message from high level rank */ err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize, - MCA_COLL_BASE_TAG_BARRIER, comm, + MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} @@ -238,7 +222,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * if (remote >= adjsize) continue; /* post receive from the remote node */ - err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, + err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, remote, MCA_COLL_BASE_TAG_BARRIER, comm); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} @@ -250,8 +234,8 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * if (rank < (size - adjsize)) { /* send enter message to higher ranked node */ remote = rank + adjsize; - err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote, - MCA_COLL_BASE_TAG_BARRIER, + err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote, + MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_SYNCHRONOUS, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} @@ -261,7 +245,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * return MPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } @@ -271,23 +255,23 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * * To make synchronous, uses sync sends and sync sendrecvs */ -int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm, +int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, size, distance, to, from, err, line = 0; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_barrier_intra_bruck rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + 
"ompi_coll_base_barrier_intra_bruck rank %d", rank)); /* exchange data with rank-2^k and rank+2^k */ - for (distance = 1; distance < size; distance <<= 1) { + for (distance = 1; distance < size; distance <<= 1) { from = (rank + size - distance) % size; to = (rank + distance) % size; /* send message to lower ranked node */ - err = ompi_coll_tuned_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER, + err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER, from, MCA_COLL_BASE_TAG_BARRIER, comm); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} @@ -296,7 +280,7 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm, return MPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } @@ -306,17 +290,17 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm, * To make synchronous, uses sync sends and sync sendrecvs */ /* special case for two processes */ -int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm, +int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int remote, err; remote = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_barrier_intra_two_procs rank %d", remote)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_barrier_intra_two_procs rank %d", remote)); remote = (remote + 1) & 0x1; - err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, + err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, remote, MCA_COLL_BASE_TAG_BARRIER, comm); return (err); @@ -327,7 +311,7 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm, * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple implementations * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. in V2 we will handle this differently and so will not @@ -337,7 +321,7 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm, /* copied function (with appropriate renaming) starts here */ -static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm, +static int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, err, rank, size; @@ -347,14 +331,14 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t /* All non-root send & receive zero-length message. 
*/ if (rank > 0) { - err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0, + err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { return err; } - err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0, + err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { @@ -370,7 +354,7 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t requests = (ompi_request_t**)malloc( size * sizeof(ompi_request_t*) ); for (i = 1; i < size; ++i) { err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, - MCA_COLL_BASE_TAG_BARRIER, comm, + MCA_COLL_BASE_TAG_BARRIER, comm, &(requests[i]))); if (MPI_SUCCESS != err) { return err; @@ -380,7 +364,7 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t for (i = 1; i < size; ++i) { err = MCA_PML_CALL(isend(NULL, 0, MPI_BYTE, i, - MCA_COLL_BASE_TAG_BARRIER, + MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm, &(requests[i]))); if (MPI_SUCCESS != err) { @@ -400,17 +384,17 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t /* * Another recursive doubling type algorithm, but in this case - * we go up the tree and back down the tree. + * we go up the tree and back down the tree. */ -int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm, +int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, size, depth, err, jump, partner; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_barrier_intra_tree %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_barrier_intra_tree %d", rank)); /* Find the nearest power of 2 of the communicator size. 
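*/

/* A hypothetical sketch of the computation referred to above: the tree
 * barrier rounds the communicator size up to the next power of two, which
 * bounds the depth of the fan-in/fan-out phases below. */
static int next_power_of_two(int size)
{
    int depth = 1;

    while (depth < size)
        depth <<= 1;          /* e.g. size = 6 -> depth = 8 */
    return depth;
}

/* With partner = rank ^ jump for jump = 1, 2, ..., depth/2, each round pairs
 * ranks whose ids differ in exactly one bit.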
*/ @@ -420,21 +404,21 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm, partner = rank ^ jump; if (!(partner & (jump-1)) && partner < size) { if (partner > rank) { - err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner, + err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) return err; } else if (partner < rank) { err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner, - MCA_COLL_BASE_TAG_BARRIER, + MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) return err; } } } - + depth >>= 1; for (jump = depth; jump>0; jump>>=1) { partner = rank ^ jump; @@ -446,7 +430,7 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm, if (MPI_SUCCESS != err) return err; } else if (partner < rank) { - err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner, + err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) @@ -457,101 +441,3 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm, return MPI_SUCCESS; } - - -/* The following are used by dynamic and forced rules */ - -/* publish details of each algorithm and if its forced/fixed/locked in */ -/* as you add methods/algorithms you must update this and the query/map */ -/* routines */ - -/* this routine is called by the component only */ -/* this makes sure that the mca parameters are set to their initial values */ -/* and perms */ -/* module does not call this they call the forced_getvalues routine instead */ - -int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[BARRIER] = coll_tuned_barrier_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "barrier_algorithm_count", - "Number of barrier algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_barrier_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_barrier_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_barrier_algorithms", barrier_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "barrier_algorithm", - "Which barrier algorithm is used. 
Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_barrier_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - return (MPI_SUCCESS); -} - - - -int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:barrier_intra_do_forced selected algorithm %d", - data->user_forced[BARRIER].algorithm)); - - switch (data->user_forced[BARRIER].algorithm) { - case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module); - case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm, module); - case (2): return ompi_coll_tuned_barrier_intra_doublering (comm, module); - case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module); - case (4): return ompi_coll_tuned_barrier_intra_bruck (comm, module); - case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm, module); - case (6): return ompi_coll_tuned_barrier_intra_tree (comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[BARRIER].algorithm, - ompi_coll_tuned_forced_max_algorithms[BARRIER])); - return (MPI_ERR_ARG); - } /* switch */ - -} - - -int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout)); - - switch (algorithm) { - case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module); - case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm, module); - case (2): return ompi_coll_tuned_barrier_intra_doublering (comm, module); - case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module); - case (4): return ompi_coll_tuned_barrier_intra_bruck (comm, module); - case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm, module); - case (6): return ompi_coll_tuned_barrier_intra_tree (comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER])); - return (MPI_ERR_ARG); - } /* switch */ -} - diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index 079dbbcacc..8f7fe1b3e4 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -3,18 +3,18 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2012 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. 
All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -27,33 +27,14 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" - -/* bcast algorithm variables */ -static int coll_tuned_bcast_algorithm_count = 6; -static int coll_tuned_bcast_forced_algorithm = 0; -static int coll_tuned_bcast_segment_size = 0; -static int coll_tuned_bcast_tree_fanout; -static int coll_tuned_bcast_chain_fanout; - -/* valid values for coll_tuned_bcast_forced_algorithm */ -static mca_base_var_enum_value_t bcast_algorithms[] = { - {0, "ignore"}, - {1, "basic_linear"}, - {2, "chain"}, - {3, "pipeline"}, - {4, "split_binary_tree"}, - {5, "binary_tree"}, - {6, "binomial"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" int -ompi_coll_tuned_bcast_intra_generic( void* buffer, - int original_count, - struct ompi_datatype_t* datatype, +ompi_coll_base_bcast_intra_generic( void* buffer, + int original_count, + struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, mca_coll_base_module_t *module, @@ -62,12 +43,12 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, { int err = 0, line, i, rank, size, segindex, req_index; int num_segments; /* Number of segments */ - int sendcount; /* number of elements sent in this segment */ + int sendcount; /* number of elements sent in this segment */ size_t realsegsize, type_size; char *tmpbuf; ptrdiff_t extent, lb; ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; -#if !defined(COLL_TUNED_BCAST_USE_BLOCKING) +#if !defined(COLL_BASE_BCAST_USE_BLOCKING) ompi_request_t **send_reqs = NULL; #endif @@ -79,20 +60,20 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, ompi_datatype_type_size( datatype, &type_size ); num_segments = (original_count + count_by_segment - 1) / count_by_segment; realsegsize = (ptrdiff_t)count_by_segment * extent; - + /* Set the buffer pointers */ tmpbuf = (char *) buffer; -#if !defined(COLL_TUNED_BCAST_USE_BLOCKING) +#if !defined(COLL_BASE_BCAST_USE_BLOCKING) if( tree->tree_nextsize != 0 ) { - send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize * + send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize * sizeof(ompi_request_t*) ); } #endif /* Root code */ if( rank == root ) { - /* + /* For each segment: - send segment to all children. The last segment may have less elements than other segments. 
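To make the bookkeeping above concrete, here is a worked example with illustrative values (not code from this changeset): original_count = 1000 and count_by_segment = 128 give eight pipelined segments, the last one short.

#include <assert.h>

/* Worked example of the segmentation arithmetic used by
 * ompi_coll_base_bcast_intra_generic. */
static void segment_bookkeeping_example(void)
{
    int original_count = 1000, count_by_segment = 128;
    int num_segments = (original_count + count_by_segment - 1)
                     / count_by_segment;                       /* 8 */
    int last_count = original_count
                   - (num_segments - 1) * count_by_segment;    /* 104 */
    assert(8 == num_segments && 104 == last_count);
}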
@@ -102,39 +83,39 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, if( segindex == (num_segments - 1) ) { sendcount = original_count - segindex * count_by_segment; } - for( i = 0; i < tree->tree_nextsize; i++ ) { -#if defined(COLL_TUNED_BCAST_USE_BLOCKING) + for( i = 0; i < tree->tree_nextsize; i++ ) { +#if defined(COLL_BASE_BCAST_USE_BLOCKING) err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype, - tree->tree_next[i], + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm)); #else err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype, - tree->tree_next[i], + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm, + MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i])); -#endif /* COLL_TUNED_BCAST_USE_BLOCKING */ +#endif /* COLL_BASE_BCAST_USE_BLOCKING */ if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } + } -#if !defined(COLL_TUNED_BCAST_USE_BLOCKING) +#if !defined(COLL_BASE_BCAST_USE_BLOCKING) /* complete the sends before starting the next sends */ - err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, + err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } -#endif /* not COLL_TUNED_BCAST_USE_BLOCKING */ +#endif /* not COLL_BASE_BCAST_USE_BLOCKING */ /* update tmp buffer */ tmpbuf += realsegsize; } - } - + } + /* Intermediate nodes code */ - else if( tree->tree_nextsize > 0 ) { - /* - Create the pipeline. + else if( tree->tree_nextsize > 0 ) { + /* + Create the pipeline. 1) Post the first receive 2) For segments 1 .. num_segments - post new receive @@ -149,49 +130,49 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, comm, &recv_reqs[req_index])); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - + for( segindex = 1; segindex < num_segments; segindex++ ) { - + req_index = req_index ^ 0x1; - + /* post new irecv */ err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment, - datatype, tree->tree_prev, - MCA_COLL_BASE_TAG_BCAST, + datatype, tree->tree_prev, + MCA_COLL_BASE_TAG_BCAST, comm, &recv_reqs[req_index])); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - + /* wait for and forward the previous segment to children */ - err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], + err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], MPI_STATUSES_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - for( i = 0; i < tree->tree_nextsize; i++ ) { -#if defined(COLL_TUNED_BCAST_USE_BLOCKING) + + for( i = 0; i < tree->tree_nextsize; i++ ) { +#if defined(COLL_BASE_BCAST_USE_BLOCKING) err = MCA_PML_CALL(send(tmpbuf, count_by_segment, datatype, - tree->tree_next[i], + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm)); #else err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype, - tree->tree_next[i], + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm, + MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i])); -#endif /* COLL_TUNED_BCAST_USE_BLOCKING */ +#endif /* COLL_BASE_BCAST_USE_BLOCKING */ if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - -#if !defined(COLL_TUNED_BCAST_USE_BLOCKING) + } + +#if !defined(COLL_BASE_BCAST_USE_BLOCKING) /* complete the sends before starting the next iteration */ - err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, + err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); if (err != 
MPI_SUCCESS) { line = __LINE__; goto error_hndl; } -#endif /* COLL_TUNED_BCAST_USE_BLOCKING */ - +#endif /* COLL_BASE_BCAST_USE_BLOCKING */ + /* Update the receive buffer */ tmpbuf += realsegsize; - + } /* Process the last segment */ @@ -199,31 +180,31 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment; for( i = 0; i < tree->tree_nextsize; i++ ) { -#if defined(COLL_TUNED_BCAST_USE_BLOCKING) +#if defined(COLL_BASE_BCAST_USE_BLOCKING) err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype, - tree->tree_next[i], + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm)); #else err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype, - tree->tree_next[i], + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm, + MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i])); -#endif /* COLL_TUNED_BCAST_USE_BLOCKING */ +#endif /* COLL_BASE_BCAST_USE_BLOCKING */ if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } - -#if !defined(COLL_TUNED_BCAST_USE_BLOCKING) - err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, + +#if !defined(COLL_BASE_BCAST_USE_BLOCKING) + err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } -#endif /* COLL_TUNED_BCAST_USE_BLOCKING */ +#endif /* COLL_BASE_BCAST_USE_BLOCKING */ } - + /* Leaf nodes */ else { - /* + /* Receive all segments from parent in a loop: 1) post irecv for the first segment 2) for segments 1 .. num_segments @@ -241,12 +222,12 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, req_index = req_index ^ 0x1; tmpbuf += realsegsize; /* post receive for the next segment */ - err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, comm, &recv_reqs[req_index])); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* wait on the previous segment */ - err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], + err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], MPI_STATUS_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } @@ -255,25 +236,25 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } -#if !defined(COLL_TUNED_BCAST_USE_BLOCKING) +#if !defined(COLL_BASE_BCAST_USE_BLOCKING) if( NULL != send_reqs ) free(send_reqs); #endif return (MPI_SUCCESS); - + error_hndl: - OPAL_OUTPUT( (ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); -#if !defined(COLL_TUNED_BCAST_USE_BLOCKING) +#if !defined(COLL_BASE_BCAST_USE_BLOCKING) if( NULL != send_reqs ) free(send_reqs); #endif return (err); } int -ompi_coll_tuned_bcast_intra_bintree ( void* buffer, - int count, - struct ompi_datatype_t* datatype, +ompi_coll_base_bcast_intra_bintree ( void* buffer, + int count, + struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, mca_coll_base_module_t *module, @@ -281,28 +262,27 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer, { int segcount = count; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = 
tuned_module->tuned_data; + mca_coll_base_comm_t *data = module->base_data; - COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root ); + COLL_BASE_UPDATE_BINTREE( comm, module, root ); /** * Determine number of elements sent per operation. */ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); - return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module, + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_bintree ); } int -ompi_coll_tuned_bcast_intra_pipeline( void* buffer, - int count, - struct ompi_datatype_t* datatype, +ompi_coll_base_bcast_intra_pipeline( void* buffer, + int count, + struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, mca_coll_base_module_t *module, @@ -310,28 +290,27 @@ ompi_coll_tuned_bcast_intra_pipeline( void* buffer, { int segcount = count; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_comm_t *data = module->base_data; - COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root ); + COLL_BASE_UPDATE_PIPELINE( comm, module, root ); /** * Determine number of elements sent per operation. */ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); - return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module, + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_pipeline ); } int -ompi_coll_tuned_bcast_intra_chain( void* buffer, - int count, - struct ompi_datatype_t* datatype, +ompi_coll_base_bcast_intra_chain( void* buffer, + int count, + struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, mca_coll_base_module_t *module, @@ -339,28 +318,27 @@ ompi_coll_tuned_bcast_intra_chain( void* buffer, { int segcount = count; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_comm_t *data = module->base_data; - COLL_TUNED_UPDATE_CHAIN( comm, tuned_module, root, chains ); + COLL_BASE_UPDATE_CHAIN( comm, module, root, chains ); /** * Determine number of elements sent per operation. 
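Each of these wrappers derives the per-segment element count from the requested segment size and the datatype length via COLL_BASE_COMPUTED_SEGCOUNT (defined in coll_base_functions.h further down). For illustration only, the macro rewritten as a plain function:

/* Whole datatypes per segment, rounded up when the leftover bytes
 * exceed half a datatype (mirrors COLL_BASE_COMPUTED_SEGCOUNT). */
static int computed_segcount(size_t segsize, size_t typelng, int segcount)
{
    if ((segsize >= typelng) && (segsize < typelng * (size_t)segcount)) {
        size_t residual;
        segcount = (int)(segsize / typelng);
        residual = segsize - (size_t)segcount * typelng;
        if (residual > (typelng >> 1)) {
            segcount++;
        }
    }
    return segcount;
}

For example, segsize = 1000 with typelng = 13 and an initial count of 4096 leaves 76 whole datatypes plus 12 residual bytes; 12 > 13/2, so the result rounds up to 77.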
*/ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount)); - return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module, + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_chain ); } int -ompi_coll_tuned_bcast_intra_binomial( void* buffer, - int count, - struct ompi_datatype_t* datatype, +ompi_coll_base_bcast_intra_binomial( void* buffer, + int count, + struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, mca_coll_base_module_t *module, @@ -368,28 +346,27 @@ ompi_coll_tuned_bcast_intra_binomial( void* buffer, { int segcount = count; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_comm_t *data = module->base_data; - COLL_TUNED_UPDATE_BMTREE( comm, tuned_module, root ); + COLL_BASE_UPDATE_BMTREE( comm, module, root ); /** * Determine number of elements sent per operation. */ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); - return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module, + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_bmtree ); } int -ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, - int count, - struct ompi_datatype_t* datatype, +ompi_coll_base_bcast_intra_split_bintree ( void* buffer, + int count, + struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, mca_coll_base_module_t *module, @@ -399,26 +376,25 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, uint32_t counts[2]; int segcount[2]; /* Number of elements sent with each segment */ int num_segments[2]; /* Number of segmenets */ - int sendcount[2]; /* the same like segcount, except for the last segment */ + int sendcount[2]; /* the same like segcount, except for the last segment */ size_t realsegsize[2], type_size; char *tmpbuf[2]; ptrdiff_t type_extent, lb; ompi_request_t *base_req, *new_req; ompi_coll_tree_t *tree; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_comm_t *data = module->base_data; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize)); if (size == 1) { return MPI_SUCCESS; } /* setup the binary tree topology. 
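In this split variant each rank forwards only one half of the message down its own subtree and recovers the other half in the pairwise exchange of Step 2 further below. The side a rank serves follows from its shifted rank; as a sketch (hypothetical helper, same formula as the lr computation below):

/* 0 = left subtree, 1 = right subtree; the root counts as right. */
static int subtree_side(int rank, int root, int size)
{
    return (((rank + size - root) % size) + 1) % 2;
}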
*/ - COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root ); + COLL_BASE_UPDATE_BINTREE( comm, module, root ); tree = data->cached_bintree; err = ompi_datatype_type_size( datatype, &type_size ); @@ -431,10 +407,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, /* Note that ompi_datatype_type_size() will never return a negative value in typelng; it returns an int [vs. an unsigned type] because of the MPI spec. */ - if (segsize < ((uint32_t) type_size)) { + if (segsize < ((uint32_t) type_size)) { segsize = type_size; /* push segsize up to hold one type */ } - segcount[0] = segcount[1] = segsize / type_size; + segcount[0] = segcount[1] = segsize / type_size; num_segments[0] = counts[0]/segcount[0]; if ((counts[0] % segcount[0]) != 0) num_segments[0]++; num_segments[1] = counts[1]/segcount[1]; @@ -450,17 +426,17 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, (segsize > ((ptrdiff_t)counts[0] * type_size)) || (segsize > ((ptrdiff_t)counts[1] * type_size)) ) { /* call linear version here ! */ - return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype, + return (ompi_coll_base_bcast_intra_chain ( buffer, count, datatype, root, comm, module, segsize, 1 )); } err = ompi_datatype_get_extent (datatype, &lb, &type_extent); - + /* Determine real segment size */ realsegsize[0] = (ptrdiff_t)segcount[0] * type_extent; realsegsize[1] = (ptrdiff_t)segcount[1] * type_extent; - + /* set the buffer pointers */ tmpbuf[0] = (char *) buffer; tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent; @@ -473,11 +449,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, /* determine if I am left (0) or right (1), (root is right) */ lr = ((rank + size - root)%size + 1)%2; - + /* root code */ if( rank == root ) { /* determine segment count */ - sendcount[0] = segcount[0]; + sendcount[0] = segcount[0]; sendcount[1] = segcount[1]; /* for each segment */ for (segindex = 0; segindex < num_segments[0]; segindex++) { @@ -487,7 +463,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, continue; } /* determine how many elements are being sent in this round */ - if(segindex == (num_segments[i] - 1)) + if(segindex == (num_segments[i] - 1)) sendcount[i] = counts[i] - segindex*segcount[i]; /* send data */ MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype, @@ -498,19 +474,19 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, tmpbuf[i] += realsegsize[i]; } } - } - + } + /* intermediate nodes code */ - else if( tree->tree_nextsize > 0 ) { + else if( tree->tree_nextsize > 0 ) { /* Intermediate nodes: * It will receive segments only from one half of the data. - * Which one is determined by whether the node belongs to the "left" or "right" + * Which one is determined by whether the node belongs to the "left" or "right" * subtree. The topology building function builds the binary tree such that * odd "shifted ranks" ((rank + size - root)%size) are on the left subtree, * and even on the right subtree. * * Create the pipeline. We first post the first receive, then in the loop we - * post the next receive and after that wait for the previous receive to complete + * post the next receive and after that wait for the previous receive to complete * and we disseminate the data to all children.
*/ sendcount[lr] = segcount[lr]; @@ -521,11 +497,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, for( segindex = 1; segindex < num_segments[lr]; segindex++ ) { /* determine how many elements to expect in this round */ - if( segindex == (num_segments[lr] - 1)) + if( segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr]; /* post new irecv */ err = MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr], - datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, comm, &new_req)); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } @@ -539,7 +515,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, } /* end of for each child */ /* update the base request */ - base_req = new_req; + base_req = new_req; /* go to the next buffer (i.e. the one corresponding to the next recv) */ tmpbuf[lr] += realsegsize[lr]; } /* end of for segindex */ @@ -552,10 +528,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, MCA_PML_BASE_SEND_STANDARD, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* end of for each child */ - } - + } + /* leaf nodes */ - else { + else { /* Just consume segments as fast as possible */ sendcount[lr] = segcount[lr]; for (segindex = 0; segindex < num_segments[lr]; segindex++) { @@ -577,9 +553,9 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent; /* Step 2: - Find your immediate pair (identical node in opposite subtree) and SendRecv + Find your immediate pair (identical node in opposite subtree) and SendRecv data buffer with them. - The tree building function ensures that + The tree building function ensures that if (we are not root) if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
if we are in the right subtree (lr == 1) our pair is (rank-1)%size @@ -591,9 +567,9 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, pair = (rank+size-1)%size; } - if ( (size%2) != 0 && rank != root) { + if ( (size%2) != 0 && rank != root) { - err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype, + err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype, pair, MCA_COLL_BASE_TAG_BCAST, tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype, pair, MCA_COLL_BASE_TAG_BCAST, @@ -607,28 +583,28 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, MCA_PML_BASE_SEND_STANDARD, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } + } /* last node receives right buffer from the root */ else if (rank == (root+size-1)%size) { err = MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype, root, MCA_COLL_BASE_TAG_BCAST, comm, MPI_STATUS_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } + } /* everyone else exchanges buffers */ else { - err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype, + err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype, pair, MCA_COLL_BASE_TAG_BCAST, tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype, pair, MCA_COLL_BASE_TAG_BCAST, comm, MPI_STATUS_IGNORE, rank); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } } return (MPI_SUCCESS); - + error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank)); return (err); } @@ -636,8 +612,8 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, /* * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple implementations - * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * but for some small number of nodes and/or small data sizes they + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. in V2 we will handle this differently and so will not @@ -655,21 +631,20 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, * Returns: - MPI_SUCCESS or error code */ int -ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count, +ompi_coll_base_bcast_intra_basic_linear (void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank, err; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_comm_t *data = module->base_data; ompi_request_t **preq, **reqs = data->mcct_reqs; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_basic_linear rank %d root %d", rank, root)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_bcast_intra_basic_linear rank %d root %d", rank, root)); /* Non-root receive the data. 
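At the MPI level the same flat pattern can be sketched as below (tag and error handling simplified; the actual code that follows posts non-blocking sends and releases them through ompi_coll_base_free_reqs):

#include <mpi.h>

/* Flat broadcast: every non-root posts one receive, the root sends
 * once per peer. */
static int linear_bcast_sketch(void *buf, int count, MPI_Datatype dtype,
                               int root, MPI_Comm comm)
{
    int i, rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    if (rank != root) {
        return MPI_Recv(buf, count, dtype, root, 0, comm,
                        MPI_STATUS_IGNORE);
    }
    for (i = 0; i < size; ++i) {
        if (i != root) {
            MPI_Send(buf, count, dtype, i, 0, comm);
        }
    }
    return MPI_SUCCESS;
}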
*/ @@ -710,148 +685,11 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count, err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE); /* Free the reqs */ - - ompi_coll_tuned_free_reqs(reqs, i); + ompi_coll_base_free_reqs(reqs, i); /* All done */ - return err; } /* copied function (with appropriate renaming) ends here */ - -/* The following are used by dynamic and forced rules */ - -/* publish details of each algorithm and if its forced/fixed/locked in */ -/* as you add methods/algorithms you must update this and the query/map routines */ - -/* this routine is called by the component only */ -/* this makes sure that the mca parameters are set to their initial values and perms */ -/* module does not call this they call the forced_getvalues routine instead */ - -int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[BCAST] = coll_tuned_bcast_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "bcast_algorithm_count", - "Number of bcast algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_bcast_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_bcast_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_bcast_algorithms", bcast_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "bcast_algorithm", - "Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_bcast_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_bcast_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "bcast_algorithm_segmentsize", - "Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_bcast_segment_size); - - coll_tuned_bcast_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "bcast_algorithm_tree_fanout", - "Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_bcast_tree_fanout); - - coll_tuned_bcast_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "bcast_algorithm_chain_fanout", - "Fanout for chains used for bcast algorithms. 
Only has meaning if algorithm is forced and supports chain topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_bcast_chain_fanout); - - return (MPI_SUCCESS); -} - - -int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d", - data->user_forced[BCAST].algorithm)); - - switch (data->user_forced[BCAST].algorithm) { - case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module ); - case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module ); - case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module, - data->user_forced[BCAST].segsize, - data->user_forced[BCAST].chain_fanout ); - case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module, - data->user_forced[BCAST].segsize ); - case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, - data->user_forced[BCAST].segsize ); - case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module, - data->user_forced[BCAST].segsize ); - case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module, - data->user_forced[BCAST].segsize ); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST])); - } /* switch */ - return (MPI_ERR_ARG); -} - - -int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) - -{ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module ); - case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module ); - case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module, segsize, faninout ); - case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize ); - case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize ); - case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize ); - case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize ); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST])); - } /* switch */ - return (MPI_ERR_ARG); -} - diff --git a/ompi/mca/coll/base/coll_base_frame.c b/ompi/mca/coll/base/coll_base_frame.c index 311f31bdf9..6159d1a94b 100644 --- a/ompi/mca/coll/base/coll_base_frame.c +++ b/ompi/mca/coll/base/coll_base_frame.c @@ -3,10 +3,10 @@ * 
Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. @@ -15,9 +15,9 @@ * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -33,6 +33,7 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" +#include "ompi/mca/coll/base/coll_base_functions.h" /* * The following file was created by configure. It contains extern @@ -49,10 +50,55 @@ static void coll_base_module_construct(mca_coll_base_module_t *m) /* zero out all functions */ memset ((char *) m + sizeof (m->super), 0, sizeof (*m) - sizeof (m->super)); m->coll_module_disable = NULL; + m->base_data = NULL; } -OBJ_CLASS_INSTANCE(mca_coll_base_module_t, opal_object_t, - coll_base_module_construct, NULL); +static void +coll_base_module_destruct(mca_coll_base_module_t *module) +{ + mca_coll_base_comm_t* data = module->base_data; + + if (NULL != data) { + if( NULL != data->mcct_reqs ) { + for( int i = 0; i < data->mcct_num_reqs; ++i ) { + if( MPI_REQUEST_NULL != data->mcct_reqs[i] ) + ompi_request_free(&data->mcct_reqs[i]); + } + free(data->mcct_reqs); + data->mcct_reqs = NULL; + data->mcct_num_reqs = 0; + } + assert(0 == data->mcct_num_reqs); + + /* free any cached information that has been allocated */ + if (data->cached_ntree) { /* destroy general tree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_ntree); + } + if (data->cached_bintree) { /* destroy bintree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_bintree); + } + if (data->cached_bmtree) { /* destroy bmtree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_bmtree); + } + if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bmtree); + } + if (data->cached_chain) { /* destroy general chain if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_chain); + } + if (data->cached_pipeline) { /* destroy pipeline if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_pipeline); + } + if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bintree); + } + + free(data); + } +} + +OBJ_CLASS_INSTANCE(mca_coll_base_module_t, opal_object_t, + coll_base_module_construct, coll_base_module_destruct); MCA_BASE_FRAMEWORK_DECLARE(ompi, coll, "Collectives", NULL, NULL, NULL, mca_coll_base_static_components, 0); diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h new file mode 100644 index 0000000000..5291f8725e --- /dev/null +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -0,0 +1,341 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2015 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_BASE_EXPORT_H +#define MCA_COLL_BASE_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/coll/base/base.h" +#include "opal/mca/mca.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/request/request.h" + +/* need to include our own topo prototypes so we can malloc data on the comm correctly */ +#include "coll_base_topo.h" + +/* some fixed value index vars to simplify certain operations */ +typedef enum COLLTYPE { + ALLGATHER = 0, /* 0 */ + ALLGATHERV, /* 1 */ + ALLREDUCE, /* 2 */ + ALLTOALL, /* 3 */ + ALLTOALLV, /* 4 */ + ALLTOALLW, /* 5 */ + BARRIER, /* 6 */ + BCAST, /* 7 */ + EXSCAN, /* 8 */ + GATHER, /* 9 */ + GATHERV, /* 10 */ + REDUCE, /* 11 */ + REDUCESCATTER, /* 12 */ + SCAN, /* 13 */ + SCATTER, /* 14 */ + SCATTERV, /* 15 */ + COLLCOUNT /* 16 end counter keep it as last element */ +} COLLTYPE_T; + +/* defined arg lists to simply auto inclusion of user overriding decision functions */ +#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, 
int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module +/* end defined arg lists to simply auto inclusion of user overriding decision functions */ + +BEGIN_C_DECLS + +/* All Gather */ +int ompi_coll_base_allgather_intra_bruck(ALLGATHER_ARGS); +int ompi_coll_base_allgather_intra_recursivedoubling(ALLGATHER_ARGS); +int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS); +int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS); +int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS); +int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS); + +/* All GatherV */ +int ompi_coll_base_allgatherv_intra_bruck(ALLGATHERV_ARGS); +int ompi_coll_base_allgatherv_intra_ring(ALLGATHERV_ARGS); +int ompi_coll_base_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS); +int ompi_coll_base_allgatherv_intra_basic_default(ALLGATHERV_ARGS); +int ompi_coll_base_allgatherv_intra_two_procs(ALLGATHERV_ARGS); + +/* All Reduce */ +int ompi_coll_base_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS); +int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS); +int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS); +int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize); +int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS); + +/* AlltoAll */ +int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS); +int ompi_coll_base_alltoall_intra_bruck(ALLTOALL_ARGS); +int ompi_coll_base_alltoall_intra_basic_linear(ALLTOALL_ARGS); +int ompi_coll_base_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests); +int ompi_coll_base_alltoall_intra_two_procs(ALLTOALL_ARGS); + +/* AlltoAllV */ +int ompi_coll_base_alltoallv_intra_pairwise(ALLTOALLV_ARGS); +int ompi_coll_base_alltoallv_intra_basic_linear(ALLTOALLV_ARGS); + +/* AlltoAllW */ + +/* Barrier */ +int ompi_coll_base_barrier_intra_doublering(BARRIER_ARGS); +int ompi_coll_base_barrier_intra_recursivedoubling(BARRIER_ARGS); +int ompi_coll_base_barrier_intra_bruck(BARRIER_ARGS); +int ompi_coll_base_barrier_intra_two_procs(BARRIER_ARGS); +int ompi_coll_base_barrier_intra_linear(BARRIER_ARGS); +int ompi_coll_base_barrier_intra_tree(BARRIER_ARGS); + +/* Bcast */ +int ompi_coll_base_bcast_intra_basic_linear(BCAST_ARGS); +int ompi_coll_base_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains); +int ompi_coll_base_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize); +int ompi_coll_base_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize); +int 
ompi_coll_base_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize); +int ompi_coll_base_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize); + +/* Exscan */ + +/* Gather */ +int ompi_coll_base_gather_intra_basic_linear(GATHER_ARGS); +int ompi_coll_base_gather_intra_binomial(GATHER_ARGS); +int ompi_coll_base_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size); + +/* GatherV */ + +/* Reduce */ +int ompi_coll_base_reduce_intra_basic_linear(REDUCE_ARGS); +int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs ); +int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); +int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); +int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); +int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs ); + +/* Reduce_scatter */ +int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS); +int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS); +int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS); + +/* Scan */ + +/* Scatter */ +int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS); +int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS); + +/* ScatterV */ + +END_C_DECLS + +#define COLL_BASE_UPDATE_BINTREE( OMPI_COMM, BASE_MODULE, ROOT ) \ +do { \ + mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \ + if( !( (coll_comm->cached_bintree) \ + && (coll_comm->cached_bintree_root == (ROOT)) ) ) { \ + if( coll_comm->cached_bintree ) { /* destroy previous binomial if defined */ \ + ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_bintree) ); \ + } \ + coll_comm->cached_bintree = ompi_coll_base_topo_build_tree(2,(OMPI_COMM),(ROOT)); \ + coll_comm->cached_bintree_root = (ROOT); \ + } \ +} while (0) + +#define COLL_BASE_UPDATE_BMTREE( OMPI_COMM, BASE_MODULE, ROOT ) \ +do { \ + mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \ + if( !( (coll_comm->cached_bmtree) \ + && (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \ + if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \ + ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \ + } \ + coll_comm->cached_bmtree = ompi_coll_base_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \ + coll_comm->cached_bmtree_root = (ROOT); \ + } \ +} while (0) + +#define COLL_BASE_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, BASE_MODULE, ROOT ) \ +do { \ + mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \ + if( !( (coll_comm->cached_in_order_bmtree) \ + && (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \ + if( coll_comm->cached_in_order_bmtree ) { /* destroy previous binomial if defined */ \ + ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \ + } \ + coll_comm->cached_in_order_bmtree = ompi_coll_base_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \ + coll_comm->cached_in_order_bmtree_root = (ROOT); \ + } \ +} while (0) + +#define COLL_BASE_UPDATE_PIPELINE( OMPI_COMM, BASE_MODULE, ROOT ) \ +do { \ + mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \ + if( !( (coll_comm->cached_pipeline) \ + && (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \ + if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \ + ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \ + } \ + coll_comm->cached_pipeline = 
ompi_coll_base_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \ + coll_comm->cached_pipeline_root = (ROOT); \ + } \ +} while (0) + +#define COLL_BASE_UPDATE_CHAIN( OMPI_COMM, BASE_MODULE, ROOT, FANOUT ) \ +do { \ + mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \ + if( !( (coll_comm->cached_chain) \ + && (coll_comm->cached_chain_root == (ROOT)) \ + && (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \ + if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \ + ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_chain) ); \ + } \ + coll_comm->cached_chain = ompi_coll_base_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \ + coll_comm->cached_chain_root = (ROOT); \ + coll_comm->cached_chain_fanout = (FANOUT); \ + } \ +} while (0) + +#define COLL_BASE_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, BASE_MODULE ) \ +do { \ + mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \ + if( !(coll_comm->cached_in_order_bintree) ) { \ + /* In-order binary tree topology is defined by communicator size */ \ + /* Thus, there is no need to destroy anything */ \ + coll_comm->cached_in_order_bintree = \ + ompi_coll_base_topo_build_in_order_bintree((OMPI_COMM)); \ + } \ +} while (0) + +/** + * This macro gives a generic way to compute the best count of + * the segment (i.e. the number of complete datatypes that + * can fit in the specified SEGSIZE). Beware, when this macro + * is called, the SEGCOUNT should be initialized to the count as + * expected by the collective call. + */ +#define COLL_BASE_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT) \ + if( ((SEGSIZE) >= (TYPELNG)) && \ + ((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) { \ + size_t residual; \ + (SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG)); \ + residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG); \ + if( residual > ((TYPELNG) >> 1) ) \ + (SEGCOUNT)++; \ + } \ + +/** + * This macro gives a generic way to compute the well-distributed block counts + * when the count and number of blocks are fixed. + * Macro returns "early-block" count, "late-block" count, and "split-index" + * which is the block at which we switch from "early-block" count to + * the "late-block" count. + * count = split_index * early_block_count + + * (block_count - split_index) * late_block_count + * We do not perform ANY error checks - make sure that the input values + * make sense (e.g. count > num_blocks). + */ +#define COLL_BASE_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX, \ + EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \ + EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS; \ + SPLIT_INDEX = COUNT % NUM_BLOCKS; \ + if (0 != SPLIT_INDEX) { \ + EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1; \ + } \ + +/* + * Data structure for hanging data off the communicator + * i.e.
per module instance + */ +struct mca_coll_base_comm_t { + opal_object_t super; + + /* standard data for requests and PML usage */ + + /* Precreate space for requests + * Note this does not affect basic, + * but in the wrong context it can confuse a debugger; + * this is controlled by an MCA param + */ + + ompi_request_t **mcct_reqs; + int mcct_num_reqs; + + /* + * base topo information caching per communicator + * + * for each communicator we cache the topo information so we can + * reuse it without regenerating; if the root [or fanout] changes + * we regenerate and recache this information + */ + + /* general tree with n fan out */ + ompi_coll_tree_t *cached_ntree; + int cached_ntree_root; + int cached_ntree_fanout; + + /* binary tree */ + ompi_coll_tree_t *cached_bintree; + int cached_bintree_root; + + /* binomial tree */ + ompi_coll_tree_t *cached_bmtree; + int cached_bmtree_root; + + /* in-order binomial tree */ + ompi_coll_tree_t *cached_in_order_bmtree; + int cached_in_order_bmtree_root; + + /* chained tree (fanout followed by pipelines) */ + ompi_coll_tree_t *cached_chain; + int cached_chain_root; + int cached_chain_fanout; + + /* pipeline */ + ompi_coll_tree_t *cached_pipeline; + int cached_pipeline_root; + + /* in-order binary tree (root of the in-order binary tree is rank 0) */ + ompi_coll_tree_t *cached_in_order_bintree; +}; +typedef struct mca_coll_base_comm_t mca_coll_base_comm_t; +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t); + +static inline void ompi_coll_base_free_reqs(ompi_request_t **reqs, int count) +{ + int i; + for (i = 0; i < count; ++i) + ompi_request_free(&reqs[i]); +} + +#endif /* MCA_COLL_BASE_EXPORT_H */ diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c index d23fe78e77..fc8f9f6495 100644 --- a/ompi/mca/coll/base/coll_base_gather.c +++ b/ompi/mca/coll/base/coll_base_gather.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -30,30 +30,14 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" - -/* gather algorithm variables */ -static int coll_tuned_gather_algorithm_count = 3; -static int coll_tuned_gather_forced_algorithm = 0; -static int coll_tuned_gather_segment_size = 0; -static int coll_tuned_gather_tree_fanout; -static int coll_tuned_gather_chain_fanout; - -/* valid values for coll_tuned_gather_forced_algorithm */ -static mca_base_var_enum_value_t gather_algorithms[] = { - {0, "ignore"}, - {1, "basic_linear"}, - {2, "binomial"}, - {3, "linear_sync"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" /* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain, * gather_intra_pipeline, segmentation?
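Before the gather implementations, a worked example for the COLL_BASE_COMPUTE_BLOCKCOUNT macro defined in the header above (illustrative values; assumes the macro is in scope):

#include <assert.h>

/* Distributing count = 10 over num_blocks = 4 yields two "early"
 * blocks of 3 followed by two "late" blocks of 2. */
static void blockcount_example(void)
{
    int count = 10, num_blocks = 4;
    int early, late, split;
    COLL_BASE_COMPUTE_BLOCKCOUNT(count, num_blocks, split, early, late);
    assert(3 == early && 2 == late && 2 == split);
    assert(count == split * early + (num_blocks - split) * late);
}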
*/ int -ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, +ompi_coll_base_gather_intra_binomial(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -65,19 +49,19 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, char *ptmp = NULL, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; - MPI_Aint sextent, slb, strue_lb, strue_extent; + MPI_Aint sextent, slb, strue_lb, strue_extent; MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_binomial rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_gather_intra_binomial rank %d", rank)); /* create the binomial tree */ - COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); + COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; ompi_datatype_get_extent(sdtype, &slb, &sextent); @@ -112,7 +96,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } else { /* copy from rbuf to temp buffer */ - err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp, + err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp, (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } @@ -157,8 +141,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, mycount = size - vkid; mycount *= rcount; - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_binomial rank %d recv %d mycount = %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i], mycount)); err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype, @@ -172,8 +156,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, if (rank != root) { /* all nodes except root send to parents */ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv)); err = MCA_PML_CALL(send(ptmp, total_recv, sdtype, @@ -207,7 +191,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, if (NULL != tempbuf) free(tempbuf); - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } @@ -220,11 +204,11 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, * Returns: - MPI_SUCCESS or error code */ int -ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, +ompi_coll_base_gather_intra_linear_sync(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, - int root, + int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int first_segment_size) @@ -237,8 +221,8 @@ 
ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); if (rank != root) { /* Non-root processes: @@ -250,10 +234,10 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, ompi_datatype_type_size(sdtype, &typelng); ompi_datatype_get_extent(sdtype, &lb, &extent); first_segment_count = scount; - COLL_TUNED_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng, + COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng, first_segment_count ); - ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root, + ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root, MCA_COLL_BASE_TAG_GATHER, comm, MPI_STATUS_IGNORE)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } @@ -263,15 +247,15 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count, - (scount - first_segment_count), sdtype, + ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count, + (scount - first_segment_count), sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } else { - /* Root process, + /* Root process, - For every non-root node: - post irecv for the first segment of the message - send zero byte message to signal node to send the message @@ -284,20 +268,20 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, ompi_request_t *first_segment_req; reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*)); if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; } - + ompi_datatype_type_size(rdtype, &typelng); ompi_datatype_get_extent(rdtype, &lb, &extent); first_segment_count = rcount; - COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng, + COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng, first_segment_count ); ptmp = (char *) rbuf; for (i = 0; i < size; ++i) { - if (i == rank) { + if (i == rank) { /* skip myself */ - reqs[i] = MPI_REQUEST_NULL; - continue; - } + reqs[i] = MPI_REQUEST_NULL; + continue; + } /* irecv for the first segment from i */ ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent; @@ -305,7 +289,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, MCA_COLL_BASE_TAG_GATHER, comm, &first_segment_req)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - + /* send sync message */ ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i, MCA_COLL_BASE_TAG_GATHER, @@ -314,7 +298,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, /* irecv for the second segment */ ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent; - ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count), + ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count), rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, &reqs[i])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } @@ -327,11 +311,11 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, /* copy local data if necessary */ if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_sndrcv(sbuf, scount, 
sdtype, - (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent, + (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent, rcount, rdtype); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } - + /* wait all second segments to complete */ ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } @@ -346,8 +330,8 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, if (NULL != reqs) { free(reqs); } - OPAL_OUTPUT (( ompi_coll_tuned_stream, - "ERROR_HNDL: node %d file %s line %d error %d\n", + OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, + "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); return ret; } @@ -355,13 +339,13 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, /* * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple implementations - * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * but for some small number of nodes and/or small data sizes they + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. in V2 we will handle this differently and so will not * have to duplicate code. - * JPG following the examples from other coll_tuned implementations. Dec06. + * JPG following the examples from other coll_base implementations. Dec06. */ /* copied function (with appropriate renaming) starts here */ @@ -373,7 +357,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, * Returns: - MPI_SUCCESS or error code */ int -ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount, +ompi_coll_base_gather_intra_basic_linear(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -389,8 +373,8 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount, rank = ompi_comm_rank(comm); /* Everyone but root sends data and returns. 
*/ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_basic_linear rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_gather_intra_basic_linear rank %d", rank)); if (rank != root) { return MCA_PML_CALL(send(sbuf, scount, sdtype, root, @@ -427,164 +411,3 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount, /* copied function (with appropriate renaming) ends here */ - -/* The following are used by dynamic and forced rules */ - -/* publish details of each algorithm and if its forced/fixed/locked in */ -/* as you add methods/algorithms you must update this and the query/map - routines */ - -/* this routine is called by the component only */ -/* this makes sure that the mca parameters are set to their initial values - and perms */ -/* module does not call this they call the forced_getvalues routine instead */ - -int -ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[GATHER] = coll_tuned_gather_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "gather_algorithm_count", - "Number of gather algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_gather_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_gather_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_gather_algorithms", gather_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "gather_algorithm", - "Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 linear with synchronization.", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_gather_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_gather_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "gather_algorithm_segmentsize", - "Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_gather_segment_size); - - coll_tuned_gather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "gather_algorithm_tree_fanout", - "Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. 
Currently, available algorithms do not support n-tree topologies.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_gather_tree_fanout); - - coll_tuned_gather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "gather_algorithm_chain_fanout", - "Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_gather_chain_fanout); - - return (MPI_SUCCESS); -} - -int -ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:gather_intra_do_forced selected algorithm %d", - data->user_forced[GATHER].algorithm)); - - switch (data->user_forced[GATHER].algorithm) { - case (0): - return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (1): - return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (2): - return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (3): - return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module, - data->user_forced[GATHER].segsize); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[GATHER].algorithm, - ompi_coll_tuned_forced_max_algorithms[GATHER])); - return (MPI_ERR_ARG); - } /* switch */ -} - -int -ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): - return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (1): - return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (2): - return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (3): - return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module, - segsize); - - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, - ompi_coll_tuned_forced_max_algorithms[GATHER])); - return (MPI_ERR_ARG); - } /* switch */ -} 
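The forced-algorithm plumbing removed above stays behind in the tuned component; the base component now simply exports the three gather implementations. A minimal sketch of how a caller might choose among them (hypothetical helper name and thresholds, not part of this patch):

    /* Hypothetical dispatcher over the three exported base gather routines. */
    static int gather_select(void *sbuf, int scount, struct ompi_datatype_t *sdt,
                             void *rbuf, int rcount, struct ompi_datatype_t *rdt,
                             int root, struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module, size_t msg_size)
    {
        if (msg_size < 6000)        /* small messages: binomial tree */
            return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdt,
                                                        rbuf, rcount, rdt,
                                                        root, comm, module);
        if (msg_size < 131072)      /* mid-range: simple linear */
            return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdt,
                                                            rbuf, rcount, rdt,
                                                            root, comm, module);
        /* large messages: linear with synchronization, 1 KB first segment */
        return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdt,
                                                       rbuf, rcount, rdt,
                                                       root, comm, module, 1024);
    }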
diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index 4b7c2acf7d..f6752579c9 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -31,28 +31,8 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" - -/* reduce algorithm variables */ -static int coll_tuned_reduce_algorithm_count = 6; -static int coll_tuned_reduce_forced_algorithm = 0; -static int coll_tuned_reduce_segment_size = 0; -static int coll_tuned_reduce_max_requests; -static int coll_tuned_reduce_tree_fanout; -static int coll_tuned_reduce_chain_fanout; - -/* valid values for coll_tuned_reduce_forced_algorithm */ -static mca_base_var_enum_value_t reduce_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "chain"}, - {3, "pipeline"}, - {4, "binary"}, - {5, "binomial"}, - {6, "in-order_binary"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" /** * This is a generic implementation of the reduce protocol. It uses the tree @@ -62,10 +42,10 @@ static mca_base_var_enum_value_t reduce_algorithms[] = { * the number of datatypes to the original count (original_count) * * Note that for non-commutative operations we cannot save memory copy - * for the first block: thus we must copy sendbuf to accumbuf on intermediate + * for the first block: thus we must copy sendbuf to accumbuf on intermediate * to keep the optimized loop happy. */ -int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count, +int ompi_coll_base_reduce_generic( void* sendbuf, void* recvbuf, int original_count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, @@ -90,60 +70,60 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c num_segments = (original_count + count_by_segment - 1) / count_by_segment; segment_increment = (ptrdiff_t)count_by_segment * extent; - sendtmpbuf = (char*) sendbuf; - if( sendbuf == MPI_IN_PLACE ) { - sendtmpbuf = (char *)recvbuf; + sendtmpbuf = (char*) sendbuf; + if( sendbuf == MPI_IN_PLACE ) { + sendtmpbuf = (char *)recvbuf; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment), (unsigned long)segment_increment, max_outstanding_reqs)); rank = ompi_comm_rank(comm); - /* non-leaf nodes - wait for children to send me data & forward up (if needed) */ if( tree->tree_nextsize > 0 ) { ptrdiff_t true_lower_bound, true_extent, real_segment_size; - ompi_datatype_get_true_extent( datatype, &true_lower_bound, + &true_extent ); - /* handle non-existent recv buffer (i.e. 
it's NULL) and + protect the recv buffer on non-root nodes */ accumbuf = (char*)recvbuf; if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. */ - accumbuf_free = (char*)malloc(true_extent + + (ptrdiff_t)(original_count - 1) * extent); - if (accumbuf_free == NULL) { - line = __LINE__; ret = -1; goto error_hndl; + } accumbuf = accumbuf_free - lower_bound; - } + /* If this is a non-commutative operation we must copy sendbuf to the accumbuf, in order to simplify the loops */ if (!ompi_op_is_commute(op)) { - ompi_datatype_copy_content_same_ddt(datatype, original_count, + (char*)accumbuf, (char*)sendtmpbuf); } /* Allocate two buffers for incoming segments */ real_segment_size = true_extent + (ptrdiff_t)(count_by_segment - 1) * extent; inbuf_free[0] = (char*) malloc(real_segment_size); - if( inbuf_free[0] == NULL ) { - line = __LINE__; ret = -1; goto error_hndl; + } inbuf[0] = inbuf_free[0] - lower_bound; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { inbuf_free[1] = (char*) malloc(real_segment_size); - if( inbuf_free[1] == NULL ) { + line = __LINE__; ret = -1; goto error_hndl; } inbuf[1] = inbuf_free[1] - lower_bound; - } + /* reset input buffer index and receive count */ inbi = 0; @@ -166,14 +146,14 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c if( segindex < num_segments ) { void* local_recvbuf = inbuf[inbi]; if( 0 == i ) { - /* for the first step (1st child per segment) and - * commutative operations we might be able to irecv - * directly into the accumulate buffer so that we can - * reduce(op) this with our sendbuf in one step as - * ompi_op_reduce only has two buffer pointers, + /* for the first step (1st child per segment) and + * commutative operations we might be able to irecv + * directly into the accumulate buffer so that we can + * reduce(op) this with our sendbuf in one step as + * ompi_op_reduce only has two buffer pointers, * this avoids an extra memory copy. * - * BUT if the operation is non-commutative or + * BUT if the operation is non-commutative or * we are root and are USING MPI_IN_PLACE this is wrong! */ if( (ompi_op_is_commute(op)) && @@ -183,34 +163,34 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c } ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype, - tree->tree_next[i], - MCA_COLL_BASE_TAG_REDUCE, comm, + tree->tree_next[i], + MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;} } /* wait for previous req to complete, if any. - if there are no requests reqs[inbi ^1] will be + if there are no requests reqs[inbi ^1] will be MPI_REQUEST_NULL. */
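The loop above overlaps communication with computation by flipping inbi between two receive buffers: while the segment in one buffer is folded into the accumulator, the next segment is already arriving in the other. Stripped down to a single child, the idiom looks like this (illustrative sketch only; the three helpers are stand-ins for the irecv/wait/ompi_op_reduce calls):

    #include <stdio.h>

    static void recv_segment(int buf, int seg)   { printf("irecv seg %d -> buf %d\n", seg, buf); }
    static void wait_segment(int buf)            { printf("wait buf %d\n", buf); }
    static void reduce_segment(int buf, int seg) { printf("reduce seg %d from buf %d\n", seg, buf); }

    int main(void)
    {
        const int num_segments = 4;
        int inbi = 0;                          /* buffer currently being filled */
        recv_segment(inbi, 0);                 /* prime the pipeline */
        for (int seg = 1; seg <= num_segments; seg++) {
            if (seg < num_segments)
                recv_segment(inbi ^ 1, seg);   /* post receive for the next segment */
            wait_segment(inbi);                /* finish the in-flight receive */
            reduce_segment(inbi, seg - 1);     /* fold it into the accumulator */
            inbi ^= 1;                         /* flip buffers */
        }
        return 0;
    }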
/* wait on data from last child for previous segment */ - ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1], + MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } local_op_buffer = inbuf[inbi ^ 1]; if( i > 0 ) { - /* our first operation is to combine our own [sendbuf] data - * with the data we recvd from down stream (but only - * the operation is commutative and if we are not root and + /* our first operation is to combine our own [sendbuf] data + * with the data we received from downstream (but only + * if the operation is commutative and we are not root and * not using MPI_IN_PLACE) */ if( 1 == i ) { - if( (ompi_op_is_commute(op)) && + !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } /* apply operation */ - ompi_op_reduce(op, local_op_buffer, - accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, + ompi_op_reduce(op, local_op_buffer, + accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, recvcount, datatype ); } else if ( segindex > 0 ) { void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; @@ -220,25 +200,25 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; } } - ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, + datatype ); - /* all reduced on available data this step (i) complete, + /* all reductions on the available data for this step (i) are complete, * pass to the next process unless you are the root. */ if (rank != tree->tree_root) { /* send combined/accumulated data to parent */ - ret = MCA_PML_CALL( send( accumulator, prevcount, - datatype, tree->tree_prev, + datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, - MCA_PML_BASE_SEND_STANDARD, + MCA_PML_BASE_SEND_STANDARD, comm) ); - if (ret != MPI_SUCCESS) { - line = __LINE__; goto error_hndl; + } } - /* we stop when segindex = number of segments + /* we stop when segindex = number of segments (i.e. we do num_segments+1 steps for pipelining) */ if (segindex == num_segments) break; } @@ -254,33 +234,33 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c if( accumbuf_free != NULL ) free(accumbuf_free); } - /* leaf nodes - Depending on the value of max_outstanding_reqs and + /* leaf nodes + Depending on the value of max_outstanding_reqs and the number of segments we have two options: - send all segments using blocking send to the parent, or - - avoid overflooding the parent nodes by limiting the number of + - avoid flooding the parent nodes by limiting the number of outstanding requests to max_outstanding_reqs. - TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size - for the current communication, synchronization should be used only + TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size + for the current communication, synchronization should be used only when the message/segment size is smaller than the eager size. */
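That throttled leaf-node path (the else branch just below) boils down to a sliding window over the send requests: prime max_outstanding_reqs synchronous isends, then reuse the oldest slot as each one completes. A self-contained sketch, for illustration only (post_isend/wait_slot stand in for the MCA_PML isend/wait calls):

    #include <stdio.h>

    static void post_isend(int slot, int seg) { printf("isend seg %d in slot %d\n", seg, slot); }
    static void wait_slot(int slot)           { printf("wait slot %d\n", slot); }

    int main(void)
    {
        const int max_reqs = 3, num_segments = 8;
        int creq = 0;                            /* index of the oldest slot */
        for (int seg = 0; seg < max_reqs && seg < num_segments; seg++)
            post_isend(seg, seg);                /* fill the window */
        for (int seg = max_reqs; seg < num_segments; seg++) {
            wait_slot(creq);                     /* oldest send must finish first */
            post_isend(creq, seg);               /* reuse its slot */
            creq = (creq + 1) % max_reqs;
        }
        for (int i = 0; i < max_reqs; i++)
            wait_slot(i);                        /* drain the remaining sends */
        return 0;
    }

The synchronous send mode matters here: completion of a synchronous isend implies the parent has started receiving that segment, so the window genuinely bounds how far a leaf can run ahead of its parent.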
else { /* If the number of segments is less than a maximum number of outstanding - requests or there is no limit on the maximum number of outstanding + requests, we send data to the parent using blocking send */ - if ((0 == max_outstanding_reqs) || + (num_segments <= max_outstanding_reqs)) { - + segindex = 0; while ( original_count > 0) { if (original_count < count_by_segment) { count_by_segment = original_count; } - ret = MCA_PML_CALL( send((char*)sendbuf + + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, - tree->tree_prev, + MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); @@ -310,7 +290,7 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, - tree->tree_prev, + tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[segindex]) ); @@ -328,12 +308,12 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c if( original_count < count_by_segment ) { count_by_segment = original_count; } - ret = MCA_PML_CALL( isend((char*)sendbuf + - (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, - count_by_segment, datatype, - tree->tree_prev, - MCA_COLL_BASE_TAG_REDUCE, - MCA_PML_BASE_SEND_SYNCHRONOUS, comm, + ret = MCA_PML_CALL( isend((char*)sendbuf + + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, + count_by_segment, datatype, + tree->tree_prev, + MCA_COLL_BASE_TAG_REDUCE, + MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[creq]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } creq = (creq + 1) % max_outstanding_reqs; @@ -342,7 +322,7 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c } /* Wait on the remaining requests to complete */ - ret = ompi_request_wait_all( max_outstanding_reqs, sreq, + MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } @@ -353,8 +333,8 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c return OMPI_SUCCESS; error_hndl: /* error handler */ - OPAL_OUTPUT (( ompi_coll_tuned_stream, - "ERROR_HNDL: node %d file %s line %d error %d\n", + OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, + "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); if( inbuf_free[0] != NULL ) free(inbuf_free[0]); if( inbuf_free[1] != NULL ) free(inbuf_free[1]); @@ -369,9 +349,9 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c meaning that at least one datatype must fit in the segment! 
*/ -int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count, - ompi_datatype_t* datatype, - ompi_op_t* op, int root, +int ompi_coll_base_reduce_intra_chain( void *sendbuf, void *recvbuf, int count, + ompi_datatype_t* datatype, + ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, int fanout, @@ -379,27 +359,27 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count, { int segcount = count; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize)); - COLL_TUNED_UPDATE_CHAIN( comm, tuned_module, root, fanout ); + COLL_BASE_UPDATE_CHAIN( comm, base_module, root, fanout ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, - data->cached_chain, + data->cached_chain, segcount, max_outstanding_reqs ); } -int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf, +int ompi_coll_base_reduce_intra_pipeline( void *sendbuf, void *recvbuf, int count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, @@ -409,101 +389,101 @@ int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf, { int segcount = count; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_pipeline rank %d ss %5d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_pipeline rank %d ss %5d", ompi_comm_rank(comm), segsize)); - COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root ); + COLL_BASE_UPDATE_PIPELINE( comm, base_module, root ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, - data->cached_pipeline, + data->cached_pipeline, segcount, max_outstanding_reqs ); } -int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf, +int ompi_coll_base_reduce_intra_binary( void *sendbuf, void *recvbuf, int count, ompi_datatype_t* datatype, ompi_op_t* op, int root, - ompi_communicator_t* comm, + ompi_communicator_t* comm, mca_coll_base_module_t *module, - uint32_t segsize, + uint32_t segsize, int max_outstanding_reqs ) { int 
segcount = count; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binary rank %d ss %5d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_binary rank %d ss %5d", ompi_comm_rank(comm), segsize)); - COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root ); + COLL_BASE_UPDATE_BINTREE( comm, base_module, root ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, - data->cached_bintree, + data->cached_bintree, segcount, max_outstanding_reqs ); } -int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf, +int ompi_coll_base_reduce_intra_binomial( void *sendbuf, void *recvbuf, int count, ompi_datatype_t* datatype, ompi_op_t* op, int root, - ompi_communicator_t* comm, + ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, int max_outstanding_reqs ) { int segcount = count; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binomial rank %d ss %5d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_binomial rank %d ss %5d", ompi_comm_rank(comm), segsize)); - COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); + COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, - data->cached_in_order_bmtree, + data->cached_in_order_bmtree, segcount, max_outstanding_reqs ); } /* - * reduce_intra_in_order_binary - * + * reduce_intra_in_order_binary + * * Function: Logarithmic reduce operation for non-commutative operations. 
* Accepts: same as MPI_Reduce() * Returns: MPI_SUCCESS or error code */ -int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, - int count, +int ompi_coll_base_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, + int count, ompi_datatype_t* datatype, ompi_op_t* op, int root, - ompi_communicator_t* comm, + ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, int max_outstanding_reqs ) @@ -511,28 +491,28 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, int ret, rank, size, io_root, segcount = count; void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL; size_t typelng; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_in_order_binary rank %d ss %5d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_in_order_binary rank %d ss %5d", rank, segsize)); - COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module ); + COLL_BASE_UPDATE_IN_ORDER_BINTREE( comm, base_module ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); - COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); /* An in-order binary tree must use root (size-1) to preserve the order of operations. Thus, if root is not rank (size - 1), then we must handle - 1. MPI_IN_PLACE option on real root, and + 1. MPI_IN_PLACE option on real root, and 2. we must allocate temporary recvbuf on rank (size - 1). - Note that generic function must be careful not to switch order of + Note that the generic function must be careful not to switch order of operations for non-commutative ops. 
*/ io_root = size - 1; @@ -541,7 +521,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, if (io_root != root) { ptrdiff_t tlb, text, lb, ext; char *tmpbuf = NULL; - + ompi_datatype_get_extent(datatype, &lb, &ext); ompi_datatype_get_true_extent(datatype, &tlb, &text); @@ -550,7 +530,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, if (NULL == tmpbuf) { return MPI_ERR_INTERN; } - ompi_datatype_copy_content_same_ddt(datatype, count, + ompi_datatype_copy_content_same_ddt(datatype, count, (char*)tmpbuf, (char*)recvbuf); use_this_sendbuf = tmpbuf; @@ -564,9 +544,9 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, } /* Use generic reduce with in-order binary tree topology and io_root */ - ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype, - op, io_root, comm, module, - data->cached_in_order_bintree, + ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype, + op, io_root, comm, module, + data->cached_in_order_bintree, segcount, max_outstanding_reqs ); if (MPI_SUCCESS != ret) { return ret; } @@ -581,11 +561,11 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, if (MPI_IN_PLACE == sendbuf) { free(use_this_sendbuf); } - + } else if (io_root == rank) { /* Send result from use_this_recvbuf to root */ ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root, - MCA_COLL_BASE_TAG_REDUCE, + MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { return ret; } free(use_this_recvbuf); @@ -598,8 +578,8 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, /* * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple implementations - * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * but for some small number of nodes and/or small data sizes they + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. in V2 we will handle this differently and so will not @@ -617,12 +597,12 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, * Returns: - MPI_SUCCESS or error code */ int -ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) +ompi_coll_base_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { int i, rank, err, size; ptrdiff_t true_lb, true_extent, lb, extent; @@ -634,7 +614,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_basic_linear rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_basic_linear rank %d", rank)); /* If not root, send data to the root. 
*/ @@ -645,7 +625,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, return err; } - /* see discussion in ompi_coll_basic_reduce_lin_intra about + /* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extent */ /* for reducing buffer allocation lengths.... */ @@ -673,7 +653,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, /* Initialize the receive buffer. */ if (rank == (size - 1)) { - err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, + err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1, @@ -705,7 +685,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, } if (NULL != inplace_temp) { - err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, + err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp); } else { err = MPI_SUCCESS; @@ -724,185 +704,3 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, } /* copied function (with appropriate renaming) ends here */ - - -/** - * The following are used by dynamic and forced rules - * - * publish details of each algorithm and if its forced/fixed/locked in - * as you add methods/algorithms you must update this and the query/map routines - * - * this routine is called by the component only - * this makes sure that the mca parameters are set to their initial values and - * perms module does not call this they call the forced_getvalues routine - * instead. - */ - -int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t*new_enum; - - ompi_coll_tuned_forced_max_algorithms[REDUCE] = coll_tuned_reduce_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_algorithm_count", - "Number of reduce algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_reduce_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_reduce_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_reduce_algorithms", reduce_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_algorithm", - "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_reduce_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_algorithm_segmentsize", - "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 
0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_segment_size); - - coll_tuned_reduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_algorithm_tree_fanout", - "Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_tree_fanout); - - coll_tuned_reduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_algorithm_chain_fanout", - "Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_chain_fanout); - - coll_tuned_reduce_max_requests = 0; /* no limit for reduce by default */ - mca_param_indices->max_requests_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_algorithm_max_requests", - "Maximum number of outstanding send requests on leaf nodes. 0 means no limit.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_max_requests); - if (mca_param_indices->max_requests_param_index < 0) { - return mca_param_indices->max_requests_param_index; - } - - if (coll_tuned_reduce_max_requests < 0) { - if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { - opal_output( 0, "Maximum outstanding requests must be positive number or 0. 
Initializing to 0 (no limit).\n" ); - } - coll_tuned_reduce_max_requests = 0; - } - - return (MPI_SUCCESS); -} - - -int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - const int segsize = data->user_forced[REDUCE].segsize; - const int chain_fanout = data->user_forced[REDUCE].chain_fanout; - const int max_requests = data->user_forced[REDUCE].max_requests; - - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d", - data->user_forced[REDUCE].algorithm)); - - - switch (data->user_forced[REDUCE].algorithm) { - case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, - op, root, comm, module); - case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, - op, root, comm, module); - case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, chain_fanout, max_requests); - case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, max_requests); - case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, max_requests); - case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, max_requests); - case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, max_requests); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE])); - return (MPI_ERR_ARG); - } /* switch */ -} - - -int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, - int segsize, int max_requests ) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, - op, root, comm, module); - case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, - op, root, comm, module); - case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, faninout, max_requests); - case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, max_requests); - case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, max_requests); - case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, max_requests); - case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, - op, root, comm, module, - segsize, max_requests); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when 
only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE])); - return (MPI_ERR_ARG); - } /* switch */ -} - 
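For orientation before the next file: the non-overlapping reduce_scatter it opens with is semantically just a reduce to rank 0 followed by a scatterv of the result. A plain-MPI sketch of that contract (illustrative only; hypothetical helper name, fixed to MPI_DOUBLE, no error handling):

    #include <mpi.h>
    #include <stdlib.h>

    /* reduce_scatter expressed as reduce-to-root plus scatterv */
    static int reduce_scatter_nonoverlap(const double *sbuf, double *rbuf,
                                         const int *rcounts, MPI_Op op, MPI_Comm comm)
    {
        int rank, size, total = 0;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        int *displs = malloc(size * sizeof(int));
        for (int i = 0; i < size; i++) { displs[i] = total; total += rcounts[i]; }

        double *tmp = (0 == rank) ? malloc(total * sizeof(double)) : NULL;
        MPI_Reduce(sbuf, tmp, total, MPI_DOUBLE, op, 0, comm);   /* reduce to root */
        MPI_Scatterv(tmp, rcounts, displs, MPI_DOUBLE,           /* scatter blocks */
                     rbuf, rcounts[rank], MPI_DOUBLE, 0, comm);

        free(displs); free(tmp);
        return MPI_SUCCESS;
    }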
diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c index 3fc85daa87..0c23206c0e 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -32,37 +32,21 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" - -/* reduce_scatter algorithm variables */ -static int coll_tuned_reduce_scatter_algorithm_count = 2; -static int coll_tuned_reduce_scatter_forced_algorithm = 0; -static int coll_tuned_reduce_scatter_segment_size = 0; -static int coll_tuned_reduce_scatter_tree_fanout; -static int coll_tuned_reduce_scatter_chain_fanout; - -/* valid values for coll_tuned_reduce_scatter_forced_algorithm */ -static mca_base_var_enum_value_t reduce_scatter_algorithms[] = { - {0, "ignore"}, - {1, "non-overlapping"}, - {2, "recursive_halfing"}, - {3, "ring"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" /******************************************************************************* - * ompi_coll_tuned_reduce_scatter_intra_nonoverlapping + * ompi_coll_base_reduce_scatter_intra_nonoverlapping * - * This function just calls a reduce to rank 0, followed by an + * This function just calls a reduce to rank 0, followed by an * appropriate scatterv call. */ -int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, +int ompi_coll_base_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { int err, i, rank, size, total_count, *displs = NULL; const int root = 0; @@ -71,7 +55,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_nonoverlapping, rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_nonoverlapping, rank %d", rank)); for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; } @@ -80,7 +64,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, if (MPI_IN_PLACE == sbuf) { /* rbuf on root (0) is big enough to hold whole data */ if (root == rank) { - err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count, + err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); } else { err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count, @@ -91,13 +75,13 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, /* We must allocate temporary receive buffer on root to ensure that rbuf is big enough */ ptrdiff_t lb, extent, tlb, textent; - + ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &tlb, &textent); tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent); tmprbuf = tmprbuf_free - lb; - } + } err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); } @@ -105,7 +89,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, if (NULL != tmprbuf_free) free(tmprbuf_free); return err; } - + displs = (int*) malloc(size * sizeof(int)); displs[0] = 0; for (i = 1; i < size; i++) { @@ -122,7 +106,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, /* * Recursive-halving function is (*mostly*) copied from the BASIC coll module. - * I have removed the part which handles "large" message sizes + * I have removed the part which handles "large" message sizes * (non-overlapping version of reduce_scatter). */ @@ -131,15 +115,15 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, /* * reduce_scatter_intra_basic_recursivehalving * - * Function: - reduce scatter implementation using recursive-halving + * Function: - reduce scatter implementation using recursive-halving * algorithm * Accepts: - same as MPI_Reduce_scatter() * Returns: - MPI_SUCCESS or error code * Limitation: - Works only for commutative operations. 
*/ int -ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, - void *rbuf, +ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(void *sbuf, + void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, @@ -151,12 +135,12 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, ptrdiff_t true_lb, true_extent, lb, extent, buf_size; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; - + /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); - - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_basic_recursivehalving, rank %d", rank)); + + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_basic_recursivehalving, rank %d", rank)); /* Find displacements and the like */ disps = (int*) malloc(sizeof(int) * size); @@ -191,43 +175,43 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } - + /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); result_buf = result_buf_free - true_lb; - + /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); if (OMPI_SUCCESS != err) goto cleanup; - + /* figure out power of two mapping: grow until larger than comm size, then go back one, to get the largest power of two less than comm size */ - tmp_size = opal_next_poweroftwo (size); + tmp_size = opal_next_poweroftwo (size); tmp_size >>= 1; remain = size - tmp_size; - + /* If comm size is not a power of two, have the first "remain" procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { if ((rank & 1) == 0) { - err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, + err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) goto cleanup; - + /* we don't participate from here on out */ tmp_rank = -1; } else { err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); - + /* integrate their results into our temp results */ ompi_op_reduce(op, recv_buf, result_buf, count, dtype); - + /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; } @@ -236,13 +220,13 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, remain" ranks dropped out */ tmp_rank = rank - remain; } - + /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { int *tmp_disps = NULL, *tmp_rcounts = NULL; int mask, send_index, recv_index, last_index; - + /* recalculate disps and rcounts to account for the special "remainder" processes that are no longer doing anything */ @@ -317,11 +301,11 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, free(tmp_rcounts); free(tmp_disps); goto cleanup; - } + } } if (send_count > 0) { err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent, - send_count, dtype, peer, + send_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); @@ -329,7 +313,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, free(tmp_rcounts); free(tmp_disps); goto cleanup; - } + } } /* if we received something on this step, push it into @@ -340,10 +324,10 @@ 
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, free(tmp_rcounts); free(tmp_disps); goto cleanup; - } + } - ompi_op_reduce(op, - recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, + ompi_op_reduce(op, + recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, recv_count, dtype); } @@ -357,13 +341,13 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, /* copy local results from results buffer into real receive buffer */ if (0 != rcounts[rank]) { err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent, - rcounts[rank], dtype, + rcounts[rank], dtype, rbuf, rcounts[rank], dtype); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; - } + } } free(tmp_rcounts); @@ -389,7 +373,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, comm)); if (OMPI_SUCCESS != err) goto cleanup; } - } + } } cleanup: @@ -404,18 +388,18 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, /* - * ompi_coll_tuned_reduce_scatter_intra_ring + * ompi_coll_base_reduce_scatter_intra_ring * * Function: Ring algorithm for reduce_scatter operation * Accepts: Same as MPI_Reduce_scatter() * Returns: MPI_SUCCESS or error code * - * Description: Implements ring algorithm for reduce_scatter: - * the block sizes defined in rcounts are exchanged and + * Description: Implements ring algorithm for reduce_scatter: + * the block sizes defined in rcounts are exchanged and 8 updated until they reach proper destination. * Algorithm requires 2 * max(rcounts) extra buffering * - * Limitations: The algorithm DOES NOT preserve order of operations so it + * Limitations: The algorithm DOES NOT preserve order of operations so it * can be used only for commutative operations. * Example on 5 nodes: * Initial state @@ -427,7 +411,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, * [04] -> [14] [24] [34] [44] * * COMPUTATION PHASE - * Step 0: rank r sends block (r-1) to rank (r+1) and + * Step 0: rank r sends block (r-1) to rank (r+1) and * receives block (r+1) from rank (r-1) [with wraparound]. * # 0 1 2 3 4 * [00] [10] [10+20] -> [30] [40] @@ -435,12 +419,12 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, * -> [02] [12] [22] [32] [32+42] -->.. 
* [43+03] -> [13] [23] [33] [43] * [04] [04+14] -> [24] [34] [44] - * + * * Step 1: * # 0 1 2 3 4 * [00] [10] [10+20] [10+20+30] -> [40] * -> [01] [11] [21] [21+31] [21+31+41] -> - * [32+42+02] -> [12] [22] [32] [32+42] + * [32+42+02] -> [12] [22] [32] [32+42] * [03] [43+03+13] -> [23] [33] [43] * [04] [04+14] [04+14+24] -> [34] [44] * @@ -448,7 +432,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, * # 0 1 2 3 4 * -> [00] [10] [10+20] [10+20+30] [10+20+30+40] -> * [21+31+41+01]-> [11] [21] [21+31] [21+31+41] - * [32+42+02] [32+42+02+12]-> [22] [32] [32+42] + * [32+42+02] [32+42+02+12]-> [22] [32] [32+42] * [03] [43+03+13] [43+03+13+23]-> [33] [43] * [04] [04+14] [04+14+24] [04+14+24+34] -> [44] * @@ -456,14 +440,14 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, * # 0 1 2 3 4 * [10+20+30+40+00] [10] [10+20] [10+20+30] [10+20+30+40] * [21+31+41+01] [21+31+41+01+11] [21] [21+31] [21+31+41] - * [32+42+02] [32+42+02+12] [32+42+02+12+22] [32] [32+42] + * [32+42+02] [32+42+02+12] [32+42+02+12+22] [32] [32+42] * [03] [43+03+13] [43+03+13+23] [43+03+13+23+33] [43] * [04] [04+14] [04+14+24] [04+14+24+34] [04+14+24+34+44] * DONE :) * */ -int -ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, +int +ompi_coll_base_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, @@ -480,11 +464,11 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:reduce_scatter_intra_ring rank %d, size %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:reduce_scatter_intra_ring rank %d, size %d", rank, size)); - /* Determine the maximum number of elements per node, + /* Determine the maximum number of elements per node, corresponding block size, and displacements array. 
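The diagrams above compress two pieces of index arithmetic that the code below relies on: the power-of-two fold-in used by the recursive-halving implementation, and the ring's fixed neighbors with a rotating block index. A standalone sketch (plain C, illustration only; the real code computes the first part via opal_next_poweroftwo()):

    #include <stdio.h>

    int main(void)
    {
        const int size = 5;

        /* Largest power of two <= size, then fold the excess ranks in:
         * ranks below 2*remain pair up so a power-of-two group remains. */
        int tmp_size = 1;
        while (tmp_size <= size) tmp_size <<= 1;
        tmp_size >>= 1;                        /* 4 for size == 5 */
        int remain = size - tmp_size;          /* 1 even/odd pair folds in */
        printf("pof2 group %d, %d pair(s) fold in\n", tmp_size, remain);

        /* Ring: send right, receive left; at step k rank r combines the
         * incoming data into block (r - k + size) % size, exactly as in
         * the computation loop that follows. */
        for (int rank = 0; rank < size; rank++)
            printf("rank %d: send_to %d, recv_from %d, step-2 block %d\n",
                   rank, (rank + 1) % size, (rank - 1 + size) % size,
                   (rank + size - 2) % size);
        return 0;
    }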
*/ displs = (int*) malloc(size * sizeof(int)); @@ -492,16 +476,16 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, displs[0] = 0; total_count = rcounts[0]; max_block_count = rcounts[0]; - for (i = 1; i < size; i++) { + for (i = 1; i < size; i++) { displs[i] = total_count; total_count += rcounts[i]; if (max_block_count < rcounts[i]) max_block_count = rcounts[i]; } - + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, total_count, + ret = ompi_datatype_copy_content_same_ddt(dtype, total_count, (char*)rbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } @@ -541,13 +525,13 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, sbuf = rbuf; } - ret = ompi_datatype_copy_content_same_ddt(dtype, total_count, + ret = ompi_datatype_copy_content_same_ddt(dtype, total_count, accumbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } /* Computation loop */ - /* + /* For each of the remote nodes: - post irecv for block (r-2) from (r-1) with wrap around - send block (r-1) to (r+1) @@ -568,7 +552,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, inbi = 0; /* Initialize first receive from the neighbor on the left */ ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, &reqs[inbi])); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } tmpsend = accumbuf + (ptrdiff_t)displs[recv_from] * extent; @@ -579,25 +563,25 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, for (k = 2; k < size; k++) { const int prevblock = (rank + size - k) % size; - + inbi = inbi ^ 0x1; /* Post irecv for the current block */ ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, &reqs[inbi])); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - + /* Wait on previous block to arrive */ ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - + /* Apply operation on previous block: result goes to rbuf rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] */ tmprecv = accumbuf + (ptrdiff_t)displs[prevblock] * extent; ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, rcounts[prevblock], dtype); - + /* send previous block to send_to */ ret = MCA_PML_CALL(send(tmprecv, rcounts[prevblock], dtype, send_to, MCA_COLL_BASE_TAG_REDUCE_SCATTER, @@ -613,7 +597,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, rbuf[rank] = inbuf[inbi] (op) rbuf[rank] */ tmprecv = accumbuf + (ptrdiff_t)displs[rank] * extent; ompi_op_reduce(op, inbuf[inbi], tmprecv, rcounts[rank], dtype); - + /* Copy result from tmprecv to rbuf */ ret = ompi_datatype_copy_content_same_ddt(dtype, rcounts[rank], (char *)rbuf, tmprecv); if (ret < 0) { line = __LINE__; goto error_hndl; } @@ -626,7 +610,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, return MPI_SUCCESS; error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); if (NULL != displs) free(displs); if (NULL != accumbuf_free) free(accumbuf_free); @@ -634,139 +618,3 @@ 
ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, if (NULL != inbuf_free[1]) free(inbuf_free[1]); return ret; } - - -/** - * The following are used by dynamic and forced rules - * - * publish details of each algorithm and if its forced/fixed/locked in - * as you add methods/algorithms you must update this and the query/map routines - * - * this routine is called by the component only - * this makes sure that the mca parameters are set to their initial values and - * perms module does not call this they call the forced_getvalues routine - * instead - */ - -int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER] = coll_tuned_reduce_scatter_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_scatter_algorithm_count", - "Number of reduce_scatter algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_reduce_scatter_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_reduce_scatter_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_reduce_scatter_algorithms", reduce_scatter_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_scatter_algorithm", - "Which reduce reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_scatter_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_reduce_scatter_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_scatter_algorithm_segmentsize", - "Segment size in bytes used by default for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_scatter_segment_size); - - coll_tuned_reduce_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_scatter_algorithm_tree_fanout", - "Fanout for n-tree used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_scatter_tree_fanout); - - coll_tuned_reduce_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "reduce_scatter_algorithm_chain_fanout", - "Fanout for chains used for reduce_scatter algorithms. 
Only has meaning if algorithm is forced and supports chain topo based operation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_reduce_scatter_chain_fanout); - - return (MPI_SUCCESS); -} - - -int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf, - int *rcounts, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d", - data->user_forced[REDUCESCATTER].algorithm)); - - switch (data->user_forced[REDUCESCATTER].algorithm) { - case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts, - dtype, op, comm, module); - case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts, - dtype, op, comm, module); - case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts, - dtype, op, comm, module); - case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts, - dtype, op, comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER])); - return (MPI_ERR_ARG); - } /* switch */ -} - - -int ompi_coll_tuned_reduce_scatter_intra_do_this(void *sbuf, void* rbuf, - int *rcounts, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts, - dtype, op, comm, module); - case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts, - dtype, op, comm, module); - case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts, - dtype, op, comm, module); - case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts, - dtype, op, comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER])); - return (MPI_ERR_ARG); - } /* switch */ -} - diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c index b9381e18b7..e832f4064e 100644 --- a/ompi/mca/coll/base/coll_base_scatter.c +++ b/ompi/mca/coll/base/coll_base_scatter.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
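A practical note on the forced-selection code removed above before the patch moves into the scatter file: the switch dispatchers and MCA registrations are tuned-component policy rather than base algorithms, and the registered variable names let a user pin one of these implementations at run time, for example

    mpirun --mca coll_tuned_use_dynamic_rules 1 \
           --mca coll_tuned_reduce_scatter_algorithm 3 ./app

to request the ring variant (choice 3 in the enum registered above, assuming the usual tuned-component convention that dynamic rules must be enabled before a forced algorithm takes effect).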
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -28,27 +28,12 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" -#include "coll_tuned_util.h" - -/* scatter algorithm variables */ -static int coll_tuned_scatter_algorithm_count = 2; -static int coll_tuned_scatter_forced_algorithm = 0; -static int coll_tuned_scatter_segment_size = 0; -static int coll_tuned_scatter_tree_fanout; -static int coll_tuned_scatter_chain_fanout; - -/* valid values for coll_tuned_scatter_forced_algorithm */ -static mca_base_var_enum_value_t scatter_algorithms[] = { - {0, "ignore"}, - {1, "basic_linear"}, - {2, "binomial"}, - {0, NULL} -}; +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" +#include "coll_base_util.h" int -ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, +ompi_coll_base_scatter_intra_binomial(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -60,19 +45,19 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, char *ptmp, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; - MPI_Aint sextent, slb, strue_lb, strue_extent; + MPI_Aint sextent, slb, strue_lb, strue_extent; MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; + mca_coll_base_comm_t *data = base_module->base_data; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_scatter_intra_binomial rank %d", rank)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "ompi_coll_base_scatter_intra_binomial rank %d", rank)); /* create the binomial tree */ - COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); + COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; ompi_datatype_get_extent(sdtype, &slb, &sextent); @@ -167,7 +152,7 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, total_send += mycount; } - if (NULL != tempbuf) + if (NULL != tempbuf) free(tempbuf); } else { /* recv from parent on leaf nodes */ @@ -182,7 +167,7 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, if (NULL != tempbuf) free(tempbuf); - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; } @@ -190,13 +175,13 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, /* * Linear functions are copied from the BASIC coll module * they do not segment the message and are simple implementations - * but for some small number of nodes and/or small data sizes they - * are just as fast as tuned/tree based segmenting operations + * but for some small number of nodes and/or small data sizes they + * are just as fast as base/tree based segmenting operations * and as such may be selected by the decision functions * These are copied into this module due to the way we select modules * in V1. i.e. in V2 we will handle this differently and so will not * have to duplicate code. - * JPG following the examples from other coll_tuned implementations. Dec06. 
+ * JPG following the examples from other coll_base implementations. Dec06. */ /* copied function (with appropriate renaming) starts here */ @@ -208,7 +193,7 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, * Returns: - MPI_SUCCESS or error code */ int -ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount, +ompi_coll_base_scatter_intra_basic_linear(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, @@ -269,153 +254,3 @@ ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount, /* copied function (with appropriate renaming) ends here */ - -/* The following are used by dynamic and forced rules */ - -/* publish details of each algorithm and if its forced/fixed/locked in */ -/* as you add methods/algorithms you must update this and the query/map - routines */ - -/* this routine is called by the component only */ -/* this makes sure that the mca parameters are set to their initial values - and perms */ -/* module does not call this they call the forced_getvalues routine instead */ - -int -ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) -{ - mca_base_var_enum_t *new_enum; - - ompi_coll_tuned_forced_max_algorithms[SCATTER] = coll_tuned_scatter_algorithm_count; - - (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm_count", - "Number of scatter algorithms available", - MCA_BASE_VAR_TYPE_INT, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &coll_tuned_scatter_algorithm_count); - - /* MPI_T: This variable should eventually be bound to a communicator */ - coll_tuned_scatter_forced_algorithm = 0; - (void) mca_base_var_enum_create("coll_tuned_scatter_algorithms", scatter_algorithms, &new_enum); - mca_param_indices->algorithm_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm", - "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_scatter_forced_algorithm); - OBJ_RELEASE(new_enum); - if (mca_param_indices->algorithm_param_index < 0) { - return mca_param_indices->algorithm_param_index; - } - - coll_tuned_scatter_segment_size = 0; - mca_param_indices->segsize_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm_segmentsize", - "Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_scatter_segment_size); - - coll_tuned_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */ - mca_param_indices->tree_fanout_param_index = - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm_tree_fanout", - "Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. 
Currently, available algorithms do not support n-tree topologies.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_scatter_tree_fanout); - - coll_tuned_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */ - mca_param_indices->chain_fanout_param_index= - mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm_chain_fanout", - "Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &coll_tuned_scatter_chain_fanout); - - return (MPI_SUCCESS); -} - -int -ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; - mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:scatter_intra_do_forced selected algorithm %d", - data->user_forced[SCATTER].algorithm)); - - switch (data->user_forced[SCATTER].algorithm) { - case (0): - return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (1): - return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (2): - return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[SCATTER].algorithm, - ompi_coll_tuned_forced_max_algorithms[SCATTER])); - return (MPI_ERR_ARG); - } /* switch */ -} - -int -ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) -{ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); - - switch (algorithm) { - case (0): - return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (1): - return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - case (2): - return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, - ompi_coll_tuned_forced_max_algorithms[SCATTER])); - return (MPI_ERR_ARG); - } /* switch */ -} diff --git a/ompi/mca/coll/base/coll_base_topo.c b/ompi/mca/coll/base/coll_base_topo.c index 0df6599dd6..0736faaf91 100644 --- a/ompi/mca/coll/base/coll_base_topo.c +++ b/ompi/mca/coll/base/coll_base_topo.c @@ -5,16 +5,16 @@ * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
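Since the hunk above elides the body of the basic linear scatter, the behaviour it describes is worth making concrete: the root sends the i-th block to rank i while every other process posts a single receive. A minimal MPI-level sketch of that pattern (hypothetical illustration, not the elided code itself):

    #include <mpi.h>

    /* Basic linear scatter: no segmentation, no tree; as the comment
     * above notes, competitive for small communicators or small data. */
    static int scatter_basic_linear(const int *sbuf, int scount,
                                    int *rbuf, int root, MPI_Comm comm)
    {
        int rank, size;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        if (rank != root) {                 /* non-root: one receive */
            return MPI_Recv(rbuf, scount, MPI_INT, root, 0, comm,
                            MPI_STATUS_IGNORE);
        }
        for (int i = 0; i < size; i++) {    /* root: one send per peer */
            if (i == root) {
                for (int j = 0; j < scount; j++)
                    rbuf[j] = sbuf[i * scount + j];   /* local copy */
            } else {
                MPI_Send(sbuf + i * scount, scount, MPI_INT, i, 0, comm);
            }
        }
        return MPI_SUCCESS;
    }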
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -25,8 +25,8 @@ #include "ompi/constants.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/base/coll_tags.h" -#include "coll_tuned.h" -#include "coll_tuned_topo.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_base_topo.h" /* * Some static helpers. @@ -75,36 +75,36 @@ static int calculate_num_nodes_up_to_level( int fanout, int level ) */ ompi_coll_tree_t* -ompi_coll_tuned_topo_build_tree( int fanout, +ompi_coll_base_topo_build_tree( int fanout, struct ompi_communicator_t* comm, int root ) { int rank, size, schild, sparent, shiftedrank, i; int level; /* location of my rank in the tree structure of size */ int delta; /* number of nodes on my level */ - int slimit; /* total number of nodes on levels above me */ + int slimit; /* total number of nodes on levels above me */ ompi_coll_tree_t* tree; - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree Building fo %d rt %d", fanout, root)); if (fanout<1) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree invalid fanout %d", fanout)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree invalid fanout %d", fanout)); return NULL; } if (fanout>MAXTREEFANOUT) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT)); return NULL; } - /* - * Get size and rank of the process in this communicator + /* + * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); if (!tree) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree PANIC::out of memory")); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree PANIC::out of memory")); return NULL; } @@ -115,8 +115,8 @@ ompi_coll_tuned_topo_build_tree( int fanout, * Set root */ tree->tree_root = root; - - /* + + /* * Initialize tree */ tree->tree_fanout = fanout; @@ -132,11 +132,11 @@ ompi_coll_tuned_topo_build_tree( int fanout, if( size < 2 ) { return tree; } - + /* - * Shift all ranks by root, so that the algorithm can be + * Shift all ranks by root, so that the algorithm can be * designed as if root would be always 0 - * shiftedrank should be used in calculating distances + * shiftedrank should be used in calculating distances * and position in tree */ shiftedrank = rank - root; @@ -158,7 +158,7 @@ ompi_coll_tuned_topo_build_tree( int fanout, break; } } - + /* find my parent */ slimit = calculate_num_nodes_up_to_level( fanout, level ); sparent = shiftedrank; @@ -170,12 +170,12 @@ ompi_coll_tuned_topo_build_tree( int fanout, } } tree->tree_prev = (sparent+root)%size; - + return tree; } /* - * Constructs in-order binary tree which can be 
used for non-commutative reduce + * Constructs in-order binary tree which can be used for non-commutative reduce * operations. * Root of this tree is always rank (size-1) and fanout is 2. * Here are some of the examples of this tree: @@ -189,28 +189,28 @@ ompi_coll_tuned_topo_build_tree( int fanout, * 4 0 */ ompi_coll_tree_t* -ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm ) +ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm ) { int rank, size, myrank, rightsize, delta, parent, lchild, rchild; ompi_coll_tree_t* tree; - /* - * Get size and rank of the process in this communicator + /* + * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); if (!tree) { - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:topo_build_tree PANIC::out of memory")); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:topo_build_tree PANIC::out of memory")); return NULL; } tree->tree_root = MPI_UNDEFINED; tree->tree_nextsize = MPI_UNDEFINED; - /* + /* * Initialize tree */ tree->tree_fanout = 2; @@ -220,11 +220,11 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm ) tree->tree_nextsize = 0; tree->tree_next[0] = -1; tree->tree_next[1] = -1; - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:topo_build_in_order_tree Building fo %d rt %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:topo_build_in_order_tree Building fo %d rt %d", tree->tree_fanout, tree->tree_root)); - /* + /* * Build the tree */ myrank = rank; @@ -240,18 +240,18 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm ) rchild = -1; if (size - 1 > 0) { lchild = parent - 1; - if (lchild > 0) { + if (lchild > 0) { rchild = rightsize - 1; } } - - /* The following cases are possible: myrank can be + + /* The following cases are possible: myrank can be - a parent, - belong to the left subtree, or - belong to the right subtee Each of the cases need to be handled differently. */ - + if (myrank == parent) { /* I am the parent: - compute real ranks of my children, and exit the loop. */ @@ -262,7 +262,7 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm ) if (myrank > rchild) { /* I belong to the left subtree: - If I am the left child, compute real rank of my parent - - Iterate down through tree: + - Iterate down through tree: compute new size, shift ranks down, and update delta. */ if (myrank == lchild) { @@ -276,8 +276,8 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm ) } else { /* I belong to the right subtree: - If I am the right child, compute real rank of my parent - - Iterate down through tree: - compute new size and parent, + - Iterate down through tree: + compute new size and parent, but the delta and rank do not need to change. 
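An aside on why this in-order construction exists at all: MPI requires a non-commutative reduction operator to be applied in rank order, so the tree must feed operands to the operator in exactly that order, and reordering is only legal for commutative operations (which is why the recursive-halving and ring reduce_scatter variants above are restricted to them). A small standalone illustration with a non-commutative "operator":

    #include <stdio.h>
    #include <string.h>

    /* Concatenation is non-commutative: combining rank contributions in
     * a different order yields a different "reduction" result. */
    int main(void)
    {
        char in_order[8] = "", reordered[8] = "";
        strcat(strcat(strcat(in_order, "A"), "B"), "C");   /* ranks 0,1,2 */
        strcat(strcat(strcat(reordered, "B"), "A"), "C");  /* ranks 1,0,2 */
        printf("%s vs %s\n", in_order, reordered);         /* ABC vs BAC */
        return 0;
    }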
*/ if (myrank == rchild) { @@ -287,14 +287,14 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm ) parent = rchild; } } - + if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; } if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; } return tree; } -int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree ) +int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree ) { ompi_coll_tree_t *ptr; @@ -311,7 +311,7 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree ) } /* - * + * * Here are some of the examples of this tree: * size == 2 size = 4 size = 8 * 0 0 0 @@ -323,16 +323,16 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree ) * 7 */ ompi_coll_tree_t* -ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, +ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm, int root ) { int childs = 0, rank, size, mask = 1, index, remote, i; ompi_coll_tree_t *bmtree; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree rt %d", root)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree rt %d", root)); - /* - * Get size and rank of the process in this communicator + /* + * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -341,7 +341,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); if (!bmtree) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory")); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory")); return NULL; } @@ -372,7 +372,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, remote += root; if( remote >= size ) remote -= size; if (childs==MAXTREEFANOUT) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs)); free(bmtree); return NULL; } @@ -388,7 +388,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, /* * Constructs in-order binomial tree which can be used for gather/scatter * operations. 
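The binomial tree built above admits a closed-form neighbor computation. The following standalone sketch derives a rank's parent and children from the shifted rank, in the spirit of ompi_coll_base_topo_build_bmtree (hypothetical illustration of the classic binomial layout; the in-order variant described next orders children differently):

    #include <stdio.h>

    /* Classic binomial-tree neighbors for a broadcast rooted at `root`,
     * using the same shift trick: vrank = (rank - root + size) % size. */
    static void bmtree_neighbors(int rank, int size, int root)
    {
        int vrank = (rank - root + size) % size;
        int h = 0;                 /* highest set bit of vrank (0 = root) */
        for (int m = 1; m <= vrank; m <<= 1)
            if (vrank & m) h = m;

        if (vrank == 0) printf("rank %d: root\n", rank);
        else printf("rank %d: parent %d\n", rank,
                    ((vrank - h) + root) % size);

        /* Children sit at vrank + m for each m above the bit we
         * received on, as long as they stay inside the communicator. */
        for (int m = (vrank == 0) ? 1 : (h << 1); vrank + m < size; m <<= 1)
            printf("rank %d: child %d\n", rank, ((vrank + m) + root) % size);
    }

    int main(void)
    {
        for (int r = 0; r < 8; r++) bmtree_neighbors(r, 8, 0);
        return 0;
    }

For size 8 and root 0 this reproduces the tree pictured above: 0 has children 1, 2, 4; 1 has 3 and 5; 2 has 6; 3 has 7.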
- * + * * Here are some of the examples of this tree: * size == 2 size = 4 size = 8 * 0 0 0 @@ -400,16 +400,16 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, * 7 */ ompi_coll_tree_t* -ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, +ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, int root ) { int childs = 0, rank, vrank, size, mask = 1, remote, i; ompi_coll_tree_t *bmtree; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree rt %d", root)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_in_order_bmtree rt %d", root)); - /* - * Get size and rank of the process in this communicator + /* + * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -418,7 +418,7 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); if (!bmtree) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory")); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory")); return NULL; } @@ -442,8 +442,8 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, bmtree->tree_next[childs] = (remote + root) % size; childs++; if (childs==MAXTREEFANOUT) { - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "coll:base:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs)); free (bmtree); return NULL; @@ -459,36 +459,36 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, ompi_coll_tree_t* -ompi_coll_tuned_topo_build_chain( int fanout, +ompi_coll_base_topo_build_chain( int fanout, struct ompi_communicator_t* comm, int root ) { int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */; ompi_coll_tree_t *chain; - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain fo %d rt %d", fanout, root)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain fo %d rt %d", fanout, root)); - /* - * Get size and rank of the process in this communicator + /* + * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); if( fanout < 1 ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!")); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!")); fanout = 1; } if (fanout>MAXTREEFANOUT) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT)); fanout = MAXTREEFANOUT; } /* - * Allocate space for topology arrays if needed + * Allocate space for topology arrays if needed */ chain = (ompi_coll_tree_t*)malloc( sizeof(ompi_coll_tree_t) ); if (!chain) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain PANIC out of memory")); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain PANIC out of memory")); 
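The chain builder whose fanout clamping appears above (and which continues below) is easiest to picture in its degenerate fanout-1 case, a pipeline. A standalone sketch of the shift-by-root neighbor computation for that case (illustration only, not the general multi-chain layout the function produces for fanout greater than 1):

    #include <stdio.h>

    int main(void)
    {
        const int size = 6, root = 2;
        for (int rank = 0; rank < size; rank++) {
            int srank = (rank - root + size) % size;   /* shifted rank */
            int prev = (srank == 0) ? -1 : (srank - 1 + root) % size;
            int next = (srank == size - 1) ? -1 : (srank + 1 + root) % size;
            printf("rank %d: prev %d next %d\n", rank, prev, next);
        }
        return 0;
    }

With root 2 and six ranks this prints the single chain 2 -> 3 -> 4 -> 5 -> 0 -> 1, with -1 marking the ends.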
fflush(stdout); return NULL; } @@ -496,17 +496,17 @@ ompi_coll_tuned_topo_build_chain( int fanout, chain->tree_nextsize = -1; for(i=0;itree_next[i] = -1; - /* + /* * Set root & numchain */ chain->tree_root = root; - if( (size - 1) < fanout ) { + if( (size - 1) < fanout ) { chain->tree_nextsize = size-1; fanout = size-1; } else { chain->tree_nextsize = fanout; } - + /* * Shift ranks */ @@ -577,7 +577,7 @@ ompi_coll_tuned_topo_build_chain( int fanout, chain->tree_nextsize = 1; } else { chain->tree_next[0] = -1; - chain->tree_nextsize = 0; + chain->tree_nextsize = 0; } } chain->tree_prev = (chain->tree_prev+root)%size; @@ -586,7 +586,7 @@ ompi_coll_tuned_topo_build_chain( int fanout, } } else { /* - * Unshift values + * Unshift values */ chain->tree_prev = -1; chain->tree_next[0] = (root+1)%size; @@ -603,17 +603,62 @@ ompi_coll_tuned_topo_build_chain( int fanout, return chain; } -int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank) +int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank) { int i; - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo:topo_dump_tree %1d tree root %d" + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo:topo_dump_tree %1d tree root %d" " fanout %d BM %1d nextsize %d prev %d", rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout, tree->tree_nextsize, tree->tree_prev)); if( tree->tree_nextsize ) { for( i = 0; i < tree->tree_nextsize; i++ ) - OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d", i, tree->tree_next[i])); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"[%1d] %d", i, tree->tree_next[i])); } return (0); } + +mca_coll_base_comm_t* ompi_coll_base_topo_construct( mca_coll_base_comm_t* data ) +{ + if( NULL == data ) { + data = (mca_coll_base_comm_t*)calloc(1, sizeof(mca_coll_base_comm_t)); + } + return data; +} + +void ompi_coll_base_topo_destruct( mca_coll_base_comm_t* data ) +{ + if(NULL == data) return; + +#if OPAL_ENABLE_DEBUG + /* Reset the reqs to NULL/0 -- they'll be freed as part of freeing + the generel c_coll_selected_data */ + data->mcct_reqs = NULL; + data->mcct_num_reqs = 0; +#endif + + /* free any cached information that has been allocated */ + if (data->cached_ntree) { /* destroy general tree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_ntree); + } + if (data->cached_bintree) { /* destroy bintree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_bintree); + } + if (data->cached_bmtree) { /* destroy bmtree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_bmtree); + } + if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bmtree); + } + if (data->cached_chain) { /* destroy general chain if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_chain); + } + if (data->cached_pipeline) { /* destroy pipeline if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_pipeline); + } + if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */ + ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bintree); + } + + free(data); +} diff --git a/ompi/mca/coll/base/coll_base_topo.h b/ompi/mca/coll/base/coll_base_topo.h index 717b67aed9..35159dadfb 100644 --- a/ompi/mca/coll/base/coll_base_topo.h +++ b/ompi/mca/coll/base/coll_base_topo.h @@ -5,19 +5,19 @@ * Copyright (c) 2004-2012 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
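The construct/destruct pair added above leans on ompi_coll_base_topo_destroy_tree taking a pointer-to-pointer, so each cached_* slot is reset as it is freed and the destruct code can simply test every field. A miniature of that pattern (hypothetical stand-in types, standalone):

    #include <stdlib.h>

    typedef struct tree { int root; } tree_t;

    /* Free the tree and NULL the caller's cached pointer in one call. */
    static int destroy_tree(tree_t **tree)
    {
        if (NULL == tree || NULL == *tree) return 0;
        free(*tree);
        *tree = NULL;      /* caller's cache slot is safely reusable */
        return 0;
    }

    int main(void)
    {
        tree_t *cached = malloc(sizeof(*cached));
        destroy_tree(&cached);    /* cached is NULL afterwards */
        destroy_tree(&cached);    /* second call is a harmless no-op */
        return 0;
    }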
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ -#ifndef MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED -#define MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED +#ifndef MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED +#define MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED #include "ompi_config.h" @@ -35,29 +35,28 @@ typedef struct ompi_coll_tree_t { } ompi_coll_tree_t; ompi_coll_tree_t* -ompi_coll_tuned_topo_build_tree( int fanout, +ompi_coll_base_topo_build_tree( int fanout, struct ompi_communicator_t* com, int root ); ompi_coll_tree_t* -ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm ); +ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm ); ompi_coll_tree_t* -ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, +ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm, int root ); ompi_coll_tree_t* -ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, +ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm, int root ); ompi_coll_tree_t* -ompi_coll_tuned_topo_build_chain( int fanout, +ompi_coll_base_topo_build_chain( int fanout, struct ompi_communicator_t* com, int root ); -int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree ); +int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree ); /* debugging stuff, will be removed later */ -int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank); +int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank); END_C_DECLS -#endif /* MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED */ - +#endif /* MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 8fe57ce01b..34607067e8 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -19,17 +19,17 @@ */ #include "ompi_config.h" -#include "coll_tuned.h" #include "mpi.h" #include "ompi/constants.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/coll/base/coll_base_functions.h" #include "ompi/mca/pml/pml.h" -#include "coll_tuned_util.h" +#include "coll_base_util.h" -int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, +int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, int dest, int stag, void* recvbuf, size_t rcount, @@ -91,14 +91,14 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, *status = statuses[err_index]; } err = statuses[err_index].MPI_ERROR; - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s" - " stage of ompi_coll_tuned_sendrecv_zero\n", + OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s" + " stage of ompi_coll_base_sendrecv_zero\n", __FILE__, line, err, (0 == err_index ? "receive" : "send"))); } else { /* Error discovered during the posting of the irecv or isend, * and no status is available. */ - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n", + OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n", __FILE__, line, err)); if (MPI_STATUS_IGNORE != status) { status->MPI_ERROR = err; diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index e46e7f4020..c49d6e37bd 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -18,8 +18,8 @@ * $HEADER$ */ -#ifndef MCA_COLL_TUNED_UTIL_EXPORT_H -#define MCA_COLL_TUNED_UTIL_EXPORT_H +#ifndef MCA_COLL_BASE_UTIL_EXPORT_H +#define MCA_COLL_BASE_UTIL_EXPORT_H #include "ompi_config.h" @@ -36,7 +36,7 @@ BEGIN_C_DECLS * If one of the communications results in a zero-byte message the * communication is ignored, and no message will cross to the peer. */ -int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, +int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, int dest, int stag, void* recvbuf, size_t rcount, @@ -53,7 +53,7 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, * communications. 
*/ static inline int -ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, +ompi_coll_base_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, int dest, int stag, void* recvbuf, size_t rcount, ompi_datatype_t* rdatatype, int source, int rtag, @@ -64,13 +64,13 @@ ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdataty return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype, recvbuf, (int32_t) rcount, rdatatype); } - return ompi_coll_tuned_sendrecv_nonzero_actual (sendbuf, scount, sdatatype, + return ompi_coll_base_sendrecv_nonzero_actual (sendbuf, scount, sdatatype, dest, stag, recvbuf, rcount, rdatatype, source, rtag, comm, status); } END_C_DECLS -#endif /* MCA_COLL_TUNED_UTIL_EXPORT_H */ +#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/mca/coll/coll.h b/ompi/mca/coll/coll.h index 34bed4a14e..82f62ff69c 100644 --- a/ompi/mca/coll/coll.h +++ b/ompi/mca/coll/coll.h @@ -470,6 +470,9 @@ struct mca_coll_base_module_2_1_0_t { be used for the given communicator */ mca_coll_base_module_disable_1_1_0_fn_t coll_module_disable; + /** Data storage for all the algorithms defined in the base. Should + not be used by other modules */ + struct mca_coll_base_comm_t* base_data; }; typedef struct mca_coll_base_module_2_1_0_t mca_coll_base_module_2_1_0_t;
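Finally, the util helper renamed above exists to pair a send and a receive without deadlocking while ignoring zero-byte messages entirely. A minimal MPI-level sketch of the same pattern (hypothetical standalone helper; the real code also recovers per-request statuses on error):

    #include <mpi.h>

    /* Pairwise exchange that skips zero-count sides, in the spirit of
     * ompi_coll_base_sendrecv_nonzero_actual: post both nonblocking
     * operations, then wait on whatever was actually posted. */
    static int sendrecv_nonzero(void *sbuf, int scount, MPI_Datatype sdt,
                                int dest, int stag,
                                void *rbuf, int rcount, MPI_Datatype rdt,
                                int source, int rtag, MPI_Comm comm)
    {
        MPI_Request reqs[2];
        int nreqs = 0;

        if (rcount > 0)   /* post the receive first */
            MPI_Irecv(rbuf, rcount, rdt, source, rtag, comm, &reqs[nreqs++]);
        if (scount > 0)
            MPI_Isend(sbuf, scount, sdt, dest, stag, comm, &reqs[nreqs++]);

        return MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);
    }

The inline ompi_coll_base_sendrecv wrapper in the header above additionally short-circuits the self-exchange case through ompi_datatype_sndrcv before falling back to this nonzero-actual path.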