Rework the selection logic for the tuned collectives. All supported collectives
are now able to use the dynamic rules. Moreover, these rules are loaded only once and stored at the component level. All communicators are able to use these rules (not only MPI_COMM_WORLD, as was the case until now). This commit also includes many minor corrections, memory management fixes, and a reduction in the amount of memory used by the tuned collectives. This commit was SVN r21825.
Этот коммит содержится в:
родитель
c3afac1d50
Коммит
23e8ce91ba
@ -2,7 +2,7 @@
|
|||||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
# University Research and Technology
|
# University Research and Technology
|
||||||
# Corporation. All rights reserved.
|
# Corporation. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
# Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
# of Tennessee Research Foundation. All rights
|
# of Tennessee Research Foundation. All rights
|
||||||
# reserved.
|
# reserved.
|
||||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -22,14 +22,12 @@ sources = \
|
|||||||
coll_tuned_util.h \
|
coll_tuned_util.h \
|
||||||
coll_tuned_dynamic_file.h \
|
coll_tuned_dynamic_file.h \
|
||||||
coll_tuned_dynamic_rules.h \
|
coll_tuned_dynamic_rules.h \
|
||||||
coll_tuned_forced.h \
|
|
||||||
coll_tuned_topo.c \
|
coll_tuned_topo.c \
|
||||||
coll_tuned_util.c \
|
coll_tuned_util.c \
|
||||||
coll_tuned_decision_fixed.c \
|
coll_tuned_decision_fixed.c \
|
||||||
coll_tuned_decision_dynamic.c \
|
coll_tuned_decision_dynamic.c \
|
||||||
coll_tuned_dynamic_file.c \
|
coll_tuned_dynamic_file.c \
|
||||||
coll_tuned_dynamic_rules.c \
|
coll_tuned_dynamic_rules.c \
|
||||||
coll_tuned_forced.c \
|
|
||||||
coll_tuned_allreduce.c \
|
coll_tuned_allreduce.c \
|
||||||
coll_tuned_alltoall.c \
|
coll_tuned_alltoall.c \
|
||||||
coll_tuned_alltoallv.c \
|
coll_tuned_alltoallv.c \
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -34,12 +34,26 @@
|
|||||||
/* also need the dynamic rule structures */
|
/* also need the dynamic rule structures */
|
||||||
#include "coll_tuned_dynamic_rules.h"
|
#include "coll_tuned_dynamic_rules.h"
|
||||||
|
|
||||||
/* need the forced user choice structures */
|
|
||||||
#include "coll_tuned_forced.h"
|
|
||||||
|
|
||||||
/* some fixed value index vars to simplify certain operations */
|
/* some fixed value index vars to simplify certain operations */
|
||||||
typedef enum COLLTYPE {ALLGATHER, ALLGATHERV, ALLREDUCE, ALLTOALL, ALLTOALLV, ALLTOALLW, BARRIER, BCAST,
|
typedef enum COLLTYPE {
|
||||||
EXSCAN, GATHER, GATHERV, REDUCE, REDUCESCATTER, SCAN, SCATTER, SCATTERV, COLLCOUNT} COLLTYPE_T;
|
ALLGATHER = 0, /* 0 */
|
||||||
|
ALLGATHERV, /* 1 */
|
||||||
|
ALLREDUCE, /* 2 */
|
||||||
|
ALLTOALL, /* 3 */
|
||||||
|
ALLTOALLV, /* 4 */
|
||||||
|
ALLTOALLW, /* 5 */
|
||||||
|
BARRIER, /* 6 */
|
||||||
|
BCAST, /* 7 */
|
||||||
|
EXSCAN, /* 8 */
|
||||||
|
GATHER, /* 9 */
|
||||||
|
GATHERV, /* 10 */
|
||||||
|
REDUCE, /* 11 */
|
||||||
|
REDUCESCATTER, /* 12 */
|
||||||
|
SCAN, /* 13 */
|
||||||
|
SCATTER, /* 14 */
|
||||||
|
SCATTERV, /* 15 */
|
||||||
|
COLLCOUNT /* 16 end counter keep it as last element */
|
||||||
|
} COLLTYPE_T;
|
||||||
|
|
||||||
/* defined arg lists to simplify auto inclusion of user overriding decision functions */
|
/* defined arg lists to simplify auto inclusion of user overriding decision functions */
|
||||||
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||||
@ -60,343 +74,357 @@ EXSCAN, GATHER, GATHERV, REDUCE, REDUCESCATTER, SCAN, SCATTER, SCATTERV, COLLCOU
|
|||||||
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||||
/* end defined arg lists to simplify auto inclusion of user overriding decision functions */
|
/* end defined arg lists to simplify auto inclusion of user overriding decision functions */
|
||||||
|
|
||||||
#if defined(c_plusplus) || defined(__cplusplus)
|
BEGIN_C_DECLS
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* these are the same across all modules and are loaded at component query time */
|
/* these are the same across all modules and are loaded at component query time */
|
||||||
extern int ompi_coll_tuned_stream;
|
extern int ompi_coll_tuned_stream;
|
||||||
extern int ompi_coll_tuned_priority;
|
extern int ompi_coll_tuned_priority;
|
||||||
extern int ompi_coll_tuned_preallocate_memory_comm_size_limit;
|
extern int ompi_coll_tuned_preallocate_memory_comm_size_limit;
|
||||||
extern int ompi_coll_tuned_use_dynamic_rules;
|
extern int ompi_coll_tuned_use_dynamic_rules;
|
||||||
extern char* ompi_coll_tuned_dynamic_rules_filename;
|
extern char* ompi_coll_tuned_dynamic_rules_filename;
|
||||||
extern int ompi_coll_tuned_init_tree_fanout;
|
extern int ompi_coll_tuned_init_tree_fanout;
|
||||||
extern int ompi_coll_tuned_init_chain_fanout;
|
extern int ompi_coll_tuned_init_chain_fanout;
|
||||||
extern int ompi_coll_tuned_init_max_requests;
|
extern int ompi_coll_tuned_init_max_requests;
|
||||||
|
|
||||||
/* forced algorithm choices */
|
/* forced algorithm choices */
|
||||||
/* the indices to the MCA params so that modules can look them up at open / comm create time */
|
/* this structure is for storing the indexes to the forced algorithm mca params... */
|
||||||
extern coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
|
/* we get these at component query (so that registered values appear in ompi_info) */
|
||||||
/* the actual max algorithm values (readonly), loaded at component open */
|
struct coll_tuned_force_algorithm_mca_param_indices_t {
|
||||||
extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
|
int algorithm_param_index; /* which algorithm you want to force */
|
||||||
|
int segsize_param_index; /* segsize to use (if supported), 0 = no segmentation */
|
||||||
/*
|
int tree_fanout_param_index; /* tree fanout/in to use */
|
||||||
* coll API functions
|
int chain_fanout_param_index; /* K-chain fanout/in to use */
|
||||||
*/
|
int max_requests_param_index; /* Maximum number of outstanding send or recv requests */
|
||||||
|
};
|
||||||
/* API functions */
|
typedef struct coll_tuned_force_algorithm_mca_param_indices_t coll_tuned_force_algorithm_mca_param_indices_t;
|
||||||
|
|
||||||
int ompi_coll_tuned_init_query(bool enable_progress_threads,
|
|
||||||
bool enable_mpi_threads);
|
|
||||||
|
|
||||||
mca_coll_base_module_t *
|
|
||||||
ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority);
|
|
||||||
|
|
||||||
/* API functions of decision functions and any implementations */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Note this gets long as we have to have a prototype for each
|
|
||||||
* MPI collective 4 times.. 2 for the comm type and 2 for each decision
|
|
||||||
* type.
|
|
||||||
* we might cut down the decision prototypes by conditional compiling
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* All Gather */
|
|
||||||
int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_intra_do_forced(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
|
|
||||||
int ompi_coll_tuned_allgather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_intra_ring(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_intra_neighborexchange(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_intra_basic_linear(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_intra_two_procs(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_inter_dec_fixed(ALLGATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_allgather_inter_dec_dynamic(ALLGATHER_ARGS);
|
|
||||||
|
|
||||||
/* All GatherV */
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_dec_fixed(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_do_forced(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_do_this(ALLGATHERV_ARGS, int algorithm, int faninout, int segsize);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_bruck(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_ring(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_inter_dec_fixed(ALLGATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_allgatherv_inter_dec_dynamic(ALLGATHERV_ARGS);
|
|
||||||
|
|
||||||
/* All Reduce */
|
|
||||||
int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_ring(ALLREDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
|
|
||||||
int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_allreduce_inter_dec_dynamic(ALLREDUCE_ARGS);
|
|
||||||
|
|
||||||
/* AlltoAll */
|
|
||||||
int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize, int max_requests);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
|
|
||||||
int ompi_coll_tuned_alltoall_intra_two_procs(ALLTOALL_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoall_inter_dec_fixed(ALLTOALL_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoall_inter_dec_dynamic(ALLTOALL_ARGS);
|
|
||||||
|
|
||||||
/* AlltoAllV */
|
|
||||||
int ompi_coll_tuned_alltoallv_intra_dec_fixed(ALLTOALLV_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallv_intra_dec_dynamic(ALLTOALLV_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallv_intra_do_forced(ALLTOALLV_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallv_intra_do_this(ALLTOALLV_ARGS, int algorithm);
|
|
||||||
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallv_inter_dec_fixed(ALLTOALLV_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallv_inter_dec_dynamic(ALLTOALLV_ARGS);
|
|
||||||
|
|
||||||
/* AlltoAllW */
|
|
||||||
int ompi_coll_tuned_alltoallw_intra_dec_fixed(ALLTOALLW_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallw_intra_dec_dynamic(ALLTOALLW_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallw_inter_dec_fixed(ALLTOALLW_ARGS);
|
|
||||||
int ompi_coll_tuned_alltoallw_inter_dec_dynamic(ALLTOALLW_ARGS);
|
|
||||||
|
|
||||||
/* Barrier */
|
|
||||||
int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int faninout, int segsize);
|
|
||||||
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_two_procs(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_linear(BARRIER_ARGS);
|
|
||||||
int ompi_coll_tuned_barrier_intra_tree(BARRIER_ARGS);
|
|
||||||
|
|
||||||
/* Bcast */
|
|
||||||
int ompi_coll_tuned_bcast_intra_generic( BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree );
|
|
||||||
int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS);
|
|
||||||
int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
|
|
||||||
int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
|
|
||||||
int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
|
|
||||||
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
|
|
||||||
int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
|
|
||||||
int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
|
|
||||||
int ompi_coll_tuned_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
|
|
||||||
int ompi_coll_tuned_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
|
|
||||||
int ompi_coll_tuned_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
|
|
||||||
int ompi_coll_tuned_bcast_inter_dec_fixed(BCAST_ARGS);
|
|
||||||
int ompi_coll_tuned_bcast_inter_dec_dynamic(BCAST_ARGS);
|
|
||||||
|
|
||||||
/* Exscan */
|
|
||||||
int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS);
|
|
||||||
int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS);
|
|
||||||
int ompi_coll_tuned_exscan_inter_dec_fixed(EXSCAN_ARGS);
|
|
||||||
int ompi_coll_tuned_exscan_inter_dec_dynamic(EXSCAN_ARGS);
|
|
||||||
|
|
||||||
/* Gather */
|
|
||||||
int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
|
|
||||||
int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
|
|
||||||
int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS);
|
|
||||||
|
|
||||||
/* GatherV */
|
|
||||||
int ompi_coll_tuned_gatherv_intra_dec_fixed(GATHERV_ARGS);
|
|
||||||
int ompi_coll_tuned_gatherv_intra_dec_dynamic(GATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_gatherv_inter_dec_fixed(GATHER_ARGS);
|
|
||||||
int ompi_coll_tuned_gatherv_inter_dec_dynamic(GATHER_ARGS);
|
|
||||||
|
|
||||||
/* Reduce */
|
|
||||||
int ompi_coll_tuned_reduce_generic( REDUCE_ARGS, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs );
|
|
||||||
int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs);
|
|
||||||
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
|
|
||||||
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
|
||||||
int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
|
||||||
int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
|
||||||
int ompi_coll_tuned_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
|
||||||
int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);
|
|
||||||
|
|
||||||
/* Reduce_scatter */
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_do_forced(REDUCESCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
|
|
||||||
|
|
||||||
int ompi_coll_tuned_reduce_scatter_inter_dec_fixed(REDUCESCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_reduce_scatter_inter_dec_dynamic(REDUCESCATTER_ARGS);
|
|
||||||
|
|
||||||
/* Scan */
|
|
||||||
int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS);
|
|
||||||
int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS);
|
|
||||||
int ompi_coll_tuned_scan_inter_dec_fixed(SCAN_ARGS);
|
|
||||||
int ompi_coll_tuned_scan_inter_dec_dynamic(SCAN_ARGS);
|
|
||||||
|
|
||||||
/* Scatter */
|
|
||||||
int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_scatter_intra_do_forced(SCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
|
|
||||||
int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
|
||||||
int ompi_coll_tuned_scatter_intra_basic_linear(SCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_scatter_intra_binomial(SCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_scatter_inter_dec_fixed(SCATTER_ARGS);
|
|
||||||
int ompi_coll_tuned_scatter_inter_dec_dynamic(SCATTER_ARGS);
|
|
||||||
|
|
||||||
/* ScatterV */
|
|
||||||
int ompi_coll_tuned_scatterv_intra_dec_fixed(SCATTERV_ARGS);
|
|
||||||
int ompi_coll_tuned_scatterv_intra_dec_dynamic(SCATTERV_ARGS);
|
|
||||||
int ompi_coll_tuned_scatterv_inter_dec_fixed(SCATTERV_ARGS);
|
|
||||||
int ompi_coll_tuned_scatterv_inter_dec_dynamic(SCATTERV_ARGS);
|
|
||||||
|
|
||||||
int mca_coll_tuned_ft_event(int state);
|
|
||||||
|
|
||||||
|
|
||||||
/* Utility functions */
|
/* the following type is for storing actual value obtained from the MCA on each tuned module */
|
||||||
|
/* via their mca param indices lookup in the component */
|
||||||
|
/* this structure is stored once per collective type per communicator... */
|
||||||
|
struct coll_tuned_force_algorithm_params_t {
|
||||||
|
int algorithm; /* which algorithm you want to force */
|
||||||
|
int segsize; /* segsize to use (if supported), 0 = no segmentation */
|
||||||
|
int tree_fanout; /* tree fanout/in to use */
|
||||||
|
int chain_fanout; /* K-chain fanout/in to use */
|
||||||
|
int max_requests; /* Maximum number of outstanding send or recv requests */
|
||||||
|
};
|
||||||
|
typedef struct coll_tuned_force_algorithm_params_t coll_tuned_force_algorithm_params_t;
|
||||||
|
|
||||||
static inline void ompi_coll_tuned_free_reqs(ompi_request_t **reqs, int count)
|
/* the indices to the MCA params so that modules can look them up at open / comm create time */
|
||||||
{
|
extern coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
|
||||||
|
/* the actual max algorithm values (readonly), loaded at component open */
|
||||||
|
extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
|
||||||
|
|
||||||
|
/*
|
||||||
|
* coll API functions
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* API functions */
|
||||||
|
|
||||||
|
int ompi_coll_tuned_init_query(bool enable_progress_threads,
|
||||||
|
bool enable_mpi_threads);
|
||||||
|
|
||||||
|
mca_coll_base_module_t *
|
||||||
|
ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority);
|
||||||
|
|
||||||
|
/* API functions of decision functions and any implementations */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note this gets long as we have to have a prototype for each
|
||||||
|
* MPI collective 4 times.. 2 for the comm type and 2 for each decision
|
||||||
|
* type.
|
||||||
|
* we might cut down the decision prototypes by conditional compiling
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* All Gather */
|
||||||
|
int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_intra_do_forced(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
|
||||||
|
int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_intra_ring(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_intra_neighborexchange(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_intra_basic_linear(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_intra_two_procs(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_inter_dec_fixed(ALLGATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_allgather_inter_dec_dynamic(ALLGATHER_ARGS);
|
||||||
|
|
||||||
|
/* All GatherV */
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_dec_fixed(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_do_forced(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_do_this(ALLGATHERV_ARGS, int algorithm, int faninout, int segsize);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_bruck(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_ring(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_inter_dec_fixed(ALLGATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_allgatherv_inter_dec_dynamic(ALLGATHERV_ARGS);
|
||||||
|
|
||||||
|
/* All Reduce */
|
||||||
|
int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_ring(ALLREDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
|
||||||
|
int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_allreduce_inter_dec_dynamic(ALLREDUCE_ARGS);
|
||||||
|
|
||||||
|
/* AlltoAll */
|
||||||
|
int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize, int max_requests);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
|
||||||
|
int ompi_coll_tuned_alltoall_intra_two_procs(ALLTOALL_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoall_inter_dec_fixed(ALLTOALL_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoall_inter_dec_dynamic(ALLTOALL_ARGS);
|
||||||
|
|
||||||
|
/* AlltoAllV */
|
||||||
|
int ompi_coll_tuned_alltoallv_intra_dec_fixed(ALLTOALLV_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallv_intra_dec_dynamic(ALLTOALLV_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallv_intra_do_forced(ALLTOALLV_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallv_intra_do_this(ALLTOALLV_ARGS, int algorithm);
|
||||||
|
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallv_inter_dec_fixed(ALLTOALLV_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallv_inter_dec_dynamic(ALLTOALLV_ARGS);
|
||||||
|
|
||||||
|
/* AlltoAllW */
|
||||||
|
int ompi_coll_tuned_alltoallw_intra_dec_fixed(ALLTOALLW_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallw_intra_dec_dynamic(ALLTOALLW_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallw_inter_dec_fixed(ALLTOALLW_ARGS);
|
||||||
|
int ompi_coll_tuned_alltoallw_inter_dec_dynamic(ALLTOALLW_ARGS);
|
||||||
|
|
||||||
|
/* Barrier */
|
||||||
|
int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int faninout, int segsize);
|
||||||
|
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_two_procs(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_linear(BARRIER_ARGS);
|
||||||
|
int ompi_coll_tuned_barrier_intra_tree(BARRIER_ARGS);
|
||||||
|
|
||||||
|
/* Bcast */
|
||||||
|
int ompi_coll_tuned_bcast_intra_generic( BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree );
|
||||||
|
int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS);
|
||||||
|
int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
|
||||||
|
int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
|
||||||
|
int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
|
||||||
|
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
|
||||||
|
int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
|
||||||
|
int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
|
||||||
|
int ompi_coll_tuned_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
|
||||||
|
int ompi_coll_tuned_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
|
||||||
|
int ompi_coll_tuned_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
|
||||||
|
int ompi_coll_tuned_bcast_inter_dec_fixed(BCAST_ARGS);
|
||||||
|
int ompi_coll_tuned_bcast_inter_dec_dynamic(BCAST_ARGS);
|
||||||
|
|
||||||
|
/* Exscan */
|
||||||
|
int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS);
|
||||||
|
int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS);
|
||||||
|
int ompi_coll_tuned_exscan_inter_dec_fixed(EXSCAN_ARGS);
|
||||||
|
int ompi_coll_tuned_exscan_inter_dec_dynamic(EXSCAN_ARGS);
|
||||||
|
|
||||||
|
/* Gather */
|
||||||
|
int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
|
||||||
|
int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
|
||||||
|
int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS);
|
||||||
|
|
||||||
|
/* GatherV */
/* NOTE(review): only the intra_dec_fixed prototype below takes GATHERV_ARGS;
 * the three remaining gatherv declarations are written with GATHER_ARGS.
 * Presumably they were meant to use GATHERV_ARGS as well -- confirm against
 * the definitions (if any exist) before changing the signatures. */
|
||||||
|
int ompi_coll_tuned_gatherv_intra_dec_fixed(GATHERV_ARGS);
|
||||||
|
int ompi_coll_tuned_gatherv_intra_dec_dynamic(GATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_gatherv_inter_dec_fixed(GATHER_ARGS);
|
||||||
|
int ompi_coll_tuned_gatherv_inter_dec_dynamic(GATHER_ARGS);
|
||||||
|
|
||||||
|
/* Reduce */
|
||||||
|
int ompi_coll_tuned_reduce_generic( REDUCE_ARGS, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs );
|
||||||
|
int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
|
||||||
|
/* Run a specific reduce algorithm. The dynamic decision function calls this
 * with the algorithm id, fan in/out, segment size and maximum number of
 * outstanding requests obtained from the file-based rules.
 * (Parameter name spelling fixed: max_outstanding_reqs, matching the other
 * reduce prototypes; prototype parameter names do not affect callers.) */
int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_outstanding_reqs);
|
||||||
|
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
|
||||||
|
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||||
|
int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||||
|
int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||||
|
int ompi_coll_tuned_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||||
|
int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);
|
||||||
|
|
||||||
|
/* Reduce_scatter */
|
||||||
|
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_scatter_intra_do_forced(REDUCESCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
|
||||||
|
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
|
||||||
|
|
||||||
|
int ompi_coll_tuned_reduce_scatter_inter_dec_fixed(REDUCESCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_reduce_scatter_inter_dec_dynamic(REDUCESCATTER_ARGS);
|
||||||
|
|
||||||
|
/* Scan */
|
||||||
|
int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS);
|
||||||
|
int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS);
|
||||||
|
int ompi_coll_tuned_scan_inter_dec_fixed(SCAN_ARGS);
|
||||||
|
int ompi_coll_tuned_scan_inter_dec_dynamic(SCAN_ARGS);
|
||||||
|
|
||||||
|
/* Scatter */
|
||||||
|
int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_scatter_intra_do_forced(SCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
|
||||||
|
int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||||
|
int ompi_coll_tuned_scatter_intra_basic_linear(SCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_scatter_intra_binomial(SCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_scatter_inter_dec_fixed(SCATTER_ARGS);
|
||||||
|
int ompi_coll_tuned_scatter_inter_dec_dynamic(SCATTER_ARGS);
|
||||||
|
|
||||||
|
/* ScatterV */
|
||||||
|
int ompi_coll_tuned_scatterv_intra_dec_fixed(SCATTERV_ARGS);
|
||||||
|
int ompi_coll_tuned_scatterv_intra_dec_dynamic(SCATTERV_ARGS);
|
||||||
|
int ompi_coll_tuned_scatterv_inter_dec_fixed(SCATTERV_ARGS);
|
||||||
|
int ompi_coll_tuned_scatterv_inter_dec_dynamic(SCATTERV_ARGS);
|
||||||
|
|
||||||
|
int mca_coll_tuned_ft_event(int state);
|
||||||
|
|
||||||
|
|
||||||
|
/* Utility functions */
|
||||||
|
|
||||||
|
/*
 * Release an array of requests.
 *
 * Calls ompi_request_free() on each of the first `count` entries of `reqs`,
 * in index order. The value returned by ompi_request_free() is ignored.
 * A `count` of zero (or less) results in no calls.
 */
static inline void ompi_coll_tuned_free_reqs(ompi_request_t **reqs, int count)
|
||||||
|
{
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < count; ++i)
|
for (i = 0; i < count; ++i)
|
||||||
ompi_request_free(&reqs[i]);
|
ompi_request_free(&reqs[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Component-level state of the tuned collective component.
 *
 *  - super:          the base coll component descriptor.
 *  - tuned_priority: MCA parameter holding the priority of this component.
 *  - all_base_rules: table of algorithm rules. It is statically initialised
 *                    to NULL, filled by tuned_open() from the dynamic rules
 *                    file when a filename is configured (and reset to NULL
 *                    if reading fails), and released with
 *                    ompi_coll_tuned_free_all_rules() in tuned_close().
 */
struct mca_coll_tuned_component_t {
|
struct mca_coll_tuned_component_t {
|
||||||
/** Base coll component */
|
/** Base coll component */
|
||||||
mca_coll_base_component_2_0_0_t super;
|
mca_coll_base_component_2_0_0_t super;
|
||||||
|
|
||||||
/** MCA parameter: Priority of this component */
|
/** MCA parameter: Priority of this component */
|
||||||
int tuned_priority;
|
int tuned_priority;
|
||||||
|
|
||||||
/** global stuff that I need the component to store */
|
/** global stuff that I need the component to store */
|
||||||
|
|
||||||
/* MCA parameters first */
|
/* MCA parameters first */
|
||||||
|
|
||||||
/* cached decision table stuff (moved from MCW module) */
|
/* cached decision table stuff (moved from MCW module) */
|
||||||
ompi_coll_alg_rule_t *all_base_rules;
|
ompi_coll_alg_rule_t *all_base_rules;
|
||||||
};
|
};
|
||||||
/**
|
/**
|
||||||
* Convenience typedef
|
* Convenience typedef
|
||||||
*/
|
*/
|
||||||
typedef struct mca_coll_tuned_component_t mca_coll_tuned_component_t;
|
typedef struct mca_coll_tuned_component_t mca_coll_tuned_component_t;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Global component instance
|
* Global component instance
|
||||||
*/
|
*/
|
||||||
OMPI_MODULE_DECLSPEC extern mca_coll_tuned_component_t mca_coll_tuned_component;
|
OMPI_MODULE_DECLSPEC extern mca_coll_tuned_component_t mca_coll_tuned_component;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Data structure for hanging data off the communicator
|
* Data structure for hanging data off the communicator
|
||||||
* i.e. per module instance
|
* i.e. per module instance
|
||||||
*/
|
*/
|
||||||
struct mca_coll_tuned_comm_t {
|
struct mca_coll_tuned_comm_t {
|
||||||
/* standard data for requests and PML usage */
|
/* standard data for requests and PML usage */
|
||||||
|
|
||||||
/* Precreate space for requests
|
/* Precreate space for requests
|
||||||
* Note this does not affect basic,
|
* Note this does not affect basic,
|
||||||
* but if in wrong context can confuse a debugger
|
* but if in wrong context can confuse a debugger
|
||||||
* this is controlled by an MCA param
|
* this is controlled by an MCA param
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ompi_request_t **mcct_reqs;
|
ompi_request_t **mcct_reqs;
|
||||||
int mcct_num_reqs;
|
int mcct_num_reqs;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* tuned topo information caching per communicator
|
* tuned topo information caching per communicator
|
||||||
*
|
*
|
||||||
* for each communicator we cache the topo information so we can
|
* for each communicator we cache the topo information so we can
|
||||||
* reuse without regenerating if we change the root, [or fanout]
|
* reuse without regenerating if we change the root, [or fanout]
|
||||||
* then regenerate and recache this information
|
* then regenerate and recache this information
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* general tree with n fan out */
|
/* general tree with n fan out */
|
||||||
ompi_coll_tree_t *cached_ntree;
|
ompi_coll_tree_t *cached_ntree;
|
||||||
int cached_ntree_root;
|
int cached_ntree_root;
|
||||||
int cached_ntree_fanout;
|
int cached_ntree_fanout;
|
||||||
|
|
||||||
/* binary tree */
|
/* binary tree */
|
||||||
ompi_coll_tree_t *cached_bintree;
|
ompi_coll_tree_t *cached_bintree;
|
||||||
int cached_bintree_root;
|
int cached_bintree_root;
|
||||||
|
|
||||||
/* binomial tree */
|
/* binomial tree */
|
||||||
ompi_coll_tree_t *cached_bmtree;
|
ompi_coll_tree_t *cached_bmtree;
|
||||||
int cached_bmtree_root;
|
int cached_bmtree_root;
|
||||||
|
|
||||||
/* in-order binomial tree */
|
/* in-order binomial tree */
|
||||||
ompi_coll_tree_t *cached_in_order_bmtree;
|
ompi_coll_tree_t *cached_in_order_bmtree;
|
||||||
int cached_in_order_bmtree_root;
|
int cached_in_order_bmtree_root;
|
||||||
|
|
||||||
/* chained tree (fanout followed by pipelines) */
|
/* chained tree (fanout followed by pipelines) */
|
||||||
ompi_coll_tree_t *cached_chain;
|
ompi_coll_tree_t *cached_chain;
|
||||||
int cached_chain_root;
|
int cached_chain_root;
|
||||||
int cached_chain_fanout;
|
int cached_chain_fanout;
|
||||||
|
|
||||||
/* pipeline */
|
/* pipeline */
|
||||||
ompi_coll_tree_t *cached_pipeline;
|
ompi_coll_tree_t *cached_pipeline;
|
||||||
int cached_pipeline_root;
|
int cached_pipeline_root;
|
||||||
|
|
||||||
/* in-order binary tree (root of the in-order binary tree is rank 0) */
|
/* in-order binary tree (root of the in-order binary tree is rank 0) */
|
||||||
ompi_coll_tree_t *cached_in_order_bintree;
|
ompi_coll_tree_t *cached_in_order_bintree;
|
||||||
|
|
||||||
/* extra data required by the decision functions */
|
|
||||||
ompi_coll_alg_rule_t *all_base_rules; /* stored only on MCW, all other coms ref it */
|
|
||||||
/* moving to the component */
|
/* moving to the component */
|
||||||
ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */
|
ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */
|
||||||
|
|
||||||
/* for forced algorithms we store the information on the module */
|
/* for forced algorithms we store the information on the module */
|
||||||
/* previously we only had one shared copy, oops, it really is per comm/module */
|
/* previously we only had one shared copy, oops, it really is per comm/module */
|
||||||
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
|
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
|
||||||
};
|
};
|
||||||
/**
|
typedef struct mca_coll_tuned_comm_t mca_coll_tuned_comm_t;
|
||||||
* Convenience typedef
|
|
||||||
*/
|
|
||||||
typedef struct mca_coll_tuned_comm_t mca_coll_tuned_comm_t;
|
|
||||||
|
|
||||||
/*
 * Tuned collective module.
 *
 * `super` is the base coll module and is the first member; the decision
 * functions cast the generic mca_coll_base_module_t pointer they receive to
 * this type in order to reach `tuned_data`, the tuned data hung off the
 * communicator (see struct mca_coll_tuned_comm_t).
 */
struct mca_coll_tuned_module_t {
|
struct mca_coll_tuned_module_t {
|
||||||
mca_coll_base_module_t super;
|
mca_coll_base_module_t super;
|
||||||
|
|
||||||
mca_coll_tuned_comm_t *tuned_data;
|
mca_coll_tuned_comm_t *tuned_data;
|
||||||
};
|
};
|
||||||
typedef struct mca_coll_tuned_module_t mca_coll_tuned_module_t;
|
typedef struct mca_coll_tuned_module_t mca_coll_tuned_module_t;
|
||||||
OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t);
|
OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t);
|
||||||
|
|
||||||
#if defined(c_plusplus) || defined(__cplusplus)
|
END_C_DECLS
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
|
#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
|
||||||
do { \
|
do { \
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -31,8 +31,7 @@
|
|||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
#include "ompi/mca/coll/coll.h"
|
#include "ompi/mca/coll/coll.h"
|
||||||
#include "coll_tuned.h"
|
#include "coll_tuned.h"
|
||||||
|
#include "coll_tuned_dynamic_file.h"
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Public string showing the coll ompi_tuned component version number
|
* Public string showing the coll ompi_tuned component version number
|
||||||
@ -58,7 +57,6 @@ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COL
|
|||||||
/* max algorithm values */
|
/* max algorithm values */
|
||||||
int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
|
int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local function
|
* Local function
|
||||||
*/
|
*/
|
||||||
@ -71,14 +69,10 @@ static int tuned_close(void);
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
mca_coll_tuned_component_t mca_coll_tuned_component = {
|
mca_coll_tuned_component_t mca_coll_tuned_component = {
|
||||||
|
|
||||||
/* First, fill in the super */
|
/* First, fill in the super */
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|
||||||
/* First, the mca_component_t struct containing meta information
|
/* First, the mca_component_t struct containing meta information
|
||||||
about the component itself */
|
about the component itself */
|
||||||
|
|
||||||
{
|
{
|
||||||
MCA_COLL_BASE_VERSION_2_0_0,
|
MCA_COLL_BASE_VERSION_2_0_0,
|
||||||
|
|
||||||
@ -107,13 +101,14 @@ mca_coll_tuned_component_t mca_coll_tuned_component = {
|
|||||||
0,
|
0,
|
||||||
|
|
||||||
/* Tuned component specific information */
|
/* Tuned component specific information */
|
||||||
/* Note some of this WAS in the module */
|
|
||||||
NULL /* ompi_coll_alg_rule_t ptr */
|
NULL /* ompi_coll_alg_rule_t ptr */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static int tuned_open(void)
|
static int tuned_open(void)
|
||||||
{
|
{
|
||||||
|
int rc;
|
||||||
|
|
||||||
#if OPAL_ENABLE_DEBUG
|
#if OPAL_ENABLE_DEBUG
|
||||||
{
|
{
|
||||||
int param;
|
int param;
|
||||||
@ -177,6 +172,18 @@ static int tuned_open(void)
|
|||||||
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
|
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
|
||||||
false, false, ompi_coll_tuned_dynamic_rules_filename,
|
false, false, ompi_coll_tuned_dynamic_rules_filename,
|
||||||
&ompi_coll_tuned_dynamic_rules_filename);
|
&ompi_coll_tuned_dynamic_rules_filename);
|
||||||
|
if( ompi_coll_tuned_dynamic_rules_filename ) {
|
||||||
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:component_open Reading collective rules file [%s]",
|
||||||
|
ompi_coll_tuned_dynamic_rules_filename));
|
||||||
|
rc = ompi_coll_tuned_read_rules_config_file( ompi_coll_tuned_dynamic_rules_filename,
|
||||||
|
&(mca_coll_tuned_component.all_base_rules), COLLCOUNT);
|
||||||
|
if( rc >= 0 ) {
|
||||||
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_open Read %d valid rules\n", rc));
|
||||||
|
} else {
|
||||||
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_open Reading collective rules file failed\n"));
|
||||||
|
mca_coll_tuned_component.all_base_rules = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
|
ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
|
||||||
ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
|
ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
|
||||||
ompi_coll_tuned_allgather_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLGATHER]);
|
ompi_coll_tuned_allgather_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLGATHER]);
|
||||||
@ -206,6 +213,11 @@ static int tuned_close(void)
|
|||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_close: done!"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_close: done!"));
|
||||||
|
|
||||||
|
if( NULL != mca_coll_tuned_component.all_base_rules ) {
|
||||||
|
ompi_coll_tuned_free_all_rules(mca_coll_tuned_component.all_base_rules, COLLCOUNT);
|
||||||
|
mca_coll_tuned_component.all_base_rules = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -227,45 +239,36 @@ mca_coll_tuned_module_destruct(mca_coll_tuned_module_t *module)
|
|||||||
data = module->tuned_data;
|
data = module->tuned_data;
|
||||||
if (NULL != data) {
|
if (NULL != data) {
|
||||||
#if OPAL_ENABLE_DEBUG
|
#if OPAL_ENABLE_DEBUG
|
||||||
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
|
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
|
||||||
the general c_coll_selected_data */
|
the general c_coll_selected_data */
|
||||||
data->mcct_reqs = NULL;
|
data->mcct_reqs = NULL;
|
||||||
data->mcct_num_reqs = 0;
|
data->mcct_num_reqs = 0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* free any cached information that has been allocated */
|
/* free any cached information that has been allocated */
|
||||||
if (data->cached_ntree) { /* destroy general tree if defined */
|
if (data->cached_ntree) { /* destroy general tree if defined */
|
||||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_ntree);
|
ompi_coll_tuned_topo_destroy_tree (&data->cached_ntree);
|
||||||
}
|
}
|
||||||
if (data->cached_bintree) { /* destroy bintree if defined */
|
if (data->cached_bintree) { /* destroy bintree if defined */
|
||||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_bintree);
|
ompi_coll_tuned_topo_destroy_tree (&data->cached_bintree);
|
||||||
}
|
}
|
||||||
if (data->cached_bmtree) { /* destroy bmtree if defined */
|
if (data->cached_bmtree) { /* destroy bmtree if defined */
|
||||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_bmtree);
|
ompi_coll_tuned_topo_destroy_tree (&data->cached_bmtree);
|
||||||
}
|
}
|
||||||
if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */
|
if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */
|
||||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bmtree);
|
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bmtree);
|
||||||
}
|
}
|
||||||
if (data->cached_chain) { /* destroy general chain if defined */
|
if (data->cached_chain) { /* destroy general chain if defined */
|
||||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_chain);
|
ompi_coll_tuned_topo_destroy_tree (&data->cached_chain);
|
||||||
}
|
}
|
||||||
if (data->cached_pipeline) { /* destroy pipeline if defined */
|
if (data->cached_pipeline) { /* destroy pipeline if defined */
|
||||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_pipeline);
|
ompi_coll_tuned_topo_destroy_tree (&data->cached_pipeline);
|
||||||
}
|
}
|
||||||
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
|
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
|
||||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bintree);
|
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bintree);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0 /* FIXME: */
|
free(data);
|
||||||
/* if any algorithm rules are cached on the communicator, only free them if its MCW */
|
|
||||||
/* as this is the only place they are allocated by reading the decision configure file */
|
|
||||||
if ((ompi_coll_tuned_use_dynamic_rules)&&(&ompi_mpi_comm_world==comm)) {
|
|
||||||
if (comm->data->all_base_rules) {
|
|
||||||
ompi_coll_tuned_free_all_rules (comm->data->all_base_rules, COLLCOUNT);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
free(data);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -56,7 +56,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -76,19 +76,19 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
|||||||
dsize, &faninout, &segsize, &ignoreme);
|
dsize, &faninout, &segsize, &ignoreme);
|
||||||
|
|
||||||
if (alg) {
|
if (alg) {
|
||||||
/* we have found a valid choice from the file based rules for this message size */
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
|
return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
|
||||||
comm, module,
|
comm, module,
|
||||||
alg, faninout, segsize);
|
alg, faninout, segsize);
|
||||||
} /* found a method */
|
} /* found a method */
|
||||||
} /*end if any com rules to check */
|
} /*end if any com rules to check */
|
||||||
|
|
||||||
if (data->user_forced[ALLREDUCE].algorithm) {
|
if (data->user_forced[ALLREDUCE].algorithm) {
|
||||||
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op,
|
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op,
|
return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -104,7 +104,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -126,23 +126,22 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
|||||||
dsize, &faninout, &segsize, &max_requests);
|
dsize, &faninout, &segsize, &max_requests);
|
||||||
|
|
||||||
if (alg) {
|
if (alg) {
|
||||||
/* we have found a valid choice from the file based rules for this message size */
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
return ompi_coll_tuned_alltoall_intra_do_this (sbuf, scount, sdtype,
|
return ompi_coll_tuned_alltoall_intra_do_this (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module,
|
comm, module,
|
||||||
alg, faninout, segsize, max_requests);
|
alg, faninout, segsize, max_requests);
|
||||||
} /* found a method */
|
} /* found a method */
|
||||||
} /*end if any com rules to check */
|
} /*end if any com rules to check */
|
||||||
|
|
||||||
|
|
||||||
if (data->user_forced[ALLTOALL].algorithm) {
|
if (data->user_forced[ALLTOALL].algorithm) {
|
||||||
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype,
|
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -163,16 +162,27 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
|
|||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoallv_intra_dec_dynamic"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoallv_intra_dec_dynamic"));
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* BEGIN - File Based Rules
|
* check to see if we have some filebased rules. As we don't have global
|
||||||
*
|
* knowledge about the total amount of data, use the first available rule.
|
||||||
* Here is where we would check to see if we have some file based
|
* This allows the users to specify the alltoallv algorithm to be used only
|
||||||
* rules. Currently, we do not, so move on to seeing if the user
|
* based on the communicator size.
|
||||||
* specified a specific algorithm. If not, then use the fixed
|
|
||||||
* decision code to decide.
|
|
||||||
*
|
|
||||||
* END - File Based Rules
|
|
||||||
*/
|
*/
|
||||||
|
if (data->com_rules[ALLTOALLV]) {
|
||||||
|
int alg, faninout, segsize, max_requests;
|
||||||
|
|
||||||
|
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLTOALLV],
|
||||||
|
0, &faninout, &segsize, &max_requests);
|
||||||
|
|
||||||
|
if (alg) {
|
||||||
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
|
return ompi_coll_tuned_alltoallv_intra_do_this (sbuf, scounts, sdisps, sdtype,
|
||||||
|
rbuf, rcounts, rdisps, rdtype,
|
||||||
|
comm, module,
|
||||||
|
alg);
|
||||||
|
} /* found a method */
|
||||||
|
} /*end if any com rules to check */
|
||||||
|
|
||||||
if (data->user_forced[ALLTOALLV].algorithm) {
|
if (data->user_forced[ALLTOALLV].algorithm) {
|
||||||
return ompi_coll_tuned_alltoallv_intra_do_forced(sbuf, scounts, sdisps, sdtype,
|
return ompi_coll_tuned_alltoallv_intra_do_forced(sbuf, scounts, sdisps, sdtype,
|
||||||
rbuf, rcounts, rdisps, rdtype,
|
rbuf, rcounts, rdisps, rdtype,
|
||||||
@ -191,7 +201,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
|
|||||||
* Returns: - MPI_SUCCESS or error code (passed from the barrier implementation)
|
* Returns: - MPI_SUCCESS or error code (passed from the barrier implementation)
|
||||||
*/
|
*/
|
||||||
int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
|
int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -207,7 +217,7 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
|
|||||||
0, &faninout, &segsize, &ignoreme);
|
0, &faninout, &segsize, &ignoreme);
|
||||||
|
|
||||||
if (alg) {
|
if (alg) {
|
||||||
/* we have found a valid choice from the file based rules for this message size */
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
return ompi_coll_tuned_barrier_intra_do_this (comm, module,
|
return ompi_coll_tuned_barrier_intra_do_this (comm, module,
|
||||||
alg, faninout, segsize);
|
alg, faninout, segsize);
|
||||||
} /* found a method */
|
} /* found a method */
|
||||||
@ -229,7 +239,7 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
|
|||||||
int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||||
struct ompi_datatype_t *datatype, int root,
|
struct ompi_datatype_t *datatype, int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -249,9 +259,9 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
|||||||
dsize, &faninout, &segsize, &ignoreme);
|
dsize, &faninout, &segsize, &ignoreme);
|
||||||
|
|
||||||
if (alg) {
|
if (alg) {
|
||||||
/* we have found a valid choice from the file based rules for this message size */
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
return ompi_coll_tuned_bcast_intra_do_this (buff, count, datatype, root,
|
return ompi_coll_tuned_bcast_intra_do_this (buff, count, datatype, root,
|
||||||
comm, module,
|
comm, module,
|
||||||
alg, faninout, segsize);
|
alg, faninout, segsize);
|
||||||
} /* found a method */
|
} /* found a method */
|
||||||
} /*end if any com rules to check */
|
} /*end if any com rules to check */
|
||||||
@ -259,10 +269,10 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
|||||||
|
|
||||||
if (data->user_forced[BCAST].algorithm) {
|
if (data->user_forced[BCAST].algorithm) {
|
||||||
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root,
|
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_bcast_intra_dec_fixed (buff, count, datatype, root,
|
return ompi_coll_tuned_bcast_intra_dec_fixed (buff, count, datatype, root,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -277,7 +287,7 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
|||||||
int count, struct ompi_datatype_t* datatype,
|
int count, struct ompi_datatype_t* datatype,
|
||||||
struct ompi_op_t* op, int root,
|
struct ompi_op_t* op, int root,
|
||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -298,10 +308,10 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
|||||||
dsize, &faninout, &segsize, &max_requests);
|
dsize, &faninout, &segsize, &max_requests);
|
||||||
|
|
||||||
if (alg) {
|
if (alg) {
|
||||||
/* we have found a valid choice from the file based rules for this message size */
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
|
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
|
||||||
op, root,
|
op, root,
|
||||||
comm, module,
|
comm, module,
|
||||||
alg, faninout,
|
alg, faninout,
|
||||||
segsize,
|
segsize,
|
||||||
max_requests);
|
max_requests);
|
||||||
@ -310,12 +320,12 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
|||||||
|
|
||||||
if (data->user_forced[REDUCE].algorithm) {
|
if (data->user_forced[REDUCE].algorithm) {
|
||||||
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype,
|
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype,
|
||||||
op, root,
|
op, root,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_reduce_intra_dec_fixed (sendbuf, recvbuf, count, datatype,
|
return ompi_coll_tuned_reduce_intra_dec_fixed (sendbuf, recvbuf, count, datatype,
|
||||||
op, root,
|
op, root,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -332,7 +342,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -354,10 +364,10 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
|||||||
dsize, &faninout,
|
dsize, &faninout,
|
||||||
&segsize, &ignoreme);
|
&segsize, &ignoreme);
|
||||||
if (alg) {
|
if (alg) {
|
||||||
/* we have found a valid choice from the file based rules for this message size */
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
return ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts,
|
return ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts,
|
||||||
dtype, op,
|
dtype, op,
|
||||||
comm, module,
|
comm, module,
|
||||||
alg, faninout,
|
alg, faninout,
|
||||||
segsize);
|
segsize);
|
||||||
} /* found a method */
|
} /* found a method */
|
||||||
@ -369,8 +379,8 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
|||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
|
return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
|
||||||
dtype, op,
|
dtype, op,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -387,49 +397,49 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_allgather_intra_dec_dynamic"));
|
"ompi_coll_tuned_allgather_intra_dec_dynamic"));
|
||||||
|
|
||||||
if (data->com_rules[ALLGATHER]) {
|
if (data->com_rules[ALLGATHER]) {
|
||||||
/* We have file based rules:
|
/* We have file based rules:
|
||||||
- calculate message size and other necessary information */
|
- calculate message size and other necessary information */
|
||||||
int comsize;
|
int comsize;
|
||||||
int alg, faninout, segsize, ignoreme;
|
int alg, faninout, segsize, ignoreme;
|
||||||
size_t dsize;
|
size_t dsize;
|
||||||
|
|
||||||
ompi_datatype_type_size (sdtype, &dsize);
|
ompi_datatype_type_size (sdtype, &dsize);
|
||||||
comsize = ompi_comm_size(comm);
|
comsize = ompi_comm_size(comm);
|
||||||
dsize *= comsize * scount;
|
dsize *= comsize * scount;
|
||||||
|
|
||||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHER],
|
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHER],
|
||||||
dsize, &faninout, &segsize, &ignoreme);
|
dsize, &faninout, &segsize, &ignoreme);
|
||||||
if (alg) {
|
if (alg) {
|
||||||
/* we have found a valid choice from the file based rules for
|
/* we have found a valid choice from the file based rules for
|
||||||
this message size */
|
this message size */
|
||||||
return ompi_coll_tuned_allgather_intra_do_this (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_do_this (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module,
|
comm, module,
|
||||||
alg, faninout, segsize);
|
alg, faninout, segsize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We do not have file based rules */
|
/* We do not have file based rules */
|
||||||
if (data->user_forced[ALLGATHER].algorithm) {
|
if (data->user_forced[ALLGATHER].algorithm) {
|
||||||
/* User-forced algorithm */
|
/* User-forced algorithm */
|
||||||
return ompi_coll_tuned_allgather_intra_do_forced (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_do_forced (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Use default decision */
|
/* Use default decision */
|
||||||
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -447,100 +457,146 @@ int ompi_coll_tuned_allgatherv_intra_dec_dynamic(void *sbuf, int scount,
|
|||||||
int *rdispls,
|
int *rdispls,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_allgatherv_intra_dec_dynamic"));
|
"ompi_coll_tuned_allgatherv_intra_dec_dynamic"));
|
||||||
|
|
||||||
if (data->com_rules[ALLGATHERV]) {
|
if (data->com_rules[ALLGATHERV]) {
|
||||||
/* We have file based rules:
|
/* We have file based rules:
|
||||||
- calculate message size and other necessary information */
|
- calculate message size and other necessary information */
|
||||||
int comsize, i;
|
int comsize, i;
|
||||||
int alg, faninout, segsize, ignoreme;
|
int alg, faninout, segsize, ignoreme;
|
||||||
size_t dsize, total_size;
|
size_t dsize, total_size;
|
||||||
|
|
||||||
comsize = ompi_comm_size(comm);
|
comsize = ompi_comm_size(comm);
|
||||||
ompi_datatype_type_size (sdtype, &dsize);
|
ompi_datatype_type_size (sdtype, &dsize);
|
||||||
total_size = 0;
|
total_size = 0;
|
||||||
for (i = 0; i < comsize; i++) { total_size += dsize * rcounts[i]; }
|
for (i = 0; i < comsize; i++) { total_size += dsize * rcounts[i]; }
|
||||||
|
|
||||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHERV],
|
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHERV],
|
||||||
total_size, &faninout, &segsize, &ignoreme);
|
total_size, &faninout, &segsize, &ignoreme);
|
||||||
if (alg) {
|
if (alg) {
|
||||||
/* we have found a valid choice from the file based rules for
|
/* we have found a valid choice from the file based rules for
|
||||||
this message size */
|
this message size */
|
||||||
return ompi_coll_tuned_allgatherv_intra_do_this (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_do_this (sbuf, scount, sdtype,
|
||||||
rbuf, rcounts,
|
rbuf, rcounts,
|
||||||
rdispls, rdtype,
|
rdispls, rdtype,
|
||||||
comm, module,
|
comm, module,
|
||||||
alg, faninout, segsize);
|
alg, faninout, segsize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We do not have file based rules */
|
/* We do not have file based rules */
|
||||||
if (data->user_forced[ALLGATHERV].algorithm) {
|
if (data->user_forced[ALLGATHERV].algorithm) {
|
||||||
/* User-forced algorithm */
|
/* User-forced algorithm */
|
||||||
return ompi_coll_tuned_allgatherv_intra_do_forced (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_do_forced (sbuf, scount, sdtype,
|
||||||
rbuf, rcounts,
|
rbuf, rcounts,
|
||||||
rdispls, rdtype,
|
rdispls, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Use default decision */
|
/* Use default decision */
|
||||||
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcounts,
|
rbuf, rcounts,
|
||||||
rdispls, rdtype,
|
rdispls, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_gather_intra_dec_dynamic"));
|
"ompi_coll_tuned_gather_intra_dec_dynamic"));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* check to see if we have some filebased rules.
|
||||||
|
*/
|
||||||
|
if (data->com_rules[GATHER]) {
|
||||||
|
int comsize, alg, faninout, segsize, max_requests;
|
||||||
|
size_t dsize;
|
||||||
|
|
||||||
|
comsize = ompi_comm_size(comm);
|
||||||
|
ompi_datatype_type_size (sdtype, &dsize);
|
||||||
|
dsize *= comsize;
|
||||||
|
|
||||||
|
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[GATHER],
|
||||||
|
dsize, &faninout, &segsize, &max_requests);
|
||||||
|
|
||||||
|
if (alg) {
|
||||||
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
|
return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype,
|
||||||
|
rbuf, rcount, rdtype,
|
||||||
|
root, comm, module,
|
||||||
|
alg, faninout, segsize);
|
||||||
|
} /* found a method */
|
||||||
|
} /*end if any com rules to check */
|
||||||
|
|
||||||
if (data->user_forced[GATHER].algorithm) {
|
if (data->user_forced[GATHER].algorithm) {
|
||||||
return ompi_coll_tuned_gather_intra_do_forced (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_do_forced (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root, struct ompi_communicator_t *comm,
|
int root, struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_scatter_intra_dec_dynamic"));
|
"ompi_coll_tuned_scatter_intra_dec_dynamic"));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* check to see if we have some filebased rules.
|
||||||
|
*/
|
||||||
|
if (data->com_rules[SCATTER]) {
|
||||||
|
int comsize, alg, faninout, segsize, max_requests;
|
||||||
|
size_t dsize;
|
||||||
|
|
||||||
|
comsize = ompi_comm_size(comm);
|
||||||
|
ompi_datatype_type_size (sdtype, &dsize);
|
||||||
|
dsize *= comsize;
|
||||||
|
|
||||||
|
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[SCATTER],
|
||||||
|
dsize, &faninout, &segsize, &max_requests);
|
||||||
|
|
||||||
|
if (alg) {
|
||||||
|
/* we have found a valid choice from the file based rules for this message size */
|
||||||
|
return ompi_coll_tuned_scatter_intra_do_this (sbuf, scount, sdtype,
|
||||||
|
rbuf, rcount, rdtype,
|
||||||
|
root, comm, module,
|
||||||
|
alg, faninout, segsize);
|
||||||
|
} /* found a method */
|
||||||
|
} /*end if any com rules to check */
|
||||||
|
|
||||||
if (data->user_forced[SCATTER].algorithm) {
|
if (data->user_forced[SCATTER].algorithm) {
|
||||||
return ompi_coll_tuned_scatter_intra_do_forced (sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_do_forced (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -40,7 +40,7 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
size_t dsize, block_dsize;
|
size_t dsize, block_dsize;
|
||||||
int comm_size = ompi_comm_size(comm);
|
int comm_size = ompi_comm_size(comm);
|
||||||
@ -58,22 +58,22 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
|||||||
block_dsize = dsize * count;
|
block_dsize = dsize * count;
|
||||||
|
|
||||||
if (block_dsize < intermediate_message) {
|
if (block_dsize < intermediate_message) {
|
||||||
return (ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf,
|
return (ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf,
|
||||||
count, dtype,
|
count, dtype,
|
||||||
op, comm, module));
|
op, comm, module));
|
||||||
}
|
}
|
||||||
|
|
||||||
if( ompi_op_is_commute(op) && (count > comm_size) ) {
|
if( ompi_op_is_commute(op) && (count > comm_size) ) {
|
||||||
const size_t segment_size = 1 << 20; /* 1 MB */
|
const size_t segment_size = 1 << 20; /* 1 MB */
|
||||||
if ((comm_size * segment_size >= block_dsize)) {
|
if ((comm_size * segment_size >= block_dsize)) {
|
||||||
return (ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype,
|
return (ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype,
|
||||||
op, comm, module));
|
op, comm, module));
|
||||||
} else {
|
} else {
|
||||||
return (ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
|
return (ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
|
||||||
count, dtype,
|
count, dtype,
|
||||||
op, comm, module,
|
op, comm, module,
|
||||||
segment_size));
|
segment_size));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return (ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count,
|
return (ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count,
|
||||||
@ -93,7 +93,7 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int communicator_size;
|
int communicator_size;
|
||||||
size_t dsize, block_dsize;
|
size_t dsize, block_dsize;
|
||||||
@ -106,8 +106,8 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
/* special case */
|
/* special case */
|
||||||
if (communicator_size==2) {
|
if (communicator_size==2) {
|
||||||
return ompi_coll_tuned_alltoall_intra_two_procs(sbuf, scount, sdtype,
|
return ompi_coll_tuned_alltoall_intra_two_procs(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Decision function based on measurement on Grig cluster at
|
/* Decision function based on measurement on Grig cluster at
|
||||||
@ -118,19 +118,19 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
block_dsize = dsize * scount;
|
block_dsize = dsize * scount;
|
||||||
|
|
||||||
if ((block_dsize < 200) && (communicator_size > 12)) {
|
if ((block_dsize < 200) && (communicator_size > 12)) {
|
||||||
return ompi_coll_tuned_alltoall_intra_bruck(sbuf, scount, sdtype,
|
return ompi_coll_tuned_alltoall_intra_bruck(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
|
|
||||||
} else if (block_dsize < 3000) {
|
} else if (block_dsize < 3000) {
|
||||||
return ompi_coll_tuned_alltoall_intra_basic_linear(sbuf, scount, sdtype,
|
return ompi_coll_tuned_alltoall_intra_basic_linear(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype,
|
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
/* previous decision */
|
/* previous decision */
|
||||||
@ -179,7 +179,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_fixed(void *sbuf, int *scounts, int *sdi
|
|||||||
* Returns: - MPI_SUCCESS or error code (passed from the barrier implementation)
|
* Returns: - MPI_SUCCESS or error code (passed from the barrier implementation)
|
||||||
*/
|
*/
|
||||||
int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
|
int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int communicator_size = ompi_comm_size(comm);
|
int communicator_size = ompi_comm_size(comm);
|
||||||
|
|
||||||
@ -219,10 +219,10 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
|
|||||||
int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||||
struct ompi_datatype_t *datatype, int root,
|
struct ompi_datatype_t *datatype, int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
/* Decision function based on MX results for
|
/* Decision function based on MX results for
|
||||||
messages up to 36MB and communicator sizes up to 64 nodes */
|
messages up to 36MB and communicator sizes up to 64 nodes */
|
||||||
const size_t small_message_size = 2048;
|
const size_t small_message_size = 2048;
|
||||||
const size_t intermediate_message_size = 370728;
|
const size_t intermediate_message_size = 370728;
|
||||||
const double a_p16 = 3.2118e-6; /* [1 / byte] */
|
const double a_p16 = 3.2118e-6; /* [1 / byte] */
|
||||||
@ -249,56 +249,56 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
|||||||
/* Handle messages of small and intermediate size, and
|
/* Handle messages of small and intermediate size, and
|
||||||
single-element broadcasts */
|
single-element broadcasts */
|
||||||
if ((message_size < small_message_size) || (count <= 1)) {
|
if ((message_size < small_message_size) || (count <= 1)) {
|
||||||
/* Binomial without segmentation */
|
/* Binomial without segmentation */
|
||||||
segsize = 0;
|
segsize = 0;
|
||||||
return ompi_coll_tuned_bcast_intra_binomial (buff, count, datatype,
|
return ompi_coll_tuned_bcast_intra_binomial (buff, count, datatype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize);
|
segsize);
|
||||||
|
|
||||||
} else if (message_size < intermediate_message_size) {
|
} else if (message_size < intermediate_message_size) {
|
||||||
/* SplittedBinary with 1KB segments */
|
/* SplittedBinary with 1KB segments */
|
||||||
segsize = 1024;
|
segsize = 1024;
|
||||||
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
|
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize);
|
segsize);
|
||||||
|
|
||||||
}
|
}
|
||||||
/* Handle large message sizes */
|
/* Handle large message sizes */
|
||||||
else if (communicator_size < (a_p128 * message_size + b_p128)) {
|
else if (communicator_size < (a_p128 * message_size + b_p128)) {
|
||||||
/* Pipeline with 128KB segments */
|
/* Pipeline with 128KB segments */
|
||||||
segsize = 1024 << 7;
|
segsize = 1024 << 7;
|
||||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize);
|
segsize);
|
||||||
|
|
||||||
} else if (communicator_size < 13) {
|
} else if (communicator_size < 13) {
|
||||||
/* Split Binary with 8KB segments */
|
/* Split Binary with 8KB segments */
|
||||||
segsize = 1024 << 3;
|
segsize = 1024 << 3;
|
||||||
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
|
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize);
|
segsize);
|
||||||
|
|
||||||
} else if (communicator_size < (a_p64 * message_size + b_p64)) {
|
} else if (communicator_size < (a_p64 * message_size + b_p64)) {
|
||||||
/* Pipeline with 64KB segments */
|
/* Pipeline with 64KB segments */
|
||||||
segsize = 1024 << 6;
|
segsize = 1024 << 6;
|
||||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize);
|
segsize);
|
||||||
|
|
||||||
} else if (communicator_size < (a_p16 * message_size + b_p16)) {
|
} else if (communicator_size < (a_p16 * message_size + b_p16)) {
|
||||||
/* Pipeline with 16KB segments */
|
/* Pipeline with 16KB segments */
|
||||||
segsize = 1024 << 4;
|
segsize = 1024 << 4;
|
||||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize);
|
segsize);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Pipeline with 8KB segments */
|
/* Pipeline with 8KB segments */
|
||||||
segsize = 1024 << 3;
|
segsize = 1024 << 3;
|
||||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize);
|
segsize);
|
||||||
#if 0
|
#if 0
|
||||||
/* this is based on gige measurements */
|
/* this is based on gige measurements */
|
||||||
|
|
||||||
@ -340,7 +340,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
int count, struct ompi_datatype_t* datatype,
|
int count, struct ompi_datatype_t* datatype,
|
||||||
struct ompi_op_t* op, int root,
|
struct ompi_op_t* op, int root,
|
||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int communicator_size, segsize = 0;
|
int communicator_size, segsize = 0;
|
||||||
size_t message_size, dsize;
|
size_t message_size, dsize;
|
||||||
@ -370,10 +370,10 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
return ompi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||||
0, max_requests);
|
0, max_requests);
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed "
|
||||||
"root %d rank %d com_size %d msg_length %lu",
|
"root %d rank %d com_size %d msg_length %lu",
|
||||||
root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
|
root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
|
||||||
|
|
||||||
@ -385,17 +385,17 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
/* Binomial_0K */
|
/* Binomial_0K */
|
||||||
segsize = 0;
|
segsize = 0;
|
||||||
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
} else if (communicator_size > (a1 * message_size + b1)) {
|
} else if (communicator_size > (a1 * message_size + b1)) {
|
||||||
/* Binomial_1K */
|
/* Binomial_1K */
|
||||||
segsize = 1024;
|
segsize = 1024;
|
||||||
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
} else if (communicator_size > (a2 * message_size + b2)) {
|
} else if (communicator_size > (a2 * message_size + b2)) {
|
||||||
/* Pipeline_1K */
|
/* Pipeline_1K */
|
||||||
segsize = 1024;
|
segsize = 1024;
|
||||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
} else if (communicator_size > (a3 * message_size + b3)) {
|
} else if (communicator_size > (a3 * message_size + b3)) {
|
||||||
/* Binary_32K */
|
/* Binary_32K */
|
||||||
segsize = 32*1024;
|
segsize = 32*1024;
|
||||||
@ -410,7 +410,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
segsize = 64*1024;
|
segsize = 64*1024;
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
/* for small messages use linear algorithm */
|
/* for small messages use linear algorithm */
|
||||||
@ -433,11 +433,11 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
/* later swap this for a binary tree */
|
/* later swap this for a binary tree */
|
||||||
/* fanout = 2; */
|
/* fanout = 2; */
|
||||||
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||||
segsize, fanout, max_requests);
|
segsize, fanout, max_requests);
|
||||||
}
|
}
|
||||||
segsize = 1024;
|
segsize = 1024;
|
||||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
#endif /* 0 */
|
#endif /* 0 */
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -457,51 +457,51 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int comm_size, i, pow2;
|
int comm_size, i, pow2;
|
||||||
size_t total_message_size, dsize;
|
size_t total_message_size, dsize;
|
||||||
const double a = 0.0012;
|
const double a = 0.0012;
|
||||||
const double b = 8.0;
|
const double b = 8.0;
|
||||||
const size_t small_message_size = 12 * 1024;
|
const size_t small_message_size = 12 * 1024;
|
||||||
const size_t large_message_size = 256 * 1024;
|
const size_t large_message_size = 256 * 1024;
|
||||||
bool zerocounts = false;
|
bool zerocounts = false;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_intra_dec_fixed"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_intra_dec_fixed"));
|
||||||
|
|
||||||
comm_size = ompi_comm_size(comm);
|
comm_size = ompi_comm_size(comm);
|
||||||
/* We need data size for decision function */
|
/* We need data size for decision function */
|
||||||
ompi_datatype_type_size(dtype, &dsize);
|
ompi_datatype_type_size(dtype, &dsize);
|
||||||
total_message_size = 0;
|
total_message_size = 0;
|
||||||
for (i = 0; i < comm_size; i++) {
|
for (i = 0; i < comm_size; i++) {
|
||||||
total_message_size += rcounts[i];
|
total_message_size += rcounts[i];
|
||||||
if (0 == rcounts[i]) {
|
if (0 == rcounts[i]) {
|
||||||
zerocounts = true;
|
zerocounts = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if( !ompi_op_is_commute(op) || (zerocounts)) {
|
if( !ompi_op_is_commute(op) || (zerocounts)) {
|
||||||
return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
|
return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
|
||||||
dtype, op,
|
dtype, op,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
total_message_size *= dsize;
|
total_message_size *= dsize;
|
||||||
|
|
||||||
/* compute the nearest power of 2 */
|
/* compute the nearest power of 2 */
|
||||||
for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
|
for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
|
||||||
|
|
||||||
if ((total_message_size <= small_message_size) ||
|
if ((total_message_size <= small_message_size) ||
|
||||||
((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
|
((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
|
||||||
(comm_size >= a * total_message_size + b)) {
|
(comm_size >= a * total_message_size + b)) {
|
||||||
return
|
return
|
||||||
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||||
dtype, op,
|
dtype, op,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
|
return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
|
||||||
dtype, op,
|
dtype, op,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -520,80 +520,80 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int communicator_size, pow2_size;
|
int communicator_size, pow2_size;
|
||||||
size_t dsize, total_dsize;
|
size_t dsize, total_dsize;
|
||||||
|
|
||||||
communicator_size = ompi_comm_size(comm);
|
communicator_size = ompi_comm_size(comm);
|
||||||
|
|
||||||
/* Special case for 2 processes */
|
/* Special case for 2 processes */
|
||||||
if (communicator_size == 2) {
|
if (communicator_size == 2) {
|
||||||
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Determine complete data size */
|
/* Determine complete data size */
|
||||||
ompi_datatype_type_size(sdtype, &dsize);
|
ompi_datatype_type_size(sdtype, &dsize);
|
||||||
total_dsize = dsize * scount * communicator_size;
|
total_dsize = dsize * scount * communicator_size;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
|
||||||
" rank %d com_size %d msg_length %lu",
|
" rank %d com_size %d msg_length %lu",
|
||||||
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
|
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
|
||||||
|
|
||||||
for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
|
for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
|
||||||
|
|
||||||
/* Decision based on MX 2Gb results from Grig cluster at
|
/* Decision based on MX 2Gb results from Grig cluster at
|
||||||
The University of Tennessee, Knoxville
|
The University of Tennessee, Knoxville
|
||||||
- if total message size is less than 50KB use either bruck or
|
- if total message size is less than 50KB use either bruck or
|
||||||
recursive doubling for non-power of two and power of two nodes,
|
recursive doubling for non-power of two and power of two nodes,
|
||||||
respectively.
|
respectively.
|
||||||
- else use ring and neighbor exchange algorithms for odd and even
|
- else use ring and neighbor exchange algorithms for odd and even
|
||||||
number of nodes, respectively.
|
number of nodes, respectively.
|
||||||
*/
|
*/
|
||||||
if (total_dsize < 50000) {
|
if (total_dsize < 50000) {
|
||||||
if (pow2_size == communicator_size) {
|
if (pow2_size == communicator_size) {
|
||||||
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
} else {
|
} else {
|
||||||
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (communicator_size % 2) {
|
if (communicator_size % 2) {
|
||||||
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
} else {
|
} else {
|
||||||
return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(USE_MPICH2_DECISION)
|
#if defined(USE_MPICH2_DECISION)
|
||||||
/* Decision as in MPICH-2
|
/* Decision as in MPICH-2
|
||||||
presented in Thakur et al. "Optimization of Collective Communication
|
presented in Thakur et al. "Optimization of Collective Communication
|
||||||
Operations in MPICH", International Journal of High Performance Computing
|
Operations in MPICH", International Journal of High Performance Computing
|
||||||
Applications, Vol. 19, No. 1, 49-66 (2005)
|
Applications, Vol. 19, No. 1, 49-66 (2005)
|
||||||
- for power-of-two processes and small and medium size messages
|
- for power-of-two processes and small and medium size messages
|
||||||
(up to 512KB) use recursive doubling
|
(up to 512KB) use recursive doubling
|
||||||
- for non-power-of-two processes and small messages (80KB) use bruck,
|
- for non-power-of-two processes and small messages (80KB) use bruck,
|
||||||
- for everything else use ring.
|
- for everything else use ring.
|
||||||
*/
|
*/
|
||||||
if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
|
if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
|
||||||
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
} else if (total_dsize <= 81920) {
|
} else if (total_dsize <= 81920) {
|
||||||
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
#endif /* defined(USE_MPICH2_DECISION) */
|
#endif /* defined(USE_MPICH2_DECISION) */
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -612,7 +612,7 @@ int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
int *rdispls,
|
int *rdispls,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
int communicator_size;
|
int communicator_size;
|
||||||
@ -639,22 +639,22 @@ int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
" rank %d com_size %d msg_length %lu",
|
" rank %d com_size %d msg_length %lu",
|
||||||
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
|
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
|
||||||
|
|
||||||
/* Decision based on allgather decision. */
|
/* Decision based on allgather decision. */
|
||||||
if (total_dsize < 50000) {
|
if (total_dsize < 50000) {
|
||||||
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
||||||
rbuf, rcounts, rdispls, rdtype,
|
rbuf, rcounts, rdispls, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
} else {
|
} else {
|
||||||
if (communicator_size % 2) {
|
if (communicator_size % 2) {
|
||||||
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||||
rbuf, rcounts, rdispls, rdtype,
|
rbuf, rcounts, rdispls, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
} else {
|
} else {
|
||||||
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
||||||
rbuf, rcounts, rdispls, rdtype,
|
rbuf, rcounts, rdispls, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -667,12 +667,12 @@ int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
const int large_segment_size = 32768;
|
const int large_segment_size = 32768;
|
||||||
const int small_segment_size = 1024;
|
const int small_segment_size = 1024;
|
||||||
@ -688,7 +688,7 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
size_t dsize, block_size;
|
size_t dsize, block_size;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_gather_intra_dec_fixed"));
|
"ompi_coll_tuned_gather_intra_dec_fixed"));
|
||||||
|
|
||||||
communicator_size = ompi_comm_size(comm);
|
communicator_size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
@ -724,8 +724,8 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
}
|
}
|
||||||
/* Otherwise, use basic linear */
|
/* Otherwise, use basic linear */
|
||||||
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -738,11 +738,11 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root, struct ompi_communicator_t *comm,
|
int root, struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
const size_t small_block_size = 300;
|
const size_t small_block_size = 300;
|
||||||
const int small_comm_size = 10;
|
const int small_comm_size = 10;
|
||||||
@ -750,7 +750,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
size_t dsize, block_size;
|
size_t dsize, block_size;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_scatter_intra_dec_fixed"));
|
"ompi_coll_tuned_scatter_intra_dec_fixed"));
|
||||||
|
|
||||||
communicator_size = ompi_comm_size(comm);
|
communicator_size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
@ -770,6 +770,6 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
root, comm, module);
|
root, comm, module);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -314,18 +314,12 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
|||||||
best_com_p = com_p = alg_p->com_rules;
|
best_com_p = com_p = alg_p->com_rules;
|
||||||
i = best = 0;
|
i = best = 0;
|
||||||
|
|
||||||
while (i<alg_p->n_com_sizes) {
|
while( i < alg_p->n_com_sizes ) {
|
||||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,"checking comsize %d against alg_id %d com_id %d index %d com_size %d", */
|
if (com_p->mpi_comsize > mpi_comsize) {
|
||||||
/* mpi_comsize, com_p->alg_rule_id, com_p->com_rule_id, i, com_p->mpi_comsize)); */
|
|
||||||
if (com_p->mpi_comsize <= mpi_comsize) {
|
|
||||||
best = i;
|
|
||||||
best_com_p = com_p;
|
|
||||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":ok\n")); */
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":nop\n")); */
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
best = i;
|
||||||
|
best_com_p = com_p;
|
||||||
/* go to the next entry */
|
/* go to the next entry */
|
||||||
com_p++;
|
com_p++;
|
||||||
i++;
|
i++;
|
||||||
@ -359,26 +353,11 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul
|
|||||||
ompi_coll_msg_rule_t* best_msg_p = (ompi_coll_msg_rule_t*) NULL;
|
ompi_coll_msg_rule_t* best_msg_p = (ompi_coll_msg_rule_t*) NULL;
|
||||||
int i, best;
|
int i, best;
|
||||||
|
|
||||||
if (!base_com_rule) {
|
/* No rule or zero rules */
|
||||||
|
if( (NULL == base_com_rule) || (0 == base_com_rule->n_msg_sizes)) {
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!result_topo_faninout) {
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!result_segsize) {
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!max_requests) {
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!base_com_rule->n_msg_sizes) { /* check for count of message sizes */
|
|
||||||
return (0); /* no msg sizes so no rule */
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ok have some msg sizes, now to find the one closest to my mpi_msgsize */
|
/* ok have some msg sizes, now to find the one closest to my mpi_msgsize */
|
||||||
|
|
||||||
/* make a copy of the first msg rule */
|
/* make a copy of the first msg rule */
|
||||||
|
@ -1,62 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
||||||
* University Research and Technology
|
|
||||||
* Corporation. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
||||||
* of Tennessee Research Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "ompi_config.h"
|
|
||||||
|
|
||||||
#include "mpi.h"
|
|
||||||
#include "opal/mca/mca.h"
|
|
||||||
#include "ompi/constants.h"
|
|
||||||
#include "coll_tuned.h"
|
|
||||||
|
|
||||||
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
|
|
||||||
#include "coll_tuned_topo.h"
|
|
||||||
|
|
||||||
/* also need the dynamic rule structures */
|
|
||||||
#include "coll_tuned_forced.h"
|
|
||||||
|
|
||||||
#include "coll_tuned_util.h"
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
/* We put all routines that handle the MCA user forced algorithm and parameter choices here */
|
|
||||||
/* recheck the setting of forced, called on module create (i.e. for each new comm) */
|
|
||||||
|
|
||||||
int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
|
|
||||||
coll_tuned_force_algorithm_params_t *forced_values)
|
|
||||||
{
|
|
||||||
mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
|
|
||||||
mca_base_param_lookup_int (mca_params.segsize_param_index, &(forced_values->segsize));
|
|
||||||
mca_base_param_lookup_int (mca_params.tree_fanout_param_index, &(forced_values->tree_fanout));
|
|
||||||
mca_base_param_lookup_int (mca_params.chain_fanout_param_index, &(forced_values->chain_fanout));
|
|
||||||
mca_base_param_lookup_int (mca_params.max_requests_param_index, &(forced_values->max_requests));
|
|
||||||
|
|
||||||
return (MPI_SUCCESS);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* special version of above just for barrier which only has one option available (at the moment...) */
|
|
||||||
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
|
|
||||||
coll_tuned_force_algorithm_params_t *forced_values)
|
|
||||||
{
|
|
||||||
mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
|
|
||||||
|
|
||||||
return (MPI_SUCCESS);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
@ -1,71 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
||||||
* University Research and Technology
|
|
||||||
* Corporation. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
||||||
* of Tennessee Research Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED
|
|
||||||
#define MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED
|
|
||||||
|
|
||||||
#include "ompi_config.h"
|
|
||||||
|
|
||||||
#if defined(c_plusplus) || defined(__cplusplus)
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/* this structure is for storing the indexes to the forced algorithm mca params... */
|
|
||||||
/* we get these at component query (so that registered values appear in ompi_info) */
|
|
||||||
|
|
||||||
struct coll_tuned_force_algorithm_mca_param_indices_t {
|
|
||||||
int algorithm_param_index; /* which algorithm you want to force */
|
|
||||||
int segsize_param_index; /* segsize to use (if supported), 0 = no segmentation */
|
|
||||||
int tree_fanout_param_index; /* tree fanout/in to use */
|
|
||||||
int chain_fanout_param_index; /* K-chain fanout/in to use */
|
|
||||||
int max_requests_param_index; /* Maximum number of outstanding send or recv requests */
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef struct coll_tuned_force_algorithm_mca_param_indices_t coll_tuned_force_algorithm_mca_param_indices_t;
|
|
||||||
|
|
||||||
|
|
||||||
/* the following type is for storing actual value obtained from the MCA on each tuned module */
|
|
||||||
/* via their mca param indices lookup in the component */
|
|
||||||
/* this structure is stored once per collective type per communicator... */
|
|
||||||
struct coll_tuned_force_algorithm_params_t {
|
|
||||||
int algorithm; /* which algorithm you want to force */
|
|
||||||
int segsize; /* segsize to use (if supported), 0 = no segmentation */
|
|
||||||
int tree_fanout; /* tree fanout/in to use */
|
|
||||||
int chain_fanout; /* K-chain fanout/in to use */
|
|
||||||
int max_requests; /* Maximum number of outstanding send or recv requests */
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef struct coll_tuned_force_algorithm_params_t coll_tuned_force_algorithm_params_t;
|
|
||||||
|
|
||||||
|
|
||||||
/* prototypes */
|
|
||||||
|
|
||||||
int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
|
|
||||||
coll_tuned_force_algorithm_params_t *forced_values);
|
|
||||||
|
|
||||||
/* barrier has fewer options than any other collective so it gets its own special function */
|
|
||||||
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
|
|
||||||
coll_tuned_force_algorithm_params_t *forced_values);
|
|
||||||
|
|
||||||
#if defined(c_plusplus) || defined(__cplusplus)
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#endif /* MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED */
|
|
||||||
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -440,49 +440,49 @@ ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_pa
|
|||||||
ompi_coll_tuned_forced_max_algorithms[GATHER] = max_alg;
|
ompi_coll_tuned_forced_max_algorithms[GATHER] = max_alg;
|
||||||
|
|
||||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||||
"gather_algorithm_count",
|
"gather_algorithm_count",
|
||||||
"Number of gather algorithms available",
|
"Number of gather algorithms available",
|
||||||
false, true, max_alg, NULL);
|
false, true, max_alg, NULL);
|
||||||
|
|
||||||
mca_param_indices->algorithm_param_index
|
mca_param_indices->algorithm_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"gather_algorithm",
|
"gather_algorithm",
|
||||||
"Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 linear with synchronization.",
|
"Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 linear with synchronization.",
|
||||||
false, false, 0, NULL);
|
false, false, 0, NULL);
|
||||||
if (mca_param_indices->algorithm_param_index < 0) {
|
if (mca_param_indices->algorithm_param_index < 0) {
|
||||||
return mca_param_indices->algorithm_param_index;
|
return mca_param_indices->algorithm_param_index;
|
||||||
}
|
}
|
||||||
mca_base_param_lookup_int(mca_param_indices->algorithm_param_index,
|
mca_base_param_lookup_int(mca_param_indices->algorithm_param_index,
|
||||||
&(requested_alg));
|
&(requested_alg));
|
||||||
if( 0 > requested_alg || requested_alg > max_alg ) {
|
if( 0 > requested_alg || requested_alg > max_alg ) {
|
||||||
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
||||||
opal_output( 0, "Gather algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
|
opal_output( 0, "Gather algorithm #%d is not available (range [0..%d]). Switching back to default(0)\n",
|
||||||
requested_alg, max_alg );
|
requested_alg, max_alg );
|
||||||
}
|
}
|
||||||
mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0);
|
mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
mca_param_indices->segsize_param_index
|
mca_param_indices->segsize_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"gather_algorithm_segmentsize",
|
"gather_algorithm_segmentsize",
|
||||||
"Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
"Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||||
false, false, 0, NULL);
|
false, false, 0, NULL);
|
||||||
|
|
||||||
mca_param_indices->tree_fanout_param_index
|
mca_param_indices->tree_fanout_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"gather_algorithm_tree_fanout",
|
"gather_algorithm_tree_fanout",
|
||||||
"Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
"Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||||
false, false,
|
false, false,
|
||||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
mca_param_indices->chain_fanout_param_index
|
mca_param_indices->chain_fanout_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"gather_algorithm_chain_fanout",
|
"gather_algorithm_chain_fanout",
|
||||||
"Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
"Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||||
false, false,
|
false, false,
|
||||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
return (MPI_SUCCESS);
|
return (MPI_SUCCESS);
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -31,7 +31,6 @@
|
|||||||
#include "coll_tuned_topo.h"
|
#include "coll_tuned_topo.h"
|
||||||
#include "coll_tuned_dynamic_rules.h"
|
#include "coll_tuned_dynamic_rules.h"
|
||||||
#include "coll_tuned_dynamic_file.h"
|
#include "coll_tuned_dynamic_file.h"
|
||||||
#include "coll_tuned_forced.h"
|
|
||||||
|
|
||||||
static int tuned_module_enable(mca_coll_base_module_t *module,
|
static int tuned_module_enable(mca_coll_base_module_t *module,
|
||||||
struct ompi_communicator_t *comm);
|
struct ompi_communicator_t *comm);
|
||||||
@ -61,27 +60,21 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
|
|||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:module_tuned query called"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:module_tuned query called"));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* No support for inter-communicator yet.
|
||||||
|
*/
|
||||||
|
if (OMPI_COMM_IS_INTER(comm)) {
|
||||||
|
*priority = 0;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If it is intra-communicator and size is less than 2 we have specialized modules
|
* If it is intra-communicator and size is less than 2 we have specialized modules
|
||||||
* to handle the intra collective communications.
|
* to handle the intra collective communications.
|
||||||
*/
|
*/
|
||||||
if (OMPI_COMM_IS_INTRA(comm) && ompi_comm_size(comm) < 2) {
|
if (OMPI_COMM_IS_INTRA(comm) && ompi_comm_size(comm) < 2) {
|
||||||
*priority = 0;
|
*priority = 0;
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
|
||||||
|
|
||||||
if (OMPI_COMM_IS_INTER(comm)) {
|
|
||||||
#if 0
|
|
||||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using inter_dynamic"));
|
|
||||||
} else {
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using inter_fixed"));
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
/* tuned does not support inter-communicator yet */
|
|
||||||
*priority = 0;
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
tuned_module = OBJ_NEW(mca_coll_tuned_module_t);
|
tuned_module = OBJ_NEW(mca_coll_tuned_module_t);
|
||||||
@ -99,72 +92,86 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
|
|||||||
tuned_module->super.ft_event = mca_coll_tuned_ft_event;
|
tuned_module->super.ft_event = mca_coll_tuned_ft_event;
|
||||||
|
|
||||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_dynamic"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_dynamic"));
|
||||||
|
|
||||||
tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_alltoallw = NULL;
|
|
||||||
tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_exscan = NULL;
|
|
||||||
tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_gatherv = NULL;
|
|
||||||
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_scan = NULL;
|
|
||||||
tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_dynamic;
|
|
||||||
tuned_module->super.coll_scatterv = NULL;
|
|
||||||
|
|
||||||
|
tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_alltoallw = NULL;
|
||||||
|
tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_exscan = NULL;
|
||||||
|
tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_gatherv = NULL;
|
||||||
|
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_scan = NULL;
|
||||||
|
tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_dynamic;
|
||||||
|
tuned_module->super.coll_scatterv = NULL;
|
||||||
} else {
|
} else {
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_fixed"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_fixed"));
|
||||||
|
|
||||||
tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_alltoallw = NULL;
|
|
||||||
tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_exscan = NULL;
|
|
||||||
tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_gatherv = NULL;
|
|
||||||
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_scan = NULL;
|
|
||||||
tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_fixed;
|
|
||||||
tuned_module->super.coll_scatterv = NULL;
|
|
||||||
|
|
||||||
|
tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_alltoallw = NULL;
|
||||||
|
tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_exscan = NULL;
|
||||||
|
tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_gatherv = NULL;
|
||||||
|
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_scan = NULL;
|
||||||
|
tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_fixed;
|
||||||
|
tuned_module->super.coll_scatterv = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
return &(tuned_module->super);
|
return &(tuned_module->super);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* We put all routines that handle the MCA user forced algorithm and parameter choices here */
|
||||||
|
/* recheck the setting of forced, called on module create (i.e. for each new comm) */
|
||||||
|
|
||||||
|
static int
|
||||||
|
ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
|
||||||
|
coll_tuned_force_algorithm_params_t *forced_values )
|
||||||
|
{
|
||||||
|
coll_tuned_force_algorithm_mca_param_indices_t* mca_params;
|
||||||
|
|
||||||
|
mca_params = &(ompi_coll_tuned_forced_params[type]);
|
||||||
|
|
||||||
|
mca_base_param_lookup_int (mca_params->algorithm_param_index, &(forced_values->algorithm));
|
||||||
|
if( BARRIER != type ) {
|
||||||
|
mca_base_param_lookup_int (mca_params->segsize_param_index, &(forced_values->segsize));
|
||||||
|
mca_base_param_lookup_int (mca_params->tree_fanout_param_index, &(forced_values->tree_fanout));
|
||||||
|
mca_base_param_lookup_int (mca_params->chain_fanout_param_index, &(forced_values->chain_fanout));
|
||||||
|
mca_base_param_lookup_int (mca_params->max_requests_param_index, &(forced_values->max_requests));
|
||||||
|
}
|
||||||
|
return (MPI_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Init module on the communicator
|
* Init module on the communicator
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
tuned_module_enable(mca_coll_base_module_t *module,
|
tuned_module_enable( mca_coll_base_module_t *module,
|
||||||
struct ompi_communicator_t *comm)
|
struct ompi_communicator_t *comm )
|
||||||
{
|
{
|
||||||
int size;
|
int size, i;
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t *) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t *) module;
|
||||||
mca_coll_tuned_comm_t *data = NULL;
|
mca_coll_tuned_comm_t *data = NULL;
|
||||||
/* fanout parameters */
|
|
||||||
int rc=0;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));
|
||||||
|
|
||||||
/* This routine will become more complex and might have to be */
|
/* This routine will become more complex and might have to be
|
||||||
/* broken into more sections/function calls */
|
* broken into more sections/function calls
|
||||||
|
*
|
||||||
/* Order of operations:
|
* Order of operations:
|
||||||
* alloc memory for nb reqs (in case we fall through)
|
* alloc memory for nb reqs (in case we fall through)
|
||||||
* add decision rules if using dynamic rules
|
* add decision rules if using dynamic rules
|
||||||
* compact rules using communicator size info etc
|
* compact rules using communicator size info etc
|
||||||
@ -174,15 +181,13 @@ tuned_module_enable(mca_coll_base_module_t *module,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* Allocate the data that hangs off the communicator */
|
/* Allocate the data that hangs off the communicator */
|
||||||
|
|
||||||
if (OMPI_COMM_IS_INTER(comm)) {
|
if (OMPI_COMM_IS_INTER(comm)) {
|
||||||
size = ompi_comm_remote_size(comm);
|
size = ompi_comm_remote_size(comm);
|
||||||
} else {
|
} else {
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
/*
|
|
||||||
* we still malloc data as it is used by the TUNED modules
|
* we still malloc data as it is used by the TUNED modules
|
||||||
* if we don't allocate it and fall back to a BASIC module routine then confuses debuggers
|
* if we don't allocate it and fall back to a BASIC module routine then confuses debuggers
|
||||||
* we place any special info after the default data
|
* we place any special info after the default data
|
||||||
@ -195,12 +200,9 @@ tuned_module_enable(mca_coll_base_module_t *module,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* if we are within the memory/size limit, allow preallocated data */
|
/* if we are within the memory/size limit, allow preallocated data */
|
||||||
|
if( size <= ompi_coll_tuned_preallocate_memory_comm_size_limit ) {
|
||||||
|
|
||||||
if (size<=ompi_coll_tuned_preallocate_memory_comm_size_limit) {
|
|
||||||
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t) +
|
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t) +
|
||||||
(sizeof(ompi_request_t *) * size * 2));
|
(sizeof(ompi_request_t *) * size * 2));
|
||||||
|
|
||||||
if (NULL == data) {
|
if (NULL == data) {
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
@ -208,7 +210,6 @@ tuned_module_enable(mca_coll_base_module_t *module,
|
|||||||
data->mcct_num_reqs = size * 2;
|
data->mcct_num_reqs = size * 2;
|
||||||
} else {
|
} else {
|
||||||
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t));
|
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t));
|
||||||
|
|
||||||
if (NULL == data) {
|
if (NULL == data) {
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
@ -216,138 +217,62 @@ tuned_module_enable(mca_coll_base_module_t *module,
|
|||||||
data->mcct_num_reqs = 0;
|
data->mcct_num_reqs = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
/*
|
|
||||||
* If using dynamic and you are MPI_COMM_WORLD and you want to use a parameter file..
|
* If using dynamic and you are MPI_COMM_WORLD and you want to use a parameter file..
|
||||||
* then this affects how much storage space you need
|
* then this affects how much storage space you need
|
||||||
* (This is a basic version of what will go into V2)
|
* (This is a basic version of what will go into V2)
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
size = ompi_comm_size(comm); /* find size so we can (A) decide if to access the file directly */
|
|
||||||
/* (B) so we can get our very own customised ompi_coll_com_rule_t ptr */
|
|
||||||
/* which only has rules in it for our com size */
|
|
||||||
|
|
||||||
/* actually if they are below a threshold, they all open it */
|
|
||||||
/* have to build a collective in here.. but just for MCW.. */
|
|
||||||
/* but we have to make sure we have the same rules everywhere :( */
|
|
||||||
|
|
||||||
/* if using dynamic rules, make sure all overrides are NULL before we start, so we do not override anything accidentally */
|
/* if using dynamic rules, make sure all overrides are NULL before we start, so we do not override anything accidentally */
|
||||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||||
/* base rules */
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init MCW & Dynamic"));
|
||||||
data->all_base_rules = (ompi_coll_alg_rule_t*) NULL;
|
|
||||||
|
|
||||||
/* each collective rule for my com size */
|
/**
|
||||||
for (i=0;i<COLLCOUNT;i++) {
|
* next dynamic state, recheck all forced rules as well
|
||||||
data->com_rules[i] = (ompi_coll_com_rule_t*) NULL;
|
* warning, we should check to make sure this is really an INTRA comm here...
|
||||||
|
*/
|
||||||
|
ompi_coll_tuned_forced_getvalues( ALLGATHER, &(data->user_forced[ALLGATHER]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( ALLGATHERV, &(data->user_forced[ALLGATHERV]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( ALLREDUCE, &(data->user_forced[ALLREDUCE]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( ALLTOALL, &(data->user_forced[ALLTOALL]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( ALLTOALLV, &(data->user_forced[ALLTOALLV]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( ALLTOALLW, &(data->user_forced[ALLTOALLW]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( BARRIER, &(data->user_forced[BARRIER]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( BCAST, &(data->user_forced[BCAST]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( EXSCAN, &(data->user_forced[EXSCAN]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( GATHER, &(data->user_forced[GATHER]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( GATHERV, &(data->user_forced[GATHERV]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( REDUCE, &(data->user_forced[REDUCE]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( REDUCESCATTER, &(data->user_forced[REDUCESCATTER]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( SCAN, &(data->user_forced[SCAN]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( SCATTER, &(data->user_forced[SCATTER]));
|
||||||
|
ompi_coll_tuned_forced_getvalues( SCATTERV, &(data->user_forced[SCATTERV]));
|
||||||
|
|
||||||
|
if( NULL != mca_coll_tuned_component.all_base_rules ) {
|
||||||
|
/* extract our customized communicator sized rule set, for each collective */
|
||||||
|
for( i = 0; i < COLLCOUNT; i++ ) {
|
||||||
|
data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr( mca_coll_tuned_component.all_base_rules,
|
||||||
|
i, size );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* next dynamic state, recheck all forced rules as well */
|
|
||||||
/* warning, we should check to make sure this is really an INTRA comm here... */
|
|
||||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLREDUCE], &(data->user_forced[ALLREDUCE]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALL], &(data->user_forced[ALLTOALL]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLGATHER], &(data->user_forced[ALLGATHER]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLGATHERV], &(data->user_forced[ALLGATHERV]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALLV], &(data->user_forced[ALLTOALLV]));
|
|
||||||
ompi_coll_tuned_forced_getvalues_barrier (ompi_coll_tuned_forced_params[BARRIER], &(data->user_forced[BARRIER]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[BCAST], &(data->user_forced[BCAST]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[REDUCE], &(data->user_forced[REDUCE]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[REDUCESCATTER], &(data->user_forced[REDUCESCATTER]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[GATHER], &(data->user_forced[GATHER]));
|
|
||||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[SCATTER], &(data->user_forced[SCATTER]));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if (&ompi_mpi_comm_world.comm==comm) {
|
|
||||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init MCW & Dynamic"));
|
|
||||||
if (ompi_coll_tuned_dynamic_rules_filename) {
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Opening [%s]",
|
|
||||||
ompi_coll_tuned_dynamic_rules_filename));
|
|
||||||
rc = ompi_coll_tuned_read_rules_config_file (ompi_coll_tuned_dynamic_rules_filename,
|
|
||||||
&(data->all_base_rules), COLLCOUNT);
|
|
||||||
if (rc>=0) {
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Read %d valid rules\n", rc));
|
|
||||||
/* at this point we all have a base set of rules */
|
|
||||||
/* now we can get our customized communicator sized rule set, for each collective */
|
|
||||||
for (i=0;i<COLLCOUNT;i++) {
|
|
||||||
data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr (data->all_base_rules, i, size);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else { /* failed to read config file, thus make sure its a NULL... */
|
|
||||||
data->all_base_rules = (ompi_coll_alg_rule_t*) NULL;
|
|
||||||
}
|
|
||||||
} /* end if a config filename exists */
|
|
||||||
} /* end if dynamic_rules */
|
|
||||||
} /* end if MCW */
|
|
||||||
|
|
||||||
/* ok, if using dynamic rules, not MCW and we are just any rank and a base set of rules exist.. ref them */
|
|
||||||
/* order of eval is important here, if we are MCW ompi_mpi_comm_world.c_coll_selected_data is NULL still.. */
|
|
||||||
|
|
||||||
#if 0 /* FIXME: don't know how to deal with this */
|
|
||||||
if ((ompi_coll_tuned_use_dynamic_rules)&&(!(&ompi_mpi_comm_world==comm))&&
|
|
||||||
((ompi_mpi_comm_world.c_coll_selected_data)->all_base_rules)) {
|
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init NOT MCW & Dynamic"));
|
|
||||||
|
|
||||||
/* this will, erm fail if MCW doesn't exist which it should! */
|
|
||||||
data->all_base_rules = (ompi_mpi_comm_world.c_coll_selected_data)->all_base_rules;
|
|
||||||
|
|
||||||
/* at this point we all have a base set of rules if they exist at all */
|
|
||||||
/* now we can get our customized communicator sized rule set, for each collective */
|
|
||||||
for (i=0;i<COLLCOUNT;i++) {
|
|
||||||
data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr (data->all_base_rules, i, size);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
|
||||||
* now for the cached topo functions
|
|
||||||
* guess the initial topologies to use rank 0 as root
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* general n fan out tree */
|
/* general n fan out tree */
|
||||||
data->cached_ntree = ompi_coll_tuned_topo_build_tree (ompi_coll_tuned_init_tree_fanout,
|
data->cached_ntree = NULL;
|
||||||
comm, 0);
|
|
||||||
data->cached_ntree_root = 0;
|
|
||||||
data->cached_ntree_fanout = ompi_coll_tuned_init_tree_fanout;
|
|
||||||
|
|
||||||
/* binary tree */
|
/* binary tree */
|
||||||
data->cached_bintree = ompi_coll_tuned_topo_build_tree (2, comm, 0);
|
data->cached_bintree = NULL;
|
||||||
data->cached_bintree_root = 0;
|
|
||||||
|
|
||||||
/* binomial tree */
|
/* binomial tree */
|
||||||
data->cached_bmtree = ompi_coll_tuned_topo_build_bmtree (comm, 0);
|
data->cached_bmtree = NULL;
|
||||||
data->cached_bmtree_root = 0;
|
|
||||||
|
|
||||||
/* in-order binomial tree */
|
/* in-order binomial tree */
|
||||||
data->cached_in_order_bmtree = ompi_coll_tuned_topo_build_in_order_bmtree (comm, 0);
|
data->cached_in_order_bmtree = NULL;
|
||||||
data->cached_in_order_bmtree_root = 0;
|
/* chains (fanout followed by pipelines) */
|
||||||
/*
|
data->cached_chain = NULL;
|
||||||
* chains (fanout followed by pipelines)
|
|
||||||
* are more difficult as the fan out really really depends on message size [sometimes]..
|
|
||||||
* as size gets larger fan-out gets smaller [usually]
|
|
||||||
*
|
|
||||||
* will probably change how we cache this later, for now a midsize
|
|
||||||
* GEF
|
|
||||||
*/
|
|
||||||
data->cached_chain = ompi_coll_tuned_topo_build_chain (ompi_coll_tuned_init_chain_fanout,
|
|
||||||
comm, 0);
|
|
||||||
data->cached_chain_root = 0;
|
|
||||||
data->cached_chain_fanout = ompi_coll_tuned_init_chain_fanout;
|
|
||||||
|
|
||||||
/* standard pipeline */
|
/* standard pipeline */
|
||||||
data->cached_pipeline = ompi_coll_tuned_topo_build_chain (1, comm, 0);
|
data->cached_pipeline = NULL;
|
||||||
data->cached_pipeline_root = 0;
|
|
||||||
|
|
||||||
/* in-order binary tree */
|
/* in-order binary tree */
|
||||||
data->cached_in_order_bintree = ompi_coll_tuned_topo_build_in_order_bintree(comm);
|
data->cached_in_order_bintree = NULL;
|
||||||
|
|
||||||
/* All done */
|
/* All done */
|
||||||
|
|
||||||
tuned_module->tuned_data = data;
|
tuned_module->tuned_data = data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
@ -43,7 +43,7 @@
|
|||||||
int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
|
int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
|
||||||
ompi_datatype_t* datatype, ompi_op_t* op,
|
ompi_datatype_t* datatype, ompi_op_t* op,
|
||||||
int root, ompi_communicator_t* comm,
|
int root, ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
ompi_coll_tree_t* tree, int count_by_segment,
|
ompi_coll_tree_t* tree, int count_by_segment,
|
||||||
int max_outstanding_reqs )
|
int max_outstanding_reqs )
|
||||||
{
|
{
|
||||||
@ -79,7 +79,7 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
|
|||||||
if( tree->tree_nextsize > 0 ) {
|
if( tree->tree_nextsize > 0 ) {
|
||||||
ptrdiff_t true_lower_bound, true_extent, real_segment_size;
|
ptrdiff_t true_lower_bound, true_extent, real_segment_size;
|
||||||
ompi_datatype_get_true_extent( datatype, &true_lower_bound,
|
ompi_datatype_get_true_extent( datatype, &true_lower_bound,
|
||||||
&true_extent );
|
&true_extent );
|
||||||
|
|
||||||
/* handle non-existent recv buffer (i.e. it's NULL) and
|
/* handle non-existent recv buffer (i.e. it's NULL) and
|
||||||
protect the recv buffer on non-root nodes */
|
protect the recv buffer on non-root nodes */
|
||||||
@ -88,18 +88,18 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
|
|||||||
/* Allocate temporary accumulator buffer. */
|
/* Allocate temporary accumulator buffer. */
|
||||||
accumbuf_free = (char*)malloc(true_extent +
|
accumbuf_free = (char*)malloc(true_extent +
|
||||||
(original_count - 1) * extent);
|
(original_count - 1) * extent);
|
||||||
if (accumbuf_free == NULL) {
|
if (accumbuf_free == NULL) {
|
||||||
line = __LINE__; ret = -1; goto error_hndl;
|
line = __LINE__; ret = -1; goto error_hndl;
|
||||||
}
|
}
|
||||||
accumbuf = accumbuf_free - lower_bound;
|
accumbuf = accumbuf_free - lower_bound;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If this is a non-commutative operation we must copy
|
/* If this is a non-commutative operation we must copy
|
||||||
sendbuf to the accumbuf, in order to simplify the loops */
|
sendbuf to the accumbuf, in order to simplify the loops */
|
||||||
if (!ompi_op_is_commute(op)) {
|
if (!ompi_op_is_commute(op)) {
|
||||||
ompi_datatype_copy_content_same_ddt(datatype, original_count,
|
ompi_datatype_copy_content_same_ddt(datatype, original_count,
|
||||||
(char*)accumbuf,
|
(char*)accumbuf,
|
||||||
(char*)sendtmpbuf);
|
(char*)sendtmpbuf);
|
||||||
}
|
}
|
||||||
/* Allocate two buffers for incoming segments */
|
/* Allocate two buffers for incoming segments */
|
||||||
real_segment_size = true_extent + (count_by_segment - 1) * extent;
|
real_segment_size = true_extent + (count_by_segment - 1) * extent;
|
||||||
@ -232,11 +232,11 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
|
|||||||
the number of segments we have two options:
|
the number of segments we have two options:
|
||||||
- send all segments using blocking send to the parent, or
|
- send all segments using blocking send to the parent, or
|
||||||
- avoid overflooding the parent nodes by limiting the number of
|
- avoid overflooding the parent nodes by limiting the number of
|
||||||
outstanding requests to max_outstanding_reqs.
|
outstanding requests to max_outstanding_reqs.
|
||||||
TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size
|
TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size
|
||||||
for the current communication, synchronization should be used only
|
for the current communication, synchronization should be used only
|
||||||
when the message/segment size is smaller than the eager size.
|
when the message/segment size is smaller than the eager size.
|
||||||
*/
|
*/
|
||||||
else {
|
else {
|
||||||
|
|
||||||
/* If the number of segments is less than a maximum number of outstanding
|
/* If the number of segments is less than a maximum number of outstanding
|
||||||
@ -266,9 +266,9 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
|
|||||||
/* Otherwise, introduce flow control:
|
/* Otherwise, introduce flow control:
|
||||||
- post max_outstanding_reqs non-blocking synchronous send,
|
- post max_outstanding_reqs non-blocking synchronous send,
|
||||||
- for remaining segments
|
- for remaining segments
|
||||||
- wait for a ssend to complete, and post the next one.
|
- wait for a ssend to complete, and post the next one.
|
||||||
- wait for all outstanding sends to complete.
|
- wait for all outstanding sends to complete.
|
||||||
*/
|
*/
|
||||||
else {
|
else {
|
||||||
|
|
||||||
int creq = 0;
|
int creq = 0;
|
||||||
@ -346,7 +346,7 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
|||||||
ompi_datatype_t* datatype,
|
ompi_datatype_t* datatype,
|
||||||
ompi_op_t* op, int root,
|
ompi_op_t* op, int root,
|
||||||
ompi_communicator_t* comm,
|
ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize, int fanout,
|
uint32_t segsize, int fanout,
|
||||||
int max_outstanding_reqs )
|
int max_outstanding_reqs )
|
||||||
{
|
{
|
||||||
@ -376,7 +376,7 @@ int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
|
|||||||
int count, ompi_datatype_t* datatype,
|
int count, ompi_datatype_t* datatype,
|
||||||
ompi_op_t* op, int root,
|
ompi_op_t* op, int root,
|
||||||
ompi_communicator_t* comm,
|
ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize,
|
uint32_t segsize,
|
||||||
int max_outstanding_reqs )
|
int max_outstanding_reqs )
|
||||||
{
|
{
|
||||||
@ -407,7 +407,7 @@ int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf,
|
|||||||
int count, ompi_datatype_t* datatype,
|
int count, ompi_datatype_t* datatype,
|
||||||
ompi_op_t* op, int root,
|
ompi_op_t* op, int root,
|
||||||
ompi_communicator_t* comm,
|
ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize,
|
uint32_t segsize,
|
||||||
int max_outstanding_reqs )
|
int max_outstanding_reqs )
|
||||||
{
|
{
|
||||||
@ -438,7 +438,7 @@ int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
|
|||||||
int count, ompi_datatype_t* datatype,
|
int count, ompi_datatype_t* datatype,
|
||||||
ompi_op_t* op, int root,
|
ompi_op_t* op, int root,
|
||||||
ompi_communicator_t* comm,
|
ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize,
|
uint32_t segsize,
|
||||||
int max_outstanding_reqs )
|
int max_outstanding_reqs )
|
||||||
{
|
{
|
||||||
@ -477,7 +477,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
|
|||||||
ompi_datatype_t* datatype,
|
ompi_datatype_t* datatype,
|
||||||
ompi_op_t* op, int root,
|
ompi_op_t* op, int root,
|
||||||
ompi_communicator_t* comm,
|
ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize,
|
uint32_t segsize,
|
||||||
int max_outstanding_reqs )
|
int max_outstanding_reqs )
|
||||||
{
|
{
|
||||||
@ -514,57 +514,57 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
|
|||||||
use_this_sendbuf = sendbuf;
|
use_this_sendbuf = sendbuf;
|
||||||
use_this_recvbuf = recvbuf;
|
use_this_recvbuf = recvbuf;
|
||||||
if (io_root != root) {
|
if (io_root != root) {
|
||||||
ptrdiff_t tlb, text, lb, ext;
|
ptrdiff_t tlb, text, lb, ext;
|
||||||
char *tmpbuf = NULL;
|
char *tmpbuf = NULL;
|
||||||
|
|
||||||
ompi_datatype_get_extent(datatype, &lb, &ext);
|
ompi_datatype_get_extent(datatype, &lb, &ext);
|
||||||
ompi_datatype_get_true_extent(datatype, &tlb, &text);
|
ompi_datatype_get_true_extent(datatype, &tlb, &text);
|
||||||
|
|
||||||
if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
|
if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
|
||||||
tmpbuf = (char *) malloc(text + (count - 1) * ext);
|
tmpbuf = (char *) malloc(text + (count - 1) * ext);
|
||||||
if (NULL == tmpbuf) {
|
if (NULL == tmpbuf) {
|
||||||
return MPI_ERR_INTERN;
|
return MPI_ERR_INTERN;
|
||||||
}
|
}
|
||||||
ompi_datatype_copy_content_same_ddt(datatype, count,
|
ompi_datatype_copy_content_same_ddt(datatype, count,
|
||||||
(char*)tmpbuf,
|
(char*)tmpbuf,
|
||||||
(char*)recvbuf);
|
(char*)recvbuf);
|
||||||
use_this_sendbuf = tmpbuf;
|
use_this_sendbuf = tmpbuf;
|
||||||
} else if (io_root == rank) {
|
} else if (io_root == rank) {
|
||||||
tmpbuf = (char *) malloc(text + (count - 1) * ext);
|
tmpbuf = (char *) malloc(text + (count - 1) * ext);
|
||||||
if (NULL == tmpbuf) {
|
if (NULL == tmpbuf) {
|
||||||
return MPI_ERR_INTERN;
|
return MPI_ERR_INTERN;
|
||||||
}
|
}
|
||||||
use_this_recvbuf = tmpbuf;
|
use_this_recvbuf = tmpbuf;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Use generic reduce with in-order binary tree topology and io_root */
|
/* Use generic reduce with in-order binary tree topology and io_root */
|
||||||
ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
|
ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
|
||||||
op, io_root, comm, module,
|
op, io_root, comm, module,
|
||||||
data->cached_in_order_bintree,
|
data->cached_in_order_bintree,
|
||||||
segcount, max_outstanding_reqs );
|
segcount, max_outstanding_reqs );
|
||||||
if (MPI_SUCCESS != ret) { return ret; }
|
if (MPI_SUCCESS != ret) { return ret; }
|
||||||
|
|
||||||
/* Clean up */
|
/* Clean up */
|
||||||
if (io_root != root) {
|
if (io_root != root) {
|
||||||
if (root == rank) {
|
if (root == rank) {
|
||||||
/* Receive result from rank io_root to recvbuf */
|
/* Receive result from rank io_root to recvbuf */
|
||||||
ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
|
ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
|
||||||
MCA_COLL_BASE_TAG_REDUCE, comm,
|
MCA_COLL_BASE_TAG_REDUCE, comm,
|
||||||
MPI_STATUS_IGNORE));
|
MPI_STATUS_IGNORE));
|
||||||
if (MPI_SUCCESS != ret) { return ret; }
|
if (MPI_SUCCESS != ret) { return ret; }
|
||||||
if (MPI_IN_PLACE == sendbuf) {
|
if (MPI_IN_PLACE == sendbuf) {
|
||||||
free(use_this_sendbuf);
|
free(use_this_sendbuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (io_root == rank) {
|
} else if (io_root == rank) {
|
||||||
/* Send result from use_this_recvbuf to root */
|
/* Send result from use_this_recvbuf to root */
|
||||||
ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
|
ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
|
||||||
MCA_COLL_BASE_TAG_REDUCE,
|
MCA_COLL_BASE_TAG_REDUCE,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||||
if (MPI_SUCCESS != ret) { return ret; }
|
if (MPI_SUCCESS != ret) { return ret; }
|
||||||
free(use_this_recvbuf);
|
free(use_this_recvbuf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
@ -596,8 +596,8 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i, rank, err, size;
|
int i, rank, err, size;
|
||||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||||
@ -650,7 +650,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
|||||||
|
|
||||||
if (rank == (size - 1)) {
|
if (rank == (size - 1)) {
|
||||||
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf,
|
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf,
|
||||||
(char*)sbuf);
|
(char*)sbuf);
|
||||||
} else {
|
} else {
|
||||||
err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
|
err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
|
||||||
MCA_COLL_BASE_TAG_REDUCE, comm,
|
MCA_COLL_BASE_TAG_REDUCE, comm,
|
||||||
@ -688,7 +688,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
|||||||
|
|
||||||
if (NULL != inplace_temp) {
|
if (NULL != inplace_temp) {
|
||||||
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf,
|
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf,
|
||||||
inplace_temp);
|
inplace_temp);
|
||||||
free(inplace_temp);
|
free(inplace_temp);
|
||||||
}
|
}
|
||||||
if (NULL != free_buffer) {
|
if (NULL != free_buffer) {
|
||||||
@ -789,7 +789,7 @@ int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op, int root,
|
struct ompi_op_t *op, int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -808,19 +808,19 @@ int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
|
|||||||
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype,
|
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module);
|
op, root, comm, module);
|
||||||
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype,
|
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, chain_fanout, max_requests);
|
segsize, chain_fanout, max_requests);
|
||||||
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype,
|
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype,
|
case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype,
|
case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
|
case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
@ -834,7 +834,7 @@ int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op, int root,
|
struct ompi_op_t *op, int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int algorithm, int faninout,
|
int algorithm, int faninout,
|
||||||
int segsize, int max_requests )
|
int segsize, int max_requests )
|
||||||
{
|
{
|
||||||
@ -843,23 +843,23 @@ int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
|
|||||||
|
|
||||||
switch (algorithm) {
|
switch (algorithm) {
|
||||||
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype,
|
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module);
|
op, root, comm, module);
|
||||||
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype,
|
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module);
|
op, root, comm, module);
|
||||||
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype,
|
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, faninout, max_requests);
|
segsize, faninout, max_requests);
|
||||||
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype,
|
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype,
|
case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype,
|
case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
|
case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
|
||||||
op, root, comm, module,
|
op, root, comm, module,
|
||||||
segsize, max_requests);
|
segsize, max_requests);
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user