
Rename the base header file containing the prototypes of the collective functions.
This commit is contained in:
George Bosilca 2015-02-15 14:47:27 -05:00
parent 8fbcdf685d
commit aa019e239e
21 changed files with 1400 additions and 3323 deletions

View file

@ -2,7 +2,7 @@
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# Copyright (c) 2004-2015 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -20,10 +20,25 @@ dist_ompidata_DATA = base/help-mca-coll-base.txt
headers += \
base/base.h \
base/coll_tags.h
base/coll_tags.h \
base/coll_base_topo.h \
base/coll_base_util.h
libmca_coll_la_SOURCES += \
base/coll_base_comm_select.c \
base/coll_base_comm_unselect.c \
base/coll_base_find_available.c \
base/coll_base_frame.c
base/coll_base_frame.c \
base/coll_base_bcast.c \
base/coll_base_scatter.c \
base/coll_base_topo.c \
base/coll_base_allgather.c \
base/coll_base_allgatherv.c \
base/coll_base_util.c \
base/coll_base_allreduce.c \
base/coll_base_alltoall.c \
base/coll_base_gather.c \
base/coll_base_alltoallv.c \
base/coll_base_reduce.c \
base/coll_base_barrier.c \
base/coll_base_reduce_scatter.c

View file

@ -87,7 +87,7 @@ int mca_coll_base_find_available(bool enable_progress_threads,
* coll component needs to be selected for it. It should be invoked
* near the end of the communicator creation process such that
* almost everything else is functional on the communicator (e.g.,
* point-to-point communication).
* point-to-point communication).
*
* Note that new communicators may be created as a result of
* invoking this function. Specifically: this function is called in

View file

@ -1,558 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_EXPORT_H
#define MCA_COLL_TUNED_EXPORT_H
#include "ompi_config.h"
#include "mpi.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_tuned_topo.h"
/* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h"
/* some fixed value index vars to simplify certain operations */
typedef enum COLLTYPE {
ALLGATHER = 0, /* 0 */
ALLGATHERV, /* 1 */
ALLREDUCE, /* 2 */
ALLTOALL, /* 3 */
ALLTOALLV, /* 4 */
ALLTOALLW, /* 5 */
BARRIER, /* 6 */
BCAST, /* 7 */
EXSCAN, /* 8 */
GATHER, /* 9 */
GATHERV, /* 10 */
REDUCE, /* 11 */
REDUCESCATTER, /* 12 */
SCAN, /* 13 */
SCATTER, /* 14 */
SCATTERV, /* 15 */
COLLCOUNT /* 16 end counter keep it as last element */
} COLLTYPE_T;
/* defined arg lists to simplify auto inclusion of user overriding decision functions */
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
/* end defined arg lists to simplify auto inclusion of user overriding decision functions */
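/* Illustration (editor's note, not part of the original header): with these
 * macros a declaration such as
 *     int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS);
 * expands to the full argument list
 *     int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
 *             struct ompi_datatype_t *sdtype, void *rbuf, int rcount,
 *             struct ompi_datatype_t *rdtype,
 *             struct ompi_communicator_t *comm,
 *             mca_coll_base_module_t *module);
 * so the dozens of prototypes below stay consistent when a common argument
 * changes. */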
BEGIN_C_DECLS
/* these are the same across all modules and are loaded at component query time */
extern int ompi_coll_tuned_stream;
extern int ompi_coll_tuned_priority;
extern int ompi_coll_tuned_preallocate_memory_comm_size_limit;
extern bool ompi_coll_tuned_use_dynamic_rules;
extern char* ompi_coll_tuned_dynamic_rules_filename;
extern int ompi_coll_tuned_init_tree_fanout;
extern int ompi_coll_tuned_init_chain_fanout;
extern int ompi_coll_tuned_init_max_requests;
extern int ompi_coll_tuned_alltoall_small_msg;
extern int ompi_coll_tuned_alltoall_intermediate_msg;
/* forced algorithm choices */
/* this structure is for storing the indexes to the forced algorithm mca params... */
/* we get these at component query (so that registered values appear in ompi_info) */
struct coll_tuned_force_algorithm_mca_param_indices_t {
int algorithm_param_index; /* which algorithm you want to force */
int segsize_param_index; /* segsize to use (if supported), 0 = no segmentation */
int tree_fanout_param_index; /* tree fanout/in to use */
int chain_fanout_param_index; /* K-chain fanout/in to use */
int max_requests_param_index; /* Maximum number of outstanding send or recv requests */
};
typedef struct coll_tuned_force_algorithm_mca_param_indices_t coll_tuned_force_algorithm_mca_param_indices_t;
/* the following type is for storing actual value obtained from the MCA on each tuned module */
/* via their mca param indices lookup in the component */
/* this structure is stored once per collective type per communicator... */
struct coll_tuned_force_algorithm_params_t {
int algorithm; /* which algorithm you want to force */
int segsize; /* segsize to use (if supported), 0 = no segmentation */
int tree_fanout; /* tree fanout/in to use */
int chain_fanout; /* K-chain fanout/in to use */
int max_requests; /* Maximum number of outstanding send or recv requests */
};
typedef struct coll_tuned_force_algorithm_params_t coll_tuned_force_algorithm_params_t;
/* the indices to the MCA params so that modules can look them up at open / comm create time */
extern coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
/* the actual max algorithm values (readonly), loaded at component open */
extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
/*
* coll API functions
*/
/* API functions */
int ompi_coll_tuned_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
mca_coll_base_module_t *
ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority);
/* API functions of decision functions and any implementations */
/*
* Note this gets long, as we need a prototype for each
* MPI collective four times: two for the comm type and two for each decision
* type.
* We might cut down the decision prototypes by conditional compilation.
*/
/* All Gather */
int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_do_forced(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_ring(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_neighborexchange(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_basic_linear(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_two_procs(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_inter_dec_fixed(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_inter_dec_dynamic(ALLGATHER_ARGS);
/* All GatherV */
int ompi_coll_tuned_allgatherv_intra_dec_fixed(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_do_forced(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_do_this(ALLGATHERV_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_allgatherv_intra_bruck(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_ring(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_inter_dec_fixed(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_inter_dec_dynamic(ALLGATHERV_ARGS);
/* All Reduce */
int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_ring(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_inter_dec_dynamic(ALLREDUCE_ARGS);
/* AlltoAll */
int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize, int max_requests);
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
int ompi_coll_tuned_alltoall_intra_two_procs(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_inter_dec_fixed(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_inter_dec_dynamic(ALLTOALL_ARGS);
/* AlltoAllV */
int ompi_coll_tuned_alltoallv_intra_dec_fixed(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_intra_dec_dynamic(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_intra_do_forced(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_intra_do_this(ALLTOALLV_ARGS, int algorithm);
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_inter_dec_fixed(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_inter_dec_dynamic(ALLTOALLV_ARGS);
/* AlltoAllW */
int ompi_coll_tuned_alltoallw_intra_dec_fixed(ALLTOALLW_ARGS);
int ompi_coll_tuned_alltoallw_intra_dec_dynamic(ALLTOALLW_ARGS);
int ompi_coll_tuned_alltoallw_inter_dec_fixed(ALLTOALLW_ARGS);
int ompi_coll_tuned_alltoallw_inter_dec_dynamic(ALLTOALLW_ARGS);
/* Barrier */
int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_two_procs(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_linear(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_tree(BARRIER_ARGS);
/* Bcast */
int ompi_coll_tuned_bcast_intra_generic( BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree );
int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
int ompi_coll_tuned_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
int ompi_coll_tuned_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
int ompi_coll_tuned_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
int ompi_coll_tuned_bcast_inter_dec_fixed(BCAST_ARGS);
int ompi_coll_tuned_bcast_inter_dec_dynamic(BCAST_ARGS);
/* Exscan */
int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS);
int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS);
int ompi_coll_tuned_exscan_inter_dec_fixed(EXSCAN_ARGS);
int ompi_coll_tuned_exscan_inter_dec_dynamic(EXSCAN_ARGS);
/* Gather */
int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS);
int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS);
/* GatherV */
int ompi_coll_tuned_gatherv_intra_dec_fixed(GATHERV_ARGS);
int ompi_coll_tuned_gatherv_intra_dec_dynamic(GATHERV_ARGS);
int ompi_coll_tuned_gatherv_inter_dec_fixed(GATHERV_ARGS);
int ompi_coll_tuned_gatherv_inter_dec_dynamic(GATHERV_ARGS);
/* Reduce */
int ompi_coll_tuned_reduce_generic( REDUCE_ARGS, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_outstanding_reqs);
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);
/* Reduce_scatter */
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_do_forced(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_inter_dec_fixed(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_inter_dec_dynamic(REDUCESCATTER_ARGS);
/* Scan */
int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS);
int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS);
int ompi_coll_tuned_scan_inter_dec_fixed(SCAN_ARGS);
int ompi_coll_tuned_scan_inter_dec_dynamic(SCAN_ARGS);
/* Scatter */
int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS);
int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
int ompi_coll_tuned_scatter_intra_do_forced(SCATTER_ARGS);
int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_scatter_intra_basic_linear(SCATTER_ARGS);
int ompi_coll_tuned_scatter_intra_binomial(SCATTER_ARGS);
int ompi_coll_tuned_scatter_inter_dec_fixed(SCATTER_ARGS);
int ompi_coll_tuned_scatter_inter_dec_dynamic(SCATTER_ARGS);
/* ScatterV */
int ompi_coll_tuned_scatterv_intra_dec_fixed(SCATTERV_ARGS);
int ompi_coll_tuned_scatterv_intra_dec_dynamic(SCATTERV_ARGS);
int ompi_coll_tuned_scatterv_inter_dec_fixed(SCATTERV_ARGS);
int ompi_coll_tuned_scatterv_inter_dec_dynamic(SCATTERV_ARGS);
int mca_coll_tuned_ft_event(int state);
/* Utility functions */
static inline void ompi_coll_tuned_free_reqs(ompi_request_t **reqs, int count)
{
int i;
for (i = 0; i < count; ++i)
ompi_request_free(&reqs[i]);
}
struct mca_coll_tuned_component_t {
/** Base coll component */
mca_coll_base_component_2_0_0_t super;
/** MCA parameter: Priority of this component */
int tuned_priority;
/** global stuff that I need the component to store */
/* MCA parameters first */
/* cached decision table stuff (moved from MCW module) */
ompi_coll_alg_rule_t *all_base_rules;
};
/**
* Convenience typedef
*/
typedef struct mca_coll_tuned_component_t mca_coll_tuned_component_t;
/**
* Global component instance
*/
OMPI_MODULE_DECLSPEC extern mca_coll_tuned_component_t mca_coll_tuned_component;
/*
* Data structure for hanging data off the communicator
* i.e. per module instance
*/
struct mca_coll_tuned_comm_t {
/* standard data for requests and PML usage */
/* Precreate space for requests
* Note this does not affect basic,
* but if in wrong context can confuse a debugger
* this is controlled by an MCA param
*/
ompi_request_t **mcct_reqs;
int mcct_num_reqs;
/*
* tuned topo information caching per communicator
*
* for each communicator we cache the topo information so we can
* reuse without regenerating if we change the root, [or fanout]
* then regenerate and recache this information
*/
/* general tree with n fan out */
ompi_coll_tree_t *cached_ntree;
int cached_ntree_root;
int cached_ntree_fanout;
/* binary tree */
ompi_coll_tree_t *cached_bintree;
int cached_bintree_root;
/* binomial tree */
ompi_coll_tree_t *cached_bmtree;
int cached_bmtree_root;
/* binomial tree */
ompi_coll_tree_t *cached_in_order_bmtree;
int cached_in_order_bmtree_root;
/* chained tree (fanout followed by pipelines) */
ompi_coll_tree_t *cached_chain;
int cached_chain_root;
int cached_chain_fanout;
/* pipeline */
ompi_coll_tree_t *cached_pipeline;
int cached_pipeline_root;
/* in-order binary tree (root of the in-order binary tree is rank 0) */
ompi_coll_tree_t *cached_in_order_bintree;
/* moving to the component */
ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comm size */
/* for forced algorithms we store the information on the module */
/* previously we only had one shared copy, oops, it really is per comm/module */
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
};
typedef struct mca_coll_tuned_comm_t mca_coll_tuned_comm_t;
struct mca_coll_tuned_module_t {
mca_coll_base_module_t super;
mca_coll_tuned_comm_t *tuned_data;
};
typedef struct mca_coll_tuned_module_t mca_coll_tuned_module_t;
OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t);
static inline void mca_coll_tuned_free_reqs(ompi_request_t ** reqs,
int count)
{
int i;
for (i = 0; i < count; ++i)
ompi_request_free(reqs + i);
}
END_C_DECLS
#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_bintree) \
&& (coll_comm->cached_bintree_root == (ROOT)) ) ) { \
if( coll_comm->cached_bintree ) { /* destroy previous binary tree if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bintree) ); \
} \
coll_comm->cached_bintree = ompi_coll_tuned_topo_build_tree(2,(OMPI_COMM),(ROOT)); \
coll_comm->cached_bintree_root = (ROOT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_bmtree) \
&& (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \
if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \
} \
coll_comm->cached_bmtree = ompi_coll_tuned_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \
coll_comm->cached_bmtree_root = (ROOT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_in_order_bmtree) \
&& (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \
if( coll_comm->cached_in_order_bmtree ) { /* destroy previous binomial if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \
} \
coll_comm->cached_in_order_bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \
coll_comm->cached_in_order_bmtree_root = (ROOT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_pipeline) \
&& (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \
if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \
} \
coll_comm->cached_pipeline = ompi_coll_tuned_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \
coll_comm->cached_pipeline_root = (ROOT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_CHAIN( OMPI_COMM, TUNED_MODULE, ROOT, FANOUT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_chain) \
&& (coll_comm->cached_chain_root == (ROOT)) \
&& (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \
if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_chain) ); \
} \
coll_comm->cached_chain = ompi_coll_tuned_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \
coll_comm->cached_chain_root = (ROOT); \
coll_comm->cached_chain_fanout = (FANOUT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, TUNED_MODULE ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !(coll_comm->cached_in_order_bintree) ) { \
/* In-order binary tree topology is defined by communicator size */ \
/* Thus, there is no need to destroy anything */ \
coll_comm->cached_in_order_bintree = \
ompi_coll_tuned_topo_build_in_order_bintree((OMPI_COMM)); \
} \
} while (0)
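/* Usage sketch (editor's illustration, not part of this commit): a collective
 * implementation typically refreshes the cached topology for the current root
 * and then walks the cached tree.  The helper below is hypothetical; only the
 * macro, the module/comm structures and the cached_bmtree field come from this
 * header. */
static inline int example_use_cached_bmtree(int root,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    /* Rebuild the binomial tree only if none is cached or the cached one
     * was built for a different root. */
    COLL_TUNED_UPDATE_BMTREE(comm, tuned_module, root);
    ompi_coll_tree_t *bmtree = tuned_module->tuned_data->cached_bmtree;
    /* ... receive from the parent and forward to the children recorded in
     * bmtree (see coll_tuned_topo.h for the tree layout) ... */
    (void) bmtree;
    return MPI_SUCCESS;
}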
/**
* This macro gives a generic way to compute the best count of
* the segment (i.e. the number of complete datatypes that
* can fit in the specified SEGSIZE). Beware, when this macro
* is called, the SEGCOUNT should be initialized to the count as
* expected by the collective call.
*/
#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT) \
if( ((SEGSIZE) >= (TYPELNG)) && \
((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) { \
size_t residual; \
(SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG)); \
residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG); \
if( residual > ((TYPELNG) >> 1) ) \
(SEGCOUNT)++; \
} \
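
/* Worked example (editor's illustration, not part of this commit): segmenting
 * a collective on 100000 elements of an 8-byte datatype into 32 KB segments.
 * SEGCOUNT must start at the full count, as noted above. */
static inline int example_computed_segcount(void)
{
    size_t typelng  = 8;       /* extent of one datatype element        */
    int    segcount = 100000;  /* initialized to the collective's count */
    COLL_TUNED_COMPUTED_SEGCOUNT(32768, typelng, segcount);
    /* segcount is now 4096: 32768 / 8 complete elements fit per segment,
     * and the residual is 0, so no rounding up happens. */
    return segcount;
}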
/**
* This macro gives a generic way to compute well-distributed block counts
* when the count and number of blocks are fixed.
* Macro returns "early-block" count, "late-block" count, and "split-index"
* which is the block at which we switch from "early-block" count to
* the "late-block" count.
* count = split_index * early_block_count +
* (block_count - split_index) * late_block_count
* We do not perform ANY error checks - make sure that the input values
* make sense (eg. count > num_blocks).
*/
#define COLL_TUNED_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX, \
EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS; \
SPLIT_INDEX = COUNT % NUM_BLOCKS; \
if (0 != SPLIT_INDEX) { \
EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1; \
} \
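
/* Worked example (editor's illustration, not part of this commit):
 * distributing COUNT = 10 elements over NUM_BLOCKS = 4 blocks. */
static inline void example_compute_blockcount(void)
{
    int early_count, late_count, split_index;
    COLL_TUNED_COMPUTE_BLOCKCOUNT(10, 4, split_index, early_count, late_count);
    /* early_count = 3, late_count = 2, split_index = 2:
     * 10 = 2 blocks of 3 ("early" blocks) + 2 blocks of 2 ("late" blocks),
     * matching count = split_index * early + (num_blocks - split_index) * late. */
    (void) split_index; (void) early_count; (void) late_count;
}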
#endif /* MCA_COLL_TUNED_EXPORT_H */

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,31 +30,12 @@
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* allgather algorithm variables */
static int coll_tuned_allgather_algorithm_count = 6;
static int coll_tuned_allgather_forced_algorithm = 0;
static int coll_tuned_allgather_segment_size = 0;
static int coll_tuned_allgather_tree_fanout;
static int coll_tuned_allgather_chain_fanout;
/* valid values for coll_tuned_allgather_forced_algorithm */
static mca_base_var_enum_value_t allgather_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "bruck"},
{3, "recursive_doubling"},
{4, "ring"},
{5, "neighbor"},
{6, "two_proc"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/*
* ompi_coll_tuned_allgather_intra_bruck
* ompi_coll_base_allgather_intra_bruck
*
* Function: allgather using O(log(N)) steps.
* Accepts: Same arguments as MPI_Allgather
@ -65,7 +46,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
* in Multiport Message-Passing Systems"
* Memory requirements: non-zero ranks require shift buffer to perform final
* step in the algorithm.
*
*
* Example on 6 nodes:
* Initialization: everyone has its own buffer at location 0 in rbuf
* This means if user specified MPI_IN_PLACE for sendbuf
@ -84,7 +65,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
* [2] [3] [4] [5] [0] [1]
* [3] [4] [5] [0] [1] [2]
* Step 2: send message to (rank - 2^2), receive message from (rank + 2^2)
* message size is "all remaining blocks"
* message size is "all remaining blocks"
* # 0 1 2 3 4 5
* [0] [1] [2] [3] [4] [5]
* [1] [2] [3] [4] [5] [0]
@ -101,7 +82,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
* [4] [4] [4] [4] [4] [4]
* [5] [5] [5] [5] [5] [5]
*/
int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
int ompi_coll_base_allgather_intra_bruck(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -115,8 +96,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_bruck rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_bruck rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -125,7 +106,7 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Initialization step:
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
receive buffer, else
- if rank r != 0, copy r^th block from receive buffer to block 0.
*/
@ -140,15 +121,15 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend);
if (err < 0) { line = __LINE__; goto err_hndl; }
}
/* Communication step:
At every step i, rank r:
- doubles the distance
- sends message which starts at begining of rbuf and has size
- sends message which starts at begining of rbuf and has size
(blockcount * rcount) to rank (r - distance)
- receives message of size blockcount * rcount from rank (r + distance)
at location (rbuf + distance * rcount * rext)
- blockcount doubles until last step when only the remaining data is
- blockcount doubles until last step when only the remaining data is
exchanged.
*/
blockcount = 1;
@ -162,14 +143,14 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
if (distance <= (size >> 1)) {
blockcount = distance;
} else {
} else {
blockcount = size - distance;
}
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, blockcount * rcount, rdtype,
err = ompi_coll_base_sendrecv(tmpsend, blockcount * rcount, rdtype,
sendto, MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, blockcount * rcount, rdtype,
tmprecv, blockcount * rcount, rdtype,
recvfrom, MCA_COLL_BASE_TAG_ALLGATHER,
comm, MPI_STATUS_IGNORE, rank);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -178,8 +159,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
/* Finalization step:
On all nodes except 0, data needs to be shifted locally:
- create temporary shift buffer,
see discussion in coll_basic_reduce.c about the size and begining
- create temporary shift buffer,
see discussion in coll_basic_reduce.c about the size and begining
of temporary buffer.
- copy blocks [0 .. (size - rank - 1)] from rbuf to shift buffer
- move blocks [(size - rank) .. size] from rbuf to begining of rbuf
@ -195,8 +176,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
free_buf = (char*) calloc(((true_extent +
((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount - 1) * rext)),
sizeof(char));
if (NULL == free_buf) {
line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl;
if (NULL == free_buf) {
line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl;
}
shift_buf = free_buf - true_lb;
@ -207,13 +188,13 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
/* 2. move blocks [(size - rank) .. size] from rbuf to the begining of rbuf */
tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount * rext;
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount,
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount,
rbuf, tmpsend);
if (err < 0) { line = __LINE__; goto err_hndl; }
/* 3. copy blocks from shift buffer back to rbuf starting at block [rank]. */
tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount,
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount,
tmprecv, shift_buf);
if (err < 0) { line = __LINE__; goto err_hndl; }
@ -223,13 +204,13 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
/*
* ompi_coll_tuned_allgather_intra_recursivedoubling
* ompi_coll_base_allgather_intra_recursivedoubling
*
* Function: allgather using O(log(N)) steps.
* Accepts: Same arguments as MPI_Allgather
@ -239,29 +220,29 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
* This algorithm is used in MPICH-2 for small- and medium-sized
* messages on power-of-two processes.
*
* Limitation: Current implementation only works on power-of-two number of
* processes.
* Limitation: Current implementation only works on power-of-two number of
* processes.
* In case this algorithm is invoked on non-power-of-two
* processes, Bruck algorithm will be invoked.
*
*
* Memory requirements:
* No additional memory requirements beyond user-supplied buffers.
*
*
* Example on 4 nodes:
* Initialization: everyone has its own buffer at location rank in rbuf
* # 0 1 2 3
* # 0 1 2 3
* [0] [ ] [ ] [ ]
* [ ] [1] [ ] [ ]
* [ ] [ ] [2] [ ]
* [ ] [ ] [ ] [3]
* Step 0: exchange data with (rank ^ 2^0)
* # 0 1 2 3
* # 0 1 2 3
* [0] [0] [ ] [ ]
* [1] [1] [ ] [ ]
* [ ] [ ] [2] [2]
* [ ] [ ] [3] [3]
* Step 1: exchange data with (rank ^ 2^1) (if you can)
* # 0 1 2 3
* # 0 1 2 3
* [0] [0] [0] [0]
* [1] [1] [1] [1]
* [2] [2] [2] [2]
@ -269,12 +250,12 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
*
* TODO: Modify the algorithm to work with any number of nodes.
* We can modify code to use identical implementation like MPICH-2:
* - using recursive-halving algorithm, at the end of each step,
* - using recursive-halving algorithm, at the end of each step,
* determine if there are nodes who did not exchange their data in that
* step, and send them appropriate messages.
*/
int
ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
int
ompi_coll_base_allgather_intra_recursivedoubling(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -293,21 +274,21 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
pow2size >>=1;
/* Current implementation only handles power-of-two number of processes.
If the function was called on non-power-of-two number of processes,
If the function was called on non-power-of-two number of processes,
print warning and call bruck allgather algorithm with same parameters.
*/
if (pow2size != size) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm",
size));
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_recursivedoubling rank %d, size %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_recursivedoubling rank %d, size %d",
rank, size));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
@ -317,7 +298,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Initialization step:
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
receive buffer
*/
if (MPI_IN_PLACE != sbuf) {
@ -326,8 +307,8 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
/* Communication step:
At every step i, rank r:
- exchanges message with rank remote = (r ^ 2^i).
@ -347,7 +328,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
}
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
remote, MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
remote, MCA_COLL_BASE_TAG_ALLGATHER,
@ -359,7 +340,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@ -367,7 +348,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
/*
* ompi_coll_tuned_allgather_intra_ring
* ompi_coll_base_allgather_intra_ring
*
* Function: allgather using O(N) steps.
* Accepts: Same arguments as MPI_Allgather
@ -379,9 +360,9 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
* (r + 1) containing data from rank (r - i), with wrap arounds.
* Memory requirements:
* No additional memory requirements.
*
*
*/
int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
int ompi_coll_base_allgather_intra_ring(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -395,8 +376,8 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_ring rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_ring rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -413,15 +394,15 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
tmpsend = (char*) sbuf;
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
/* Communication step:
At every step i: 0 .. (P-1), rank r:
- receives message from [(r - 1 + size) % size] containing data from rank
[(r - i - 1 + size) % size]
- sends message to rank [(r + 1) % size] containing data from rank
[(r - i + size) % size]
- sends message which starts at begining of rbuf and has size
- sends message which starts at begining of rbuf and has size
*/
sendto = (rank + 1) % size;
recvfrom = (rank - 1 + size) % size;
@ -434,7 +415,7 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
tmpsend = (char*)rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)rcount * rext;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, sendto,
err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, sendto,
MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, rcount, rdtype, recvfrom,
MCA_COLL_BASE_TAG_ALLGATHER,
@ -446,34 +427,34 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
/*
* ompi_coll_tuned_allgather_intra_neighborexchange
* ompi_coll_base_allgather_intra_neighborexchange
*
* Function: allgather using N/2 steps (O(N))
* Accepts: Same arguments as MPI_Allgather
* Returns: MPI_SUCCESS or error code
*
* Description: Neighbor Exchange algorithm for allgather.
* Described by Chen et.al. in
* "Performance Evaluation of Allgather Algorithms on
* Described by Chen et.al. in
* "Performance Evaluation of Allgather Algorithms on
* Terascale Linux Cluster with Fast Ethernet",
* Proceedings of the Eighth International Conference on
* Proceedings of the Eighth International Conference on
* High-Performance Computing in Asia-Pacific Region
* (HPCASIA'05), 2005
*
*
* Rank r exchanges message with one of its neighbors and
* forwards the data further in the next step.
*
* No additional memory requirements.
*
*
* Limitations: Algorithm works only on even number of processes.
* For odd number of processes we switch to ring algorithm.
*
*
* Example on 6 nodes:
* Initial state
* # 0 1 2 3 4 5
@ -508,8 +489,8 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
* [4] [4] [4] [4] [4] [4]
* [5] [5] [5] [5] [5] [5]
*/
int
ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
int
ompi_coll_base_allgather_intra_neighborexchange(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -525,16 +506,16 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
if (size % 2) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
size));
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_neighborexchange rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_neighborexchange rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -551,7 +532,7 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
tmpsend = (char*) sbuf;
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
/* Determine neighbors, order in which blocks will arrive, etc. */
even_rank = !(rank % 2);
@ -573,15 +554,15 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
/* Communication loop:
- First step is special: exchange a single block with neighbor[0].
- Rest of the steps:
update recv_data_from according to offset, and
- Rest of the steps:
update recv_data_from according to offset, and
exchange two blocks with appropriate neighbor.
the send location becomes the previous receive location.
*/
tmprecv = (char*)rbuf + (ptrdiff_t)neighbor[0] * (ptrdiff_t)rcount * rext;
tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, rcount, rdtype, neighbor[0],
MCA_COLL_BASE_TAG_ALLGATHER,
@ -597,15 +578,15 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
for (i = 1; i < (size / 2); i++) {
const int i_parity = i % 2;
recv_data_from[i_parity] =
recv_data_from[i_parity] =
(recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
tmprecv = (char*)rbuf + (ptrdiff_t)recv_data_from[i_parity] * (ptrdiff_t)rcount * rext;
tmpsend = (char*)rbuf + (ptrdiff_t)send_data_from * rcount * rext;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
neighbor[i_parity],
err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
neighbor[i_parity],
MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
neighbor[i_parity],
@ -619,13 +600,13 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
int ompi_coll_base_allgather_intra_two_procs(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -638,8 +619,8 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgather_intra_two_procs rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_allgather_intra_two_procs rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -661,7 +642,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
}
tmprecv = (char*)rbuf + (ptrdiff_t)remote * (ptrdiff_t)rcount * rext;
err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote,
err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, rcount, rdtype, remote,
MCA_COLL_BASE_TAG_ALLGATHER,
@ -670,7 +651,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
/* Place your data in correct location if necessary */
if (MPI_IN_PLACE != sbuf) {
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
@ -678,7 +659,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@ -687,13 +668,13 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_tuned implementations. Dec06.
* JPG following the examples from other coll_base implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
@ -706,10 +687,10 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
ompi_coll_base_allgather_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf,
int rcount,
int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
@ -727,7 +708,7 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount);
sdtype = rdtype;
scount = rcount;
}
}
/* Gather and broadcast. */
@ -755,183 +736,3 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
}
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* module does not call this they call the forced_getvalues routine instead */
int
ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLGATHER] = coll_tuned_allgather_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_count",
"Number of allgather algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_allgather_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allgather_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allgather_algorithms", allgather_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm",
"Which allallgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allgather_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_segmentsize",
"Segment size in bytes used by default for allgather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_segment_size);
coll_tuned_allgather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_tree_fanout",
"Fanout for n-tree used for allgather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_tree_fanout);
coll_tuned_allgather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_chain_fanout",
"Fanout for chains used for allgather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allgather_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_forced selected algorithm %d",
data->user_forced[ALLGATHER].algorithm));
switch (data->user_forced[ALLGATHER].algorithm) {
case (0):
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_allgather_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_allgather_intra_bruck (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (3):
return ompi_coll_tuned_allgather_intra_recursivedoubling (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (4):
return ompi_coll_tuned_allgather_intra_ring (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (5):
return ompi_coll_tuned_allgather_intra_neighborexchange (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (6):
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[ALLGATHER].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_allgather_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_allgather_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (3):
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (4):
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (5):
return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (6):
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -30,19 +30,12 @@
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/* allgatherv algorithm variables */
static int coll_tuned_allgatherv_algorithm_count = 5;
static int coll_tuned_allgatherv_forced_algorithm = 0;
static int coll_tuned_allgatherv_segment_size = 0;
static int coll_tuned_allgatherv_tree_fanout;
static int coll_tuned_allgatherv_chain_fanout;
/* valid values for coll_tuned_allgatherv_forced_algorithm */
static mca_base_var_enum_value_t allgatherv_algorithms[] = {
/* valid values for coll_base_allgatherv_forced_algorithm */
mca_base_var_enum_value_t coll_base_allgatherv_algorithms[] = {
{0, "ignore"},
{1, "default"},
{2, "bruck"},
@ -53,7 +46,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
};
/*
* ompi_coll_tuned_allgatherv_intra_bruck
* ompi_coll_base_allgatherv_intra_bruck
*
* Function: allgatherv using O(log(N)) steps.
* Accepts: Same arguments as MPI_Allgatherv
@ -107,7 +100,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
* [5] [5] [5] [5] [5] [5] [5]
* [6] [6] [6] [6] [6] [6] [6]
*/
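A minimal standalone sketch of the O(log(N)) Bruck schedule the comment above describes; the peer orientation and per-round block counts are assumptions for illustration only, not a restatement of the exact loop in this routine.

#include <stdio.h>

int main(void)
{
    const int size = 7, rank = 2;   /* example communicator: rank 2 of 7 */
    int gathered = 1;               /* each rank already holds its own block */

    for (int distance = 1; distance < size; distance <<= 1) {
        int sendto   = (rank - distance + size) % size;  /* assumed orientation */
        int recvfrom = (rank + distance) % size;
        /* the last round may be partial */
        int blocks   = (2 * distance <= size) ? distance : size - distance;

        printf("distance %d: exchange %d block(s), send to %d, recv from %d\n",
               distance, blocks, sendto, recvfrom);
        gathered += blocks;
    }
    printf("gathered %d of %d blocks in ceil(log2(size)) rounds\n", gathered, size);
    return 0;
}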
int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
int ompi_coll_base_allgatherv_intra_bruck(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdispls,
@ -124,8 +117,8 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_bruck rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_bruck rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -198,7 +191,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(rbuf, 1, new_sdtype, sendto,
err = ompi_coll_base_sendrecv(rbuf, 1, new_sdtype, sendto,
MCA_COLL_BASE_TAG_ALLGATHERV,
rbuf, 1, new_rdtype, recvfrom,
MCA_COLL_BASE_TAG_ALLGATHERV,
@ -217,14 +210,14 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
err_hndl:
if( NULL != new_rcounts ) free(new_rcounts);
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
/*
* ompi_coll_tuned_allgatherv_intra_ring
* ompi_coll_base_allgatherv_intra_ring
*
* Function: allgatherv using O(N) steps.
* Accepts: Same arguments as MPI_Allgatherv
@ -238,7 +231,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
* No additional memory requirements.
*
*/
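A minimal standalone sketch of the ring schedule described above, assuming the usual orientation where every rank always sends to rank+1 and receives from rank-1; the senddatafrom/recvdatafrom indexing mirrors how the function body below picks rdisps entries.

#include <stdio.h>

int main(void)
{
    const int size = 5, rank = 2;              /* example communicator */
    const int sendto   = (rank + 1) % size;
    const int recvfrom = (rank - 1 + size) % size;

    /* size - 1 steps; at step i the block that originated on rank
       (rank - i) mod size is forwarded and the block that originated on
       (rank - i - 1) mod size arrives. */
    for (int i = 0; i < size - 1; i++) {
        int senddatafrom = (rank - i + size) % size;
        int recvdatafrom = (rank - i - 1 + size) % size;
        printf("step %d: send block %d to %d, recv block %d from %d\n",
               i, senddatafrom, sendto, recvdatafrom, recvfrom);
    }
    return 0;
}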
int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
int ompi_coll_base_allgatherv_intra_ring(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
@ -252,8 +245,8 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_ring rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgatherv_intra_ring rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -292,7 +285,7 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
tmpsend = (char*)rbuf + rdisps[senddatafrom] * rext;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, rcounts[senddatafrom], rdtype,
err = ompi_coll_base_sendrecv(tmpsend, rcounts[senddatafrom], rdtype,
sendto, MCA_COLL_BASE_TAG_ALLGATHERV,
tmprecv, rcounts[recvdatafrom], rdtype,
recvfrom, MCA_COLL_BASE_TAG_ALLGATHERV,
@ -304,13 +297,13 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
/*
* ompi_coll_tuned_allgatherv_intra_neighborexchange
* ompi_coll_base_allgatherv_intra_neighborexchange
*
* Function: allgatherv using N/2 steps (O(N))
* Accepts: Same arguments as MPI_Allgatherv
@ -368,7 +361,7 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
* [5] [5] [5] [5] [5] [5]
*/
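A minimal standalone sketch of the pairing behind the N/2-step count, assuming even ranks pair to the right first and odd ranks to the left first; odd communicator sizes fall back to the ring version, as the code further down shows.

#include <stdio.h>

int main(void)
{
    const int size = 6;               /* must be even for this algorithm */

    for (int rank = 0; rank < size; rank++) {
        int first, second;
        if (0 == (rank % 2)) {        /* even ranks pair to the right first */
            first  = (rank + 1) % size;
            second = (rank - 1 + size) % size;
        } else {                      /* odd ranks pair to the left first */
            first  = (rank - 1 + size) % size;
            second = (rank + 1) % size;
        }
        /* size/2 steps: one block with the first partner in step 0, then
           two blocks per step while alternating between the two partners. */
        printf("rank %d: partners %d and %d, %d steps\n",
               rank, first, second, size / 2);
    }
    return 0;
}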
int
ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
ompi_coll_base_allgatherv_intra_neighborexchange(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdispls,
struct ompi_datatype_t *rdtype,
@ -386,17 +379,17 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
if (size % 2) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
size));
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts,
rdispls, rdtype,
comm, module);
}
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_neighborexchange rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgatherv_intra_neighborexchange rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -445,7 +438,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
*/
tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[neighbor[0]] * rext;
tmpsend = (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext;
err = ompi_coll_tuned_sendrecv(tmpsend, rcounts[rank], rdtype,
err = ompi_coll_base_sendrecv(tmpsend, rcounts[rank], rdtype,
neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
tmprecv, rcounts[neighbor[0]], rdtype,
neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
@ -493,7 +486,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
tmpsend = (char*)rbuf;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
err = ompi_coll_base_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
MCA_COLL_BASE_TAG_ALLGATHERV,
tmprecv, 1, new_rdtype, neighbor[i_parity],
MCA_COLL_BASE_TAG_ALLGATHERV,
@ -509,13 +502,13 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
int ompi_coll_base_allgatherv_intra_two_procs(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts,
int *rdispls,
@ -529,8 +522,8 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgatherv_intra_two_procs rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_allgatherv_intra_two_procs rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@ -552,7 +545,7 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
}
tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[remote] * rext;
err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote,
err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
MCA_COLL_BASE_TAG_ALLGATHERV,
tmprecv, rcounts[remote], rdtype, remote,
MCA_COLL_BASE_TAG_ALLGATHERV,
@ -570,7 +563,7 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@ -580,12 +573,12 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_tuned implementations. Dec06.
* JPG following the examples from other coll_base implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
@ -599,7 +592,7 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
ompi_coll_base_allgatherv_intra_basic_default(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *disps,
@ -619,8 +612,8 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
* to process with rank 0 (OMPI convention)
*/
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgatherv_intra_basic_default rank %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_allgatherv_intra_basic_default rank %d",
rank));
if (MPI_IN_PLACE == sbuf) {
@ -676,177 +669,3 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* module does not call this they call the forced_getvalues routine instead */
int
ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV] = coll_tuned_allgatherv_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_count",
"Number of allgatherv algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_allgatherv_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allgatherv_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allgatherv_algorithms", allgatherv_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm",
"Which allallgatherv algorithm is used. Can be locked down to choice of: 0 ignore, 1 default (allgathervv + bcast), 2 bruck, 3 ring, 4 neighbor exchange, 5: two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allgatherv_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_segmentsize",
"Segment size in bytes used by default for allgatherv algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_segment_size);
coll_tuned_allgatherv_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_tree_fanout",
"Fanout for n-tree used for allgatherv algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_tree_fanout);
coll_tuned_allgatherv_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_chain_fanout",
"Fanout for chains used for allgatherv algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_forced selected algorithm %d",
data->user_forced[ALLGATHERV].algorithm));
switch (data->user_forced[ALLGATHERV].algorithm) {
case (0):
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_allgatherv_intra_basic_default (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_allgatherv_intra_bruck (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (3):
return ompi_coll_tuned_allgatherv_intra_ring (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (4):
return ompi_coll_tuned_allgatherv_intra_neighborexchange (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (5):
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[ALLGATHERV].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout,
int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_allgatherv_intra_basic_default(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (3):
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (4):
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (5):
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -31,41 +31,23 @@
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* allreduce algorithm variables */
static int coll_tuned_allreduce_algorithm_count = 5;
static int coll_tuned_allreduce_forced_algorithm = 0;
static int coll_tuned_allreduce_segment_size = 0;
static int coll_tuned_allreduce_tree_fanout;
static int coll_tuned_allreduce_chain_fanout;
/* valid values for coll_tuned_allreduce_forced_algorithm */
static mca_base_var_enum_value_t allreduce_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "nonoverlapping"},
{3, "recursive_doubling"},
{4, "ring"},
{5, "segmented_ring"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/*
* ompi_coll_tuned_allreduce_intra_nonoverlapping
* ompi_coll_base_allreduce_intra_nonoverlapping
*
* This function just calls a reduce followed by a broadcast
* both called functions are tuned but they complete sequentially,
* both called functions are base but they complete sequentially,
* i.e. no additional overlapping
* meaning if the number of segments used is greater than the topo depth
* meaning if the number of segments used is greater than the topo depth
* then once the first segment of data is fully 'reduced' it is not broadcast
* while the reduce continues (cost = cost-reduce + cost-bcast + decision x 3)
*
*/
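For illustration, the same reduce-then-broadcast composition written against the public MPI API; a sketch only, since the actual routine below dispatches through comm->c_coll rather than calling MPI directly.

#include <mpi.h>

/* Illustrative equivalent of the non-overlapping allreduce: a reduce onto
   rank 0 followed by a broadcast from rank 0, run strictly one after the
   other (no pipelining between the two phases). */
int allreduce_nonoverlapping_sketch(void *sbuf, void *rbuf, int count,
                                    MPI_Datatype dtype, MPI_Op op,
                                    MPI_Comm comm)
{
    int rank, err;
    MPI_Comm_rank(comm, &rank);

    if (MPI_IN_PLACE == sbuf) {
        /* Data starts in rbuf on every rank. */
        err = MPI_Reduce((0 == rank) ? MPI_IN_PLACE : rbuf,
                         (0 == rank) ? rbuf : NULL,
                         count, dtype, op, 0, comm);
    } else {
        err = MPI_Reduce(sbuf, rbuf, count, dtype, op, 0, comm);
    }
    if (MPI_SUCCESS != err) return err;

    return MPI_Bcast(rbuf, count, dtype, 0, comm);
}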
int
ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count,
ompi_coll_base_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
@ -75,16 +57,16 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_nonoverlapping rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_nonoverlapping rank %d", rank));
/* Reduce to 0 and broadcast. */
if (MPI_IN_PLACE == sbuf) {
if (0 == rank) {
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, rbuf, count, dtype,
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, rbuf, count, dtype,
op, 0, comm, comm->c_coll.coll_reduce_module);
} else {
err = comm->c_coll.coll_reduce (rbuf, NULL, count, dtype, op, 0,
err = comm->c_coll.coll_reduce (rbuf, NULL, count, dtype, op, 0,
comm, comm->c_coll.coll_reduce_module);
}
} else {
@ -100,21 +82,21 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
}
/*
* ompi_coll_tuned_allreduce_intra_recursivedoubling
* ompi_coll_base_allreduce_intra_recursivedoubling
*
* Function: Recursive doubling algorithm for allreduce operation
* Accepts: Same as MPI_Allreduce()
* Returns: MPI_SUCCESS or error code
*
* Description: Implements recursive doubling algorithm for allreduce.
* Original (non-segmented) implementation is used in MPICH-2
* Description: Implements recursive doubling algorithm for allreduce.
* Original (non-segmented) implementation is used in MPICH-2
* for small and intermediate size messages.
* The algorithm preserves order of operations so it can
* The algorithm preserves order of operations so it can
* be used both by commutative and non-commutative operations.
*
* Example on 7 nodes:
* Initial state
* # 0 1 2 3 4 5 6
* # 0 1 2 3 4 5 6
* [0] [1] [2] [3] [4] [5] [6]
* Initial adjustment step for non-power of two nodes.
* old rank 1 3 5 6
@ -129,24 +111,24 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
* old rank 1 3 5 6
* new rank 0 1 2 3
* [0+1+] [0+1+] [0+1+] [0+1+]
* [2+3+] [2+3+] [2+3+] [2+3+]
* [2+3+] [2+3+] [2+3+] [2+3+]
* [4+5+] [4+5+] [4+5+] [4+5+]
* [6 ] [6 ] [6 ] [6 ]
* Final adjustment step for non-power of two nodes
* # 0 1 2 3 4 5 6
* # 0 1 2 3 4 5 6
* [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+]
* [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+]
* [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+]
* [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+]
* [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ]
*
*/
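A minimal standalone sketch of the non-power-of-two folding illustrated by the 7-node example above; the names adjsize, extra_ranks and newrank mirror the variables used further down in this function, and the printed table reproduces the "old rank / new rank" rows of the comment.

#include <stdio.h>

int main(void)
{
    const int size = 7;                    /* matches the 7-node example */
    int adjsize = 1;
    while (adjsize <= size) adjsize <<= 1;
    adjsize >>= 1;                         /* largest power of two <= size */
    int extra_ranks = size - adjsize;      /* 7 - 4 = 3 */

    for (int rank = 0; rank < size; rank++) {
        int newrank;
        if (rank < 2 * extra_ranks) {
            /* even member of each pair folds its data into the odd one and
               then sits out the doubling loop (newrank == -1) */
            newrank = (rank % 2) ? rank / 2 : -1;
        } else {
            newrank = rank - extra_ranks;
        }
        printf("old rank %d -> new rank %d\n", rank, newrank);
    }
    /* the doubling loop then pairs new ranks at distances 1, 2, 4, ...
       up to adjsize/2, i.e. log2(adjsize) exchange steps */
    return 0;
}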
int
ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
int
ompi_coll_base_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
mca_coll_base_module_t *module)
{
int ret, line, rank, size, adjsize, remote, distance;
int newrank, newremote, extra_ranks;
@ -157,9 +139,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allreduce_intra_recursivedoubling rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allreduce_intra_recursivedoubling rank %d", rank));
/* Special case for size == 1 */
if (1 == size) {
if (MPI_IN_PLACE != sbuf) {
@ -194,16 +176,16 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
adjsize >>= 1;
/* Handle non-power-of-two case:
- Even ranks less than 2 * extra_ranks send their data to (rank + 1), and
- Even ranks less than 2 * extra_ranks send their data to (rank + 1), and
sets new rank to -1.
- Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
- Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
apply appropriate operation, and set new rank to rank/2
- Everyone else sets rank to rank - extra_ranks
*/
extra_ranks = size - adjsize;
if (rank < (2 * extra_ranks)) {
if (0 == (rank % 2)) {
ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
@ -221,7 +203,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
newrank = rank - extra_ranks;
}
/* Communication/Computation loop
/* Communication/Computation loop
- Exchange message with remote node.
- Perform appropriate operation taking in account order of operations:
result = value (op) result
@ -230,14 +212,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
if (newrank < 0) break;
/* Determine remote node */
newremote = newrank ^ distance;
remote = (newremote < extra_ranks)?
remote = (newremote < extra_ranks)?
(newremote * 2 + 1):(newremote + extra_ranks);
/* Exchange the data */
ret = MCA_PML_CALL(irecv(tmprecv, count, dtype, remote,
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[0]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote,
ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote,
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
@ -258,14 +240,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
}
/* Handle non-power-of-two case:
- Odd ranks less than 2 * extra_ranks send result from tmpsend to
- Odd ranks less than 2 * extra_ranks send result from tmpsend to
(rank - 1)
- Even ranks less than 2 * extra_ranks receive result from (rank + 1)
*/
if (rank < (2 * extra_ranks)) {
if (0 == (rank % 2)) {
ret = MCA_PML_CALL(recv(rbuf, count, dtype, (rank + 1),
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
MPI_STATUS_IGNORE));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
tmpsend = (char*)rbuf;
@ -287,14 +269,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
__FILE__, line, rank, ret));
if (NULL != inplacebuf) free(inplacebuf);
return ret;
}
/*
* ompi_coll_tuned_allreduce_intra_ring
* ompi_coll_base_allreduce_intra_ring
*
* Function: Ring algorithm for allreduce operation
* Accepts: Same as MPI_Allreduce()
@ -304,9 +286,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* automatically segmented to segment of size M/N.
* Algorithm requires 2*N - 1 steps.
*
* Limitations: The algorithm DOES NOT preserve order of operations so it
* Limitations: The algorithm DOES NOT preserve order of operations so it
* can be used only for commutative operations.
* In addition, algorithm cannot work if the total count is
* In addition, algorithm cannot work if the total count is
* less than size.
* Example on 5 nodes:
* Initial state
@ -318,7 +300,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [04] [14] [24] [34] [44]
*
* COMPUTATION PHASE
* Step 0: rank r sends block r to rank (r+1) and receives block (r-1)
* Step 0: rank r sends block r to rank (r+1) and receives block (r-1)
* from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [00+10] [20] [30] [40]
@ -327,7 +309,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [03] [13] [23] [33] [33+43]
* [44+04] [14] [24] [34] [44]
*
* Step 1: rank r sends block (r-1) to rank (r+1) and receives block
* Step 1: rank r sends block (r-1) to rank (r+1) and receives block
* (r-2) from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [00+10] [01+10+20] [30] [40]
@ -336,7 +318,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [33+43+03] [13] [23] [33] [33+43]
* [44+04] [44+04+14] [24] [34] [44]
*
* Step 2: rank r sends block (r-2) to rank (r+1) and receives block
* Step 2: rank r sends block (r-2) to rank (r+1) and receives block
* (r-3) from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [00+10] [01+10+20] [01+10+20+30] [40]
@ -345,7 +327,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [33+43+03] [33+43+03+13] [23] [33] [33+43]
* [44+04] [44+04+14] [44+04+14+24] [34] [44]
*
* Step 3: rank r sends block (r-3) to rank (r+1) and receives block
* Step 3: rank r sends block (r-3) to rank (r+1) and receives block
* (r-4) from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [00+10] [01+10+20] [01+10+20+30] [FULL]
@ -353,16 +335,16 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [22+32+42+02] [FULL] [22] [22+32] [22+32+42]
* [33+43+03] [33+43+03+13] [FULL] [33] [33+43]
* [44+04] [44+04+14] [44+04+14+24] [FULL] [44]
*
*
* DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1.
*
*/
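A minimal standalone sketch, with example numbers only, of the early/late block split the ring relies on; it performs the same arithmetic that the COLL_BASE_COMPUTE_BLOCKCOUNT call and the block_offset computations in the function body below perform.

#include <stdio.h>

/* Split `count` elements into `nblocks` contiguous blocks: the first
   `split` blocks carry one extra element ("early"), the rest are "late". */
static void compute_blockcount(int count, int nblocks,
                               int *split, int *early, int *late)
{
    *early = *late = count / nblocks;
    *split = count % nblocks;
    if (0 != *split) (*early)++;
}

int main(void)
{
    const int count = 23, size = 5;        /* illustrative values */
    int split, early, late;
    compute_blockcount(count, size, &split, &early, &late);

    for (int block = 0; block < size; block++) {
        int offset = (block < split) ? block * early
                                     : block * late + split;
        int len    = (block < split) ? early : late;
        printf("block %d: offset %d, %d element(s)\n", block, offset, len);
    }
    return 0;
}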
int
ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
int
ompi_coll_base_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
mca_coll_base_module_t *module)
{
int ret, line, rank, size, k, recv_from, send_to, block_count, inbi;
int early_segcount, late_segcount, split_rank, max_segcount;
@ -375,9 +357,9 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allreduce_intra_ring rank %d, count %d", rank, count));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allreduce_intra_ring rank %d, count %d", rank, count));
/* Special case for size == 1 */
if (1 == size) {
if (MPI_IN_PLACE != sbuf) {
@ -389,10 +371,10 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
/* Special case for count less than size - use recursive doubling */
if (count < size) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count));
return (ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count));
return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
count,
dtype, op,
dtype, op,
comm, module));
}
@ -404,14 +386,14 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
ret = ompi_datatype_type_size( dtype, &typelng);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Determine the number of elements per block and corresponding
/* Determine the number of elements per block and corresponding
block sizes.
The blocks are divided into "early" and "late" ones:
blocks 0 .. (split_rank - 1) are "early" and
blocks 0 .. (split_rank - 1) are "early" and
blocks (split_rank) .. (size - 1) are "late".
Early blocks are at most 1 element larger than the late ones.
*/
COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
COLL_BASE_COMPUTE_BLOCKCOUNT( count, size, split_rank,
early_segcount, late_segcount );
max_segcount = early_segcount;
max_real_segsize = true_extent + (max_segcount - 1) * extent;
@ -432,7 +414,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
/* Computation loop */
/*
/*
For each of the remote nodes:
- post irecv for block (r-1)
- send block (r)
@ -456,8 +438,8 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Send first block (my block) to the neighbor on the right */
block_offset = ((rank < split_rank)?
((ptrdiff_t)rank * (ptrdiff_t)early_segcount) :
block_offset = ((rank < split_rank)?
((ptrdiff_t)rank * (ptrdiff_t)early_segcount) :
((ptrdiff_t)rank * (ptrdiff_t)late_segcount + split_rank));
block_count = ((rank < split_rank)? early_segcount : late_segcount);
tmpsend = ((char*)rbuf) + block_offset * extent;
@ -465,21 +447,21 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
for (k = 2; k < size; k++) {
const int prevblock = (rank + size - k + 1) % size;
inbi = inbi ^ 0x1;
/* Post irecv for the current block */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Wait on previous block to arrive */
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on previous block: result goes to rbuf
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
*/
@ -489,7 +471,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
block_count = ((prevblock < split_rank)? early_segcount : late_segcount);
tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype);
/* send previous block to send_to */
ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
@ -501,7 +483,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on the last block (from neighbor (rank + 1)
/* Apply operation on the last block (from neighbor (rank + 1)
rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
recv_from = (rank + 1) % size;
block_offset = ((recv_from < split_rank)?
@ -510,28 +492,28 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
block_count = ((recv_from < split_rank)? early_segcount : late_segcount);
tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype);
/* Distribution loop - variation of ring allgather */
send_to = (rank + 1) % size;
recv_from = (rank + size - 1) % size;
for (k = 0; k < size - 1; k++) {
const int recv_data_from = (rank + size - k) % size;
const int send_data_from = (rank + 1 + size - k) % size;
const int send_block_offset =
const int send_block_offset =
((send_data_from < split_rank)?
((ptrdiff_t)send_data_from * early_segcount) :
((ptrdiff_t)send_data_from * late_segcount + split_rank));
const int recv_block_offset =
const int recv_block_offset =
((recv_data_from < split_rank)?
((ptrdiff_t)recv_data_from * early_segcount) :
((ptrdiff_t)recv_data_from * late_segcount + split_rank));
block_count = ((send_data_from < split_rank)?
block_count = ((send_data_from < split_rank)?
early_segcount : late_segcount);
tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent;
tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent;
ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to,
ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
tmprecv, max_segcount, dtype, recv_from,
MCA_COLL_BASE_TAG_ALLREDUCE,
@ -546,7 +528,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
__FILE__, line, rank, ret));
if (NULL != inbuf[0]) free(inbuf[0]);
if (NULL != inbuf[1]) free(inbuf[1]);
@ -554,30 +536,30 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
}
/*
* ompi_coll_tuned_allreduce_intra_ring_segmented
* ompi_coll_base_allreduce_intra_ring_segmented
*
* Function: Pipelined ring algorithm for allreduce operation
* Accepts: Same as MPI_Allreduce(), segment size
* Returns: MPI_SUCCESS or error code
*
* Description: Implements pipelined ring algorithm for allreduce:
* Description: Implements pipelined ring algorithm for allreduce:
* user supplies suggested segment size for the pipelining of
* reduce operation.
* The segment size determines the number of phases, np, for
* the algorithm execution.
* The message is automatically divided into blocks of
* The segment size determines the number of phases, np, for
* the algorithm execution.
* The message is automatically divided into blocks of
* approximately (count / (np * segcount)) elements.
* At the end of reduction phase, allgather like step is
* At the end of reduction phase, allgather like step is
* executed.
* Algorithm requires (np + 1)*(N - 1) steps.
*
* Limitations: The algorithm DOES NOT preserve order of operations so it
* Limitations: The algorithm DOES NOT preserve order of operations so it
* can be used only for commutative operations.
* In addition, algorithm cannot work if the total size is
* In addition, algorithm cannot work if the total size is
* less than size * segment size.
* Example on 3 nodes with 2 phases
* Initial state
* # 0 1 2
* # 0 1 2
* [00a] [10a] [20a]
* [00b] [10b] [20b]
* [01a] [11a] [21a]
@ -586,9 +568,9 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
* [02b] [12b] [22b]
*
* COMPUTATION PHASE 0 (a)
* Step 0: rank r sends block ra to rank (r+1) and receives block (r-1)a
* Step 0: rank r sends block ra to rank (r+1) and receives block (r-1)a
* from rank (r-1) [with wraparound].
* # 0 1 2
* # 0 1 2
* [00a] [00a+10a] [20a]
* [00b] [10b] [20b]
* [01a] [11a] [11a+21a]
@ -596,20 +578,20 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
* [22a+02a] [12a] [22a]
* [02b] [12b] [22b]
*
* Step 1: rank r sends block (r-1)a to rank (r+1) and receives block
* Step 1: rank r sends block (r-1)a to rank (r+1) and receives block
* (r-2)a from rank (r-1) [with wraparound].
* # 0 1 2
* # 0 1 2
* [00a] [00a+10a] [00a+10a+20a]
* [00b] [10b] [20b]
* [11a+21a+01a] [11a] [11a+21a]
* [01b] [11b] [21b]
* [22a+02a] [22a+02a+12a] [22a]
* [02b] [12b] [22b]
* [02b] [12b] [22b]
*
* COMPUTATION PHASE 1 (b)
* Step 0: rank r sends block rb to rank (r+1) and receives block (r-1)b
* Step 0: rank r sends block rb to rank (r+1) and receives block (r-1)b
* from rank (r-1) [with wraparound].
* # 0 1 2
* # 0 1 2
* [00a] [00a+10a] [20a]
* [00b] [00b+10b] [20b]
* [01a] [11a] [11a+21a]
@ -617,31 +599,31 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
* [22a+02a] [12a] [22a]
* [22b+02b] [12b] [22b]
*
* Step 1: rank r sends block (r-1)b to rank (r+1) and receives block
* Step 1: rank r sends block (r-1)b to rank (r+1) and receives block
* (r-2)b from rank (r-1) [with wraparound].
* # 0 1 2
* # 0 1 2
* [00a] [00a+10a] [00a+10a+20a]
* [00b] [10b] [00b+10b+20b]
* [11a+21a+01a] [11a] [11a+21a]
* [11b+21b+01b] [11b] [21b]
* [22a+02a] [22a+02a+12a] [22a]
* [02b] [22b+01b+12b] [22b]
* [02b] [22b+01b+12b] [22b]
*
*
*
* DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1 (same as
* in regular ring algorithm.
*
*/
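A minimal standalone sketch, with example numbers only, of how the phase count falls out of the requested segment size; the segment-count step is a simplification of the COLL_BASE_COMPUTED_SEGCOUNT macro used in the body below (its rounding refinements are omitted).

#include <stdio.h>

int main(void)
{
    const int count = 1000;          /* elements in the reduction            */
    const int size = 4;              /* communicator size                    */
    const size_t typelng = 8;        /* bytes per element (e.g. double)      */
    const unsigned segsize = 1024;   /* requested segment size in bytes      */

    /* Simplified segment-count computation: elements per segment. */
    int segcount = count;
    if (segsize >= typelng && segsize < typelng * (size_t)segcount)
        segcount = (int)(segsize / typelng);           /* 1024 / 8 = 128 */

    /* Number of pipeline phases, as in the function body below. */
    int num_phases = count / (size * segcount);        /* 1000 / 512 = 1 */
    if ((count % (size * segcount) >= size) &&
        (count % (size * segcount) > ((size * segcount) / 2)))
        num_phases++;                                   /* remainder 488 > 256 */

    printf("segcount = %d, num_phases = %d\n", segcount, num_phases);
    return 0;
}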
int
ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count,
int
ompi_coll_base_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
uint32_t segsize)
uint32_t segsize)
{
int ret, line, rank, size, k, recv_from, send_to;
int early_blockcount, late_blockcount, split_rank;
int early_blockcount, late_blockcount, split_rank;
int segcount, max_segcount, num_phases, phase, block_count, inbi;
size_t typelng;
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
@ -652,9 +634,9 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allreduce_intra_ring_segmented rank %d, count %d", rank, count));
/* Special case for size == 1 */
if (1 == size) {
if (MPI_IN_PLACE != sbuf) {
@ -672,34 +654,34 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
ret = ompi_datatype_type_size( dtype, &typelng);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
segcount = count;
COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)
COLL_BASE_COMPUTED_SEGCOUNT(segsize, typelng, segcount)
/* Special case for count less than size * segcount - use regular ring */
if (count < (size * segcount)) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count));
return (ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count));
return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
comm, module));
}
/* Determine the number of phases of the algorithm */
num_phases = count / (size * segcount);
if ((count % (size * segcount) >= size) &&
if ((count % (size * segcount) >= size) &&
(count % (size * segcount) > ((size * segcount) / 2))) {
num_phases++;
}
/* Determine the number of elements per block and corresponding
/* Determine the number of elements per block and corresponding
block sizes.
The blocks are divided into "early" and "late" ones:
blocks 0 .. (split_rank - 1) are "early" and
blocks 0 .. (split_rank - 1) are "early" and
blocks (split_rank) .. (size - 1) are "late".
Early blocks are at most 1 element larger than the late ones.
Note, these blocks will be split into num_phases segments,
out of the largest one will have max_segcount elements.
*/
COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
COLL_BASE_COMPUTE_BLOCKCOUNT( count, size, split_rank,
early_blockcount, late_blockcount );
COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
COLL_BASE_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
max_segcount, k);
max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent;
@ -722,7 +704,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
ptrdiff_t phase_offset;
int early_phase_segcount, late_phase_segcount, split_phase, phase_count;
/*
/*
For each of the remote nodes:
- post irecv for block (r-1)
- send block (r)
@ -741,7 +723,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
*/
send_to = (rank + 1) % size;
recv_from = (rank + size - 1) % size;
inbi = 0;
/* Initialize first receive from the neighbor on the left */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
@ -750,81 +732,81 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
/* Send first block (my block) to the neighbor on the right:
- compute my block and phase offset
- send data */
block_offset = ((rank < split_rank)?
((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) :
block_offset = ((rank < split_rank)?
((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)rank * (ptrdiff_t)late_blockcount + split_rank));
block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
early_phase_segcount, late_phase_segcount)
phase_count = ((phase < split_phase)?
(early_phase_segcount) : (late_phase_segcount));
phase_offset = ((phase < split_phase)?
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
tmpsend = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
ret = MCA_PML_CALL(send(tmpsend, phase_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
for (k = 2; k < size; k++) {
const int prevblock = (rank + size - k + 1) % size;
inbi = inbi ^ 0x1;
/* Post irecv for the current block */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
&reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Wait on previous block to arrive */
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on previous block: result goes to rbuf
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
*/
block_offset = ((prevblock < split_rank)?
((ptrdiff_t)prevblock * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)prevblock * (ptrdiff_t)late_blockcount + split_rank));
block_count = ((prevblock < split_rank)?
block_count = ((prevblock < split_rank)?
early_blockcount : late_blockcount);
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
early_phase_segcount, late_phase_segcount)
phase_count = ((phase < split_phase)?
(early_phase_segcount) : (late_phase_segcount));
phase_offset = ((phase < split_phase)?
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype);
/* send previous block to send_to */
ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
}
/* Wait on the last block to arrive */
ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on the last block (from neighbor (rank + 1)
/* Apply operation on the last block (from neighbor (rank + 1)
rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
recv_from = (rank + 1) % size;
block_offset = ((recv_from < split_rank)?
((ptrdiff_t)recv_from * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)recv_from * (ptrdiff_t)late_blockcount + split_rank));
block_count = ((recv_from < split_rank)?
block_count = ((recv_from < split_rank)?
early_blockcount : late_blockcount);
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
early_phase_segcount, late_phase_segcount)
phase_count = ((phase < split_phase)?
(early_phase_segcount) : (late_phase_segcount));
phase_offset = ((phase < split_phase)?
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype);
@ -836,21 +818,21 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
for (k = 0; k < size - 1; k++) {
const int recv_data_from = (rank + size - k) % size;
const int send_data_from = (rank + 1 + size - k) % size;
const int send_block_offset =
const int send_block_offset =
((send_data_from < split_rank)?
((ptrdiff_t)send_data_from * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)send_data_from * (ptrdiff_t)late_blockcount + split_rank));
const int recv_block_offset =
const int recv_block_offset =
((recv_data_from < split_rank)?
((ptrdiff_t)recv_data_from * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)recv_data_from * (ptrdiff_t)late_blockcount + split_rank));
block_count = ((send_data_from < split_rank)?
block_count = ((send_data_from < split_rank)?
early_blockcount : late_blockcount);
tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent;
tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent;
ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to,
ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
tmprecv, early_blockcount, dtype, recv_from,
MCA_COLL_BASE_TAG_ALLREDUCE,
@ -865,7 +847,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
__FILE__, line, rank, ret));
if (NULL != inbuf[0]) free(inbuf[0]);
if (NULL != inbuf[1]) free(inbuf[1]);
@ -875,8 +857,8 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -895,7 +877,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
ompi_coll_base_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
@ -905,158 +887,28 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_basic_linear rank %d", rank));
/* Reduce to 0 and broadcast. */
if (MPI_IN_PLACE == sbuf) {
if (0 == rank) {
err = ompi_coll_tuned_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype,
err = ompi_coll_base_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype,
op, 0, comm, module);
} else {
err = ompi_coll_tuned_reduce_intra_basic_linear(rbuf, NULL, count, dtype,
err = ompi_coll_base_reduce_intra_basic_linear(rbuf, NULL, count, dtype,
op, 0, comm, module);
}
} else {
err = ompi_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
err = ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
op, 0, comm, module);
}
if (MPI_SUCCESS != err) {
return err;
}
return ompi_coll_tuned_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module);
return ompi_coll_base_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module);
}
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = coll_tuned_allreduce_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_count",
"Number of allreduce algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_allreduce_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allreduce_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allreduce_algorithms", allreduce_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm",
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allreduce_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_segmentsize",
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_segment_size);
coll_tuned_allreduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_tree_fanout",
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_tree_fanout);
coll_tuned_allreduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_chain_fanout",
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d, segment size %d",
data->user_forced[ALLREDUCE].algorithm,
data->user_forced[ALLREDUCE].segsize));
switch (data->user_forced[ALLREDUCE].algorithm) {
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm, module);
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm, module);
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm, module);
case (3): return ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, count, dtype, op, comm, module);
case (4): return ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, op, comm, module);
case (5): return ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, count, dtype, op, comm, module, data->user_forced[ALLREDUCE].segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[ALLREDUCE].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm, module);
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm, module);
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm, module);
case (3): return ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, count, dtype, op, comm, module);
case (4): return ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, op, comm, module);
case (5): return ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, count, dtype, op, comm, module, segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,37 +30,18 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* alltoall algorithm variables */
static int coll_tuned_alltoall_algorithm_count = 5;
static int coll_tuned_alltoall_forced_algorithm = 0;
static int coll_tuned_alltoall_segment_size = 0;
static int coll_tuned_alltoall_max_requests;
static int coll_tuned_alltoall_tree_fanout;
static int coll_tuned_alltoall_chain_fanout;
/* valid values for coll_tuned_alltoall_forced_algorithm */
static mca_base_var_enum_value_t alltoall_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "pairwise"},
{3, "modified_bruck"},
{4, "linear_sync"},
{5, "two_proc"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */
static int
mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
mca_coll_base_alltoall_intra_basic_inplace(void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
int i, j, size, rank, err=MPI_SUCCESS;
MPI_Request *preq;
char *tmp_buffer;
@ -91,7 +72,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
for (i = 0 ; i < size ; ++i) {
for (j = i+1 ; j < size ; ++j) {
/* Initiate all send/recv to/from others. */
preq = tuned_module->tuned_data->mcct_reqs;
preq = base_module->base_data->mcct_reqs;
if (i == rank) {
/* Copy the data into the temporary buffer */
@ -128,11 +109,8 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
}
/* Wait for the requests to complete */
err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE);
err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Free the requests. */
mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2);
}
}
@ -145,7 +123,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
return err;
}
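
The in-place helper above swaps blocks pairwise through the communicator's preallocated request pair. The same data movement can be expressed much more compactly with plain MPI, at the cost of less overlap; the sketch below is purely illustrative and is not the code path taken here.

#include <mpi.h>

/* Sketch only: in-place alltoall expressed with MPI_Sendrecv_replace.
 * Block p of rbuf (the data destined for rank p) is swapped directly with
 * peer p, so afterwards block p holds the data received from p.  The
 * increasing peer order always leaves a matched pair, so it cannot deadlock. */
static int alltoall_inplace_sketch(void *rbuf, int rcount, MPI_Datatype rdtype,
                                   MPI_Comm comm)
{
    int rank, size, p, err;
    MPI_Aint lb, ext;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    MPI_Type_get_extent(rdtype, &lb, &ext);
    for (p = 0; p < size; p++) {
        if (p == rank) continue;                  /* own block stays in place */
        err = MPI_Sendrecv_replace((char *)rbuf + (MPI_Aint)p * rcount * ext,
                                   rcount, rdtype, p, 0, p, 0,
                                   comm, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) return err;
    }
    return MPI_SUCCESS;
}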
int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_pairwise(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -157,22 +135,22 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
ptrdiff_t lb, sext, rext;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoall_intra_pairwise rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:alltoall_intra_pairwise rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
err = ompi_datatype_get_extent (rdtype, &lb, &rext);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* Perform pairwise exchange - starting from 1 so the local copy is last */
for (step = 1; step < size + 1; step++) {
@ -185,25 +163,25 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
tmprecv = (char*)rbuf + (ptrdiff_t)recvfrom * rext * (ptrdiff_t)rcount;
/* send and receive */
err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto,
err = ompi_coll_base_sendrecv( tmpsend, scount, sdtype, sendto,
MCA_COLL_BASE_TAG_ALLTOALL,
tmprecv, rcount, rdtype, recvfrom,
tmprecv, rcount, rdtype, recvfrom,
MCA_COLL_BASE_TAG_ALLTOALL,
comm, MPI_STATUS_IGNORE, rank);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line,
err, rank));
return err;
}
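
As a standalone illustration of the pairwise schedule used above, the sketch below expresses it with plain MPI_Sendrecv calls; the names and the tag value are arbitrary, and it assumes the usual MPI_Alltoall argument conventions.

#include <mpi.h>

/* Sketch only: pairwise-exchange alltoall.  At step s every rank sends its
 * block for (rank + s) % size and receives the block coming from
 * (rank - s + size) % size, so each step is a shifted permutation. */
static int alltoall_pairwise_sketch(const void *sbuf, int scount, MPI_Datatype sdtype,
                                    void *rbuf, int rcount, MPI_Datatype rdtype,
                                    MPI_Comm comm)
{
    int rank, size, step, err;
    MPI_Aint lb, sext, rext;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    MPI_Type_get_extent(sdtype, &lb, &sext);
    MPI_Type_get_extent(rdtype, &lb, &rext);
    /* step 0 is the local copy (send to and receive from ourselves) */
    for (step = 0; step < size; step++) {
        int sendto   = (rank + step) % size;
        int recvfrom = (rank - step + size) % size;
        err = MPI_Sendrecv((const char *)sbuf + (MPI_Aint)sendto * scount * sext,
                           scount, sdtype, sendto, 0,
                           (char *)rbuf + (MPI_Aint)recvfrom * rcount * rext,
                           rcount, rdtype, recvfrom, 0,
                           comm, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) return err;
    }
    return MPI_SUCCESS;
}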
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_bruck(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -216,20 +194,20 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
struct ompi_datatype_t *new_ddt;
#ifdef blahblah
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
#endif
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoall_intra_bruck rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:alltoall_intra_bruck rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
@ -242,14 +220,14 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
#ifdef blahblah
/* try and SAVE memory by using the data segment hung off
/* try and SAVE memory by using the data segment hung off
the communicator if possible */
if (data->mcct_num_reqs >= size) {
if (data->mcct_num_reqs >= size) {
/* we have enough preallocated for displacements and lengths */
displs = (int*) data->mcct_reqs;
blen = (int *) (displs + size);
weallocated = 0;
}
}
else { /* allocate the buffers ourself */
#endif
displs = (int *) malloc(size * sizeof(int));
@ -267,9 +245,9 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
tmpbuf = tmpbuf_free - slb;
/* Step 1 - local rotation - shift up by rank */
err = ompi_datatype_copy_content_same_ddt (sdtype,
err = ompi_datatype_copy_content_same_ddt (sdtype,
(int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
tmpbuf,
tmpbuf,
((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
if (err<0) {
line = __LINE__; err = -1; goto err_hndl;
@ -277,7 +255,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
if (rank != 0) {
err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
(char*) sbuf);
if (err<0) {
line = __LINE__; err = -1; goto err_hndl;
@ -294,7 +272,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
/* create indexed datatype */
for (i = 1; i < size; i++) {
if (( i & distance) == distance) {
displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
blen[k] = scount;
k++;
}
@ -307,7 +285,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* Sendreceive */
err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto,
err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto,
MCA_COLL_BASE_TAG_ALLTOALL,
rbuf, 1, new_ddt, recvfrom,
MCA_COLL_BASE_TAG_ALLTOALL,
@ -327,7 +305,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
for (i = 0; i < size; i++) {
err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }
}
@ -341,8 +319,8 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
rank));
if (tmpbuf != NULL) free(tmpbuf_free);
if (displs != NULL) free(displs);
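
For readers unfamiliar with the Bruck algorithm driven above, the sketch below shows the same three phases (local rotation, log2(size) packed exchanges, inverse rotation) in plain MPI on fixed-size byte blocks; it uses explicit memcpy packing instead of the indexed datatype built above, and every name in it is illustrative only.

#include <mpi.h>
#include <stdlib.h>
#include <string.h>

/* Sketch only: Bruck-style alltoall on fixed-size blocklen-byte blocks.
 * ceil(log2(size)) exchange rounds instead of size-1, paid for with packing
 * and two local rotations. */
static int alltoall_bruck_sketch(const char *sbuf, char *rbuf,
                                 int blocklen, MPI_Comm comm)
{
    int rank, size, i, distance, err = MPI_SUCCESS;
    char *tmp, *packed, *incoming;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    tmp      = (char *)malloc((size_t)size * blocklen);
    packed   = (char *)malloc((size_t)size * blocklen);
    incoming = (char *)malloc((size_t)size * blocklen);
    if (NULL == tmp || NULL == packed || NULL == incoming) {
        err = MPI_ERR_NO_MEM; goto out;
    }
    /* Phase 1: local rotation so tmp[i] holds the block destined for (rank+i)%size */
    for (i = 0; i < size; i++)
        memcpy(tmp + (size_t)i * blocklen,
               sbuf + (size_t)((rank + i) % size) * blocklen, blocklen);
    /* Phase 2: round `distance` ships every block whose index has that bit set
     * to (rank+distance)%size and refills the same slots from (rank-distance+size)%size. */
    for (distance = 1; distance < size; distance <<= 1) {
        int sendto   = (rank + distance) % size;
        int recvfrom = (rank - distance + size) % size;
        int nblocks  = 0;
        for (i = 0; i < size; i++)
            if (i & distance)
                memcpy(packed + (size_t)(nblocks++) * blocklen,
                       tmp + (size_t)i * blocklen, blocklen);
        err = MPI_Sendrecv(packed, nblocks * blocklen, MPI_BYTE, sendto, 0,
                           incoming, nblocks * blocklen, MPI_BYTE, recvfrom, 0,
                           comm, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) goto out;
        nblocks = 0;
        for (i = 0; i < size; i++)
            if (i & distance)
                memcpy(tmp + (size_t)i * blocklen,
                       incoming + (size_t)(nblocks++) * blocklen, blocklen);
    }
    /* Phase 3: inverse rotation; tmp[i] now holds the block received from
     * rank (rank-i+size)%size, which is where it belongs in rbuf. */
    for (i = 0; i < size; i++)
        memcpy(rbuf + (size_t)((rank - i + size) % size) * blocklen,
               tmp + (size_t)i * blocklen, blocklen);
out:
    free(tmp); free(packed); free(incoming);
    return err;
}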
@ -352,10 +330,10 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
/*
* alltoall_intra_linear_sync
*
*
* Function: Linear implementation of alltoall with limited number
* of outstanding requests.
* Accepts: Same as MPI_Alltoall(), and the maximum number of
* Accepts: Same as MPI_Alltoall(), and the maximum number of
* outstanding requests (actual number is 2 * max, since
* we count receive and send requests separately).
* Returns: MPI_SUCCESS or error code
@ -367,7 +345,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
* - wait for any request to complete
* - replace that request by the new one of the same type.
*/
int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_linear_sync(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -382,7 +360,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
ompi_request_t **reqs = NULL;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
@ -391,8 +369,8 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_alltoall_intra_linear_sync rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_alltoall_intra_linear_sync rank %d", rank));
error = ompi_datatype_get_extent(sdtype, &slb, &sext);
if (OMPI_SUCCESS != error) {
@ -423,18 +401,18 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
}
/* Initiate send/recv to/from others. */
total_reqs = (((max_outstanding_reqs > (size - 1)) ||
total_reqs = (((max_outstanding_reqs > (size - 1)) ||
(max_outstanding_reqs <= 0)) ?
(size - 1) : (max_outstanding_reqs));
reqs = (ompi_request_t**) malloc( 2 * total_reqs *
reqs = (ompi_request_t**) malloc( 2 * total_reqs *
sizeof(ompi_request_t*));
if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
prcv = (char *) rbuf;
psnd = (char *) sbuf;
/* Post first batch or ireceive and isend requests */
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
ri = (ri + 1) % size, ++nreqs, ++nrreqs) {
error =
MCA_PML_CALL(irecv
@ -442,7 +420,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
}
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
si = (si + size - 1) % size, ++nreqs, ++nsreqs) {
error =
MCA_PML_CALL(isend
@ -457,12 +435,12 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
/* Optimization for the case when all requests have been posted */
error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
} else {
/* As requests complete, replace them with corresponding requests:
- wait for any request to complete, mark the request as
- wait for any request to complete, mark the request as
MPI_REQUEST_NULL
- If it was a receive request, replace it with new irecv request
- If it was a receive request, replace it with new irecv request
(if any)
- if it was a send request, replace it with new isend request (if any)
*/
@ -476,10 +454,10 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
ncreqs++;
if (completed < total_reqs) {
if (nrreqs < (size - 1)) {
error =
error =
MCA_PML_CALL(irecv
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
MCA_COLL_BASE_TAG_ALLTOALL, comm,
MCA_COLL_BASE_TAG_ALLTOALL, comm,
&reqs[completed]));
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
++nrreqs;
@ -493,7 +471,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
MCA_PML_BASE_SEND_STANDARD, comm,
&reqs[completed]));
++nsreqs;
si = (si + size - 1) % size;
si = (si + size - 1) % size;
}
}
}
@ -506,15 +484,15 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
rank));
if (NULL != reqs) free(reqs);
return error;
}
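
To make the wait-any-and-replace scheme described above concrete, here is a minimal standalone sketch in plain MPI on fixed-size byte blocks; the window parameter, the tag, and the peer ordering are arbitrary illustrative choices, not the ones used by the routine above.

#include <mpi.h>
#include <stdlib.h>
#include <string.h>

/* Sketch only: alltoall with at most `window` outstanding receives and
 * `window` outstanding sends.  Each completed request is replaced by the
 * next one of the same kind until all size-1 of each have been issued. */
static int alltoall_limited_reqs_sketch(const char *sbuf, char *rbuf,
                                        int blocklen, int window, MPI_Comm comm)
{
    int rank, size, idx, active;
    int next_recv = 1, next_send = 1;      /* peer offsets 1 .. size-1 */
    MPI_Request *reqs;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    memcpy(rbuf + (size_t)rank * blocklen,
           sbuf + (size_t)rank * blocklen, blocklen);   /* local block */
    if (1 == size) return MPI_SUCCESS;
    if (window <= 0 || window > size - 1) window = size - 1;
    reqs = (MPI_Request *)malloc(2 * (size_t)window * sizeof(MPI_Request));
    if (NULL == reqs) return MPI_ERR_NO_MEM;

    /* First batch: slots [0,window) are receives, [window,2*window) are sends. */
    for (idx = 0; idx < window; idx++, next_recv++) {
        int peer = (rank + next_recv) % size;
        MPI_Irecv(rbuf + (size_t)peer * blocklen, blocklen, MPI_BYTE,
                  peer, 0, comm, &reqs[idx]);
    }
    for (idx = window; idx < 2 * window; idx++, next_send++) {
        int peer = (rank + next_send) % size;
        MPI_Isend(sbuf + (size_t)peer * blocklen, blocklen, MPI_BYTE,
                  peer, 0, comm, &reqs[idx]);
    }
    /* Drain: every completion either re-arms its slot with the next peer of
     * the same kind or, once that kind is exhausted, retires the slot. */
    for (active = 2 * window; active > 0; ) {
        MPI_Waitany(2 * window, reqs, &idx, MPI_STATUS_IGNORE);
        if (idx < window && next_recv < size) {
            int peer = (rank + next_recv++) % size;
            MPI_Irecv(rbuf + (size_t)peer * blocklen, blocklen, MPI_BYTE,
                      peer, 0, comm, &reqs[idx]);
        } else if (idx >= window && next_send < size) {
            int peer = (rank + next_send++) % size;
            MPI_Isend(sbuf + (size_t)peer * blocklen, blocklen, MPI_BYTE,
                      peer, 0, comm, &reqs[idx]);
        } else {
            active--;
        }
    }
    free(reqs);
    return MPI_SUCCESS;
}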
int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_two_procs(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -526,14 +504,14 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
ptrdiff_t sext, rext, lb;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_alltoall_intra_two_procs rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_alltoall_intra_two_procs rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
@ -548,17 +526,17 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
tmprecv = (char*)rbuf + (ptrdiff_t)remote * rext * (ptrdiff_t)rcount;
/* send and receive */
err = ompi_coll_tuned_sendrecv ( tmpsend, scount, sdtype, remote,
err = ompi_coll_base_sendrecv ( tmpsend, scount, sdtype, remote,
MCA_COLL_BASE_TAG_ALLTOALL,
tmprecv, rcount, rdtype, remote,
tmprecv, rcount, rdtype, remote,
MCA_COLL_BASE_TAG_ALLTOALL,
comm, MPI_STATUS_IGNORE, rank );
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* ddt sendrecv your own data */
err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount,
(int32_t) scount, sdtype,
(char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount,
err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount,
(int32_t) scount, sdtype,
(char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount,
(int32_t) rcount, rdtype);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
@ -566,7 +544,7 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
rank));
return err;
@ -577,8 +555,8 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -588,7 +566,7 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
/* copied function (with appropriate renaming) starts here */
int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -599,11 +577,11 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
char *psnd, *prcv;
MPI_Aint lb, sndinc, rcvinc;
ompi_request_t **req, **sreq, **rreq;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
@ -612,8 +590,8 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_alltoall_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_alltoall_intra_basic_linear rank %d", rank));
err = ompi_datatype_get_extent(sdtype, &lb, &sndinc);
@ -654,23 +632,23 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
/* Post all receives first -- a simple optimization */
for (nreqs = 0, i = (rank + 1) % size; i != rank;
for (nreqs = 0, i = (rank + 1) % size; i != rank;
i = (i + 1) % size, ++rreq, ++nreqs) {
err =
MCA_PML_CALL(irecv_init
(prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i,
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
if (MPI_SUCCESS != err) {
ompi_coll_tuned_free_reqs(req, rreq - req);
ompi_coll_base_free_reqs(req, rreq - req);
return err;
}
}
/* Now post all sends in reverse order
/* Now post all sends in reverse order
- We would like to minimize the search time through message queue
when messages actually arrive in the order in which they were posted.
*/
for (nreqs = 0, i = (rank + size - 1) % size; i != rank;
for (nreqs = 0, i = (rank + size - 1) % size; i != rank;
i = (i + size - 1) % size, ++sreq, ++nreqs) {
err =
MCA_PML_CALL(isend_init
@ -678,7 +656,7 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
MCA_COLL_BASE_TAG_ALLTOALL,
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
if (MPI_SUCCESS != err) {
ompi_coll_tuned_free_reqs(req, sreq - req);
ompi_coll_base_free_reqs(req, sreq - req);
return err;
}
}
@ -698,165 +676,10 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);
/* Free the reqs */
ompi_coll_tuned_free_reqs(req, nreqs);
ompi_coll_base_free_reqs(req, nreqs);
/* All done */
return err;
}
/* copied function (with appropriate renaming) ends here */
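
For comparison, the same linear scheme in plain MPI, without the persistent-request machinery inherited from the basic module, fits in a few lines; the sketch below works on fixed-size byte blocks and is illustrative only.

#include <mpi.h>
#include <stdlib.h>
#include <string.h>

/* Sketch only: basic linear alltoall - post all size-1 receives, then all
 * size-1 sends in reverse order, and wait for everything at once. */
static int alltoall_linear_sketch(const char *sbuf, char *rbuf,
                                  int blocklen, MPI_Comm comm)
{
    int rank, size, i, nreqs = 0;
    MPI_Request *reqs;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    memcpy(rbuf + (size_t)rank * blocklen,
           sbuf + (size_t)rank * blocklen, blocklen);   /* local block */
    if (1 == size) return MPI_SUCCESS;
    reqs = (MPI_Request *)malloc(2 * (size_t)(size - 1) * sizeof(MPI_Request));
    if (NULL == reqs) return MPI_ERR_NO_MEM;
    /* receives first, so matching sends always find a posted receive */
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size)
        MPI_Irecv(rbuf + (size_t)i * blocklen, blocklen, MPI_BYTE,
                  i, 0, comm, &reqs[nreqs++]);
    /* sends in reverse order to spread the initial load away from rank+1 */
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size)
        MPI_Isend(sbuf + (size_t)i * blocklen, blocklen, MPI_BYTE,
                  i, 0, comm, &reqs[nreqs++]);
    MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
    return MPI_SUCCESS;
}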
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t*new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = coll_tuned_alltoall_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_count",
"Number of alltoall algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_alltoall_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_alltoall_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_alltoall_algorithms", alltoall_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm",
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: linear with sync, 5:two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_alltoall_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_segmentsize",
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_segment_size);
coll_tuned_alltoall_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_tree_fanout",
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_tree_fanout);
coll_tuned_alltoall_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_chain_fanout",
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_chain_fanout);
coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
mca_param_indices->max_requests_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_max_requests",
"Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_max_requests);
if (mca_param_indices->max_requests_param_index < 0) {
return mca_param_indices->max_requests_param_index;
}
if (coll_tuned_alltoall_max_requests < 0) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to system level default %d \n",
ompi_coll_tuned_init_max_requests );
}
coll_tuned_alltoall_max_requests = 0;
}
return (MPI_SUCCESS);
}
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
data->user_forced[ALLTOALL].algorithm));
switch (data->user_forced[ALLTOALL].algorithm) {
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (4): return ompi_coll_tuned_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, data->user_forced[ALLTOALL].max_requests);
case (5): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize,
int max_requests)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (4): return ompi_coll_tuned_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, max_requests);
case (5): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -32,29 +32,17 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* alltoallv algorithm variables */
static int coll_tuned_alltoallv_algorithm_count = 2;
static int coll_tuned_alltoallv_forced_algorithm = 0;
/* valid values for coll_tuned_alltoallv_forced_algorithm */
static mca_base_var_enum_value_t alltoallv_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "pairwise"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
static int
mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
mca_coll_base_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
int i, j, size, rank, err=MPI_SUCCESS;
MPI_Request *preq;
char *tmp_buffer;
@ -90,7 +78,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
for (i = 0 ; i < size ; ++i) {
for (j = i+1 ; j < size ; ++j) {
/* Initiate all send/recv to/from others. */
preq = tuned_module->tuned_data->mcct_reqs;
preq = base_module->base_data->mcct_reqs;
if (i == rank && rcounts[j]) {
/* Copy the data into the temporary buffer */
@ -127,11 +115,8 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
}
/* Wait for the requests to complete */
err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE);
err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Free the requests. */
mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2);
}
}
@ -145,7 +130,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
}
int
ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
ompi_coll_base_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
@ -157,15 +142,15 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
ptrdiff_t sext, rext;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
rdtype, comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_pairwise rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:alltoallv_intra_pairwise rank %d", rank));
ompi_datatype_type_extent(sdtype, &sext);
ompi_datatype_type_extent(rdtype, &rext);
@ -182,34 +167,33 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
prcv = (char*)rbuf + (ptrdiff_t)rdisps[recvfrom] * rext;
/* send and receive */
err = ompi_coll_tuned_sendrecv( psnd, scounts[sendto], sdtype, sendto,
err = ompi_coll_base_sendrecv( psnd, scounts[sendto], sdtype, sendto,
MCA_COLL_BASE_TAG_ALLTOALLV,
prcv, rcounts[recvfrom], rdtype, recvfrom,
prcv, rcounts[recvfrom], rdtype, recvfrom,
MCA_COLL_BASE_TAG_ALLTOALLV,
comm, MPI_STATUS_IGNORE, rank);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"%s:%4d\tError occurred %d, rank %2d at step %d", __FILE__, line,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d at step %d", __FILE__, line,
err, rank, step));
return err;
}
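
The alltoallv variant follows the exact same pairwise schedule, only with per-peer counts and displacements; a plain-MPI sketch, illustrative only and assuming the standard MPI_Alltoallv argument conventions, looks like this:

#include <mpi.h>

/* Sketch only: pairwise-exchange alltoallv with per-peer counts and
 * displacements (displacements are in units of the datatype extent). */
static int alltoallv_pairwise_sketch(const void *sbuf, const int *scounts, const int *sdispls,
                                     MPI_Datatype sdtype,
                                     void *rbuf, const int *rcounts, const int *rdispls,
                                     MPI_Datatype rdtype, MPI_Comm comm)
{
    int rank, size, step, err;
    MPI_Aint lb, sext, rext;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    MPI_Type_get_extent(sdtype, &lb, &sext);
    MPI_Type_get_extent(rdtype, &lb, &rext);
    for (step = 0; step < size; step++) {
        int sendto   = (rank + step) % size;
        int recvfrom = (rank - step + size) % size;
        err = MPI_Sendrecv((const char *)sbuf + (MPI_Aint)sdispls[sendto] * sext,
                           scounts[sendto], sdtype, sendto, 0,
                           (char *)rbuf + (MPI_Aint)rdispls[recvfrom] * rext,
                           rcounts[recvfrom], rdtype, recvfrom, 0,
                           comm, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) return err;
    }
    return MPI_SUCCESS;
}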
/*
/**
* Linear functions are copied from the basic coll module. For
* some small number of nodes and/or small data sizes they are just as
* fast as tuned/tree based segmenting operations and as such may be
* fast as base/tree based segmenting operations and as such may be
* selected by the decision functions. These are copied into this module
* due to the way we select modules in V1. i.e. in V2 we will handle this
* differently and so will not have to duplicate code.
* GEF Oct05 after asking Jeff.
* differently and so will not have to duplicate code.
*/
int
ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps,
ompi_coll_base_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
@ -220,19 +204,19 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
char *psnd, *prcv;
ptrdiff_t sext, rext;
MPI_Request *preq;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
rdtype, comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:alltoallv_intra_basic_linear rank %d", rank));
ompi_datatype_type_extent(sdtype, &sext);
ompi_datatype_type_extent(rdtype, &rext);
@ -269,7 +253,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
preq++));
++nreqs;
if (MPI_SUCCESS != err) {
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
return err;
}
}
@ -287,7 +271,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
preq++));
++nreqs;
if (MPI_SUCCESS != err) {
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
return err;
}
}
@ -305,128 +289,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
MPI_STATUSES_IGNORE);
/* Free the requests. */
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
return err;
}
/*
* The following are used by dynamic and forced rules. Publish
* details of each algorithm and if its forced/fixed/locked in as you add
* methods/algorithms you must update this and the query/map routines.
* This routine is called by the component only. This makes sure that
* the mca parameters are set to their initial values and perms.
* Module does not call this. They call the forced_getvalues routine
* instead.
*/
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t
*mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV] = coll_tuned_alltoallv_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoallv_algorithm_count",
"Number of alltoallv algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_alltoallv_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_alltoallv_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_alltoallv_algorithms", alltoallv_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoallv_algorithm",
"Which alltoallv algorithm is used. "
"Can be locked down to choice of: 0 ignore, "
"1 basic linear, 2 pairwise.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoallv_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
return (MPI_SUCCESS);
}
int ompi_coll_tuned_alltoallv_intra_do_forced(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_forced selected algorithm %d",
data->user_forced[ALLTOALLV].algorithm));
switch (data->user_forced[ALLTOALLV].algorithm) {
case (0):
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_forced attempt to "
"select algorithm %d when only 0-%d is valid.",
data->user_forced[ALLTOALLV].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
return (MPI_ERR_ARG);
}
}
/* If the user selects dynamic rules and specifies the algorithm to
* use, then this function is called. */
int ompi_coll_tuned_alltoallv_intra_do_this(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_this selected algorithm %d ",
algorithm));
switch (algorithm) {
case (0):
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoall_intra_do_this attempt to select "
"algorithm %d when only 0-%d is valid.",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
return (MPI_ERR_ARG);
}
}

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -31,25 +31,9 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* barrier algorithm variables */
static int coll_tuned_barrier_algorithm_count = 6;
static int coll_tuned_barrier_forced_algorithm = 0;
/* valid values for coll_tuned_barrier_forced_algorithm */
static mca_base_var_enum_value_t barrier_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "double_ring"},
{3, "recursive_doubling"},
{4, "bruck"},
{5, "two_proc"},
{6, "tree"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/**
* A quick version of the MPI_Sendreceive implemented for the barrier.
@ -57,7 +41,7 @@ static mca_base_var_enum_value_t barrier_algorithms[] = {
* signal a two peer synchronization.
*/
static inline int
ompi_coll_tuned_sendrecv_zero(int dest, int stag,
ompi_coll_base_sendrecv_zero(int dest, int stag,
int source, int rtag,
MPI_Comm comm)
@ -87,8 +71,8 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
err_index = 1;
}
err = statuses[err_index].MPI_ERROR;
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s"
" stage of ompi_coll_tuned_sendrecv_zero\n",
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
" stage of ompi_coll_base_sendrecv_zero\n",
__FILE__, line, err, (0 == err_index ? "receive" : "send")));
return err;
}
@ -100,21 +84,21 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
/* Error discovered during the posting of the irecv or isend,
* and no status is available.
*/
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
__FILE__, line, err));
return err;
}
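
Outside of the PML layer, the equivalent of this two-peer synchronization is simply a zero-byte MPI_Sendrecv, e.g.:

#include <mpi.h>

/* Sketch only: zero-byte exchange used as a two-peer synchronization point. */
static int sendrecv_zero_sketch(int peer, int tag, MPI_Comm comm)
{
    return MPI_Sendrecv(NULL, 0, MPI_BYTE, peer, tag,
                        NULL, 0, MPI_BYTE, peer, tag,
                        comm, MPI_STATUS_IGNORE);
}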
/*
 * Barrier is meant to be a synchronous operation: some BTLs can mark
 * a request done before it is passed to the NIC, and progress might not be
 * made elsewhere, so we cannot allow a process to exit the barrier until its
 * last [round of] sends is completed.
 *
 * It is the last round of sends rather than the 'last' individual send, as
 * each pair of peers can use different channels/devices/btls and the receiver
 * of one of these sends might be forced to wait as the sender
 * leaves the collective and does not make progress until the next MPI call.
*
*/
@ -124,7 +108,7 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
* synchronous guarantee made by last ring of sends are synchronous
*
*/
int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, size, err = 0, line = 0, left, right;
@ -132,50 +116,50 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_barrier_intra_doublering rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));
left = ((rank-1)%size);
right = ((rank+1)%size);
if (rank > 0) { /* receive message from the left */
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
/* Send message to the right */
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
MCA_COLL_BASE_TAG_BARRIER,
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* root needs to receive from the last node */
if (rank == 0) {
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
/* Allow nodes to exit */
if (rank > 0) { /* post Receive from left */
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
/* send message to the right one */
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
MCA_COLL_BASE_TAG_BARRIER,
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* rank 0 post receive from the last node */
if (rank == 0) {
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
@ -183,7 +167,7 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
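
Summarizing the routine above: the token travels around the ring twice, once to tell rank 0 that everyone has arrived and once to release everyone, with the second pass finished by synchronous sends. A compact plain-MPI sketch of the same schedule (illustrative only):

#include <mpi.h>

/* Sketch only: double-ring barrier.  Pass 0 gathers arrival at rank 0,
 * pass 1 (finished with a synchronous send) releases every rank. */
static int barrier_doublering_sketch(MPI_Comm comm)
{
    int rank, size, left, right, pass;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    if (size < 2) return MPI_SUCCESS;
    left  = (rank - 1 + size) % size;
    right = (rank + 1) % size;
    for (pass = 0; pass < 2; pass++) {
        if (rank > 0)        /* wait for the token coming from the left */
            MPI_Recv(NULL, 0, MPI_BYTE, left, 0, comm, MPI_STATUS_IGNORE);
        if (0 == pass)
            MPI_Send(NULL, 0, MPI_BYTE, right, 0, comm);
        else                 /* synchronous send closes the barrier */
            MPI_Ssend(NULL, 0, MPI_BYTE, right, 0, comm);
        if (0 == rank)       /* the ring closes back at rank 0 */
            MPI_Recv(NULL, 0, MPI_BYTE, left, 0, comm, MPI_STATUS_IGNORE);
    }
    return MPI_SUCCESS;
}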
@ -193,15 +177,15 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
* To make synchronous, uses sync sends and sync sendrecvs
*/
int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, size, adjsize, err, line, mask, remote;
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_barrier_intra_recursivedoubling rank %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_barrier_intra_recursivedoubling rank %d",
rank));
/* do nearest power of 2 less than size calc */
@ -213,7 +197,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
if (rank >= adjsize) {
/* send message to lower ranked node */
remote = rank - adjsize;
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
remote, MCA_COLL_BASE_TAG_BARRIER,
comm);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -222,7 +206,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
/* receive message from high level rank */
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
MCA_COLL_BASE_TAG_BARRIER, comm,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -238,7 +222,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
if (remote >= adjsize) continue;
/* post receive from the remote node */
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
remote, MCA_COLL_BASE_TAG_BARRIER,
comm);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -250,8 +234,8 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
if (rank < (size - adjsize)) {
/* send enter message to higher ranked node */
remote = rank + adjsize;
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
MCA_COLL_BASE_TAG_BARRIER,
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -261,7 +245,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
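
For the common power-of-two case, the core of the algorithm above reduces to log2(size) zero-byte exchanges with partner rank ^ mask; the sketch below (plain MPI, illustrative only) omits the extra fold-in/fold-out steps that the routine above performs for non-power-of-two sizes.

#include <mpi.h>

/* Sketch only: recursive-doubling barrier for a power-of-two communicator size. */
static int barrier_recursivedoubling_sketch(MPI_Comm comm)
{
    int rank, size, mask;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    for (mask = 1; mask < size; mask <<= 1) {
        int partner = rank ^ mask;
        MPI_Sendrecv(NULL, 0, MPI_BYTE, partner, 0,
                     NULL, 0, MPI_BYTE, partner, 0,
                     comm, MPI_STATUS_IGNORE);
    }
    return MPI_SUCCESS;
}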
@ -271,23 +255,23 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
* To make synchronous, uses sync sends and sync sendrecvs
*/
int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, size, distance, to, from, err, line = 0;
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_barrier_intra_bruck rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_barrier_intra_bruck rank %d", rank));
/* exchange data with rank-2^k and rank+2^k */
for (distance = 1; distance < size; distance <<= 1) {
for (distance = 1; distance < size; distance <<= 1) {
from = (rank + size - distance) % size;
to = (rank + distance) % size;
/* send message to lower ranked node */
err = ompi_coll_tuned_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
from, MCA_COLL_BASE_TAG_BARRIER,
comm);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -296,7 +280,7 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
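
The Bruck (dissemination) barrier above already works for any communicator size; stripped of the OMPI plumbing it is just the following loop (illustrative plain-MPI sketch):

#include <mpi.h>

/* Sketch only: dissemination barrier, ceil(log2(size)) rounds for any size. */
static int barrier_bruck_sketch(MPI_Comm comm)
{
    int rank, size, distance;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    for (distance = 1; distance < size; distance <<= 1) {
        int to   = (rank + distance) % size;
        int from = (rank + size - distance) % size;
        MPI_Sendrecv(NULL, 0, MPI_BYTE, to, 0,
                     NULL, 0, MPI_BYTE, from, 0,
                     comm, MPI_STATUS_IGNORE);
    }
    return MPI_SUCCESS;
}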
@ -306,17 +290,17 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
* To make synchronous, uses sync sends and sync sendrecvs
*/
/* special case for two processes */
int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int remote, err;
remote = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_barrier_intra_two_procs rank %d", remote));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_barrier_intra_two_procs rank %d", remote));
remote = (remote + 1) & 0x1;
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
remote, MCA_COLL_BASE_TAG_BARRIER,
comm);
return (err);
@ -327,7 +311,7 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -337,7 +321,7 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
/* copied function (with appropriate renaming) starts here */
static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
static int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, err, rank, size;
@ -347,14 +331,14 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
/* All non-root send & receive zero-length message. */
if (rank > 0) {
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err) {
return err;
}
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
MCA_COLL_BASE_TAG_BARRIER,
comm, MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err) {
@ -370,7 +354,7 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
requests = (ompi_request_t**)malloc( size * sizeof(ompi_request_t*) );
for (i = 1; i < size; ++i) {
err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
MCA_COLL_BASE_TAG_BARRIER, comm,
MCA_COLL_BASE_TAG_BARRIER, comm,
&(requests[i])));
if (MPI_SUCCESS != err) {
return err;
@ -380,7 +364,7 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
for (i = 1; i < size; ++i) {
err = MCA_PML_CALL(isend(NULL, 0, MPI_BYTE, i,
MCA_COLL_BASE_TAG_BARRIER,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[i])));
if (MPI_SUCCESS != err) {
@ -400,17 +384,17 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
/*
* Another recursive doubling type algorithm, but in this case
* we go up the tree and back down the tree.
* we go up the tree and back down the tree.
*/
int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, size, depth, err, jump, partner;
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_barrier_intra_tree %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_barrier_intra_tree %d",
rank));
/* Find the nearest power of 2 of the communicator size. */
@ -420,21 +404,21 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
partner = rank ^ jump;
if (!(partner & (jump-1)) && partner < size) {
if (partner > rank) {
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err)
return err;
} else if (partner < rank) {
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
MCA_COLL_BASE_TAG_BARRIER,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err)
return err;
}
}
}
depth >>= 1;
for (jump = depth; jump>0; jump>>=1) {
partner = rank ^ jump;
@ -446,7 +430,7 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
if (MPI_SUCCESS != err)
return err;
} else if (partner < rank) {
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err)
@ -457,101 +441,3 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
return MPI_SUCCESS;
}
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map */
/* routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values */
/* and perms */
/* module does not call this they call the forced_getvalues routine instead */
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[BARRIER] = coll_tuned_barrier_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm_count",
"Number of barrier algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_barrier_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_barrier_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_barrier_algorithms", barrier_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm",
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_barrier_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
return (MPI_SUCCESS);
}
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:barrier_intra_do_forced selected algorithm %d",
data->user_forced[BARRIER].algorithm));
switch (data->user_forced[BARRIER].algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm, module);
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm, module);
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module);
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm, module);
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm, module);
case (6): return ompi_coll_tuned_barrier_intra_tree (comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[BARRIER].algorithm,
ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout));
switch (algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm, module);
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm, module);
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module);
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm, module);
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm, module);
case (6): return ompi_coll_tuned_barrier_intra_tree (comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -3,18 +3,18 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -27,33 +27,14 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* bcast algorithm variables */
static int coll_tuned_bcast_algorithm_count = 6;
static int coll_tuned_bcast_forced_algorithm = 0;
static int coll_tuned_bcast_segment_size = 0;
static int coll_tuned_bcast_tree_fanout;
static int coll_tuned_bcast_chain_fanout;
/* valid values for coll_tuned_bcast_forced_algorithm */
static mca_base_var_enum_value_t bcast_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "chain"},
{3, "pipeline"},
{4, "split_binary_tree"},
{5, "binary_tree"},
{6, "binomial"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
int
ompi_coll_tuned_bcast_intra_generic( void* buffer,
int original_count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_generic( void* buffer,
int original_count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -62,12 +43,12 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
{
int err = 0, line, i, rank, size, segindex, req_index;
int num_segments; /* Number of segments */
int sendcount; /* number of elements sent in this segment */
int sendcount; /* number of elements sent in this segment */
size_t realsegsize, type_size;
char *tmpbuf;
ptrdiff_t extent, lb;
ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
ompi_request_t **send_reqs = NULL;
#endif
@ -79,20 +60,20 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
ompi_datatype_type_size( datatype, &type_size );
num_segments = (original_count + count_by_segment - 1) / count_by_segment;
realsegsize = (ptrdiff_t)count_by_segment * extent;
/* Set the buffer pointers */
tmpbuf = (char *) buffer;
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
if( tree->tree_nextsize != 0 ) {
send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
sizeof(ompi_request_t*) );
}
#endif
/* Root code */
if( rank == root ) {
/*
/*
For each segment:
- send segment to all children.
The last segment may have fewer elements than other segments.
@ -102,39 +83,39 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
if( segindex == (num_segments - 1) ) {
sendcount = original_count - segindex * count_by_segment;
}
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
#else
err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm,
MCA_PML_BASE_SEND_STANDARD, comm,
&send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
/* complete the sends before starting the next sends */
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
MPI_STATUSES_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* not COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* not COLL_BASE_BCAST_USE_BLOCKING */
/* update tmp buffer */
tmpbuf += realsegsize;
}
}
}
/* Intermediate nodes code */
else if( tree->tree_nextsize > 0 ) {
/*
Create the pipeline.
else if( tree->tree_nextsize > 0 ) {
/*
Create the pipeline.
1) Post the first receive
2) For segments 1 .. num_segments
- post new receive
@ -149,49 +130,49 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &recv_reqs[req_index]));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
for( segindex = 1; segindex < num_segments; segindex++ ) {
req_index = req_index ^ 0x1;
/* post new irecv */
err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment,
datatype, tree->tree_prev,
MCA_COLL_BASE_TAG_BCAST,
datatype, tree->tree_prev,
MCA_COLL_BASE_TAG_BCAST,
comm, &recv_reqs[req_index]));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* wait for and forward the previous segment to children */
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
MPI_STATUSES_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
err = MCA_PML_CALL(send(tmpbuf, count_by_segment, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
#else
err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm,
MCA_PML_BASE_SEND_STANDARD, comm,
&send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
}
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
/* complete the sends before starting the next iteration */
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
MPI_STATUSES_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
/* Update the receive buffer */
tmpbuf += realsegsize;
}
/* Process the last segment */
@ -199,31 +180,31 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment;
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
#else
err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm,
MCA_PML_BASE_SEND_STANDARD, comm,
&send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
MPI_STATUSES_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
}
/* Leaf nodes */
else {
/*
/*
Receive all segments from parent in a loop:
1) post irecv for the first segment
2) for segments 1 .. num_segments
@ -241,12 +222,12 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
req_index = req_index ^ 0x1;
tmpbuf += realsegsize;
/* post receive for the next segment */
err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &recv_reqs[req_index]));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* wait on the previous segment */
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
MPI_STATUS_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
@ -255,25 +236,25 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
if( NULL != send_reqs ) free(send_reqs);
#endif
return (MPI_SUCCESS);
error_hndl:
OPAL_OUTPUT( (ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank) );
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
if( NULL != send_reqs ) free(send_reqs);
#endif
return (err);
}
int
ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -281,28 +262,27 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_BINTREE( comm, module, root );
/**
* Determine number of elements sent per operation.
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d",
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
segcount, data->cached_bintree );
}
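/* Illustration only (not part of this commit; the wrapper name is
 * hypothetical and the file's existing includes are assumed): invoking the
 * renamed binary-tree broadcast directly with a fixed 32 KB segment size.
 * COLL_BASE_COMPUTED_SEGCOUNT turns that byte count into a per-segment
 * element count before the generic routine above pipelines the segments down
 * the cached binary tree. */
static int example_bcast_bintree_32k(void *buf, int count,
                                     struct ompi_datatype_t *dtype, int root,
                                     struct ompi_communicator_t *comm,
                                     mca_coll_base_module_t *module)
{
    return ompi_coll_base_bcast_intra_bintree(buf, count, dtype, root, comm,
                                              module, 32768 /* segsize, bytes */);
}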
int
ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_pipeline( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -310,28 +290,27 @@ ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );
COLL_BASE_UPDATE_PIPELINE( comm, module, root );
/**
* Determine number of elements sent per operation.
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d",
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
segcount, data->cached_pipeline );
}
int
ompi_coll_tuned_bcast_intra_chain( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_chain( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -339,28 +318,27 @@ ompi_coll_tuned_bcast_intra_chain( void* buffer,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
COLL_TUNED_UPDATE_CHAIN( comm, tuned_module, root, chains );
COLL_BASE_UPDATE_CHAIN( comm, module, root, chains );
/**
* Determine number of elements sent per operation.
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d",
ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
segcount, data->cached_chain );
}
int
ompi_coll_tuned_bcast_intra_binomial( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_binomial( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -368,28 +346,27 @@ ompi_coll_tuned_bcast_intra_binomial( void* buffer,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
COLL_TUNED_UPDATE_BMTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_BMTREE( comm, module, root );
/**
* Determine number of elements sent per operation.
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
segcount, data->cached_bmtree );
}
int
ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_split_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -399,26 +376,25 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
uint32_t counts[2];
int segcount[2]; /* Number of elements sent with each segment */
int num_segments[2]; /* Number of segments */
int sendcount[2]; /* the same as segcount, except for the last segment */
int sendcount[2]; /* the same as segcount, except for the last segment */
size_t realsegsize[2], type_size;
char *tmpbuf[2];
ptrdiff_t type_extent, lb;
ompi_request_t *base_req, *new_req;
ompi_coll_tree_t *tree;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize));
if (size == 1) {
return MPI_SUCCESS;
}
/* setup the binary tree topology. */
COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_BINTREE( comm, module, root );
tree = data->cached_bintree;
err = ompi_datatype_type_size( datatype, &type_size );
@ -431,10 +407,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
/* Note that ompi_datatype_type_size() will never return a negative
value in typelng; it returns an int [vs. an unsigned type]
because of the MPI spec. */
if (segsize < ((uint32_t) type_size)) {
if (segsize < ((uint32_t) type_size)) {
segsize = type_size; /* push segsize up to hold one type */
}
segcount[0] = segcount[1] = segsize / type_size;
segcount[0] = segcount[1] = segsize / type_size;
num_segments[0] = counts[0]/segcount[0];
if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
num_segments[1] = counts[1]/segcount[1];
@ -450,17 +426,17 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
(segsize > ((ptrdiff_t)counts[0] * type_size)) ||
(segsize > ((ptrdiff_t)counts[1] * type_size)) ) {
/* call linear version here ! */
return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype,
return (ompi_coll_base_bcast_intra_chain ( buffer, count, datatype,
root, comm, module,
segsize, 1 ));
}
err = ompi_datatype_get_extent (datatype, &lb, &type_extent);
/* Determine real segment size */
realsegsize[0] = (ptrdiff_t)segcount[0] * type_extent;
realsegsize[1] = (ptrdiff_t)segcount[1] * type_extent;
/* set the buffer pointers */
tmpbuf[0] = (char *) buffer;
tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;
@ -473,11 +449,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
/* determine if I am left (0) or right (1), (root is right) */
lr = ((rank + size - root)%size + 1)%2;
/* root code */
if( rank == root ) {
/* determine segment count */
sendcount[0] = segcount[0];
sendcount[0] = segcount[0];
sendcount[1] = segcount[1];
/* for each segment */
for (segindex = 0; segindex < num_segments[0]; segindex++) {
@ -487,7 +463,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
continue;
}
/* determine how many elements are being sent in this round */
if(segindex == (num_segments[i] - 1))
if(segindex == (num_segments[i] - 1))
sendcount[i] = counts[i] - segindex*segcount[i];
/* send data */
MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype,
@ -498,19 +474,19 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
tmpbuf[i] += realsegsize[i];
}
}
}
}
/* intermediate nodes code */
else if( tree->tree_nextsize > 0 ) {
else if( tree->tree_nextsize > 0 ) {
/* Intermediate nodes:
* It will receive segments only from one half of the data.
* Which one is determined by whether the node belongs to the "left" or "right"
* Which one is determined by whether the node belongs to the "left" or "right"
* subtree. The topology-building function builds the binary tree such that
* odd "shifted ranks" ((rank + size - root)%size) are on the left subtree,
* and even on the right subtree.
*
* Create the pipeline. We first post the first receive, then in the loop we
* post the next receive and after that wait for the previous receive to complete
* post the next receive and after that wait for the previous receive to complete
* and disseminate the data to all children.
*/
sendcount[lr] = segcount[lr];
@ -521,11 +497,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
/* determine how many elements to expect in this round */
if( segindex == (num_segments[lr] - 1))
if( segindex == (num_segments[lr] - 1))
sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr];
/* post new irecv */
err = MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &new_req));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -539,7 +515,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
} /* end of for each child */
/* update the base request */
base_req = new_req;
base_req = new_req;
/* go to the next buffer (ie. the one corresponding to the next recv) */
tmpbuf[lr] += realsegsize[lr];
} /* end of for segindex */
@ -552,10 +528,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} /* end of for each child */
}
}
/* leaf nodes */
else {
else {
/* Just consume segments as fast as possible */
sendcount[lr] = segcount[lr];
for (segindex = 0; segindex < num_segments[lr]; segindex++) {
@ -577,9 +553,9 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;
/* Step 2:
Find your immediate pair (identical node in opposite subtree) and SendRecv
Find your immediate pair (identical node in opposite subtree) and SendRecv
data buffer with them.
The tree building function ensures that
The tree building function ensures that
if (we are not root)
if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
if we are in the right subtree (lr == 1) our pair is (rank-1)%size
@ -591,9 +567,9 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
pair = (rank+size-1)%size;
}
if ( (size%2) != 0 && rank != root) {
if ( (size%2) != 0 && rank != root) {
err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
@ -607,28 +583,28 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
/* last node receives right buffer from the root */
else if (rank == (root+size-1)%size) {
err = MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype,
root, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
/* everyone else exchanges buffers */
else {
err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE, rank);
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
return (MPI_SUCCESS);
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
return (err);
}
@ -636,8 +612,8 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -655,21 +631,20 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
ompi_coll_base_bcast_intra_basic_linear (void *buff, int count,
struct ompi_datatype_t *datatype, int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, size, rank, err;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
ompi_request_t **preq, **reqs = data->mcct_reqs;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_basic_linear rank %d root %d", rank, root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_bcast_intra_basic_linear rank %d root %d", rank, root));
/* Non-root receive the data. */
@ -710,148 +685,11 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
/* Free the reqs */
ompi_coll_tuned_free_reqs(reqs, i);
ompi_coll_base_free_reqs(reqs, i);
/* All done */
return err;
}
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[BCAST] = coll_tuned_bcast_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_count",
"Number of bcast algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_bcast_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_bcast_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_bcast_algorithms", bcast_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm",
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_bcast_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_segmentsize",
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_segment_size);
coll_tuned_bcast_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_tree_fanout",
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_tree_fanout);
coll_tuned_bcast_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_chain_fanout",
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
data->user_forced[BCAST].algorithm));
switch (data->user_forced[BCAST].algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize,
data->user_forced[BCAST].chain_fanout );
case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize );
case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize );
case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize );
case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize );
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
} /* switch */
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module, segsize, faninout );
case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize );
case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize );
case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize );
case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize );
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
} /* switch */
return (MPI_ERR_ARG);
}

View file

@ -3,10 +3,10 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
@ -15,9 +15,9 @@
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -33,6 +33,7 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
/*
* The following file was created by configure. It contains extern
@ -49,10 +50,55 @@ static void coll_base_module_construct(mca_coll_base_module_t *m)
/* zero out all functions */
memset ((char *) m + sizeof (m->super), 0, sizeof (*m) - sizeof (m->super));
m->coll_module_disable = NULL;
m->base_data = NULL;
}
OBJ_CLASS_INSTANCE(mca_coll_base_module_t, opal_object_t,
coll_base_module_construct, NULL);
static void
coll_base_module_destruct(mca_coll_base_module_t *module)
{
mca_coll_base_comm_t* data = module->base_data;
if (NULL != data) {
if( NULL != data->mcct_reqs ) {
for( int i = 0; i < data->mcct_num_reqs; ++i ) {
if( MPI_REQUEST_NULL != data->mcct_reqs[i] )
ompi_request_free(&data->mcct_reqs[i]);
}
free(data->mcct_reqs);
data->mcct_reqs = NULL;
data->mcct_num_reqs = 0;
}
assert(0 == data->mcct_num_reqs);
/* free any cached information that has been allocated */
if (data->cached_ntree) { /* destroy general tree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_ntree);
}
if (data->cached_bintree) { /* destroy bintree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_bintree);
}
if (data->cached_bmtree) { /* destroy bmtree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_bmtree);
}
if (data->cached_in_order_bmtree) { /* destroy in-order bmtree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bmtree);
}
if (data->cached_chain) { /* destroy general chain if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_chain);
}
if (data->cached_pipeline) { /* destroy pipeline if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_pipeline);
}
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bintree);
}
free(data);
}
}
OBJ_CLASS_INSTANCE(mca_coll_base_module_t, opal_object_t,
coll_base_module_construct, coll_base_module_destruct);
MCA_BASE_FRAMEWORK_DECLARE(ompi, coll, "Collectives", NULL, NULL, NULL,
mca_coll_base_static_components, 0);

ompi/mca/coll/base/coll_base_functions.h (new file, 341 lines added)
View file

@ -0,0 +1,341 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_BASE_EXPORT_H
#define MCA_COLL_BASE_EXPORT_H
#include "ompi_config.h"
#include "ompi/mca/coll/base/base.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_base_topo.h"
/* some fixed value index vars to simplify certain operations */
typedef enum COLLTYPE {
ALLGATHER = 0, /* 0 */
ALLGATHERV, /* 1 */
ALLREDUCE, /* 2 */
ALLTOALL, /* 3 */
ALLTOALLV, /* 4 */
ALLTOALLW, /* 5 */
BARRIER, /* 6 */
BCAST, /* 7 */
EXSCAN, /* 8 */
GATHER, /* 9 */
GATHERV, /* 10 */
REDUCE, /* 11 */
REDUCESCATTER, /* 12 */
SCAN, /* 13 */
SCATTER, /* 14 */
SCATTERV, /* 15 */
COLLCOUNT /* 16 end counter keep it as last element */
} COLLTYPE_T;
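/* Illustration only: these indices are meant for per-collective arrays of
 * length COLLCOUNT, e.g. the tuned component's per-communicator bookkeeping
 * seen earlier in this commit:
 *
 *     data->user_forced[BCAST].algorithm
 *     ompi_coll_tuned_forced_max_algorithms[BARRIER]
 */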
/* defined arg lists to simplify auto-inclusion of user overriding decision functions */
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
/* end defined arg lists to simplify auto-inclusion of user overriding decision functions */
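/* Illustration only: with these macros a prototype such as
 *
 *     int ompi_coll_base_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
 *
 * expands to the full argument list
 *
 *     int ompi_coll_base_bcast_intra_chain(void *buff, int count,
 *                                          struct ompi_datatype_t *datatype, int root,
 *                                          struct ompi_communicator_t *comm,
 *                                          mca_coll_base_module_t *module,
 *                                          uint32_t segsize, int32_t chains);
 */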
BEGIN_C_DECLS
/* All Gather */
int ompi_coll_base_allgather_intra_bruck(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS);
/* All GatherV */
int ompi_coll_base_allgatherv_intra_bruck(ALLGATHERV_ARGS);
int ompi_coll_base_allgatherv_intra_ring(ALLGATHERV_ARGS);
int ompi_coll_base_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
int ompi_coll_base_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
int ompi_coll_base_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
/* All Reduce */
int ompi_coll_base_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
/* AlltoAll */
int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
int ompi_coll_base_alltoall_intra_bruck(ALLTOALL_ARGS);
int ompi_coll_base_alltoall_intra_basic_linear(ALLTOALL_ARGS);
int ompi_coll_base_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
int ompi_coll_base_alltoall_intra_two_procs(ALLTOALL_ARGS);
/* AlltoAllV */
int ompi_coll_base_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
int ompi_coll_base_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
/* AlltoAllW */
/* Barrier */
int ompi_coll_base_barrier_intra_doublering(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_recursivedoubling(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_bruck(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_two_procs(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_linear(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_tree(BARRIER_ARGS);
/* Bcast */
int ompi_coll_base_bcast_intra_basic_linear(BCAST_ARGS);
int ompi_coll_base_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
int ompi_coll_base_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
int ompi_coll_base_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
int ompi_coll_base_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
int ompi_coll_base_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
/* Exscan */
/* Gather */
int ompi_coll_base_gather_intra_basic_linear(GATHER_ARGS);
int ompi_coll_base_gather_intra_binomial(GATHER_ARGS);
int ompi_coll_base_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
/* GatherV */
/* Reduce */
int ompi_coll_base_reduce_intra_basic_linear(REDUCE_ARGS);
int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
/* Reduce_scatter */
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
/* Scan */
/* Scatter */
int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
/* ScatterV */
END_C_DECLS
#define COLL_BASE_UPDATE_BINTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_bintree) \
&& (coll_comm->cached_bintree_root == (ROOT)) ) ) { \
if( coll_comm->cached_bintree ) { /* destroy previous bintree if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_bintree) ); \
} \
coll_comm->cached_bintree = ompi_coll_base_topo_build_tree(2,(OMPI_COMM),(ROOT)); \
coll_comm->cached_bintree_root = (ROOT); \
} \
} while (0)
#define COLL_BASE_UPDATE_BMTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_bmtree) \
&& (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \
if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \
} \
coll_comm->cached_bmtree = ompi_coll_base_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \
coll_comm->cached_bmtree_root = (ROOT); \
} \
} while (0)
#define COLL_BASE_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_in_order_bmtree) \
&& (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \
if( coll_comm->cached_in_order_bmtree ) { /* destroy previous binomial if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \
} \
coll_comm->cached_in_order_bmtree = ompi_coll_base_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \
coll_comm->cached_in_order_bmtree_root = (ROOT); \
} \
} while (0)
#define COLL_BASE_UPDATE_PIPELINE( OMPI_COMM, BASE_MODULE, ROOT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_pipeline) \
&& (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \
if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \
} \
coll_comm->cached_pipeline = ompi_coll_base_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \
coll_comm->cached_pipeline_root = (ROOT); \
} \
} while (0)
#define COLL_BASE_UPDATE_CHAIN( OMPI_COMM, BASE_MODULE, ROOT, FANOUT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_chain) \
&& (coll_comm->cached_chain_root == (ROOT)) \
&& (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \
if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_chain) ); \
} \
coll_comm->cached_chain = ompi_coll_base_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \
coll_comm->cached_chain_root = (ROOT); \
coll_comm->cached_chain_fanout = (FANOUT); \
} \
} while (0)
#define COLL_BASE_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, BASE_MODULE ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !(coll_comm->cached_in_order_bintree) ) { \
/* In-order binary tree topology is defined by communicator size */ \
/* Thus, there is no need to destroy anything */ \
coll_comm->cached_in_order_bintree = \
ompi_coll_base_topo_build_in_order_bintree((OMPI_COMM)); \
} \
} while (0)
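/* Usage sketch (illustrative; it mirrors the bcast code earlier in this
 * commit rather than defining anything new): a collective refreshes the
 * cached topology for the current root and then reads it back from the
 * per-communicator data hanging off the module:
 *
 *     mca_coll_base_comm_t *data = module->base_data;
 *     COLL_BASE_UPDATE_BINTREE( comm, module, root );
 *     tree = data->cached_bintree;
 */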
/**
* This macro gives a generic way to compute the best count of
* the segment (i.e. the number of complete datatypes that
* can fit in the specified SEGSIZE). Beware, when this macro
* is called, the SEGCOUNT should be initialized to the count as
* expected by the collective call.
*/
#define COLL_BASE_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT) \
if( ((SEGSIZE) >= (TYPELNG)) && \
((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) { \
size_t residual; \
(SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG)); \
residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG); \
if( residual > ((TYPELNG) >> 1) ) \
(SEGCOUNT)++; \
} \
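/* Worked example (illustrative): with SEGSIZE = 1000 bytes and TYPELNG = 48
 * bytes, SEGCOUNT becomes 1000 / 48 = 20 with a residual of 1000 - 20*48 = 40
 * bytes; since 40 > 48/2, the count is rounded up to 21 elements (~1008 bytes)
 * per segment. */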
/**
* This macro gives a generic way to compute well-distributed block counts
* when the count and number of blocks are fixed.
* Macro returns "early-block" count, "late-block" count, and "split-index"
* which is the block at which we switch from "early-block" count to
* the "late-block" count.
* count = split_index * early_block_count +
* (block_count - split_index) * late_block_count
* We do not perform ANY error checks - make sure that the input values
* make sense (e.g. count > num_blocks).
*/
#define COLL_BASE_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX, \
EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS; \
SPLIT_INDEX = COUNT % NUM_BLOCKS; \
if (0 != SPLIT_INDEX) { \
EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1; \
} \
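/* Worked example (illustrative): COUNT = 10 and NUM_BLOCKS = 4 gives
 * EARLY_BLOCK_COUNT = 3, LATE_BLOCK_COUNT = 2 and SPLIT_INDEX = 2, i.e. two
 * early blocks of 3 followed by two late blocks of 2:
 * 2*3 + (4-2)*2 = 10. */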
/*
* Data structure for hanging data off the communicator
* i.e. per module instance
*/
struct mca_coll_base_comm_t {
opal_object_t super;
/* standard data for requests and PML usage */
/* Precreate space for requests.
* Note this does not affect basic,
* but in the wrong context it can confuse a debugger;
* this is controlled by an MCA param.
*/
ompi_request_t **mcct_reqs;
int mcct_num_reqs;
/*
* base topo information caching per communicator
*
* for each communicator we cache the topo information so we can
* reuse it without regenerating it; if we change the root [or fanout]
* we regenerate and recache this information
*/
/* general tree with n fan out */
ompi_coll_tree_t *cached_ntree;
int cached_ntree_root;
int cached_ntree_fanout;
/* binary tree */
ompi_coll_tree_t *cached_bintree;
int cached_bintree_root;
/* binomial tree */
ompi_coll_tree_t *cached_bmtree;
int cached_bmtree_root;
/* binomial tree */
ompi_coll_tree_t *cached_in_order_bmtree;
int cached_in_order_bmtree_root;
/* chained tree (fanout followed by pipelines) */
ompi_coll_tree_t *cached_chain;
int cached_chain_root;
int cached_chain_fanout;
/* pipeline */
ompi_coll_tree_t *cached_pipeline;
int cached_pipeline_root;
/* in-order binary tree (root of the in-order binary tree is rank 0) */
ompi_coll_tree_t *cached_in_order_bintree;
};
typedef struct mca_coll_base_comm_t mca_coll_base_comm_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t);
static inline void ompi_coll_base_free_reqs(ompi_request_t **reqs, int count)
{
int i;
for (i = 0; i < count; ++i)
ompi_request_free(&reqs[i]);
}
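/* Usage sketch (illustrative): the basic linear bcast earlier in this commit
 * drives this helper with the pre-allocated request array from the module
 * data once the outstanding sends have completed:
 *
 *     ompi_request_t **reqs = data->mcct_reqs;
 *     ...
 *     err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
 *     ompi_coll_base_free_reqs(reqs, i);
 */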
#endif /* MCA_COLL_BASE_EXPORT_H */

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,30 +30,14 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* gather algorithm variables */
static int coll_tuned_gather_algorithm_count = 3;
static int coll_tuned_gather_forced_algorithm = 0;
static int coll_tuned_gather_segment_size = 0;
static int coll_tuned_gather_tree_fanout;
static int coll_tuned_gather_chain_fanout;
/* valid values for coll_tuned_gather_forced_algorithm */
static mca_base_var_enum_value_t gather_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "binomial"},
{3, "linear_sync"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain,
* gather_intra_pipeline, segmentation? */
int
ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
ompi_coll_base_gather_intra_binomial(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -65,19 +49,19 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
char *ptmp = NULL, *tempbuf = NULL;
ompi_coll_tree_t* bmtree;
MPI_Status status;
MPI_Aint sextent, slb, strue_lb, strue_extent;
MPI_Aint sextent, slb, strue_lb, strue_extent;
MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_binomial rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_binomial rank %d", rank));
/* create the binomial tree */
COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
bmtree = data->cached_in_order_bmtree;
ompi_datatype_get_extent(sdtype, &slb, &sextent);
@ -112,7 +96,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
} else {
/* copy from rbuf to temp buffer */
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
(char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
@ -157,8 +141,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
mycount = size - vkid;
mycount *= rcount;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_binomial rank %d recv %d mycount = %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d",
rank, bmtree->tree_next[i], mycount));
err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype,
@ -172,8 +156,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
if (rank != root) {
/* all nodes except root send to parents */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n",
rank, bmtree->tree_prev, total_recv));
err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
@ -207,7 +191,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
if (NULL != tempbuf)
free(tempbuf);
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@ -220,11 +204,11 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
ompi_coll_base_gather_intra_linear_sync(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int first_segment_size)
@ -237,8 +221,8 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));
if (rank != root) {
/* Non-root processes:
@ -250,10 +234,10 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
ompi_datatype_type_size(sdtype, &typelng);
ompi_datatype_get_extent(sdtype, &lb, &extent);
first_segment_count = scount;
COLL_TUNED_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
first_segment_count );
ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root,
ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root,
MCA_COLL_BASE_TAG_GATHER,
comm, MPI_STATUS_IGNORE));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -263,15 +247,15 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
MCA_PML_BASE_SEND_STANDARD, comm));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
(scount - first_segment_count), sdtype,
ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
(scount - first_segment_count), sdtype,
root, MCA_COLL_BASE_TAG_GATHER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} else {
/* Root process,
/* Root process,
- For every non-root node:
- post irecv for the first segment of the message
- send zero byte message to signal node to send the message
@ -284,20 +268,20 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
ompi_request_t *first_segment_req;
reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*));
if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; }
ompi_datatype_type_size(rdtype, &typelng);
ompi_datatype_get_extent(rdtype, &lb, &extent);
first_segment_count = rcount;
COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
first_segment_count );
ptmp = (char *) rbuf;
for (i = 0; i < size; ++i) {
if (i == rank) {
if (i == rank) {
/* skip myself */
reqs[i] = MPI_REQUEST_NULL;
continue;
}
reqs[i] = MPI_REQUEST_NULL;
continue;
}
/* irecv for the first segment from i */
ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent;
@ -305,7 +289,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
MCA_COLL_BASE_TAG_GATHER, comm,
&first_segment_req));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* send sync message */
ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i,
MCA_COLL_BASE_TAG_GATHER,
@ -314,7 +298,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
/* irecv for the second segment */
ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent;
ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm,
&reqs[i]));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -327,11 +311,11 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
/* copy local data if necessary */
if (MPI_IN_PLACE != sbuf) {
ret = ompi_datatype_sndrcv(sbuf, scount, sdtype,
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
rcount, rdtype);
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
/* wait all second segments to complete */
ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE);
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -346,8 +330,8 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
if (NULL != reqs) {
free(reqs);
}
OPAL_OUTPUT (( ompi_coll_tuned_stream,
"ERROR_HNDL: node %d file %s line %d error %d\n",
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
"ERROR_HNDL: node %d file %s line %d error %d\n",
rank, __FILE__, line, ret ));
return ret;
}
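/*
 * Editor's sketch (not part of the commit): the synchronized linear gather
 * above boils down to the handshake below, shown with plain MPI point-to-point
 * calls instead of the internal MCA_PML_CALL interface. GATHER_TAG, the use of
 * MPI_INT, and the placement of the first-segment wait are illustrative
 * assumptions; the real code derives first_segment_count from a byte-based
 * segment size and uses MCA_COLL_BASE_TAG_GATHER.
 */
#include <mpi.h>
#include <stdlib.h>

#define GATHER_TAG 4242                 /* hypothetical tag */

static void gather_linear_sync_sketch(const int *sbuf, int scount,
                                      int *rbuf, int rcount, int root,
                                      MPI_Comm comm, int first_segment_count)
{
    int rank, size, i;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if (rank != root) {
        /* wait for the zero-byte "go" message, then send the two segments */
        MPI_Recv(NULL, 0, MPI_BYTE, root, GATHER_TAG, comm, MPI_STATUS_IGNORE);
        MPI_Send(sbuf, first_segment_count, MPI_INT, root, GATHER_TAG, comm);
        MPI_Send(sbuf + first_segment_count, scount - first_segment_count,
                 MPI_INT, root, GATHER_TAG, comm);
        return;
    }

    MPI_Request *reqs = calloc(size, sizeof(MPI_Request));   /* second segments */
    for (i = 0; i < size; ++i) {
        if (i == rank) { reqs[i] = MPI_REQUEST_NULL; continue; }
        MPI_Request first_req;
        int *slot = rbuf + (size_t)i * rcount;
        /* pre-post the first segment, release the sender, pre-post the rest */
        MPI_Irecv(slot, first_segment_count, MPI_INT, i, GATHER_TAG, comm, &first_req);
        MPI_Send(NULL, 0, MPI_BYTE, i, GATHER_TAG, comm);
        MPI_Irecv(slot + first_segment_count, rcount - first_segment_count,
                  MPI_INT, i, GATHER_TAG, comm, &reqs[i]);
        MPI_Wait(&first_req, MPI_STATUS_IGNORE);
    }
    for (i = 0; i < scount; ++i)          /* root's own contribution */
        rbuf[(size_t)rank * rcount + i] = sbuf[i];
    MPI_Waitall(size, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
}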
@ -355,13 +339,13 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_tuned implementations. Dec06.
* JPG following the examples from other coll_base implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
@ -373,7 +357,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
ompi_coll_base_gather_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -389,8 +373,8 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
/* Everyone but root sends data and returns. */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_basic_linear rank %d", rank));
if (rank != root) {
return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
@ -427,164 +411,3 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* module does not call this they call the forced_getvalues routine instead */
int
ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[GATHER] = coll_tuned_gather_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_count",
"Number of gather algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_gather_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_gather_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_gather_algorithms", gather_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm",
"Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 linear with synchronization.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_gather_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_segmentsize",
"Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_segment_size);
coll_tuned_gather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_tree_fanout",
"Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_tree_fanout);
coll_tuned_gather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_chain_fanout",
"Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_chain_fanout);
return (MPI_SUCCESS);
}
int
ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_forced selected algorithm %d",
data->user_forced[GATHER].algorithm));
switch (data->user_forced[GATHER].algorithm) {
case (0):
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (3):
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
data->user_forced[GATHER].segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[GATHER].algorithm,
ompi_coll_tuned_forced_max_algorithms[GATHER]));
return (MPI_ERR_ARG);
} /* switch */
}
int
ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (3):
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm,
ompi_coll_tuned_forced_max_algorithms[GATHER]));
return (MPI_ERR_ARG);
} /* switch */
}
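The do_forced and do_this entry points above are driven by the MCA parameters registered in ompi_coll_tuned_gather_intra_check_forced_init. Assuming the standard Open MPI MCA mechanism (and the coll_tuned_use_dynamic_rules switch, which is outside this diff), a run such as mpirun --mca coll_tuned_use_dynamic_rules 1 --mca coll_tuned_gather_algorithm 3 ./app would route MPI_Gather through the linear-with-synchronization variant, i.e. algorithm 3 in the enum registered above.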

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -31,28 +31,8 @@
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
/* reduce algorithm variables */
static int coll_tuned_reduce_algorithm_count = 6;
static int coll_tuned_reduce_forced_algorithm = 0;
static int coll_tuned_reduce_segment_size = 0;
static int coll_tuned_reduce_max_requests;
static int coll_tuned_reduce_tree_fanout;
static int coll_tuned_reduce_chain_fanout;
/* valid values for coll_tuned_reduce_forced_algorithm */
static mca_base_var_enum_value_t reduce_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "chain"},
{3, "pipeline"},
{4, "binary"},
{5, "binomial"},
{6, "in-order_binary"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
/**
* This is a generic implementation of the reduce protocol. It used the tree
@ -62,10 +42,10 @@ static mca_base_var_enum_value_t reduce_algorithms[] = {
* the number of datatype to the original count (original_count)
*
* Note that for non-commutative operations we cannot save memory copy
* for the first block: thus we must copy sendbuf to accumbuf on intermediate
* for the first block: thus we must copy sendbuf to accumbuf on intermediate
* to keep the optimized loop happy.
*/
int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
int ompi_coll_base_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
ompi_datatype_t* datatype, ompi_op_t* op,
int root, ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -90,60 +70,60 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
num_segments = (original_count + count_by_segment - 1) / count_by_segment;
segment_increment = (ptrdiff_t)count_by_segment * extent;
sendtmpbuf = (char*) sendbuf;
if( sendbuf == MPI_IN_PLACE ) {
sendtmpbuf = (char *)recvbuf;
sendtmpbuf = (char*) sendbuf;
if( sendbuf == MPI_IN_PLACE ) {
sendtmpbuf = (char *)recvbuf;
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d",
original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment),
(unsigned long)segment_increment, max_outstanding_reqs));
rank = ompi_comm_rank(comm);
/* non-leaf nodes - wait for children to send me data & forward up
/* non-leaf nodes - wait for children to send me data & forward up
(if needed) */
if( tree->tree_nextsize > 0 ) {
ptrdiff_t true_lower_bound, true_extent, real_segment_size;
ompi_datatype_get_true_extent( datatype, &true_lower_bound,
ompi_datatype_get_true_extent( datatype, &true_lower_bound,
&true_extent );
/* handle non existant recv buffer (i.e. its NULL) and
/* handle non existant recv buffer (i.e. its NULL) and
protect the recv buffer on non-root nodes */
accumbuf = (char*)recvbuf;
if( (NULL == accumbuf) || (root != rank) ) {
/* Allocate temporary accumulator buffer. */
accumbuf_free = (char*)malloc(true_extent +
accumbuf_free = (char*)malloc(true_extent +
(ptrdiff_t)(original_count - 1) * extent);
if (accumbuf_free == NULL) {
line = __LINE__; ret = -1; goto error_hndl;
if (accumbuf_free == NULL) {
line = __LINE__; ret = -1; goto error_hndl;
}
accumbuf = accumbuf_free - lower_bound;
}
}
/* If this is a non-commutative operation we must copy
sendbuf to the accumbuf, in order to simplify the loops */
if (!ompi_op_is_commute(op)) {
ompi_datatype_copy_content_same_ddt(datatype, original_count,
ompi_datatype_copy_content_same_ddt(datatype, original_count,
(char*)accumbuf,
(char*)sendtmpbuf);
}
/* Allocate two buffers for incoming segments */
real_segment_size = true_extent + (ptrdiff_t)(count_by_segment - 1) * extent;
inbuf_free[0] = (char*) malloc(real_segment_size);
if( inbuf_free[0] == NULL ) {
line = __LINE__; ret = -1; goto error_hndl;
if( inbuf_free[0] == NULL ) {
line = __LINE__; ret = -1; goto error_hndl;
}
inbuf[0] = inbuf_free[0] - lower_bound;
/* if there is chance to overlap communication -
allocate second buffer */
if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
inbuf_free[1] = (char*) malloc(real_segment_size);
if( inbuf_free[1] == NULL ) {
if( inbuf_free[1] == NULL ) {
line = __LINE__; ret = -1; goto error_hndl;
}
inbuf[1] = inbuf_free[1] - lower_bound;
}
}
/* reset input buffer index and receive count */
inbi = 0;
@ -166,14 +146,14 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
if( segindex < num_segments ) {
void* local_recvbuf = inbuf[inbi];
if( 0 == i ) {
/* for the first step (1st child per segment) and
* commutative operations we might be able to irecv
* directly into the accumulate buffer so that we can
* reduce(op) this with our sendbuf in one step as
* ompi_op_reduce only has two buffer pointers,
/* for the first step (1st child per segment) and
* commutative operations we might be able to irecv
* directly into the accumulate buffer so that we can
* reduce(op) this with our sendbuf in one step as
* ompi_op_reduce only has two buffer pointers,
* this avoids an extra memory copy.
*
* BUT if the operation is non-commutative or
* BUT if the operation is non-commutative or
* we are root and are USING MPI_IN_PLACE this is wrong!
*/
if( (ompi_op_is_commute(op)) &&
@ -183,34 +163,34 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
}
ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype,
tree->tree_next[i],
MCA_COLL_BASE_TAG_REDUCE, comm,
tree->tree_next[i],
MCA_COLL_BASE_TAG_REDUCE, comm,
&reqs[inbi]));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;}
}
/* wait for previous req to complete, if any.
if there are no requests reqs[inbi ^1] will be
if there are no requests reqs[inbi ^1] will be
MPI_REQUEST_NULL. */
/* wait on data from last child for previous segment */
ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1],
ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1],
MPI_STATUSES_IGNORE );
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
local_op_buffer = inbuf[inbi ^ 1];
if( i > 0 ) {
/* our first operation is to combine our own [sendbuf] data
* with the data we recvd from down stream (but only
* the operation is commutative and if we are not root and
/* our first operation is to combine our own [sendbuf] data
* with the data we recvd from down stream (but only
* the operation is commutative and if we are not root and
* not using MPI_IN_PLACE)
*/
if( 1 == i ) {
if( (ompi_op_is_commute(op)) &&
if( (ompi_op_is_commute(op)) &&
!((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment;
}
}
/* apply operation */
ompi_op_reduce(op, local_op_buffer,
accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
ompi_op_reduce(op, local_op_buffer,
accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
recvcount, datatype );
} else if ( segindex > 0 ) {
void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
@ -220,25 +200,25 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
}
}
ompi_op_reduce(op, local_op_buffer, accumulator, prevcount,
ompi_op_reduce(op, local_op_buffer, accumulator, prevcount,
datatype );
/* all reduced on available data this step (i) complete,
/* all reduced on available data this step (i) complete,
* pass to the next process unless you are the root.
*/
if (rank != tree->tree_root) {
/* send combined/accumulated data to parent */
ret = MCA_PML_CALL( send( accumulator, prevcount,
datatype, tree->tree_prev,
ret = MCA_PML_CALL( send( accumulator, prevcount,
datatype, tree->tree_prev,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_STANDARD,
MCA_PML_BASE_SEND_STANDARD,
comm) );
if (ret != MPI_SUCCESS) {
line = __LINE__; goto error_hndl;
if (ret != MPI_SUCCESS) {
line = __LINE__; goto error_hndl;
}
}
/* we stop when segindex = number of segments
/* we stop when segindex = number of segments
(i.e. we do num_segment+1 steps for pipelining */
if (segindex == num_segments) break;
}
@ -254,33 +234,33 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
if( accumbuf_free != NULL ) free(accumbuf_free);
}
/* leaf nodes
Depending on the value of max_outstanding_reqs and
/* leaf nodes
Depending on the value of max_outstanding_reqs and
the number of segments we have two options:
- send all segments using blocking send to the parent, or
- avoid overflooding the parent nodes by limiting the number of
- avoid overflooding the parent nodes by limiting the number of
outstanding requests to max_oustanding_reqs.
TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size
for the current communication, synchronization should be used only
TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size
for the current communication, synchronization should be used only
when the message/segment size is smaller than the eager size.
*/
else {
/* If the number of segments is less than a maximum number of outstanding
requests or there is no limit on the maximum number of outstanding
requests or there is no limit on the maximum number of outstanding
requests, we send data to the parent using blocking send */
if ((0 == max_outstanding_reqs) ||
if ((0 == max_outstanding_reqs) ||
(num_segments <= max_outstanding_reqs)) {
segindex = 0;
while ( original_count > 0) {
if (original_count < count_by_segment) {
count_by_segment = original_count;
}
ret = MCA_PML_CALL( send((char*)sendbuf +
ret = MCA_PML_CALL( send((char*)sendbuf +
(ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
count_by_segment, datatype,
tree->tree_prev,
tree->tree_prev,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_STANDARD,
comm) );
@ -310,7 +290,7 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
ret = MCA_PML_CALL( isend((char*)sendbuf +
(ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
count_by_segment, datatype,
tree->tree_prev,
tree->tree_prev,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
&sreq[segindex]) );
@ -328,12 +308,12 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
if( original_count < count_by_segment ) {
count_by_segment = original_count;
}
ret = MCA_PML_CALL( isend((char*)sendbuf +
(ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
count_by_segment, datatype,
tree->tree_prev,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
ret = MCA_PML_CALL( isend((char*)sendbuf +
(ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
count_by_segment, datatype,
tree->tree_prev,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
&sreq[creq]) );
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
creq = (creq + 1) % max_outstanding_reqs;
@ -342,7 +322,7 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
}
/* Wait on the remaining request to complete */
ret = ompi_request_wait_all( max_outstanding_reqs, sreq,
ret = ompi_request_wait_all( max_outstanding_reqs, sreq,
MPI_STATUSES_IGNORE );
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -353,8 +333,8 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
return OMPI_SUCCESS;
error_hndl: /* error handler */
OPAL_OUTPUT (( ompi_coll_tuned_stream,
"ERROR_HNDL: node %d file %s line %d error %d\n",
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
"ERROR_HNDL: node %d file %s line %d error %d\n",
rank, __FILE__, line, ret ));
if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
@ -369,9 +349,9 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
meaning that at least one datatype must fit in the segment !
*/
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
ompi_datatype_t* datatype,
ompi_op_t* op, int root,
int ompi_coll_base_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
ompi_datatype_t* datatype,
ompi_op_t* op, int root,
ompi_communicator_t* comm,
mca_coll_base_module_t *module,
uint32_t segsize, int fanout,
@ -379,27 +359,27 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize));
COLL_TUNED_UPDATE_CHAIN( comm, tuned_module, root, fanout );
COLL_BASE_UPDATE_CHAIN( comm, base_module, root, fanout );
/**
* Determine number of segments and number of elements
* sent per operation
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
op, root, comm, module,
data->cached_chain,
data->cached_chain,
segcount, max_outstanding_reqs );
}
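/*
 * Editor's sketch (illustrative, not part of the commit): the
 * COLL_BASE_COMPUTED_SEGCOUNT macro used by the wrappers above turns a
 * segment size in bytes into a per-segment element count, roughly as below.
 * Treat this as an assumption about the macro's intent, not its exact text.
 */
static int computed_segcount(size_t segsize, size_t typelng, int count)
{
    int segcount = count;   /* segsize == 0 (no segmentation) keeps the whole message */
    if (segsize >= typelng && segsize < typelng * (size_t)count) {
        segcount = (int)(segsize / typelng);
        /* round up when the leftover bytes exceed half an element */
        if (segsize - (size_t)segcount * typelng > typelng / 2) {
            segcount++;
        }
    }
    return segcount;        /* always 0 < segcount <= count */
}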
int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
int ompi_coll_base_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
int count, ompi_datatype_t* datatype,
ompi_op_t* op, int root,
ompi_communicator_t* comm,
@ -409,101 +389,101 @@ int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_pipeline rank %d ss %5d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_pipeline rank %d ss %5d",
ompi_comm_rank(comm), segsize));
COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );
COLL_BASE_UPDATE_PIPELINE( comm, base_module, root );
/**
* Determine number of segments and number of elements
* sent per operation
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
op, root, comm, module,
data->cached_pipeline,
data->cached_pipeline,
segcount, max_outstanding_reqs );
}
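/*
 * Editor's sketch (illustrative only): the leaf-node flow control described in
 * ompi_coll_base_reduce_generic above, reduced to its core. A fixed pool of
 * max_reqs requests is recycled so at most max_reqs segment sends are in
 * flight at once, mirroring the max_outstanding_reqs window of synchronous
 * isends (this sketch assumes max_reqs >= 1; the 0 == "no limit" case uses
 * plain blocking sends). Plain MPI calls and REDUCE_TAG are assumptions
 * standing in for the internal PML interface and the real tag.
 */
#include <mpi.h>
#include <stdlib.h>

#define REDUCE_TAG 4243                 /* hypothetical tag */

static void leaf_segmented_send_sketch(const char *sendbuf, int num_segments,
                                       int seg_bytes, int parent, int max_reqs,
                                       MPI_Comm comm)
{
    MPI_Request *reqs = malloc((size_t)max_reqs * sizeof(MPI_Request));
    int s, slot;

    /* fill the window */
    for (s = 0; s < max_reqs && s < num_segments; ++s) {
        MPI_Issend(sendbuf + (size_t)s * seg_bytes, seg_bytes, MPI_BYTE,
                   parent, REDUCE_TAG, comm, &reqs[s]);
    }
    /* steady state: as one send completes, post the next one in its slot */
    for (slot = 0; s < num_segments; ++s, slot = (slot + 1) % max_reqs) {
        MPI_Wait(&reqs[slot], MPI_STATUS_IGNORE);
        MPI_Issend(sendbuf + (size_t)s * seg_bytes, seg_bytes, MPI_BYTE,
                   parent, REDUCE_TAG, comm, &reqs[slot]);
    }
    /* drain whatever is still outstanding */
    MPI_Waitall(s < max_reqs ? s : max_reqs, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
}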
int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf,
int ompi_coll_base_reduce_intra_binary( void *sendbuf, void *recvbuf,
int count, ompi_datatype_t* datatype,
ompi_op_t* op, int root,
ompi_communicator_t* comm,
ompi_communicator_t* comm,
mca_coll_base_module_t *module,
uint32_t segsize,
uint32_t segsize,
int max_outstanding_reqs )
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binary rank %d ss %5d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_binary rank %d ss %5d",
ompi_comm_rank(comm), segsize));
COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_BINTREE( comm, base_module, root );
/**
* Determine number of segments and number of elements
* sent per operation
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
op, root, comm, module,
data->cached_bintree,
data->cached_bintree,
segcount, max_outstanding_reqs );
}
int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
int ompi_coll_base_reduce_intra_binomial( void *sendbuf, void *recvbuf,
int count, ompi_datatype_t* datatype,
ompi_op_t* op, int root,
ompi_communicator_t* comm,
ompi_communicator_t* comm,
mca_coll_base_module_t *module,
uint32_t segsize,
int max_outstanding_reqs )
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binomial rank %d ss %5d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_binomial rank %d ss %5d",
ompi_comm_rank(comm), segsize));
COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
/**
* Determine number of segments and number of elements
* sent per operation
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
op, root, comm, module,
data->cached_in_order_bmtree,
data->cached_in_order_bmtree,
segcount, max_outstanding_reqs );
}
/*
* reduce_intra_in_order_binary
*
* reduce_intra_in_order_binary
*
* Function: Logarithmic reduce operation for non-commutative operations.
* Accepts: same as MPI_Reduce()
* Returns: MPI_SUCCESS or error code
*/
int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
int count,
int ompi_coll_base_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
int count,
ompi_datatype_t* datatype,
ompi_op_t* op, int root,
ompi_communicator_t* comm,
ompi_communicator_t* comm,
mca_coll_base_module_t *module,
uint32_t segsize,
int max_outstanding_reqs )
@ -511,28 +491,28 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
int ret, rank, size, io_root, segcount = count;
void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_in_order_binary rank %d ss %5d",
rank, segsize));
COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module );
COLL_BASE_UPDATE_IN_ORDER_BINTREE( comm, base_module );
/**
* Determine number of segments and number of elements
* sent per operation
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
/* An in-order binary tree must use root (size-1) to preserve the order of
operations. Thus, if root is not rank (size - 1), then we must handle
1. MPI_IN_PLACE option on real root, and
1. MPI_IN_PLACE option on real root, and
2. we must allocate temporary recvbuf on rank (size - 1).
Note that generic function must be careful not to switch order of
Note that generic function must be careful not to switch order of
operations for non-commutative ops.
*/
io_root = size - 1;
@ -541,7 +521,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
if (io_root != root) {
ptrdiff_t tlb, text, lb, ext;
char *tmpbuf = NULL;
ompi_datatype_get_extent(datatype, &lb, &ext);
ompi_datatype_get_true_extent(datatype, &tlb, &text);
@ -550,7 +530,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
if (NULL == tmpbuf) {
return MPI_ERR_INTERN;
}
ompi_datatype_copy_content_same_ddt(datatype, count,
ompi_datatype_copy_content_same_ddt(datatype, count,
(char*)tmpbuf,
(char*)recvbuf);
use_this_sendbuf = tmpbuf;
@ -564,9 +544,9 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
}
/* Use generic reduce with in-order binary tree topology and io_root */
ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
op, io_root, comm, module,
data->cached_in_order_bintree,
ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
op, io_root, comm, module,
data->cached_in_order_bintree,
segcount, max_outstanding_reqs );
if (MPI_SUCCESS != ret) { return ret; }
@ -581,11 +561,11 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
if (MPI_IN_PLACE == sendbuf) {
free(use_this_sendbuf);
}
} else if (io_root == rank) {
/* Send result from use_this_recvbuf to root */
ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
MCA_COLL_BASE_TAG_REDUCE,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { return ret; }
free(use_this_recvbuf);
@ -598,8 +578,8 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -617,12 +597,12 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
ompi_coll_base_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, err, size;
ptrdiff_t true_lb, true_extent, lb, extent;
@ -634,7 +614,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_basic_linear rank %d", rank));
/* If not root, send data to the root. */
@ -645,7 +625,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
return err;
}
/* see discussion in ompi_coll_basic_reduce_lin_intra about
/* see discussion in ompi_coll_basic_reduce_lin_intra about
extent and true extent */
/* for reducing buffer allocation lengths.... */
@ -673,7 +653,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
/* Initialize the receive buffer. */
if (rank == (size - 1)) {
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf,
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf,
(char*)sbuf);
} else {
err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
@ -705,7 +685,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
}
if (NULL != inplace_temp) {
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf,
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf,
inplace_temp);
} else {
err = MPI_SUCCESS;
@ -724,185 +704,3 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
}
/* copied function (with appropriate renaming) ends here */
/**
* The following are used by dynamic and forced rules
*
* publish details of each algorithm and if its forced/fixed/locked in
* as you add methods/algorithms you must update this and the query/map routines
*
* this routine is called by the component only
* this makes sure that the mca parameters are set to their initial values and
* perms module does not call this they call the forced_getvalues routine
* instead.
*/
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t*new_enum;
ompi_coll_tuned_forced_max_algorithms[REDUCE] = coll_tuned_reduce_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_count",
"Number of reduce algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_reduce_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_reduce_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_reduce_algorithms", reduce_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm",
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_reduce_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_segmentsize",
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_segment_size);
coll_tuned_reduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_tree_fanout",
"Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_tree_fanout);
coll_tuned_reduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_chain_fanout",
"Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_chain_fanout);
coll_tuned_reduce_max_requests = 0; /* no limit for reduce by default */
mca_param_indices->max_requests_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_max_requests",
"Maximum number of outstanding send requests on leaf nodes. 0 means no limit.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_max_requests);
if (mca_param_indices->max_requests_param_index < 0) {
return mca_param_indices->max_requests_param_index;
}
if (coll_tuned_reduce_max_requests < 0) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Maximum outstanding requests must be positive number or 0. Initializing to 0 (no limit).\n" );
}
coll_tuned_reduce_max_requests = 0;
}
return (MPI_SUCCESS);
}
int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
const int segsize = data->user_forced[REDUCE].segsize;
const int chain_fanout = data->user_forced[REDUCE].chain_fanout;
const int max_requests = data->user_forced[REDUCE].max_requests;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
data->user_forced[REDUCE].algorithm));
switch (data->user_forced[REDUCE].algorithm) {
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype,
op, root, comm, module);
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype,
op, root, comm, module);
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, chain_fanout, max_requests);
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout,
int segsize, int max_requests )
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype,
op, root, comm, module);
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype,
op, root, comm, module);
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, faninout, max_requests);
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -32,37 +32,21 @@
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
/* reduce_scatter algorithm variables */
static int coll_tuned_reduce_scatter_algorithm_count = 2;
static int coll_tuned_reduce_scatter_forced_algorithm = 0;
static int coll_tuned_reduce_scatter_segment_size = 0;
static int coll_tuned_reduce_scatter_tree_fanout;
static int coll_tuned_reduce_scatter_chain_fanout;
/* valid values for coll_tuned_reduce_scatter_forced_algorithm */
static mca_base_var_enum_value_t reduce_scatter_algorithms[] = {
{0, "ignore"},
{1, "non-overlapping"},
{2, "recursive_halfing"},
{3, "ring"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
/*******************************************************************************
* ompi_coll_tuned_reduce_scatter_intra_nonoverlapping
* ompi_coll_base_reduce_scatter_intra_nonoverlapping
*
* This function just calls a reduce to rank 0, followed by an
* This function just calls a reduce to rank 0, followed by an
* appropriate scatterv call.
*/
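/*
 * Editor's sketch (illustrative, not part of the commit): the
 * "non-overlapping" strategy documented above is just a composition of the
 * two standard collectives, shown here with the public MPI API and without
 * the MPI_IN_PLACE and true-extent handling the real function adds.
 */
#include <mpi.h>
#include <stdlib.h>

static int reduce_scatter_nonoverlapping_sketch(const void *sbuf, void *rbuf,
                                                const int *rcounts,
                                                MPI_Datatype dtype, MPI_Op op,
                                                MPI_Comm comm)
{
    int i, rank, size, total = 0, err;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    int *displs = malloc((size_t)size * sizeof(int));
    void *tmp = NULL;
    for (i = 0; i < size; ++i) { displs[i] = total; total += rcounts[i]; }

    if (0 == rank) {                      /* root collects the full reduction */
        MPI_Aint lb, extent;
        MPI_Type_get_extent(dtype, &lb, &extent);
        tmp = malloc((size_t)total * (size_t)extent);
    }
    err = MPI_Reduce(sbuf, tmp, total, dtype, op, 0, comm);
    if (MPI_SUCCESS == err) {             /* then hands each rank its block */
        err = MPI_Scatterv(tmp, rcounts, displs, dtype,
                           rbuf, rcounts[rank], dtype, 0, comm);
    }
    free(tmp);
    free(displs);
    return err;
}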
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
mca_coll_base_module_t *module)
{
int err, i, rank, size, total_count, *displs = NULL;
const int root = 0;
@ -71,7 +55,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_nonoverlapping, rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_nonoverlapping, rank %d", rank));
for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; }
@ -80,7 +64,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
if (MPI_IN_PLACE == sbuf) {
/* rbuf on root (0) is big enough to hold whole data */
if (root == rank) {
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
} else {
err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count,
@ -91,13 +75,13 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
/* We must allocate temporary receive buffer on root to ensure that
rbuf is big enough */
ptrdiff_t lb, extent, tlb, textent;
ompi_datatype_get_extent(dtype, &lb, &extent);
ompi_datatype_get_true_extent(dtype, &tlb, &textent);
tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent);
tmprbuf = tmprbuf_free - lb;
}
}
err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count,
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
}
@ -105,7 +89,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
if (NULL != tmprbuf_free) free(tmprbuf_free);
return err;
}
displs = (int*) malloc(size * sizeof(int));
displs[0] = 0;
for (i = 1; i < size; i++) {
@ -122,7 +106,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
/*
* Recursive-halving function is (*mostly*) copied from the BASIC coll module.
* I have removed the part which handles "large" message sizes
* I have removed the part which handles "large" message sizes
* (non-overlapping version of reduce_Scatter).
*/
@ -131,15 +115,15 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
/*
* reduce_scatter_intra_basic_recursivehalving
*
* Function: - reduce scatter implementation using recursive-halving
* Function: - reduce scatter implementation using recursive-halving
* algorithm
* Accepts: - same as MPI_Reduce_scatter()
* Returns: - MPI_SUCCESS or error code
* Limitation: - Works only for commutative operations.
*/
int
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
void *rbuf,
ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
void *rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
@ -151,12 +135,12 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
char *recv_buf = NULL, *recv_buf_free = NULL;
char *result_buf = NULL, *result_buf_free = NULL;
/* Initialize */
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_basic_recursivehalving, rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_basic_recursivehalving, rank %d", rank));
/* Find displacements and the like */
disps = (int*) malloc(sizeof(int) * size);
@ -191,43 +175,43 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
err = OMPI_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
/* allocate temporary buffer for results */
result_buf_free = (char*) malloc(buf_size);
result_buf = result_buf_free - true_lb;
/* copy local buffer into the temporary results */
err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype);
if (OMPI_SUCCESS != err) goto cleanup;
/* figure out power of two mapping: grow until larger than
comm size, then go back one, to get the largest power of
two less than comm size */
tmp_size = opal_next_poweroftwo (size);
tmp_size = opal_next_poweroftwo (size);
tmp_size >>= 1;
remain = size - tmp_size;
/* If comm size is not a power of two, have the first "remain"
procs with an even rank send to rank + 1, leaving a power of
two procs to do the rest of the algorithm */
if (rank < 2 * remain) {
if ((rank & 1) == 0) {
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
MCA_PML_BASE_SEND_STANDARD,
comm));
if (OMPI_SUCCESS != err) goto cleanup;
/* we don't participate from here on out */
tmp_rank = -1;
} else {
err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
comm, MPI_STATUS_IGNORE));
/* integrate their results into our temp results */
ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
/* adjust rank to be the bottom "remain" ranks */
tmp_rank = rank / 2;
}
@ -236,13 +220,13 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
remain" ranks dropped out */
tmp_rank = rank - remain;
}
/* For ranks not kicked out by the above code, perform the
recursive halving */
if (tmp_rank >= 0) {
int *tmp_disps = NULL, *tmp_rcounts = NULL;
int mask, send_index, recv_index, last_index;
/* recalculate disps and rcounts to account for the
special "remainder" processes that are no longer doing
anything */
@ -317,11 +301,11 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
free(tmp_rcounts);
free(tmp_disps);
goto cleanup;
}
}
}
if (send_count > 0) {
err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
send_count, dtype, peer,
send_count, dtype, peer,
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
MCA_PML_BASE_SEND_STANDARD,
comm));
@ -329,7 +313,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
free(tmp_rcounts);
free(tmp_disps);
goto cleanup;
}
}
}
/* if we received something on this step, push it into
@ -340,10 +324,10 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
free(tmp_rcounts);
free(tmp_disps);
goto cleanup;
}
}
ompi_op_reduce(op,
recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
ompi_op_reduce(op,
recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
recv_count, dtype);
}
@ -357,13 +341,13 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
/* copy local results from results buffer into real receive buffer */
if (0 != rcounts[rank]) {
err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
rcounts[rank], dtype,
rcounts[rank], dtype,
rbuf, rcounts[rank], dtype);
if (OMPI_SUCCESS != err) {
free(tmp_rcounts);
free(tmp_disps);
goto cleanup;
}
}
}
free(tmp_rcounts);
@ -389,7 +373,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
comm));
if (OMPI_SUCCESS != err) goto cleanup;
}
}
}
}
cleanup:
@ -404,18 +388,18 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
/*
* ompi_coll_tuned_reduce_scatter_intra_ring
* ompi_coll_base_reduce_scatter_intra_ring
*
* Function: Ring algorithm for reduce_scatter operation
* Accepts: Same as MPI_Reduce_scatter()
* Returns: MPI_SUCCESS or error code
*
* Description: Implements ring algorithm for reduce_scatter:
* the block sizes defined in rcounts are exchanged and
* Description: Implements ring algorithm for reduce_scatter:
* the block sizes defined in rcounts are exchanged and
* updated until they reach proper destination.
* Algorithm requires 2 * max(rcounts) extra buffering
*
* Limitations: The algorithm DOES NOT preserve order of operations so it
* Limitations: The algorithm DOES NOT preserve order of operations so it
* can be used only for commutative operations.
* Example on 5 nodes:
* Initial state
@ -427,7 +411,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
* [04] -> [14] [24] [34] [44]
*
* COMPUTATION PHASE
* Step 0: rank r sends block (r-1) to rank (r+1) and
* Step 0: rank r sends block (r-1) to rank (r+1) and
* receives block (r+1) from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [10] [10+20] -> [30] [40]
@ -435,12 +419,12 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
* -> [02] [12] [22] [32] [32+42] -->..
* [43+03] -> [13] [23] [33] [43]
* [04] [04+14] -> [24] [34] [44]
*
*
* Step 1:
* # 0 1 2 3 4
* [00] [10] [10+20] [10+20+30] -> [40]
* -> [01] [11] [21] [21+31] [21+31+41] ->
* [32+42+02] -> [12] [22] [32] [32+42]
* [32+42+02] -> [12] [22] [32] [32+42]
* [03] [43+03+13] -> [23] [33] [43]
* [04] [04+14] [04+14+24] -> [34] [44]
*
@ -448,7 +432,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
* # 0 1 2 3 4
* -> [00] [10] [10+20] [10+20+30] [10+20+30+40] ->
* [21+31+41+01]-> [11] [21] [21+31] [21+31+41]
* [32+42+02] [32+42+02+12]-> [22] [32] [32+42]
* [32+42+02] [32+42+02+12]-> [22] [32] [32+42]
* [03] [43+03+13] [43+03+13+23]-> [33] [43]
* [04] [04+14] [04+14+24] [04+14+24+34] -> [44]
*
@ -456,14 +440,14 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
* # 0 1 2 3 4
* [10+20+30+40+00] [10] [10+20] [10+20+30] [10+20+30+40]
* [21+31+41+01] [21+31+41+01+11] [21] [21+31] [21+31+41]
* [32+42+02] [32+42+02+12] [32+42+02+12+22] [32] [32+42]
* [32+42+02] [32+42+02+12] [32+42+02+12+22] [32] [32+42]
* [03] [43+03+13] [43+03+13+23] [43+03+13+23+33] [43]
* [04] [04+14] [04+14+24] [04+14+24+34] [04+14+24+34+44]
* DONE :)
*
*/
int
ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
int
ompi_coll_base_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
@ -480,11 +464,11 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:reduce_scatter_intra_ring rank %d, size %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:reduce_scatter_intra_ring rank %d, size %d",
rank, size));
/* Determine the maximum number of elements per node,
/* Determine the maximum number of elements per node,
corresponding block size, and displacements array.
*/
displs = (int*) malloc(size * sizeof(int));
@ -492,16 +476,16 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
displs[0] = 0;
total_count = rcounts[0];
max_block_count = rcounts[0];
for (i = 1; i < size; i++) {
for (i = 1; i < size; i++) {
displs[i] = total_count;
total_count += rcounts[i];
if (max_block_count < rcounts[i]) max_block_count = rcounts[i];
}
/* Special case for size == 1 */
if (1 == size) {
if (MPI_IN_PLACE != sbuf) {
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
(char*)rbuf, (char*)sbuf);
if (ret < 0) { line = __LINE__; goto error_hndl; }
}
@ -541,13 +525,13 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
sbuf = rbuf;
}
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
accumbuf, (char*)sbuf);
if (ret < 0) { line = __LINE__; goto error_hndl; }
/* Computation loop */
/*
/*
For each of the remote nodes:
- post irecv for block (r-2) from (r-1) with wrap around
- send block (r-1) to (r+1)
@ -568,7 +552,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
inbi = 0;
/* Initialize first receive from the neighbor on the left */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
&reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
tmpsend = accumbuf + (ptrdiff_t)displs[recv_from] * extent;
@ -579,25 +563,25 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
for (k = 2; k < size; k++) {
const int prevblock = (rank + size - k) % size;
inbi = inbi ^ 0x1;
/* Post irecv for the current block */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
&reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Wait on previous block to arrive */
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on previous block: result goes to rbuf
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
*/
tmprecv = accumbuf + (ptrdiff_t)displs[prevblock] * extent;
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, rcounts[prevblock], dtype);
/* send previous block to send_to */
ret = MCA_PML_CALL(send(tmprecv, rcounts[prevblock], dtype, send_to,
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
@ -613,7 +597,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
rbuf[rank] = inbuf[inbi] (op) rbuf[rank] */
tmprecv = accumbuf + (ptrdiff_t)displs[rank] * extent;
ompi_op_reduce(op, inbuf[inbi], tmprecv, rcounts[rank], dtype);
/* Copy result from tmprecv to rbuf */
ret = ompi_datatype_copy_content_same_ddt(dtype, rcounts[rank], (char *)rbuf, tmprecv);
if (ret < 0) { line = __LINE__; goto error_hndl; }
@ -626,7 +610,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
__FILE__, line, rank, ret));
if (NULL != displs) free(displs);
if (NULL != accumbuf_free) free(accumbuf_free);
@ -634,139 +618,3 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
if (NULL != inbuf_free[1]) free(inbuf_free[1]);
return ret;
}
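For reference, the ring pattern sketched in the comments above can be condensed into a minimal, non-pipelined version using plain MPI point-to-point calls and integer summation. This is only an illustration (the function name, MPI_Sendrecv exchange, and int/sum specialization are assumptions); the routine above instead double-buffers two receive buffers so the irecv for the next block overlaps the reduction of the previous one.

#include <mpi.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative only: each rank ends up with the fully reduced block `rank`. */
int ring_reduce_scatter_sketch(const int *sbuf, int *rbuf, const int *rcounts,
                               MPI_Comm comm)
{
    int rank, size, total = 0, max_block = 0;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    int *displs = malloc(size * sizeof(int));
    for (int i = 0; i < size; i++) {           /* same bookkeeping as above */
        displs[i] = total;
        total += rcounts[i];
        if (rcounts[i] > max_block) max_block = rcounts[i];
    }

    int *accum = malloc(total * sizeof(int));  /* running partial results */
    int *inbuf = malloc(max_block * sizeof(int));
    memcpy(accum, sbuf, total * sizeof(int));

    const int send_to   = (rank + 1) % size;
    const int recv_from = (rank + size - 1) % size;

    /* Step k: forward block (rank-k) to the right neighbor, fold the arriving
       block (rank-k-1) from the left neighbor into the accumulator. */
    for (int k = 1; k < size; k++) {
        const int sb = (rank - k + size) % size;
        const int rb = (rank - k - 1 + 2 * size) % size;
        MPI_Sendrecv(accum + displs[sb], rcounts[sb], MPI_INT, send_to, 0,
                     inbuf, rcounts[rb], MPI_INT, recv_from, 0,
                     comm, MPI_STATUS_IGNORE);
        for (int j = 0; j < rcounts[rb]; j++)
            accum[displs[rb] + j] += inbuf[j];
    }

    memcpy(rbuf, accum + displs[rank], rcounts[rank] * sizeof(int));
    free(inbuf); free(accum); free(displs);
    return MPI_SUCCESS;
}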
/**
* The following are used by dynamic and forced rules
*
* publish details of each algorithm and if its forced/fixed/locked in
* as you add methods/algorithms you must update this and the query/map routines
*
* this routine is called by the component only
* this makes sure that the mca parameters are set to their initial values and
* perms module does not call this they call the forced_getvalues routine
* instead
*/
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER] = coll_tuned_reduce_scatter_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_count",
"Number of reduce_scatter algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_reduce_scatter_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_reduce_scatter_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_reduce_scatter_algorithms", reduce_scatter_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm",
"Which reduce reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_reduce_scatter_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_segmentsize",
"Segment size in bytes used by default for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_segment_size);
coll_tuned_reduce_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_tree_fanout",
"Fanout for n-tree used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_tree_fanout);
coll_tuned_reduce_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_chain_fanout",
"Fanout for chains used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d",
data->user_forced[REDUCESCATTER].algorithm));
switch (data->user_forced[REDUCESCATTER].algorithm) {
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
dtype, op, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_reduce_scatter_intra_do_this(void *sbuf, void* rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
dtype, op, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -28,27 +28,12 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* scatter algorithm variables */
static int coll_tuned_scatter_algorithm_count = 2;
static int coll_tuned_scatter_forced_algorithm = 0;
static int coll_tuned_scatter_segment_size = 0;
static int coll_tuned_scatter_tree_fanout;
static int coll_tuned_scatter_chain_fanout;
/* valid values for coll_tuned_scatter_forced_algorithm */
static mca_base_var_enum_value_t scatter_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "binomial"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
int
ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
ompi_coll_base_scatter_intra_binomial(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -60,19 +45,19 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
char *ptmp, *tempbuf = NULL;
ompi_coll_tree_t* bmtree;
MPI_Status status;
MPI_Aint sextent, slb, strue_lb, strue_extent;
MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_scatter_intra_binomial rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_scatter_intra_binomial rank %d", rank));
/* create the binomial tree */
COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
bmtree = data->cached_in_order_bmtree;
ompi_datatype_get_extent(sdtype, &slb, &sextent);
@ -167,7 +152,7 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
total_send += mycount;
}
if (NULL != tempbuf)
free(tempbuf);
} else {
/* recv from parent on leaf nodes */
@ -182,7 +167,7 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
if (NULL != tempbuf)
free(tempbuf);
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@ -190,13 +175,13 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_tuned implementations. Dec06.
* JPG following the examples from other coll_base implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
@ -208,7 +193,7 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount,
ompi_coll_base_scatter_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -269,153 +254,3 @@ ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount,
/* copied function (with appropriate renaming) ends here */
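As a point of comparison with the binomial version above, the "basic linear" pattern the preceding comment refers to amounts to the root sending one block per peer. A minimal sketch with plain MPI calls and contiguous int data follows (the name and simplifications are illustrative, not the copied OMPI routine):

#include <mpi.h>
#include <string.h>

int scatter_basic_linear_sketch(const int *sbuf, int scount,
                                int *rbuf, int rcount,
                                int root, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if (rank != root)                 /* non-root ranks post a single receive */
        return MPI_Recv(rbuf, rcount, MPI_INT, root, 0, comm,
                        MPI_STATUS_IGNORE);

    for (int i = 0; i < size; i++) {  /* root: one send (or local copy) per rank */
        if (i == rank) {
            memcpy(rbuf, sbuf + (size_t)i * scount, rcount * sizeof(int));
            continue;
        }
        int err = MPI_Send(sbuf + (size_t)i * scount, scount, MPI_INT,
                           i, 0, comm);
        if (MPI_SUCCESS != err) return err;
    }
    return MPI_SUCCESS;
}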
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* module does not call this they call the forced_getvalues routine instead */
int
ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[SCATTER] = coll_tuned_scatter_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_count",
"Number of scatter algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_scatter_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_scatter_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_scatter_algorithms", scatter_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm",
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_scatter_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_segmentsize",
"Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_segment_size);
coll_tuned_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_tree_fanout",
"Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_tree_fanout);
coll_tuned_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index=
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_chain_fanout",
"Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_chain_fanout);
return (MPI_SUCCESS);
}
int
ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_forced selected algorithm %d",
data->user_forced[SCATTER].algorithm));
switch (data->user_forced[SCATTER].algorithm) {
case (0):
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[SCATTER].algorithm,
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
return (MPI_ERR_ARG);
} /* switch */
}
int
ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm,
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -5,16 +5,16 @@
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
@ -25,8 +25,8 @@
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
/*
* Some static helpers.
@ -75,36 +75,36 @@ static int calculate_num_nodes_up_to_level( int fanout, int level )
*/
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_tree( int fanout,
ompi_coll_base_topo_build_tree( int fanout,
struct ompi_communicator_t* comm,
int root )
{
int rank, size, schild, sparent, shiftedrank, i;
int level; /* location of my rank in the tree structure of size */
int delta; /* number of nodes on my level */
int slimit; /* total number of nodes on levels above me */
ompi_coll_tree_t* tree;
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree Building fo %d rt %d", fanout, root));
if (fanout<1) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree invalid fanout %d", fanout));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree invalid fanout %d", fanout));
return NULL;
}
if (fanout>MAXTREEFANOUT) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
return NULL;
}
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
if (!tree) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree PANIC::out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree PANIC::out of memory"));
return NULL;
}
@ -115,8 +115,8 @@ ompi_coll_tuned_topo_build_tree( int fanout,
* Set root
*/
tree->tree_root = root;
/*
* Initialize tree
*/
tree->tree_fanout = fanout;
@ -132,11 +132,11 @@ ompi_coll_tuned_topo_build_tree( int fanout,
if( size < 2 ) {
return tree;
}
/*
* Shift all ranks by root, so that the algorithm can be
* designed as if root would be always 0
* shiftedrank should be used in calculating distances
* and position in tree
*/
shiftedrank = rank - root;
@ -158,7 +158,7 @@ ompi_coll_tuned_topo_build_tree( int fanout,
break;
}
}
/* find my parent */
slimit = calculate_num_nodes_up_to_level( fanout, level );
sparent = shiftedrank;
@ -170,12 +170,12 @@ ompi_coll_tuned_topo_build_tree( int fanout,
}
}
tree->tree_prev = (sparent+root)%size;
return tree;
}
/*
* Constructs in-order binary tree which can be used for non-commutative reduce
* operations.
* Root of this tree is always rank (size-1) and fanout is 2.
* Here are some of the examples of this tree:
@ -189,28 +189,28 @@ ompi_coll_tuned_topo_build_tree( int fanout,
* 4 0
*/
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
{
int rank, size, myrank, rightsize, delta, parent, lchild, rchild;
ompi_coll_tree_t* tree;
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
if (!tree) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:topo_build_tree PANIC::out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:topo_build_tree PANIC::out of memory"));
return NULL;
}
tree->tree_root = MPI_UNDEFINED;
tree->tree_nextsize = MPI_UNDEFINED;
/*
* Initialize tree
*/
tree->tree_fanout = 2;
@ -220,11 +220,11 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
tree->tree_nextsize = 0;
tree->tree_next[0] = -1;
tree->tree_next[1] = -1;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:topo_build_in_order_tree Building fo %d rt %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:topo_build_in_order_tree Building fo %d rt %d",
tree->tree_fanout, tree->tree_root));
/*
* Build the tree
*/
myrank = rank;
@ -240,18 +240,18 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
rchild = -1;
if (size - 1 > 0) {
lchild = parent - 1;
if (lchild > 0) {
rchild = rightsize - 1;
}
}
/* The following cases are possible: myrank can be
- a parent,
- belong to the left subtree, or
- belong to the right subtee
Each of the cases need to be handled differently.
*/
if (myrank == parent) {
/* I am the parent:
- compute real ranks of my children, and exit the loop. */
@ -262,7 +262,7 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
if (myrank > rchild) {
/* I belong to the left subtree:
- If I am the left child, compute real rank of my parent
- Iterate down through tree:
compute new size, shift ranks down, and update delta.
*/
if (myrank == lchild) {
@ -276,8 +276,8 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
} else {
/* I belong to the right subtree:
- If I am the right child, compute real rank of my parent
- Iterate down through tree:
compute new size and parent,
but the delta and rank do not need to change.
*/
if (myrank == rchild) {
@ -287,14 +287,14 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
parent = rchild;
}
}
if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
return tree;
}
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree )
{
ompi_coll_tree_t *ptr;
@ -311,7 +311,7 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
}
/*
*
* Here are some of the examples of this tree:
* size == 2 size = 4 size = 8
* 0 0 0
@ -323,16 +323,16 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
* 7
*/
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
int root )
{
int childs = 0, rank, size, mask = 1, index, remote, i;
ompi_coll_tree_t *bmtree;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree rt %d", root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree rt %d", root));
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
@ -341,7 +341,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
if (!bmtree) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
return NULL;
}
@ -372,7 +372,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
remote += root;
if( remote >= size ) remote -= size;
if (childs==MAXTREEFANOUT) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
free(bmtree);
return NULL;
}
@ -388,7 +388,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
/*
* Constructs in-order binomial tree which can be used for gather/scatter
* operations.
*
* Here are some of the examples of this tree:
* size == 2 size = 4 size = 8
* 0 0 0
@ -400,16 +400,16 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
* 7
*/
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
int root )
{
int childs = 0, rank, vrank, size, mask = 1, remote, i;
ompi_coll_tree_t *bmtree;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree rt %d", root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_in_order_bmtree rt %d", root));
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
@ -418,7 +418,7 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
if (!bmtree) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
return NULL;
}
@ -442,8 +442,8 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
bmtree->tree_next[childs] = (remote + root) % size;
childs++;
if (childs==MAXTREEFANOUT) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:topo:build_bmtree max fanout incorrect %d needed %d",
MAXTREEFANOUT, childs));
free (bmtree);
return NULL;
@ -459,36 +459,36 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_chain( int fanout,
ompi_coll_base_topo_build_chain( int fanout,
struct ompi_communicator_t* comm,
int root )
{
int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */;
ompi_coll_tree_t *chain;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain fo %d rt %d", fanout, root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain fo %d rt %d", fanout, root));
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
if( fanout < 1 ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
fanout = 1;
}
if (fanout>MAXTREEFANOUT) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
fanout = MAXTREEFANOUT;
}
/*
* Allocate space for topology arrays if needed
*/
chain = (ompi_coll_tree_t*)malloc( sizeof(ompi_coll_tree_t) );
if (!chain) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain PANIC out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain PANIC out of memory"));
fflush(stdout);
return NULL;
}
@ -496,17 +496,17 @@ ompi_coll_tuned_topo_build_chain( int fanout,
chain->tree_nextsize = -1;
for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
/*
* Set root & numchain
*/
chain->tree_root = root;
if( (size - 1) < fanout ) {
chain->tree_nextsize = size-1;
fanout = size-1;
} else {
chain->tree_nextsize = fanout;
}
/*
* Shift ranks
*/
@ -577,7 +577,7 @@ ompi_coll_tuned_topo_build_chain( int fanout,
chain->tree_nextsize = 1;
} else {
chain->tree_next[0] = -1;
chain->tree_nextsize = 0;
}
}
chain->tree_prev = (chain->tree_prev+root)%size;
@ -586,7 +586,7 @@ ompi_coll_tuned_topo_build_chain( int fanout,
}
} else {
/*
* Unshift values
*/
chain->tree_prev = -1;
chain->tree_next[0] = (root+1)%size;
@ -603,17 +603,62 @@ ompi_coll_tuned_topo_build_chain( int fanout,
return chain;
}
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
{
int i;
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo:topo_dump_tree %1d tree root %d"
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo:topo_dump_tree %1d tree root %d"
" fanout %d BM %1d nextsize %d prev %d",
rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
tree->tree_nextsize, tree->tree_prev));
if( tree->tree_nextsize ) {
for( i = 0; i < tree->tree_nextsize; i++ )
OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d", i, tree->tree_next[i]));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"[%1d] %d", i, tree->tree_next[i]));
}
return (0);
}
mca_coll_base_comm_t* ompi_coll_base_topo_construct( mca_coll_base_comm_t* data )
{
if( NULL == data ) {
data = (mca_coll_base_comm_t*)calloc(1, sizeof(mca_coll_base_comm_t));
}
return data;
}
void ompi_coll_base_topo_destruct( mca_coll_base_comm_t* data )
{
if(NULL == data) return;
#if OPAL_ENABLE_DEBUG
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
the general c_coll_selected_data */
data->mcct_reqs = NULL;
data->mcct_num_reqs = 0;
#endif
/* free any cached information that has been allocated */
if (data->cached_ntree) { /* destroy general tree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_ntree);
}
if (data->cached_bintree) { /* destroy bintree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_bintree);
}
if (data->cached_bmtree) { /* destroy bmtree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_bmtree);
}
if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bmtree);
}
if (data->cached_chain) { /* destroy general chain if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_chain);
}
if (data->cached_pipeline) { /* destroy pipeline if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_pipeline);
}
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bintree);
}
free(data);
}

View file

@ -5,19 +5,19 @@
* Copyright (c) 2004-2012 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
#define MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
#ifndef MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED
#define MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED
#include "ompi_config.h"
@ -35,29 +35,28 @@ typedef struct ompi_coll_tree_t {
} ompi_coll_tree_t;
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_tree( int fanout,
ompi_coll_base_topo_build_tree( int fanout,
struct ompi_communicator_t* com,
int root );
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm );
ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm );
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
int root );
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
int root );
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_chain( int fanout,
ompi_coll_base_topo_build_chain( int fanout,
struct ompi_communicator_t* com,
int root );
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree );
int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree );
/* debugging stuff, will be removed later */
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
END_C_DECLS
#endif /* MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED */
#endif /* MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED */
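A hedged usage sketch of the renamed helpers declared above, e.g. from inside a rooted collective. Error handling and the caching normally done through the COLL_BASE_UPDATE_* macros are omitted, and the function name is illustrative only:

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "coll_base_topo.h"

static int build_and_dump_example(struct ompi_communicator_t *comm, int root)
{
    ompi_coll_tree_t *bmtree =
        ompi_coll_base_topo_build_in_order_bmtree(comm, root);
    if (NULL == bmtree) {
        return OMPI_ERR_OUT_OF_RESOURCE;        /* allocation or fanout failure */
    }

    /* Print this rank's parent and children to the framework output */
    ompi_coll_base_topo_dump_tree(bmtree, ompi_comm_rank(comm));

    /* ... drive sends/receives from bmtree->tree_prev / tree_next[] ... */

    ompi_coll_base_topo_destroy_tree(&bmtree);  /* frees and resets to NULL */
    return OMPI_SUCCESS;
}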

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -19,17 +19,17 @@
*/
#include "ompi_config.h"
#include "coll_tuned.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned_util.h"
#include "coll_base_util.h"
int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
ompi_datatype_t* sdatatype,
int dest, int stag,
void* recvbuf, size_t rcount,
@ -91,14 +91,14 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
*status = statuses[err_index];
}
err = statuses[err_index].MPI_ERROR;
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s"
" stage of ompi_coll_tuned_sendrecv_zero\n",
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
" stage of ompi_coll_base_sendrecv_zero\n",
__FILE__, line, err, (0 == err_index ? "receive" : "send")));
} else {
/* Error discovered during the posting of the irecv or isend,
* and no status is available.
*/
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
__FILE__, line, err));
if (MPI_STATUS_IGNORE != status) {
status->MPI_ERROR = err;

View file

@ -18,8 +18,8 @@
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_UTIL_EXPORT_H
#define MCA_COLL_TUNED_UTIL_EXPORT_H
#ifndef MCA_COLL_BASE_UTIL_EXPORT_H
#define MCA_COLL_BASE_UTIL_EXPORT_H
#include "ompi_config.h"
@ -36,7 +36,7 @@ BEGIN_C_DECLS
* If one of the communications results in a zero-byte message the
* communication is ignored, and no message will cross to the peer.
*/
int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
ompi_datatype_t* sdatatype,
int dest, int stag,
void* recvbuf, size_t rcount,
@ -53,7 +53,7 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
* communications.
*/
static inline int
ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype,
ompi_coll_base_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype,
int dest, int stag,
void* recvbuf, size_t rcount, ompi_datatype_t* rdatatype,
int source, int rtag,
@ -64,13 +64,13 @@ ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdataty
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
recvbuf, (int32_t) rcount, rdatatype);
}
return ompi_coll_tuned_sendrecv_nonzero_actual (sendbuf, scount, sdatatype,
return ompi_coll_base_sendrecv_nonzero_actual (sendbuf, scount, sdatatype,
dest, stag,
recvbuf, rcount, rdatatype,
source, rtag, comm, status);
}
END_C_DECLS
#endif /* MCA_COLL_TUNED_UTIL_EXPORT_H */
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */
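A hedged sketch of how a collective might use the non-zero send/receive helper declared above to swap one (possibly empty) block with each ring neighbour; as the comment above notes, a zero count on either side simply suppresses that direction. The wrapper name and the tag/datatype parameters are illustrative assumptions:

#include "ompi_config.h"
#include "mpi.h"
#include "ompi/communicator/communicator.h"
#include "ompi/datatype/ompi_datatype.h"
#include "coll_base_util.h"

static int ring_block_exchange_example(void *sblock, size_t scount,
                                       void *rblock, size_t rcount,
                                       ompi_datatype_t *dtype, int tag,
                                       struct ompi_communicator_t *comm)
{
    const int rank  = ompi_comm_rank(comm);
    const int size  = ompi_comm_size(comm);
    const int right = (rank + 1) % size;
    const int left  = (rank + size - 1) % size;

    /* Send our block to the right neighbour, receive the left neighbour's;
       zero-sized pieces are silently skipped by the helper. */
    return ompi_coll_base_sendrecv_nonzero_actual(sblock, scount, dtype,
                                                  right, tag,
                                                  rblock, rcount, dtype,
                                                  left, tag,
                                                  comm, MPI_STATUS_IGNORE);
}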

View file

@ -470,6 +470,9 @@ struct mca_coll_base_module_2_1_0_t {
be used for the given communicator */
mca_coll_base_module_disable_1_1_0_fn_t coll_module_disable;
/** Data storage for all the algorithms defined in the base. Should
not be used by other modules */
struct mca_coll_base_comm_t* base_data;
};
typedef struct mca_coll_base_module_2_1_0_t mca_coll_base_module_2_1_0_t;
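A hedged sketch of how a base collective reaches the new base_data field, mirroring the pattern visible in the scatter code earlier in this commit; the helper name is illustrative, and the availability of COLL_BASE_UPDATE_IN_ORDER_BMTREE through coll_base_functions.h is assumed:

#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"

static ompi_coll_tree_t *
cached_bmtree_example(struct ompi_communicator_t *comm,
                      mca_coll_base_module_t *module, int root)
{
    mca_coll_base_comm_t *data = module->base_data;   /* per-communicator cache */
    COLL_BASE_UPDATE_IN_ORDER_BMTREE(comm, module, root);
    return data->cached_in_order_bmtree;              /* built lazily above */
}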