1
1

A few small changes that just expanded in the name of neatness...

(1) As pointed out by Torsten after Jeff's comment yesterday that there are 15 collectives: nope, I have 16 but
    miscounted them in my #defines (I had two #11s). Replaced them with an enum.
(2) Added a read-only MCA param reporting how many backend algorithms are available per collective (used by benchmarker/STS).
    This allowed me to remove the tuned query internal functions and replace them with ompi_coll_tuned_forced_max_algorithms[COLL].
(3) I was reading the user-forced MCA params for the collectives on each comm create (module init), but I then put the
    values into a global set of variables (like ompi_coll_tuned_reduce_forced_algorithm), so every communicator shared one copy.

    To fix this and make the code neater:
    (a) If dynamic_rules is set, the component looks up the MCA param indices on open via the
                        ompi_coll_tuned_COLLECTIVE_intra_check_forced_init () call.
    (b) Replaced the ompi_coll_tuned_COLLECTIVE_forced_algorithm/segmentsize/etc globals with a struct that
            is now cached on the module data hung off the communicator, i.e. done right.
    (c) On module init, if dynamic rules are enabled, we call a general getvalues routine (in coll_tuned_forced.c) to get the
            CURRENT values using the MCA param indices and then put them on the module's data segment.
        A shorter version of getvalues exists for barrier, which only needs the algorithm choice.

This commit was SVN r9663.
Этот коммит содержится в:
Graham Fagg 2006-04-19 23:42:06 +00:00
родитель 345551cb36
Коммит c31a5ad4b3
12 изменённых файлов: 382 добавлений и 226 удалений

Просмотреть файл

@ -22,12 +22,14 @@ sources = \
coll_tuned_util.h \ coll_tuned_util.h \
coll_tuned_dynamic_file.h \ coll_tuned_dynamic_file.h \
coll_tuned_dynamic_rules.h \ coll_tuned_dynamic_rules.h \
coll_tuned_forced.h \
coll_tuned_topo.c \ coll_tuned_topo.c \
coll_tuned_util.c \ coll_tuned_util.c \
coll_tuned_decision_fixed.c \ coll_tuned_decision_fixed.c \
coll_tuned_decision_dynamic.c \ coll_tuned_decision_dynamic.c \
coll_tuned_dynamic_file.c \ coll_tuned_dynamic_file.c \
coll_tuned_dynamic_rules.c \ coll_tuned_dynamic_rules.c \
coll_tuned_forced.c \
coll_tuned_allreduce.c \ coll_tuned_allreduce.c \
coll_tuned_alltoall.c \ coll_tuned_alltoall.c \
coll_tuned_barrier.c \ coll_tuned_barrier.c \

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -33,24 +33,12 @@
/* also need the dynamic rule structures */ /* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h" #include "coll_tuned_dynamic_rules.h"
/* need the forced user choice structures */
#include "coll_tuned_forced.h"
/* some fixed value index vars to simplify certain operations */ /* some fixed value index vars to simplify certain operations */
#define ALLGATHER 0 typedef enum COLLTYPE {ALLGATHER, ALLGATHERV, ALLREDUCE, ALLTOALL, ALLTOALLV, ALLTOALLW, BARRIER, BCAST,
#define ALLGATHERV 1 EXSCAN, GATHER, GATHERV, REDUCE, REDUCESCATTER, SCAN, SCATTER, SCATTERV, COLLCOUNT} COLLTYPE_T;
#define ALLREDUCE 2
#define ALLTOALL 3
#define ALLTOALLV 4
#define ALLTOALLW 5
#define BARRIER 6
#define BCAST 7
#define EXSCAN 8
#define GATHER 9
#define GATHERV 10
#define REDUCE 11
#define REDUCESCATTER 11
#define SCAN 12
#define SCATTER 13
#define SCATTERV 14
#define COLLCOUNT 15
/* defined arg lists to simply auto inclusion of user overriding decision functions */ /* defined arg lists to simply auto inclusion of user overriding decision functions */
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm #define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm
@ -81,6 +69,7 @@ extern "C" {
/* OMPI_COMP_EXPORT extern const mca_coll_base_component_1_0_0_t mca_coll_tuned_component; */ /* OMPI_COMP_EXPORT extern const mca_coll_base_component_1_0_0_t mca_coll_tuned_component; */
/* these are the same across all modules and are loaded at component query time */
OMPI_COMP_EXPORT extern int ompi_coll_tuned_stream; OMPI_COMP_EXPORT extern int ompi_coll_tuned_stream;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_priority; OMPI_COMP_EXPORT extern int ompi_coll_tuned_priority;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_preallocate_memory_comm_size_limit; OMPI_COMP_EXPORT extern int ompi_coll_tuned_preallocate_memory_comm_size_limit;
@ -90,27 +79,12 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_init_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_init_chain_fanout; OMPI_COMP_EXPORT extern int ompi_coll_tuned_init_chain_fanout;
/* forced algorithm choices */ /* forced algorithm choices */
OMPI_COMP_EXPORT extern int ompi_coll_tuned_allreduce_forced_choice; /* the indices to the MCA params so that modules can look them up at open / comm create time */
OMPI_COMP_EXPORT extern int ompi_coll_tuned_allreduce_forced_segsize; OMPI_COMP_EXPORT extern coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
OMPI_COMP_EXPORT extern int ompi_coll_tuned_allreduce_forced_tree_fanout; /* the actual max algorithm values (readonly), loaded at component open */
OMPI_COMP_EXPORT extern int ompi_coll_tuned_allreduce_forced_chain_fanout; OMPI_COMP_EXPORT extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
OMPI_COMP_EXPORT extern int ompi_coll_tuned_alltoall_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_alltoall_forced_segsize;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_alltoall_forced_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_alltoall_forced_chain_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_barrier_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_bcast_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_bcast_forced_segsize;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_bcast_forced_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_bcast_forced_chain_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_segsize;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
/* /*
* coll API functions * coll API functions
@ -156,9 +130,8 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS); int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS); int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS); int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int choice, int faninout, int segsize); int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_allreduce_intra_check_forced(void); int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_allreduce_intra_query(void);
int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS); int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS); int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS); int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
@ -168,9 +141,8 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS); int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS); int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS); int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int choice, int faninout, int segsize); int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_alltoall_intra_check_forced(void); int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_alltoall_intra_query (void);
int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS); int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS); int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS); int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
@ -194,14 +166,10 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS); int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS); int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS); int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int choice, int faninout, int segsize); int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_barrier_intra_check_forced(void);
int ompi_coll_tuned_barrier_intra_query (void);
int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS); int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS); int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS); int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS); int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS); int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
@ -212,9 +180,8 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int choice, int faninout, int segsize); int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_bcast_intra_check_forced(void); int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_bcast_intra_query (void);
int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains); int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize); int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
@ -246,9 +213,8 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS); int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS); int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS); int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int choice, int faninout, int segsize); int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_reduce_intra_check_forced(void); int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_reduce_intra_query (void);
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS); int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout); int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout);
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize); int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize);
@ -372,6 +338,10 @@ struct mca_coll_base_comm_t {
ompi_coll_alg_rule_t *all_base_rules; /* stored only on MCW, all other coms ref it */ ompi_coll_alg_rule_t *all_base_rules; /* stored only on MCW, all other coms ref it */
/* moving to the component */ /* moving to the component */
ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */ ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */
/* for forced algorithms we store the information on the module */
/* previously we only had one shared copy, ops, it really is per comm/module */
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
}; };
/** /**

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -135,43 +135,55 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
/* publish details of each algorithm and if its forced/fixed/locked in */ /* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */ /* as you add methods/algorithms you must update this and the query/map routines */
int ompi_coll_tuned_allreduce_intra_check_forced ( ) /* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{ {
int rc;
int max_alg = 2;
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_count",
"Number of allreduce algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm", "allreduce_algorithm",
"Which allreduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)", "Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
false, false, ompi_coll_tuned_allreduce_forced_choice, false, false, 0, NULL);
&ompi_coll_tuned_allreduce_forced_choice);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->segsize_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_segmentsize", "allreduce_algorithm_segmentsize",
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", "Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, ompi_coll_tuned_allreduce_forced_segsize, false, false, 0, NULL);
&ompi_coll_tuned_allreduce_forced_segsize);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_tree_fanout", "allreduce_algorithm_tree_fanout",
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", "Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false, false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
ompi_coll_tuned_init_tree_fanout, /* get system wide default */ NULL);
&ompi_coll_tuned_allreduce_forced_tree_fanout);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_chain_fanout", "allreduce_algorithm_chain_fanout",
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.", "Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false, false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */ ompi_coll_tuned_init_chain_fanout, /* get system wide default */
&ompi_coll_tuned_allreduce_forced_chain_fanout); NULL);
return (MPI_SUCCESS); return (MPI_SUCCESS);
} }
int ompi_coll_tuned_allreduce_intra_query ( )
{
return (2); /* 2 algorithms available */
}
int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count, int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
@ -179,15 +191,16 @@ int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
struct ompi_communicator_t *comm) struct ompi_communicator_t *comm)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d",
ompi_coll_tuned_allreduce_forced_choice)); comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm));
switch (ompi_coll_tuned_allreduce_forced_choice) { switch (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm); case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm); case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm);
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm); case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm);
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_allreduce_forced_choice, ompi_coll_tuned_allreduce_intra_query())); comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */
@ -198,18 +211,18 @@ int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
int choice, int faninout, int segsize) int algorithm, int faninout, int segsize)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
choice, faninout, segsize)); algorithm, faninout, segsize));
switch (choice) { switch (algorithm) {
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm); case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm); case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm);
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm); case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm);
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_allreduce_intra_query())); algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -440,44 +440,51 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
/* publish details of each algorithm and if its forced/fixed/locked in */ /* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */ /* as you add methods/algorithms you must update this and the query/map routines */
int ompi_coll_tuned_alltoall_intra_check_forced ( ) /* this routine is called by the component only */
{ /* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
int rc;
int max_alg = 4;
ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_count",
"Number of alltoall algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm", "alltoall_algorithm",
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.", "Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
false, false, ompi_coll_tuned_alltoall_forced_choice, false, false, 0, NULL);
&ompi_coll_tuned_alltoall_forced_choice);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_segmentsize", "alltoall_algorithm_segmentsize",
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", "Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, ompi_coll_tuned_alltoall_forced_segsize, false, false, 0, NULL);
&ompi_coll_tuned_alltoall_forced_segsize);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_tree_fanout", "alltoall_algorithm_tree_fanout",
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", "Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false, false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */ ompi_coll_tuned_init_tree_fanout, /* get system wide default */
&ompi_coll_tuned_alltoall_forced_tree_fanout); NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_chain_fanout", "alltoall_algorithm_chain_fanout",
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.", "Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false, false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */ ompi_coll_tuned_init_chain_fanout, /* get system wide default */
&ompi_coll_tuned_alltoall_forced_chain_fanout); NULL);
return (MPI_SUCCESS); return (MPI_SUCCESS);
} }
int ompi_coll_tuned_alltoall_intra_query ( )
{
return (4); /* 4 algorithms available */
}
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount, int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype, struct ompi_datatype_t *sdtype,
@ -485,9 +492,10 @@ int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm) struct ompi_communicator_t *comm)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d", ompi_coll_tuned_alltoall_forced_choice)); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm));
switch (ompi_coll_tuned_alltoall_forced_choice) { switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
@ -495,7 +503,7 @@ switch (ompi_coll_tuned_alltoall_forced_choice) {
case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_alltoall_forced_choice, ompi_coll_tuned_alltoall_intra_query())); comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */
@ -507,12 +515,12 @@ int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
void* rbuf, int rcount, void* rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
int choice, int faninout, int segsize) int algorithm, int faninout, int segsize)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
choice, faninout, segsize)); algorithm, faninout, segsize));
switch (choice) { switch (algorithm) {
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
@ -520,7 +528,7 @@ switch (choice) {
case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_alltoall_intra_query())); algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -325,32 +325,38 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
/* publish details of each algorithm and if its forced/fixed/locked in */ /* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */ /* as you add methods/algorithms you must update this and the query/map routines */
int ompi_coll_tuned_barrier_intra_check_forced ( ) /* this routine is called by the component only */
{ /* this makes sure that the mca parameters are set to their initial values and perms */
/* the module does not call this; it calls the forced_getvalues routine instead */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
int rc;
int max_alg = 5;
ompi_coll_tuned_forced_max_algorithms[BARRIER] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm_count",
"Number of barrier algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm", "barrier_algorithm",
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: step based bmtree", "Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only",
false, false, ompi_coll_tuned_barrier_forced_choice, false, false, 0, NULL);
&ompi_coll_tuned_barrier_forced_choice);
return (MPI_SUCCESS); return (MPI_SUCCESS);
} }
int ompi_coll_tuned_barrier_intra_query ( )
{
return (5); /* 4 algorithms available */
/* 2 to do */
}
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm) int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced selected algorithm %d", ompi_coll_tuned_barrier_forced_choice)); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced selected algorithm %d",
comm->c_coll_selected_data->user_forced[BARRIER].algorithm));
switch (ompi_coll_tuned_barrier_forced_choice) { switch (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm); case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm); case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm);
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm); case (2): return ompi_coll_tuned_barrier_intra_doublering (comm);
@ -360,18 +366,18 @@ switch (ompi_coll_tuned_barrier_forced_choice) {
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */ /* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_barrier_forced_choice, ompi_coll_tuned_barrier_intra_query())); comm->c_coll_selected_data->user_forced[BARRIER].algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */
} }
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, int choice, int faninout, int segsize) int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, int algorithm, int faninout, int segsize)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", choice, faninout)); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout));
switch (choice) { switch (algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm); case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm); case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm);
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm); case (2): return ompi_coll_tuned_barrier_intra_doublering (comm);
@ -381,7 +387,7 @@ switch (choice) {
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */ /* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_barrier_intra_query())); algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -773,69 +773,81 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
/* copied function (with appropriate renaming) ends here */ /* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* the module does not call this; it calls the forced_getvalues routine instead */
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
int ompi_coll_tuned_bcast_intra_check_forced ( )
{ {
int rc;
int max_alg = 6;
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, ompi_coll_tuned_forced_max_algorithms[BCAST] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_count",
"Number of bcast algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm", "bcast_algorithm",
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: BM tree.", "Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: BM tree.",
false, false, ompi_coll_tuned_bcast_forced_choice, false, false, 0, NULL);
&ompi_coll_tuned_bcast_forced_choice);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_segmentsize", "bcast_algorithm_segmentsize",
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", "Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, ompi_coll_tuned_bcast_forced_segsize, false, false, 0, NULL);
&ompi_coll_tuned_bcast_forced_segsize);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_tree_fanout", "bcast_algorithm_tree_fanout",
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", "Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false, false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */ ompi_coll_tuned_init_tree_fanout, /* get system wide default */
&ompi_coll_tuned_bcast_forced_tree_fanout); NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_chain_fanout", "bcast_algorithm_chain_fanout",
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.", "Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false, false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */ ompi_coll_tuned_init_chain_fanout, /* get system wide default */
&ompi_coll_tuned_bcast_forced_chain_fanout); NULL);
return (MPI_SUCCESS); return (MPI_SUCCESS);
} }
int ompi_coll_tuned_bcast_intra_query ( )
{
return (5); /* 5 algorithms available */
/* 1 left to implement + NEC version */
}
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count, int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
int root, int root,
struct ompi_communicator_t *comm) struct ompi_communicator_t *comm)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d", ompi_coll_tuned_bcast_forced_choice)); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
comm->c_coll_selected_data->user_forced[BCAST].algorithm));
switch (ompi_coll_tuned_bcast_forced_choice) { switch (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm); case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm); case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm);
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm, ompi_coll_tuned_bcast_forced_segsize, ompi_coll_tuned_bcast_forced_chain_fanout ); case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm,
case (3): return ompi_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm, ompi_coll_tuned_bcast_forced_segsize); comm->c_coll_selected_data->user_forced[BCAST].segsize,
case (4): return ompi_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm, ompi_coll_tuned_bcast_forced_segsize); comm->c_coll_selected_data->user_forced[BCAST].chain_fanout );
case (5): return ompi_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm, ompi_coll_tuned_bcast_forced_segsize); case (3): return ompi_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm,
comm->c_coll_selected_data->user_forced[BCAST].segsize);
case (4): return ompi_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm,
comm->c_coll_selected_data->user_forced[BCAST].segsize);
case (5): return ompi_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm,
comm->c_coll_selected_data->user_forced[BCAST].segsize);
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm, /* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
* ompi_coll_tuned_bcast_forced_segsize); */ * ompi_coll_tuned_bcast_forced_segsize); */
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_bcast_forced_choice, ompi_coll_tuned_bcast_intra_query())); comm->c_coll_selected_data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */
@ -846,13 +858,13 @@ int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
int root, int root,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
int choice, int faninout, int segsize) int algorithm, int faninout, int segsize)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
choice, faninout, segsize)); algorithm, faninout, segsize));
switch (choice) { switch (algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm); case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm); case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm);
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm, segsize, faninout ); case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm, segsize, faninout );
@ -863,7 +875,7 @@ switch (choice) {
* segsize); */ * segsize); */
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_bcast_intra_query())); algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -49,27 +49,10 @@ int ompi_coll_tuned_init_tree_fanout = 4;
int ompi_coll_tuned_init_chain_fanout = 4; int ompi_coll_tuned_init_chain_fanout = 4;
/* forced alogrithm variables */ /* forced alogrithm variables */
int ompi_coll_tuned_allreduce_forced_choice = 0; /* indices for the MCA parameters */
int ompi_coll_tuned_allreduce_forced_segsize = 0; coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
int ompi_coll_tuned_allreduce_forced_chain_fanout = 0; /* max algorithm values */
int ompi_coll_tuned_allreduce_forced_tree_fanout = 0; int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
int ompi_coll_tuned_alltoall_forced_choice = 0;
int ompi_coll_tuned_alltoall_forced_segsize = 0;
int ompi_coll_tuned_alltoall_forced_chain_fanout = 0;
int ompi_coll_tuned_alltoall_forced_tree_fanout = 0;
int ompi_coll_tuned_barrier_forced_choice = 0;
int ompi_coll_tuned_bcast_forced_choice = 0;
int ompi_coll_tuned_bcast_forced_segsize = 0;
int ompi_coll_tuned_bcast_forced_chain_fanout = 0;
int ompi_coll_tuned_bcast_forced_tree_fanout = 0;
int ompi_coll_tuned_reduce_forced_choice = 0;
int ompi_coll_tuned_reduce_forced_segsize = 0;
int ompi_coll_tuned_reduce_forced_chain_fanout = 0;
int ompi_coll_tuned_reduce_forced_tree_fanout = 0;
/* /*
@ -159,7 +142,7 @@ static int tuned_open(void)
/* by default DISABLE dynamic rules and instead use fixed [if based] rules */ /* by default DISABLE dynamic rules and instead use fixed [if based] rules */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"use_dynamic_rules", "use_dynamic_rules",
"Switch used to decide if we use static (if statements) or dynamic (built at runtime) decision function rules", "Switch used to decide if we use static (compiled/if statements) or dynamic (built at runtime) decision function rules",
false, false, ompi_coll_tuned_use_dynamic_rules, false, false, ompi_coll_tuned_use_dynamic_rules,
&ompi_coll_tuned_use_dynamic_rules); &ompi_coll_tuned_use_dynamic_rules);
@ -197,16 +180,21 @@ static int tuned_open(void)
} }
} }
/* now check that the user hasn't overrode any of the decision functions */ /* now check that the user hasn't overrode any of the decision functions if dynamic rules are enabled */
/* the user can do this before every comm dup/create if they like */ /* the user can redo this before every comm dup/create if they like */
/* this is useful for benchmarking and user knows best tuning */ /* this is useful for benchmarking and user knows best tuning */
/* as this is the component we only lookup the indices of the mca params */
/* the actual values are looked up during comm create via module init */
/* intra functions first */ /* intra functions first */
ompi_coll_tuned_allreduce_intra_check_forced(); if (ompi_coll_tuned_use_dynamic_rules) {
ompi_coll_tuned_alltoall_intra_check_forced(); ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
ompi_coll_tuned_barrier_intra_check_forced(); ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
ompi_coll_tuned_bcast_intra_check_forced(); /* ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
ompi_coll_tuned_reduce_intra_check_forced(); ompi_coll_tuned_barrier_intra_check_forced_init(&ompi_coll_tuned_forced_params[BARRIER]);
ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]);
ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]);
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_open: done!")); OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_open: done!"));

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -80,7 +80,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
} /* found a method */ } /* found a method */
} /*end if any com rules to check */ } /*end if any com rules to check */
if (ompi_coll_tuned_allreduce_forced_choice) { if (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op, comm); return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op, comm);
} }
else { else {
@ -127,7 +127,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
} /*end if any com rules to check */ } /*end if any com rules to check */
if (ompi_coll_tuned_alltoall_forced_choice) { if (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
} }
else { else {
@ -162,7 +162,7 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm)
} /* found a method */ } /* found a method */
} /*end if any com rules to check */ } /*end if any com rules to check */
if (ompi_coll_tuned_barrier_forced_choice) { if (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
return ompi_coll_tuned_barrier_intra_do_forced (comm); return ompi_coll_tuned_barrier_intra_do_forced (comm);
} }
else { else {
@ -205,7 +205,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
} /*end if any com rules to check */ } /*end if any com rules to check */
if (ompi_coll_tuned_bcast_forced_choice) { if (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root, comm); return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root, comm);
} }
else { else {
@ -249,7 +249,7 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
} /* found a method */ } /* found a method */
} /*end if any com rules to check */ } /*end if any com rules to check */
if (ompi_coll_tuned_reduce_forced_choice) { if (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype, op, root, comm); return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype, op, root, comm);
} }
else { else {

65
ompi/mca/coll/tuned/coll_tuned_forced.c Обычный файл
Просмотреть файл

@ -0,0 +1,65 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
#include "ompi/constants.h"
#include "ompi/datatype/datatype.h"
#include "ompi/communicator/communicator.h"
#include "coll_tuned.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_tuned_topo.h"
/* also need the forced algorithm structures */
#include "coll_tuned_forced.h"
#include "coll_tuned_util.h"
#include <stdlib.h>
#include <stdio.h>
/* We put all routines that handle the MCA user forced algorithm and parameter choices here */
/*
 * Refresh the user-forced settings of one collective from the MCA parameter
 * system.  Called on module create, i.e. once for each new communicator.
 *
 * mca_params    - MCA parameter indices recorded for this collective when the
 *                 component registered its parameters
 * forced_values - output: current algorithm, segment size, tree fanout and
 *                 chain fanout for this collective
 *
 * All four lookups are attempted even if an earlier one fails.  If any lookup
 * fails, every field of forced_values is reset to 0: algorithm 0 means
 * "nothing forced", so a partially populated or uninitialised set of values
 * is never acted upon.
 *
 * Returns MPI_SUCCESS if all four lookups succeed, otherwise the status of
 * the first lookup that failed.
 * NOTE(review): assumes mca_base_param_lookup_int reports success with the
 * same zero value as MPI_SUCCESS - confirm against mca_base_param.h.
 */
int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
                                      coll_tuned_force_algorithm_params_t *forced_values)
{
    int status = MPI_SUCCESS;
    int rc;

    rc = mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
    if (MPI_SUCCESS != rc && MPI_SUCCESS == status) {
        status = rc;
    }

    rc = mca_base_param_lookup_int (mca_params.segsize_param_index, &(forced_values->segsize));
    if (MPI_SUCCESS != rc && MPI_SUCCESS == status) {
        status = rc;
    }

    rc = mca_base_param_lookup_int (mca_params.tree_fanout_param_index, &(forced_values->tree_fanout));
    if (MPI_SUCCESS != rc && MPI_SUCCESS == status) {
        status = rc;
    }

    rc = mca_base_param_lookup_int (mca_params.chain_fanout_param_index, &(forced_values->chain_fanout));
    if (MPI_SUCCESS != rc && MPI_SUCCESS == status) {
        status = rc;
    }

    if (MPI_SUCCESS != status) {
        /* a failed lookup leaves its target undefined; fall back to "not forced" */
        forced_values->algorithm    = 0;
        forced_values->segsize      = 0;
        forced_values->tree_fanout  = 0;
        forced_values->chain_fanout = 0;
    }

    return (status);
}
/*
 * Barrier-only variant of ompi_coll_tuned_forced_getvalues.  Barrier registers
 * just the algorithm parameter, so only that index is looked up here.
 *
 * mca_params    - MCA parameter indices for barrier; only
 *                 algorithm_param_index is used
 * forced_values - output: algorithm receives the current parameter value; the
 *                 segsize, tree_fanout and chain_fanout fields are set to 0 so
 *                 they never hold uninitialised data
 *
 * If the lookup fails, algorithm is reset to 0 ("nothing forced").
 *
 * Returns MPI_SUCCESS if the lookup succeeds, otherwise the lookup's status.
 * NOTE(review): assumes mca_base_param_lookup_int reports success with the
 * same zero value as MPI_SUCCESS - confirm against mca_base_param.h.
 */
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
                                              coll_tuned_force_algorithm_params_t *forced_values)
{
    int rc;

    /* barrier has no segment size or fanout options */
    forced_values->segsize      = 0;
    forced_values->tree_fanout  = 0;
    forced_values->chain_fanout = 0;

    rc = mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
    if (MPI_SUCCESS != rc) {
        /* a failed lookup leaves its target undefined; fall back to "not forced" */
        forced_values->algorithm = 0;
        return (rc);
    }

    return (MPI_SUCCESS);
}

71
ompi/mca/coll/tuned/coll_tuned_forced.h Обычный файл
Просмотреть файл

@ -0,0 +1,71 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED
#define MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED
#include "ompi_config.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Indices of the "forced algorithm" MCA parameters of one collective.
 * They are obtained when the component registers the parameters (so that the
 * registered values appear in ompi_info) and are later used to look up the
 * current parameter values.
 */
typedef struct coll_tuned_force_algorithm_mca_param_indices_t {
    int algorithm_param_index;    /* index of: which algorithm to force */
    int segsize_param_index;      /* index of: segment size (if supported), 0 = no segmentation */
    int tree_fanout_param_index;  /* index of: tree fanout/in */
    int chain_fanout_param_index; /* index of: K-chain fanout/in */
} coll_tuned_force_algorithm_mca_param_indices_t;
/*
 * Actual forced values obtained from the MCA parameter system, looked up via
 * the parameter indices held by the component.
 * One instance is stored per collective type per communicator, on each tuned
 * module.
 */
typedef struct coll_tuned_force_algorithm_params_t {
    int algorithm;    /* which algorithm to force */
    int segsize;      /* segment size (if supported), 0 = no segmentation */
    int tree_fanout;  /* tree fanout/in */
    int chain_fanout; /* K-chain fanout/in */
} coll_tuned_force_algorithm_params_t;
/* prototypes */
/*
 * Look up the current values of the four forced-algorithm MCA parameters
 * (algorithm, segsize, tree fanout, chain fanout) identified by mca_params
 * and store them in *forced_values.
 * mca_params is passed by value; forced_values is the output.
 * Returns an MPI status code (MPI_SUCCESS on success).
 */
int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
coll_tuned_force_algorithm_params_t *forced_values);
/* barrier has fewer options than any other collective so it gets its own special function */
/*
 * Only the algorithm parameter is looked up (via mca_params.algorithm_param_index)
 * and stored in forced_values->algorithm.
 * Returns an MPI status code (MPI_SUCCESS on success).
 */
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
coll_tuned_force_algorithm_params_t *forced_values);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,7 +30,7 @@
#include "coll_tuned_topo.h" #include "coll_tuned_topo.h"
#include "coll_tuned_dynamic_rules.h" #include "coll_tuned_dynamic_rules.h"
#include "coll_tuned_dynamic_file.h" #include "coll_tuned_dynamic_file.h"
#include "coll_tuned_forced.h"
/* /*
* Which set are we using? * Which set are we using?
@ -332,6 +332,8 @@ ompi_coll_tuned_module_init(struct ompi_communicator_t *comm)
*/ */
/* if we within the memory/size limit, allow preallocated data */ /* if we within the memory/size limit, allow preallocated data */
if (size<=ompi_coll_tuned_preallocate_memory_comm_size_limit) { if (size<=ompi_coll_tuned_preallocate_memory_comm_size_limit) {
data = malloc(sizeof(struct mca_coll_base_comm_t) + data = malloc(sizeof(struct mca_coll_base_comm_t) +
(sizeof(ompi_request_t *) * size * 2)); (sizeof(ompi_request_t *) * size * 2));
@ -381,6 +383,17 @@ ompi_coll_tuned_module_init(struct ompi_communicator_t *comm)
} }
} }
/* next dynamic state, recheck all forced rules as well */
/* warning, we should check to make sure this is really an INTRA comm here... */
if (ompi_coll_tuned_use_dynamic_rules) {
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLREDUCE], &(data->user_forced[ALLREDUCE]));
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALL], &(data->user_forced[ALLTOALL]));
/* ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALLV], &(data->user_forced[ALLTOALLV])); */
ompi_coll_tuned_forced_getvalues_barrier (ompi_coll_tuned_forced_params[BARRIER], &(data->user_forced[BARRIER]));
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[BCAST], &(data->user_forced[BCAST]));
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[REDUCE], &(data->user_forced[REDUCE]));
}
if (&ompi_mpi_comm_world==comm) { if (&ompi_mpi_comm_world==comm) {

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -444,62 +444,70 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
/* publish details of each algorithm and if its forced/fixed/locked in */ /* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */ /* as you add methods/algorithms you must update this and the query/map routines */
int ompi_coll_tuned_reduce_intra_check_forced ( ) /* this routine is called by the component only */
{ /* this makes sure that the mca parameters are set to their initial values and perms */
/* the module does not call this; it calls the forced_getvalues routine instead */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
int rc;
int max_alg = 3;
ompi_coll_tuned_forced_max_algorithms[REDUCE] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_count",
"Number of reduce algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm", "reduce_algorithm",
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline", "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline",
false, false, ompi_coll_tuned_reduce_forced_choice, false, false, 0, NULL);
&ompi_coll_tuned_reduce_forced_choice);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_segmentsize", "reduce_algorithm_segmentsize",
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, ompi_coll_tuned_reduce_forced_segsize, false, false, 0, NULL);
&ompi_coll_tuned_reduce_forced_segsize);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_tree_fanout", "reduce_algorithm_tree_fanout",
"Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.", "Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false, false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */ ompi_coll_tuned_init_tree_fanout, /* get system wide default */
&ompi_coll_tuned_reduce_forced_tree_fanout); NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_chain_fanout", "reduce_algorithm_chain_fanout",
"Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.", "Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false, false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */ ompi_coll_tuned_init_chain_fanout, /* get system wide default */
&ompi_coll_tuned_reduce_forced_chain_fanout); NULL);
return (MPI_SUCCESS); return (MPI_SUCCESS);
} }
int ompi_coll_tuned_reduce_intra_query ( )
{
return (3); /* 3 algorithms available */
}
int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count, int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root, struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm) struct ompi_communicator_t *comm)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d", ompi_coll_tuned_reduce_forced_choice)); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
comm->c_coll_selected_data->user_forced[REDUCE].algorithm));
switch (ompi_coll_tuned_reduce_forced_choice) { switch (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm); case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm); case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm, case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
ompi_coll_tuned_reduce_forced_segsize, ompi_coll_tuned_reduce_forced_chain_fanout); comm->c_coll_selected_data->user_forced[REDUCE].segsize,
comm->c_coll_selected_data->user_forced[REDUCE].chain_fanout);
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm, case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm,
ompi_coll_tuned_reduce_forced_segsize); comm->c_coll_selected_data->user_forced[REDUCE].segsize);
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_reduce_forced_choice, ompi_coll_tuned_reduce_intra_query())); comm->c_coll_selected_data->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */
@ -510,12 +518,12 @@ int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root, struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
int choice, int faninout, int segsize) int algorithm, int faninout, int segsize)
{ {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
choice, faninout, segsize)); algorithm, faninout, segsize));
switch (choice) { switch (algorithm) {
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm); case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm); case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm, case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
@ -524,7 +532,7 @@ switch (choice) {
segsize); segsize);
default: default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_reduce_intra_query())); algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
return (MPI_ERR_ARG); return (MPI_ERR_ARG);
} /* switch */ } /* switch */