1
1

A few small changes that just expanded in the name of neatness...

(1) As pointed out by Torsten after Jeff's comment yesterday that there are 15 collectives: nope, I have 16, but I
    miscounted them in my #defines (I had two #11s). Replaced them with an enum.
(2) Added a read-only MCA param for how many backend algorithms are available per collective (used by benchmarker/STS).
    This allowed me to remove the tuned query internal functions and replace them with ompi_coll_tuned_forced_max_algorithms[COLL].
(3) I was reading the user-forced MCA params for the collectives on each comm create (module init), but I then put the
    values into a global set of variables (like ompi_coll_tuned_reduce_forced_choice), so all communicators shared one copy.

    To fix this and make the code neater:
    (a) If dynamic_rules is set, the component looks up the MCA param indices on open via the
                        ompi_coll_tuned_COLLECTIVE_intra_check_forced_init() call.
    (b) Replaced the ompi_coll_tuned_COLLECTIVE_forced_choice/segsize/etc. globals with a struct that
            is now cached on the module data hung off the communicator, i.e. done right.
    (c) On module init, if dynamic rules are enabled, we call a general getvalues routine (in coll_tuned_forced.c) to get the
            CURRENT values using the MCA param indices and then put them on the module's data segment.
        A shorter version of getvalues exists for barrier, which only needs the algorithm choice.

This commit was SVN r9663.
Этот коммит содержится в:
Graham Fagg 2006-04-19 23:42:06 +00:00
родитель 345551cb36
Коммит c31a5ad4b3
12 изменённых файлов: 382 добавлений и 226 удалений

Просмотреть файл

@ -22,12 +22,14 @@ sources = \
coll_tuned_util.h \
coll_tuned_dynamic_file.h \
coll_tuned_dynamic_rules.h \
coll_tuned_forced.h \
coll_tuned_topo.c \
coll_tuned_util.c \
coll_tuned_decision_fixed.c \
coll_tuned_decision_dynamic.c \
coll_tuned_dynamic_file.c \
coll_tuned_dynamic_rules.c \
coll_tuned_forced.c \
coll_tuned_allreduce.c \
coll_tuned_alltoall.c \
coll_tuned_barrier.c \

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -33,24 +33,12 @@
/* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h"
/* need the forced user choice structures */
#include "coll_tuned_forced.h"
/* some fixed value index vars to simplify certain operations */
#define ALLGATHER 0
#define ALLGATHERV 1
#define ALLREDUCE 2
#define ALLTOALL 3
#define ALLTOALLV 4
#define ALLTOALLW 5
#define BARRIER 6
#define BCAST 7
#define EXSCAN 8
#define GATHER 9
#define GATHERV 10
#define REDUCE 11
#define REDUCESCATTER 11
#define SCAN 12
#define SCATTER 13
#define SCATTERV 14
#define COLLCOUNT 15
typedef enum COLLTYPE {ALLGATHER, ALLGATHERV, ALLREDUCE, ALLTOALL, ALLTOALLV, ALLTOALLW, BARRIER, BCAST,
EXSCAN, GATHER, GATHERV, REDUCE, REDUCESCATTER, SCAN, SCATTER, SCATTERV, COLLCOUNT} COLLTYPE_T;
/* defined arg lists to simplify auto inclusion of user overriding decision functions */
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm
@ -81,6 +69,7 @@ extern "C" {
/* OMPI_COMP_EXPORT extern const mca_coll_base_component_1_0_0_t mca_coll_tuned_component; */
/* these are the same across all modules and are loaded at component query time */
OMPI_COMP_EXPORT extern int ompi_coll_tuned_stream;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_priority;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_preallocate_memory_comm_size_limit;
@ -90,27 +79,12 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_init_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_init_chain_fanout;
/* forced algorithm choices */
OMPI_COMP_EXPORT extern int ompi_coll_tuned_allreduce_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_allreduce_forced_segsize;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_allreduce_forced_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_allreduce_forced_chain_fanout;
/* the indices to the MCA params so that modules can look them up at open / comm create time */
OMPI_COMP_EXPORT extern coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
/* the actual max algorithm values (readonly), loaded at component open */
OMPI_COMP_EXPORT extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
OMPI_COMP_EXPORT extern int ompi_coll_tuned_alltoall_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_alltoall_forced_segsize;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_alltoall_forced_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_alltoall_forced_chain_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_barrier_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_bcast_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_bcast_forced_segsize;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_bcast_forced_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_bcast_forced_chain_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_choice;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_segsize;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_tree_fanout;
OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
/*
* coll API functions
@ -156,9 +130,8 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int choice, int faninout, int segsize);
int ompi_coll_tuned_allreduce_intra_check_forced(void);
int ompi_coll_tuned_allreduce_intra_query(void);
int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
@ -168,9 +141,8 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int choice, int faninout, int segsize);
int ompi_coll_tuned_alltoall_intra_check_forced(void);
int ompi_coll_tuned_alltoall_intra_query (void);
int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
@ -194,14 +166,10 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int choice, int faninout, int segsize);
int ompi_coll_tuned_barrier_intra_check_forced(void);
int ompi_coll_tuned_barrier_intra_query (void);
int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
@ -212,9 +180,8 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int choice, int faninout, int segsize);
int ompi_coll_tuned_bcast_intra_check_forced(void);
int ompi_coll_tuned_bcast_intra_query (void);
int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
@ -246,9 +213,8 @@ OMPI_COMP_EXPORT extern int ompi_coll_tuned_reduce_forced_chain_fanout;
int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int choice, int faninout, int segsize);
int ompi_coll_tuned_reduce_intra_check_forced(void);
int ompi_coll_tuned_reduce_intra_query (void);
int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout);
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize);
@ -372,6 +338,10 @@ struct mca_coll_base_comm_t {
ompi_coll_alg_rule_t *all_base_rules; /* stored only on MCW, all other coms ref it */
/* moving to the component */
ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */
/* for forced algorithms we store the information on the module */
/* previously we only had one shared copy, oops, it really is per comm/module */
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
};
/**

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -135,43 +135,55 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
int ompi_coll_tuned_allreduce_intra_check_forced ( )
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
int rc;
int max_alg = 2;
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_count",
"Number of allreduce algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm",
"Which allreduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
false, false, ompi_coll_tuned_allreduce_forced_choice,
&ompi_coll_tuned_allreduce_forced_choice);
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
false, false, 0, NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->segsize_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_segmentsize",
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, ompi_coll_tuned_allreduce_forced_segsize,
&ompi_coll_tuned_allreduce_forced_segsize);
false, false, 0, NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_tree_fanout",
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
&ompi_coll_tuned_allreduce_forced_tree_fanout);
false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_chain_fanout",
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
&ompi_coll_tuned_allreduce_forced_chain_fanout);
NULL);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allreduce_intra_query ( )
{
return (2); /* 2 algorithms available */
}
int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
@ -179,15 +191,16 @@ int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
struct ompi_communicator_t *comm)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d",
ompi_coll_tuned_allreduce_forced_choice));
comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm));
switch (ompi_coll_tuned_allreduce_forced_choice) {
switch (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm);
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_allreduce_forced_choice, ompi_coll_tuned_allreduce_intra_query()));
comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG);
} /* switch */
@ -198,18 +211,18 @@ int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
int choice, int faninout, int segsize)
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
choice, faninout, segsize));
algorithm, faninout, segsize));
switch (choice) {
switch (algorithm) {
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm);
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_allreduce_intra_query()));
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG);
} /* switch */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -440,44 +440,51 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
int ompi_coll_tuned_alltoall_intra_check_forced ( )
{
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
int rc;
int max_alg = 4;
ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_count",
"Number of alltoall algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm",
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
false, false, ompi_coll_tuned_alltoall_forced_choice,
&ompi_coll_tuned_alltoall_forced_choice);
false, false, 0, NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_segmentsize",
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, ompi_coll_tuned_alltoall_forced_segsize,
&ompi_coll_tuned_alltoall_forced_segsize);
false, false, 0, NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_tree_fanout",
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
&ompi_coll_tuned_alltoall_forced_tree_fanout);
NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_chain_fanout",
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
&ompi_coll_tuned_alltoall_forced_chain_fanout);
NULL);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_alltoall_intra_query ( )
{
return (4); /* 4 algorithms available */
}
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
@ -485,9 +492,10 @@ int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d", ompi_coll_tuned_alltoall_forced_choice));
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm));
switch (ompi_coll_tuned_alltoall_forced_choice) {
switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
@ -495,7 +503,7 @@ switch (ompi_coll_tuned_alltoall_forced_choice) {
case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_alltoall_forced_choice, ompi_coll_tuned_alltoall_intra_query()));
comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG);
} /* switch */
@ -507,12 +515,12 @@ int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
int choice, int faninout, int segsize)
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
choice, faninout, segsize));
algorithm, faninout, segsize));
switch (choice) {
switch (algorithm) {
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
@ -520,7 +528,7 @@ switch (choice) {
case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_alltoall_intra_query()));
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG);
} /* switch */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -325,32 +325,38 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
int ompi_coll_tuned_barrier_intra_check_forced ( )
{
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
int rc;
int max_alg = 5;
ompi_coll_tuned_forced_max_algorithms[BARRIER] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm_count",
"Number of barrier algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm",
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: step based bmtree",
false, false, ompi_coll_tuned_barrier_forced_choice,
&ompi_coll_tuned_barrier_forced_choice);
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only",
false, false, 0, NULL);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_barrier_intra_query ( )
{
return (5); /* 4 algorithms available */
/* 2 to do */
}
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced selected algorithm %d", ompi_coll_tuned_barrier_forced_choice));
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced selected algorithm %d",
comm->c_coll_selected_data->user_forced[BARRIER].algorithm));
switch (ompi_coll_tuned_barrier_forced_choice) {
switch (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm);
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm);
@ -360,18 +366,18 @@ switch (ompi_coll_tuned_barrier_forced_choice) {
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_barrier_forced_choice, ompi_coll_tuned_barrier_intra_query()));
comm->c_coll_selected_data->user_forced[BARRIER].algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, int choice, int faninout, int segsize)
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", choice, faninout));
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout));
switch (choice) {
switch (algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm);
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm);
@ -381,7 +387,7 @@ switch (choice) {
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_barrier_intra_query()));
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG);
} /* switch */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -773,69 +773,81 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
int ompi_coll_tuned_bcast_intra_check_forced ( )
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
int rc;
int max_alg = 6;
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
ompi_coll_tuned_forced_max_algorithms[BCAST] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_count",
"Number of bcast algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm",
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: BM tree.",
false, false, ompi_coll_tuned_bcast_forced_choice,
&ompi_coll_tuned_bcast_forced_choice);
false, false, 0, NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_segmentsize",
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, ompi_coll_tuned_bcast_forced_segsize,
&ompi_coll_tuned_bcast_forced_segsize);
false, false, 0, NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_tree_fanout",
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
&ompi_coll_tuned_bcast_forced_tree_fanout);
NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_chain_fanout",
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
&ompi_coll_tuned_bcast_forced_chain_fanout);
NULL);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_bcast_intra_query ( )
{
return (5); /* 5 algorithms available */
/* 1 left to implement + NEC version */
}
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d", ompi_coll_tuned_bcast_forced_choice));
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
comm->c_coll_selected_data->user_forced[BCAST].algorithm));
switch (ompi_coll_tuned_bcast_forced_choice) {
switch (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm);
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm, ompi_coll_tuned_bcast_forced_segsize, ompi_coll_tuned_bcast_forced_chain_fanout );
case (3): return ompi_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm, ompi_coll_tuned_bcast_forced_segsize);
case (4): return ompi_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm, ompi_coll_tuned_bcast_forced_segsize);
case (5): return ompi_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm, ompi_coll_tuned_bcast_forced_segsize);
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm,
comm->c_coll_selected_data->user_forced[BCAST].segsize,
comm->c_coll_selected_data->user_forced[BCAST].chain_fanout );
case (3): return ompi_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm,
comm->c_coll_selected_data->user_forced[BCAST].segsize);
case (4): return ompi_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm,
comm->c_coll_selected_data->user_forced[BCAST].segsize);
case (5): return ompi_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm,
comm->c_coll_selected_data->user_forced[BCAST].segsize);
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
* ompi_coll_tuned_bcast_forced_segsize); */
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_bcast_forced_choice, ompi_coll_tuned_bcast_intra_query()));
comm->c_coll_selected_data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
return (MPI_ERR_ARG);
} /* switch */
@ -846,13 +858,13 @@ int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
int choice, int faninout, int segsize)
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
choice, faninout, segsize));
algorithm, faninout, segsize));
switch (choice) {
switch (algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm);
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm, segsize, faninout );
@ -863,7 +875,7 @@ switch (choice) {
* segsize); */
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_bcast_intra_query()));
algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
return (MPI_ERR_ARG);
} /* switch */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -49,27 +49,10 @@ int ompi_coll_tuned_init_tree_fanout = 4;
int ompi_coll_tuned_init_chain_fanout = 4;
/* forced algorithm variables */
int ompi_coll_tuned_allreduce_forced_choice = 0;
int ompi_coll_tuned_allreduce_forced_segsize = 0;
int ompi_coll_tuned_allreduce_forced_chain_fanout = 0;
int ompi_coll_tuned_allreduce_forced_tree_fanout = 0;
int ompi_coll_tuned_alltoall_forced_choice = 0;
int ompi_coll_tuned_alltoall_forced_segsize = 0;
int ompi_coll_tuned_alltoall_forced_chain_fanout = 0;
int ompi_coll_tuned_alltoall_forced_tree_fanout = 0;
int ompi_coll_tuned_barrier_forced_choice = 0;
int ompi_coll_tuned_bcast_forced_choice = 0;
int ompi_coll_tuned_bcast_forced_segsize = 0;
int ompi_coll_tuned_bcast_forced_chain_fanout = 0;
int ompi_coll_tuned_bcast_forced_tree_fanout = 0;
int ompi_coll_tuned_reduce_forced_choice = 0;
int ompi_coll_tuned_reduce_forced_segsize = 0;
int ompi_coll_tuned_reduce_forced_chain_fanout = 0;
int ompi_coll_tuned_reduce_forced_tree_fanout = 0;
/* indices for the MCA parameters */
coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
/* max algorithm values */
int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
/*
@ -159,7 +142,7 @@ static int tuned_open(void)
/* by default DISABLE dynamic rules and instead use fixed [if based] rules */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"use_dynamic_rules",
"Switch used to decide if we use static (if statements) or dynamic (built at runtime) decision function rules",
"Switch used to decide if we use static (compiled/if statements) or dynamic (built at runtime) decision function rules",
false, false, ompi_coll_tuned_use_dynamic_rules,
&ompi_coll_tuned_use_dynamic_rules);
@ -197,16 +180,21 @@ static int tuned_open(void)
}
}
/* now check that the user hasn't overrode any of the decision functions */
/* the user can do this before every comm dup/create if they like */
/* now check that the user hasn't overridden any of the decision functions if dynamic rules are enabled */
/* the user can redo this before every comm dup/create if they like */
/* this is useful for benchmarking and user knows best tuning */
/* as this is the component we only lookup the indices of the mca params */
/* the actual values are looked up during comm create via module init */
/* intra functions first */
ompi_coll_tuned_allreduce_intra_check_forced();
ompi_coll_tuned_alltoall_intra_check_forced();
ompi_coll_tuned_barrier_intra_check_forced();
ompi_coll_tuned_bcast_intra_check_forced();
ompi_coll_tuned_reduce_intra_check_forced();
if (ompi_coll_tuned_use_dynamic_rules) {
ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
/* ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
ompi_coll_tuned_barrier_intra_check_forced_init(&ompi_coll_tuned_forced_params[BARRIER]);
ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]);
ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]);
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_open: done!"));

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -80,7 +80,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
} /* found a method */
} /*end if any com rules to check */
if (ompi_coll_tuned_allreduce_forced_choice) {
if (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op, comm);
}
else {
@ -127,7 +127,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
} /*end if any com rules to check */
if (ompi_coll_tuned_alltoall_forced_choice) {
if (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
}
else {
@ -162,7 +162,7 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm)
} /* found a method */
} /*end if any com rules to check */
if (ompi_coll_tuned_barrier_forced_choice) {
if (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
return ompi_coll_tuned_barrier_intra_do_forced (comm);
}
else {
@ -205,7 +205,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
} /*end if any com rules to check */
if (ompi_coll_tuned_bcast_forced_choice) {
if (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root, comm);
}
else {
@ -249,7 +249,7 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
} /* found a method */
} /*end if any com rules to check */
if (ompi_coll_tuned_reduce_forced_choice) {
if (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype, op, root, comm);
}
else {

65
ompi/mca/coll/tuned/coll_tuned_forced.c Обычный файл
Просмотреть файл

@ -0,0 +1,65 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
#include "ompi/constants.h"
#include "ompi/datatype/datatype.h"
#include "ompi/communicator/communicator.h"
#include "coll_tuned.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_tuned_topo.h"
/* also need the dynamic rule structures */
#include "coll_tuned_forced.h"
#include "coll_tuned_util.h"
#include <stdlib.h>
#include <stdio.h>
/* We put all routines that handle the MCA user forced algorithm and parameter choices here */
/*
 * Re-read the user forced settings of one collective from the MCA parameter
 * system. Called on module create (i.e. for each new comm).
 *
 * mca_params    - MCA parameter indices of the collective (passed by value)
 * forced_values - receives algorithm, segsize, tree_fanout and chain_fanout
 *
 * Every output field is preset to 0 before its lookup so that a failed lookup
 * cannot leave an indeterminate value in the structure; an algorithm value of
 * 0 means "ignore", i.e. nothing is forced.
 *
 * Returns MPI_SUCCESS when all four lookups succeed, otherwise the status of
 * the first lookup that failed. All four lookups are always attempted.
 *
 * NOTE(review): assumes mca_base_param_lookup_int() returns a non-zero status
 * on failure (success == 0 == MPI_SUCCESS) and does not write to the
 * destination in that case -- confirm against the MCA param API.
 */
int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
                                      coll_tuned_force_algorithm_params_t *forced_values)
{
    int rc;
    int first_error = MPI_SUCCESS;

    /* safe defaults in case any of the lookups below fails */
    forced_values->algorithm    = 0;
    forced_values->segsize      = 0;
    forced_values->tree_fanout  = 0;
    forced_values->chain_fanout = 0;

    rc = mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
    if ((MPI_SUCCESS != rc) && (MPI_SUCCESS == first_error)) {
        first_error = rc;
    }

    rc = mca_base_param_lookup_int (mca_params.segsize_param_index, &(forced_values->segsize));
    if ((MPI_SUCCESS != rc) && (MPI_SUCCESS == first_error)) {
        first_error = rc;
    }

    rc = mca_base_param_lookup_int (mca_params.tree_fanout_param_index, &(forced_values->tree_fanout));
    if ((MPI_SUCCESS != rc) && (MPI_SUCCESS == first_error)) {
        first_error = rc;
    }

    rc = mca_base_param_lookup_int (mca_params.chain_fanout_param_index, &(forced_values->chain_fanout));
    if ((MPI_SUCCESS != rc) && (MPI_SUCCESS == first_error)) {
        first_error = rc;
    }

    return (first_error);
}
/*
 * Barrier-only variant of the forced-value refresh: barrier has a single
 * option available (at the moment), so only the algorithm choice is looked up.
 *
 * mca_params    - MCA parameter indices of the collective (passed by value);
 *                 only algorithm_param_index is read
 * forced_values - receives the algorithm choice
 *
 * All fields of forced_values are preset to 0 so that neither a failed lookup
 * nor the options that are not looked up leave indeterminate values in the
 * structure; an algorithm value of 0 means "ignore", i.e. nothing is forced.
 *
 * Returns MPI_SUCCESS when the lookup succeeds, otherwise the status reported
 * by the lookup.
 *
 * NOTE(review): assumes mca_base_param_lookup_int() returns a non-zero status
 * on failure (success == 0 == MPI_SUCCESS) and does not write to the
 * destination in that case -- confirm against the MCA param API.
 */
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
                                              coll_tuned_force_algorithm_params_t *forced_values)
{
    int rc;

    /* safe defaults: nothing forced, no segmentation, no fanout */
    forced_values->algorithm    = 0;
    forced_values->segsize      = 0;
    forced_values->tree_fanout  = 0;
    forced_values->chain_fanout = 0;

    rc = mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
    if (MPI_SUCCESS != rc) {
        return (rc);
    }

    return (MPI_SUCCESS);
}

71
ompi/mca/coll/tuned/coll_tuned_forced.h Обычный файл
Просмотреть файл

@ -0,0 +1,71 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED
#define MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED
#include "ompi_config.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Indices of the MCA parameters that hold the forced algorithm settings of
 * one collective. They are obtained when the component registers the
 * parameters (so that the registered values appear in ompi_info); the actual
 * values are looked up later through these indices.
 */
typedef struct coll_tuned_force_algorithm_mca_param_indices_t {
    int algorithm_param_index;    /* index of: which algorithm you want to force */
    int segsize_param_index;      /* index of: segsize to use (if supported), 0 = no segmentation */
    int tree_fanout_param_index;  /* index of: tree fanout/in to use */
    int chain_fanout_param_index; /* index of: K-chain fanout/in to use */
} coll_tuned_force_algorithm_mca_param_indices_t;
/*
 * Actual forced values obtained from the MCA for a tuned module, looked up
 * via the MCA param indices held by the component.
 * One instance is stored per collective type per communicator.
 */
typedef struct coll_tuned_force_algorithm_params_t {
    int algorithm;    /* which algorithm you want to force */
    int segsize;      /* segsize to use (if supported), 0 = no segmentation */
    int tree_fanout;  /* tree fanout/in to use */
    int chain_fanout; /* K-chain fanout/in to use */
} coll_tuned_force_algorithm_params_t;
/* prototypes */
/*
 * Look up the current algorithm, segsize, tree fanout and chain fanout values
 * through the given MCA parameter indices and store them in *forced_values.
 * mca_params is passed by value. Returns MPI_SUCCESS on success.
 */
int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
coll_tuned_force_algorithm_params_t *forced_values);
/*
 * Barrier has fewer options than any other collective so it gets its own
 * special function: only the algorithm choice is looked up and stored in
 * forced_values->algorithm. Returns MPI_SUCCESS on success.
 */
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
coll_tuned_force_algorithm_params_t *forced_values);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* MCA_COLL_TUNED_FORCED_H_HAS_BEEN_INCLUDED */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,7 +30,7 @@
#include "coll_tuned_topo.h"
#include "coll_tuned_dynamic_rules.h"
#include "coll_tuned_dynamic_file.h"
#include "coll_tuned_forced.h"
/*
* Which set are we using?
@ -332,6 +332,8 @@ ompi_coll_tuned_module_init(struct ompi_communicator_t *comm)
*/
/* if we are within the memory/size limit, allow preallocated data */
if (size<=ompi_coll_tuned_preallocate_memory_comm_size_limit) {
data = malloc(sizeof(struct mca_coll_base_comm_t) +
(sizeof(ompi_request_t *) * size * 2));
@ -381,6 +383,17 @@ ompi_coll_tuned_module_init(struct ompi_communicator_t *comm)
}
}
/* next dynamic state, recheck all forced rules as well */
/* warning, we should check to make sure this is really an INTRA comm here... */
if (ompi_coll_tuned_use_dynamic_rules) {
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLREDUCE], &(data->user_forced[ALLREDUCE]));
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALL], &(data->user_forced[ALLTOALL]));
/* ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALLV], &(data->user_forced[ALLTOALLV])); */
ompi_coll_tuned_forced_getvalues_barrier (ompi_coll_tuned_forced_params[BARRIER], &(data->user_forced[BARRIER]));
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[BCAST], &(data->user_forced[BCAST]));
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[REDUCE], &(data->user_forced[REDUCE]));
}
if (&ompi_mpi_comm_world==comm) {

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -444,62 +444,70 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
int ompi_coll_tuned_reduce_intra_check_forced ( )
{
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
int rc;
int max_alg = 3;
ompi_coll_tuned_forced_max_algorithms[REDUCE] = max_alg;
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_count",
"Number of reduce algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm",
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline",
false, false, ompi_coll_tuned_reduce_forced_choice,
&ompi_coll_tuned_reduce_forced_choice);
false, false, 0, NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_segmentsize",
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, ompi_coll_tuned_reduce_forced_segsize,
&ompi_coll_tuned_reduce_forced_segsize);
false, false, 0, NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_tree_fanout",
"Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
&ompi_coll_tuned_reduce_forced_tree_fanout);
NULL);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_chain_fanout",
"Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
&ompi_coll_tuned_reduce_forced_chain_fanout);
NULL);
return (MPI_SUCCESS);
}
/* Report how many reduce algorithms are currently available. */
int ompi_coll_tuned_reduce_intra_query ( )
{
    /* 3 algorithms available */
    enum { REDUCE_ALGORITHMS_AVAILABLE = 3 };

    return REDUCE_ALGORITHMS_AVAILABLE;
}
int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d", ompi_coll_tuned_reduce_forced_choice));
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
comm->c_coll_selected_data->user_forced[REDUCE].algorithm));
switch (ompi_coll_tuned_reduce_forced_choice) {
switch (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
ompi_coll_tuned_reduce_forced_segsize, ompi_coll_tuned_reduce_forced_chain_fanout);
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
comm->c_coll_selected_data->user_forced[REDUCE].segsize,
comm->c_coll_selected_data->user_forced[REDUCE].chain_fanout);
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm,
ompi_coll_tuned_reduce_forced_segsize);
comm->c_coll_selected_data->user_forced[REDUCE].segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
ompi_coll_tuned_reduce_forced_choice, ompi_coll_tuned_reduce_intra_query()));
comm->c_coll_selected_data->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
return (MPI_ERR_ARG);
} /* switch */
@ -510,12 +518,12 @@ int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm,
int choice, int faninout, int segsize)
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
choice, faninout, segsize));
algorithm, faninout, segsize));
switch (choice) {
switch (algorithm) {
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
@ -524,7 +532,7 @@ switch (choice) {
segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
choice, ompi_coll_tuned_reduce_intra_query()));
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
return (MPI_ERR_ARG);
} /* switch */