 - consistent argument checking (do not allow selecting an algorithm that
   is not available).
 - consistent way of computing the segcount (number of datatypes per segment).
 - small cleanups.
 - more informative debugging messages.

This commit was SVN r12545.
This commit is contained in:
George Bosilca 2006-11-10 19:54:09 +00:00
parent 938e7cd8d9
commit 476b922074
7 changed files with 178 additions and 172 deletions


@ -397,4 +397,20 @@ do {
} \
} while (0)
/**
 * This macro gives a generic way to compute the best count of
 * the segment (i.e. the number of complete datatypes that
 * can fit in the specified SEGSIZE). Beware, when this macro
 * is called, the SEGCOUNT should be initialized to the count as
 * expected by the collective call.
 */
#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT)       \
    if( (SEGSIZE) >= (TYPELNG) ) {                                     \
        size_t residual;                                               \
        (SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG));                     \
        residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG);                 \
        if( residual > ((TYPELNG) >> 1) )                              \
            (SEGCOUNT)++;                                              \
    }
#endif /* MCA_COLL_TUNED_EXPORT_H */
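
As a quick illustration (not part of the diff): a minimal sketch of how the new
macro is meant to be used. The helper name example_segcount() and the sample
numbers are assumptions for illustration only.

/* Illustration only: compute how many complete datatypes fit in one segment.
 * The count must be pre-initialized to the value expected by the collective,
 * so it is returned unchanged when segsize < typelng. */
static int example_segcount( size_t segsize, size_t typelng, int full_count )
{
    int segcount = full_count;
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
    return segcount;  /* e.g. segsize=1000, typelng=300 -> 3 (residual 100 < 150) */
}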


@ -150,35 +150,39 @@ int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorith
"Number of allreduce algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index
= mca_base_param_reg_int( &mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm",
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
false, false, 0, NULL);
if( mca_param_indices->algorithm_param_index > max_alg ) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Allreduce algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
mca_param_indices->algorithm_param_index, max_alg );
}
mca_param_indices->algorithm_param_index = 0;
}
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm",
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
false, false, 0, NULL);
mca_param_indices->segsize_param_index
= mca_base_param_reg_int( &mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_segmentsize",
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, 0, NULL);
mca_param_indices->tree_fanout_param_index
= mca_base_param_reg_int( &mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_tree_fanout",
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
NULL);
mca_param_indices->segsize_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_segmentsize",
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, 0, NULL);
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_tree_fanout",
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
NULL);
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_chain_fanout",
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
NULL);
mca_param_indices->chain_fanout_param_index
= mca_base_param_reg_int( &mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_chain_fanout",
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
NULL);
return (MPI_SUCCESS);
}
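
For context (illustration only, not code from this commit): the commit message's
"consistent argument checking" pairs each forced-algorithm parameter with a range
check in the spirit of the one shown above, falling back to 0 (ignore) when the
requested algorithm does not exist. The helper name check_forced_algorithm() and
the use of mca_base_param_lookup_int() below are assumptions for illustration.

/* Illustrative sketch: validate a user-forced algorithm id against [0..max_alg]. */
static int check_forced_algorithm( int param_index, int max_alg, const char* coll_name )
{
    int requested = 0;
    mca_base_param_lookup_int( param_index, &requested );
    if( (requested < 0) || (requested > max_alg) ) {
        if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
            opal_output( 0, "%s algorithm #%d is not available (range [0..%d]). "
                         "Switching back to ignore(0)\n", coll_name, requested, max_alg );
        }
        requested = 0;
    }
    return requested;
}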


@ -448,30 +448,40 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm
"Number of alltoall algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm",
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
false, false, 0, NULL);
if( mca_param_indices->algorithm_param_index > max_alg ) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Alltoall algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
mca_param_indices->algorithm_param_index, max_alg );
}
mca_param_indices->algorithm_param_index = 0;
}
mca_param_indices->segsize_param_index
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_segmentsize",
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, 0, NULL);
mca_param_indices->tree_fanout_param_index
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_tree_fanout",
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
NULL);
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm",
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
false, false, 0, NULL);
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_segmentsize",
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
false, false, 0, NULL);
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_tree_fanout",
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
false, false,
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
NULL);
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_chain_fanout",
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
NULL);
mca_param_indices->chain_fanout_param_index
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_chain_fanout",
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
false, false,
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
NULL);
return (MPI_SUCCESS);
}


@ -233,7 +233,6 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
return (err);
}
int
ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
int count,
@ -242,24 +241,19 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
struct ompi_communicator_t* comm,
uint32_t segsize )
{
int segcount;
int segcount = count;
size_t typelng;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d",
ompi_comm_rank(comm), segsize));
COLL_TUNED_UPDATE_BINTREE( comm, root );
/**
* Determine number of segments and number of elements
* sent per operation
* Determine number of elements sent per operation.
*/
ompi_ddt_type_size( datatype, &typelng );
if( segsize > typelng ) {
segcount = (int)(segsize / typelng);
} else {
segcount = count;
}
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d typelng %ld segcount %d",
ompi_comm_rank(comm), segsize, typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm,
segcount, comm->c_coll_selected_data->cached_bintree );
@ -276,21 +270,16 @@ ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
int segcount;
size_t typelng;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d",
ompi_comm_rank(comm), segsize));
COLL_TUNED_UPDATE_PIPELINE( comm, root );
/**
* Determine number of segments and number of elements
* sent per operation
* Determine number of elements sent per operation.
*/
ompi_ddt_type_size( datatype, &typelng );
if( segsize > typelng ) {
segcount = (int)(segsize / typelng);
} else {
segcount = count;
}
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d typelng %ld segcount %d",
ompi_comm_rank(comm), segsize, typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm,
segcount, comm->c_coll_selected_data->cached_pipeline );
@ -307,19 +296,16 @@ ompi_coll_tuned_bcast_intra_chain( void* buffer,
int segcount;
size_t typelng;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), chains, segsize));
COLL_TUNED_UPDATE_CHAIN( comm, root, chains );
/**
* Determine number of segments and number of elements
* sent per operation
* Determine number of elements sent per operation.
*/
ompi_ddt_type_size( datatype, &typelng );
if( segsize > typelng ) {
segcount = (int)(segsize / typelng);
} else {
segcount = count;
}
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d typelng %ld segcount %d",
ompi_comm_rank(comm), chains, segsize, typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm,
segcount, comm->c_coll_selected_data->cached_chain );
@ -336,19 +322,16 @@ ompi_coll_tuned_bcast_intra_binomial( void* buffer,
int segcount;
size_t typelng;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d", ompi_comm_rank(comm), segsize));
COLL_TUNED_UPDATE_BMTREE( comm, root );
/**
* Determine number of segments and number of elements
* sent per operation
* Determine number of elements sent per operation.
*/
ompi_ddt_type_size( datatype, &typelng );
if( segsize > typelng ) {
segcount = (int)(segsize / typelng);
} else {
segcount = count;
}
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %ld segcount %d",
ompi_comm_rank(comm), segsize, typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm,
segcount, comm->c_coll_selected_data->cached_bmtree );
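
For comparison (illustration only, not part of the diff): the segment-count
computation that the bcast variants above previously inlined, which this commit
replaces with COLL_TUNED_COMPUTED_SEGCOUNT. The helper name old_segcount() is
hypothetical; note that the macro additionally triggers on segsize >= typelng and
rounds the count up when the residual exceeds half a datatype.

/* Illustration only: the removed per-function computation. */
static int old_segcount( size_t segsize, size_t typelng, int count )
{
    if( segsize > typelng ) {
        return (int)(segsize / typelng);
    }
    return count;
}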
@ -714,6 +697,13 @@ int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mc
"bcast_algorithm",
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.",
false, false, 0, NULL);
if( mca_param_indices->algorithm_param_index > max_alg ) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Broadcast algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
mca_param_indices->algorithm_param_index, max_alg );
}
mca_param_indices->algorithm_param_index = 0;
}
mca_param_indices->segsize_param_index
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,


@ -120,55 +120,6 @@ mca_coll_tuned_component_t mca_coll_tuned_component = {
static int tuned_open(void)
{
/* ompi_coll_tuned_component_t *ct = &ompi_coll_tuned_component; */
/* Use a low priority, but allow other components to be lower */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"priority",
"Priority of the tuned coll component",
false, false, ompi_coll_tuned_priority,
&ompi_coll_tuned_priority);
/* parameter for pre-allocated memory requests etc */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"pre_allocate_memory_comm_size_limit",
"Size of communicator were we stop pre-allocating memory for the fixed internal buffer used for message requests etc that is hung off the communicator data segment. I.e. if you have a 100'000 nodes you might not want to pre-allocate 200'000 request handle slots per communicator instance!",
false, false, ompi_coll_tuned_preallocate_memory_comm_size_limit,
&ompi_coll_tuned_preallocate_memory_comm_size_limit);
/* by default DISABLE dynamic rules and instead use fixed [if based] rules */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"use_dynamic_rules",
"Switch used to decide if we use static (compiled/if statements) or dynamic (built at runtime) decision function rules",
false, false, ompi_coll_tuned_use_dynamic_rules,
&ompi_coll_tuned_use_dynamic_rules);
/* if dynamic rules allowed then look up dynamic rules config filename, else we leave it an empty filename (NULL) */
if (ompi_coll_tuned_use_dynamic_rules) {
/* char *default_name; */
/* asprintf(&default_name, "~/.openmpi/openmpi-coll-tuned-params.conf"); */
mca_base_param_reg_string(&mca_coll_tuned_component.super.collm_version,
"dynamic_rules_filename",
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
false, false, ompi_coll_tuned_dynamic_rules_filename,
&ompi_coll_tuned_dynamic_rules_filename);
}
/* some initial guesses at topology parameters */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"init_tree_fanout",
"Inital fanout used in the tree topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
false, false, ompi_coll_tuned_init_tree_fanout,
&ompi_coll_tuned_init_tree_fanout);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"init_chain_fanout",
"Inital fanout used in the chain (fanout followed by pipeline) topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
false, false, ompi_coll_tuned_init_chain_fanout,
&ompi_coll_tuned_init_chain_fanout);
#if OMPI_ENABLE_DEBUG
{
int param;
@ -184,6 +135,33 @@ static int tuned_open(void)
}
#endif /* OMPI_ENABLE_DEBUG */
/* Use a low priority, but allow other components to be lower */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"priority",
"Priority of the tuned coll component",
false, false, ompi_coll_tuned_priority,
&ompi_coll_tuned_priority);
/* parameter for pre-allocated memory requests etc */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"pre_allocate_memory_comm_size_limit",
"Size of communicator were we stop pre-allocating memory for the fixed internal buffer used for message requests etc that is hung off the communicator data segment. I.e. if you have a 100'000 nodes you might not want to pre-allocate 200'000 request handle slots per communicator instance!",
false, false, ompi_coll_tuned_preallocate_memory_comm_size_limit,
&ompi_coll_tuned_preallocate_memory_comm_size_limit);
/* some initial guesses at topology parameters */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"init_tree_fanout",
"Inital fanout used in the tree topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
false, false, ompi_coll_tuned_init_tree_fanout,
&ompi_coll_tuned_init_tree_fanout);
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"init_chain_fanout",
"Inital fanout used in the chain (fanout followed by pipeline) topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
false, false, ompi_coll_tuned_init_chain_fanout,
&ompi_coll_tuned_init_chain_fanout);
/* now check that the user hasn't overrode any of the decision functions if dynamic rules are enabled */
/* the user can redo this before every comm dup/create if they like */
/* this is useful for benchmarking and user knows best tuning */
@ -191,10 +169,23 @@ static int tuned_open(void)
/* the actual values are looked up during comm create via module init */
/* intra functions first */
/* if dynamic rules allowed then look up dynamic rules config filename, else we leave it an empty filename (NULL) */
/* by default DISABLE dynamic rules and instead use fixed [if based] rules */
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"use_dynamic_rules",
"Switch used to decide if we use static (compiled/if statements) or dynamic (built at runtime) decision function rules",
false, false, ompi_coll_tuned_use_dynamic_rules,
&ompi_coll_tuned_use_dynamic_rules);
if (ompi_coll_tuned_use_dynamic_rules) {
mca_base_param_reg_string(&mca_coll_tuned_component.super.collm_version,
"dynamic_rules_filename",
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
false, false, ompi_coll_tuned_dynamic_rules_filename,
&ompi_coll_tuned_dynamic_rules_filename);
ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
/* ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
/*ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
ompi_coll_tuned_barrier_intra_check_forced_init(&ompi_coll_tuned_forced_params[BARRIER]);
ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]);
ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]);


@ -61,11 +61,9 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm)
{
int communicator_size, rank, err;
int communicator_size, rank;
size_t dsize, total_dsize;
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed"));
communicator_size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
@ -75,14 +73,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
}
/* else we need data size for decision function */
err = ompi_ddt_type_size(sdtype, &dsize);
if (err != MPI_SUCCESS) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
return (err);
}
ompi_ddt_type_size(sdtype, &dsize);
total_dsize = dsize * scount * communicator_size; /* needed for decision */
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed rank %d com_size %d msg_length %ld",
rank, communicator_size, total_dsize));
if (communicator_size >= 12 && total_dsize <= 768) {
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
}
@ -102,11 +98,10 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
*/
int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm)
{
int communicator_size;
int communicator_size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed"));
communicator_size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed com_size %d",
communicator_size));
if( 2 == communicator_size )
return ompi_coll_tuned_barrier_intra_two_procs(comm);
@ -148,24 +143,21 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
const double b1 = 11.2445;
const double a2 = 0.0023;
const double b2 = 3.8074;
int communicator_size, rank, err;
int communicator_size, rank;
int segsize = 0;
size_t message_size, dsize;
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_dec_fixed"));
communicator_size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* else we need data size for decision function */
err = ompi_ddt_type_size(datatype, &dsize);
if (err != MPI_SUCCESS) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
return (err);
}
ompi_ddt_type_size(datatype, &dsize);
message_size = dsize * (unsigned long)count; /* needed for decision */
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_dec_fixed "
"root %d rank %d com_size %d msg_length %ld",
root, rank, communicator_size, message_size));
if ((message_size <= 1024) && (communicator_size < 12)) {
/* Linear_0K */
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
@ -245,7 +237,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
struct ompi_op_t* op, int root,
struct ompi_communicator_t* comm)
{
int communicator_size, rank, err, segsize = 0;
int communicator_size, rank, segsize = 0;
size_t message_size, dsize;
const double a1 = 0.6016 / 1024.0; /* [1/B] */
const double b1 = 1.3496;
@ -256,8 +248,6 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
const double a4 = 0.0033 / 1024.0; /* [1/B] */
const double b4 = 1.6761;
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"));
/**
* If the operation is non commutative we only have one reduce algorithm right now.
*/
@ -269,14 +259,13 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
rank = ompi_comm_rank(comm);
/* need data size for decision function */
err = ompi_ddt_type_size(datatype, &dsize);
if (err != MPI_SUCCESS) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
return (err);
}
ompi_ddt_type_size(datatype, &dsize);
message_size = dsize * count; /* needed for decision */
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"
"root %d rank %d com_size %d msg_length %ld",
root, rank, communicator_size, message_size));
if (((communicator_size < 20) && (message_size < 512)) ||
((communicator_size < 10) && (message_size <= 1024))){
/* Linear_0K */


@ -468,12 +468,18 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
"Number of reduce algorithms available",
false, true, max_alg, NULL);
mca_param_indices->algorithm_param_index
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm",
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial",
false, false, 0, NULL);
if( mca_param_indices->algorithm_param_index > max_alg ) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Reduce algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
mca_param_indices->algorithm_param_index, max_alg );
}
mca_param_indices->algorithm_param_index = 0;
}
mca_param_indices->segsize_param_index
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,