Updates & upgrades:
- Consistent argument checking (an algorithm that is not available can no longer be selected). - A consistent way of computing the segcount (the number of datatype elements per segment). - Small cleanups. - More informative debugging messages. This commit was SVN r12545.
Этот коммит содержится в:
родитель
938e7cd8d9
Коммит
476b922074
@ -397,4 +397,20 @@ do {
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/**
 * Generic way to compute the best per-segment element count, i.e. the
 * number of complete datatypes that fit in the specified SEGSIZE, rounded
 * up by one when the leftover bytes exceed half of a datatype.
 *
 * @param SEGSIZE   requested segment size, in bytes.
 * @param TYPELNG   size of one datatype element, in bytes.
 * @param SEGCOUNT  int lvalue (in/out). Beware: it must be initialized to
 *                  the count expected by the collective call before the
 *                  macro is invoked. It is left untouched when SEGSIZE is
 *                  smaller than TYPELNG, or when TYPELNG is zero.
 *
 * SEGSIZE and TYPELNG are evaluated exactly once. The expansion is a single
 * statement (do { } while (0)), so it is safe inside an unbraced if/else;
 * the caller supplies the trailing semicolon.
 */
#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT)              \
do {                                                                          \
    const size_t segsize_bytes_ = (size_t)(SEGSIZE);                          \
    const size_t typelng_bytes_ = (size_t)(TYPELNG);                          \
    /* A zero-sized datatype would divide by zero: keep the caller's count */ \
    if( (typelng_bytes_ > 0) && (segsize_bytes_ >= typelng_bytes_) ) {        \
        size_t residual_;                                                     \
        (SEGCOUNT) = (int)(segsize_bytes_ / typelng_bytes_);                  \
        /* bytes of the segment not covered by whole datatypes */             \
        residual_ = segsize_bytes_ - (size_t)(SEGCOUNT) * typelng_bytes_;     \
        /* round to the nearest datatype boundary */                          \
        if( residual_ > (typelng_bytes_ >> 1) ) {                             \
            (SEGCOUNT)++;                                                     \
        }                                                                     \
    }                                                                         \
} while (0)
|
||||
|
||||
#endif /* MCA_COLL_TUNED_EXPORT_H */
|
||||
|
@ -150,35 +150,39 @@ int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorith
|
||||
"Number of allreduce algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
mca_param_indices->algorithm_param_index
|
||||
= mca_base_param_reg_int( &mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm",
|
||||
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
|
||||
false, false, 0, NULL);
|
||||
if( mca_param_indices->algorithm_param_index > max_alg ) {
|
||||
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
||||
opal_output( 0, "Allreduce algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
|
||||
mca_param_indices->algorithm_param_index, max_alg );
|
||||
}
|
||||
mca_param_indices->algorithm_param_index = 0;
|
||||
}
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm",
|
||||
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->segsize_param_index
|
||||
= mca_base_param_reg_int( &mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index
|
||||
= mca_base_param_reg_int( &mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->chain_fanout_param_index
|
||||
= mca_base_param_reg_int( &mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
@ -448,30 +448,40 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm
|
||||
"Number of alltoall algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
mca_param_indices->algorithm_param_index
|
||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm",
|
||||
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
|
||||
false, false, 0, NULL);
|
||||
if( mca_param_indices->algorithm_param_index > max_alg ) {
|
||||
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
||||
opal_output( 0, "Alltoall algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
|
||||
mca_param_indices->algorithm_param_index, max_alg );
|
||||
}
|
||||
mca_param_indices->algorithm_param_index = 0;
|
||||
}
|
||||
|
||||
mca_param_indices->segsize_param_index
|
||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index
|
||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm",
|
||||
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_chain_fanout",
|
||||
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->chain_fanout_param_index
|
||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_chain_fanout",
|
||||
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
@ -233,7 +233,6 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
int count,
|
||||
@ -242,24 +241,19 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
struct ompi_communicator_t* comm,
|
||||
uint32_t segsize )
|
||||
{
|
||||
int segcount;
|
||||
int segcount = count;
|
||||
size_t typelng;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d",
|
||||
ompi_comm_rank(comm), segsize));
|
||||
|
||||
COLL_TUNED_UPDATE_BINTREE( comm, root );
|
||||
|
||||
/**
|
||||
* Determine number of segments and number of elements
|
||||
* sent per operation
|
||||
* Determine number of elements sent per operation.
|
||||
*/
|
||||
ompi_ddt_type_size( datatype, &typelng );
|
||||
if( segsize > typelng ) {
|
||||
segcount = (int)(segsize / typelng);
|
||||
} else {
|
||||
segcount = count;
|
||||
}
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d typelng %ld segcount %d",
|
||||
ompi_comm_rank(comm), segsize, typelng, segcount));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm,
|
||||
segcount, comm->c_coll_selected_data->cached_bintree );
|
||||
@ -276,21 +270,16 @@ ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
|
||||
int segcount;
|
||||
size_t typelng;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d",
|
||||
ompi_comm_rank(comm), segsize));
|
||||
|
||||
COLL_TUNED_UPDATE_PIPELINE( comm, root );
|
||||
|
||||
/**
|
||||
* Determine number of segments and number of elements
|
||||
* sent per operation
|
||||
* Determine number of elements sent per operation.
|
||||
*/
|
||||
ompi_ddt_type_size( datatype, &typelng );
|
||||
if( segsize > typelng ) {
|
||||
segcount = (int)(segsize / typelng);
|
||||
} else {
|
||||
segcount = count;
|
||||
}
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d typelng %ld segcount %d",
|
||||
ompi_comm_rank(comm), segsize, typelng, segcount));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm,
|
||||
segcount, comm->c_coll_selected_data->cached_pipeline );
|
||||
@ -307,19 +296,16 @@ ompi_coll_tuned_bcast_intra_chain( void* buffer,
|
||||
int segcount;
|
||||
size_t typelng;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), chains, segsize));
|
||||
|
||||
COLL_TUNED_UPDATE_CHAIN( comm, root, chains );
|
||||
|
||||
/**
|
||||
* Determine number of segments and number of elements
|
||||
* sent per operation
|
||||
* Determine number of elements sent per operation.
|
||||
*/
|
||||
ompi_ddt_type_size( datatype, &typelng );
|
||||
if( segsize > typelng ) {
|
||||
segcount = (int)(segsize / typelng);
|
||||
} else {
|
||||
segcount = count;
|
||||
}
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d typelng %ld segcount %d",
|
||||
ompi_comm_rank(comm), chains, segsize, typelng, segcount));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm,
|
||||
segcount, comm->c_coll_selected_data->cached_chain );
|
||||
@ -336,19 +322,16 @@ ompi_coll_tuned_bcast_intra_binomial( void* buffer,
|
||||
int segcount;
|
||||
size_t typelng;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d", ompi_comm_rank(comm), segsize));
|
||||
|
||||
COLL_TUNED_UPDATE_BMTREE( comm, root );
|
||||
|
||||
/**
|
||||
* Determine number of segments and number of elements
|
||||
* sent per operation
|
||||
* Determine number of elements sent per operation.
|
||||
*/
|
||||
ompi_ddt_type_size( datatype, &typelng );
|
||||
if( segsize > typelng ) {
|
||||
segcount = (int)(segsize / typelng);
|
||||
} else {
|
||||
segcount = count;
|
||||
}
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %ld segcount %d",
|
||||
ompi_comm_rank(comm), segsize, typelng, segcount));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm,
|
||||
segcount, comm->c_coll_selected_data->cached_bmtree );
|
||||
@ -714,6 +697,13 @@ int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mc
|
||||
"bcast_algorithm",
|
||||
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.",
|
||||
false, false, 0, NULL);
|
||||
if( mca_param_indices->algorithm_param_index > max_alg ) {
|
||||
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
||||
opal_output( 0, "Broadcast algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
|
||||
mca_param_indices->algorithm_param_index, max_alg );
|
||||
}
|
||||
mca_param_indices->algorithm_param_index = 0;
|
||||
}
|
||||
|
||||
mca_param_indices->segsize_param_index
|
||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
|
@ -120,55 +120,6 @@ mca_coll_tuned_component_t mca_coll_tuned_component = {
|
||||
|
||||
static int tuned_open(void)
|
||||
{
|
||||
/* ompi_coll_tuned_component_t *ct = &ompi_coll_tuned_component; */
|
||||
|
||||
/* Use a low priority, but allow other components to be lower */
|
||||
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"priority",
|
||||
"Priority of the tuned coll component",
|
||||
false, false, ompi_coll_tuned_priority,
|
||||
&ompi_coll_tuned_priority);
|
||||
|
||||
/* parameter for pre-allocated memory requests etc */
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"pre_allocate_memory_comm_size_limit",
|
||||
"Size of communicator were we stop pre-allocating memory for the fixed internal buffer used for message requests etc that is hung off the communicator data segment. I.e. if you have a 100'000 nodes you might not want to pre-allocate 200'000 request handle slots per communicator instance!",
|
||||
false, false, ompi_coll_tuned_preallocate_memory_comm_size_limit,
|
||||
&ompi_coll_tuned_preallocate_memory_comm_size_limit);
|
||||
|
||||
/* by default DISABLE dynamic rules and instead use fixed [if based] rules */
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"use_dynamic_rules",
|
||||
"Switch used to decide if we use static (compiled/if statements) or dynamic (built at runtime) decision function rules",
|
||||
false, false, ompi_coll_tuned_use_dynamic_rules,
|
||||
&ompi_coll_tuned_use_dynamic_rules);
|
||||
|
||||
|
||||
/* if dynamic rules allowed then look up dynamic rules config filename, else we leave it an empty filename (NULL) */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
/* char *default_name; */
|
||||
/* asprintf(&default_name, "~/.openmpi/openmpi-coll-tuned-params.conf"); */
|
||||
mca_base_param_reg_string(&mca_coll_tuned_component.super.collm_version,
|
||||
"dynamic_rules_filename",
|
||||
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
|
||||
false, false, ompi_coll_tuned_dynamic_rules_filename,
|
||||
&ompi_coll_tuned_dynamic_rules_filename);
|
||||
}
|
||||
|
||||
/* some initial guesses at topology parameters */
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"init_tree_fanout",
|
||||
"Inital fanout used in the tree topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
|
||||
false, false, ompi_coll_tuned_init_tree_fanout,
|
||||
&ompi_coll_tuned_init_tree_fanout);
|
||||
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"init_chain_fanout",
|
||||
"Inital fanout used in the chain (fanout followed by pipeline) topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
|
||||
false, false, ompi_coll_tuned_init_chain_fanout,
|
||||
&ompi_coll_tuned_init_chain_fanout);
|
||||
|
||||
#if OMPI_ENABLE_DEBUG
|
||||
{
|
||||
int param;
|
||||
@ -184,6 +135,33 @@ static int tuned_open(void)
|
||||
}
|
||||
#endif /* OMPI_ENABLE_DEBUG */
|
||||
|
||||
/* Use a low priority, but allow other components to be lower */
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"priority",
|
||||
"Priority of the tuned coll component",
|
||||
false, false, ompi_coll_tuned_priority,
|
||||
&ompi_coll_tuned_priority);
|
||||
|
||||
/* parameter for pre-allocated memory requests etc */
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"pre_allocate_memory_comm_size_limit",
|
||||
"Size of communicator were we stop pre-allocating memory for the fixed internal buffer used for message requests etc that is hung off the communicator data segment. I.e. if you have a 100'000 nodes you might not want to pre-allocate 200'000 request handle slots per communicator instance!",
|
||||
false, false, ompi_coll_tuned_preallocate_memory_comm_size_limit,
|
||||
&ompi_coll_tuned_preallocate_memory_comm_size_limit);
|
||||
|
||||
/* some initial guesses at topology parameters */
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"init_tree_fanout",
|
||||
"Inital fanout used in the tree topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
|
||||
false, false, ompi_coll_tuned_init_tree_fanout,
|
||||
&ompi_coll_tuned_init_tree_fanout);
|
||||
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"init_chain_fanout",
|
||||
"Inital fanout used in the chain (fanout followed by pipeline) topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
|
||||
false, false, ompi_coll_tuned_init_chain_fanout,
|
||||
&ompi_coll_tuned_init_chain_fanout);
|
||||
|
||||
/* now check that the user hasn't overrode any of the decision functions if dynamic rules are enabled */
|
||||
/* the user can redo this before every comm dup/create if they like */
|
||||
/* this is useful for benchmarking and user knows best tuning */
|
||||
@ -191,10 +169,23 @@ static int tuned_open(void)
|
||||
/* the actual values are looked up during comm create via module init */
|
||||
|
||||
/* intra functions first */
|
||||
/* if dynamic rules allowed then look up dynamic rules config filename, else we leave it an empty filename (NULL) */
|
||||
/* by default DISABLE dynamic rules and instead use fixed [if based] rules */
|
||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"use_dynamic_rules",
|
||||
"Switch used to decide if we use static (compiled/if statements) or dynamic (built at runtime) decision function rules",
|
||||
false, false, ompi_coll_tuned_use_dynamic_rules,
|
||||
&ompi_coll_tuned_use_dynamic_rules);
|
||||
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
mca_base_param_reg_string(&mca_coll_tuned_component.super.collm_version,
|
||||
"dynamic_rules_filename",
|
||||
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
|
||||
false, false, ompi_coll_tuned_dynamic_rules_filename,
|
||||
&ompi_coll_tuned_dynamic_rules_filename);
|
||||
ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
|
||||
ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
|
||||
/* ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
|
||||
/*ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
|
||||
ompi_coll_tuned_barrier_intra_check_forced_init(&ompi_coll_tuned_forced_params[BARRIER]);
|
||||
ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]);
|
||||
ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]);
|
||||
|
@ -61,11 +61,9 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int communicator_size, rank, err;
|
||||
int communicator_size, rank;
|
||||
size_t dsize, total_dsize;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed"));
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
@ -75,14 +73,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
}
|
||||
|
||||
/* else we need data size for decision function */
|
||||
err = ompi_ddt_type_size(sdtype, &dsize);
|
||||
if (err != MPI_SUCCESS) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
ompi_ddt_type_size(sdtype, &dsize);
|
||||
total_dsize = dsize * scount * communicator_size; /* needed for decision */
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed rank %d com_size %d msg_length %ld",
|
||||
rank, communicator_size, total_dsize));
|
||||
|
||||
if (communicator_size >= 12 && total_dsize <= 768) {
|
||||
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
@ -102,11 +98,10 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
*/
|
||||
int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm)
|
||||
{
|
||||
int communicator_size;
|
||||
int communicator_size = ompi_comm_size(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed"));
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed com_size %d",
|
||||
communicator_size));
|
||||
|
||||
if( 2 == communicator_size )
|
||||
return ompi_coll_tuned_barrier_intra_two_procs(comm);
|
||||
@ -148,24 +143,21 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
const double b1 = 11.2445;
|
||||
const double a2 = 0.0023;
|
||||
const double b2 = 3.8074;
|
||||
int communicator_size, rank, err;
|
||||
int communicator_size, rank;
|
||||
int segsize = 0;
|
||||
size_t message_size, dsize;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_dec_fixed"));
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
/* else we need data size for decision function */
|
||||
err = ompi_ddt_type_size(datatype, &dsize);
|
||||
if (err != MPI_SUCCESS) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
ompi_ddt_type_size(datatype, &dsize);
|
||||
message_size = dsize * (unsigned long)count; /* needed for decision */
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_dec_fixed "
|
||||
"root %d rank %d com_size %d msg_length %ld",
|
||||
root, rank, communicator_size, message_size));
|
||||
|
||||
if ((message_size <= 1024) && (communicator_size < 12)) {
|
||||
/* Linear_0K */
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||
@ -245,7 +237,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
struct ompi_op_t* op, int root,
|
||||
struct ompi_communicator_t* comm)
|
||||
{
|
||||
int communicator_size, rank, err, segsize = 0;
|
||||
int communicator_size, rank, segsize = 0;
|
||||
size_t message_size, dsize;
|
||||
const double a1 = 0.6016 / 1024.0; /* [1/B] */
|
||||
const double b1 = 1.3496;
|
||||
@ -256,8 +248,6 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
const double a4 = 0.0033 / 1024.0; /* [1/B] */
|
||||
const double b4 = 1.6761;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"));
|
||||
|
||||
/**
|
||||
* If the operation is non commutative we only have one reduce algorithm right now.
|
||||
*/
|
||||
@ -269,14 +259,13 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
/* need data size for decision function */
|
||||
err = ompi_ddt_type_size(datatype, &dsize);
|
||||
if (err != MPI_SUCCESS) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
ompi_ddt_type_size(datatype, &dsize);
|
||||
message_size = dsize * count; /* needed for decision */
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"
|
||||
"root %d rank %d com_size %d msg_length %ld",
|
||||
root, rank, communicator_size, message_size));
|
||||
|
||||
if (((communicator_size < 20) && (message_size < 512)) ||
|
||||
((communicator_size < 10) && (message_size <= 1024))){
|
||||
/* Linear_0K */
|
||||
|
@ -468,12 +468,18 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
|
||||
"Number of reduce algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index
|
||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm",
|
||||
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial",
|
||||
false, false, 0, NULL);
|
||||
if( mca_param_indices->algorithm_param_index > max_alg ) {
|
||||
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
||||
opal_output( 0, "Reduce algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
|
||||
mca_param_indices->algorithm_param_index, max_alg );
|
||||
}
|
||||
mca_param_indices->algorithm_param_index = 0;
|
||||
}
|
||||
|
||||
mca_param_indices->segsize_param_index
|
||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user