Correct the bcast problem where we always did a bcast with a segsize of 0.
Activate the reduce decision function. Other small updates (mostly TAB to spaces). This commit was SVN r12161.
Этот коммит содержится в:
родитель
50649dd6a9
Коммит
be27ee6fa0
@ -106,7 +106,6 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_basic_linear rank %d", rank));
|
||||
|
||||
/* Reduce to 0 and broadcast. */
|
||||
@ -146,13 +145,13 @@ int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorith
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_count",
|
||||
"Number of allreduce algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm",
|
||||
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
|
||||
@ -160,20 +159,20 @@ mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
|
||||
|
||||
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
@ -181,7 +180,7 @@ mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
@ -193,7 +192,7 @@ int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
|
||||
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
|
||||
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm);
|
||||
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm);
|
||||
@ -216,7 +215,7 @@ int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
|
||||
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm);
|
||||
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm);
|
||||
@ -228,5 +227,3 @@ switch (algorithm) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -443,37 +443,37 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_count",
|
||||
"Number of alltoall algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm",
|
||||
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_chain_fanout",
|
||||
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
@ -487,7 +487,7 @@ int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
|
||||
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
@ -512,7 +512,7 @@ int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
|
@ -336,17 +336,17 @@ int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[BARRIER] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm_count",
|
||||
"Number of barrier algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm",
|
||||
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only",
|
||||
false, false, 0, NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
@ -356,14 +356,14 @@ int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm)
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[BARRIER].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
|
||||
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
|
||||
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm);
|
||||
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm);
|
||||
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm);
|
||||
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm);
|
||||
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm);
|
||||
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
|
||||
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
comm->c_coll_selected_data->user_forced[BARRIER].algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
@ -377,14 +377,14 @@ int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, int
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
|
||||
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm);
|
||||
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm);
|
||||
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm);
|
||||
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm);
|
||||
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm);
|
||||
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
|
||||
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
|
@ -111,7 +111,7 @@ ompi_coll_tuned_bcast_intra_chain ( void *buff, int count,
|
||||
/* set the buffer pointer */
|
||||
tmpbuf = (char *)buff;
|
||||
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,("%1d chain root %d num_segments %d\n", rank, root, num_segments); */
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,("%1d chain root %d num_segments %d\n", rank, root, num_segments); */
|
||||
|
||||
/* root code */
|
||||
if( rank == root ) {
|
||||
@ -735,7 +735,6 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_basic_linear rank %d root %d", rank, root));
|
||||
|
||||
|
||||
/* Non-root receive the data. */
|
||||
|
||||
if (rank != root) {
|
||||
@ -802,37 +801,37 @@ int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mc
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[BCAST] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_count",
|
||||
"Number of bcast algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm",
|
||||
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_chain_fanout",
|
||||
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
@ -844,7 +843,7 @@ int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[BCAST].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
|
||||
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
|
||||
case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm);
|
||||
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm,
|
||||
@ -856,7 +855,7 @@ switch (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize);
|
||||
case (5): return ompi_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm,
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize);
|
||||
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
|
||||
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
|
||||
* ompi_coll_tuned_bcast_forced_segsize); */
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
@ -877,14 +876,14 @@ int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
|
||||
case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm);
|
||||
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm, segsize, faninout );
|
||||
case (3): return ompi_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm, segsize);
|
||||
case (4): return ompi_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm, segsize);
|
||||
case (5): return ompi_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm, segsize);
|
||||
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
|
||||
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
|
||||
* segsize); */
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
|
@ -122,7 +122,7 @@ static int tuned_open(void)
|
||||
{
|
||||
int param;
|
||||
|
||||
/* ompi_coll_tuned_component_t *ct = &ompi_coll_tuned_component; */
|
||||
/* ompi_coll_tuned_component_t *ct = &ompi_coll_tuned_component; */
|
||||
|
||||
/* Use a low priority, but allow other components to be lower */
|
||||
|
||||
@ -149,8 +149,8 @@ static int tuned_open(void)
|
||||
|
||||
/* if dynamic rules allowed then look up dynamic rules config filename, else we leave it an empty filename (NULL) */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
/* char *default_name; */
|
||||
/* asprintf(&default_name, "~/.openmpi/openmpi-coll-tuned-params.conf"); */
|
||||
/* char *default_name; */
|
||||
/* asprintf(&default_name, "~/.openmpi/openmpi-coll-tuned-params.conf"); */
|
||||
mca_base_param_reg_string(&mca_coll_tuned_component.super.collm_version,
|
||||
"dynamic_rules_filename",
|
||||
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
|
||||
@ -190,7 +190,7 @@ static int tuned_open(void)
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
|
||||
ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
|
||||
/* ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
|
||||
/* ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
|
||||
ompi_coll_tuned_barrier_intra_check_forced_init(&ompi_coll_tuned_forced_params[BARRIER]);
|
||||
ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]);
|
||||
ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]);
|
||||
|
@ -83,9 +83,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
if (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
|
||||
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -130,9 +128,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
if (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
|
||||
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -165,10 +161,7 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm)
|
||||
if (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
|
||||
return ompi_coll_tuned_barrier_intra_do_forced (comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
@ -208,10 +201,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
if (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
|
||||
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_bcast_intra_dec_fixed (buff, count, datatype, root, comm);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
@ -252,9 +242,6 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
if (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
|
||||
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_reduce_intra_dec_fixed (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -41,17 +41,9 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
/* int size; */
|
||||
/* int contig; */
|
||||
/* int dsize; */
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_fixed"));
|
||||
|
||||
/* size = ompi_comm_size(comm); */
|
||||
|
||||
return (ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm));
|
||||
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
@ -68,11 +60,8 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int comsize;
|
||||
int rank;
|
||||
int err;
|
||||
unsigned long dsize;
|
||||
unsigned long total_dsize;
|
||||
int comsize, rank, err;
|
||||
size_t dsize, total_dsize;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed"));
|
||||
|
||||
@ -91,17 +80,15 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
return (err);
|
||||
}
|
||||
|
||||
total_dsize = dsize * scount * (unsigned long)comsize; /* needed for decision */
|
||||
total_dsize = dsize * scount * comsize; /* needed for decision */
|
||||
|
||||
if (comsize >= 12 && total_dsize <= 768) {
|
||||
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
else if (total_dsize <= 131072) {
|
||||
if (total_dsize <= 131072) {
|
||||
return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -122,11 +109,10 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm)
|
||||
|
||||
if (2==comsize)
|
||||
return ompi_coll_tuned_barrier_intra_two_procs(comm);
|
||||
else
|
||||
/* return ompi_coll_tuned_barrier_intra_doublering(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_doublering(comm); */
|
||||
return ompi_coll_tuned_barrier_intra_recursivedoubling(comm);
|
||||
/* return ompi_coll_tuned_barrier_intra_bruck(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_linear(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_bruck(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_linear(comm); */
|
||||
|
||||
}
|
||||
|
||||
@ -142,13 +128,9 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int comsize;
|
||||
int rank;
|
||||
int err;
|
||||
unsigned long msgsize;
|
||||
unsigned long dsize;
|
||||
int comsize, rank, err;
|
||||
int segsize = 0;
|
||||
|
||||
size_t msgsize, dsize;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_dec_fixed"));
|
||||
|
||||
@ -166,34 +148,29 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
|
||||
/* this is based on gige measurements */
|
||||
|
||||
if ((comsize < 4)) {
|
||||
segsize = 0;
|
||||
if (comsize < 4) {
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||
}
|
||||
else if (comsize == 4) {
|
||||
if (comsize == 4) {
|
||||
if (msgsize < 524288) segsize = 0;
|
||||
else msgsize = 16384;
|
||||
else segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
else if (comsize > 4 && comsize <= 8 && msgsize < 4096) {
|
||||
segsize = 0;
|
||||
if (comsize <= 8 && msgsize < 4096) {
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||
}
|
||||
else if (comsize > 8 && msgsize >= 32768 && msgsize < 524288) {
|
||||
if (comsize > 8 && msgsize >= 32768 && msgsize < 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
else if (comsize > 4 && msgsize >= 524288) {
|
||||
if (msgsize >= 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
else {
|
||||
segsize = 0;
|
||||
/* once tested can swap this back in */
|
||||
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
|
||||
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
@ -209,15 +186,8 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
struct ompi_op_t* op, int root,
|
||||
struct ompi_communicator_t* comm)
|
||||
{
|
||||
int comsize;
|
||||
int rank;
|
||||
int err;
|
||||
/* int contig; */
|
||||
unsigned long msgsize;
|
||||
unsigned long dsize;
|
||||
int segsize = 0;
|
||||
/* int fanout = 0; */
|
||||
|
||||
int comsize, rank, err, segsize = 0, fanout = 0;
|
||||
size_t msgsize, dsize;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"));
|
||||
|
||||
@ -231,35 +201,29 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
return (err);
|
||||
}
|
||||
|
||||
msgsize = dsize * (unsigned long)count; /* needed for decision */
|
||||
msgsize = dsize * count; /* needed for decision */
|
||||
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
#ifdef coconuts
|
||||
/* for small messages use linear algorithm */
|
||||
if (msgsize <= 4096) {
|
||||
segsize = 0;
|
||||
fanout = size-1;
|
||||
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
|
||||
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
|
||||
fanout = comsize - 1;
|
||||
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
|
||||
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
|
||||
} else if (msgsize <= 65536 ) {
|
||||
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
|
||||
}
|
||||
if (msgsize < 524288) {
|
||||
if (msgsize <= 65536 ) {
|
||||
segsize = 32768;
|
||||
fanout = 8;
|
||||
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout);
|
||||
} else if (msgsize < 524288) {
|
||||
} else {
|
||||
segsize = 1024;
|
||||
fanout = size/2;
|
||||
/* later swap this for a binary tree */
|
||||
/* fanout = 2; */
|
||||
fanout = comsize/2;
|
||||
}
|
||||
/* later swap this for a binary tree */
|
||||
/* fanout = 2; */
|
||||
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
}
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -235,7 +235,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
return (total_alg_count);
|
||||
|
||||
|
||||
on_file_error:
|
||||
on_file_error:
|
||||
|
||||
/* here we close out the file and delete any memory allocated nicely */
|
||||
/* we return back a verbose message and a count of -1 algorithms read */
|
||||
@ -274,7 +274,7 @@ static int getnext (FILE *fptr)
|
||||
}
|
||||
} while (1);
|
||||
|
||||
return rc;
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void skiptonewline (FILE *fptr)
|
||||
|
@ -221,7 +221,6 @@ int ompi_coll_tuned_free_msg_rules_in_com_rule (ompi_coll_com_rule_t* com_p)
|
||||
|
||||
} /* if we have msg rules to free as well */
|
||||
|
||||
|
||||
return (rc);
|
||||
}
|
||||
|
||||
@ -318,15 +317,15 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
||||
i = best = 0;
|
||||
|
||||
while (i<alg_p->n_com_sizes) {
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,"checking comsize %d against alg_id %d com_id %d index %d com_size %d", */
|
||||
/* mpi_comsize, com_p->alg_rule_id, com_p->com_rule_id, i, com_p->mpi_comsize)); */
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,"checking comsize %d against alg_id %d com_id %d index %d com_size %d", */
|
||||
/* mpi_comsize, com_p->alg_rule_id, com_p->com_rule_id, i, com_p->mpi_comsize)); */
|
||||
if (com_p->mpi_comsize <= mpi_comsize) {
|
||||
best = i;
|
||||
best_com_p = com_p;
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":ok\n")); */
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":ok\n")); */
|
||||
}
|
||||
else {
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":nop\n")); */
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":nop\n")); */
|
||||
break;
|
||||
}
|
||||
/* go to the next entry */
|
||||
@ -385,15 +384,15 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul
|
||||
i = best = 0;
|
||||
|
||||
while (i<base_com_rule->n_msg_sizes) {
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,"checking mpi_msgsize %d against com_id %d msg_id %d index %d msg_size %d", */
|
||||
/* mpi_msgsize, msg_p->com_rule_id, msg_p->msg_rule_id, i, msg_p->msg_size)); */
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,"checking mpi_msgsize %d against com_id %d msg_id %d index %d msg_size %d", */
|
||||
/* mpi_msgsize, msg_p->com_rule_id, msg_p->msg_rule_id, i, msg_p->msg_size)); */
|
||||
if (msg_p->msg_size <= mpi_msgsize) {
|
||||
best = i;
|
||||
best_msg_p = msg_p;
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":ok\n")); */
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":ok\n")); */
|
||||
}
|
||||
else {
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":nop\n")); */
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream(":nop\n")); */
|
||||
break;
|
||||
}
|
||||
/* go to the next entry */
|
||||
|
@ -49,7 +49,7 @@ int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indic
|
||||
mca_base_param_lookup_int (mca_params.tree_fanout_param_index, &(forced_values->tree_fanout));
|
||||
mca_base_param_lookup_int (mca_params.chain_fanout_param_index, &(forced_values->chain_fanout));
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
@ -59,7 +59,7 @@ int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_par
|
||||
{
|
||||
mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
@ -61,8 +61,6 @@ int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indic
|
||||
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
|
||||
coll_tuned_force_algorithm_params_t *forced_values);
|
||||
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -52,37 +52,37 @@ static const mca_coll_base_module_1_0_0_t intra_fixed = {
|
||||
|
||||
/* Collective function pointers */
|
||||
|
||||
/* ompi_coll_tuned_allgather_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_allgather_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allgatherv_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_allgatherv_intra_dec_fixed, */
|
||||
NULL,
|
||||
ompi_coll_tuned_allreduce_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_alltoall_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_alltoallv_intra_dec_fixed, */
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_alltoallv_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallw_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_alltoallw_intra_dec_fixed, */
|
||||
NULL,
|
||||
ompi_coll_tuned_barrier_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_bcast_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_exscan_intra_dec_fixed, */
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_exscan_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gather_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_gather_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gatherv_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_gatherv_intra_dec_fixed, */
|
||||
NULL,
|
||||
ompi_coll_tuned_reduce_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_reduce_scatter_intra_dec_fixed, */
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_reduce_scatter_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scan_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_scan_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatter_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_scatter_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatterv_intra_dec_fixed */
|
||||
/* ompi_coll_tuned_scatterv_intra_dec_fixed */
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -95,37 +95,37 @@ static const mca_coll_base_module_1_0_0_t intra_dynamic = {
|
||||
|
||||
/* Collective function pointers */
|
||||
|
||||
/* ompi_coll_tuned_allgather_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allgather_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allgatherv_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allgatherv_intra_dec_dynamic, */
|
||||
NULL,
|
||||
ompi_coll_tuned_allreduce_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_alltoall_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_alltoallv_intra_dec_dynamic, */
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_alltoallv_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallw_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_alltoallw_intra_dec_dynamic, */
|
||||
NULL,
|
||||
ompi_coll_tuned_barrier_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_bcast_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_exscan_intra_dec_dynamic, */
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_exscan_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gather_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_gather_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gatherv_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_gatherv_intra_dec_dynamic, */
|
||||
NULL,
|
||||
ompi_coll_tuned_reduce_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_reduce_scatter_intra_dec_dynamic, */
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_reduce_scatter_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scan_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_scan_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatter_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_scatter_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatterv_intra_dec_dynamic */
|
||||
/* ompi_coll_tuned_scatterv_intra_dec_dynamic */
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -144,37 +144,37 @@ static const mca_coll_base_module_1_0_0_t inter_fixed = {
|
||||
|
||||
/* Collective function pointers */
|
||||
|
||||
/* ompi_coll_tuned_allgather_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_allgather_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allgatherv_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_allgatherv_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allreduce_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_allreduce_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoall_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_alltoall_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallv_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_alltoallv_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallw_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_alltoallw_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_barrier_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_barrier_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_bcast_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_bcast_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_exscan_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_exscan_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gather_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_gather_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gatherv_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_gatherv_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_reduce_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_reduce_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_reduce_scatter_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_reduce_scatter_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scan_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_scan_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatter_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_scatter_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatterv_inter_dec_fixed */
|
||||
/* ompi_coll_tuned_scatterv_inter_dec_fixed */
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -187,37 +187,37 @@ static const mca_coll_base_module_1_0_0_t inter_dynamic = {
|
||||
|
||||
/* Collective function pointers */
|
||||
|
||||
/* ompi_coll_tuned_allgather_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allgather_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allgatherv_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allgatherv_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allreduce_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allreduce_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoall_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_alltoall_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallv_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_alltoallv_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallw_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_alltoallw_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_barrier_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_barrier_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_bcast_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_bcast_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_exscan_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_exscan_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gather_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_gather_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gatherv_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_gatherv_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_reduce_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_reduce_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_reduce_scatter_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_reduce_scatter_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scan_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_scan_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatter_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_scatter_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatterv_inter_dec_dynamic */
|
||||
/* ompi_coll_tuned_scatterv_inter_dec_dynamic */
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -388,7 +388,7 @@ ompi_coll_tuned_module_init(struct ompi_communicator_t *comm)
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLREDUCE], &(data->user_forced[ALLREDUCE]));
|
||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALL], &(data->user_forced[ALLTOALL]));
|
||||
/* ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALLV], &(data->user_forced[ALLTOALLV])); */
|
||||
/* ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALLV], &(data->user_forced[ALLTOALLV])); */
|
||||
ompi_coll_tuned_forced_getvalues_barrier (ompi_coll_tuned_forced_params[BARRIER], &(data->user_forced[BARRIER]));
|
||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[BCAST], &(data->user_forced[BCAST]));
|
||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[REDUCE], &(data->user_forced[REDUCE]));
|
||||
|
@ -96,10 +96,10 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
}
|
||||
realsegsize = segcount * ext;
|
||||
|
||||
/* printf("rank %d root %d count %d \t\t segsize %d typesize %d typeext %d realsegsize %d segcount %d num_segments %d\n", */
|
||||
/* rank, root, count, segsize, typelng, ext, realsegsize, segcount, num_segments); */
|
||||
/* printf("rank %d root %d count %d \t\t segsize %d typesize %d typeext %d realsegsize %d segcount %d num_segments %d\n", */
|
||||
/* rank, root, count, segsize, typelng, ext, realsegsize, segcount, num_segments); */
|
||||
|
||||
/* ompi_coll_tuned_topo_dump_chain (chain, rank); */
|
||||
/* ompi_coll_tuned_topo_dump_chain (chain, rank); */
|
||||
|
||||
|
||||
if (sendbuf != MPI_IN_PLACE) {
|
||||
@ -255,11 +255,11 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
} /* end of for each segment */
|
||||
|
||||
/* clean up */
|
||||
/* if (inbuf!=NULL) { */
|
||||
/* if (inbuf!=NULL) { */
|
||||
if (inbuf[0] != NULL) free(inbuf[0]);
|
||||
if (inbuf[1] != NULL) free(inbuf[1]);
|
||||
if (allocedaccumbuf) free(accumbuf);
|
||||
/* } */
|
||||
/* } */
|
||||
}
|
||||
|
||||
/* leaf nodes */
|
||||
@ -280,11 +280,11 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
/* error handler */
|
||||
error_hndl:
|
||||
OPAL_OUTPUT (( ompi_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret ));
|
||||
/* if( inbuf != NULL ) { */
|
||||
/* if( inbuf != NULL ) { */
|
||||
if( inbuf[0] != NULL ) free(inbuf[0]);
|
||||
if( inbuf[1] != NULL ) free(inbuf[1]);
|
||||
if (allocedaccumbuf) free(accumbuf);
|
||||
/* } */
|
||||
/* } */
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -356,8 +356,8 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
||||
return err;
|
||||
}
|
||||
|
||||
/* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extent */
|
||||
/* for reducing buffer allocation lengths.... */
|
||||
/* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extent */
|
||||
/* for reducing buffer allocation lengths.... */
|
||||
|
||||
ompi_ddt_get_extent(dtype, &lb, &extent);
|
||||
ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);
|
||||
@ -451,37 +451,36 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[REDUCE] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_count",
|
||||
"Number of reduce algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm",
|
||||
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
@ -493,7 +492,7 @@ int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
|
||||
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
|
||||
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
|
||||
@ -506,7 +505,6 @@ switch (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -519,7 +517,7 @@ int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
|
||||
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
|
||||
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
|
||||
@ -531,6 +529,5 @@ switch (algorithm) {
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
@ -428,23 +428,23 @@ int ompi_coll_tuned_topo_destroy_chain( ompi_coll_chain_t** chain )
|
||||
|
||||
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
|
||||
{
|
||||
int i;
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:topo_dump_tree %1d tree root %d fanout %d BM %1d nextsize %d prev %d", rank,
|
||||
int i;
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:topo_dump_tree %1d tree root %d fanout %d BM %1d nextsize %d prev %d", rank,
|
||||
tree->tree_root, tree->tree_bmtree, tree->tree_fanout, tree->tree_nextsize, tree->tree_prev));
|
||||
if (tree->tree_nextsize) {
|
||||
if (tree->tree_nextsize) {
|
||||
for (i=0;i<tree->tree_nextsize;i++) OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d", i, tree->tree_next[i]));
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_topo_dump_chain (ompi_coll_chain_t* chain, int rank)
|
||||
{
|
||||
int i;
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:topo_dump_chain %1d chain root %d fanout %d nextsize %d prev %d\n", rank,
|
||||
int i;
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:topo_dump_chain %1d chain root %d fanout %d nextsize %d prev %d\n", rank,
|
||||
chain->chain_root, chain->chain_numchain, chain->chain_nextsize, chain->chain_prev));
|
||||
if (chain->chain_nextsize) {
|
||||
if (chain->chain_nextsize) {
|
||||
for (i=0;i<chain->chain_nextsize;i++) OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d ", i, chain->chain_next[i]));
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
@ -65,8 +65,6 @@ int ompi_coll_tuned_topo_destroy_chain( ompi_coll_chain_t** chain );
|
||||
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
|
||||
int ompi_coll_tuned_topo_dump_chain (ompi_coll_chain_t* chain, int rank);
|
||||
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -36,9 +36,9 @@ int ompi_coll_tuned_sendrecv_actual( void* sendbuf, int scount, ompi_datatype_t*
|
||||
ompi_status_public_t* status )
|
||||
|
||||
{ /* post receive first, then send, then waitall... should be fast (I hope) */
|
||||
int err, line = 0;
|
||||
ompi_request_t* reqs[2];
|
||||
ompi_status_public_t statuses[2];
|
||||
int err, line = 0;
|
||||
ompi_request_t* reqs[2];
|
||||
ompi_status_public_t statuses[2];
|
||||
|
||||
/* post new irecv */
|
||||
err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag, comm, &reqs[0]));
|
||||
@ -73,9 +73,9 @@ int ompi_coll_tuned_sendrecv_actual_localcompleted (
|
||||
struct ompi_communicator_t* comm, ompi_status_public_t* status )
|
||||
|
||||
{ /* post receive first, then [local] sync send, then wait... should be fast (I hope) */
|
||||
int err, line = 0;
|
||||
ompi_request_t* req;
|
||||
ompi_status_public_t tmpstatus;
|
||||
int err, line = 0;
|
||||
ompi_request_t* req;
|
||||
ompi_status_public_t tmpstatus;
|
||||
|
||||
/* post new irecv */
|
||||
err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag, comm, &req));
|
||||
@ -98,3 +98,4 @@ ompi_status_public_t tmpstatus;
|
||||
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",__FILE__,line,err));
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
@ -53,10 +53,8 @@ static inline int ompi_coll_tuned_sendrecv( void* sendbuf, int scount, ompi_data
|
||||
if ((dest==myid)&&(source==myid)) {
|
||||
return (int) ompi_ddt_sndrcv(sendbuf, (int32_t) scount, sdatatype, recvbuf, (int32_t) rcount, rdatatype);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_sendrecv_actual (sendbuf, scount, sdatatype, dest, stag, recvbuf, rcount, rdatatype,
|
||||
source, rtag, comm, status);
|
||||
}
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_sendrecv_actual_localcompleted( void* sendbuf, int scount, ompi_datatype_t* sdatatype,
|
||||
@ -79,10 +77,8 @@ static inline int ompi_coll_tuned_sendrecv_localcompleted( void* sendbuf, int sc
|
||||
if ((dest==myid)&&(source==myid)) {
|
||||
return (int) ompi_ddt_sndrcv(sendbuf, (int32_t) scount, sdatatype, recvbuf, (int32_t) rcount, rdatatype);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_sendrecv_actual_localcompleted (sendbuf, scount, sdatatype, dest, stag, recvbuf, rcount, rdatatype,
|
||||
source, rtag, comm, status);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user