1
1
Started to add static (fixed `if` statement based) decision rules based on GigE numbers.
Added MCA params so that a user can force a certain algorithm/segment/topo on a per-collective basis
(this is not in the fixed call path, only in the dynamic (at comm create) call path).
These params can be used by test suites such as OCC to choose which algorithm they are using.

This commit was SVN r7854.
Этот коммит содержится в:
Graham Fagg 2005-10-25 03:55:58 +00:00
родитель 65bcc283c0
Коммит 382f05c7ad
9 изменённых файлов: 398 добавлений и 18 удалений

Просмотреть файл

@ -45,7 +45,23 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_use_dynamic_rules;
OMPI_COMP_EXPORT extern int mca_coll_tuned_init_tree_fanout;
OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout;
/* forced algorithm choices */
/* Each variable below is the storage handed to mca_base_param_reg_int() by the
 * matching mca_coll_tuned_<collective>_intra_check_forced() routine.  A
 * *_forced_choice of 0 makes the *_intra_do_forced() routine fall back to the
 * fixed decision function. */
/* alltoall: "alltoall_algorithm", "_segmentsize", "_tree_fanout", "_chain_fanout" */
OMPI_COMP_EXPORT extern int mca_coll_tuned_alltoall_forced_choice;
OMPI_COMP_EXPORT extern int mca_coll_tuned_alltoall_forced_segsize;
OMPI_COMP_EXPORT extern int mca_coll_tuned_alltoall_forced_tree_fanout;
OMPI_COMP_EXPORT extern int mca_coll_tuned_alltoall_forced_chain_fanout;
/* barrier: "barrier_algorithm" only (no segment size or fanout parameters) */
OMPI_COMP_EXPORT extern int mca_coll_tuned_barrier_forced_choice;
/* bcast: "bcast_algorithm", "_segmentsize", "_tree_fanout", "_chain_fanout" */
OMPI_COMP_EXPORT extern int mca_coll_tuned_bcast_forced_choice;
OMPI_COMP_EXPORT extern int mca_coll_tuned_bcast_forced_segsize;
OMPI_COMP_EXPORT extern int mca_coll_tuned_bcast_forced_tree_fanout;
OMPI_COMP_EXPORT extern int mca_coll_tuned_bcast_forced_chain_fanout;
/* reduce: "reduce_algorithm", "_segmentsize", "_tree_fanout", "_chain_fanout" */
OMPI_COMP_EXPORT extern int mca_coll_tuned_reduce_forced_choice;
OMPI_COMP_EXPORT extern int mca_coll_tuned_reduce_forced_segsize;
OMPI_COMP_EXPORT extern int mca_coll_tuned_reduce_forced_tree_fanout;
OMPI_COMP_EXPORT extern int mca_coll_tuned_reduce_forced_chain_fanout;
/*
* coll API functions
@ -146,6 +162,13 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout;
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
/* Run the alltoall algorithm selected by mca_coll_tuned_alltoall_forced_choice
 * (0 = fixed decision function, 1 = linear, 2 = pairwise, 3 = bruck,
 * 4 = two procs).  Returns MPI_ERR_ARG for any other choice. */
int mca_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
/* Register the alltoall forcing MCA parameters; always returns MPI_SUCCESS. */
int mca_coll_tuned_alltoall_intra_check_forced(void);
/* Number of alltoall algorithms available (currently 4). */
int mca_coll_tuned_alltoall_intra_query (void);
int mca_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
@ -233,11 +256,14 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout;
int mca_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_intra_dec_dynamic(
struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_intra_check_forced(void);
int mca_coll_tuned_barrier_intra_query (void);
int mca_coll_tuned_barrier_inter_dec_fixed(struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_inter_dec_dynamic(
struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm);
@ -253,6 +279,12 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout;
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm);
/* Run the bcast algorithm selected by mca_coll_tuned_bcast_forced_choice
 * (0 = fixed decision function, 2 = chain, 3 = pipeline, 4 = split binary
 * tree, 5 = binary tree), using the forced segment size (and the forced chain
 * fanout for the chain).  Returns MPI_ERR_ARG for any other choice. */
int mca_coll_tuned_bcast_intra_do_forced(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm);
/* Register the bcast forcing MCA parameters; always returns MPI_SUCCESS. */
int mca_coll_tuned_bcast_intra_check_forced(void);
/* Number of bcast algorithms available (currently 4). */
int mca_coll_tuned_bcast_intra_query (void);
int mca_coll_tuned_bcast_intra_linear(void *buff, int count,
struct ompi_datatype_t *datatype,
@ -366,6 +398,12 @@ OMPI_COMP_EXPORT extern int mca_coll_tuned_init_chain_fanout;
struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm);
/* Run the reduce algorithm selected by mca_coll_tuned_reduce_forced_choice
 * (0 = fixed decision function, 2 = chain, 3 = pipeline), using the forced
 * segment size (and the forced chain fanout for the chain).  Returns
 * MPI_ERR_ARG for any other choice. */
int mca_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm);
/* Register the reduce forcing MCA parameters; always returns MPI_SUCCESS. */
int mca_coll_tuned_reduce_intra_check_forced(void);
/* Value reported by the reduce query routine (currently 3). */
int mca_coll_tuned_reduce_intra_query (void);
int mca_coll_tuned_reduce_intra_chain(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,

Просмотреть файл

@ -30,6 +30,7 @@
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
int mca_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
@ -331,6 +332,67 @@ int mca_coll_tuned_alltoall_intra_linear(void *sbuf, int scount,
return err;
}
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/*
 * Register the four MCA parameters through which a user can force the
 * alltoall algorithm, its segment size and its tree/chain fanouts.
 *
 * The registration results are not inspected; the routine always reports
 * MPI_SUCCESS.
 */
int mca_coll_tuned_alltoall_intra_check_forced(void)
{
    /* algorithm selector; its current value is passed as the default */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "alltoall_algorithm",
                                  "Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
                                  false, false,
                                  mca_coll_tuned_alltoall_forced_choice,
                                  &mca_coll_tuned_alltoall_forced_choice);

    /* segment size for a forced algorithm */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "alltoall_algorithm_segmentsize",
                                  "Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                  false, false,
                                  mca_coll_tuned_alltoall_forced_segsize,
                                  &mca_coll_tuned_alltoall_forced_segsize);

    /* n-tree fanout; the default comes from the system wide tree fanout */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "alltoall_algorithm_tree_fanout",
                                  "Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                  false, false,
                                  mca_coll_tuned_init_tree_fanout,
                                  &mca_coll_tuned_alltoall_forced_tree_fanout);

    /* chain fanout; the default comes from the system wide chain fanout */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "alltoall_algorithm_chain_fanout",
                                  "Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                                  false, false,
                                  mca_coll_tuned_init_chain_fanout,
                                  &mca_coll_tuned_alltoall_forced_chain_fanout);

    return MPI_SUCCESS;
}
/*
 * Report how many alltoall algorithms are available.
 * Must be kept in step with the algorithm list whenever one is added.
 */
int mca_coll_tuned_alltoall_intra_query(void)
{
    enum { ALLTOALL_ALGORITHMS_AVAILABLE = 4 };

    return ALLTOALL_ALGORITHMS_AVAILABLE;
}
/*
 * Dispatch an alltoall to the algorithm named by
 * mca_coll_tuned_alltoall_forced_choice:
 *   0 fixed decision function, 1 linear, 2 pairwise, 3 bruck, 4 two procs.
 *
 * All arguments are handed through untouched to the selected routine and its
 * return value is returned.  Any other choice is logged and rejected with
 * MPI_ERR_ARG.
 */
int mca_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
                                            struct ompi_datatype_t *sdtype,
                                            void* rbuf, int rcount,
                                            struct ompi_datatype_t *rdtype,
                                            struct ompi_communicator_t *comm)
{
    const int forced = mca_coll_tuned_alltoall_forced_choice;

    if (0 == forced) {
        return mca_coll_tuned_alltoall_intra_dec_fixed(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype, comm);
    }
    if (1 == forced) {
        return mca_coll_tuned_alltoall_intra_linear(sbuf, scount, sdtype,
                                                    rbuf, rcount, rdtype, comm);
    }
    if (2 == forced) {
        return mca_coll_tuned_alltoall_intra_pairwise(sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype, comm);
    }
    if (3 == forced) {
        return mca_coll_tuned_alltoall_intra_bruck(sbuf, scount, sdtype,
                                                   rbuf, rcount, rdtype, comm);
    }
    if (4 == forced) {
        return mca_coll_tuned_alltoall_intra_two_procs(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype, comm);
    }

    /* nothing matched: report the bad request and refuse it */
    OPAL_OUTPUT((mca_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                 forced, mca_coll_tuned_alltoall_intra_query()));
    return MPI_ERR_ARG;
}

Просмотреть файл

@ -48,21 +48,39 @@ int mca_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
int err;
int contig;
int dsize;
MPI_Aint sext;
long lb;
OPAL_OUTPUT((mca_coll_tuned_stream, "mca_coll_tuned_alltoall_intra_dec_fixed"));
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* special case */
if (size==2) {
return mca_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
}
else {
/* return mca_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); */
return mca_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
/* else we need data size for decision function */
err = ompi_ddt_get_extent (sdtype, &lb, &sext);
if (err != MPI_SUCCESS) {
OPAL_OUTPUT((mca_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
return (err);
}
/* return OMPI_ERR_NOT_IMPLEMENTED; */
dsize = sext * scount * size; /* needed for decision */
if (size >= 12 && dsize <= 768) {
return mca_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
}
else if (dsize <= 131072) {
/* not implemented yet.. need to find a 'nice' way to use the basic linear version without duplicating code */
/* return mca_coll_tuned_alltoall_intra_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); */
return mca_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
}
else {
return mca_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
}
}

Просмотреть файл

@ -224,6 +224,47 @@ int mca_coll_tuned_barrier_intra_linear(struct ompi_communicator_t *comm)
return OMPI_ERR_NOT_IMPLEMENTED;
}
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/*
 * Register the MCA parameter through which a user can force the barrier
 * algorithm.  Barrier has no segment size or fanout parameters.
 *
 * The registration result is not inspected; the routine always reports
 * MPI_SUCCESS.
 */
int mca_coll_tuned_barrier_intra_check_forced(void)
{
    /* algorithm selector; its current value is passed as the default */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "barrier_algorithm",
                                  "Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: step based bmtree",
                                  false, false,
                                  mca_coll_tuned_barrier_forced_choice,
                                  &mca_coll_tuned_barrier_forced_choice);

    return MPI_SUCCESS;
}
/*
 * Report how many barrier algorithms are available.
 * Four are available today; two more are still to do.
 */
int mca_coll_tuned_barrier_intra_query(void)
{
    enum { BARRIER_ALGORITHMS_AVAILABLE = 4 };

    return BARRIER_ALGORITHMS_AVAILABLE;
}
/*
 * Dispatch a barrier to the algorithm named by
 * mca_coll_tuned_barrier_forced_choice:
 *   0 fixed decision function, 2 double ring, 3 recursive doubling,
 *   4 bruck, 5 two procs.
 *
 * Choices 1 (linear) and 6 (step based bmtree) are not wired up yet and are
 * rejected like any other unknown value.
 *
 * @param comm  communicator to synchronise; passed through unchanged
 * @return the selected algorithm's return value, or MPI_ERR_ARG when the
 *         forced choice does not name a selectable algorithm
 */
int mca_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm)
{
    switch (mca_coll_tuned_barrier_forced_choice) {
    case (0): return mca_coll_tuned_barrier_intra_dec_fixed (comm);
    case (2): return mca_coll_tuned_barrier_intra_doublering (comm);
    case (3): return mca_coll_tuned_barrier_intra_recursivedoubling (comm);
    case (4): return mca_coll_tuned_barrier_intra_bruck (comm);
    case (5): return mca_coll_tuned_barrier_intra_two_procs (comm);
    default:
        /* The valid set is spelled out here because the algorithm count
         * returned by the query routine (4) is not the highest selectable id:
         * 5 is accepted above while 1 is not.  Keep this text in step with
         * the cases above. */
        OPAL_OUTPUT((mca_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0 and 2-5 are valid?",
                     mca_coll_tuned_barrier_forced_choice));
        return (MPI_ERR_ARG);
    } /* switch */
}

Просмотреть файл

@ -682,3 +682,65 @@ mca_coll_tuned_bcast_intra_bintree ( void* buffer,
}
/*
 * Register the four MCA parameters through which a user can force the bcast
 * algorithm, its segment size and its tree/chain fanouts.
 *
 * The registration results are not inspected; the routine always reports
 * MPI_SUCCESS.
 */
int mca_coll_tuned_bcast_intra_check_forced(void)
{
    /* algorithm selector; its current value is passed as the default */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "bcast_algorithm",
                                  "Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: BM tree.",
                                  false, false,
                                  mca_coll_tuned_bcast_forced_choice,
                                  &mca_coll_tuned_bcast_forced_choice);

    /* segment size for a forced algorithm */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "bcast_algorithm_segmentsize",
                                  "Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                  false, false,
                                  mca_coll_tuned_bcast_forced_segsize,
                                  &mca_coll_tuned_bcast_forced_segsize);

    /* n-tree fanout; the default comes from the system wide tree fanout */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "bcast_algorithm_tree_fanout",
                                  "Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                  false, false,
                                  mca_coll_tuned_init_tree_fanout,
                                  &mca_coll_tuned_bcast_forced_tree_fanout);

    /* chain fanout; the default comes from the system wide chain fanout */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "bcast_algorithm_chain_fanout",
                                  "Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                                  false, false,
                                  mca_coll_tuned_init_chain_fanout,
                                  &mca_coll_tuned_bcast_forced_chain_fanout);

    return MPI_SUCCESS;
}
/*
 * Report how many bcast algorithms are available.
 * Four are available today; two are left to implement, plus the NEC version.
 */
int mca_coll_tuned_bcast_intra_query(void)
{
    enum { BCAST_ALGORITHMS_AVAILABLE = 4 };

    return BCAST_ALGORITHMS_AVAILABLE;
}
/*
 * Dispatch a bcast to the algorithm named by
 * mca_coll_tuned_bcast_forced_choice:
 *   0 fixed decision function, 2 chain, 3 pipeline, 4 split binary tree,
 *   5 binary tree.
 *
 * The segmented algorithms receive mca_coll_tuned_bcast_forced_segsize; the
 * chain additionally receives mca_coll_tuned_bcast_forced_chain_fanout.
 * Choices 1 (linear) and 6 (bmtree) are not wired up yet and are rejected
 * like any other unknown value.
 *
 * @param buf, count, dtype, root, comm  passed through unchanged to the
 *                                       selected algorithm
 * @return the selected algorithm's return value, or MPI_ERR_ARG when the
 *         forced choice does not name a selectable algorithm
 */
int mca_coll_tuned_bcast_intra_do_forced(void *buf, int count,
                                         struct ompi_datatype_t *dtype,
                                         int root,
                                         struct ompi_communicator_t *comm)
{
    switch (mca_coll_tuned_bcast_forced_choice) {
    case (0): return mca_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
    case (2): return mca_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm,
                                                       mca_coll_tuned_bcast_forced_segsize,
                                                       mca_coll_tuned_bcast_forced_chain_fanout);
    case (3): return mca_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm,
                                                          mca_coll_tuned_bcast_forced_segsize);
    case (4): return mca_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm,
                                                               mca_coll_tuned_bcast_forced_segsize);
    case (5): return mca_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm,
                                                         mca_coll_tuned_bcast_forced_segsize);
    default:
        /* The valid set is spelled out here because the algorithm count
         * returned by the query routine (4) is not the highest selectable id:
         * 5 is accepted above while 1 is not.  Keep this text in step with
         * the cases above. */
        OPAL_OUTPUT((mca_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0 and 2-5 are valid?",
                     mca_coll_tuned_bcast_forced_choice));
        return (MPI_ERR_ARG);
    } /* switch */
}

Просмотреть файл

@ -45,21 +45,56 @@ int mca_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
int rank;
int err;
int contig;
int dsize;
int msgsize;
MPI_Aint ext;
long lb;
int segsize = 0;
OPAL_OUTPUT((mca_coll_tuned_stream,"mca_coll_tuned_bcast_intra_dec_fixed"));
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* err = mca_coll_tuned_bcast_intra_linear (buff, count, datatype, root, comm); */
/* err = mca_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, (0)); */
/* err = mca_coll_tuned_bcast_intra_chain (buff, count, datatype, root, comm, (0), 1); */
/* err = mca_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, (8192)); */
err = mca_coll_tuned_bcast_intra_split_bintree (buff, count, datatype, root, comm, (100));
/* err = mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, (100)); */
/* else we need data size for decision function */
err = ompi_ddt_get_extent (datatype, &lb, &ext);
if (err != MPI_SUCCESS) {
OPAL_OUTPUT((mca_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
return (err);
}
msgsize = ext * count; /* needed for decision */
/* this is based on gige measurements */
if ((size < 4)) {
segsize = 0;
return mca_coll_tuned_bcast_intra_linear (buff, count, datatype, root, comm);
}
else if (size == 4) {
if (msgsize < 524288) segsize = 0;
else msgsize = 16384;
return mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
}
else if (size > 4 && size <= 8 && msgsize < 4096) {
segsize = 0;
return mca_coll_tuned_bcast_intra_linear (buff, count, datatype, root, comm);
}
else if (size > 8 && msgsize >= 32768 && msgsize < 524288) {
segsize = 16384;
return mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
}
else if (size > 4 && msgsize >= 524288) {
segsize = 16384;
return mca_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, segsize);
}
else {
segsize = 0;
/* once tested can swap this back in */
/* return mca_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
return mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
}
return err;
}

Просмотреть файл

@ -46,6 +46,22 @@ int mca_coll_tuned_init_chain_fanout = 4;
/* forced algorithm variables */
/* Storage for the per-collective forcing MCA parameters.  Each variable is
 * passed by address to mca_base_param_reg_int() in the matching
 * mca_coll_tuned_<collective>_intra_check_forced() routine.  A *_forced_choice
 * of 0 makes *_intra_do_forced() fall back to the fixed decision function. */
/* NOTE(review): the fanouts start at 0 here, but are registered with the
 * system wide init tree/chain fanout as their default -- presumably the
 * registration overwrites these initial values; confirm. */
int mca_coll_tuned_alltoall_forced_choice = 0;
int mca_coll_tuned_alltoall_forced_segsize = 0;
int mca_coll_tuned_alltoall_forced_chain_fanout = 0;
int mca_coll_tuned_alltoall_forced_tree_fanout = 0;
/* barrier only has an algorithm selector */
int mca_coll_tuned_barrier_forced_choice = 0;
int mca_coll_tuned_bcast_forced_choice = 0;
int mca_coll_tuned_bcast_forced_segsize = 0;
int mca_coll_tuned_bcast_forced_chain_fanout = 0;
int mca_coll_tuned_bcast_forced_tree_fanout = 0;
int mca_coll_tuned_reduce_forced_choice = 0;
int mca_coll_tuned_reduce_forced_segsize = 0;
int mca_coll_tuned_reduce_forced_chain_fanout = 0;
int mca_coll_tuned_reduce_forced_tree_fanout = 0;
/*
* Local function
*/
@ -145,6 +161,20 @@ static int tuned_open(void)
mca_coll_tuned_stream = opal_output_open(NULL);
}
}
/* now check that the user hasn't overrode any of the decision functions */
/* the user can do this before every comm dup/create if they like */
/* this is useful for benchmarking and user knows best tuning */
/* intra functions first */
mca_coll_tuned_alltoall_intra_check_forced();
mca_coll_tuned_barrier_intra_check_forced();
mca_coll_tuned_bcast_intra_check_forced();
mca_coll_tuned_reduce_intra_check_forced();
OPAL_OUTPUT((mca_coll_tuned_stream, "coll:tuned:component_open: done!"));
return OMPI_SUCCESS;

Просмотреть файл

@ -327,3 +327,67 @@ int mca_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
segsize, 1 );
}
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/*
 * Register the four MCA parameters through which a user can force the reduce
 * algorithm, its segment size and its tree/chain fanouts.
 *
 * The registration results are not inspected; the routine always reports
 * MPI_SUCCESS.
 */
int mca_coll_tuned_reduce_intra_check_forced(void)
{
    /* algorithm selector; its current value is passed as the default */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "reduce_algorithm",
                                  "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline",
                                  false, false,
                                  mca_coll_tuned_reduce_forced_choice,
                                  &mca_coll_tuned_reduce_forced_choice);

    /* segment size for a forced algorithm */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "reduce_algorithm_segmentsize",
                                  "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                  false, false,
                                  mca_coll_tuned_reduce_forced_segsize,
                                  &mca_coll_tuned_reduce_forced_segsize);

    /* n-tree fanout; the default comes from the system wide tree fanout */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "reduce_algorithm_tree_fanout",
                                  "Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                  false, false,
                                  mca_coll_tuned_init_tree_fanout,
                                  &mca_coll_tuned_reduce_forced_tree_fanout);

    /* chain fanout; the default comes from the system wide chain fanout */
    (void) mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                                  "reduce_algorithm_chain_fanout",
                                  "Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                                  false, false,
                                  mca_coll_tuned_init_chain_fanout,
                                  &mca_coll_tuned_reduce_forced_chain_fanout);

    return MPI_SUCCESS;
}
/*
 * Report how many reduce algorithms are available (3).
 * Must be kept in step with the algorithm list whenever one is added.
 */
int mca_coll_tuned_reduce_intra_query(void)
{
    enum { REDUCE_ALGORITHMS_AVAILABLE = 3 };

    return REDUCE_ALGORITHMS_AVAILABLE;
}
/*
 * Dispatch a reduce to the algorithm named by
 * mca_coll_tuned_reduce_forced_choice:
 *   0 fixed decision function, 2 chain, 3 pipeline.
 *
 * Both forced algorithms receive mca_coll_tuned_reduce_forced_segsize; the
 * chain additionally receives mca_coll_tuned_reduce_forced_chain_fanout.
 * Choice 1 (linear) is not wired up yet and is rejected like any other
 * unknown value.
 *
 * @param sbuf, rbuf, count, dtype, op, root, comm  passed through unchanged
 *                                                  to the selected algorithm
 * @return the selected algorithm's return value, or MPI_ERR_ARG when the
 *         forced choice does not name a selectable algorithm
 */
int mca_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
                                          struct ompi_datatype_t *dtype,
                                          struct ompi_op_t *op, int root,
                                          struct ompi_communicator_t *comm)
{
    switch (mca_coll_tuned_reduce_forced_choice) {
    case (0): return mca_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
    case (2): return mca_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
                                                        mca_coll_tuned_reduce_forced_segsize,
                                                        mca_coll_tuned_reduce_forced_chain_fanout);
    case (3): return mca_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm,
                                                           mca_coll_tuned_reduce_forced_segsize);
    default:
        /* The valid set is spelled out here because a "0-3" range would
         * wrongly include choice 1, which has no case above.  Keep this text
         * in step with the cases above. */
        OPAL_OUTPUT((mca_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0, 2 and 3 are valid?",
                     mca_coll_tuned_reduce_forced_choice));
        return (MPI_ERR_ARG);
    } /* switch */
}

Просмотреть файл

@ -46,17 +46,47 @@ int mca_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
int rank;
int err;
int contig;
int dsize;
int msgsize;
MPI_Aint ext;
long lb;
int segsize = 0;
int fanout = 0;
OPAL_OUTPUT((mca_coll_tuned_stream, "mca_coll_tuned_reduce_intra_dec_fixed"));
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* err = mca_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, (8192)); */
/* err = mca_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, (8192)); */
err = mca_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, (8192), 3);
/* need data size for decision function */
err = ompi_ddt_get_extent (datatype, &lb, &ext);
if (err != MPI_SUCCESS) {
OPAL_OUTPUT((mca_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
return (err);
}
msgsize = ext * count; /* needed for decision */
/* for small messages use linear algorithm */
if (msgsize <= 4096) {
segsize = 0;
fanout = size-1;
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
/* return mca_coll_tuned_reduce_intra_linear (sendbuf, recvbuf, count, datatype, op, root, comm); */
return mca_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout);
} else if (msgsize <= 65536 ) {
segsize = 32768;
fanout = 8;
return mca_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout);
} else if (msgsize < 524288) {
segsize = 1024;
fanout = size/2;
/* later swap this for a binary tree */
/* fanout = 2; */
return mca_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout);
} else {
segsize = 1024;
return mca_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
}
return err;
}