Big collective commit. I lightly tested it, but I think it should be quite stable. Anyway,
the default decision functions (for broadcast, reduce and barrier) are based on a high performance network (not TCP). It should give good performance (really good) for any network having the following characteristics: small latency (5 microseconds) and good bandwidth (more than 1Gb/s). + Cleanup of the reduce algorithms, plus 2 new algorithms (binary and binomial). Now most of the reduce algorithms use a generic tree based function for completing the reduce. + Added macros for computing the trees (they are used for bcast and reduce right now). + Allow the usage of all 5 topologies. + Jelena's implementation of a binary tree that can be used for non-commutative operations. Right now only the tree building function is there, it will get activated soon. + Some other minor cleanups. This commit was SVN r12326.
Этот коммит содержится в:
родитель
894b220fbb
Коммит
ba3c247f2a
@ -201,6 +201,7 @@ extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
|
|||||||
int ompi_coll_tuned_gatherv_inter_dec_dynamic(GATHER_ARGS);
|
int ompi_coll_tuned_gatherv_inter_dec_dynamic(GATHER_ARGS);
|
||||||
|
|
||||||
/* Reduce */
|
/* Reduce */
|
||||||
|
int ompi_coll_tuned_reduce_generic( REDUCE_ARGS, ompi_coll_tree_t* tree, int count_by_segment );
|
||||||
int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
|
int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
|
||||||
int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
|
int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
|
||||||
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
|
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
|
||||||
@ -209,6 +210,8 @@ extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];
|
|||||||
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
|
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
|
||||||
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout);
|
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout);
|
||||||
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize);
|
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize);
|
||||||
|
int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize);
|
||||||
|
int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize);
|
||||||
int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
|
int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
|
||||||
int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);
|
int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);
|
||||||
|
|
||||||
@ -338,4 +341,59 @@ struct mca_coll_base_comm_t {
|
|||||||
#if defined(c_plusplus) || defined(__cplusplus)
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
 * COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, ROOT )
 *
 * Refresh the binary tree cached in (OMPI_COMM)->c_coll_selected_data.
 * If no tree is cached, or the cached root differs from ROOT, any existing
 * cached tree is destroyed, a new one is built with
 * ompi_coll_tuned_topo_build_tree(2, OMPI_COMM, ROOT), and ROOT is recorded
 * in cached_bintree_root. Otherwise the cache is left untouched.
 *
 * The result of the build call is stored without being checked here.
 * OMPI_COMM and ROOT appear several times in the expansion, so arguments
 * with side effects are evaluated more than once. The local variable
 * coll_comm is declared inside the do/while block.
 */
#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, ROOT ) \
|
||||||
|
do { \
|
||||||
|
mca_coll_base_comm_t* coll_comm = (OMPI_COMM)->c_coll_selected_data; \
|
||||||
|
if( !( (coll_comm->cached_bintree) \
|
||||||
|
&& (coll_comm->cached_bintree_root == (ROOT)) ) ) { \
|
||||||
|
if( coll_comm->cached_bintree ) { /* destroy previous binary tree if defined */ \
|
||||||
|
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bintree) ); \
|
||||||
|
} \
|
||||||
|
coll_comm->cached_bintree = ompi_coll_tuned_topo_build_tree(2,(OMPI_COMM),(ROOT)); \
|
||||||
|
coll_comm->cached_bintree_root = (ROOT); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
/*
 * COLL_TUNED_UPDATE_BMTREE( OMPI_COMM, ROOT )
 *
 * Refresh the tree cached as cached_bmtree in
 * (OMPI_COMM)->c_coll_selected_data (a binomial tree, per the inline
 * comment below). If no tree is cached, or the cached root differs from
 * ROOT, any existing cached tree is destroyed, a new one is built with
 * ompi_coll_tuned_topo_build_bmtree(OMPI_COMM, ROOT), and ROOT is recorded
 * in cached_bmtree_root. Otherwise the cache is left untouched.
 *
 * The result of the build call is stored without being checked here.
 * OMPI_COMM and ROOT appear several times in the expansion, so arguments
 * with side effects are evaluated more than once. The local variable
 * coll_comm is declared inside the do/while block.
 */
#define COLL_TUNED_UPDATE_BMTREE( OMPI_COMM, ROOT ) \
|
||||||
|
do { \
|
||||||
|
mca_coll_base_comm_t* coll_comm = (OMPI_COMM)->c_coll_selected_data; \
|
||||||
|
if( !( (coll_comm->cached_bmtree) \
|
||||||
|
&& (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \
|
||||||
|
if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \
|
||||||
|
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \
|
||||||
|
} \
|
||||||
|
coll_comm->cached_bmtree = ompi_coll_tuned_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \
|
||||||
|
coll_comm->cached_bmtree_root = (ROOT); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
/*
 * COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, ROOT )
 *
 * Refresh the pipeline topology cached in
 * (OMPI_COMM)->c_coll_selected_data. If no pipeline is cached, or the
 * cached root differs from ROOT, any existing cached pipeline is destroyed,
 * a new one is built as a chain with a fanout of 1 via
 * ompi_coll_tuned_topo_build_chain(1, OMPI_COMM, ROOT), and ROOT is
 * recorded in cached_pipeline_root. Otherwise the cache is left untouched.
 *
 * The result of the build call is stored without being checked here.
 * OMPI_COMM and ROOT appear several times in the expansion, so arguments
 * with side effects are evaluated more than once. The local variable
 * coll_comm is declared inside the do/while block.
 */
#define COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, ROOT ) \
|
||||||
|
do { \
|
||||||
|
mca_coll_base_comm_t* coll_comm = (OMPI_COMM)->c_coll_selected_data; \
|
||||||
|
if( !( (coll_comm->cached_pipeline) \
|
||||||
|
&& (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \
|
||||||
|
if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \
|
||||||
|
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \
|
||||||
|
} \
|
||||||
|
coll_comm->cached_pipeline = ompi_coll_tuned_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \
|
||||||
|
coll_comm->cached_pipeline_root = (ROOT); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
/*
 * COLL_TUNED_UPDATE_CHAIN( OMPI_COMM, ROOT, FANOUT )
 *
 * Refresh the chain topology cached in (OMPI_COMM)->c_coll_selected_data.
 * The cached chain is reused only when it exists and both its recorded
 * root and its recorded fanout match ROOT and FANOUT. Otherwise any
 * existing cached chain is destroyed, a new one is built with
 * ompi_coll_tuned_topo_build_chain(FANOUT, OMPI_COMM, ROOT), and ROOT and
 * FANOUT are recorded in cached_chain_root and cached_chain_fanout.
 *
 * The result of the build call is stored without being checked here.
 * OMPI_COMM, ROOT and FANOUT appear several times in the expansion, so
 * arguments with side effects are evaluated more than once. The local
 * variable coll_comm is declared inside the do/while block.
 */
#define COLL_TUNED_UPDATE_CHAIN( OMPI_COMM, ROOT, FANOUT ) \
|
||||||
|
do { \
|
||||||
|
mca_coll_base_comm_t* coll_comm = (OMPI_COMM)->c_coll_selected_data; \
|
||||||
|
if( !( (coll_comm->cached_chain) \
|
||||||
|
&& (coll_comm->cached_chain_root == (ROOT)) \
|
||||||
|
&& (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \
|
||||||
|
if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \
|
||||||
|
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_chain) ); \
|
||||||
|
} \
|
||||||
|
coll_comm->cached_chain = ompi_coll_tuned_topo_build_chain((FANOUT),(OMPI_COMM),(ROOT)); \
|
||||||
|
coll_comm->cached_chain_root = (ROOT); \
|
||||||
|
coll_comm->cached_chain_fanout = (FANOUT); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
#endif /* MCA_COLL_TUNED_EXPORT_H */
|
#endif /* MCA_COLL_TUNED_EXPORT_H */
|
||||||
|
@ -57,24 +57,8 @@ ompi_coll_tuned_bcast_intra_chain ( void *buff, int count,
|
|||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* setup the chain topology. */
|
||||||
* setup the chain topology.
|
COLL_TUNED_UPDATE_CHAIN( comm, root, chains );
|
||||||
* if the previous chain topology is the same, then use this cached copy
|
|
||||||
* other wise recreate it.
|
|
||||||
*/
|
|
||||||
|
|
||||||
if ((comm->c_coll_selected_data->cached_chain) && (comm->c_coll_selected_data->cached_chain_root == root)
|
|
||||||
&& (comm->c_coll_selected_data->cached_chain_fanout == chains)) {
|
|
||||||
chain = comm->c_coll_selected_data->cached_chain;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (comm->c_coll_selected_data->cached_chain) { /* destroy previous chain if defined */
|
|
||||||
ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_chain);
|
|
||||||
}
|
|
||||||
comm->c_coll_selected_data->cached_chain = chain = ompi_coll_tuned_topo_build_chain( chains, comm, root );
|
|
||||||
comm->c_coll_selected_data->cached_chain_root = root;
|
|
||||||
comm->c_coll_selected_data->cached_chain_fanout = chains;
|
|
||||||
}
|
|
||||||
|
|
||||||
ompi_ddt_type_size( datatype, &typelng );
|
ompi_ddt_type_size( datatype, &typelng );
|
||||||
|
|
||||||
@ -257,23 +241,8 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
|||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* setup the binary tree topology. */
|
||||||
* setup the tree topology.
|
COLL_TUNED_UPDATE_BINTREE( comm, root );
|
||||||
* if the previous tree topology is the same, then use this cached copy
|
|
||||||
* other wise recreate it.
|
|
||||||
*/
|
|
||||||
|
|
||||||
if ((comm->c_coll_selected_data->cached_bintree) && (comm->c_coll_selected_data->cached_bintree_root == root)) {
|
|
||||||
tree = comm->c_coll_selected_data->cached_bintree;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (comm->c_coll_selected_data->cached_bintree) { /* destroy previous tree if defined */
|
|
||||||
ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_bintree);
|
|
||||||
}
|
|
||||||
comm->c_coll_selected_data->cached_bintree = tree = ompi_coll_tuned_topo_build_tree( 2, comm, root );
|
|
||||||
comm->c_coll_selected_data->cached_bintree_root = root;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
err = ompi_ddt_type_size( datatype, &type_size );
|
err = ompi_ddt_type_size( datatype, &type_size );
|
||||||
|
|
||||||
@ -517,22 +486,8 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
|||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* setup the tree topology. */
|
||||||
* setup the tree topology.
|
COLL_TUNED_UPDATE_BINTREE( comm, root );
|
||||||
* if the previous tree topology is the same, then use this cached copy
|
|
||||||
* other wise recreate it.
|
|
||||||
*/
|
|
||||||
|
|
||||||
if ((comm->c_coll_selected_data->cached_bintree) && (comm->c_coll_selected_data->cached_bintree_root == root)) {
|
|
||||||
tree = comm->c_coll_selected_data->cached_bintree;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (comm->c_coll_selected_data->cached_bintree) { /* destroy previous bintree if defined */
|
|
||||||
ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_bintree);
|
|
||||||
}
|
|
||||||
comm->c_coll_selected_data->cached_bintree = tree = ompi_coll_tuned_topo_build_tree( 2, comm, root );
|
|
||||||
comm->c_coll_selected_data->cached_bintree_root = root;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = ompi_ddt_type_size( datatype, &type_size );
|
err = ompi_ddt_type_size( datatype, &type_size );
|
||||||
|
|
||||||
|
@ -61,16 +61,16 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm)
|
struct ompi_communicator_t *comm)
|
||||||
{
|
{
|
||||||
int comsize, rank, err;
|
int communicator_size, rank, err;
|
||||||
size_t dsize, total_dsize;
|
size_t dsize, total_dsize;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed"));
|
||||||
|
|
||||||
comsize = ompi_comm_size(comm);
|
communicator_size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
/* special case */
|
/* special case */
|
||||||
if (comsize==2) {
|
if (communicator_size==2) {
|
||||||
return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,9 +81,9 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
return (err);
|
return (err);
|
||||||
}
|
}
|
||||||
|
|
||||||
total_dsize = dsize * scount * comsize; /* needed for decision */
|
total_dsize = dsize * scount * communicator_size; /* needed for decision */
|
||||||
|
|
||||||
if (comsize >= 12 && total_dsize <= 768) {
|
if (communicator_size >= 12 && total_dsize <= 768) {
|
||||||
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||||
}
|
}
|
||||||
if (total_dsize <= 131072) {
|
if (total_dsize <= 131072) {
|
||||||
@ -102,13 +102,13 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
|||||||
*/
|
*/
|
||||||
int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm)
|
int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm)
|
||||||
{
|
{
|
||||||
int comsize;
|
int communicator_size;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed"));
|
||||||
|
|
||||||
comsize = ompi_comm_size(comm);
|
communicator_size = ompi_comm_size(comm);
|
||||||
|
|
||||||
if( 2 == comsize )
|
if( 2 == communicator_size )
|
||||||
return ompi_coll_tuned_barrier_intra_two_procs(comm);
|
return ompi_coll_tuned_barrier_intra_two_procs(comm);
|
||||||
/**
|
/**
|
||||||
* Basic optimisation. If we have a power of 2 number of nodes
|
* Basic optimisation. If we have a power of 2 number of nodes
|
||||||
@ -117,8 +117,8 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm)
|
|||||||
*/
|
*/
|
||||||
{
|
{
|
||||||
bool has_one = false;
|
bool has_one = false;
|
||||||
for( ; comsize > 0; comsize >>= 1 ) {
|
for( ; communicator_size > 0; communicator_size >>= 1 ) {
|
||||||
if( comsize & 0x1 ) {
|
if( communicator_size & 0x1 ) {
|
||||||
if( has_one )
|
if( has_one )
|
||||||
return ompi_coll_tuned_barrier_intra_bruck(comm);
|
return ompi_coll_tuned_barrier_intra_bruck(comm);
|
||||||
has_one = true;
|
has_one = true;
|
||||||
@ -142,13 +142,19 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
|||||||
struct ompi_datatype_t *datatype, int root,
|
struct ompi_datatype_t *datatype, int root,
|
||||||
struct ompi_communicator_t *comm)
|
struct ompi_communicator_t *comm)
|
||||||
{
|
{
|
||||||
int comsize, rank, err;
|
const double a0 = -7.8710;
|
||||||
|
const double b0 = 41.1613;
|
||||||
|
const double a1 = 0.0150;
|
||||||
|
const double b1 = 11.2445;
|
||||||
|
const double a2 = 0.0023;
|
||||||
|
const double b2 = 3.8074;
|
||||||
|
int communicator_size, rank, err;
|
||||||
int segsize = 0;
|
int segsize = 0;
|
||||||
size_t msgsize, dsize;
|
size_t message_size, dsize;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_dec_fixed"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_dec_fixed"));
|
||||||
|
|
||||||
comsize = ompi_comm_size(comm);
|
communicator_size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
/* else we need data size for decision function */
|
/* else we need data size for decision function */
|
||||||
@ -158,26 +164,64 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
|||||||
return (err);
|
return (err);
|
||||||
}
|
}
|
||||||
|
|
||||||
msgsize = dsize * (unsigned long)count; /* needed for decision */
|
message_size = dsize * (unsigned long)count; /* needed for decision */
|
||||||
|
|
||||||
|
if ((message_size <= 1024) && (communicator_size < 12)) {
|
||||||
|
/* Linear_0K */
|
||||||
|
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||||
|
} else if (message_size < 8192) {
|
||||||
|
if ((communicator_size < 12) ||
|
||||||
|
(communicator_size < (a0 * (message_size / 1024.0) + b0))) {
|
||||||
|
/* Binary_0K */
|
||||||
|
segsize = 0;
|
||||||
|
} else {
|
||||||
|
/* Binary_1K */
|
||||||
|
segsize = 1024;
|
||||||
|
}
|
||||||
|
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||||
|
} else if (message_size <= 35000) {
|
||||||
|
if (communicator_size <= 12) {
|
||||||
|
/* Binary_8K */
|
||||||
|
segsize = 1024 << 3;
|
||||||
|
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||||
|
} else {
|
||||||
|
/* SplittedBinary_1K */
|
||||||
|
segsize = 1024;
|
||||||
|
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype, root, comm, segsize);
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if (communicator_size > (a1 * (message_size / 1024.0) + b1)) {
|
||||||
|
/* SplittedBinary_8K */
|
||||||
|
segsize = 1024 << 3;
|
||||||
|
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype, root, comm, segsize);
|
||||||
|
}
|
||||||
|
if (communicator_size > (a2 * (message_size / 1024.0) + b2)) {
|
||||||
|
/* Pipeline_8K */
|
||||||
|
segsize = 1024 << 3;
|
||||||
|
} else {
|
||||||
|
/* Pipeline_64K */
|
||||||
|
segsize = 1024 << 6;
|
||||||
|
}
|
||||||
|
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, segsize);
|
||||||
|
#if 0
|
||||||
/* this is based on gige measurements */
|
/* this is based on gige measurements */
|
||||||
|
|
||||||
if (comsize < 4) {
|
if (communicator_size < 4) {
|
||||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||||
}
|
}
|
||||||
if (comsize == 4) {
|
if (communicator_size == 4) {
|
||||||
if (msgsize < 524288) segsize = 0;
|
if (message_size < 524288) segsize = 0;
|
||||||
else segsize = 16384;
|
else segsize = 16384;
|
||||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||||
}
|
}
|
||||||
if (comsize <= 8 && msgsize < 4096) {
|
if (communicator_size <= 8 && message_size < 4096) {
|
||||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||||
}
|
}
|
||||||
if (comsize > 8 && msgsize >= 32768 && msgsize < 524288) {
|
if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
|
||||||
segsize = 16384;
|
segsize = 16384;
|
||||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||||
}
|
}
|
||||||
if (msgsize >= 524288) {
|
if (message_size >= 524288) {
|
||||||
segsize = 16384;
|
segsize = 16384;
|
||||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, segsize);
|
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, segsize);
|
||||||
}
|
}
|
||||||
@ -185,6 +229,7 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
|||||||
/* once tested can swap this back in */
|
/* once tested can swap this back in */
|
||||||
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
|
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
|
||||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||||
|
#endif /* 0 */
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -200,8 +245,16 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
struct ompi_op_t* op, int root,
|
struct ompi_op_t* op, int root,
|
||||||
struct ompi_communicator_t* comm)
|
struct ompi_communicator_t* comm)
|
||||||
{
|
{
|
||||||
int comsize, rank, err, segsize = 0, fanout = 0;
|
int communicator_size, rank, err, segsize = 0;
|
||||||
size_t msgsize, dsize;
|
size_t message_size, dsize;
|
||||||
|
const double a1 = 0.6016 / 1024.0; /* [1/B] */
|
||||||
|
const double b1 = 1.3496;
|
||||||
|
const double a2 = 0.0410 / 1024.0; /* [1/B] */
|
||||||
|
const double b2 = 9.7128;
|
||||||
|
const double a3 = 0.0422 / 1024.0; /* [1/B] */
|
||||||
|
const double b3 = 1.1614;
|
||||||
|
const double a4 = 0.0033 / 1024.0; /* [1/B] */
|
||||||
|
const double b4 = 1.6761;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"));
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"));
|
||||||
|
|
||||||
@ -212,7 +265,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||||
}
|
}
|
||||||
|
|
||||||
comsize = ompi_comm_size(comm);
|
communicator_size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
/* need data size for decision function */
|
/* need data size for decision function */
|
||||||
@ -222,24 +275,58 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
return (err);
|
return (err);
|
||||||
}
|
}
|
||||||
|
|
||||||
msgsize = dsize * count; /* needed for decision */
|
message_size = dsize * count; /* needed for decision */
|
||||||
|
|
||||||
/* for small messages use linear algorithm */
|
if (((communicator_size < 20) && (message_size < 512)) ||
|
||||||
if (msgsize <= 4096) {
|
((communicator_size < 10) && (message_size <= 1024))){
|
||||||
|
/* Linear_0K */
|
||||||
|
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||||
|
} else if ((communicator_size < 8) && (message_size < 20480)) {
|
||||||
|
/* Binomial_0K */
|
||||||
segsize = 0;
|
segsize = 0;
|
||||||
fanout = comsize - 1;
|
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||||
|
} else if (message_size < 2048) {
|
||||||
|
/* Binary_0K */
|
||||||
|
segsize = 0;
|
||||||
|
return ompi_coll_tuned_reduce_intra_binary(sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||||
|
} else if (communicator_size > (a1 * message_size + b1)) {
|
||||||
|
/* Binary_1K */
|
||||||
|
segsize = 1024;
|
||||||
|
return ompi_coll_tuned_reduce_intra_binary(sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||||
|
} else if (communicator_size > (a2 * message_size + b2)) {
|
||||||
|
/* Pipeline_1K */
|
||||||
|
segsize = 1024;
|
||||||
|
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||||
|
} else if (communicator_size > (a3 * message_size + b3)) {
|
||||||
|
/* Binary_32K */
|
||||||
|
segsize = 32*1024;
|
||||||
|
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||||
|
}
|
||||||
|
if (communicator_size > (a4 * message_size + b4)) {
|
||||||
|
/* Pipeline_32K */
|
||||||
|
segsize = 32*1024;
|
||||||
|
} else {
|
||||||
|
/* Pipeline_64K */
|
||||||
|
segsize = 64*1024;
|
||||||
|
}
|
||||||
|
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||||
|
#if 0
|
||||||
|
/* for small messages use linear algorithm */
|
||||||
|
if (message_size <= 4096) {
|
||||||
|
segsize = 0;
|
||||||
|
fanout = communicator_size - 1;
|
||||||
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
|
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
|
||||||
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
|
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
|
||||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||||
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
|
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
|
||||||
}
|
}
|
||||||
if (msgsize < 524288) {
|
if (message_size < 524288) {
|
||||||
if (msgsize <= 65536 ) {
|
if (message_size <= 65536 ) {
|
||||||
segsize = 32768;
|
segsize = 32768;
|
||||||
fanout = 8;
|
fanout = 8;
|
||||||
} else {
|
} else {
|
||||||
segsize = 1024;
|
segsize = 1024;
|
||||||
fanout = comsize/2;
|
fanout = communicator_size/2;
|
||||||
}
|
}
|
||||||
/* later swap this for a binary tree */
|
/* later swap this for a binary tree */
|
||||||
/* fanout = 2; */
|
/* fanout = 2; */
|
||||||
@ -247,4 +334,5 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
|||||||
}
|
}
|
||||||
segsize = 1024;
|
segsize = 1024;
|
||||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||||
|
#endif /* 0 */
|
||||||
}
|
}
|
||||||
|
@ -29,73 +29,44 @@
|
|||||||
#include "coll_tuned.h"
|
#include "coll_tuned.h"
|
||||||
#include "coll_tuned_topo.h"
|
#include "coll_tuned_topo.h"
|
||||||
|
|
||||||
/* Attention: this version of the reduce operations does not
|
/**
|
||||||
work for:
|
* This is a generic implementation of the reduce protocol. It uses the tree
|
||||||
- non-commutative operations
|
* provided as an argument and executes all operations using a segment of
|
||||||
- segment sizes which are not multiplies of the extent of the datatype
|
* count times a datatype.
|
||||||
meaning that at least one datatype must fit in the segment !
|
* For the last communication it will update the count in order to limit
|
||||||
*/
|
* the number of datatypes to the original count (original_count)
|
||||||
|
*/
|
||||||
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
|
||||||
ompi_datatype_t* datatype, ompi_op_t* op,
|
ompi_datatype_t* datatype, ompi_op_t* op,
|
||||||
int root, ompi_communicator_t* comm, uint32_t segsize,
|
int root, ompi_communicator_t* comm,
|
||||||
int fanout)
|
ompi_coll_tree_t* tree, int count_by_segment )
|
||||||
{
|
{
|
||||||
int ret, line, rank, size, i = 0;
|
|
||||||
int recvcount, sendcount, prevcount, inbi, previnbi;
|
|
||||||
int segcount, segindex, num_segments;
|
|
||||||
char *inbuf[2] = {(char*)NULL, (char*)NULL};
|
char *inbuf[2] = {(char*)NULL, (char*)NULL};
|
||||||
char *accumbuf = (char*)NULL;
|
char *local_op_buffer, *accumbuf, *sendtmpbuf;
|
||||||
char *sendtmpbuf = (char*)NULL;
|
ptrdiff_t extent, lower_bound;
|
||||||
ptrdiff_t ext, lb;
|
|
||||||
size_t typelng, realsegsize;
|
size_t typelng, realsegsize;
|
||||||
ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
||||||
ompi_coll_tree_t* chain;
|
int num_segments, line, ret, segindex, i, rank;
|
||||||
|
int recvcount, prevcount, inbi, previnbi;
|
||||||
size = ompi_comm_size(comm);
|
|
||||||
rank = ompi_comm_rank(comm);
|
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_chain rank %d fo %d ss %5d", rank, fanout, segsize));
|
|
||||||
|
|
||||||
/* setup the chain topology.
|
|
||||||
* if the previous chain topology is the same, then use this cached copy
|
|
||||||
* other wise recreate it.
|
|
||||||
*/
|
|
||||||
if ((comm->c_coll_selected_data->cached_chain) && (comm->c_coll_selected_data->cached_chain_root == root)
|
|
||||||
&& (comm->c_coll_selected_data->cached_chain_fanout == fanout)) {
|
|
||||||
chain = comm->c_coll_selected_data->cached_chain;
|
|
||||||
} else {
|
|
||||||
if (comm->c_coll_selected_data->cached_chain) { /* destroy previous chain if defined */
|
|
||||||
ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_chain);
|
|
||||||
}
|
|
||||||
comm->c_coll_selected_data->cached_chain = chain = ompi_coll_tuned_topo_build_chain(fanout,comm,root);
|
|
||||||
comm->c_coll_selected_data->cached_chain_root = root;
|
|
||||||
comm->c_coll_selected_data->cached_chain_fanout = fanout;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determine number of segments and number of elements
|
* Determine number of segments and number of elements
|
||||||
* sent per operation
|
* sent per operation
|
||||||
*/
|
*/
|
||||||
ompi_ddt_get_extent( datatype, &lb, &ext );
|
ompi_ddt_get_extent( datatype, &lower_bound, &extent );
|
||||||
ompi_ddt_type_size( datatype, &typelng );
|
ompi_ddt_type_size( datatype, &typelng );
|
||||||
if( segsize > typelng ) {
|
num_segments = (original_count + count_by_segment - 1) / count_by_segment;
|
||||||
segcount = (int)(segsize / typelng);
|
realsegsize = count_by_segment * extent;
|
||||||
num_segments = count/segcount;
|
|
||||||
if( (count % segcount) != 0 ) num_segments++;
|
|
||||||
} else {
|
|
||||||
segcount = count;
|
|
||||||
num_segments = 1;
|
|
||||||
}
|
|
||||||
realsegsize = segcount * ext;
|
|
||||||
|
|
||||||
sendtmpbuf = (char*) sendbuf;
|
sendtmpbuf = (char*) sendbuf;
|
||||||
if( sendbuf == MPI_IN_PLACE ) {
|
if( sendbuf == MPI_IN_PLACE ) {
|
||||||
sendtmpbuf = (char *)recvbuf;
|
sendtmpbuf = (char *)recvbuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
/* non-leaf nodes - wait for children to send me data & forward up (if needed) */
|
/* non-leaf nodes - wait for children to send me data & forward up (if needed) */
|
||||||
if( chain->tree_nextsize > 0 ) {
|
if( tree->tree_nextsize > 0 ) {
|
||||||
/* handle non existant recv buffer (i.e. its NULL.. like basic allreduce uses!) */
|
/* handle non existant recv buffer (i.e. its NULL.. like basic allreduce uses!) */
|
||||||
accumbuf = (char*)recvbuf;
|
accumbuf = (char*)recvbuf;
|
||||||
if( NULL == accumbuf ) {
|
if( NULL == accumbuf ) {
|
||||||
@ -108,7 +79,7 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
|||||||
if( inbuf[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; }
|
if( inbuf[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; }
|
||||||
/* if there is chance to overlap communication -
|
/* if there is chance to overlap communication -
|
||||||
allocate second buffer */
|
allocate second buffer */
|
||||||
if( (num_segments > 1) || (chain->tree_nextsize > 1) ) {
|
if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
|
||||||
inbuf[1] = (char*) malloc(realsegsize);
|
inbuf[1] = (char*) malloc(realsegsize);
|
||||||
if( inbuf[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl;}
|
if( inbuf[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl;}
|
||||||
} else {
|
} else {
|
||||||
@ -122,12 +93,12 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
|||||||
for( segindex = 0; segindex <= num_segments; segindex++ ) {
|
for( segindex = 0; segindex <= num_segments; segindex++ ) {
|
||||||
prevcount = recvcount;
|
prevcount = recvcount;
|
||||||
/* recvcount - number of elements in current segment */
|
/* recvcount - number of elements in current segment */
|
||||||
recvcount = segcount;
|
recvcount = count_by_segment;
|
||||||
if( segindex == (num_segments-1) )
|
if( segindex == (num_segments-1) )
|
||||||
recvcount = count - segcount*segindex;
|
recvcount = original_count - count_by_segment * segindex;
|
||||||
|
|
||||||
/* for each child */
|
/* for each child */
|
||||||
for( i = 0; i < chain->tree_nextsize; i++ ) {
|
for( i = 0; i < tree->tree_nextsize; i++ ) {
|
||||||
/**
|
/**
|
||||||
* We try to overlap communication:
|
* We try to overlap communication:
|
||||||
* either with next segment or with the next child
|
* either with next segment or with the next child
|
||||||
@ -144,11 +115,11 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
|||||||
* BUT if we are root and are USING MPI_IN_PLACE this is wrong ek!
|
* BUT if we are root and are USING MPI_IN_PLACE this is wrong ek!
|
||||||
* check for root might not be needed as it should be checked higher up
|
* check for root might not be needed as it should be checked higher up
|
||||||
*/
|
*/
|
||||||
if( !((MPI_IN_PLACE==sendbuf) && (rank==root)) ) {
|
if( !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
|
||||||
local_recvbuf = accumbuf + segindex * realsegsize;
|
local_recvbuf = accumbuf + segindex * realsegsize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount,datatype, chain->tree_next[i],
|
ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount,datatype, tree->tree_next[i],
|
||||||
MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi]));
|
MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi]));
|
||||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||||
}
|
}
|
||||||
@ -157,25 +128,24 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
|||||||
/* wait on data from last child for previous segment */
|
/* wait on data from last child for previous segment */
|
||||||
ret = ompi_request_wait_all( 1, &reqs[previnbi], MPI_STATUSES_IGNORE );
|
ret = ompi_request_wait_all( 1, &reqs[previnbi], MPI_STATUSES_IGNORE );
|
||||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||||
|
local_op_buffer = inbuf[previnbi];
|
||||||
if( i > 0 ) {
|
if( i > 0 ) {
|
||||||
/* our first operation is to combine our own [sendbuf] data with the data
|
/* our first operation is to combine our own [sendbuf] data with the data
|
||||||
* we recvd from down stream (but only if we are not root and not using
|
* we recvd from down stream (but only if we are not root and not using
|
||||||
* MPI_IN_PLACE)
|
* MPI_IN_PLACE)
|
||||||
*/
|
*/
|
||||||
void* local_op_buffer = inbuf[previnbi];
|
|
||||||
if( 1 == i ) {
|
if( 1 == i ) {
|
||||||
if( !((MPI_IN_PLACE == sendbuf) && (rank == root)) ) {
|
if( !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
|
||||||
local_op_buffer = sendtmpbuf + segindex * realsegsize;
|
local_op_buffer = sendtmpbuf + segindex * realsegsize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* apply operation */
|
/* apply operation */
|
||||||
ompi_op_reduce(op, local_op_buffer, accumbuf+segindex*realsegsize, recvcount, datatype );
|
ompi_op_reduce(op, local_op_buffer, accumbuf+segindex*realsegsize, recvcount, datatype );
|
||||||
} else if ( segindex > 0 ) {
|
} else if ( segindex > 0 ) {
|
||||||
void* local_op_buffer = inbuf[previnbi];
|
|
||||||
void* accumulator = accumbuf + (segindex-1) * realsegsize;
|
void* accumulator = accumbuf + (segindex-1) * realsegsize;
|
||||||
|
|
||||||
if( chain->tree_nextsize <= 1 ) {
|
if( tree->tree_nextsize <= 1 ) {
|
||||||
if( !((MPI_IN_PLACE == sendbuf) && (rank == root)) ) {
|
if( !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
|
||||||
local_op_buffer = sendtmpbuf+(segindex-1)*realsegsize;
|
local_op_buffer = sendtmpbuf+(segindex-1)*realsegsize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -184,10 +154,10 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
|||||||
/* all reduced on available data this step (i) complete, pass to
|
/* all reduced on available data this step (i) complete, pass to
|
||||||
* the next process unless you're the root
|
* the next process unless you're the root
|
||||||
*/
|
*/
|
||||||
if (rank != root) {
|
if (rank != tree->tree_root) {
|
||||||
/* send combined/accumulated data to parent */
|
/* send combined/accumulated data to parent */
|
||||||
ret = MCA_PML_CALL( send( accumulator, prevcount, datatype,
|
ret = MCA_PML_CALL( send( accumulator, prevcount, datatype,
|
||||||
chain->tree_prev, MCA_COLL_BASE_TAG_REDUCE,
|
tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm) );
|
MCA_PML_BASE_SEND_STANDARD, comm) );
|
||||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||||
}
|
}
|
||||||
@ -210,20 +180,20 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
|||||||
/* leaf nodes */
|
/* leaf nodes */
|
||||||
else {
|
else {
|
||||||
/* Send segmented data to parents */
|
/* Send segmented data to parents */
|
||||||
for (segindex = 0; segindex < num_segments; segindex++) {
|
segindex = 0;
|
||||||
if (segindex < num_segments-1) sendcount = segcount;
|
while( original_count > 0 ) {
|
||||||
else sendcount = count - segindex*segcount;
|
if( original_count < count_by_segment ) count_by_segment = original_count;
|
||||||
ret = MCA_PML_CALL( send((char*)sendbuf+segindex*realsegsize, sendcount,
|
ret = MCA_PML_CALL( send((char*)sendbuf + segindex * realsegsize, count_by_segment,
|
||||||
datatype, chain->tree_prev,
|
datatype, tree->tree_prev,
|
||||||
MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) );
|
MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) );
|
||||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||||
|
segindex++;
|
||||||
|
original_count -= count_by_segment;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
|
||||||
return MPI_SUCCESS;
|
error_hndl: /* error handler */
|
||||||
|
|
||||||
/* error handler */
|
|
||||||
error_hndl:
|
|
||||||
OPAL_OUTPUT (( ompi_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret ));
|
OPAL_OUTPUT (( ompi_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret ));
|
||||||
if( inbuf[0] != NULL ) free(inbuf[0]);
|
if( inbuf[0] != NULL ) free(inbuf[0]);
|
||||||
if( inbuf[1] != NULL ) free(inbuf[1]);
|
if( inbuf[1] != NULL ) free(inbuf[1]);
|
||||||
@ -231,23 +201,123 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Attention: this version of the reduce operations does not
|
||||||
|
work for:
|
||||||
|
- non-commutative operations
|
||||||
|
- segment sizes which are not multiplies of the extent of the datatype
|
||||||
|
meaning that at least one datatype must fit in the segment !
|
||||||
|
*/
|
||||||
|
|
||||||
|
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||||
|
ompi_datatype_t* datatype, ompi_op_t* op,
|
||||||
|
int root, ompi_communicator_t* comm, uint32_t segsize,
|
||||||
|
int fanout)
|
||||||
|
{
|
||||||
|
int segcount;
|
||||||
|
size_t typelng;
|
||||||
|
|
||||||
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize));
|
||||||
|
|
||||||
|
COLL_TUNED_UPDATE_CHAIN( comm, root, fanout );
|
||||||
|
/**
|
||||||
|
* Determine number of segments and number of elements
|
||||||
|
* sent per operation
|
||||||
|
*/
|
||||||
|
ompi_ddt_type_size( datatype, &typelng );
|
||||||
|
if( segsize > typelng ) {
|
||||||
|
segcount = (int)(segsize / typelng);
|
||||||
|
} else {
|
||||||
|
segcount = count;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
|
||||||
|
comm->c_coll_selected_data->cached_chain, segcount );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
|
int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
|
||||||
int count, ompi_datatype_t* datatype,
|
int count, ompi_datatype_t* datatype,
|
||||||
ompi_op_t* op, int root,
|
ompi_op_t* op, int root,
|
||||||
ompi_communicator_t* comm, uint32_t segsize )
|
ompi_communicator_t* comm, uint32_t segsize )
|
||||||
{
|
{
|
||||||
int rank;
|
int segcount;
|
||||||
|
size_t typelng;
|
||||||
|
|
||||||
rank = ompi_comm_rank(comm);
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_pipeline rank %d ss %5d",
|
||||||
|
ompi_comm_rank(comm), segsize));
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_pipeline rank %d ss %5d", rank, segsize));
|
COLL_TUNED_UPDATE_PIPELINE( comm, root );
|
||||||
|
|
||||||
return ompi_coll_tuned_reduce_intra_chain( sendbuf,recvbuf, count,
|
/**
|
||||||
datatype, op, root, comm,
|
* Determine number of segments and number of elements
|
||||||
segsize, 1 );
|
* sent per operation
|
||||||
|
*/
|
||||||
|
ompi_ddt_type_size( datatype, &typelng );
|
||||||
|
if( segsize > typelng ) {
|
||||||
|
segcount = (int)(segsize / typelng);
|
||||||
|
} else {
|
||||||
|
segcount = count;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
|
||||||
|
comm->c_coll_selected_data->cached_pipeline, segcount );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf,
|
||||||
|
int count, ompi_datatype_t* datatype,
|
||||||
|
ompi_op_t* op, int root,
|
||||||
|
ompi_communicator_t* comm, uint32_t segsize )
|
||||||
|
{
|
||||||
|
int segcount;
|
||||||
|
size_t typelng;
|
||||||
|
|
||||||
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binary rank %d ss %5d",
|
||||||
|
ompi_comm_rank(comm), segsize));
|
||||||
|
|
||||||
|
COLL_TUNED_UPDATE_BINTREE( comm, root );
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determine number of segments and number of elements
|
||||||
|
* sent per operation
|
||||||
|
*/
|
||||||
|
ompi_ddt_type_size( datatype, &typelng );
|
||||||
|
if( segsize > typelng ) {
|
||||||
|
segcount = (int)(segsize / typelng);
|
||||||
|
} else {
|
||||||
|
segcount = count;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
|
||||||
|
comm->c_coll_selected_data->cached_bintree, segcount );
|
||||||
|
}
|
||||||
|
|
||||||
|
int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
|
||||||
|
int count, ompi_datatype_t* datatype,
|
||||||
|
ompi_op_t* op, int root,
|
||||||
|
ompi_communicator_t* comm, uint32_t segsize )
|
||||||
|
{
|
||||||
|
int segcount;
|
||||||
|
size_t typelng;
|
||||||
|
|
||||||
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binomial rank %d ss %5d",
|
||||||
|
ompi_comm_rank(comm), segsize));
|
||||||
|
|
||||||
|
COLL_TUNED_UPDATE_BMTREE( comm, root );
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determine number of segments and number of elements
|
||||||
|
* sent per operation
|
||||||
|
*/
|
||||||
|
ompi_ddt_type_size( datatype, &typelng );
|
||||||
|
if( segsize > typelng ) {
|
||||||
|
segcount = (int)(segsize / typelng);
|
||||||
|
} else {
|
||||||
|
segcount = count;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
|
||||||
|
comm->c_coll_selected_data->cached_bmtree, segcount );
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Linear functions are copied from the BASIC coll module
|
* Linear functions are copied from the BASIC coll module
|
||||||
@ -358,7 +428,6 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Perform the reduction */
|
/* Perform the reduction */
|
||||||
|
|
||||||
ompi_op_reduce(op, inbuf, rbuf, count, dtype);
|
ompi_op_reduce(op, inbuf, rbuf, count, dtype);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -371,22 +440,22 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* All done */
|
/* All done */
|
||||||
|
|
||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* copied function (with appropriate renaming) ends here */
|
/* copied function (with appropriate renaming) ends here */
|
||||||
|
|
||||||
|
|
||||||
/* The following are used by dynamic and forced rules */
|
/**
|
||||||
|
* The following are used by dynamic and forced rules
|
||||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
*
|
||||||
/* as you add methods/algorithms you must update this and the query/map routines */
|
* publish details of each algorithm and whether it is forced/fixed/locked in
|
||||||
|
* as you add methods/algorithms you must update this and the query/map routines
|
||||||
/* this routine is called by the component only */
|
*
|
||||||
/* this makes sure that the mca parameters are set to their initial values and perms */
|
* this routine is called by the component only
|
||||||
/* module does not call this they call the forced_getvalues routine instead */
|
* this makes sure that the mca parameters are set to their initial values and perms
|
||||||
|
* module does not call this they call the forced_getvalues routine instead
|
||||||
|
*/
|
||||||
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
@ -169,6 +169,128 @@ ompi_coll_tuned_topo_build_tree( int fanout,
|
|||||||
return tree;
|
return tree;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Constructs in-order binary tree which can be used for non-commutative reduce
|
||||||
|
* operations.
|
||||||
|
* Root of this tree is always rank (size-1) and fanout is 2.
|
||||||
|
* Here are some of the examples of this tree:
|
||||||
|
* size == 2 size = 4 size = 9
|
||||||
|
* 1 3 8
|
||||||
|
* / / \ / \
|
||||||
|
* 0 2 1 7 3
|
||||||
|
* / / \ / \
|
||||||
|
* 0 6 5 2 1
|
||||||
|
* / /
|
||||||
|
* 4 0
|
||||||
|
*/
|
||||||
|
ompi_coll_tree_t*
|
||||||
|
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||||
|
{
|
||||||
|
int rank, size;
|
||||||
|
int myrank, rightsize, delta;
|
||||||
|
int parent, lchild, rchild;
|
||||||
|
ompi_coll_tree_t* tree;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get size and rank of the process in this communicator
|
||||||
|
*/
|
||||||
|
size = ompi_comm_size(comm);
|
||||||
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
|
tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
|
||||||
|
if (!tree) {
|
||||||
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
|
"coll:tuned:topo_build_tree PANIC::out of memory"));
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
tree->tree_root = MPI_UNDEFINED;
|
||||||
|
tree->tree_nextsize = MPI_UNDEFINED;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize tree
|
||||||
|
*/
|
||||||
|
tree->tree_fanout = 2;
|
||||||
|
tree->tree_bmtree = 0;
|
||||||
|
tree->tree_root = size - 1;
|
||||||
|
tree->tree_prev = -1;
|
||||||
|
tree->tree_nextsize = 0;
|
||||||
|
tree->tree_next[0] = -1;
|
||||||
|
tree->tree_next[1] = -1;
|
||||||
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
|
"coll:tuned:topo_build_in_order_tree Building fo %d rt %d",
|
||||||
|
tree->tree_fanout, tree->tree_root));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Build the tree
|
||||||
|
*/
|
||||||
|
myrank = rank;
|
||||||
|
parent = size - 1;
|
||||||
|
delta = 0;
|
||||||
|
|
||||||
|
while ( 1 ) {
|
||||||
|
/* Compute the size of the right subtree */
|
||||||
|
rightsize = size >> 1;
|
||||||
|
|
||||||
|
/* Determine the left and right child of this parent */
|
||||||
|
lchild = -1;
|
||||||
|
rchild = -1;
|
||||||
|
if (size - 1 > 0) {
|
||||||
|
lchild = parent - 1;
|
||||||
|
if (lchild > 0) {
|
||||||
|
rchild = rightsize - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The following cases are possible: myrank can be
|
||||||
|
- a parent,
|
||||||
|
- belong to the left subtree, or
|
||||||
|
- belong to the right subtee
|
||||||
|
Each of the cases need to be handled differently.
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (myrank == parent) {
|
||||||
|
/* I am the parent:
|
||||||
|
- compute real ranks of my children, and exit the loop. */
|
||||||
|
if (lchild >= 0) tree->tree_next[0] = lchild + delta;
|
||||||
|
if (rchild >= 0) tree->tree_next[1] = rchild + delta;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (myrank > rchild) {
|
||||||
|
/* I belong to the left subtree:
|
||||||
|
- If I am the left child, compute real rank of my parent
|
||||||
|
- Iterate down through tree:
|
||||||
|
compute new size, shift ranks down, and update delta.
|
||||||
|
*/
|
||||||
|
if (myrank == lchild) {
|
||||||
|
tree->tree_prev = parent + delta;
|
||||||
|
}
|
||||||
|
size = size - rightsize - 1;
|
||||||
|
delta = delta + rightsize;
|
||||||
|
myrank = myrank - rightsize;
|
||||||
|
parent = size - 1;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
/* I belong to the right subtree:
|
||||||
|
- If I am the right child, compute real rank of my parent
|
||||||
|
- Iterate down through tree:
|
||||||
|
compute new size and parent,
|
||||||
|
but the delta and rank do not need to change.
|
||||||
|
*/
|
||||||
|
if (myrank == rchild) {
|
||||||
|
tree->tree_prev = parent + delta;
|
||||||
|
}
|
||||||
|
size = rightsize;
|
||||||
|
parent = rchild;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
|
||||||
|
if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
|
||||||
|
|
||||||
|
return tree;
|
||||||
|
}
|
||||||
|
|
||||||
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
|
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
|
||||||
{
|
{
|
||||||
ompi_coll_tree_t *ptr;
|
ompi_coll_tree_t *ptr;
|
||||||
|
@ -40,7 +40,8 @@ extern "C" {
|
|||||||
ompi_coll_tuned_topo_build_tree( int fanout,
|
ompi_coll_tuned_topo_build_tree( int fanout,
|
||||||
struct ompi_communicator_t* com,
|
struct ompi_communicator_t* com,
|
||||||
int root );
|
int root );
|
||||||
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree );
|
ompi_coll_tree_t*
|
||||||
|
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm );
|
||||||
|
|
||||||
ompi_coll_tree_t*
|
ompi_coll_tree_t*
|
||||||
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||||
@ -51,6 +52,8 @@ extern "C" {
|
|||||||
struct ompi_communicator_t* com,
|
struct ompi_communicator_t* com,
|
||||||
int root );
|
int root );
|
||||||
|
|
||||||
|
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree );
|
||||||
|
|
||||||
/* debugging stuff, will be removed later */
|
/* debugging stuff, will be removed later */
|
||||||
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
|
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user