/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2012 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      University of Houston.  All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC.  All Rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"

#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"

/* allreduce algorithm variables */
static int coll_tuned_allreduce_algorithm_count = 5;
static int coll_tuned_allreduce_forced_algorithm = 0;
static int coll_tuned_allreduce_segment_size = 0;
static int coll_tuned_allreduce_tree_fanout;
static int coll_tuned_allreduce_chain_fanout;

/* valid values for coll_tuned_allreduce_forced_algorithm */
static mca_base_var_enum_value_t allreduce_algorithms[] = {
    {0, "ignore"},
    {1, "basic_linear"},
    {2, "nonoverlapping"},
    {3, "recursive_doubling"},
    {4, "ring"},
    {5, "segmented_ring"},
    {0, NULL}
};

/*
 * ompi_coll_tuned_allreduce_intra_nonoverlapping
 *
 * This function simply calls a reduce followed by a broadcast.
 * Both called functions are tuned, but they complete sequentially,
 * i.e. with no additional overlapping: if the number of segments used
 * is greater than the topo depth, then once the first segment of data
 * is fully 'reduced' it is not broadcast while the reduce continues
 * (cost = cost-reduce + cost-bcast + decision x 3).
 *
 */
int
ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count,
                                               struct ompi_datatype_t *dtype,
                                               struct ompi_op_t *op,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module)
{
    int err, rank;

    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_intra_nonoverlapping rank %d", rank));

    /* Reduce to 0 and broadcast. */

    if (MPI_IN_PLACE == sbuf) {
        if (0 == rank) {
            err = comm->c_coll.coll_reduce(MPI_IN_PLACE, rbuf, count, dtype,
                                           op, 0, comm, comm->c_coll.coll_reduce_module);
        } else {
            err = comm->c_coll.coll_reduce(rbuf, NULL, count, dtype, op, 0,
                                           comm, comm->c_coll.coll_reduce_module);
        }
    } else {
        err = comm->c_coll.coll_reduce(sbuf, rbuf, count, dtype, op, 0,
                                       comm, comm->c_coll.coll_reduce_module);
    }
    if (MPI_SUCCESS != err) {
        return err;
    }

    return comm->c_coll.coll_bcast(rbuf, count, dtype, 0, comm,
                                   comm->c_coll.coll_bcast_module);
}
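
/*
 * For reference, the reduce-then-broadcast composition above can be
 * expressed directly against the public MPI API.  This is a minimal,
 * compiled-out sketch that ignores the MPI_IN_PLACE special case
 * handled above; the helper name is illustrative only.
 */
#if 0
static int allreduce_as_reduce_plus_bcast(void *sbuf, void *rbuf, int count,
                                          MPI_Datatype dtype, MPI_Op op,
                                          MPI_Comm comm)
{
    /* Reduce everything onto rank 0 ... */
    int err = MPI_Reduce(sbuf, rbuf, count, dtype, op, 0, comm);
    if (MPI_SUCCESS != err) {
        return err;
    }
    /* ... then hand the result to everyone.  There is no pipelining
       between the two collectives, hence "nonoverlapping". */
    return MPI_Bcast(rbuf, count, dtype, 0, comm);
}
#endif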

/*
 *   ompi_coll_tuned_allreduce_intra_recursivedoubling
 *
 *   Function:       Recursive doubling algorithm for allreduce operation
 *   Accepts:        Same as MPI_Allreduce()
 *   Returns:        MPI_SUCCESS or error code
 *
 *   Description:    Implements recursive doubling algorithm for allreduce.
 *                   Original (non-segmented) implementation is used in MPICH-2
 *                   for small and intermediate size messages.
 *                   The algorithm preserves order of operations so it can
 *                   be used both by commutative and non-commutative operations.
 *
 *         Example on 7 nodes:
 *         Initial state
 *         #      0       1      2       3      4       5      6
 *               [0]     [1]    [2]     [3]    [4]     [5]    [6]
 *         Initial adjustment step for non-power of two nodes.
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1]          [2+3]          [4+5]   [6]
 *         Step 1
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1+]         [0+1+]         [4+5+]  [4+5+]
 *                     [2+3+]         [2+3+]         [6   ]  [6   ]
 *         Step 2
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1+]         [0+1+]         [0+1+]  [0+1+]
 *                     [2+3+]         [2+3+]         [2+3+]  [2+3+]
 *                     [4+5+]         [4+5+]         [4+5+]  [4+5+]
 *                     [6   ]         [6   ]         [6   ]  [6   ]
 *         Final adjustment step for non-power of two nodes
 *         #      0       1      2       3      4       5      6
 *              [0+1+] [0+1+] [0+1+]  [0+1+] [0+1+]  [0+1+] [0+1+]
 *              [2+3+] [2+3+] [2+3+]  [2+3+] [2+3+]  [2+3+] [2+3+]
 *              [4+5+] [4+5+] [4+5+]  [4+5+] [4+5+]  [4+5+] [4+5+]
 *              [6   ] [6   ] [6   ]  [6   ] [6   ]  [6   ] [6   ]
 *
 */
int
ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
                                                  int count,
                                                  struct ompi_datatype_t *dtype,
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
                                                  mca_coll_base_module_t *module)
{
    int ret, line, rank, size, adjsize, remote, distance;
    int newrank, newremote, extra_ranks;
    char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL;
    ptrdiff_t true_lb, true_extent, lb, extent;
    ompi_request_t *reqs[2] = {NULL, NULL};

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:allreduce_intra_recursivedoubling rank %d", rank));

    /* Special case for size == 1 */
    if (1 == size) {
        if (MPI_IN_PLACE != sbuf) {
            ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*) rbuf, (char*) sbuf);
            if (ret < 0) { line = __LINE__; goto error_hndl; }
        }
        return MPI_SUCCESS;
    }

    /* Allocate and initialize temporary send buffer */
    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

    inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent);
    if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; }

    if (MPI_IN_PLACE == sbuf) {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*) rbuf);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    } else {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*) sbuf);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    }

    tmpsend = (char*) inplacebuf;
    tmprecv = (char*) rbuf;

    /* Determine nearest power of two less than or equal to size */
    adjsize = opal_next_poweroftwo(size);
    adjsize >>= 1;

    /* Handle non-power-of-two case:
       - Even ranks less than 2 * extra_ranks send their data to (rank + 1)
         and set their new rank to -1.
       - Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
         apply the appropriate operation, and set their new rank to rank/2.
       - Everyone else sets rank to rank - extra_ranks.
    */
    extra_ranks = size - adjsize;

    if (rank < (2 * extra_ranks)) {
        if (0 == (rank % 2)) {
            ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
                                    MCA_COLL_BASE_TAG_ALLREDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
            newrank = -1;
        } else {
            ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1),
                                    MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
            /* tmpsend = tmprecv (op) tmpsend */
            ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
            newrank = rank >> 1;
        }
    } else {
        newrank = rank - extra_ranks;
    }

    /* Communication/Computation loop
       - Exchange message with remote node.
       - Perform appropriate operation taking into account order of operations:
         result = value (op) result
    */
    for (distance = 0x1; distance < adjsize; distance <<= 1) {
        if (newrank < 0) break;
        /* Determine remote node */
        newremote = newrank ^ distance;
        remote = (newremote < extra_ranks) ?
            (newremote * 2 + 1) : (newremote + extra_ranks);

        /* Exchange the data */
        ret = MCA_PML_CALL(irecv(tmprecv, count, dtype, remote,
                                 MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[0]));
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
        ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote,
                                 MCA_COLL_BASE_TAG_ALLREDUCE,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1]));
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
        ret = ompi_request_wait_all(2, reqs, MPI_STATUSES_IGNORE);
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

        /* Apply operation */
        if (rank < remote) {
            /* tmprecv = tmpsend (op) tmprecv */
            ompi_op_reduce(op, tmpsend, tmprecv, count, dtype);
            tmpswap = tmprecv;
            tmprecv = tmpsend;
            tmpsend = tmpswap;
        } else {
            /* tmpsend = tmprecv (op) tmpsend */
            ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
        }
    }

    /* Handle non-power-of-two case:
       - Odd ranks less than 2 * extra_ranks send result from tmpsend to (rank - 1).
       - Even ranks less than 2 * extra_ranks receive result from (rank + 1).
    */
    if (rank < (2 * extra_ranks)) {
        if (0 == (rank % 2)) {
            ret = MCA_PML_CALL(recv(rbuf, count, dtype, (rank + 1),
                                    MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
            tmpsend = (char*) rbuf;
        } else {
            ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank - 1),
                                    MCA_COLL_BASE_TAG_ALLREDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
        }
    }

    /* Ensure that the final result is in rbuf */
    if (tmpsend != rbuf) {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*) rbuf, tmpsend);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    }

    if (NULL != inplacebuf) free(inplacebuf);
    return MPI_SUCCESS;

 error_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
                 __FILE__, line, rank, ret));
    if (NULL != inplacebuf) free(inplacebuf);
    return ret;
}
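
/*
 * The rank folding used by the adjustment steps above, isolated for
 * clarity.  This is an illustrative, compiled-out sketch (the helper
 * name is made up): with size = 7 we get adjsize = 4 and
 * extra_ranks = 3, so ranks 0..5 pair up, even ranks drop out of the
 * exchange loop (newrank = -1), odd ranks become new ranks 0..2, and
 * rank 6 becomes 3.
 */
#if 0
static int fold_to_power_of_two_rank(int rank, int size)
{
    /* largest power of two <= size, computed as in the function above */
    int adjsize = opal_next_poweroftwo(size) >> 1;
    int extra_ranks = size - adjsize;
    if (rank < 2 * extra_ranks) {
        /* even ranks sit out; odd ranks take the folded slot */
        return (0 == (rank % 2)) ? -1 : (rank >> 1);
    }
    return rank - extra_ranks;
}
#endif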

/*
 *   ompi_coll_tuned_allreduce_intra_ring
 *
 *   Function:       Ring algorithm for allreduce operation
 *   Accepts:        Same as MPI_Allreduce()
 *   Returns:        MPI_SUCCESS or error code
 *
 *   Description:    Implements ring algorithm for allreduce: the message is
 *                   automatically segmented into segments of size M/N.
 *                   Algorithm requires 2*N - 1 steps.
 *
 *   Limitations:    The algorithm DOES NOT preserve order of operations, so it
 *                   can be used only for commutative operations.
 *                   In addition, the algorithm cannot work if the total count is
 *                   less than size.
 *         Example on 5 nodes:
 *         Initial state
 *   #      0              1             2              3             4
 *        [00]           [10]          [20]           [30]           [40]
 *        [01]           [11]          [21]           [31]           [41]
 *        [02]           [12]          [22]           [32]           [42]
 *        [03]           [13]          [23]           [33]           [43]
 *        [04]           [14]          [24]           [34]           [44]
 *
 *        COMPUTATION PHASE
 *         Step 0: rank r sends block r to rank (r+1) and receives block (r-1)
 *                 from rank (r-1) [with wraparound].
 *   #      0              1             2              3             4
 *        [00]          [00+10]        [20]           [30]           [40]
 *        [01]           [11]         [11+21]         [31]           [41]
 *        [02]           [12]          [22]          [22+32]         [42]
 *        [03]           [13]          [23]           [33]          [33+43]
 *      [44+04]          [14]          [24]           [34]           [44]
 *
 *         Step 1: rank r sends block (r-1) to rank (r+1) and receives block
 *                 (r-2) from rank (r-1) [with wraparound].
 *   #      0              1             2              3             4
 *        [00]          [00+10]     [01+10+20]        [30]           [40]
 *        [01]           [11]         [11+21]      [11+21+31]        [41]
 *        [02]           [12]          [22]          [22+32]      [22+32+42]
 *     [33+43+03]        [13]          [23]           [33]          [33+43]
 *      [44+04]       [44+04+14]       [24]           [34]           [44]
 *
 *         Step 2: rank r sends block (r-2) to rank (r+1) and receives block
 *                 (r-3) from rank (r-1) [with wraparound].
 *   #      0              1             2              3             4
 *        [00]          [00+10]     [01+10+20]    [01+10+20+30]      [40]
 *        [01]           [11]         [11+21]      [11+21+31]    [11+21+31+41]
 *    [22+32+42+02]      [12]          [22]          [22+32]      [22+32+42]
 *     [33+43+03]    [33+43+03+13]     [23]           [33]          [33+43]
 *      [44+04]       [44+04+14]   [44+04+14+24]      [34]           [44]
 *
 *         Step 3: rank r sends block (r-3) to rank (r+1) and receives block
 *                 (r-4) from rank (r-1) [with wraparound].
 *   #      0              1             2              3             4
 *        [00]          [00+10]     [01+10+20]    [01+10+20+30]     [FULL]
 *       [FULL]          [11]         [11+21]      [11+21+31]    [11+21+31+41]
 *    [22+32+42+02]     [FULL]         [22]          [22+32]      [22+32+42]
 *     [33+43+03]    [33+43+03+13]    [FULL]          [33]          [33+43]
 *      [44+04]       [44+04+14]   [44+04+14+24]     [FULL]          [44]
 *
 *        DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1.
 *
 */
int
ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
                                     struct ompi_datatype_t *dtype,
                                     struct ompi_op_t *op,
                                     struct ompi_communicator_t *comm,
                                     mca_coll_base_module_t *module)
{
    int ret, line, rank, size, k, recv_from, send_to, block_count, inbi;
    int early_segcount, late_segcount, split_rank, max_segcount;
    size_t typelng;
    char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
    ptrdiff_t true_lb, true_extent, lb, extent;
    ptrdiff_t block_offset, max_real_segsize;
    ompi_request_t *reqs[2] = {NULL, NULL};

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:allreduce_intra_ring rank %d, count %d", rank, count));

    /* Special case for size == 1 */
    if (1 == size) {
        if (MPI_IN_PLACE != sbuf) {
            ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*) rbuf, (char*) sbuf);
            if (ret < 0) { line = __LINE__; goto error_hndl; }
        }
        return MPI_SUCCESS;
    }

    /* Special case for count less than size - use recursive doubling */
    if (count < size) {
        OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count));
        return (ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf,
                                                                  count,
                                                                  dtype, op,
                                                                  comm, module));
    }

    /* Allocate and initialize temporary buffers */
    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    ret = ompi_datatype_type_size(dtype, &typelng);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

    /* Determine the number of elements per block and corresponding
       block sizes.
       The blocks are divided into "early" and "late" ones:
       blocks 0 .. (split_rank - 1) are "early" and
       blocks (split_rank) .. (size - 1) are "late".
       Early blocks are at most 1 element larger than the late ones.
    */
    COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
                                   early_segcount, late_segcount )
    max_segcount = early_segcount;
    max_real_segsize = true_extent + (max_segcount - 1) * extent;

    inbuf[0] = (char*)malloc(max_real_segsize);
    if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
    if (size > 2) {
        inbuf[1] = (char*)malloc(max_real_segsize);
        if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
    }

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE != sbuf) {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    }

    /* Computation loop */

    /*
       For each of the remote nodes:
       - post irecv for block (r-1)
       - send block (r)
       - in loop for every step k = 2 .. n
         - post irecv for block (r + n - k) % n
         - wait on block (r + n - k + 1) % n to arrive
         - compute on block (r + n - k + 1) % n
         - send block (r + n - k + 1) % n
       - wait on block (r + 1)
       - compute on block (r + 1)
       - send block (r + 1) to rank (r + 1)
       Note that we must be careful when computing the beginning of buffers, and
       for send operations and computation we must compute the exact block size.
    */
    send_to = (rank + 1) % size;
    recv_from = (rank + size - 1) % size;

    inbi = 0;
    /* Initialize first receive from the neighbor on the left */
    ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                             MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    /* Send first block (my block) to the neighbor on the right */
    block_offset = ((rank < split_rank) ?
                    ((ptrdiff_t)rank * (ptrdiff_t)early_segcount) :
                    ((ptrdiff_t)rank * (ptrdiff_t)late_segcount + split_rank));
    block_count = ((rank < split_rank) ? early_segcount : late_segcount);
    tmpsend = ((char*)rbuf) + block_offset * extent;
    ret = MCA_PML_CALL(send(tmpsend, block_count, dtype, send_to,
                            MCA_COLL_BASE_TAG_ALLREDUCE,
                            MCA_PML_BASE_SEND_STANDARD, comm));
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

    for (k = 2; k < size; k++) {
        const int prevblock = (rank + size - k + 1) % size;

        inbi = inbi ^ 0x1;

        /* Post irecv for the current block */
        ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                                 MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

        /* Wait on previous block to arrive */
        ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

        /* Apply operation on previous block: result goes to rbuf
           rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
        */
        block_offset = ((prevblock < split_rank) ?
                        ((ptrdiff_t)prevblock * early_segcount) :
                        ((ptrdiff_t)prevblock * late_segcount + split_rank));
        block_count = ((prevblock < split_rank) ? early_segcount : late_segcount);
        tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
        ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype);

        /* send previous block to send_to */
        ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to,
                                MCA_COLL_BASE_TAG_ALLREDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    }

    /* Wait on the last block to arrive */
    ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

    /* Apply operation on the last block (from neighbor (rank + 1)):
       rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
    recv_from = (rank + 1) % size;
    block_offset = ((recv_from < split_rank) ?
                    ((ptrdiff_t)recv_from * early_segcount) :
                    ((ptrdiff_t)recv_from * late_segcount + split_rank));
    block_count = ((recv_from < split_rank) ? early_segcount : late_segcount);
    tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
    ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype);

    /* Distribution loop - variation of ring allgather */
    send_to = (rank + 1) % size;
    recv_from = (rank + size - 1) % size;
    for (k = 0; k < size - 1; k++) {
        const int recv_data_from = (rank + size - k) % size;
        const int send_data_from = (rank + 1 + size - k) % size;
        const int send_block_offset =
            ((send_data_from < split_rank) ?
             ((ptrdiff_t)send_data_from * early_segcount) :
             ((ptrdiff_t)send_data_from * late_segcount + split_rank));
        const int recv_block_offset =
            ((recv_data_from < split_rank) ?
             ((ptrdiff_t)recv_data_from * early_segcount) :
             ((ptrdiff_t)recv_data_from * late_segcount + split_rank));
        block_count = ((send_data_from < split_rank) ?
                       early_segcount : late_segcount);

        tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent;
        tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent;

        ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to,
                                       MCA_COLL_BASE_TAG_ALLREDUCE,
                                       tmprecv, max_segcount, dtype, recv_from,
                                       MCA_COLL_BASE_TAG_ALLREDUCE,
                                       comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    }

    if (NULL != inbuf[0]) free(inbuf[0]);
    if (NULL != inbuf[1]) free(inbuf[1]);
    return MPI_SUCCESS;

 error_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
                 __FILE__, line, rank, ret));
    if (NULL != inbuf[0]) free(inbuf[0]);
    if (NULL != inbuf[1]) free(inbuf[1]);
    return ret;
}
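
/*
 * Illustrative reconstruction of the early/late block split that
 * COLL_TUNED_COMPUTE_BLOCKCOUNT performs (see coll_tuned_util.h for the
 * authoritative macro; this compiled-out helper only sketches the same
 * arithmetic).  The first (count % num_blocks) blocks carry one extra
 * element, e.g. count = 13 over 5 blocks gives split_index = 3,
 * early = 3, late = 2: blocks of 3, 3, 3, 2, 2 elements.
 */
#if 0
static void compute_blockcounts(int count, int num_blocks,
                                int *split_index,
                                int *early_count, int *late_count)
{
    *late_count  = count / num_blocks;
    *split_index = count % num_blocks;
    *early_count = *late_count + ((0 != *split_index) ? 1 : 0);
}
#endif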

/*
 *   ompi_coll_tuned_allreduce_intra_ring_segmented
 *
 *   Function:       Pipelined ring algorithm for allreduce operation
 *   Accepts:        Same as MPI_Allreduce(), segment size
 *   Returns:        MPI_SUCCESS or error code
 *
 *   Description:    Implements pipelined ring algorithm for allreduce:
 *                   user supplies suggested segment size for the pipelining of
 *                   reduce operation.
 *                   The segment size determines the number of phases, np, for
 *                   the algorithm execution.
 *                   The message is automatically divided into blocks of
 *                   approximately (count / (np * segcount)) elements.
 *                   At the end of the reduction phase, an allgather-like step
 *                   is executed.
 *                   Algorithm requires (np + 1) * (N - 1) steps.
 *
 *   Limitations:    The algorithm DOES NOT preserve order of operations, so it
 *                   can be used only for commutative operations.
 *                   In addition, the algorithm cannot work if the total size is
 *                   less than size * segment size.
 *         Example on 3 nodes with 2 phases
 *         Initial state
 *   #      0              1             2
 *        [00a]          [10a]         [20a]
 *        [00b]          [10b]         [20b]
 *        [01a]          [11a]         [21a]
 *        [01b]          [11b]         [21b]
 *        [02a]          [12a]         [22a]
 *        [02b]          [12b]         [22b]
 *
 *        COMPUTATION PHASE 0 (a)
 *         Step 0: rank r sends block ra to rank (r+1) and receives block (r-1)a
 *                 from rank (r-1) [with wraparound].
 *   #      0              1             2
 *        [00a]        [00a+10a]       [20a]
 *        [00b]          [10b]         [20b]
 *        [01a]          [11a]       [11a+21a]
 *        [01b]          [11b]         [21b]
 *      [22a+02a]        [12a]         [22a]
 *        [02b]          [12b]         [22b]
 *
 *         Step 1: rank r sends block (r-1)a to rank (r+1) and receives block
 *                 (r-2)a from rank (r-1) [with wraparound].
 *   #      0              1             2
 *        [00a]        [00a+10a]   [00a+10a+20a]
 *        [00b]          [10b]         [20b]
 *    [11a+21a+01a]      [11a]       [11a+21a]
 *        [01b]          [11b]         [21b]
 *      [22a+02a]    [22a+02a+12a]     [22a]
 *        [02b]          [12b]         [22b]
 *
 *        COMPUTATION PHASE 1 (b)
 *         Step 0: rank r sends block rb to rank (r+1) and receives block (r-1)b
 *                 from rank (r-1) [with wraparound].
 *   #      0              1             2
 *        [00a]        [00a+10a]       [20a]
 *        [00b]        [00b+10b]       [20b]
 *        [01a]          [11a]       [11a+21a]
 *        [01b]          [11b]       [11b+21b]
 *      [22a+02a]        [12a]         [22a]
 *      [22b+02b]        [12b]         [22b]
 *
 *         Step 1: rank r sends block (r-1)b to rank (r+1) and receives block
 *                 (r-2)b from rank (r-1) [with wraparound].
 *   #      0              1             2
 *        [00a]        [00a+10a]   [00a+10a+20a]
 *        [00b]          [10b]     [00b+10b+20b]
 *    [11a+21a+01a]      [11a]       [11a+21a]
 *    [11b+21b+01b]      [11b]         [21b]
 *      [22a+02a]    [22a+02a+12a]     [22a]
 *        [02b]      [22b+02b+12b]     [22b]
 *
 *
 *        DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1 (same as
 *        in regular ring algorithm).
 *
 */
int
ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count,
                                               struct ompi_datatype_t *dtype,
                                               struct ompi_op_t *op,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module,
                                               uint32_t segsize)
{
    int ret, line, rank, size, k, recv_from, send_to;
    int early_blockcount, late_blockcount, split_rank;
    int segcount, max_segcount, num_phases, phase, block_count, inbi;
    size_t typelng;
    char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
    ptrdiff_t true_lb, true_extent, lb, extent;
    ptrdiff_t block_offset, max_real_segsize;
    ompi_request_t *reqs[2] = {NULL, NULL};

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count));

    /* Special case for size == 1 */
    if (1 == size) {
        if (MPI_IN_PLACE != sbuf) {
            ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*) rbuf, (char*) sbuf);
            if (ret < 0) { line = __LINE__; goto error_hndl; }
        }
        return MPI_SUCCESS;
    }

    /* Determine segment count based on the suggested segment size */
    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    ret = ompi_datatype_type_size(dtype, &typelng);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    segcount = count;
    COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)

    /* Special case for count less than size * segcount - use regular ring */
    if (count < (size * segcount)) {
        OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count));
        return (ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
                                                     comm, module));
    }

    /* Determine the number of phases of the algorithm */
    num_phases = count / (size * segcount);
    if ((count % (size * segcount) >= size) &&
        (count % (size * segcount) > ((size * segcount) / 2))) {
        num_phases++;
    }
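
    /* Worked example with illustrative numbers: count = 1100, size = 4,
       segcount = 100 gives size * segcount = 400, so num_phases starts
       at 1100 / 400 = 2; the remainder 300 is >= size and > 200 (half of
       400), so a third, shorter phase is added.  With count = 1000 the
       remainder 200 fails the second test and num_phases stays 2. */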
    /* Determine the number of elements per block and corresponding
       block sizes.
       The blocks are divided into "early" and "late" ones:
       blocks 0 .. (split_rank - 1) are "early" and
       blocks (split_rank) .. (size - 1) are "late".
       Early blocks are at most 1 element larger than the late ones.
       Note, these blocks will be split into num_phases segments,
       out of which the largest one will have max_segcount elements.
    */
    COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
                                   early_blockcount, late_blockcount )
    COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
                                   max_segcount, k )
    max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent;

    /* Allocate and initialize temporary buffers */
    inbuf[0] = (char*)malloc(max_real_segsize);
    if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
    if (size > 2) {
        inbuf[1] = (char*)malloc(max_real_segsize);
        if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
    }

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE != sbuf) {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    }

    /* Computation loop: for each phase, repeat ring allreduce computation loop */
    for (phase = 0; phase < num_phases; phase++) {
        ptrdiff_t phase_offset;
        int early_phase_segcount, late_phase_segcount, split_phase, phase_count;

        /*
           For each of the remote nodes:
           - post irecv for block (r-1)
           - send block (r)
             To do this, first compute block offset and count, and use block
             offset to compute phase offset.
           - in loop for every step k = 2 .. n
             - post irecv for block (r + n - k) % n
             - wait on block (r + n - k + 1) % n to arrive
             - compute on block (r + n - k + 1) % n
             - send block (r + n - k + 1) % n
           - wait on block (r + 1)
           - compute on block (r + 1)
           - send block (r + 1) to rank (r + 1)
           Note that we must be careful when computing the beginning of buffers,
           and for send operations and computation we must compute the exact
           block size.
        */
        send_to = (rank + 1) % size;
        recv_from = (rank + size - 1) % size;

        inbi = 0;
        /* Initialize first receive from the neighbor on the left */
        ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                                 MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
        /* Send first block (my block) to the neighbor on the right:
           - compute my block and phase offset
           - send data */
        block_offset = ((rank < split_rank) ?
                        ((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) :
                        ((ptrdiff_t)rank * (ptrdiff_t)late_blockcount + split_rank));
        block_count = ((rank < split_rank) ? early_blockcount : late_blockcount);
        COLL_TUNED_COMPUTE_BLOCKCOUNT( block_count, num_phases, split_phase,
                                       early_phase_segcount, late_phase_segcount )
        phase_count = ((phase < split_phase) ?
                       (early_phase_segcount) : (late_phase_segcount));
        phase_offset = ((phase < split_phase) ?
                        ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
                        ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
        tmpsend = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
        ret = MCA_PML_CALL(send(tmpsend, phase_count, dtype, send_to,
                                MCA_COLL_BASE_TAG_ALLREDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

        for (k = 2; k < size; k++) {
            const int prevblock = (rank + size - k + 1) % size;

            inbi = inbi ^ 0x1;

            /* Post irecv for the current block */
            ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                                     MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                     &reqs[inbi]));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

            /* Wait on previous block to arrive */
            ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

            /* Apply operation on previous block: result goes to rbuf
               rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
            */
            block_offset = ((prevblock < split_rank) ?
                            ((ptrdiff_t)prevblock * (ptrdiff_t)early_blockcount) :
                            ((ptrdiff_t)prevblock * (ptrdiff_t)late_blockcount + split_rank));
            block_count = ((prevblock < split_rank) ?
                           early_blockcount : late_blockcount);
            COLL_TUNED_COMPUTE_BLOCKCOUNT( block_count, num_phases, split_phase,
                                           early_phase_segcount, late_phase_segcount )
            phase_count = ((phase < split_phase) ?
                           (early_phase_segcount) : (late_phase_segcount));
            phase_offset = ((phase < split_phase) ?
                            ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
                            ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
            tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
            ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype);

            /* send previous block to send_to */
            ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to,
                                    MCA_COLL_BASE_TAG_ALLREDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
        }

        /* Wait on the last block to arrive */
        ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

        /* Apply operation on the last block (from neighbor (rank + 1)):
           rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
        recv_from = (rank + 1) % size;
        block_offset = ((recv_from < split_rank) ?
                        ((ptrdiff_t)recv_from * (ptrdiff_t)early_blockcount) :
                        ((ptrdiff_t)recv_from * (ptrdiff_t)late_blockcount + split_rank));
        block_count = ((recv_from < split_rank) ?
                       early_blockcount : late_blockcount);
        COLL_TUNED_COMPUTE_BLOCKCOUNT( block_count, num_phases, split_phase,
                                       early_phase_segcount, late_phase_segcount )
        phase_count = ((phase < split_phase) ?
                       (early_phase_segcount) : (late_phase_segcount));
        phase_offset = ((phase < split_phase) ?
                        ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
                        ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
        tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
        ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype);
    }

    /* Distribution loop - variation of ring allgather */
    send_to = (rank + 1) % size;
    recv_from = (rank + size - 1) % size;
    for (k = 0; k < size - 1; k++) {
        const int recv_data_from = (rank + size - k) % size;
        const int send_data_from = (rank + 1 + size - k) % size;
        const int send_block_offset =
            ((send_data_from < split_rank) ?
             ((ptrdiff_t)send_data_from * (ptrdiff_t)early_blockcount) :
             ((ptrdiff_t)send_data_from * (ptrdiff_t)late_blockcount + split_rank));
        const int recv_block_offset =
            ((recv_data_from < split_rank) ?
             ((ptrdiff_t)recv_data_from * (ptrdiff_t)early_blockcount) :
             ((ptrdiff_t)recv_data_from * (ptrdiff_t)late_blockcount + split_rank));
        block_count = ((send_data_from < split_rank) ?
                       early_blockcount : late_blockcount);

        tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent;
        tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent;

        ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to,
                                       MCA_COLL_BASE_TAG_ALLREDUCE,
                                       tmprecv, early_blockcount, dtype, recv_from,
                                       MCA_COLL_BASE_TAG_ALLREDUCE,
                                       comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    }

    if (NULL != inbuf[0]) free(inbuf[0]);
    if (NULL != inbuf[1]) free(inbuf[1]);
    return MPI_SUCCESS;

 error_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
                 __FILE__, line, rank, ret));
    if (NULL != inbuf[0]) free(inbuf[0]);
    if (NULL != inbuf[1]) free(inbuf[1]);
    return ret;
}

/*
 * Linear functions are copied from the BASIC coll module.
 * They do not segment the message and are simple implementations,
 * but for some small number of nodes and/or small data sizes they
 * are just as fast as tuned/tree based segmenting operations,
 * and as such may be selected by the decision functions.
 * These are copied into this module due to the way we select modules
 * in V1, i.e. in V2 we will handle this differently and so will not
 * have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */

/*
 *  allreduce_intra
 *
 *  Function:   - allreduce using other MPI collectives
 *  Accepts:    - same as MPI_Allreduce()
 *  Returns:    - MPI_SUCCESS or error code
 */
int
ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
                                             struct ompi_datatype_t *dtype,
                                             struct ompi_op_t *op,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    int err, rank;

    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_intra_basic_linear rank %d", rank));

    /* Reduce to 0 and broadcast. */

    if (MPI_IN_PLACE == sbuf) {
        if (0 == rank) {
            err = ompi_coll_tuned_reduce_intra_basic_linear(MPI_IN_PLACE, rbuf, count, dtype,
                                                            op, 0, comm, module);
        } else {
            err = ompi_coll_tuned_reduce_intra_basic_linear(rbuf, NULL, count, dtype,
                                                            op, 0, comm, module);
        }
    } else {
        err = ompi_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
                                                        op, 0, comm, module);
    }
    if (MPI_SUCCESS != err) {
        return err;
    }

    return ompi_coll_tuned_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module);
}

/* copied function (with appropriate renaming) ends here */

/* The following are used by dynamic and forced rules. */

/* Publish details of each algorithm and whether it's forced/fixed/locked in; */
/* as you add methods/algorithms you must update this and the query/map routines. */

/* This routine is called by the component only. */
/* It makes sure that the mca parameters are set to their initial values and perms. */
/* Modules do not call this; they call the forced_getvalues routine instead. */

int ompi_coll_tuned_allreduce_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    mca_base_var_enum_t *new_enum;

    ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = coll_tuned_allreduce_algorithm_count;
    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                           "allreduce_algorithm_count",
                                           "Number of allreduce algorithms available",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                           OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &coll_tuned_allreduce_algorithm_count);

    /* MPI_T: This variable should eventually be bound to a communicator */
    coll_tuned_allreduce_forced_algorithm = 0;
    (void) mca_base_var_enum_create("coll_tuned_allreduce_algorithms", allreduce_algorithms, &new_enum);
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "allreduce_algorithm",
                                        "Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_allreduce_forced_algorithm);
    OBJ_RELEASE(new_enum);
    if (mca_param_indices->algorithm_param_index < 0) {
        return mca_param_indices->algorithm_param_index;
    }

    coll_tuned_allreduce_segment_size = 0;
    mca_param_indices->segsize_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "allreduce_algorithm_segmentsize",
                                        "Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_allreduce_segment_size);

    coll_tuned_allreduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
    mca_param_indices->tree_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "allreduce_algorithm_tree_fanout",
                                        "Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_allreduce_tree_fanout);

    coll_tuned_allreduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
    mca_param_indices->chain_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "allreduce_algorithm_chain_fanout",
                                        "Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_allreduce_chain_fanout);

    return (MPI_SUCCESS);
}
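
/*
 * Usage note (illustrative): the parameters registered above can be set
 * on the command line to force a particular algorithm at run time, e.g.
 * the segmented ring with a 64 KB segment.  Forcing typically also
 * requires enabling the tuned component's dynamic rules:
 *
 *   mpirun --mca coll_tuned_use_dynamic_rules 1 \
 *          --mca coll_tuned_allreduce_algorithm 5 \
 *          --mca coll_tuned_allreduce_algorithm_segmentsize 65536 \
 *          -np 16 ./app
 */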

int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
                                              struct ompi_datatype_t *dtype,
                                              struct ompi_op_t *op,
                                              struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_intra_do_forced selected algorithm %d, segment size %d",
                 data->user_forced[ALLREDUCE].algorithm,
                 data->user_forced[ALLREDUCE].segsize));

    switch (data->user_forced[ALLREDUCE].algorithm) {
    case (0):  return ompi_coll_tuned_allreduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, comm, module);
    case (1):  return ompi_coll_tuned_allreduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, comm, module);
    case (2):  return ompi_coll_tuned_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
    case (3):  return ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module);
    case (4):  return ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module);
    case (5):  return ompi_coll_tuned_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, data->user_forced[ALLREDUCE].segsize);
    default:
        OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                     data->user_forced[ALLREDUCE].algorithm,
                     ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
        return (MPI_ERR_ARG);
    } /* switch */
}

int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
                                            struct ompi_datatype_t *dtype,
                                            struct ompi_op_t *op,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module,
                                            int algorithm, int faninout, int segsize)
{
    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
                 algorithm, faninout, segsize));

    switch (algorithm) {
    case (0):  return ompi_coll_tuned_allreduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, comm, module);
    case (1):  return ompi_coll_tuned_allreduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, comm, module);
    case (2):  return ompi_coll_tuned_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
    case (3):  return ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module);
    case (4):  return ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module);
    case (5):  return ompi_coll_tuned_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize);
    default:
        OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                     algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
        return (MPI_ERR_ARG);
    } /* switch */
}