/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2014 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All Rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
*/
# include "ompi_config.h"
# include "mpi.h"
# include "opal/util/bit_ops.h"
2006-02-12 04:33:29 +03:00
# include "ompi/constants.h"
# include "ompi/communicator/communicator.h"
# include "ompi/mca/coll/coll.h"
# include "ompi/mca/coll/base/coll_tags.h"
# include "ompi/mca/pml/pml.h"
2005-09-14 00:58:42 +04:00
# include "coll_tuned.h"
# include "coll_tuned_topo.h"
# include "coll_tuned_util.h"
/* barrier algorithm variables */
static int coll_tuned_barrier_algorithm_count = 6;
static int coll_tuned_barrier_forced_algorithm = 0;

/* valid values for coll_tuned_barrier_forced_algorithm */
static mca_base_var_enum_value_t barrier_algorithms[] = {
    {0, "ignore"},
    {1, "linear"},
    {2, "double_ring"},
    {3, "recursive_doubling"},
    {4, "bruck"},
    {5, "two_proc"},
    {6, "tree"},
    {0, NULL}
};
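
/*
 * Each entry above can be selected at runtime through the
 * "barrier_algorithm" MCA parameter registered below in
 * ompi_coll_tuned_barrier_intra_check_forced_init().  As an illustrative
 * sketch (the exact option spellings are assumptions; check your Open MPI
 * version), a user might force the Bruck variant with something like:
 *
 *   mpirun --mca coll_tuned_use_dynamic_rules 1 \
 *          --mca coll_tuned_barrier_algorithm 4  ./app
 */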
/**
 * A quick version of MPI_Sendrecv implemented for the barrier.
 * No actual data is moved across the wire; we use 0-byte messages to
 * signal a two-peer synchronization.
 */
static inline int
ompi_coll_tuned_sendrecv_zero(int dest, int stag,
                              int source, int rtag,
                              MPI_Comm comm)
{
    int err, line = 0;
    ompi_request_t* reqs[2];
    ompi_status_public_t statuses[2];

    /* post new irecv */
    err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, source, rtag,
                             comm, &reqs[0]));
    if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }

    /* send data to children */
    err = MCA_PML_CALL(isend(NULL, 0, MPI_BYTE, dest, stag,
                             MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1]));
    if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }

    err = ompi_request_wait_all(2, reqs, statuses);
    if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }

    return (MPI_SUCCESS);

 error_handler:
    /* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error
     * code that we can propagate up the stack.  Instead, look for the real
     * error code in the MPI_ERROR field of the status.
     */
    if (MPI_ERR_IN_STATUS == err) {
        /* At least we know the error was detected during the wait_all.
         * reqs[0] is the receive: if its status is clean, the failure
         * must have come from the send in reqs[1]. */
        int err_index = 0;
        if (MPI_SUCCESS == statuses[0].MPI_ERROR) {
            err_index = 1;
        }
        err = statuses[err_index].MPI_ERROR;
        OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s"
                     " stage of ompi_coll_tuned_sendrecv_zero\n",
                     __FILE__, line, err, (0 == err_index ? "receive" : "send")));
    } else {
        /* Error discovered during the posting of the irecv or isend,
         * and no status is available.
         */
        OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",
                     __FILE__, line, err));
    }
    return err;
}
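
/*
 * For reference (added note, not part of the original source): this is
 * semantically close to a zero-count
 *   MPI_Sendrecv(NULL, 0, MPI_BYTE, dest,   stag,
 *                NULL, 0, MPI_BYTE, source, rtag, comm, &status);
 * but posting both requests directly through the PML avoids the MPI-layer
 * argument checking and keeps the send and receive concurrent.
 */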
/*
 * Barrier is meant to be a synchronous operation: since some BTLs can mark
 * a request as done before it is passed to the NIC, and progress might not
 * be made elsewhere, we cannot allow a process to exit the barrier until
 * its last [round of] sends is completed.
 *
 * It is the last round of sends rather than the 'last' individual send, as
 * each pair of peers can use different channels/devices/BTLs and the
 * receiver of one of these sends might be forced to wait as the sender
 * leaves the collective and does not make progress until the next MPI call.
 */
/*
 * Simple double ring version of barrier
 *
 * The synchronous guarantee is made by the last ring of sends, which are
 * synchronous.
 */
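
/*
 * Illustrative trace (added for clarity): with size = 4 the zero-byte
 * token travels 0 -> 1 -> 2 -> 3 -> 0 twice.  The first lap uses standard
 * sends and only establishes that every rank has entered the barrier; the
 * second lap uses synchronous sends, so no rank can return before its
 * final send has been matched by its right neighbor.
 */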
int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    int rank, size, err = 0, line = 0, left, right;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_doublering rank %d", rank));

    /* add size before taking the modulus so that rank 0 computes size-1
     * rather than the negative result of (-1 % size) */
    left = ((rank - 1 + size) % size);
    right = ((rank + 1) % size);

    if (rank > 0) { /* receive message from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Send message to the right */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_STANDARD, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    /* root needs to receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Allow nodes to exit */
    if (rank > 0) { /* post receive from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* send message to the right one */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    /* rank 0 posts a receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
/*
 * To make synchronous, uses sync sends and sync sendrecvs
 */
int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
                                                    mca_coll_base_module_t *module)
{
    int rank, size, adjsize, err, line = 0, mask, remote;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_barrier_intra_recursivedoubling rank %d",
                 rank));

    /* compute the largest power of two less than or equal to size */
    adjsize = opal_next_poweroftwo(size);
    adjsize >>= 1;
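
    /* Worked example (added for clarity): for size = 6,
     * opal_next_poweroftwo(6) returns 8, so adjsize = 4; the two "extra"
     * ranks 4 and 5 are folded into ranks 0 and 1 below. */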
    /* if size is not an exact power of two, perform an extra step */
    if (adjsize != size) {
        if (rank >= adjsize) {
            /* send message to lower ranked node */
            remote = rank - adjsize;
            err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                                remote, MCA_COLL_BASE_TAG_BARRIER,
                                                comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        } else if (rank < (size - adjsize)) {
            /* receive message from higher ranked node */
            err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank + adjsize,
                                    MCA_COLL_BASE_TAG_BARRIER, comm,
                                    MPI_STATUS_IGNORE));
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        }
    }
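
    /* Exchange pattern (added for clarity): with adjsize = 4 the loop
     * below pairs (0,1) and (2,3) at mask = 1, then (0,2) and (1,3) at
     * mask = 2; after log2(adjsize) rounds every rank has synchronized,
     * directly or transitively, with every other rank. */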
    /* exchange messages */
    if (rank < adjsize) {
        mask = 0x1;
        while (mask < adjsize) {
            remote = rank ^ mask;
            mask <<= 1;
            if (remote >= adjsize) continue;

            /* post receive from the remote node */
            err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                                remote, MCA_COLL_BASE_TAG_BARRIER,
                                                comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        }
    }
    /* non-power-of-two case */
    if (adjsize != size) {
        if (rank < (size - adjsize)) {
            /* send enter message to higher ranked node */
            remote = rank + adjsize;
            err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
/*
 * To make synchronous, uses sync sends and sync sendrecvs
 */
int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module)
{
    int rank, size, distance, to, from, err, line = 0;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_barrier_intra_bruck rank %d", rank));

    /* exchange data with rank-2^k and rank+2^k */
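    /* Round structure (added for clarity): e.g. for size = 5 the loop
     * runs distance = 1, 2, 4; in each round rank r synchronizes with
     * (r + distance) mod size and (r - distance + size) mod size, so
     * ceil(log2(size)) rounds suffice for full synchronization. */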
    for (distance = 1; distance < size; distance <<= 1) {
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* send a token forward and receive one from behind at this distance */
        err = ompi_coll_tuned_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
                                            from, MCA_COLL_BASE_TAG_BARRIER,
                                            comm);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
/*
 * To make synchronous, uses sync sends and sync sendrecvs
 */
/* special case for two processes */
int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    int remote, err;

    remote = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_barrier_intra_two_procs rank %d", remote));

    /* flip the low bit: rank 0 talks to rank 1 and vice versa */
    remote = (remote + 1) & 0x1;

    err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                        remote, MCA_COLL_BASE_TAG_BARRIER,
                                        comm);
    return (err);
}
/*
 * Linear functions are copied from the BASIC coll module.
 * They do not segment the message and are simple implementations,
 * but for some small number of nodes and/or small data sizes they
 * are just as fast as tuned/tree-based segmenting operations
 * and as such may be selected by the decision functions.
 * These are copied into this module due to the way we select modules
 * in V1, i.e. in V2 we will handle this differently and so will not
 * have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */
static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
                                                      mca_coll_base_module_t *module)
{
    int i, err, rank, size;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* All non-root send & receive zero-length message. */
    if (rank > 0) {
        err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, 0,
                                MCA_COLL_BASE_TAG_BARRIER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) {
            return err;
        }
        err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, 0,
                                MCA_COLL_BASE_TAG_BARRIER,
                                comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            return err;
        }
    }
    /* The root collects and broadcasts the messages. */
    else {
        ompi_request_t **requests;

        /* entry 0 is unused: the array is indexed by peer rank */
        requests = (ompi_request_t**)malloc(size * sizeof(ompi_request_t*));
        if (NULL == requests) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     MCA_COLL_BASE_TAG_BARRIER, comm,
                                     &(requests[i])));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
        ompi_request_wait_all(size - 1, requests + 1, MPI_STATUSES_IGNORE);

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(isend(NULL, 0, MPI_BYTE, i,
                                     MCA_COLL_BASE_TAG_BARRIER,
                                     MCA_PML_BASE_SEND_STANDARD, comm,
                                     &(requests[i])));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
        ompi_request_wait_all(size - 1, requests + 1, MPI_STATUSES_IGNORE);
        free(requests);
    }

    /* All done */
    return MPI_SUCCESS;
}
/* copied function (with appropriate renaming) ends here */
/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
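
/*
 * Phase sketch (added for clarity): in the fan-in loop each rank receives
 * from the partners that differ from it in one higher bit and then reports
 * to a lower-ranked partner, so rank 0 ends up knowing that everyone has
 * arrived.  The fan-out loop replays the same pairings in reverse order to
 * release all ranks.
 */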
int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    int rank, size, depth, err, jump, partner;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_barrier_intra_tree %d",
                 rank));

    /* Find the nearest power of 2 of the communicator size. */
    depth = opal_next_poweroftwo_inclusive(size);
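
    /* Example (added for clarity, assuming the usual semantics of the
     * inclusive variant, i.e. smallest power of two >= size): size = 6
     * gives depth = 8, while size = 4 gives depth = 4. */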
    for (jump = 1; jump < depth; jump <<= 1) {
        partner = rank ^ jump;
        if (!(partner & (jump - 1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, partner,
                                        MCA_COLL_BASE_TAG_BARRIER, comm,
                                        MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, partner,
                                        MCA_COLL_BASE_TAG_BARRIER,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    depth >>= 1;
    for (jump = depth; jump > 0; jump >>= 1) {
        partner = rank ^ jump;
        if (!(partner & (jump - 1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, partner,
                                        MCA_COLL_BASE_TAG_BARRIER,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, partner,
                                        MCA_COLL_BASE_TAG_BARRIER, comm,
                                        MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    return MPI_SUCCESS;
}
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and whether it is forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map */
/* routines */

/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values */
/* and perms */
/* modules do not call this; they call the forced_getvalues routine instead */
int ompi_coll_tuned_barrier_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    mca_base_var_enum_t *new_enum;

    ompi_coll_tuned_forced_max_algorithms[BARRIER] = coll_tuned_barrier_algorithm_count;

    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                           "barrier_algorithm_count",
                                           "Number of barrier algorithms available",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                           OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &coll_tuned_barrier_algorithm_count);

    /* MPI_T: This variable should eventually be bound to a communicator */
    coll_tuned_barrier_forced_algorithm = 0;
    (void) mca_base_var_enum_create("coll_tuned_barrier_algorithms", barrier_algorithms, &new_enum);
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "barrier_algorithm",
                                        "Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3 recursive doubling, 4 bruck, 5 two proc only, 6 tree",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_barrier_forced_algorithm);
    OBJ_RELEASE(new_enum);

    if (mca_param_indices->algorithm_param_index < 0) {
        return mca_param_indices->algorithm_param_index;
    }

    return (MPI_SUCCESS);
}
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:barrier_intra_do_forced selected algorithm %d",
                 data->user_forced[BARRIER].algorithm));

    switch (data->user_forced[BARRIER].algorithm) {
    case (0):  return ompi_coll_tuned_barrier_intra_dec_fixed(comm, module);
    case (1):  return ompi_coll_tuned_barrier_intra_basic_linear(comm, module);
    case (2):  return ompi_coll_tuned_barrier_intra_doublering(comm, module);
    case (3):  return ompi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
    case (4):  return ompi_coll_tuned_barrier_intra_bruck(comm, module);
    case (5):  return ompi_coll_tuned_barrier_intra_two_procs(comm, module);
    case (6):  return ompi_coll_tuned_barrier_intra_tree(comm, module);
    default:
        OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                     data->user_forced[BARRIER].algorithm,
                     ompi_coll_tuned_forced_max_algorithms[BARRIER]));
        return (MPI_ERR_ARG);
    } /* switch */
}
int ompi_coll_tuned_barrier_intra_do_this(struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
                                          int algorithm, int faninout, int segsize)
{
    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out %d", algorithm, faninout));

    switch (algorithm) {
    case (0):  return ompi_coll_tuned_barrier_intra_dec_fixed(comm, module);
    case (1):  return ompi_coll_tuned_barrier_intra_basic_linear(comm, module);
    case (2):  return ompi_coll_tuned_barrier_intra_doublering(comm, module);
    case (3):  return ompi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
    case (4):  return ompi_coll_tuned_barrier_intra_bruck(comm, module);
    case (5):  return ompi_coll_tuned_barrier_intra_two_procs(comm, module);
    case (6):  return ompi_coll_tuned_barrier_intra_tree(comm, module);
    default:
        OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                     algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
        return (MPI_ERR_ARG);
    } /* switch */
}