/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2012 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2012      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "ompi_config.h"
#include "mpi.h"

#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"

/* bcast algorithm variables */
static int coll_tuned_bcast_algorithm_count = 6;
static int coll_tuned_bcast_forced_algorithm = 0;
static int coll_tuned_bcast_segment_size = 0;
static int coll_tuned_bcast_tree_fanout;
static int coll_tuned_bcast_chain_fanout;

/* valid values for coll_tuned_bcast_forced_algorithm */
static mca_base_var_enum_value_t bcast_algorithms[] = {
    {0, "ignore"},
    {1, "basic_linear"},
    {2, "chain"},
    {3, "pipeline"},
    {4, "split_binary_tree"},
    {5, "binary_tree"},
    {6, "binomial"},
    {0, NULL}
};
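
/*
 * The values above correspond to the implementations below: 0 falls back to
 * the fixed decision function, 1 selects the basic linear broadcast, 2 the
 * chain, 3 the pipeline, 4 the split binary tree, 5 the binary tree and
 * 6 the binomial tree (see the switch statements in the do_forced/do_this
 * routines at the end of this file).
 */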

int
ompi_coll_tuned_bcast_intra_generic( void* buffer,
                                     int original_count,
                                     struct ompi_datatype_t* datatype,
                                     int root,
                                     struct ompi_communicator_t* comm,
                                     mca_coll_base_module_t *module,
                                     uint32_t count_by_segment,
                                     ompi_coll_tree_t* tree )
{
    int err = 0, line, i, rank, size, segindex, req_index;
    int num_segments; /* Number of segments */
    int sendcount;    /* number of elements sent in this segment */
    size_t realsegsize, type_size;
    char *tmpbuf;
    ptrdiff_t extent, lb;
    ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
    ompi_request_t **send_reqs = NULL;
#endif

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    assert( size > 1 );
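
    /*
     * The buffer is walked in num_segments chunks of count_by_segment
     * elements each; the tree argument supplied by the caller determines
     * who this rank receives from (tree_prev) and forwards to (tree_next).
     */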
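
    /*
     * Segmentation example: with original_count = 1000 and
     * count_by_segment = 300 the rounded-up division below gives
     * num_segments = 4; the first three segments carry 300 elements each and
     * the last one the remaining 100.  realsegsize is the extent of one full
     * segment in bytes and is used to advance tmpbuf from segment to segment.
     */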
    ompi_datatype_get_extent( datatype, &lb, &extent );
    ompi_datatype_type_size( datatype, &type_size );
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    realsegsize = (ptrdiff_t)count_by_segment * extent;

    /* Set the buffer pointers */
    tmpbuf = (char *) buffer;

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
    if( tree->tree_nextsize != 0 ) {
        send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
                                              sizeof(ompi_request_t*) );
    }
#endif

    /* Root code */
    if( rank == root ) {
        /*
           For each segment:
           - send segment to all children.
             The last segment may have less elements than other segments.
        */
        sendcount = count_by_segment;
        for( segindex = 0; segindex < num_segments; segindex++ ) {
            if( segindex == (num_segments - 1) ) {
                sendcount = original_count - segindex * count_by_segment;
            }
            for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
                err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
                                        tree->tree_next[i],
                                        MCA_COLL_BASE_TAG_BCAST,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
#else
                err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
                                         tree->tree_next[i],
                                         MCA_COLL_BASE_TAG_BCAST,
                                         MCA_PML_BASE_SEND_STANDARD, comm,
                                         &send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
                if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
            }

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
            /* complete the sends before starting the next sends */
            err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
                                         MPI_STATUSES_IGNORE );
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* not COLL_TUNED_BCAST_USE_BLOCKING */

            /* update tmp buffer */
            tmpbuf += realsegsize;
        }
    }
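
    /*
     * The two branches below (intermediate and leaf nodes) keep two receives
     * in flight at a time: recv_reqs[] is used as a double buffer and
     * req_index ^ 0x1 toggles between the receive being waited on and the
     * one that was just posted, which overlaps the reception of segment k+1
     * with the forwarding (or consumption) of segment k.
     */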
    /* Intermediate nodes code */
    else if( tree->tree_nextsize > 0 ) {
        /*
           Create the pipeline.
           1) Post the first receive
           2) For segments 1 .. num_segments
              - post new receive
              - wait on the previous receive to complete
              - send this data to children
           3) Wait on the last segment
           4) Compute number of elements in last segment.
           5) Send the last segment to children
        */
        req_index = 0;
        err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
                                 tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                 comm, &recv_reqs[req_index]));
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        for( segindex = 1; segindex < num_segments; segindex++ ) {

            req_index = req_index ^ 0x1;

            /* post new irecv */
            err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment,
                                      datatype, tree->tree_prev,
                                      MCA_COLL_BASE_TAG_BCAST,
                                      comm, &recv_reqs[req_index]));
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* wait for and forward the previous segment to children */
            err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
                                     MPI_STATUSES_IGNORE );
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
                err = MCA_PML_CALL(send(tmpbuf, count_by_segment, datatype,
                                        tree->tree_next[i],
                                        MCA_COLL_BASE_TAG_BCAST,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
#else
                err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype,
                                         tree->tree_next[i],
                                         MCA_COLL_BASE_TAG_BCAST,
                                         MCA_PML_BASE_SEND_STANDARD, comm,
                                         &send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
                if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
            }

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
            /* complete the sends before starting the next iteration */
            err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
                                         MPI_STATUSES_IGNORE );
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */

            /* Update the receive buffer */
            tmpbuf += realsegsize;
        }

        /* Process the last segment */
        err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUSES_IGNORE );
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment;
        for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
            err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
                                    tree->tree_next[i],
                                    MCA_COLL_BASE_TAG_BCAST,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
#else
            err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
                                     tree->tree_next[i],
                                     MCA_COLL_BASE_TAG_BCAST,
                                     MCA_PML_BASE_SEND_STANDARD, comm,
                                     &send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
        err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
                                     MPI_STATUSES_IGNORE );
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
    }
    /* Leaf nodes */
    else {
        /*
           Receive all segments from parent in a loop:
           1) post irecv for the first segment
           2) for segments 1 .. num_segments
              - post irecv for the next segment
              - wait on the previous segment to arrive
           3) wait for the last segment
        */
        req_index = 0;
        err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
                                 tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                 comm, &recv_reqs[req_index]));
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        for( segindex = 1; segindex < num_segments; segindex++ ) {
            req_index = req_index ^ 0x1;
            tmpbuf += realsegsize;
            /* post receive for the next segment */
            err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
                                     tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                     comm, &recv_reqs[req_index]));
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
            /* wait on the previous segment */
            err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
                                     MPI_STATUS_IGNORE );
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
    }

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
    if( NULL != send_reqs ) free( send_reqs );
#endif

    return (MPI_SUCCESS);
 error_hndl:
    OPAL_OUTPUT( (ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank) );
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
    if( NULL != send_reqs ) free( send_reqs );
#endif
    return (err);
}
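
/*
 * The wrappers below convert the requested segment size in bytes into a
 * number of elements per segment (COLL_TUNED_COMPUTED_SEGCOUNT), update the
 * cached topology for this communicator and root, and then call the generic
 * segmented broadcast above with that topology.
 */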
int
ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
                                      int count,
                                      struct ompi_datatype_t* datatype,
                                      int root,
                                      struct ompi_communicator_t* comm,
                                      mca_coll_base_module_t *module,
                                      uint32_t segsize )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );

    /**
     * Determine number of elements sent per operation.
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d",
                 ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));

    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
                                                segcount, data->cached_bintree );
}

int
ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
                                      int count,
                                      struct ompi_datatype_t* datatype,
                                      int root,
                                      struct ompi_communicator_t* comm,
                                      mca_coll_base_module_t *module,
                                      uint32_t segsize )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );

    /**
     * Determine number of elements sent per operation.
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d",
                 ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));

    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
                                                segcount, data->cached_pipeline );
}

int
ompi_coll_tuned_bcast_intra_chain( void* buffer,
                                   int count,
                                   struct ompi_datatype_t* datatype,
                                   int root,
                                   struct ompi_communicator_t* comm,
                                   mca_coll_base_module_t *module,
                                   uint32_t segsize, int32_t chains )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    COLL_TUNED_UPDATE_CHAIN( comm, tuned_module, root, chains );

    /**
     * Determine number of elements sent per operation.
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d",
                 ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount));

    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
                                                segcount, data->cached_chain );
}

int
ompi_coll_tuned_bcast_intra_binomial( void* buffer,
                                      int count,
                                      struct ompi_datatype_t* datatype,
                                      int root,
                                      struct ompi_communicator_t* comm,
                                      mca_coll_base_module_t *module,
                                      uint32_t segsize )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    COLL_TUNED_UPDATE_BMTREE( comm, tuned_module, root );

    /**
     * Determine number of elements sent per operation.
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
                 ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));

    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
                                                segcount, data->cached_bmtree );
}

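/*
 * Split binary tree broadcast: the message is split into two halves; each
 * half is pipelined down one of the two subtrees hanging off the root
 * (Step 1), and afterwards every rank exchanges the half it owns with a
 * paired rank from the opposite subtree (Step 2), so that everyone ends up
 * with the full buffer.
 */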
int
ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
                                            int count,
                                            struct ompi_datatype_t* datatype,
                                            int root,
                                            struct ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize )
{
    int err = 0, line, rank, size, segindex, i, lr, pair;
    uint32_t counts[2];
    int segcount[2];     /* Number of elements sent with each segment */
    int num_segments[2]; /* Number of segments */
    int sendcount[2];    /* the same as segcount, except for the last segment */
    size_t realsegsize[2], type_size;
    char *tmpbuf[2];
    ptrdiff_t type_extent, lb;
    ompi_request_t *base_req, *new_req;
    ompi_coll_tree_t *tree;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize));

    if (size == 1) {
        return MPI_SUCCESS;
    }

    /* setup the binary tree topology. */
    COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
    tree = data->cached_bintree;
    err = ompi_datatype_type_size( datatype, &type_size );

    /* Determine number of segments and number of elements per segment */
    counts[0] = count / 2;
    if (count % 2 != 0) counts[0]++;
    counts[1] = count - counts[0];
    if ( segsize > 0 ) {
        /* Note that ompi_datatype_type_size() will never return a negative
           value in typelng; it returns an int [vs. an unsigned type]
           because of the MPI spec. */
        if (segsize < ((uint32_t)type_size)) {
            segsize = type_size; /* push segsize up to hold one type */
        }
        segcount[0] = segcount[1] = segsize / type_size;
        num_segments[0] = counts[0] / segcount[0];
        if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
        num_segments[1] = counts[1] / segcount[1];
        if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
    } else {
        segcount[0]     = counts[0];
        segcount[1]     = counts[1];
        num_segments[0] = num_segments[1] = 1;
    }

    /* if the message is too small to be split into segments */
    if( (counts[0] == 0 || counts[1] == 0) ||
        (segsize > ((ptrdiff_t)counts[0] * type_size)) ||
        (segsize > ((ptrdiff_t)counts[1] * type_size)) ) {
        /* call linear version here ! */
        return (ompi_coll_tuned_bcast_intra_chain( buffer, count, datatype,
                                                   root, comm, module,
                                                   segsize, 1 ));
    }
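
    /* Example of the values computed above: count = 1000 elements of a 4-byte
       type with segsize = 1024 gives counts = {500, 500}, segcount = 256 and
       num_segments = {2, 2}; the small-message test just before this point
       does not trigger because segsize is smaller than each half in bytes. */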
    err = ompi_datatype_get_extent( datatype, &lb, &type_extent );

    /* Determine real segment size */
    realsegsize[0] = (ptrdiff_t)segcount[0] * type_extent;
    realsegsize[1] = (ptrdiff_t)segcount[1] * type_extent;

    /* set the buffer pointers */
    tmpbuf[0] = (char *) buffer;
    tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;

    /* Step 1:
       Root splits the buffer in 2 and sends segmented message down the branches.
       Left subtree of the tree receives first half of the buffer, while right
       subtree receives the remaining message.
    */

    /* determine if I am left (0) or right (1), (root is right) */
    lr = ((rank + size - root) % size + 1) % 2;

    /* root code */
    if( rank == root ) {
        /* determine segment count */
        sendcount[0] = segcount[0];
        sendcount[1] = segcount[1];
        /* for each segment */
        for (segindex = 0; segindex < num_segments[0]; segindex++) {
            /* for each child */
            for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) {
                if (segindex >= num_segments[i]) { /* no more segments */
                    continue;
                }
                /* determine how many elements are being sent in this round */
                if (segindex == (num_segments[i] - 1))
                    sendcount[i] = counts[i] - segindex * segcount[i];
                /* send data */
                MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype,
                                  tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
                                  MCA_PML_BASE_SEND_STANDARD, comm));
                if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                /* update tmp buffer */
                tmpbuf[i] += realsegsize[i];
            }
        }
    }
    /* intermediate nodes code */
    else if( tree->tree_nextsize > 0 ) {
        /* Intermediate nodes:
         * It will receive segments only from one half of the data.
         * Which one is determined by whether the node belongs to the "left" or "right"
         * subtree. The topology building function builds the binary tree such that
         * odd "shifted ranks" ((rank + size - root) % size) are on the left subtree,
         * and even ones on the right subtree.
         *
         * Create the pipeline. We first post the first receive, then in the loop we
         * post the next receive and after that wait for the previous receive to complete
         * and disseminate the data to all children.
         */
        sendcount[lr] = segcount[lr];
        err = MCA_PML_CALL(irecv(tmpbuf[lr], sendcount[lr], datatype,
                                 tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                 comm, &base_req));
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
            /* determine how many elements to expect in this round */
            if( segindex == (num_segments[lr] - 1) )
                sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr];
            /* post new irecv */
            err = MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
                                      datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                      comm, &new_req));
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* wait for and forward current segment */
            err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE );
            for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */
                err = MCA_PML_CALL(send( tmpbuf[lr], segcount[lr], datatype,
                                         tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
            } /* end of for each child */

            /* update the base request */
            base_req = new_req;
            /* go to the next buffer (ie. the one corresponding to the next recv) */
            tmpbuf[lr] += realsegsize[lr];
        } /* end of for segindex */

        /* wait for the last segment and forward current segment */
        err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE );
        for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
            err = MCA_PML_CALL(send(tmpbuf[lr], sendcount[lr], datatype,
                                    tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        } /* end of for each child */
    }
    /* leaf nodes */
    else {
        /* Just consume segments as fast as possible */
        sendcount[lr] = segcount[lr];
        for (segindex = 0; segindex < num_segments[lr]; segindex++) {
            /* determine how many elements to expect in this round */
            if (segindex == (num_segments[lr] - 1))
                sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr];
            /* receive segments */
            err = MCA_PML_CALL(recv(tmpbuf[lr], sendcount[lr], datatype,
                                    tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                    comm, MPI_STATUS_IGNORE));
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
            /* update the initial pointer to the buffer */
            tmpbuf[lr] += realsegsize[lr];
        }
    }

    /* reset the buffer pointers */
    tmpbuf[0] = (char *) buffer;
    tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;
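
    /* Pairing example (a sketch based on the formulas below, leaving the root
       aside): with size = 5 and root = 0, rank 1 has shifted rank 1 (odd, left
       subtree, lr = 0) and pairs with rank 2, while rank 2 has shifted rank 2
       (even, right subtree, lr = 1) and pairs back with rank 1, so the
       exchange is symmetric. */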
    /* Step 2:
       Find your immediate pair (identical node in opposite subtree) and SendRecv
       data buffer with them.
       The tree building function ensures that
       if (we are not root)
           if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
           if we are in the right subtree (lr == 1) our pair is (rank-1)%size
       If we have an even number of nodes the rank (size-1) will pair up with root.
    */
    if (lr == 0) {
        pair = (rank + 1) % size;
    } else {
        pair = (rank + size - 1) % size;
    }

    if ( (size % 2) != 0 && rank != root) {
        err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
                                        pair, MCA_COLL_BASE_TAG_BCAST,
                                        tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
                                        pair, MCA_COLL_BASE_TAG_BCAST,
                                        comm, MPI_STATUS_IGNORE, rank);
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
    } else if ( (size % 2) == 0 ) {
        /* root sends right buffer to the last node */
        if( rank == root ) {
            err = MCA_PML_CALL(send(tmpbuf[1], counts[1], datatype,
                                    (root+size-1)%size, MCA_COLL_BASE_TAG_BCAST,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }
        /* last node receives right buffer from the root */
        else if (rank == (root+size-1)%size) {
            err = MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype,
                                    root, MCA_COLL_BASE_TAG_BCAST,
                                    comm, MPI_STATUS_IGNORE));
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }
        /* everyone else exchanges buffers */
        else {
            err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
                                            pair, MCA_COLL_BASE_TAG_BCAST,
                                            tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
                                            pair, MCA_COLL_BASE_TAG_BCAST,
                                            comm, MPI_STATUS_IGNORE, rank);
            if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }
    }
    return (MPI_SUCCESS);

 error_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank));
    return (err);
}

/*
 * Linear functions are copied from the BASIC coll module
 * they do not segment the message and are simple implementations
 * but for some small number of nodes and/or small data sizes they
 * are just as fast as tuned/tree based segmenting operations
 * and as such may be selected by the decision functions
 * These are copied into this module due to the way we select modules
 * in V1. i.e. in V2 we will handle this differently and so will not
 * have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */

/*
 *  bcast_lin_intra
 *
 *  Function:   - broadcast using O(N) algorithm
 *  Accepts:    - same arguments as MPI_Bcast()
 *  Returns:    - MPI_SUCCESS or error code
 */
int
ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
                                          struct ompi_datatype_t *datatype, int root,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    int i, size, rank, err;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    ompi_request_t **preq, **reqs = data->mcct_reqs;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_basic_linear rank %d root %d", rank, root));
    /* Non-root receive the data. */
    if (rank != root) {
        return MCA_PML_CALL(recv(buff, count, datatype, root,
                                 MCA_COLL_BASE_TAG_BCAST, comm,
                                 MPI_STATUS_IGNORE));
    }

    /* Root sends data to all others. */
    for (i = 0, preq = reqs; i < size; ++i) {
        if (i == rank) {
            continue;
        }

        err = MCA_PML_CALL(isend_init(buff, count, datatype, i,
                                      MCA_COLL_BASE_TAG_BCAST,
                                      MCA_PML_BASE_SEND_STANDARD,
                                      comm, preq++));
        if (MPI_SUCCESS != err) {
            return err;
        }
    }
    --i;

    /* Start your engines.  This will never return an error. */
    MCA_PML_CALL(start(i, reqs));

    /* Wait for them all.  If there's an error, note that we don't
     * care what the error was -- just that there *was* an error.  The
     * PML will finish all requests, even if one or more of them fail.
     * i.e., by the end of this call, all the requests are free-able.
     * So free them anyway -- even if there was an error, and return
     * the error after we free everything. */
    err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);

    /* Free the reqs */
    ompi_coll_tuned_free_reqs(reqs, i);

    /* All done */
    return err;
}
/* copied function (with appropriate renaming) ends here */

/* The following are used by dynamic and forced rules */

/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */

/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */

int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    mca_base_var_enum_t *new_enum;

    ompi_coll_tuned_forced_max_algorithms[BCAST] = coll_tuned_bcast_algorithm_count;

    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                           "bcast_algorithm_count",
                                           "Number of bcast algorithms available",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                           OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &coll_tuned_bcast_algorithm_count);

    /* MPI_T: This variable should eventually be bound to a communicator */
    coll_tuned_bcast_forced_algorithm = 0;
    (void) mca_base_var_enum_create("coll_tuned_bcast_algorithms", bcast_algorithms, &new_enum);
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "bcast_algorithm",
                                        "Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_bcast_forced_algorithm);
    OBJ_RELEASE(new_enum);
    if (mca_param_indices->algorithm_param_index < 0) {
        return mca_param_indices->algorithm_param_index;
    }
    coll_tuned_bcast_segment_size = 0;
    mca_param_indices->segsize_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "bcast_algorithm_segmentsize",
                                        "Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_bcast_segment_size);

    coll_tuned_bcast_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
    mca_param_indices->tree_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "bcast_algorithm_tree_fanout",
                                        "Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_bcast_tree_fanout);

    coll_tuned_bcast_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
    mca_param_indices->chain_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "bcast_algorithm_chain_fanout",
                                        "Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_bcast_chain_fanout);

    return (MPI_SUCCESS);
}

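/*
 * Example usage of the parameters registered above (a sketch; it assumes the
 * usual "coll_tuned_" prefix added by the MCA framework and that forced
 * algorithms are enabled through coll_tuned_use_dynamic_rules):
 *
 *   mpirun --mca coll_tuned_use_dynamic_rules 1 \
 *          --mca coll_tuned_bcast_algorithm 6 \
 *          --mca coll_tuned_bcast_algorithm_segmentsize 8192 ./app
 *
 * which would force the binomial tree broadcast with 8 KB segments.
 */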
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
                                          struct ompi_datatype_t *dtype,
                                          int root,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
                 data->user_forced[BCAST].algorithm));

    switch (data->user_forced[BCAST].algorithm) {
    case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
    case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
    case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module,
                                                        data->user_forced[BCAST].segsize,
                                                        data->user_forced[BCAST].chain_fanout );
    case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module,
                                                           data->user_forced[BCAST].segsize );
    case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module,
                                                                data->user_forced[BCAST].segsize );
    case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module,
                                                          data->user_forced[BCAST].segsize );
    case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module,
                                                           data->user_forced[BCAST].segsize );
    default:
        OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                     data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
    } /* switch */
    return (MPI_ERR_ARG);
}

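/*
 * Same dispatch as do_forced above, but the algorithm, fan-in/out and segment
 * size are supplied explicitly by the caller (used when dynamic rules select
 * a specific algorithm).
 */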
int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
                                        struct ompi_datatype_t *dtype,
                                        int root,
                                        struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module,
                                        int algorithm, int faninout, int segsize)
{
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
                 algorithm, faninout, segsize));

    switch (algorithm) {
    case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
    case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
    case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module, segsize, faninout );
    case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize );
    case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize );
    case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize );
    case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize );
    default:
        OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                     algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
    } /* switch */
    return (MPI_ERR_ARG);
}