/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations on segments of
 * count_by_segment elements of the datatype.
 * For the last communication it will update the count in order to limit
 * the number of datatype elements to the original count (original_count).
 *
 * Note that for non-commutative operations we cannot save the memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
                                    ompi_datatype_t* datatype, ompi_op_t* op,
                                    int root, ompi_communicator_t* comm,
                                    mca_coll_base_module_t *module,
                                    ompi_coll_tree_t* tree, int count_by_segment,
                                    int max_outstanding_reqs )
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    size_t typelng;
    ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_get_extent( datatype, &lower_bound, &extent );
    ompi_datatype_type_size( datatype, &typelng );
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = count_by_segment * extent;
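    /* Worked example (hypothetical numbers, for illustration only): with
     * original_count = 1000 and count_by_segment = 384 we get
     * num_segments = (1000 + 384 - 1) / 384 = 3; the first two segments carry
     * 384 elements each and the last one carries 1000 - 2 * 384 = 232 elements
     * (see the recvcount adjustment in the segment loop below).
     */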

    sendtmpbuf = (char*) sendbuf;
    if( sendbuf == MPI_IN_PLACE ) {
        sendtmpbuf = (char *)recvbuf;
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)(num_segments * segment_increment), (unsigned long)segment_increment, max_outstanding_reqs));

    rank = ompi_comm_rank(comm);

    /* non-leaf nodes - wait for children to send me data & forward up
       (if needed) */
    if( tree->tree_nextsize > 0 ) {
        ptrdiff_t true_lower_bound, true_extent, real_segment_size;
        ompi_datatype_get_true_extent( datatype, &true_lower_bound,
                                       &true_extent );

        /* handle non-existent recv buffer (i.e. it is NULL) and
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = (char*)malloc(true_extent +
                                          (original_count - 1) * extent);
            if (accumbuf_free == NULL) {
                line = __LINE__; ret = -1; goto error_hndl;
            }
            accumbuf = accumbuf_free - lower_bound;
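            /* The allocation above is sized with the true extent, and the
             * usable pointer is shifted back by the datatype lower bound so
             * that accumbuf + segindex * segment_increment addresses each
             * segment correctly even for datatypes with a non-zero lower
             * bound; the inbuf/inbuf_free pairs below follow the same
             * convention.
             */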
        }

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplify the loops */
        if (!ompi_op_is_commute(op)) {
            ompi_datatype_copy_content_same_ddt(datatype, original_count,
                                                (char*)accumbuf,
                                                (char*)sendtmpbuf);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent + (count_by_segment - 1) * extent;
        inbuf_free[0] = (char*) malloc(real_segment_size);
        if( inbuf_free[0] == NULL ) {
            line = __LINE__; ret = -1; goto error_hndl;
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is chance to overlap communication -
           allocate second buffer */
        if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
            inbuf_free[1] = (char*) malloc(real_segment_size);
            if( inbuf_free[1] == NULL ) {
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        }

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment */
        for( segindex = 0; segindex <= num_segments; segindex++ ) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if( segindex == (num_segments - 1) )
                recvcount = original_count - count_by_segment * segindex;

            /* for each child */
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
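                /* Double-buffering sketch (illustrative): while the data
                 * already sitting in inbuf[inbi ^ 1] is being reduced, the
                 * next receive is posted into inbuf[inbi]; the index toggle
                 * at the bottom of this loop swaps the roles of the two
                 * buffers, so communication for one segment/child overlaps
                 * computation on the previous one.
                 */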
2005-09-06 09:21:57 +04:00
/* post irecv for current segindex on current child */
2006-10-20 23:47:52 +04:00
if ( segindex < num_segments ) {
void * local_recvbuf = inbuf [ inbi ] ;
if ( 0 = = i ) {
2007-02-15 01:38:38 +03:00
/* for the first step (1st child per segment) and
* commutative operations we might be able to irecv
* directly into the accumulate buffer so that we can
* reduce ( op ) this with our sendbuf in one step as
* ompi_op_reduce only has two buffer pointers ,
* this avoids an extra memory copy .
2006-10-20 23:47:52 +04:00
*
2007-02-15 01:38:38 +03:00
* BUT if the operation is non - commutative or
* we are root and are USING MPI_IN_PLACE this is wrong !
2006-10-20 23:47:52 +04:00
*/
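                        /* Rationale: in the non-commutative case accumbuf
                         * already holds a copy of sendbuf, and with
                         * MPI_IN_PLACE on the root accumbuf aliases recvbuf,
                         * which holds the root's own contribution; receiving
                         * a child's segment straight into it would overwrite
                         * local data before it has been reduced.
                         */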
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_recvbuf = accumbuf + segindex * segment_increment;
                        }
                    }

                    ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype,
                                             tree->tree_next[i],
                                             MCA_COLL_BASE_TAG_REDUCE, comm,
                                             &reqs[inbi]));
                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                }
                /* wait for previous req to complete, if any.
                   if there are no requests reqs[inbi ^ 1] will be
                   MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1],
                                             MPI_STATUSES_IGNORE );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                local_op_buffer = inbuf[inbi ^ 1];
                if( i > 0 ) {
                    /* our first operation is to combine our own [sendbuf] data
                     * with the data we recvd from down stream (but only
                     * if the operation is commutative and if we are not root
                     * and not using MPI_IN_PLACE)
                     */
                    if( 1 == i ) {
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + segindex * segment_increment;
                        }
                    }
                    /* apply operation */
                    ompi_op_reduce(op, local_op_buffer,
                                   accumbuf + segindex * segment_increment,
                                   recvcount, datatype );
                } else if ( segindex > 0 ) {
                    void* accumulator = accumbuf + (segindex - 1) * segment_increment;
                    if( tree->tree_nextsize <= 1 ) {
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (segindex - 1) * segment_increment;
                        }
                    }
                    ompi_op_reduce(op, local_op_buffer, accumulator, prevcount,
                                   datatype );
                    /* all reduction on the data available in this step (i) is
                     * complete; pass the result to the next process unless
                     * you are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        ret = MCA_PML_CALL( send( accumulator, prevcount,
                                                  datatype, tree->tree_prev,
                                                  MCA_COLL_BASE_TAG_REDUCE,
                                                  MCA_PML_BASE_SEND_STANDARD,
                                                  comm) );
                        if (ret != MPI_SUCCESS) {
                            line = __LINE__; goto error_hndl;
                        }
                    }

                    /* we stop when segindex = number of segments
                       (i.e. we do num_segments + 1 steps for pipelining) */
                    if (segindex == num_segments) break;
                }
                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */
        /* clean up */
        if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
        if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
        if( accumbuf_free != NULL ) free(accumbuf_free);
    }

    /* leaf nodes
       Depending on the value of max_outstanding_reqs and
       the number of segments we have two options:
       - send all segments using blocking send to the parent, or
       - avoid flooding the parent nodes by limiting the number of
         outstanding requests to max_outstanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size
       for the current communication, synchronization should be used only
       when the message/segment size is smaller than the eager size.
    */
    else {

        /* If the number of segments does not exceed the maximum number of
           outstanding requests, or there is no limit on the maximum number of
           outstanding requests, we send data to the parent using blocking sends */
        if ((0 == max_outstanding_reqs) ||
            (num_segments <= max_outstanding_reqs)) {

            segindex = 0;
            while ( original_count > 0 ) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL( send((char*)sendbuf +
                                         segindex * segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev,
                                         MCA_COLL_BASE_TAG_REDUCE,
                                         MCA_PML_BASE_SEND_STANDARD,
                                         comm) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                segindex++;
                original_count -= count_by_segment;
            }
        }
        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking synchronous sends,
           - for remaining segments
             - wait for a ssend to complete, and post the next one.
           - wait for all outstanding sends to complete.
        */
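        /* Illustration (hypothetical numbers): with num_segments = 10 and
         * max_outstanding_reqs = 4, segments 0-3 are posted as non-blocking
         * synchronous sends up front; each completion then lets the next
         * segment be posted in the freed slot (creq cycles 0,1,2,3,0,...),
         * and the final wait_all drains whatever is still in flight.
         */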
        else {

            int creq = 0;
            ompi_request_t **sreq = NULL;

            sreq = (ompi_request_t**) calloc( max_outstanding_reqs,
                                              sizeof(ompi_request_t*) );
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                ret = MCA_PML_CALL( isend((char*)sendbuf +
                                          segindex * segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev,
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
                                          &sreq[segindex]) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                original_count -= count_by_segment;
            }

            creq = 0;
            while ( original_count > 0 ) {
                /* wait on a posted request to complete */
                ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE);
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                sreq[creq] = MPI_REQUEST_NULL;

                if ( original_count < count_by_segment ) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL( isend((char*)sendbuf +
                                          segindex * segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev,
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
                                          &sreq[creq]) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait on the remaining requests to complete */
            ret = ompi_request_wait_all( max_outstanding_reqs, sreq,
                                         MPI_STATUSES_IGNORE );
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* free requests */
            free(sreq);
        }
    }
    return OMPI_SUCCESS;

 error_hndl:  /* error handler */
    OPAL_OUTPUT(( ompi_coll_tuned_stream,
                  "ERROR_HNDL: node %d file %s line %d error %d\n",
                  rank, __FILE__, line, ret ));
    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
    if( accumbuf_free != NULL ) free(accumbuf_free);
    return ret;
}

/* Attention: this version of the reduce operations does not
   work for:
   - non-commutative operations
   - segment sizes which are not multiples of the extent of the datatype
     meaning that at least one datatype must fit in the segment!
*/
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
                                        ompi_datatype_t* datatype,
                                        ompi_op_t* op, int root,
                                        ompi_communicator_t* comm,
                                        mca_coll_base_module_t *module,
                                        uint32_t segsize, int fanout,
                                        int max_outstanding_reqs )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize));

    COLL_TUNED_UPDATE_CHAIN( comm, tuned_module, root, fanout );
    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
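    /* Presumably COLL_TUNED_COMPUTED_SEGCOUNT turns the requested segment
     * size in bytes into a count of datatype elements per segment, leaving
     * segcount equal to count (i.e. no segmentation) when segsize is 0 or
     * smaller than one element; see the macro definition for the exact rule.
     */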

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm, module,
                                           data->cached_chain,
                                           segcount, max_outstanding_reqs );
}

int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
                                           int count, ompi_datatype_t* datatype,
                                           ompi_op_t* op, int root,
                                           ompi_communicator_t* comm,
                                           mca_coll_base_module_t *module,
                                           uint32_t segsize,
                                           int max_outstanding_reqs )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_pipeline rank %d ss %5d",
                 ompi_comm_rank(comm), segsize));

    COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm, module,
                                           data->cached_pipeline,
                                           segcount, max_outstanding_reqs );
}

int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf,
                                         int count, ompi_datatype_t* datatype,
                                         ompi_op_t* op, int root,
                                         ompi_communicator_t* comm,
                                         mca_coll_base_module_t *module,
                                         uint32_t segsize,
                                         int max_outstanding_reqs )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_binary rank %d ss %5d",
                 ompi_comm_rank(comm), segsize));

    COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm, module,
                                           data->cached_bintree,
                                           segcount, max_outstanding_reqs );
}

int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
                                           int count, ompi_datatype_t* datatype,
                                           ompi_op_t* op, int root,
                                           ompi_communicator_t* comm,
                                           mca_coll_base_module_t *module,
                                           uint32_t segsize,
                                           int max_outstanding_reqs )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_binomial rank %d ss %5d",
                 ompi_comm_rank(comm), segsize));

    COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm, module,
                                           data->cached_in_order_bmtree,
                                           segcount, max_outstanding_reqs );
}

/*
 * reduce_intra_in_order_binary
 *
 * Function:      Logarithmic reduce operation for non-commutative operations.
 * Accepts:       same as MPI_Reduce()
 * Returns:       MPI_SUCCESS or error code
 */
int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
                                                  int count,
                                                  ompi_datatype_t* datatype,
                                                  ompi_op_t* op, int root,
                                                  ompi_communicator_t* comm,
                                                  mca_coll_base_module_t *module,
                                                  uint32_t segsize,
                                                  int max_outstanding_reqs )
{
    int ret;
    int rank, size, io_root;
    int segcount = count;
    void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
                 rank, segsize));

    COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of
       operations for non-commutative ops.
    */
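    /* Example (hypothetical numbers): with size = 4 and root = 1, io_root is 3;
     * rank 3 reduces into a freshly allocated temporary recvbuf, and rank 1,
     * if it passed MPI_IN_PLACE, first copies its contribution out of recvbuf
     * into a temporary sendbuf so the final result can be received back into
     * recvbuf from rank 3 at the end.
     */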
    io_root = size - 1;
    use_this_sendbuf = sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t tlb, text, lb, ext;
        char *tmpbuf = NULL;

        ompi_datatype_get_extent(datatype, &lb, &ext);
        ompi_datatype_get_true_extent(datatype, &tlb, &text);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            ompi_datatype_copy_content_same_ddt(datatype, count,
                                                (char*)tmpbuf,
                                                (char*)recvbuf);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                          op, io_root, comm, module,
                                          data->cached_in_order_bintree,
                                          segcount, max_outstanding_reqs );
    if (MPI_SUCCESS != ret) { return ret; }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) { return ret; }
            if (MPI_IN_PLACE == sendbuf) {
                free(use_this_sendbuf);
            }

        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { return ret; }
            free(use_this_recvbuf);
        }
    }

    return MPI_SUCCESS;
}

/*
 * Linear functions are copied from the BASIC coll module
 * they do not segment the message and are simple implementations
 * but for some small number of nodes and/or small data sizes they
 * are just as fast as tuned/tree based segmenting operations
 * and as such may be selected by the decision functions
 * These are copied into this module due to the way we select modules
 * in V1. i.e. in V2 we will handle this differently and so will not
 * have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */

/*
 *  reduce_lin_intra
 *
 *  Function:   - reduction using O(N) algorithm
 *  Accepts:    - same as MPI_Reduce()
 *  Returns:    - MPI_SUCCESS or error code
 */

int
ompi_coll_tuned_reduce_intra_basic_linear( void *sbuf, void *rbuf, int count,
                                           struct ompi_datatype_t *dtype,
                                           struct ompi_op_t *op,
                                           int root,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module )
{
2006-10-18 00:20:58 +04:00
int i , rank , err , size ;
ptrdiff_t true_lb , true_extent , lb , extent ;
2005-10-27 03:51:56 +04:00
char * free_buffer = NULL ;
char * pml_buffer = NULL ;
char * inplace_temp = NULL ;
char * inbuf ;
/* Initialize */
rank = ompi_comm_rank ( comm ) ;
size = ompi_comm_size ( comm ) ;
2008-06-09 18:53:58 +04:00
OPAL_OUTPUT ( ( ompi_coll_tuned_stream , " coll:tuned:reduce_intra_basic_linear rank %d " , rank ) ) ;
2005-10-27 03:51:56 +04:00
/* If not root, send data to the root. */
if ( rank ! = root ) {
err = MCA_PML_CALL ( send ( sbuf , count , dtype , root ,
MCA_COLL_BASE_TAG_REDUCE ,
MCA_PML_BASE_SEND_STANDARD , comm ) ) ;
return err ;
}
2007-03-08 03:54:52 +03:00
/* see discussion in ompi_coll_basic_reduce_lin_intra about
extent and true extent */
2006-10-18 06:00:46 +04:00
/* for reducing buffer allocation lengths.... */
2005-10-27 03:51:56 +04:00
    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
        inplace_temp = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == inplace_temp) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        rbuf = inplace_temp - lb;
    }

    if (size > 1) {
        free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;
    }

    /* Initialize the receive buffer. */

    if (rank == (size - 1)) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf,
                                                  (char*)sbuf);
    } else {
        err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
    }
    if (MPI_SUCCESS != err) {
        if (NULL != free_buffer) {
            free(free_buffer);
        }
        return err;
    }

    /* Loop receiving and calling reduction function (C or Fortran). */
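    /* Ordering note: rbuf was seeded with rank (size-1)'s data above, and
     * ompi_op_reduce(op, inbuf, rbuf, ...) folds each buffer in as
     * rbuf = inbuf (op) rbuf (the MPI user-function convention), so looping
     * i from size-2 down to 0 yields data(0) op data(1) op ... op data(size-1),
     * i.e. the canonical rank order needed for non-commutative operations.
     */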
    for (i = size - 2; i >= 0; --i) {
        if (rank == i) {
            inbuf = (char*)sbuf;
        } else {
            err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }
            inbuf = pml_buffer;
        }

        /* Perform the reduction */
        ompi_op_reduce(op, inbuf, rbuf, count, dtype);
    }

    if (NULL != inplace_temp) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf,
                                                  inplace_temp);
        free(inplace_temp);
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }

    /* All done */
    return MPI_SUCCESS;
}

/* copied function (with appropriate renaming) ends here */

/**
 * The following are used by dynamic and forced rules
 *
 * publish details of each algorithm and if it is forced/fixed/locked in
 * as you add methods/algorithms you must update this and the query/map
 * routines
 *
 * this routine is called by the component only
 * this makes sure that the mca parameters are set to their initial values
 * and perms; modules do not call this, they call the forced_getvalues
 * routine instead.
 */
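/* Usage sketch (assuming the usual "coll_tuned_" MCA prefix and that dynamic
 * rules are enabled, e.g. via coll_tuned_use_dynamic_rules=1):
 *
 *   mpirun --mca coll_tuned_use_dynamic_rules 1 \
 *          --mca coll_tuned_reduce_algorithm 3 \
 *          --mca coll_tuned_reduce_algorithm_segmentsize 4096 ./app
 *
 * would force the pipeline reduce (algorithm 3 below) with 4 KB segments.
 */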
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    int rc, requested_alg, max_alg = 6, max_requests;

    ompi_coll_tuned_forced_max_algorithms[REDUCE] = max_alg;

    rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
                                 "reduce_algorithm_count",
                                 "Number of reduce algorithms available",
                                 false, true, max_alg, NULL);
    mca_param_indices->algorithm_param_index
        = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                                 "reduce_algorithm",
                                 "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary",
                                 false, false, 0, NULL);
    if (mca_param_indices->algorithm_param_index < 0) {
        return mca_param_indices->algorithm_param_index;
    }
    mca_base_param_lookup_int(mca_param_indices->algorithm_param_index, &(requested_alg));
    if( 0 > requested_alg || requested_alg > max_alg ) {
        if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
            opal_output( 0, "Reduce algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
                         requested_alg, max_alg );
        }
        mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0);
    }

    mca_param_indices->segsize_param_index
        = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                                 "reduce_algorithm_segmentsize",
                                 "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                 false, false, 0, NULL);

    mca_param_indices->tree_fanout_param_index
        = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                                 "reduce_algorithm_tree_fanout",
                                 "Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                 false, false,
                                 ompi_coll_tuned_init_tree_fanout, /* get system wide default */
                                 NULL);

    mca_param_indices->chain_fanout_param_index
        = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                                 "reduce_algorithm_chain_fanout",
                                 "Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                                 false, false,
                                 ompi_coll_tuned_init_chain_fanout, /* get system wide default */
                                 NULL);

    mca_param_indices->max_requests_param_index
        = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                                 "reduce_algorithm_max_requests",
                                 "Maximum number of outstanding send requests on leaf nodes. 0 means no limit.",
                                 false, false, 0, /* no limit for reduce by default */
                                 NULL);
    if (mca_param_indices->max_requests_param_index < 0) {
        return mca_param_indices->max_requests_param_index;
    }
    mca_base_param_lookup_int(mca_param_indices->max_requests_param_index, &(max_requests));
    if( max_requests < 0 ) {
        if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
            opal_output( 0, "Maximum outstanding requests must be a positive number or 0. Initializing to 0 (no limit).\n" );
        }
        mca_base_param_set_int( mca_param_indices->max_requests_param_index, 0);
    }

    return (MPI_SUCCESS);
}

int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
                                           struct ompi_datatype_t *dtype,
                                           struct ompi_op_t *op, int root,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    const int segsize      = data->user_forced[REDUCE].segsize;
    const int chain_fanout = data->user_forced[REDUCE].chain_fanout;
    const int max_requests = data->user_forced[REDUCE].max_requests;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_do_forced selected algorithm %d",
                 data->user_forced[REDUCE].algorithm));

    switch (data->user_forced[REDUCE].algorithm) {
    case (0):  return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype,
                                                             op, root, comm, module);
    case (1):  return ompi_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
                                                                op, root, comm, module);
    case (2):  return ompi_coll_tuned_reduce_intra_chain(sbuf, rbuf, count, dtype,
                                                         op, root, comm, module,
                                                         segsize, chain_fanout, max_requests);
    case (3):  return ompi_coll_tuned_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
                                                            segsize, max_requests);
    case (4):  return ompi_coll_tuned_reduce_intra_binary(sbuf, rbuf, count, dtype,
                                                          op, root, comm, module,
                                                          segsize, max_requests);
    case (5):  return ompi_coll_tuned_reduce_intra_binomial(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
                                                            segsize, max_requests);
    case (6):  return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
                                                                   op, root, comm, module,
                                                                   segsize, max_requests);
    default:
        OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                     data->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
        return (MPI_ERR_ARG);
    } /* switch */
}

int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
                                         struct ompi_datatype_t *dtype,
                                         struct ompi_op_t *op, int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module,
                                         int algorithm, int faninout,
                                         int segsize, int max_requests )
{
    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
                 algorithm, faninout, segsize));

    switch (algorithm) {
    case (0):  return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype,
                                                             op, root, comm, module);
    case (1):  return ompi_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
                                                                op, root, comm, module);
    case (2):  return ompi_coll_tuned_reduce_intra_chain(sbuf, rbuf, count, dtype,
                                                         op, root, comm, module,
                                                         segsize, faninout, max_requests);
    case (3):  return ompi_coll_tuned_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
                                                            segsize, max_requests);
    case (4):  return ompi_coll_tuned_reduce_intra_binary(sbuf, rbuf, count, dtype,
                                                          op, root, comm, module,
                                                          segsize, max_requests);
    case (5):  return ompi_coll_tuned_reduce_intra_binomial(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
                                                            segsize, max_requests);
    case (6):  return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
                                                                   op, root, comm, module,
                                                                   segsize, max_requests);
    default:
        OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                     algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
        return (MPI_ERR_ARG);
    } /* switch */
}