/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
# include "ompi_config.h"
# include "mpi.h"
# include "ompi/include/constants.h"
# include "datatype/datatype.h"
# include "communicator/communicator.h"
# include "mca/coll/coll.h"
# include "mca/coll/base/coll_tags.h"
# include "mca/pml/pml.h"
# include "op/op.h"
# include "coll_tuned.h"
# include "coll_tuned_topo.h"
2005-09-10 03:05:17 +04:00
# include <sys/types.h>
# include <unistd.h>
/* temp debug routines */
static int dump_buf_int(char *ptr, int count, char *comment, int rank);

static int dump_buf_int(char *ptr, int count, char *comment, int rank)
{
    int i = 0;
    int *tptr;
    int c = 0;

    tptr = (int *) ptr;

    printf("%1d ", rank);
    if (comment) printf("%s ", comment);

    if (count < 0) {
        printf("cnt %d?\n", count);
        return (0);
    }

    if (count > 5) c = 5;
    else c = count;

    printf("Cnt %1d ", count);
    for (i = 0; i < c; i++) {
        printf("%1d [%1d] ", i, *tptr++);
    }
    if (c != count) {
        tptr = (int *) ptr;
        printf("... %1d [%1d] ", count - 1, tptr[count - 1]);
    }
    printf("\n");
    return (0);
}
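/* Illustrative use of the debug helper above (hypothetical call site):
 *   dump_buf_int(accumbuf, recvcount, "accum after op", rank);
 * would print the rank, the comment, the first five elements and the last
 * element of an int buffer. It is only intended for temporary tracing of
 * integer reductions. */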
/* Attention: this version of the reduce operation does not work for:
   - non-commutative operations
   - segment sizes which are not multiples of the extent of the datatype,
     meaning that at least one element of the datatype must fit in the segment!
 */
int mca_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
                                       ompi_datatype_t* datatype, ompi_op_t* op,
                                       int root, ompi_communicator_t* comm, uint32_t segsize,
                                       int fanout)
{
    int ret, line, rank, size, i = 0;
    int recvcount, sendcount, prevcount, inbi, previnbi;
    int segcount, segindex, num_segments, realsegsize;
    char *inbuf[2] = {NULL, NULL};
    char *accumbuf = NULL;
    char *sendtmpbuf = NULL;
    long ext, lb;
    int typelng;
    int allocedaccumbuf = 0;   /* initialized so the error handler can test it safely */
    ompi_request_t *reqs[2];
    ompi_coll_chain_t *chain;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((mca_coll_tuned_stream, "coll:tuned:reduce_intra_chain rank %d", rank));
    /* ----------------------------------------------------------------- */
    /* setup the chain topology.
     * if the previous chain topology is the same, then use the cached copy,
     * otherwise recreate it.
     */
    if ((comm->c_coll_selected_data->cached_chain) && (comm->c_coll_selected_data->cached_chain_root == root)
        && (comm->c_coll_selected_data->cached_chain_fanout == fanout)) {
        chain = comm->c_coll_selected_data->cached_chain;
    }
    else {
        /* destroy previous chain if defined */
        if (comm->c_coll_selected_data->cached_chain) {
            ompi_coll_tuned_topo_destroy_chain(&comm->c_coll_selected_data->cached_chain);
        }
        comm->c_coll_selected_data->cached_chain = chain = ompi_coll_tuned_topo_build_chain(fanout, comm, root);
        comm->c_coll_selected_data->cached_chain_root = root;
        comm->c_coll_selected_data->cached_chain_fanout = fanout;
    }
    /* ----------------------------------------------------------------- */
    /* Determine number of segments and number of elements
       sent per operation */
    ompi_ddt_get_extent(datatype, &lb, &ext);
    ompi_ddt_type_size(datatype, &typelng);

    if (segsize > 0) {
        segcount     = segsize / typelng;
        num_segments = count / segcount;
        if ((count % segcount) != 0) num_segments++;
    } else {
        segcount     = count;
        num_segments = 1;
    }
    realsegsize = segcount * ext;
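    /* Purely illustrative example of the arithmetic above (hypothetical values,
     * not defaults): with segsize = 32768 bytes, a 4-byte integer type and
     * count = 100000 elements, segcount = 32768/4 = 8192,
     * num_segments = ceil(100000/8192) = 13, and realsegsize = 8192 * ext
     * bytes, i.e. the stride between consecutive segments in the buffers. */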
/*    printf("rank %d root %d count %d \t\t segsize %d typesize %d typeext %d realsegsize %d segcount %d num_segments %d\n", */
/*           rank, root, count, segsize, typelng, ext, realsegsize, segcount, num_segments); */
/*    ompi_coll_tuned_topo_dump_chain (chain, rank); */
    if (sendbuf != MPI_IN_PLACE) {
        sendtmpbuf = (char*) sendbuf;
    }
    else {
        sendtmpbuf = (char*) recvbuf;
    }

    /* handle special case when size == 1 */
    if (1 == size) {
        if (sendbuf != MPI_IN_PLACE) {
            ompi_ddt_copy_content_same_ddt(datatype, count, recvbuf, sendbuf);
        }
        return MPI_SUCCESS;
    }

    /* handle a non-existent recv buffer (i.e. it is NULL, as the basic allreduce uses!) */
    if (recvbuf) {
        accumbuf = (char*) recvbuf;
        allocedaccumbuf = 0;
    }
    else {
        /* the accumulation buffer is indexed per segment below, so it must hold all segments */
        accumbuf = (char*) malloc(realsegsize * num_segments);
        if (accumbuf == NULL) { line = __LINE__; ret = -1; goto error_hndl; }
        allocedaccumbuf = 1;
    }

    /* ----------------------------------------------------------------- */
    /* non-leaf nodes -
       wait for children to send me data & forward up (if needed) */
    if (chain->chain_nextsize > 0) {
        /* Allocate two buffers for incoming segments */
        inbuf[0] = (char*) malloc(realsegsize);
        if (inbuf[0] == NULL) { line = __LINE__; ret = -1; goto error_hndl; }
        /* if there is a chance to overlap communication -
           allocate a second buffer */
        if (num_segments > 1 || chain->chain_nextsize > 1) {
            inbuf[1] = (char*) malloc(realsegsize);
            if (inbuf[1] == NULL) { line = __LINE__; ret = -1; goto error_hndl; }
        } else {
            inbuf[1] = NULL;
        }

        /* reset input buffer index and receive count */
        inbi = 0; recvcount = 0;
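        /* The two inbuf slots are used round-robin via inbi/previnbi below:
         * while the segment already sitting in inbuf[previnbi] is being
         * reduced, the next irecv is posted into inbuf[inbi], which is what
         * provides the communication/computation overlap this routine is
         * built around. */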
        /* for each segment */
        for (segindex = 0; segindex <= num_segments; segindex++) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            if (segindex < num_segments - 1) { recvcount = segcount; }
            else { recvcount = count - segcount * segindex; }

            /* for each child */
            for (i = 0; i < chain->chain_nextsize; i++) {
                /*
                   We try to overlap communication:
                   either with the next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if (segindex < num_segments) {
                    if (0 == i) { /* for the first step (1st child per segment) */
                        /* we might be able to irecv directly into the accumulate buffer so that we */
                        /* can reduce(op) this with our sendbuf in one step */
                        /* as ompi_op_reduce only has two buffer pointers, this avoids */
                        /* an extra memory copy GEF */

                        /* BUT if we are root and are using MPI_IN_PLACE this would be wrong! */
                        /* the check for root might not be needed as it should be checked higher up */
                        if ((MPI_IN_PLACE == sendbuf) && (rank == root)) {
                            ret = MCA_PML_CALL(irecv(inbuf[inbi],
                                                     recvcount, datatype,
                                                     chain->chain_next[i],
                                                     MCA_COLL_BASE_TAG_REDUCE,
                                                     comm, &reqs[inbi]));
                            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                        } else {
                            ret = MCA_PML_CALL(irecv(accumbuf + segindex * realsegsize,
                                                     recvcount, datatype,
                                                     chain->chain_next[i],
                                                     MCA_COLL_BASE_TAG_REDUCE,
                                                     comm, &reqs[inbi]));
                            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                        }
                    } /* if first child */
                    else { /* perform an irecv into the standard inbuf */
                        ret = MCA_PML_CALL(irecv(inbuf[inbi], recvcount, datatype,
                                                 chain->chain_next[i],
                                                 MCA_COLL_BASE_TAG_REDUCE,
                                                 comm, &reqs[inbi]));
                        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                    }
                }
                /* wait for previous req to complete, if any */
                previnbi = (inbi + 1) % 2;
                if (i > 0) {
                    /* wait on data from previous child for current segment */
                    ret = ompi_request_wait_all(1, &reqs[previnbi], MPI_STATUSES_IGNORE);
                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                    /* apply operation */
                    if (1 == i) {
                        /* our first operation is to combine our own [sendbuf] data with the data we received from downstream */
                        /* (unless we are the root using MPI_IN_PLACE, in which case accumbuf already holds our own data) */
                        if ((MPI_IN_PLACE == sendbuf) && (rank == root)) {
                            ompi_op_reduce(op, inbuf[previnbi], accumbuf + segindex * realsegsize, recvcount, datatype);
                        }
                        else {
                            ompi_op_reduce(op, sendtmpbuf + segindex * realsegsize, accumbuf + segindex * realsegsize, recvcount, datatype);
                        }
                    }
                    else { /* not the first child, we can accumulate straight into accumbuf from the inbuf buffers */
                        ompi_op_reduce(op, inbuf[previnbi], accumbuf + segindex * realsegsize, recvcount, datatype);
                    } /* end of first-child special case */
                } else if (i == 0 && segindex > 0) {
                    /* wait on data from last child for previous segment */
                    ret = ompi_request_wait_all(1, &reqs[previnbi], MPI_STATUSES_IGNORE);
                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                    if (chain->chain_nextsize > 1) { /* if I have more than one child */
                        /* I reduce the data in the inbuf and the accumbuf */
                        /* as the accumbuf already contains some accumulated results */
                        ompi_op_reduce(op, inbuf[previnbi], accumbuf + (segindex - 1) * realsegsize, prevcount, datatype);
                    }
                    else { /* I have only one child, so I must combine my data (sendbuf) with the accumulated data in accumbuf */
                        /* (unless we are the root using MPI_IN_PLACE, in which case accumbuf already holds our own data) */
                        if ((MPI_IN_PLACE == sendbuf) && (rank == root)) {
                            ompi_op_reduce(op, inbuf[previnbi], accumbuf + (segindex - 1) * realsegsize, prevcount, datatype);
                        }
                        else {
                            ompi_op_reduce(op, sendtmpbuf + (segindex - 1) * realsegsize, accumbuf + (segindex - 1) * realsegsize, prevcount, datatype);
                        }
                    }

                    /* all reduction on the data available this step (i) is complete, pass it to the next process unless we are the root */
                    if (rank != root) {
                        /* send combined/accumulated data to parent */
                        ret = MCA_PML_CALL(send(accumbuf + (segindex - 1) * realsegsize,
                                                prevcount, datatype, chain->chain_prev,
                                                MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm));
                        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                    }

                    /* we stop when segindex == num_segments (i.e. we do num_segments+1 steps to allow for pipelining) */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = previnbi;
            } /* end of for each child */
        } /* end of for each segment */
        /* clean up */
        if (inbuf != NULL) {
            if (inbuf[0] != NULL) free(inbuf[0]);
            if (inbuf[1] != NULL) free(inbuf[1]);
            if (allocedaccumbuf) free(accumbuf);
        }
    }
    /* leaf nodes */
    else {
        /* Send segmented data to parents */
        for (segindex = 0; segindex < num_segments; segindex++) {
            if (segindex < num_segments - 1) sendcount = segcount;
            else sendcount = count - segindex * segcount;
            ret = MCA_PML_CALL(send((char*)sendbuf + segindex * realsegsize, sendcount,
                                    datatype, chain->chain_prev,
                                    MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }
        /* a leaf never uses the accumulation buffer, so free it if we allocated one */
        if (allocedaccumbuf) free(accumbuf);
    }

    return MPI_SUCCESS;

    /* error handler */
 error_hndl:
    OPAL_OUTPUT((mca_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret));
    if (inbuf != NULL) {
        if (inbuf[0] != NULL) free(inbuf[0]);
        if (inbuf[1] != NULL) free(inbuf[1]);
        if (allocedaccumbuf) free(accumbuf);
    }
    return ret;
}
int mca_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
                                          int count, ompi_datatype_t* datatype,
                                          ompi_op_t* op, int root,
                                          ompi_communicator_t* comm, uint32_t segsize )
{
    int rank;

    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((mca_coll_tuned_stream, "coll:tuned:reduce_intra_pipeline rank %d", rank));

    return mca_coll_tuned_reduce_intra_chain( sendbuf, recvbuf, count,
                                              datatype, op, root, comm,
                                              segsize, 1 );
}
/*
 * Linear functions are copied from the BASIC coll module.
 * They do not segment the message and are simple implementations,
 * but for small numbers of nodes and/or small data sizes they
 * are just as fast as the tuned/tree-based segmenting operations
 * and as such may be selected by the decision functions.
 * These are copied into this module due to the way we select modules
 * in V1, i.e. in V2 we will handle this differently and so will not
 * have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */

/*
 *  reduce_lin_intra
 *
 *  Function:   - reduction using O(N) algorithm
 *  Accepts:    - same as MPI_Reduce()
 *  Returns:    - MPI_SUCCESS or error code
 */
int
mca_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
                                         struct ompi_datatype_t *dtype,
                                         struct ompi_op_t *op,
                                         int root, struct ompi_communicator_t *comm)
{
    int i;
    int rank;
    int err;
    int size;
    long true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;
    char *inplace_temp = NULL;
    char *inbuf;

    /* Initialize */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OPAL_OUTPUT((mca_coll_tuned_stream, "coll:tuned:reduce_intra_basic_linear rank %d", rank));

    /* If not root, send data to the root. */
    if (rank != root) {
        err = MCA_PML_CALL(send(sbuf, count, dtype, root,
                                MCA_COLL_BASE_TAG_REDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        return err;
    }

    /* see the discussion in mca_coll_basic_reduce_lin_intra about extent and true extent */
    /* for reducing buffer allocation lengths.... */
    ompi_ddt_get_extent(dtype, &lb, &extent);
    ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
        inplace_temp = malloc(true_extent + (count - 1) * extent);
        if (NULL == inplace_temp) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        rbuf = inplace_temp - lb;
    }
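    /* Note on the pointer arithmetic above: the temporary buffer is sized with
     * true_extent + (count - 1) * extent bytes, and the usable pointer is
     * shifted by -lb so that the element whose data starts at the datatype's
     * lower bound falls inside the allocation; the same trick is applied to
     * pml_buffer below. */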
    if (size > 1) {
        free_buffer = malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;
    }

    /* Initialize the receive buffer. */
    if (rank == (size - 1)) {
        err = ompi_ddt_copy_content_same_ddt(dtype, count, rbuf, sbuf);
    } else {
        err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
    }
    if (MPI_SUCCESS != err) {
        if (NULL != free_buffer) {
            free(free_buffer);
        }
        return err;
    }

    /* Loop receiving and calling reduction function (C or Fortran). */
    for (i = size - 2; i >= 0; --i) {
        if (rank == i) {
            inbuf = sbuf;
        } else {
            err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }
            inbuf = pml_buffer;
        }

        /* Perform the reduction */
        ompi_op_reduce(op, inbuf, rbuf, count, dtype);
    }

    if (NULL != inplace_temp) {
        err = ompi_ddt_copy_content_same_ddt(dtype, count, sbuf, inplace_temp);
        free(inplace_temp);
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }

    /* All done */
    return MPI_SUCCESS;
}
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules. */
/* They publish details of each algorithm and whether it is forced/fixed/locked in. */
/* As you add methods/algorithms you must update this and the query/map routines. */

int mca_coll_tuned_reduce_intra_check_forced ( )
{
    mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                           "reduce_algorithm",
                           "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline",
                           false, false, mca_coll_tuned_reduce_forced_choice,
                           &mca_coll_tuned_reduce_forced_choice);

    mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                           "reduce_algorithm_segmentsize",
                           "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                           false, false, mca_coll_tuned_reduce_forced_segsize,
                           &mca_coll_tuned_reduce_forced_segsize);

    mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                           "reduce_algorithm_tree_fanout",
                           "Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                           false, false,
                           mca_coll_tuned_init_tree_fanout, /* get system wide default */
                           &mca_coll_tuned_reduce_forced_tree_fanout);

    mca_base_param_reg_int(&mca_coll_tuned_component.collm_version,
                           "reduce_algorithm_chain_fanout",
                           "Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                           false, false,
                           mca_coll_tuned_init_chain_fanout, /* get system wide default */
                           &mca_coll_tuned_reduce_forced_chain_fanout);

    return (MPI_SUCCESS);
}
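/* Illustrative (not normative) use of the parameters registered above,
 * assuming the usual "coll_tuned_" prefix the component registration gives
 * them; e.g. forcing the chained reduce with 32 KB segments and a fanout of 4:
 *
 *   mpirun --mca coll_tuned_reduce_algorithm 2 \
 *          --mca coll_tuned_reduce_algorithm_segmentsize 32768 \
 *          --mca coll_tuned_reduce_algorithm_chain_fanout 4  ...
 */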
int mca_coll_tuned_reduce_intra_query ( )
{
    return (3); /* 3 algorithms available */
}
int mca_coll_tuned_reduce_intra_do_forced(void *sbuf, void *rbuf, int count,
                                          struct ompi_datatype_t *dtype,
                                          struct ompi_op_t *op, int root,
                                          struct ompi_communicator_t *comm)
{
    OPAL_OUTPUT((mca_coll_tuned_stream, "coll:tuned:reduce_intra_do_forced selected algorithm %d", mca_coll_tuned_reduce_forced_choice));

    switch (mca_coll_tuned_reduce_forced_choice) {
    case (0):   return mca_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, root, comm);
    case (1):   return mca_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, root, comm);
    case (2):   return mca_coll_tuned_reduce_intra_chain(sbuf, rbuf, count, dtype, op, root, comm,
                                                         mca_coll_tuned_reduce_forced_segsize, mca_coll_tuned_reduce_forced_chain_fanout);
    case (3):   return mca_coll_tuned_reduce_intra_pipeline(sbuf, rbuf, count, dtype, op, root, comm,
                                                            mca_coll_tuned_reduce_forced_segsize);
    default:
        OPAL_OUTPUT((mca_coll_tuned_stream, "coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                     mca_coll_tuned_reduce_forced_choice, mca_coll_tuned_reduce_intra_query()));
        return (MPI_ERR_ARG);
    } /* switch */
}