Cleanup of the generic reduce function and a possible (low-probability) bug fix.

- Fix line lengths and some of the comments.
- Possible bug fix (not yet exposed by any test so far): temporary buffers were
  allocated as multiples of extent instead of true_extent + (count - 1) * extent.

Everything still passes the Intel tests over tcp and btl mx on up to 64 nodes.

This commit was SVN r13956.
This commit is contained in:
Parent: 57cbafafd5
Commit: 9780a000ba
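The sizing rule the commit message refers to can be made concrete with a small,
self-contained sketch (illustrative only, not Open MPI code: the numbers are
invented, and extent/true_extent merely stand in for what
ompi_ddt_get_extent()/ompi_ddt_get_true_extent() report for a datatype).
A buffer that must hold count elements needs true_extent + (count - 1) * extent
bytes; count * extent can fall short whenever the true extent exceeds the
extent, e.g. for a resized datatype.

/* Illustration only: why count * extent can under-allocate.
 * "extent" and "true_extent" stand in for the values that
 * ompi_ddt_get_extent() / ompi_ddt_get_true_extent() would return. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    ptrdiff_t extent      = 8;   /* spacing between consecutive elements (hypothetical) */
    ptrdiff_t true_extent = 12;  /* bytes actually occupied by one element (hypothetical) */
    ptrdiff_t lower_bound = 0;   /* assumed zero to keep the sketch simple */
    int       count       = 100; /* elements the temporary buffer must hold */

    size_t old_size = (size_t)(count * extent);                     /* old rule: 800 bytes  */
    size_t new_size = (size_t)(true_extent + (count - 1) * extent); /* fixed rule: 804 bytes */
    printf("old allocation = %zu bytes, actually required = %zu bytes\n",
           old_size, new_size);

    /* The patch also keeps two pointers per buffer: the raw pointer for
     * free(), and a pointer shifted by the lower bound for use with the
     * datatype engine. */
    char *buf_free = (char*) malloc(new_size);
    if (NULL == buf_free) return 1;
    char *buf = buf_free - lower_bound;

    (void)buf;        /* buf would be handed to the pack/reduce machinery */
    free(buf_free);   /* always free the raw pointer */
    return 0;
}

With these invented numbers the old rule under-allocates by 4 bytes (800 vs 804),
which is the low-probability overrun the commit message describes; the
raw-pointer/shifted-pointer pair mirrors the accumbuf_free/accumbuf and
inbuf_free/inbuf split introduced in the patch.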
@@ -45,9 +45,10 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
                                     int root, ompi_communicator_t* comm,
                                     ompi_coll_tree_t* tree, int count_by_segment )
 {
-    char *inbuf[2] = {(char*)NULL, (char*)NULL};
-    char *local_op_buffer = NULL, *accumbuf = NULL, *sendtmpbuf = NULL;
-    ptrdiff_t extent, lower_bound, realsegsize;
+    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
+    char *accumbuf = NULL, *accumbuf_free = NULL;
+    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
+    ptrdiff_t extent, lower_bound, segment_increment;
     size_t typelng;
     ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
     int num_segments, line, ret, segindex, i, rank;
@@ -60,7 +61,7 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
     ompi_ddt_get_extent( datatype, &lower_bound, &extent );
     ompi_ddt_type_size( datatype, &typelng );
     num_segments = (original_count + count_by_segment - 1) / count_by_segment;
-    realsegsize = count_by_segment * extent;
+    segment_increment = count_by_segment * extent;
 
     sendtmpbuf = (char*) sendbuf;
     if( sendbuf == MPI_IN_PLACE ) {
@@ -69,19 +70,24 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
 
     rank = ompi_comm_rank(comm);
 
-    /* non-leaf nodes - wait for children to send me data & forward up (if needed) */
+    /* non-leaf nodes - wait for children to send me data & forward up
+       (if needed) */
     if( tree->tree_nextsize > 0 ) {
+        ptrdiff_t true_lower_bound, true_extent, real_segment_size;
+        ompi_ddt_get_true_extent( datatype, &true_lower_bound,
+                                  &true_extent );
+
         /* handle non existant recv buffer (i.e. its NULL) and
            protect the recv buffer on non-root nodes */
         accumbuf = (char*)recvbuf;
         if( (NULL == accumbuf) || (root != rank) ) {
-            ptrdiff_t true_lower_bound, true_extent;
-            ompi_ddt_get_true_extent( datatype, &true_lower_bound,
-                                      &true_extent );
-            /* Allocate temporary accumulator buffer.
-               TODO: The size of the buffer can be optimized to be segment size */
-            accumbuf = (char*)malloc(true_extent + (original_count - 1) * extent);
-            if (accumbuf == NULL) { line = __LINE__; ret = -1; goto error_hndl; }
+            /* Allocate temporary accumulator buffer. */
+            accumbuf_free = (char*)malloc(true_extent +
+                                          (original_count - 1) * extent);
+            if (accumbuf_free == NULL) {
+                line = __LINE__; ret = -1; goto error_hndl;
+            }
+            accumbuf = accumbuf_free - lower_bound;
         }
 
         /* If this is a non-commutative operation we must copy
@@ -92,16 +98,22 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
                                               (char*)sendtmpbuf);
         }
         /* Allocate two buffers for incoming segments */
-        inbuf[0] = (char*) malloc(realsegsize);
-        if( inbuf[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; }
+        real_segment_size = true_extent + (count_by_segment - 1) * extent;
+        inbuf_free[0] = (char*) malloc(real_segment_size);
+        if( inbuf_free[0] == NULL ) {
+            line = __LINE__; ret = -1; goto error_hndl;
+        }
+        inbuf[0] = inbuf_free[0] - lower_bound;
         /* if there is chance to overlap communication -
            allocate second buffer */
         if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
-            inbuf[1] = (char*) malloc(realsegsize);
-            if( inbuf[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl;}
-        } else {
-            inbuf[1] = NULL;
+            inbuf_free[1] = (char*) malloc(real_segment_size);
+            if( inbuf_free[1] == NULL ) {
+                line = __LINE__; ret = -1; goto error_hndl;
+            }
+            inbuf[1] = inbuf_free[1] - lower_bound;
         }
 
         /* reset input buffer index and receive count */
         inbi = 0;
         recvcount = 0;
@@ -135,20 +147,22 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
                      */
                     if( (ompi_op_is_commute(op)) &&
                         !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
-                        local_recvbuf = accumbuf + segindex * realsegsize;
+                        local_recvbuf = accumbuf + segindex * segment_increment;
                     }
                 }
 
                 ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype,
                                          tree->tree_next[i],
                                          MCA_COLL_BASE_TAG_REDUCE, comm,
                                          &reqs[inbi]));
                 if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
             }
             /* wait for previous req to complete, if any.
-               if there are no requests reqs[inbi ^1] will be MPI_REQUEST_NULL. */
+               if there are no requests reqs[inbi ^1] will be
+               MPI_REQUEST_NULL. */
             /* wait on data from last child for previous segment */
-            ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1], MPI_STATUSES_IGNORE );
+            ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1],
+                                         MPI_STATUSES_IGNORE );
             if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
             local_op_buffer = inbuf[inbi ^ 1];
             if( i > 0 ) {
@@ -160,38 +174,41 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
                 if( 1 == i ) {
                     if( (ompi_op_is_commute(op)) &&
                         !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
-                        local_op_buffer = sendtmpbuf + segindex * realsegsize;
+                        local_op_buffer = sendtmpbuf + segindex * segment_increment;
                     }
                 }
                 /* apply operation */
                 ompi_op_reduce(op, local_op_buffer,
-                               accumbuf + segindex * realsegsize, recvcount,
-                               datatype );
+                               accumbuf + segindex * segment_increment,
+                               recvcount, datatype );
             } else if ( segindex > 0 ) {
-                void* accumulator = accumbuf + (segindex-1) * realsegsize;
+                void* accumulator = accumbuf + (segindex-1) * segment_increment;
                 if( tree->tree_nextsize <= 1 ) {
                     if( (ompi_op_is_commute(op)) &&
                         !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
-                        local_op_buffer = sendtmpbuf + (segindex-1) * realsegsize;
+                        local_op_buffer = sendtmpbuf + (segindex-1) * segment_increment;
                     }
                 }
                 ompi_op_reduce(op, local_op_buffer, accumulator, prevcount,
                                datatype );
 
-                /* all reduced on available data this step (i) complete, pass to
-                 * the next process unless your the root
+                /* all reduced on available data this step (i) complete,
+                 * pass to the next process unless you are the root.
                  */
                 if (rank != tree->tree_root) {
                     /* send combined/accumulated data to parent */
-                    ret = MCA_PML_CALL( send( accumulator, prevcount, datatype,
-                                              tree->tree_prev,
+                    ret = MCA_PML_CALL( send( accumulator, prevcount,
+                                              datatype, tree->tree_prev,
                                               MCA_COLL_BASE_TAG_REDUCE,
-                                              MCA_PML_BASE_SEND_STANDARD, comm) );
-                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
+                                              MCA_PML_BASE_SEND_STANDARD,
+                                              comm) );
+                    if (ret != MPI_SUCCESS) {
+                        line = __LINE__; goto error_hndl;
+                    }
                 }
 
                 /* we stop when segindex = number of segments
-                   (i.e. we do num_segment+1 steps to allow for pipelining */
+                   (i.e. we do num_segment+1 steps for pipelining */
                 if (segindex == num_segments) break;
             }
 
@@ -201,9 +218,9 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
         } /* end of for each segment */
 
         /* clean up */
-        if( inbuf[0] != NULL) free(inbuf[0]);
-        if( inbuf[1] != NULL) free(inbuf[1]);
-        if( (NULL == recvbuf) || (root != rank) ) free(accumbuf);
+        if( inbuf_free[0] != NULL) free(inbuf_free[0]);
+        if( inbuf_free[1] != NULL) free(inbuf_free[1]);
+        if( accumbuf_free != NULL ) free(accumbuf_free);
     }
 
     /* leaf nodes */
@@ -211,10 +228,14 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
         /* Send segmented data to parents */
         segindex = 0;
         while( original_count > 0 ) {
-            if( original_count < count_by_segment ) count_by_segment = original_count;
-            ret = MCA_PML_CALL( send((char*)sendbuf + segindex * realsegsize, count_by_segment,
-                                     datatype, tree->tree_prev,
-                                     MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) );
+            if( original_count < count_by_segment ) {
+                count_by_segment = original_count;
+            }
+            ret = MCA_PML_CALL( send((char*)sendbuf +
+                                     segindex * segment_increment,
+                                     count_by_segment, datatype,
+                                     tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE,
+                                     MCA_PML_BASE_SEND_STANDARD, comm) );
             if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
             segindex++;
             original_count -= count_by_segment;
@@ -223,11 +244,12 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
     return OMPI_SUCCESS;
 
  error_hndl:  /* error handler */
-    OPAL_OUTPUT (( ompi_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n",
+    OPAL_OUTPUT (( ompi_coll_tuned_stream,
+                   "ERROR_HNDL: node %d file %s line %d error %d\n",
                    rank, __FILE__, line, ret ));
-    if( inbuf[0] != NULL ) free(inbuf[0]);
-    if( inbuf[1] != NULL ) free(inbuf[1]);
-    if( (NULL == recvbuf) || ((root != rank) && (NULL != accumbuf)) ) free(accumbuf);
+    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
+    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
+    if( accumbuf_free != NULL ) free(accumbuf);
     return ret;
 }
 
@@ -239,9 +261,10 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c
  */
 
 int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
-                                        ompi_datatype_t* datatype, ompi_op_t* op,
-                                        int root, ompi_communicator_t* comm, uint32_t segsize,
-                                        int fanout)
+                                        ompi_datatype_t* datatype,
+                                        ompi_op_t* op, int root,
+                                        ompi_communicator_t* comm,
+                                        uint32_t segsize, int fanout)
 {
     int segcount = count;
     size_t typelng;
@@ -256,15 +279,18 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
     ompi_ddt_type_size( datatype, &typelng );
     COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
 
-    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
-                                           comm->c_coll_selected_data->cached_chain, segcount );
+    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
+                                           op, root, comm,
+                                           comm->c_coll_selected_data->cached_chain,
+                                           segcount );
 }
 
 
 int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
                                            int count, ompi_datatype_t* datatype,
                                            ompi_op_t* op, int root,
-                                           ompi_communicator_t* comm, uint32_t segsize )
+                                           ompi_communicator_t* comm,
+                                           uint32_t segsize )
 {
     int segcount = count;
     size_t typelng;
@@ -281,14 +307,17 @@ int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
     ompi_ddt_type_size( datatype, &typelng );
     COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
 
-    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
-                                           comm->c_coll_selected_data->cached_pipeline, segcount );
+    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
+                                           op, root, comm,
+                                           comm->c_coll_selected_data->cached_pipeline,
+                                           segcount );
 }
 
 int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf,
                                          int count, ompi_datatype_t* datatype,
                                          ompi_op_t* op, int root,
-                                         ompi_communicator_t* comm, uint32_t segsize )
+                                         ompi_communicator_t* comm,
+                                         uint32_t segsize )
 {
     int segcount = count;
     size_t typelng;
@@ -305,14 +334,17 @@ int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf,
     ompi_ddt_type_size( datatype, &typelng );
     COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
 
-    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
-                                           comm->c_coll_selected_data->cached_bintree, segcount );
+    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
+                                           op, root, comm,
+                                           comm->c_coll_selected_data->cached_bintree,
+                                           segcount );
 }
 
 int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
                                            int count, ompi_datatype_t* datatype,
                                            ompi_op_t* op, int root,
-                                           ompi_communicator_t* comm, uint32_t segsize )
+                                           ompi_communicator_t* comm,
+                                           uint32_t segsize )
 {
     int segcount = count;
     size_t typelng;
@@ -329,8 +361,10 @@ int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
     ompi_ddt_type_size( datatype, &typelng );
     COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
 
-    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
-                                           comm->c_coll_selected_data->cached_bmtree, segcount );
+    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
+                                           op, root, comm,
+                                           comm->c_coll_selected_data->cached_bmtree,
+                                           segcount );
 }
 
 /*
@@ -484,7 +518,8 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
         return err;
     }
 
-    /* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extend */
+    /* see discussion in ompi_coll_basic_reduce_lin_intra about
+       extent and true extent */
     /* for reducing buffer allocation lengths.... */
 
     ompi_ddt_get_extent(dtype, &lb, &extent);
@@ -510,7 +545,8 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
     /* Initialize the receive buffer. */
 
     if (rank == (size - 1)) {
-        err = ompi_ddt_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
+        err = ompi_ddt_copy_content_same_ddt(dtype, count, (char*)rbuf,
+                                             (char*)sbuf);
     } else {
         err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
                                 MCA_COLL_BASE_TAG_REDUCE, comm,
@@ -547,7 +583,8 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
     }
 
     if (NULL != inplace_temp) {
-        err = ompi_ddt_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp);
+        err = ompi_ddt_copy_content_same_ddt(dtype, count, (char*)sbuf,
+                                             inplace_temp);
         free(inplace_temp);
     }
     if (NULL != free_buffer) {
@@ -568,8 +605,9 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
  * as you add methods/algorithms you must update this and the query/map routines
  *
  * this routine is called by the component only
- * this makes sure that the mca parameters are set to their initial values and perms
- * module does not call this they call the forced_getvalues routine instead
+ * this makes sure that the mca parameters are set to their initial values and
+ * perms module does not call this they call the forced_getvalues routine
+ * instead.
  */
 int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
 {
@@ -630,18 +668,34 @@ int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
                  comm->c_coll_selected_data->user_forced[REDUCE].algorithm));
 
     switch (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
-    case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
-    case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
-    case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
+    case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf,
+                                                             count, dtype,
+                                                             op, root, comm);
+    case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf,
+                                                                count, dtype,
+                                                                op, root,
+                                                                comm);
+    case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count,
+                                                         dtype, op, root,
+                                                         comm,
                                                          comm->c_coll_selected_data->user_forced[REDUCE].segsize,
                                                          comm->c_coll_selected_data->user_forced[REDUCE].chain_fanout);
-    case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm,
+    case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count,
+                                                            dtype, op, root,
+                                                            comm,
                                                             comm->c_coll_selected_data->user_forced[REDUCE].segsize);
-    case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype, op, root, comm,
+    case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count,
+                                                          dtype, op, root,
+                                                          comm,
                                                           comm->c_coll_selected_data->user_forced[REDUCE].segsize);
-    case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype, op, root, comm,
+    case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count,
+                                                            dtype, op, root,
+                                                            comm,
                                                             comm->c_coll_selected_data->user_forced[REDUCE].segsize);
-    case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm,
+    case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf,
+                                                                  count,
+                                                                  dtype, op,
+                                                                  root, comm,
                                                                   comm->c_coll_selected_data->user_forced[REDUCE].segsize);
     default:
         OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
@@ -661,17 +715,30 @@ int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
                  algorithm, faninout, segsize));
 
     switch (algorithm) {
-    case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
-    case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
-    case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
+    case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf,
+                                                             count, dtype, op,
+                                                             root, comm);
+    case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf,
+                                                                count, dtype,
+                                                                op, root,
+                                                                comm);
+    case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count,
+                                                         dtype, op, root,
+                                                         comm,
                                                          segsize, faninout);
-    case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm,
-                                                            segsize);
-    case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count, dtype, op, root, comm,
-                                                          segsize);
-    case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count, dtype, op, root, comm,
-                                                            segsize);
-    case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm,
+    case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count,
+                                                            dtype, op, root,
+                                                            comm, segsize);
+    case (4): return ompi_coll_tuned_reduce_intra_binary (sbuf, rbuf, count,
+                                                          dtype, op, root,
+                                                          comm, segsize);
+    case (5): return ompi_coll_tuned_reduce_intra_binomial (sbuf, rbuf, count,
+                                                            dtype, op, root,
+                                                            comm, segsize);
+    case (6): return ompi_coll_tuned_reduce_intra_in_order_binary(sbuf, rbuf,
+                                                                  count,
+                                                                  dtype, op,
+                                                                  root, comm,
                                                                   segsize);
 
     default: