diff --git a/ompi/mca/coll/tuned/coll_tuned_barrier.c b/ompi/mca/coll/tuned/coll_tuned_barrier.c index 0041df6e0c..74f0c2698d 100644 --- a/ompi/mca/coll/tuned/coll_tuned_barrier.c +++ b/ompi/mca/coll/tuned/coll_tuned_barrier.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University + * Copyright (c) 2004-2014 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -49,6 +49,61 @@ static mca_base_var_enum_value_t barrier_algorithms[] = { {0, NULL} }; +/** + * A quick version of the MPI_Sendreceive implemented for the barrier. + * No actual data is moved across the wire, we use 0-byte messages to + * signal a two peer synchronization. + */ +static inline int +ompi_coll_tuned_sendrecv_zero(int dest, int stag, + int source, int rtag, + MPI_Comm comm) + +{ + int err, line = 0; + ompi_request_t* reqs[2]; + ompi_status_public_t statuses[2]; + + /* post new irecv */ + err = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag, + comm, &reqs[0])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } + + /* send data to children */ + err = MCA_PML_CALL(isend( NULL, 0, MPI_BYTE, dest, stag, + MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } + + err = ompi_request_wait_all( 2, reqs, statuses ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } + + return (MPI_SUCCESS); + + error_handler: + /* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error + * code that we can propagate up the stack. Instead, look for the real + * error code from the MPI_ERROR in the status. 
+ */ + if( MPI_ERR_IN_STATUS == err ) { + /* At least we know the error was detected during the wait_all */ + int err_index = 1; + if( MPI_SUCCESS == statuses[0].MPI_ERROR ) { + err_index = 0; + } + err = statuses[err_index].MPI_ERROR; + OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s" + " stage of ompi_coll_tuned_sendrecv_zero\n", + __FILE__, line, err, (0 == err_index ? "receive" : "send"))); + } else { + /* Error discovered during the posting of the irecv or isend, + * and no status is available. + */ + OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n", + __FILE__, line, err)); + } + return err; +} + /* * Barrier is ment to be a synchronous operation, as some BTLs can mark * a request done before its passed to the NIC and progress might not be made @@ -157,11 +212,9 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * if (rank >= adjsize) { /* send message to lower ranked node */ remote = rank - adjsize; - err = ompi_coll_tuned_sendrecv_actual(NULL, 0, MPI_BYTE, remote, - MCA_COLL_BASE_TAG_BARRIER, - NULL, 0, MPI_BYTE, remote, - MCA_COLL_BASE_TAG_BARRIER, - comm, MPI_STATUS_IGNORE); + err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, + remote, MCA_COLL_BASE_TAG_BARRIER, + comm); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} } else if (rank < (size - adjsize)) { @@ -184,11 +237,9 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * if (remote >= adjsize) continue; /* post receive from the remote node */ - err = ompi_coll_tuned_sendrecv_actual(NULL, 0, MPI_BYTE, remote, - MCA_COLL_BASE_TAG_BARRIER, - NULL, 0, MPI_BYTE, remote, - MCA_COLL_BASE_TAG_BARRIER, - comm, MPI_STATUS_IGNORE); + err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, + remote, MCA_COLL_BASE_TAG_BARRIER, + comm); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} } } @@ -235,11 +286,9 @@ int ompi_coll_tuned_barrier_intra_bruck(struct 
ompi_communicator_t *comm, to = (rank + distance) % size; /* send message to lower ranked node */ - err = ompi_coll_tuned_sendrecv_actual(NULL, 0, MPI_BYTE, to, - MCA_COLL_BASE_TAG_BARRIER, - NULL, 0, MPI_BYTE, from, - MCA_COLL_BASE_TAG_BARRIER, - comm, MPI_STATUS_IGNORE); + err = ompi_coll_tuned_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER, + from, MCA_COLL_BASE_TAG_BARRIER, + comm); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;} } @@ -266,11 +315,9 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm, "ompi_coll_tuned_barrier_intra_two_procs rank %d", remote)); remote = (remote + 1) & 0x1; - err = ompi_coll_tuned_sendrecv_actual(NULL, 0, MPI_BYTE, remote, - MCA_COLL_BASE_TAG_BARRIER, - NULL, 0, MPI_BYTE, remote, - MCA_COLL_BASE_TAG_BARRIER, - comm, MPI_STATUS_IGNORE); + err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, + remote, MCA_COLL_BASE_TAG_BARRIER, + comm); return (err); } diff --git a/ompi/mca/coll/tuned/coll_tuned_util.c b/ompi/mca/coll/tuned/coll_tuned_util.c index 97b42602a3..8fe57ce01b 100644 --- a/ompi/mca/coll/tuned/coll_tuned_util.c +++ b/ompi/mca/coll/tuned/coll_tuned_util.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2014 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -29,69 +29,6 @@ #include "ompi/mca/pml/pml.h" #include "coll_tuned_util.h" -int ompi_coll_tuned_sendrecv_actual( void* sendbuf, size_t scount, - ompi_datatype_t* sdatatype, - int dest, int stag, - void* recvbuf, size_t rcount, - ompi_datatype_t* rdatatype, - int source, int rtag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status ) - -{ /* post receive first, then send, then waitall... should be fast (I hope) */ - int err, line = 0; - ompi_request_t* reqs[2]; - ompi_status_public_t statuses[2]; - - /* post new irecv */ - err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag, - comm, &reqs[0])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } - - /* send data to children */ - err = MCA_PML_CALL(isend( sendbuf, scount, sdatatype, dest, stag, - MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } - - err = ompi_request_wait_all( 2, reqs, statuses ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } - - if (MPI_STATUS_IGNORE != status) { - *status = statuses[0]; - } - - return (MPI_SUCCESS); - - error_handler: - /* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error - * code that we can propagate up the stack. Instead, look for the real - * error code from the MPI_ERROR in the status. - */ - if( MPI_ERR_IN_STATUS == err ) { - /* At least we know the error was detected during the wait_all */ - int err_index = 0; - if( MPI_SUCCESS == statuses[0].MPI_ERROR ) { - err_index = 1; - } - if (MPI_STATUS_IGNORE != status) { - *status = statuses[err_index]; - } - err = statuses[err_index].MPI_ERROR; - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred (req index %d)\n", - __FILE__, line, err, err_index)); - } else { - /* Error discovered during the posting of the irecv or isend, - * and no status is available. 
- */ - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n", - __FILE__, line, err)); - if (MPI_STATUS_IGNORE != status) { - status->MPI_ERROR = err; - } - } - return (err); -} - int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, int dest, int stag, @@ -133,10 +70,10 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, *status = statuses[0]; } } else { - /* FIXME this is currently unsupported but unused */ - assert (MPI_STATUS_IGNORE == status); + if( MPI_STATUS_IGNORE != status ) + *status = ompi_status_empty; } - + return (MPI_SUCCESS); error_handler: @@ -146,16 +83,17 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, */ if( MPI_ERR_IN_STATUS == err ) { /* At least we know the error was detected during the wait_all */ - int err_index = 0; + int err_index = 1; if( MPI_SUCCESS == statuses[0].MPI_ERROR ) { - err_index = 1; + err_index = 0; } if (MPI_STATUS_IGNORE != status) { *status = statuses[err_index]; } err = statuses[err_index].MPI_ERROR; - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred (req index %d)\n", - __FILE__, line, err, err_index)); + OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s" + " stage of ompi_coll_tuned_sendrecv_zero\n", + __FILE__, line, err, (0 == err_index ? "receive" : "send"))); } else { /* Error discovered during the posting of the irecv or isend, * and no status is available. 
@@ -168,73 +106,4 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, } return (err); } -/* - * localcompleted version that makes sure the send has completed locally - * Currently this is a sync call, but will change to locally completed - * version when available - */ - -int ompi_coll_tuned_sendrecv_actual_localcompleted( void* sendbuf, size_t scount, - ompi_datatype_t* sdatatype, - int dest, int stag, - void* recvbuf, size_t rcount, - ompi_datatype_t* rdatatype, - int source, int rtag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status ) - -{ /* post receive first, then [local] sync send, then wait... should be fast (I hope) */ - int err, line = 0; - ompi_request_t* req[2]; - ompi_status_public_t statuses[2]; - - /* post new irecv */ - err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag, - comm, &(req[0]))); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } - - /* send data to children */ - err = MCA_PML_CALL(isend( sendbuf, scount, sdatatype, dest, stag, - MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &(req[1]))); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } - - err = ompi_request_wait_all( 2, req, statuses ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; } - - if (MPI_STATUS_IGNORE != status) { - *status = statuses[0]; - } - - return (MPI_SUCCESS); - - error_handler: - /* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error - * code that we can propagate up the stack. Instead, look for the real - * error code from the MPI_ERROR in the status. 
- */ - if( MPI_ERR_IN_STATUS == err ) { - /* At least we know the error was detected during the wait_all */ - int err_index = 0; - if( MPI_SUCCESS == statuses[0].MPI_ERROR ) { - err_index = 1; - } - if (MPI_STATUS_IGNORE != status) { - *status = statuses[err_index]; - } - err = statuses[err_index].MPI_ERROR; - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred (req index %d)\n", - __FILE__,line,err, err_index)); - } else { - /* Error discovered during the posting of the irecv or isend, - * and no status is available. - */ - OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n", - __FILE__, line, err)); - if (MPI_STATUS_IGNORE != status) { - status->MPI_ERROR = err; - } - } - - return (err); -} diff --git a/ompi/mca/coll/tuned/coll_tuned_util.h b/ompi/mca/coll/tuned/coll_tuned_util.h index a3c8695d0e..e46e7f4020 100644 --- a/ompi/mca/coll/tuned/coll_tuned_util.h +++ b/ompi/mca/coll/tuned/coll_tuned_util.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2012 The University of Tennessee and The University + * Copyright (c) 2004-2014 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -31,16 +31,11 @@ BEGIN_C_DECLS -/* prototypes */ -int ompi_coll_tuned_sendrecv_actual( void* sendbuf, size_t scount, - ompi_datatype_t* sdatatype, - int dest, int stag, - void* recvbuf, size_t rcount, - ompi_datatype_t* rdatatype, - int source, int rtag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status ); - +/** + * A MPI_like function doing a send and a receive simultaneously. + * If one of the communications results in a zero-byte message the + * communication is ignored, and no message will cross to the peer. 
+ */ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, int dest, int stag, @@ -51,8 +46,12 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount, ompi_status_public_t* status ); -/* inline functions */ - +/** + * Similar to the function above, this implementation of send-receive + * does not generate communications for zero-byte messages. Thus, it is + * improper to use in the context of some algorithms for collective + * communications. + */ static inline int ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, int dest, int stag, @@ -61,7 +60,7 @@ ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdataty struct ompi_communicator_t* comm, ompi_status_public_t* status, int myid ) { - if ((dest == myid) && (source == myid)) { + if ((dest == source) && (source == myid)) { return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype, recvbuf, (int32_t) rcount, rdatatype); } @@ -71,65 +70,6 @@ ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdataty source, rtag, comm, status); } -int -ompi_coll_tuned_sendrecv_actual_localcompleted( void* sendbuf, size_t scount, - ompi_datatype_t* sdatatype, - int dest, int stag, - void* recvbuf, size_t rcount, - ompi_datatype_t* rdatatype, - int source, int rtag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status ); - - -/* inline functions */ - -static inline int -ompi_coll_tuned_sendrecv_localcompleted( void* sendbuf, size_t scount, - ompi_datatype_t* sdatatype, - int dest, int stag, - void* recvbuf, size_t rcount, - ompi_datatype_t* rdatatype, - int source, int rtag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status, int myid ) -{ - if ((dest == myid) && (source == myid)) { - return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype, - recvbuf, (int32_t) rcount, rdatatype); - } - return
ompi_coll_tuned_sendrecv_actual_localcompleted (sendbuf, scount, - sdatatype, dest, - stag, - recvbuf, rcount, - rdatatype, - source, rtag, comm, - status); -} - -/* inline functions */ -static inline int -ompi_coll_tuned_isendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdtype, - int dest, int stag, ompi_request_t** sreq, - void* recvbuf, size_t rcount, ompi_datatype_t* rdtype, - int source, int rtag, ompi_request_t** rreq, - struct ompi_communicator_t* comm ) { - int ret, line; - - ret = MCA_PML_CALL(irecv(recvbuf, rcount, rdtype, source, rtag, comm, rreq)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; } - - ret = MCA_PML_CALL(isend(sendbuf, scount, sdtype, dest, stag, - MCA_PML_BASE_SEND_STANDARD, comm, sreq)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; } - - return MPI_SUCCESS; - error_handler: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%d\tError occurred %d\n", - __FILE__, line, ret)); - return ret; -} - END_C_DECLS #endif /* MCA_COLL_TUNED_UTIL_EXPORT_H */