diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index eeb1d35fb4..828b32061a 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -350,7 +350,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; ptrdiff_t true_lb, true_extent, lb, extent; ptrdiff_t block_offset, max_real_segsize; - ompi_request_t *reqs[2] = {NULL, NULL}; + ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -528,6 +528,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); + ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning if (NULL != inbuf[0]) free(inbuf[0]); if (NULL != inbuf[1]) free(inbuf[1]); @@ -627,7 +628,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; ptrdiff_t block_offset, max_real_segsize; - ompi_request_t *reqs[2] = {NULL, NULL}; + ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; ptrdiff_t lb, extent, gap; size = ompi_comm_size(comm); @@ -847,6 +848,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); + ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning if (NULL != inbuf[0]) free(inbuf[0]); if (NULL != inbuf[1]) free(inbuf[1]); diff --git a/ompi/mca/coll/base/coll_base_alltoall.c b/ompi/mca/coll/base/coll_base_alltoall.c index 3f1bdc5fb5..a61bf40ca9 100644 --- a/ompi/mca/coll/base/coll_base_alltoall.c +++ b/ompi/mca/coll/base/coll_base_alltoall.c @@ -393,6 +393,7 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount, if (0 < total_reqs) { reqs = ompi_coll_base_comm_get_reqs(module->base_data, 2 * total_reqs); if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; } + reqs[0] = reqs[1] = MPI_REQUEST_NULL; } prcv = (char *) rbuf; @@ -468,6 +469,15 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount, return MPI_SUCCESS; error_hndl: + /* find a real error code */ + if (MPI_ERR_IN_STATUS == error) { + for( ri = 0; ri < nreqs; ri++ ) { + if (MPI_REQUEST_NULL == reqs[ri]) continue; + if (MPI_ERR_PENDING == reqs[ri]->req_status.MPI_ERROR) continue; + error = reqs[ri]->req_status.MPI_ERROR; + break; + } + } OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error, rank)); @@ -661,7 +671,16 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount, if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err_hndl: - if( MPI_SUCCESS != err ) { + if (MPI_SUCCESS != err) { + /* find a real error code */ + if (MPI_ERR_IN_STATUS == err) { + for( i = 0; i < nreqs; i++ ) { + if (MPI_REQUEST_NULL == req[i]) continue; + if (MPI_ERR_PENDING == req[i]->req_status.MPI_ERROR) continue; + err = req[i]->req_status.MPI_ERROR; + break; + } + } OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warning diff --git a/ompi/mca/coll/base/coll_base_alltoallv.c b/ompi/mca/coll/base/coll_base_alltoallv.c index aec8b85944..dbe33e8eee 100644 --- a/ompi/mca/coll/base/coll_base_alltoallv.c +++ b/ompi/mca/coll/base/coll_base_alltoallv.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -276,6 +276,15 @@ ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); err_hndl: + /* find a real error code */ + if (MPI_ERR_IN_STATUS == err) { + for( i = 0; i < nreqs; i++ ) { + if (MPI_REQUEST_NULL == reqs[i]) continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue; + err = reqs[i]->req_status.MPI_ERROR; + break; + } + } /* Free the requests in all cases as they are persistent */ ompi_coll_base_free_reqs(reqs, nreqs); diff --git a/ompi/mca/coll/base/coll_base_barrier.c b/ompi/mca/coll/base/coll_base_barrier.c index a190f3be72..49ac4ea2e9 100644 --- a/ompi/mca/coll/base/coll_base_barrier.c +++ b/ompi/mca/coll/base/coll_base_barrier.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -102,8 +102,10 @@ int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm, { int rank, size, err = 0, line = 0, left, right; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return OMPI_SUCCESS; + rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank)); @@ -172,8 +174,10 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c { int rank, size, adjsize, err, line, mask, remote; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return OMPI_SUCCESS; + rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_recursivedoubling rank %d", rank)); @@ -251,8 +255,10 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm, { int rank, size, distance, to, from, err, line = 0; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return MPI_SUCCESS; + rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_bruck rank %d", rank)); @@ -285,16 +291,19 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm, int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int remote, err; + int remote, size, err; + + size = ompi_comm_size(comm); + if( 1 == size ) + return MPI_SUCCESS; + if( 2 != ompi_comm_size(comm) ) { + return MPI_ERR_UNSUPPORTED_OPERATION; + } remote = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_two_procs rank %d", remote)); - if (2 != ompi_comm_size(comm)) { - return MPI_ERR_UNSUPPORTED_OPERATION; - } - remote = (remote + 1) & 0x1; err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, @@ -324,8 +333,10 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm, int i, err, rank, size, line; ompi_request_t** requests = NULL; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return MPI_SUCCESS; + rank = ompi_comm_rank(comm); /* All non-root send & receive zero-length message. */ if (rank > 0) { @@ -367,11 +378,21 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm, /* All done */ return MPI_SUCCESS; err_hndl: + if( NULL != requests ) { + /* find a real error code */ + if (MPI_ERR_IN_STATUS == err) { + for( i = 0; i < size; i++ ) { + if (MPI_REQUEST_NULL == requests[i]) continue; + if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue; + err = requests[i]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(requests, size); + } OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warning - if( NULL != requests ) - ompi_coll_base_free_reqs(requests, size); return err; } /* copied function (with appropriate renaming) ends here */ @@ -385,8 +406,10 @@ int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm, { int rank, size, depth, err, jump, partner; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return MPI_SUCCESS; + rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_tree %d", rank)); diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index a35e18fa9c..9a23505196 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -214,13 +214,29 @@ ompi_coll_base_bcast_intra_generic( void* buffer, return (MPI_SUCCESS); error_hndl: + if (MPI_ERR_IN_STATUS == err) { + for( req_index = 0; req_index < 2; req_index++ ) { + if (MPI_REQUEST_NULL == recv_reqs[req_index]) continue; + if (MPI_ERR_PENDING == recv_reqs[req_index]->req_status.MPI_ERROR) continue; + err = recv_reqs[req_index]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs( recv_reqs, 2); + if( NULL != send_reqs ) { + if (MPI_ERR_IN_STATUS == err) { + for( req_index = 0; req_index < tree->tree_nextsize; req_index++ ) { + if (MPI_REQUEST_NULL == send_reqs[req_index]) continue; + if (MPI_ERR_PENDING == send_reqs[req_index]->req_status.MPI_ERROR) continue; + err = send_reqs[req_index]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize); + } OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warnings - ompi_coll_base_free_reqs( recv_reqs, 2); - if( NULL != send_reqs ) { - ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize); - } return err; } @@ -649,12 +665,21 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count, * care what the error was -- just that there *was* an error. The * PML will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. - * So free them anyway -- even if there was an error, and return - * the error after we free everything. */ + * So free them anyway -- even if there was an error. + * Note we still need to get the actual error, as collective + * operations cannot return MPI_ERR_IN_STATUS. + */ err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE); err_hndl: if( MPI_SUCCESS != err ) { /* Free the reqs */ + /* first find the real error code */ + for( preq = reqs; preq < reqs+i; preq++ ) { + if (MPI_REQUEST_NULL == *preq) continue; + if (MPI_ERR_PENDING == (*preq)->req_status.MPI_ERROR) continue; + err = (*preq)->req_status.MPI_ERROR; + break; + } ompi_coll_base_free_reqs(reqs, i); } diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c index 8d5ab70d70..6fd1e98146 100644 --- a/ompi/mca/coll/base/coll_base_gather.c +++ b/ompi/mca/coll/base/coll_base_gather.c @@ -326,6 +326,15 @@ ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount, return MPI_SUCCESS; error_hndl: if (NULL != reqs) { + /* find a real error code */ + if (MPI_ERR_IN_STATUS == ret) { + for( i = 0; i < size; i++ ) { + if (MPI_REQUEST_NULL == reqs[i]) continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue; + ret = reqs[i]->req_status.MPI_ERROR; + break; + } + } ompi_coll_base_free_reqs(reqs, size); } OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index 82838ddbcd..dfd709bfb9 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -338,16 +338,34 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi return OMPI_SUCCESS; error_hndl: /* error handler */ + /* find a real error code */ + if (MPI_ERR_IN_STATUS == ret) { + for( i = 0; i < 2; i++ ) { + if (MPI_REQUEST_NULL == reqs[i]) continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue; + ret = reqs[i]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(reqs, 2); + if( NULL != sreq ) { + if (MPI_ERR_IN_STATUS == ret) { + for( i = 0; i < max_outstanding_reqs; i++ ) { + if (MPI_REQUEST_NULL == sreq[i]) continue; + if (MPI_ERR_PENDING == sreq[i]->req_status.MPI_ERROR) continue; + ret = sreq[i]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(sreq, max_outstanding_reqs); + } + if( inbuf_free[0] != NULL ) free(inbuf_free[0]); + if( inbuf_free[1] != NULL ) free(inbuf_free[1]); + if( accumbuf_free != NULL ) free(accumbuf); OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); (void)line; // silence compiler warning - if( inbuf_free[0] != NULL ) free(inbuf_free[0]); - if( inbuf_free[1] != NULL ) free(inbuf_free[1]); - if( accumbuf_free != NULL ) free(accumbuf); - if( NULL != sreq ) { - ompi_coll_base_free_reqs(sreq, max_outstanding_reqs); - } return ret; } diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c index 948a17376c..984a91787a 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c @@ -464,7 +464,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, const in char *tmpsend = NULL, *tmprecv = NULL, *accumbuf = NULL, *accumbuf_free = NULL; char *inbuf_free[2] = {NULL, NULL}, *inbuf[2] = {NULL, NULL}; ptrdiff_t extent, max_real_segsize, dsize, gap = 0; - ompi_request_t *reqs[2] = {NULL, NULL}; + ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 5736c0946f..422894e45f 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -42,7 +42,7 @@ int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount, { /* post receive first, then send, then wait... should be fast (I hope) */ int err, line = 0; size_t rtypesize, stypesize; - ompi_request_t *req; + ompi_request_t *req = MPI_REQUEST_NULL; ompi_status_public_t rstatus; /* post new irecv */