Always return a valid error code from collective operations
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
This commit is contained in:
parent
ddf7e43d57
commit
466217fadd
@ -350,7 +350,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count,
|
||||
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
|
||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||
ptrdiff_t block_offset, max_real_segsize;
|
||||
ompi_request_t *reqs[2] = {NULL, NULL};
|
||||
ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
@ -528,6 +528,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count,
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
__FILE__, line, rank, ret));
|
||||
ompi_coll_base_free_reqs(reqs, 2);
|
||||
(void)line; // silence compiler warning
|
||||
if (NULL != inbuf[0]) free(inbuf[0]);
|
||||
if (NULL != inbuf[1]) free(inbuf[1]);
|
||||
@ -627,7 +628,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int
|
||||
size_t typelng;
|
||||
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
|
||||
ptrdiff_t block_offset, max_real_segsize;
|
||||
ompi_request_t *reqs[2] = {NULL, NULL};
|
||||
ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
||||
ptrdiff_t lb, extent, gap;
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
@ -847,6 +848,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
__FILE__, line, rank, ret));
|
||||
ompi_coll_base_free_reqs(reqs, 2);
|
||||
(void)line; // silence compiler warning
|
||||
if (NULL != inbuf[0]) free(inbuf[0]);
|
||||
if (NULL != inbuf[1]) free(inbuf[1]);
|
||||
|
@ -393,6 +393,7 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
|
||||
if (0 < total_reqs) {
|
||||
reqs = ompi_coll_base_comm_get_reqs(module->base_data, 2 * total_reqs);
|
||||
if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
|
||||
reqs[0] = reqs[1] = MPI_REQUEST_NULL;
|
||||
}
|
||||
|
||||
prcv = (char *) rbuf;
|
||||
@ -468,6 +469,15 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
error_hndl:
|
||||
/* find a real error code */
|
||||
if (MPI_ERR_IN_STATUS == error) {
|
||||
for( ri = 0; ri < nreqs; ri++ ) {
|
||||
if (MPI_REQUEST_NULL == reqs[ri]) continue;
|
||||
if (MPI_ERR_PENDING == reqs[ri]->req_status.MPI_ERROR) continue;
|
||||
error = reqs[ri]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
|
||||
rank));
|
||||
@ -661,7 +671,16 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
err_hndl:
|
||||
if( MPI_SUCCESS != err ) {
|
||||
if (MPI_SUCCESS != err) {
|
||||
/* find a real error code */
|
||||
if (MPI_ERR_IN_STATUS == err) {
|
||||
for( i = 0; i < nreqs; i++ ) {
|
||||
if (MPI_REQUEST_NULL == req[i]) continue;
|
||||
if (MPI_ERR_PENDING == req[i]->req_status.MPI_ERROR) continue;
|
||||
err = req[i]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank) );
|
||||
(void)line; // silence compiler warning
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2016 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2017 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -276,6 +276,15 @@ ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts
|
||||
err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
|
||||
|
||||
err_hndl:
|
||||
/* find a real error code */
|
||||
if (MPI_ERR_IN_STATUS == err) {
|
||||
for( i = 0; i < nreqs; i++ ) {
|
||||
if (MPI_REQUEST_NULL == reqs[i]) continue;
|
||||
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
|
||||
err = reqs[i]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* Free the requests in all cases as they are persistent */
|
||||
ompi_coll_base_free_reqs(reqs, nreqs);
|
||||
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2016 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2017 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -102,8 +102,10 @@ int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
||||
{
|
||||
int rank, size, err = 0, line = 0, left, right;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
if( 1 == size )
|
||||
return OMPI_SUCCESS;
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));
|
||||
|
||||
@ -172,8 +174,10 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c
|
||||
{
|
||||
int rank, size, adjsize, err, line, mask, remote;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
if( 1 == size )
|
||||
return OMPI_SUCCESS;
|
||||
rank = ompi_comm_rank(comm);
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_barrier_intra_recursivedoubling rank %d",
|
||||
rank));
|
||||
@ -251,8 +255,10 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
||||
{
|
||||
int rank, size, distance, to, from, err, line = 0;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
if( 1 == size )
|
||||
return MPI_SUCCESS;
|
||||
rank = ompi_comm_rank(comm);
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_barrier_intra_bruck rank %d", rank));
|
||||
|
||||
@ -285,16 +291,19 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
||||
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int remote, err;
|
||||
int remote, size, err;
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
if( 1 == size )
|
||||
return MPI_SUCCESS;
|
||||
if( 2 != ompi_comm_size(comm) ) {
|
||||
return MPI_ERR_UNSUPPORTED_OPERATION;
|
||||
}
|
||||
|
||||
remote = ompi_comm_rank(comm);
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_barrier_intra_two_procs rank %d", remote));
|
||||
|
||||
if (2 != ompi_comm_size(comm)) {
|
||||
return MPI_ERR_UNSUPPORTED_OPERATION;
|
||||
}
|
||||
|
||||
remote = (remote + 1) & 0x1;
|
||||
|
||||
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
@ -324,8 +333,10 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
|
||||
int i, err, rank, size, line;
|
||||
ompi_request_t** requests = NULL;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
if( 1 == size )
|
||||
return MPI_SUCCESS;
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
/* All non-root send & receive zero-length message. */
|
||||
if (rank > 0) {
|
||||
@ -367,11 +378,21 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
|
||||
/* All done */
|
||||
return MPI_SUCCESS;
|
||||
err_hndl:
|
||||
if( NULL != requests ) {
|
||||
/* find a real error code */
|
||||
if (MPI_ERR_IN_STATUS == err) {
|
||||
for( i = 0; i < size; i++ ) {
|
||||
if (MPI_REQUEST_NULL == requests[i]) continue;
|
||||
if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue;
|
||||
err = requests[i]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ompi_coll_base_free_reqs(requests, size);
|
||||
}
|
||||
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank) );
|
||||
(void)line; // silence compiler warning
|
||||
if( NULL != requests )
|
||||
ompi_coll_base_free_reqs(requests, size);
|
||||
return err;
|
||||
}
|
||||
/* copied function (with appropriate renaming) ends here */
|
||||
@ -385,8 +406,10 @@ int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
|
||||
{
|
||||
int rank, size, depth, err, jump, partner;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
if( 1 == size )
|
||||
return MPI_SUCCESS;
|
||||
rank = ompi_comm_rank(comm);
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_barrier_intra_tree %d",
|
||||
rank));
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2016 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2017 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -214,13 +214,29 @@ ompi_coll_base_bcast_intra_generic( void* buffer,
|
||||
return (MPI_SUCCESS);
|
||||
|
||||
error_hndl:
|
||||
if (MPI_ERR_IN_STATUS == err) {
|
||||
for( req_index = 0; req_index < 2; req_index++ ) {
|
||||
if (MPI_REQUEST_NULL == recv_reqs[req_index]) continue;
|
||||
if (MPI_ERR_PENDING == recv_reqs[req_index]->req_status.MPI_ERROR) continue;
|
||||
err = recv_reqs[req_index]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ompi_coll_base_free_reqs( recv_reqs, 2);
|
||||
if( NULL != send_reqs ) {
|
||||
if (MPI_ERR_IN_STATUS == err) {
|
||||
for( req_index = 0; req_index < tree->tree_nextsize; req_index++ ) {
|
||||
if (MPI_REQUEST_NULL == send_reqs[req_index]) continue;
|
||||
if (MPI_ERR_PENDING == send_reqs[req_index]->req_status.MPI_ERROR) continue;
|
||||
err = send_reqs[req_index]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize);
|
||||
}
|
||||
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank) );
|
||||
(void)line; // silence compiler warnings
|
||||
ompi_coll_base_free_reqs( recv_reqs, 2);
|
||||
if( NULL != send_reqs ) {
|
||||
ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
@ -649,12 +665,21 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count,
|
||||
* care what the error was -- just that there *was* an error. The
|
||||
* PML will finish all requests, even if one or more of them fail.
|
||||
* i.e., by the end of this call, all the requests are free-able.
|
||||
* So free them anyway -- even if there was an error, and return
|
||||
* the error after we free everything. */
|
||||
* So free them anyway -- even if there was an error.
|
||||
* Note we still need to get the actual error, as collective
|
||||
* operations cannot return MPI_ERR_IN_STATUS.
|
||||
*/
|
||||
|
||||
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
|
||||
err_hndl:
|
||||
if( MPI_SUCCESS != err ) { /* Free the reqs */
|
||||
/* first find the real error code */
|
||||
for( preq = reqs; preq < reqs+i; preq++ ) {
|
||||
if (MPI_REQUEST_NULL == *preq) continue;
|
||||
if (MPI_ERR_PENDING == (*preq)->req_status.MPI_ERROR) continue;
|
||||
err = (*preq)->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
ompi_coll_base_free_reqs(reqs, i);
|
||||
}
|
||||
|
||||
|
@ -326,6 +326,15 @@ ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount,
|
||||
return MPI_SUCCESS;
|
||||
error_hndl:
|
||||
if (NULL != reqs) {
|
||||
/* find a real error code */
|
||||
if (MPI_ERR_IN_STATUS == ret) {
|
||||
for( i = 0; i < size; i++ ) {
|
||||
if (MPI_REQUEST_NULL == reqs[i]) continue;
|
||||
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
|
||||
ret = reqs[i]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ompi_coll_base_free_reqs(reqs, size);
|
||||
}
|
||||
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
|
||||
|
@ -338,16 +338,34 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
error_hndl: /* error handler */
|
||||
/* find a real error code */
|
||||
if (MPI_ERR_IN_STATUS == ret) {
|
||||
for( i = 0; i < 2; i++ ) {
|
||||
if (MPI_REQUEST_NULL == reqs[i]) continue;
|
||||
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
|
||||
ret = reqs[i]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ompi_coll_base_free_reqs(reqs, 2);
|
||||
if( NULL != sreq ) {
|
||||
if (MPI_ERR_IN_STATUS == ret) {
|
||||
for( i = 0; i < max_outstanding_reqs; i++ ) {
|
||||
if (MPI_REQUEST_NULL == sreq[i]) continue;
|
||||
if (MPI_ERR_PENDING == sreq[i]->req_status.MPI_ERROR) continue;
|
||||
ret = sreq[i]->req_status.MPI_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ompi_coll_base_free_reqs(sreq, max_outstanding_reqs);
|
||||
}
|
||||
if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
|
||||
if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
|
||||
if( accumbuf_free != NULL ) free(accumbuf);
|
||||
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
|
||||
"ERROR_HNDL: node %d file %s line %d error %d\n",
|
||||
rank, __FILE__, line, ret ));
|
||||
(void)line; // silence compiler warning
|
||||
if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
|
||||
if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
|
||||
if( accumbuf_free != NULL ) free(accumbuf);
|
||||
if( NULL != sreq ) {
|
||||
ompi_coll_base_free_reqs(sreq, max_outstanding_reqs);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -464,7 +464,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, const in
|
||||
char *tmpsend = NULL, *tmprecv = NULL, *accumbuf = NULL, *accumbuf_free = NULL;
|
||||
char *inbuf_free[2] = {NULL, NULL}, *inbuf[2] = {NULL, NULL};
|
||||
ptrdiff_t extent, max_real_segsize, dsize, gap = 0;
|
||||
ompi_request_t *reqs[2] = {NULL, NULL};
|
||||
ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
@ -41,7 +41,7 @@ int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount,
|
||||
{ /* post receive first, then send, then wait... should be fast (I hope) */
|
||||
int err, line = 0;
|
||||
size_t rtypesize, stypesize;
|
||||
ompi_request_t *req;
|
||||
ompi_request_t *req = MPI_REQUEST_NULL;
|
||||
ompi_status_public_t rstatus;
|
||||
|
||||
/* post new irecv */
|
||||
|
Loading…
x
Reference in new issue
Block a user