Merge pull request #6086 from ICLDisco/export/errors_nbc
Manage errors in NBC collective ops
Этот коммит содержится в:
Коммит
bd0d2b832e
@@ -3,7 +3,7 @@
|
|||||||
* Copyright (c) 2006 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2006 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2013 The University of Tennessee and The University
|
* Copyright (c) 2013-2018 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006 The Technical University of Chemnitz. All
|
* Copyright (c) 2006 The Technical University of Chemnitz. All
|
||||||
@@ -335,8 +335,14 @@ int NBC_Progress(NBC_Handle *handle) {
|
|||||||
while (handle->req_count) {
|
while (handle->req_count) {
|
||||||
ompi_request_t *subreq = handle->req_array[handle->req_count - 1];
|
ompi_request_t *subreq = handle->req_array[handle->req_count - 1];
|
||||||
if (REQUEST_COMPLETE(subreq)) {
|
if (REQUEST_COMPLETE(subreq)) {
|
||||||
ompi_request_free(&subreq);
|
if(OPAL_UNLIKELY( OMPI_SUCCESS != subreq->req_status.MPI_ERROR )) {
|
||||||
|
NBC_Error ("MPI Error in NBC subrequest %p : %d", subreq, subreq->req_status.MPI_ERROR);
|
||||||
|
/* copy the error code from the underlying request and let the
|
||||||
|
* round finish */
|
||||||
|
handle->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
|
||||||
|
}
|
||||||
handle->req_count--;
|
handle->req_count--;
|
||||||
|
ompi_request_free(&subreq);
|
||||||
} else {
|
} else {
|
||||||
flag = false;
|
flag = false;
|
||||||
break;
|
break;
|
||||||
@@ -349,6 +355,26 @@ int NBC_Progress(NBC_Handle *handle) {
|
|||||||
|
|
||||||
/* a round is finished */
|
/* a round is finished */
|
||||||
if (flag) {
|
if (flag) {
|
||||||
|
/* reset handle for next round */
|
||||||
|
if (NULL != handle->req_array) {
|
||||||
|
/* free request array */
|
||||||
|
free (handle->req_array);
|
||||||
|
handle->req_array = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
handle->req_count = 0;
|
||||||
|
|
||||||
|
/* previous round had an error */
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != handle->super.req_status.MPI_ERROR)) {
|
||||||
|
res = handle->super.req_status.MPI_ERROR;
|
||||||
|
NBC_Error("NBC_Progress: an error %d was found during schedule %p at row-offset %li - aborting the schedule\n", res, handle->schedule, handle->row_offset);
|
||||||
|
handle->nbc_complete = true;
|
||||||
|
if (!handle->super.req_persistent) {
|
||||||
|
NBC_Free(handle);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
/* adjust delim to start of current round */
|
/* adjust delim to start of current round */
|
||||||
NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", handle->schedule, handle->row_offset);
|
NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", handle->schedule, handle->row_offset);
|
||||||
delim = handle->schedule->data + handle->row_offset;
|
delim = handle->schedule->data + handle->row_offset;
|
||||||
@@ -358,14 +384,6 @@ int NBC_Progress(NBC_Handle *handle) {
|
|||||||
/* adjust delim to end of current round -> delimiter */
|
/* adjust delim to end of current round -> delimiter */
|
||||||
delim = delim + size;
|
delim = delim + size;
|
||||||
|
|
||||||
if (NULL != handle->req_array) {
|
|
||||||
/* free request array */
|
|
||||||
free (handle->req_array);
|
|
||||||
handle->req_array = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
handle->req_count = 0;
|
|
||||||
|
|
||||||
if (*delim == 0) {
|
if (*delim == 0) {
|
||||||
/* this was the last round - we're done */
|
/* this was the last round - we're done */
|
||||||
NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");
|
NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");
|
||||||
@@ -638,6 +656,7 @@ int NBC_Start(NBC_Handle *handle) {
|
|||||||
|
|
||||||
/* kick off first round */
|
/* kick off first round */
|
||||||
handle->super.req_state = OMPI_REQUEST_ACTIVE;
|
handle->super.req_state = OMPI_REQUEST_ACTIVE;
|
||||||
|
handle->super.req_status.MPI_ERROR = OMPI_SUCCESS;
|
||||||
res = NBC_Start_round(handle);
|
res = NBC_Start_round(handle);
|
||||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
|
||||||
return res;
|
return res;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user