coll/libnbc: do not recursively call opal_progress()
Instead of invoking ompi_request_test_all(), which ends up calling opal_progress() recursively, manually check the status of the requests. The same method is used in ompi_comm_request_progress().

Refs open-mpi/ompi#3901

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
Этот коммит содержится в:
родитель
9a8797a6be
Коммит
1a41482720
@ -10,7 +10,7 @@
|
||||
* rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* Copyright (c) 2015-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
*
|
||||
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
|
||||
@ -315,7 +315,8 @@ static inline void NBC_Free (NBC_Handle* handle) {
|
||||
*
|
||||
* to be called *only* from the progress thread !!! */
|
||||
int NBC_Progress(NBC_Handle *handle) {
|
||||
int flag, res, ret=NBC_CONTINUE;
|
||||
int res, ret=NBC_CONTINUE;
|
||||
bool flag;
|
||||
unsigned long size = 0;
|
||||
char *delim;
|
||||
int i;
|
||||
@ -325,43 +326,27 @@ int NBC_Progress(NBC_Handle *handle) {
|
||||
return NBC_OK;
|
||||
}
|
||||
|
||||
flag = true;
|
||||
|
||||
if ((handle->req_count > 0) && (handle->req_array != NULL)) {
|
||||
NBC_DEBUG(50, "NBC_Progress: testing for %i requests\n", handle->req_count);
|
||||
#ifdef NBC_TIMING
|
||||
Test_time -= MPI_Wtime();
|
||||
#endif
|
||||
res = ompi_request_test_all(handle->req_count, handle->req_array, &flag, MPI_STATUSES_IGNORE);
|
||||
if(res != OMPI_SUCCESS) {
|
||||
// Attempt to cancel outstanding requests
|
||||
for(i = 0; i < handle->req_count; ++i ) {
|
||||
// If the request is complete, then try to report the error code
|
||||
if( handle->req_array[i]->req_complete ) {
|
||||
if( OMPI_SUCCESS != handle->req_array[i]->req_status.MPI_ERROR ) {
|
||||
NBC_Error ("MPI Error in MPI_Testall() (req %d = %d)", i, handle->req_array[i]->req_status.MPI_ERROR);
|
||||
}
|
||||
/* don't call ompi_request_test_all as it causes a recursive call into opal_progress */
|
||||
while (handle->req_count) {
|
||||
ompi_request_t *subreq = handle->req_array[handle->req_count - 1];
|
||||
if (REQUEST_COMPLETE(subreq)) {
|
||||
ompi_request_free(&subreq);
|
||||
handle->req_count--;
|
||||
} else {
|
||||
flag = false;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
ompi_request_cancel(handle->req_array[i]);
|
||||
// If the PML actually canceled the request, then wait on it
|
||||
if( handle->req_array[i]->req_status._cancelled) {
|
||||
ompi_request_wait(&handle->req_array[i], &status);
|
||||
}
|
||||
// Warn the user that we had to leave a PML message outstanding so
|
||||
// bad things could happen if they continue using nonblocking collectives
|
||||
else {
|
||||
NBC_Error ("MPI Error: Not able to cancel the internal request %d. "
|
||||
"Be aware that continuing to use nonblocking collectives on this communicator may result in undefined behavior.", i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#ifdef NBC_TIMING
|
||||
Test_time += MPI_Wtime();
|
||||
#endif
|
||||
} else {
|
||||
flag = 1; /* we had no open requests -> proceed to next round */
|
||||
}
|
||||
|
||||
/* a round is finished */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user