Merge pull request #2245 from jjhursey/topic/libnbc-error-path
coll/libnbc: Fix error path on internal error
This commit is contained in:
Commit
d1ecc83e14
--- a/ompi/mca/coll/libnbc/coll_libnbc_component.c
+++ b/ompi/mca/coll/libnbc/coll_libnbc_component.c
@@ -13,6 +13,9 @@
  * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
  * reserved.
+ * Copyright (c) 2016 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
+ * Copyright (c) 2016 IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -234,17 +237,24 @@ int
 ompi_coll_libnbc_progress(void)
 {
     ompi_coll_libnbc_request_t* request, *next;
+    int res;

     if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0;

     OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests,
                            ompi_coll_libnbc_request_t) {
-        if (OMPI_SUCCESS == NBC_Progress(request)) {
+        res = NBC_Progress(request);
+        if( NBC_CONTINUE != res ) {
             /* done, remove and complete */
             opal_list_remove_item(&mca_coll_libnbc_component.active_requests,
                                   &request->super.super.super);

-            request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
+            if( OMPI_SUCCESS == res || NBC_OK == res || NBC_SUCCESS == res ) {
+                request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
+            }
+            else {
+                request->super.req_status.MPI_ERROR = res;
+            }
             OPAL_THREAD_LOCK(&ompi_request_lock);
             ompi_request_complete(&request->super, true);
             OPAL_THREAD_UNLOCK(&ompi_request_lock);
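For context: the hunk above changes the progress loop so that a failure returned by NBC_Progress() is recorded in the request's MPI_ERROR field instead of being unconditionally overwritten with OMPI_SUCCESS. Below is a minimal, illustrative sketch (not part of this patch; the communicator, the choice of MPI_Ibarrier, and the MPI_ERRORS_RETURN handler are assumptions) of how such a propagated error becomes visible to an application that completes the request:

/* Sketch: observing an error that libnbc now stores in req_status.MPI_ERROR.
 * Assumes MPI_ERRORS_RETURN so failures are returned rather than aborting. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Request req;
    MPI_Status  status;
    int rc;

    MPI_Init(&argc, &argv);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    MPI_Ibarrier(MPI_COMM_WORLD, &req);   /* nonblocking collective */
    rc = MPI_Wait(&req, &status);         /* completed via libnbc progress */
    if (MPI_SUCCESS != rc) {
        /* Before this patch, an internal libnbc error here was reported
         * as success. */
        fprintf(stderr, "ibarrier failed: rc=%d, status.MPI_ERROR=%d\n",
                rc, status.MPI_ERROR);
    }

    MPI_Finalize();
    return 0;
}

With the default MPI_ERRORS_ARE_FATAL handler the job would abort instead; installing MPI_ERRORS_RETURN is what makes the propagated code observable.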
--- a/ompi/mca/coll/libnbc/nbc.c
+++ b/ompi/mca/coll/libnbc/nbc.c
@@ -16,6 +16,7 @@
  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
  *
  * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016 IBM Corporation. All rights reserved.
  *
  */
 #include "nbc_internal.h"
@@ -312,6 +313,8 @@ int NBC_Progress(NBC_Handle *handle) {
   int flag, res, ret=NBC_CONTINUE;
   unsigned long size = 0;
   char *delim;
+  int i;
+  ompi_status_public_t status;

   /* the handle is done if there is no schedule attached */
   if (NULL == handle->schedule) {
@@ -325,8 +328,30 @@ int NBC_Progress(NBC_Handle *handle) {
 #endif
   res = ompi_request_test_all(handle->req_count, handle->req_array, &flag, MPI_STATUSES_IGNORE);
   if(res != OMPI_SUCCESS) {
-    NBC_Error ("MPI Error in MPI_Testall() (%i)", res);
-    return res;
+    // Attempt to cancel outstanding requests
+    for(i = 0; i < handle->req_count; ++i ) {
+      // If the request is complete, then try to report the error code
+      if( handle->req_array[i]->req_complete ) {
+        if( OMPI_SUCCESS != handle->req_array[i]->req_status.MPI_ERROR ) {
+          NBC_Error ("MPI Error in MPI_Testall() (req %d = %d)", i, handle->req_array[i]->req_status.MPI_ERROR);
+        }
+      }
+      else {
+        ompi_request_cancel(handle->req_array[i]);
+        // If the PML actually canceled the request, then wait on it
+        if( handle->req_array[i]->req_status._cancelled ) {
+          ompi_request_wait(&handle->req_array[i], &status);
+        }
+        // Warn the user that we had to leave a PML message outstanding, so
+        // bad things could happen if they continue using nonblocking collectives
+        else {
+          NBC_Error ("MPI Error: Not able to cancel the internal request %d. "
+                     "Be aware that continuing to use nonblocking collectives on this communicator may result in undefined behavior.", i);
+        }
+      }
+    }
+
+    return OMPI_ERROR;
   }
 #ifdef NBC_TIMING
   Test_time += MPI_Wtime();
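The cancel-and-drain logic above uses Open MPI internals (ompi_request_cancel, req_complete, req_status._cancelled). For readers more familiar with the public API, here is a rough standard-MPI analogue, purely illustrative and not taken from the patch: test each request, cancel those still pending, then wait (the MPI standard guarantees the wait returns once a request is marked for cancellation) and check whether the cancel actually took effect.

/* Illustrative sketch only: a standard-MPI analogue of the internal cleanup.
 * MPI_Wait after MPI_Cancel is guaranteed to return, and MPI_Test_cancelled
 * then reports whether the operation was really cancelled. */
#include <mpi.h>
#include <stdio.h>

static void drain_requests(MPI_Request *reqs, int count)
{
    MPI_Status status;
    int i, done, cancelled;

    for (i = 0; i < count; ++i) {
        MPI_Test(&reqs[i], &done, &status);
        if (done) {
            continue;                 /* already complete, nothing to cancel */
        }
        MPI_Cancel(&reqs[i]);
        MPI_Wait(&reqs[i], &status);  /* guaranteed to return after MPI_Cancel */
        MPI_Test_cancelled(&status, &cancelled);
        if (!cancelled) {
            fprintf(stderr, "request %d completed normally; "
                            "it could not be cancelled\n", i);
        }
    }
}

Internal PML requests do not carry the same guarantee, which is presumably why the patch only waits when req_status._cancelled confirms the cancel, and otherwise warns that a message remains outstanding.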