coll/ml: better error handling
When CHECK_AND_RECYCLE detects an error, a message is displayed. If the error occurs on an intrinsic communicator, the program then aborts (instead of trying to free the communicator). cmr=v1.8.3:reviewer=hjelmn This commit was SVN r32659.
Этот коммит содержится в:
родитель
c2bcda518f
Коммит
edfbeba7bf
@ -4,6 +4,8 @@
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -424,9 +426,19 @@ do {
|
||||
/* release potentially may trigger ML module destruction and having */ \
|
||||
/* the element not on the list may cause a memory leak. */ \
|
||||
if (OPAL_UNLIKELY(is_coll_sync)) { \
|
||||
OBJ_RELEASE(comm); \
|
||||
/* After this point it is UNSAFE to touch ml module */ \
|
||||
/* or communicator */ \
|
||||
if (OMPI_COMM_IS_INTRINSIC(comm)) { \
|
||||
opal_show_help("help-mpi-coll-ml.txt", \
|
||||
"coll-ml-check-fatal-error", true, \
|
||||
comm->c_name); \
|
||||
ompi_mpi_abort(comm, 6); \
|
||||
} else { \
|
||||
opal_show_help("help-mpi-coll-ml.txt", \
|
||||
"coll-ml-check-error", true, \
|
||||
comm->c_name); \
|
||||
/* After this point it is UNSAFE to touch ml module */ \
|
||||
/* or communicator */ \
|
||||
OBJ_RELEASE(comm); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
@ -1,6 +1,8 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2014 Oak Ridge National Laboratory. All rights reserved.
|
||||
# Copyright (c) 2014 Research Organization for Information Science
|
||||
# and Technology (RIST). All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -48,3 +50,15 @@ zero copy method.
|
||||
|
||||
ML could not be used because the mca param coll_ml_bcast_algorithm
|
||||
was not set to static and other broadcast implementation was available.
|
||||
|
||||
[coll-ml-check-error]
|
||||
|
||||
ML detected an error on communicator %s
|
||||
|
||||
This communicator cannot be used any more
|
||||
|
||||
[coll-ml-check-fatal-error]
|
||||
|
||||
ML detected an unrecoverable error on intrinsic communicator %s
|
||||
|
||||
The program will now abort
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user