Make mpool fail-to-unregister-freed-memory errors fatal. Also try to
make that routine a bit safer (ensure that it does not call malloc and friends if from_alloc==true). This commit was SVN r20984.
Этот коммит содержится в:
родитель
b5deb228f3
Коммит
778c8c86d2
@ -47,3 +47,14 @@ to a slower network transport (such as TCP).
|
||||
Mpool name: %s
|
||||
Process: %s
|
||||
Local host: %s
|
||||
#
|
||||
[cannot deregister in-use memory]
|
||||
Open MPI intercepted a call to free memory that is still being used by
|
||||
an ongoing MPI communication. This usually reflects an error in the
|
||||
MPI application; it may signify memory corruption. Open MPI will now
|
||||
abort your job.
|
||||
|
||||
Mpool name: %s
|
||||
Local host: %s
|
||||
Buffer address: %p
|
||||
Buffer size: %lu
|
||||
|
@ -22,14 +22,27 @@
|
||||
*/
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "mpool_base_mem_cb.h"
|
||||
#include "base.h"
|
||||
|
||||
#include "orte/types.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "ompi/mca/mpool/base/mpool_base_mem_cb.h"
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "ompi/runtime/mpiruntime.h"
|
||||
|
||||
|
||||
static char msg[512];
|
||||
|
||||
|
||||
/*
|
||||
* memory hook callback, called when memory is free'd out from under us
|
||||
* memory hook callback, called when memory is free'd out from under
|
||||
* us. Be wary of the from_alloc flag -- if you're called with
|
||||
* from_alloc==true, then you cannot call malloc (or any of its
|
||||
* friends)!
|
||||
*/
|
||||
void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
|
||||
bool from_alloc)
|
||||
@ -37,14 +50,17 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
|
||||
mca_mpool_base_selected_module_t* current;
|
||||
int rc;
|
||||
opal_list_item_t* item;
|
||||
if(size == 0) {
|
||||
|
||||
/* Do nothing if size == 0, or if we were called from an allocation
|
||||
hook (from_alloc) outside of the MPI_INIT..MPI_FINALIZE window */
|
||||
if ((from_alloc && (!ompi_mpi_initialized || ompi_mpi_finalized)) ||
|
||||
size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
for(item = opal_list_get_first(&mca_mpool_base_modules);
|
||||
item != opal_list_get_end(&mca_mpool_base_modules);
|
||||
item = opal_list_get_next(item)) {
|
||||
bool warn = true;
|
||||
|
||||
current = (mca_mpool_base_selected_module_t*) item;
|
||||
|
||||
@ -52,11 +68,30 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
|
||||
rc = current->mpool_module->mpool_release_memory(current->mpool_module,
|
||||
base, size);
|
||||
|
||||
if(rc != OMPI_SUCCESS && true == warn) {
|
||||
opal_output(0, "Memory %p:%lu cannot be freed from the "
|
||||
"registration cache. Possible memory corruption.\n",
|
||||
base, (unsigned long)size);
|
||||
warn = false;
|
||||
if (rc != OMPI_SUCCESS) {
|
||||
if (from_alloc) {
|
||||
int len;
|
||||
len = snprintf(msg, sizeof(msg), "[%s:%d] Attempt to free memory that is still in use by an ongoing MPI communication (buffer %p, size %lu). MPI job will now abort.\n",
|
||||
orte_process_info.nodename,
|
||||
getpid(),
|
||||
base, (unsigned long) size);
|
||||
msg[sizeof(msg) - 1] = '\0';
|
||||
write(2, msg, len);
|
||||
} else {
|
||||
orte_show_help("help-mpool-base.txt",
|
||||
"cannot deregister in-use memory", true,
|
||||
current->mpool_component->mpool_version.mca_component_name,
|
||||
orte_process_info.nodename,
|
||||
base, (unsigned long) size);
|
||||
}
|
||||
|
||||
/* We're in a callback from somewhere; we can't do
|
||||
anything meaningful to pass an error back up. :-(
|
||||
So just exit. Call _exit() so that we don't try to
|
||||
call anything on the way out -- just exit!
|
||||
(remember that we're in a callback, and state may
|
||||
be very undefined at this point...) */
|
||||
_exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user