Make mpool fail-to-unregister-freed-memory errors be fatal. Try to
make that routine a bit more safe, too (ensure to not call malloc and friends if from_alloc==true). This commit was SVN r20984.
Этот коммит содержится в:
родитель
b5deb228f3
Коммит
778c8c86d2
@ -47,3 +47,14 @@ to a slower network transport (such as TCP).
|
|||||||
Mpool name: %s
|
Mpool name: %s
|
||||||
Process: %s
|
Process: %s
|
||||||
Local host: %s
|
Local host: %s
|
||||||
|
#
|
||||||
|
[cannot deregister in-use memory]
|
||||||
|
Open MPI intercepted a call to free memory that is still being used by
|
||||||
|
an ongoing MPI communication. This usually reflects an error in the
|
||||||
|
MPI application; it may signify memory corruption. Open MPI will now
|
||||||
|
abort your job.
|
||||||
|
|
||||||
|
Mpool name: %s
|
||||||
|
Local host: %s
|
||||||
|
Buffer address: %p
|
||||||
|
Buffer size: %lu
|
||||||
|
@ -22,14 +22,27 @@
|
|||||||
*/
|
*/
|
||||||
#include "ompi_config.h"
|
#include "ompi_config.h"
|
||||||
|
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
#include "opal/class/opal_pointer_array.h"
|
#include "opal/class/opal_pointer_array.h"
|
||||||
#include "mpool_base_mem_cb.h"
|
|
||||||
#include "base.h"
|
|
||||||
#include "orte/types.h"
|
#include "orte/types.h"
|
||||||
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/proc_info.h"
|
||||||
|
|
||||||
|
#include "ompi/mca/mpool/base/mpool_base_mem_cb.h"
|
||||||
|
#include "ompi/mca/mpool/base/base.h"
|
||||||
|
#include "ompi/runtime/mpiruntime.h"
|
||||||
|
|
||||||
|
|
||||||
|
static char msg[512];
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* memory hook callback, called when memory is free'd out from under us
|
* memory hook callback, called when memory is free'd out from under
|
||||||
|
* us. Be wary of the from_alloc flag -- if you're called with
|
||||||
|
* from_alloc==true, then you cannot call malloc (or any of its
|
||||||
|
* friends)!
|
||||||
*/
|
*/
|
||||||
void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
|
void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
|
||||||
bool from_alloc)
|
bool from_alloc)
|
||||||
@ -37,14 +50,17 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
|
|||||||
mca_mpool_base_selected_module_t* current;
|
mca_mpool_base_selected_module_t* current;
|
||||||
int rc;
|
int rc;
|
||||||
opal_list_item_t* item;
|
opal_list_item_t* item;
|
||||||
if(size == 0) {
|
|
||||||
|
/* Only do anything meaningful if we're between MPI_INIT and
|
||||||
|
MPI_FINALIZE, and size != 0 */
|
||||||
|
if ((from_alloc && (!ompi_mpi_initialized || ompi_mpi_finalized)) ||
|
||||||
|
size == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(item = opal_list_get_first(&mca_mpool_base_modules);
|
for(item = opal_list_get_first(&mca_mpool_base_modules);
|
||||||
item != opal_list_get_end(&mca_mpool_base_modules);
|
item != opal_list_get_end(&mca_mpool_base_modules);
|
||||||
item = opal_list_get_next(item)) {
|
item = opal_list_get_next(item)) {
|
||||||
bool warn = true;
|
|
||||||
|
|
||||||
current = (mca_mpool_base_selected_module_t*) item;
|
current = (mca_mpool_base_selected_module_t*) item;
|
||||||
|
|
||||||
@ -52,11 +68,30 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
|
|||||||
rc = current->mpool_module->mpool_release_memory(current->mpool_module,
|
rc = current->mpool_module->mpool_release_memory(current->mpool_module,
|
||||||
base, size);
|
base, size);
|
||||||
|
|
||||||
if(rc != OMPI_SUCCESS && true == warn) {
|
if (rc != OMPI_SUCCESS) {
|
||||||
opal_output(0, "Memory %p:%lu cannot be freed from the "
|
if (from_alloc) {
|
||||||
"registration cache. Possible memory corruption.\n",
|
int len;
|
||||||
|
len = snprintf(msg, sizeof(msg), "[%s:%d] Attempt to free memory that is still in use by an ongoing MPI communication (buffer %p, size %lu). MPI job will now abort.\n",
|
||||||
|
orte_process_info.nodename,
|
||||||
|
getpid(),
|
||||||
base, (unsigned long) size);
|
base, (unsigned long) size);
|
||||||
warn = false;
|
msg[sizeof(msg) - 1] = '\0';
|
||||||
|
write(2, msg, len);
|
||||||
|
} else {
|
||||||
|
orte_show_help("help-mpool-base.txt",
|
||||||
|
"cannot deregister in-use memory", true,
|
||||||
|
current->mpool_component->mpool_version.mca_component_name,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
base, (unsigned long) size);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* We're in a callback from somewhere; we can't do
|
||||||
|
anything meaningful to pass an error back up. :-(
|
||||||
|
So just exit. Call _exit() so that we don't try to
|
||||||
|
call anything on the way out -- just exit!
|
||||||
|
(remember that we're in a callback, and state may
|
||||||
|
be very undefined at this point...) */
|
||||||
|
_exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user