1
1

Make mpool fail-to-unregister-freed-memory errors be fatal. Try to

make that routine a bit more safe, too (ensure to not call malloc and
friends if from_alloc==true).

This commit was SVN r20984.
Этот коммит содержится в:
Jeff Squyres 2009-04-14 00:54:20 +00:00
родитель b5deb228f3
Коммит 778c8c86d2
2 изменённых файлов: 56 добавлений и 10 удалений

Просмотреть файл

@ -47,3 +47,14 @@ to a slower network transport (such as TCP).
Mpool name: %s
Process: %s
Local host: %s
#
[cannot deregister in-use memory]
Open MPI intercepted a call to free memory that is still being used by
an ongoing MPI communication. This usually reflects an error in the
MPI application; it may signify memory corruption. Open MPI will now
abort your job.
Mpool name: %s
Local host: %s
Buffer address: %p
Buffer size: %lu

Просмотреть файл

@ -22,14 +22,27 @@
*/
#include "ompi_config.h"
#include <unistd.h>
#include "opal/class/opal_pointer_array.h"
#include "mpool_base_mem_cb.h"
#include "base.h"
#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "ompi/mca/mpool/base/mpool_base_mem_cb.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/runtime/mpiruntime.h"
static char msg[512];
/*
* memory hook callback, called when memory is free'd out from under us
* memory hook callback, called when memory is free'd out from under
* us. Be wary of the from_alloc flag -- if you're called with
* from_alloc==true, then you cannot call malloc (or any of its
* friends)!
*/
void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
bool from_alloc)
@ -37,14 +50,17 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
mca_mpool_base_selected_module_t* current;
int rc;
opal_list_item_t* item;
if(size == 0) {
/* Only do anything meaningful if we're between MPI_INIT and
MPI_FINALIZE, and size != 0 */
if ((from_alloc && (!ompi_mpi_initialized || ompi_mpi_finalized)) ||
size == 0) {
return;
}
for(item = opal_list_get_first(&mca_mpool_base_modules);
item != opal_list_get_end(&mca_mpool_base_modules);
item = opal_list_get_next(item)) {
bool warn = true;
current = (mca_mpool_base_selected_module_t*) item;
@ -52,11 +68,30 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata,
rc = current->mpool_module->mpool_release_memory(current->mpool_module,
base, size);
if(rc != OMPI_SUCCESS && true == warn) {
opal_output(0, "Memory %p:%lu cannot be freed from the "
"registration cache. Possible memory corruption.\n",
base, (unsigned long)size);
warn = false;
if (rc != OMPI_SUCCESS) {
if (from_alloc) {
int len;
len = snprintf(msg, sizeof(msg), "[%s:%d] Attempt to free memory that is still in use by an ongoing MPI communication (buffer %p, size %lu). MPI job will now abort.\n",
orte_process_info.nodename,
getpid(),
base, (unsigned long) size);
msg[sizeof(msg) - 1] = '\0';
write(2, msg, len);
} else {
orte_show_help("help-mpool-base.txt",
"cannot deregister in-use memory", true,
current->mpool_component->mpool_version.mca_component_name,
orte_process_info.nodename,
base, (unsigned long) size);
}
/* We're in a callback from somewhere; we can't do
anything meaningful to pass an error back up. :-(
So just exit. Call _exit() so that we don't try to
call anything on the way out -- just exit!
(remember that we're in a callback, and state may
be very undefined at this point...) */
_exit(1);
}
}
}