1
1

Fix data corruption in MXM by registering for OPAL memory release hooks and removing any mappings created by MXM

This commit was SVN r28489.
Этот коммит содержится в:
Yossi Etigin 2013-05-14 12:27:44 +00:00
родитель f4f07bdb21
Коммит 64d98e0438
3 изменённых файлов: 61 добавлений и 15 удалений

Просмотреть файл

@ -15,6 +15,7 @@
#include "ompi/mca/mtl/base/mtl_base_datatype.h" #include "ompi/mca/mtl/base/mtl_base_datatype.h"
#include "ompi/proc/proc.h" #include "ompi/proc/proc.h"
#include "ompi/communicator/communicator.h" #include "ompi/communicator/communicator.h"
#include "opal/memoryhooks/memory.h"
#include "mtl_mxm.h" #include "mtl_mxm.h"
#include "mtl_mxm_types.h" #include "mtl_mxm_types.h"
@ -39,7 +40,11 @@ mca_mtl_mxm_module_t ompi_mtl_mxm = {
ompi_mtl_mxm_cancel, ompi_mtl_mxm_cancel,
ompi_mtl_mxm_add_comm, ompi_mtl_mxm_add_comm,
ompi_mtl_mxm_del_comm ompi_mtl_mxm_del_comm
} },
0,
0,
NULL,
NULL
}; };
@ -78,6 +83,10 @@ static uint32_t ompi_mtl_mxm_get_job_id(void)
} }
int ompi_mtl_mxm_progress(void); int ompi_mtl_mxm_progress(void);
#if MXM_API >= MXM_VERSION(2,0)
static void ompi_mtl_mxm_mem_release_cb(void *buf, size_t length,
void *cbdata, bool from_alloc);
#endif
#if MXM_API < MXM_VERSION(2, 0) #if MXM_API < MXM_VERSION(2, 0)
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm_ptl_id_t ptlid) static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm_ptl_id_t ptlid)
@ -412,11 +421,22 @@ int ompi_mtl_mxm_module_init(void)
/* Register the MXM progress function */ /* Register the MXM progress function */
opal_progress_register(ompi_mtl_mxm_progress); opal_progress_register(ompi_mtl_mxm_progress);
#if MXM_API >= MXM_VERSION(2,0)
if (ompi_mtl_mxm.using_mem_hooks) {
opal_mem_hooks_register_release(ompi_mtl_mxm_mem_release_cb, NULL);
}
#endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
int ompi_mtl_mxm_finalize(struct mca_mtl_base_module_t* mtl) int ompi_mtl_mxm_finalize(struct mca_mtl_base_module_t* mtl)
{ {
#if MXM_API >= MXM_VERSION(2,0)
if (ompi_mtl_mxm.using_mem_hooks) {
opal_mem_hooks_unregister_release(ompi_mtl_mxm_mem_release_cb);
}
#endif
opal_progress_unregister(ompi_mtl_mxm_progress); opal_progress_unregister(ompi_mtl_mxm_progress);
mxm_ep_destroy(ompi_mtl_mxm.ep); mxm_ep_destroy(ompi_mtl_mxm.ep);
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -566,6 +586,15 @@ int ompi_mtl_mxm_progress(void)
return 1; return 1;
} }
#if MXM_API >= MXM_VERSION(2,0)
/*
 * Memory-release callback handed to opal_mem_hooks_register_release().
 *
 * Forwards the reported region to mxm_mem_unmap() on the module's MXM
 * context.
 *
 * buf        - region address, passed through unchanged
 * length     - region length, passed through unchanged
 * cbdata     - callback cookie; not used by this callback
 * from_alloc - when true the unmap is issued with
 *              MXM_MEM_UNMAP_MARK_INVALID, otherwise with no flags (0)
 */
static void ompi_mtl_mxm_mem_release_cb(void *buf, size_t length,
                                        void *cbdata, bool from_alloc)
{
    int unmap_flags = 0;

    (void)cbdata; /* cookie is not needed here */

    if (from_alloc) {
        unmap_flags = MXM_MEM_UNMAP_MARK_INVALID;
    }

    mxm_mem_unmap(ompi_mtl_mxm.mxm_context, buf, length, unmap_flags);
}
#endif
#if MXM_API >= MXM_VERSION(1,5) #if MXM_API >= MXM_VERSION(1,5)
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
ompi_mtl_mxm_message_t, ompi_mtl_mxm_message_t,

Просмотреть файл

@ -11,6 +11,7 @@
#include "opal/util/output.h" #include "opal/util/output.h"
#include "ompi/proc/proc.h" #include "ompi/proc/proc.h"
#include "opal/memoryhooks/memory.h"
#include "mtl_mxm.h" #include "mtl_mxm.h"
#include "mtl_mxm_types.h" #include "mtl_mxm_types.h"
@ -103,20 +104,35 @@ static int ompi_mtl_mxm_component_open(void)
return OMPI_ERR_NOT_AVAILABLE; return OMPI_ERR_NOT_AVAILABLE;
} }
#if MXM_API < MXM_VERSION(1,5) #if MXM_API < MXM_VERSION(1,5)
mxm_fill_context_opts(&ompi_mtl_mxm.mxm_opts); mxm_fill_context_opts(&ompi_mtl_mxm.mxm_opts);
err = mxm_init(&ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context); err = mxm_init(&ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
MXM_VERBOSE(1, "mxm component open"); MXM_VERBOSE(1, "mxm component open");
#else #else
err = mxm_config_read_context_opts(&ompi_mtl_mxm.mxm_opts);
if (err != MXM_OK) {
MXM_ERROR("Failed to parse MXM configuration");
return OPAL_ERR_BAD_PARAM;
}
err = mxm_init(ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context); #if MXM_API >= MXM_VERSION(2,0)
MXM_VERBOSE(1, "mxm component open"); /* Register memory hooks */
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) &
opal_mem_hooks_support_level()))
{
setenv("MXM_MEM_ON_DEMAND_MAP", "y", 0);
MXM_VERBOSE(1, "Enabling on-demand memory mapping");
ompi_mtl_mxm.using_mem_hooks = 1;
} else {
MXM_VERBOSE(1, "Disabling on-demand memory mapping");
ompi_mtl_mxm.using_mem_hooks = 0;
}
#endif #endif
err = mxm_config_read_context_opts(&ompi_mtl_mxm.mxm_opts);
if (err != MXM_OK) {
MXM_ERROR("Failed to parse MXM configuration");
return OPAL_ERR_BAD_PARAM;
}
err = mxm_init(ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
MXM_VERBOSE(1, "mxm component open");
#endif
if (MXM_OK != err) { if (MXM_OK != err) {
if (MXM_ERR_NO_DEVICE == err) { if (MXM_ERR_NO_DEVICE == err) {
@ -158,8 +174,7 @@ static int ompi_mtl_mxm_component_close(void)
unsigned long cur_ver; unsigned long cur_ver;
cur_ver = mxm_get_version(); cur_ver = mxm_get_version();
if ((cur_ver == MXM_API) && (ompi_mtl_mxm.mxm_context != NULL)) {
if (cur_ver == MXM_API) {
mxm_cleanup(ompi_mtl_mxm.mxm_context); mxm_cleanup(ompi_mtl_mxm.mxm_context);
ompi_mtl_mxm.mxm_context = NULL; ompi_mtl_mxm.mxm_context = NULL;
@ -189,4 +204,3 @@ ompi_mtl_mxm_component_init(bool enable_progress_threads,
sizeof(mca_mtl_mxm_request_t) - sizeof(struct mca_mtl_request_t); sizeof(mca_mtl_mxm_request_t) - sizeof(struct mca_mtl_request_t);
return &ompi_mtl_mxm.super; return &ompi_mtl_mxm.super;
} }

Просмотреть файл

@ -29,12 +29,15 @@ typedef struct mca_mtl_mxm_module_t {
int verbose; int verbose;
int mxm_np; int mxm_np;
mxm_h mxm_context; mxm_h mxm_context;
mxm_ep_h ep;
#if MXM_API < MXM_VERSION(1,5) #if MXM_API < MXM_VERSION(1,5)
mxm_context_opts_t mxm_opts; mxm_context_opts_t mxm_opts;
#else #else
mxm_context_opts_t *mxm_opts; mxm_context_opts_t *mxm_opts;
#endif #endif
mxm_ep_h ep; #if MXM_API >= MXM_VERSION(2,0)
int using_mem_hooks;
#endif
} mca_mtl_mxm_module_t; } mca_mtl_mxm_module_t;