Fix data corruption in MXM by registering with the OPAL memory release hooks and removing any mappings created by MXM.
This commit was SVN r28489.
Этот коммит содержится в:
родитель
f4f07bdb21
Коммит
64d98e0438
@ -15,6 +15,7 @@
|
||||
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "opal/memoryhooks/memory.h"
|
||||
|
||||
#include "mtl_mxm.h"
|
||||
#include "mtl_mxm_types.h"
|
||||
@ -39,7 +40,11 @@ mca_mtl_mxm_module_t ompi_mtl_mxm = {
|
||||
ompi_mtl_mxm_cancel,
|
||||
ompi_mtl_mxm_add_comm,
|
||||
ompi_mtl_mxm_del_comm
|
||||
}
|
||||
},
|
||||
0,
|
||||
0,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
||||
@ -78,6 +83,10 @@ static uint32_t ompi_mtl_mxm_get_job_id(void)
|
||||
}
|
||||
|
||||
int ompi_mtl_mxm_progress(void);
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
static void ompi_mtl_mxm_mem_release_cb(void *buf, size_t length,
|
||||
void *cbdata, bool from_alloc);
|
||||
#endif
|
||||
|
||||
#if MXM_API < MXM_VERSION(2, 0)
|
||||
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm_ptl_id_t ptlid)
|
||||
@ -412,11 +421,22 @@ int ompi_mtl_mxm_module_init(void)
|
||||
|
||||
/* Register the MXM progress function */
|
||||
opal_progress_register(ompi_mtl_mxm_progress);
|
||||
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
if (ompi_mtl_mxm.using_mem_hooks) {
|
||||
opal_mem_hooks_register_release(ompi_mtl_mxm_mem_release_cb, NULL);
|
||||
}
|
||||
#endif
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_mtl_mxm_finalize(struct mca_mtl_base_module_t* mtl)
|
||||
{
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
if (ompi_mtl_mxm.using_mem_hooks) {
|
||||
opal_mem_hooks_unregister_release(ompi_mtl_mxm_mem_release_cb);
|
||||
}
|
||||
#endif
|
||||
opal_progress_unregister(ompi_mtl_mxm_progress);
|
||||
mxm_ep_destroy(ompi_mtl_mxm.ep);
|
||||
return OMPI_SUCCESS;
|
||||
@ -566,6 +586,15 @@ int ompi_mtl_mxm_progress(void)
|
||||
return 1;
|
||||
}
|
||||
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
/*
 * Memory release callback for the MXM MTL (compiled only for MXM API >= 2.0).
 *
 * Forwards the released region to mxm_mem_unmap() on the module's MXM context.
 *
 * buf        - start address of the region being released
 * length     - size of the region in bytes
 * cbdata     - callback cookie; not used by this function
 * from_alloc - when true, MXM_MEM_UNMAP_MARK_INVALID is passed as the unmap
 *              flag; otherwise the flag argument is 0
 *
 * NOTE(review): the return value of mxm_mem_unmap(), if any, is not checked
 * here - confirm against the MXM API whether failures need handling.
 */
static void ompi_mtl_mxm_mem_release_cb(void *buf, size_t length,
|
||||
void *cbdata, bool from_alloc)
|
||||
{
|
||||
mxm_mem_unmap(ompi_mtl_mxm.mxm_context, buf, length,
|
||||
from_alloc ? MXM_MEM_UNMAP_MARK_INVALID : 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if MXM_API >= MXM_VERSION(1,5)
|
||||
OBJ_CLASS_INSTANCE(
|
||||
ompi_mtl_mxm_message_t,
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "opal/memoryhooks/memory.h"
|
||||
|
||||
#include "mtl_mxm.h"
|
||||
#include "mtl_mxm_types.h"
|
||||
@ -103,20 +104,35 @@ static int ompi_mtl_mxm_component_open(void)
|
||||
return OMPI_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
#if MXM_API < MXM_VERSION(1,5)
|
||||
mxm_fill_context_opts(&ompi_mtl_mxm.mxm_opts);
|
||||
err = mxm_init(&ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
|
||||
MXM_VERBOSE(1, "mxm component open");
|
||||
mxm_fill_context_opts(&ompi_mtl_mxm.mxm_opts);
|
||||
err = mxm_init(&ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
|
||||
MXM_VERBOSE(1, "mxm component open");
|
||||
#else
|
||||
err = mxm_config_read_context_opts(&ompi_mtl_mxm.mxm_opts);
|
||||
if (err != MXM_OK) {
|
||||
MXM_ERROR("Failed to parse MXM configuration");
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
err = mxm_init(ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
|
||||
MXM_VERBOSE(1, "mxm component open");
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
/* Register memory hooks */
|
||||
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
|
||||
((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) &
|
||||
opal_mem_hooks_support_level()))
|
||||
{
|
||||
setenv("MXM_MEM_ON_DEMAND_MAP", "y", 0);
|
||||
MXM_VERBOSE(1, "Enabling on-demand memory mapping");
|
||||
ompi_mtl_mxm.using_mem_hooks = 1;
|
||||
} else {
|
||||
MXM_VERBOSE(1, "Disabling on-demand memory mapping");
|
||||
ompi_mtl_mxm.using_mem_hooks = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
err = mxm_config_read_context_opts(&ompi_mtl_mxm.mxm_opts);
|
||||
if (err != MXM_OK) {
|
||||
MXM_ERROR("Failed to parse MXM configuration");
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
err = mxm_init(ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
|
||||
MXM_VERBOSE(1, "mxm component open");
|
||||
#endif
|
||||
|
||||
if (MXM_OK != err) {
|
||||
if (MXM_ERR_NO_DEVICE == err) {
|
||||
@ -158,8 +174,7 @@ static int ompi_mtl_mxm_component_close(void)
|
||||
unsigned long cur_ver;
|
||||
|
||||
cur_ver = mxm_get_version();
|
||||
|
||||
if (cur_ver == MXM_API) {
|
||||
if ((cur_ver == MXM_API) && (ompi_mtl_mxm.mxm_context != NULL)) {
|
||||
mxm_cleanup(ompi_mtl_mxm.mxm_context);
|
||||
ompi_mtl_mxm.mxm_context = NULL;
|
||||
|
||||
@ -189,4 +204,3 @@ ompi_mtl_mxm_component_init(bool enable_progress_threads,
|
||||
sizeof(mca_mtl_mxm_request_t) - sizeof(struct mca_mtl_request_t);
|
||||
return &ompi_mtl_mxm.super;
|
||||
}
|
||||
|
||||
|
@ -29,12 +29,15 @@ typedef struct mca_mtl_mxm_module_t {
|
||||
int verbose;
|
||||
int mxm_np;
|
||||
mxm_h mxm_context;
|
||||
mxm_ep_h ep;
|
||||
#if MXM_API < MXM_VERSION(1,5)
|
||||
mxm_context_opts_t mxm_opts;
|
||||
#else
|
||||
mxm_context_opts_t *mxm_opts;
|
||||
#endif
|
||||
mxm_ep_h ep;
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
int using_mem_hooks;
|
||||
#endif
|
||||
} mca_mtl_mxm_module_t;
|
||||
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user