Fix data corruption in MXM by registering an OPAL memory release hook that removes any mappings created by MXM for the released memory
This commit was SVN r28489.
Этот коммит содержится в:
родитель
f4f07bdb21
Коммит
64d98e0438
@ -15,6 +15,7 @@
|
|||||||
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
|
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
|
||||||
#include "ompi/proc/proc.h"
|
#include "ompi/proc/proc.h"
|
||||||
#include "ompi/communicator/communicator.h"
|
#include "ompi/communicator/communicator.h"
|
||||||
|
#include "opal/memoryhooks/memory.h"
|
||||||
|
|
||||||
#include "mtl_mxm.h"
|
#include "mtl_mxm.h"
|
||||||
#include "mtl_mxm_types.h"
|
#include "mtl_mxm_types.h"
|
||||||
@ -39,7 +40,11 @@ mca_mtl_mxm_module_t ompi_mtl_mxm = {
|
|||||||
ompi_mtl_mxm_cancel,
|
ompi_mtl_mxm_cancel,
|
||||||
ompi_mtl_mxm_add_comm,
|
ompi_mtl_mxm_add_comm,
|
||||||
ompi_mtl_mxm_del_comm
|
ompi_mtl_mxm_del_comm
|
||||||
}
|
},
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
NULL,
|
||||||
|
NULL
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -78,6 +83,10 @@ static uint32_t ompi_mtl_mxm_get_job_id(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
int ompi_mtl_mxm_progress(void);
|
int ompi_mtl_mxm_progress(void);
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
static void ompi_mtl_mxm_mem_release_cb(void *buf, size_t length,
|
||||||
|
void *cbdata, bool from_alloc);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if MXM_API < MXM_VERSION(2, 0)
|
#if MXM_API < MXM_VERSION(2, 0)
|
||||||
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm_ptl_id_t ptlid)
|
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm_ptl_id_t ptlid)
|
||||||
@ -412,11 +421,22 @@ int ompi_mtl_mxm_module_init(void)
|
|||||||
|
|
||||||
/* Register the MXM progress function */
|
/* Register the MXM progress function */
|
||||||
opal_progress_register(ompi_mtl_mxm_progress);
|
opal_progress_register(ompi_mtl_mxm_progress);
|
||||||
|
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
if (ompi_mtl_mxm.using_mem_hooks) {
|
||||||
|
opal_mem_hooks_register_release(ompi_mtl_mxm_mem_release_cb, NULL);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ompi_mtl_mxm_finalize(struct mca_mtl_base_module_t* mtl)
|
int ompi_mtl_mxm_finalize(struct mca_mtl_base_module_t* mtl)
|
||||||
{
|
{
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
if (ompi_mtl_mxm.using_mem_hooks) {
|
||||||
|
opal_mem_hooks_unregister_release(ompi_mtl_mxm_mem_release_cb);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
opal_progress_unregister(ompi_mtl_mxm_progress);
|
opal_progress_unregister(ompi_mtl_mxm_progress);
|
||||||
mxm_ep_destroy(ompi_mtl_mxm.ep);
|
mxm_ep_destroy(ompi_mtl_mxm.ep);
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -566,6 +586,15 @@ int ompi_mtl_mxm_progress(void)
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
static void ompi_mtl_mxm_mem_release_cb(void *buf, size_t length,
|
||||||
|
void *cbdata, bool from_alloc)
|
||||||
|
{
|
||||||
|
mxm_mem_unmap(ompi_mtl_mxm.mxm_context, buf, length,
|
||||||
|
from_alloc ? MXM_MEM_UNMAP_MARK_INVALID : 0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if MXM_API >= MXM_VERSION(1,5)
|
#if MXM_API >= MXM_VERSION(1,5)
|
||||||
OBJ_CLASS_INSTANCE(
|
OBJ_CLASS_INSTANCE(
|
||||||
ompi_mtl_mxm_message_t,
|
ompi_mtl_mxm_message_t,
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "ompi/proc/proc.h"
|
#include "ompi/proc/proc.h"
|
||||||
|
#include "opal/memoryhooks/memory.h"
|
||||||
|
|
||||||
#include "mtl_mxm.h"
|
#include "mtl_mxm.h"
|
||||||
#include "mtl_mxm_types.h"
|
#include "mtl_mxm_types.h"
|
||||||
@ -103,20 +104,35 @@ static int ompi_mtl_mxm_component_open(void)
|
|||||||
return OMPI_ERR_NOT_AVAILABLE;
|
return OMPI_ERR_NOT_AVAILABLE;
|
||||||
}
|
}
|
||||||
#if MXM_API < MXM_VERSION(1,5)
|
#if MXM_API < MXM_VERSION(1,5)
|
||||||
mxm_fill_context_opts(&ompi_mtl_mxm.mxm_opts);
|
mxm_fill_context_opts(&ompi_mtl_mxm.mxm_opts);
|
||||||
err = mxm_init(&ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
|
err = mxm_init(&ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
|
||||||
MXM_VERBOSE(1, "mxm component open");
|
MXM_VERBOSE(1, "mxm component open");
|
||||||
#else
|
#else
|
||||||
err = mxm_config_read_context_opts(&ompi_mtl_mxm.mxm_opts);
|
|
||||||
if (err != MXM_OK) {
|
|
||||||
MXM_ERROR("Failed to parse MXM configuration");
|
|
||||||
return OPAL_ERR_BAD_PARAM;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = mxm_init(ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
MXM_VERBOSE(1, "mxm component open");
|
/* Register memory hooks */
|
||||||
|
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
|
||||||
|
((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) &
|
||||||
|
opal_mem_hooks_support_level()))
|
||||||
|
{
|
||||||
|
setenv("MXM_MEM_ON_DEMAND_MAP", "y", 0);
|
||||||
|
MXM_VERBOSE(1, "Enabling on-demand memory mapping");
|
||||||
|
ompi_mtl_mxm.using_mem_hooks = 1;
|
||||||
|
} else {
|
||||||
|
MXM_VERBOSE(1, "Disabling on-demand memory mapping");
|
||||||
|
ompi_mtl_mxm.using_mem_hooks = 0;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
err = mxm_config_read_context_opts(&ompi_mtl_mxm.mxm_opts);
|
||||||
|
if (err != MXM_OK) {
|
||||||
|
MXM_ERROR("Failed to parse MXM configuration");
|
||||||
|
return OPAL_ERR_BAD_PARAM;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = mxm_init(ompi_mtl_mxm.mxm_opts, &ompi_mtl_mxm.mxm_context);
|
||||||
|
MXM_VERBOSE(1, "mxm component open");
|
||||||
|
#endif
|
||||||
|
|
||||||
if (MXM_OK != err) {
|
if (MXM_OK != err) {
|
||||||
if (MXM_ERR_NO_DEVICE == err) {
|
if (MXM_ERR_NO_DEVICE == err) {
|
||||||
@ -158,8 +174,7 @@ static int ompi_mtl_mxm_component_close(void)
|
|||||||
unsigned long cur_ver;
|
unsigned long cur_ver;
|
||||||
|
|
||||||
cur_ver = mxm_get_version();
|
cur_ver = mxm_get_version();
|
||||||
|
if ((cur_ver == MXM_API) && (ompi_mtl_mxm.mxm_context != NULL)) {
|
||||||
if (cur_ver == MXM_API) {
|
|
||||||
mxm_cleanup(ompi_mtl_mxm.mxm_context);
|
mxm_cleanup(ompi_mtl_mxm.mxm_context);
|
||||||
ompi_mtl_mxm.mxm_context = NULL;
|
ompi_mtl_mxm.mxm_context = NULL;
|
||||||
|
|
||||||
@ -189,4 +204,3 @@ ompi_mtl_mxm_component_init(bool enable_progress_threads,
|
|||||||
sizeof(mca_mtl_mxm_request_t) - sizeof(struct mca_mtl_request_t);
|
sizeof(mca_mtl_mxm_request_t) - sizeof(struct mca_mtl_request_t);
|
||||||
return &ompi_mtl_mxm.super;
|
return &ompi_mtl_mxm.super;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,12 +29,15 @@ typedef struct mca_mtl_mxm_module_t {
|
|||||||
int verbose;
|
int verbose;
|
||||||
int mxm_np;
|
int mxm_np;
|
||||||
mxm_h mxm_context;
|
mxm_h mxm_context;
|
||||||
|
mxm_ep_h ep;
|
||||||
#if MXM_API < MXM_VERSION(1,5)
|
#if MXM_API < MXM_VERSION(1,5)
|
||||||
mxm_context_opts_t mxm_opts;
|
mxm_context_opts_t mxm_opts;
|
||||||
#else
|
#else
|
||||||
mxm_context_opts_t *mxm_opts;
|
mxm_context_opts_t *mxm_opts;
|
||||||
#endif
|
#endif
|
||||||
mxm_ep_h ep;
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
int using_mem_hooks;
|
||||||
|
#endif
|
||||||
} mca_mtl_mxm_module_t;
|
} mca_mtl_mxm_module_t;
|
||||||
|
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user