PML/SPML/UCX: added evaluation of mmap events
- there was a set of UCX-related issues reported which were caused by mmap API hook conflicts. We added diagnostics for such problems to simplify the bug-resolving pipeline. Signed-off-by: Sergey Oblomov <sergeyo@mellanox.com>
Этот коммит содержится в:
родитель
fd9ed9cabb
Коммит
d8e3562bae
@ -112,6 +112,9 @@ AC_DEFUN([OMPI_CHECK_UCX],[
|
|||||||
ucp_request_check_status, ucp_put_nb, ucp_get_nb],
|
ucp_request_check_status, ucp_put_nb, ucp_get_nb],
|
||||||
[], [],
|
[], [],
|
||||||
[#include <ucp/api/ucp.h>])
|
[#include <ucp/api/ucp.h>])
|
||||||
|
AC_CHECK_DECLS([ucm_test_events],
|
||||||
|
[], [],
|
||||||
|
[#include <ucm/api/ucm.h>])
|
||||||
AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND,
|
AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND,
|
||||||
UCP_ATOMIC_POST_OP_OR,
|
UCP_ATOMIC_POST_OP_OR,
|
||||||
UCP_ATOMIC_POST_OP_XOR,
|
UCP_ATOMIC_POST_OP_XOR,
|
||||||
|
@ -422,6 +422,7 @@ int mca_pml_ucx_add_procs(struct ompi_proc_t **procs, size_t nprocs)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opal_common_ucx_mca_proc_added();
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -132,6 +132,28 @@ static void opal_common_ucx_mca_fence_complete_cb(int status, void *fenced)
|
|||||||
*(int*)fenced = 1;
|
*(int*)fenced = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Diagnose whether UCX reports success when asked to test the
 * UCM_EVENT_VM_UNMAPPED event, and warn if it does not.
 *
 * The body is compiled only when configure detected a declaration of
 * ucm_test_events() (HAVE_DECL_UCM_TEST_EVENTS is non-zero); otherwise this
 * function does nothing.
 *
 * The test is repeated on every call until it fails once. After the first
 * failure a single warning is emitted and no further tests are made. When
 * the opal_mem_hooks setting is off, the warning also carries a hint on how
 * to turn it on.
 */
void opal_common_ucx_mca_proc_added(void)
{
#if HAVE_DECL_UCM_TEST_EVENTS
    /* Set after the first reported failure so the warning appears only once. */
    static int warned = 0;
    /* The string literal is never written through, hence the const pointee. */
    static const char *mem_hooks_suggestion = "Try to add command line argument "
                                              "'--mca opal_common_ucx_opal_mem_hooks 1' to resolve "
                                              "this issue.";
    ucs_status_t status;

    if (!warned) {
        status = ucm_test_events(UCM_EVENT_VM_UNMAPPED);
        if (status != UCS_OK) {
            /* Append the hint only when opal_mem_hooks is not already enabled. */
            MCA_COMMON_UCX_WARN("UCX is unable to handle VM_UNMAP event. "
                                "This may cause performance degradation or data "
                                "corruption. %s",
                                opal_common_ucx.opal_mem_hooks ? "" : mem_hooks_suggestion);
            warned = 1;
        }
    }
#endif
}
|
||||||
|
|
||||||
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
|
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
|
||||||
{
|
{
|
||||||
volatile int fenced = 0;
|
volatile int fenced = 0;
|
||||||
|
@ -49,6 +49,11 @@ BEGIN_C_DECLS
|
|||||||
__FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \
|
__FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \
|
||||||
" Error: " __VA_ARGS__)
|
" Error: " __VA_ARGS__)
|
||||||
|
|
||||||
|
/*
 * Emit a printf-style warning via opal_output_verbose() at level 0 on the
 * common UCX output stream. The caller's format string is prefixed with
 * __FILE__, ":", MCA_COMMON_UCX_QUOTE(__LINE__) and the literal " Warning: ",
 * so the first macro argument must be a string literal.
 */
#define MCA_COMMON_UCX_WARN(...)                                            \
    opal_output_verbose(0, opal_common_ucx.output,                          \
                        __FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__)         \
                        " Warning: " __VA_ARGS__)
|
||||||
|
|
||||||
#define MCA_COMMON_UCX_VERBOSE(_level, ... ) \
|
#define MCA_COMMON_UCX_VERBOSE(_level, ... ) \
|
||||||
if (((_level) <= MCA_COMMON_UCX_MAX_VERBOSE) && \
|
if (((_level) <= MCA_COMMON_UCX_MAX_VERBOSE) && \
|
||||||
((_level) <= opal_common_ucx.verbose)) { \
|
((_level) <= opal_common_ucx.verbose)) { \
|
||||||
@ -101,6 +106,7 @@ extern opal_common_ucx_module_t opal_common_ucx;
|
|||||||
|
|
||||||
OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
|
OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
|
||||||
OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
|
OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
|
||||||
|
OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void);
|
||||||
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
|
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
|
||||||
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
||||||
OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
|
OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
|
||||||
|
@ -135,6 +135,8 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
|
|||||||
|
|
||||||
mca_spml_ucx_ctx_default.ucp_peers = NULL;
|
mca_spml_ucx_ctx_default.ucp_peers = NULL;
|
||||||
|
|
||||||
|
opal_common_ucx_mca_proc_added();
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user