diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4
index 668b0ff147..044b599dc3 100644
--- a/config/ompi_check_ucx.m4
+++ b/config/ompi_check_ucx.m4
@@ -112,6 +112,9 @@ AC_DEFUN([OMPI_CHECK_UCX],[
                                     ucp_request_check_status, ucp_put_nb, ucp_get_nb],
                         [], [],
                         [#include <ucp/api/ucp.h>])
+                    AC_CHECK_DECLS([ucm_test_events],
+                        [], [],
+                        [#include <ucm/api/ucm.h>])
                     AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND,
                                     UCP_ATOMIC_POST_OP_OR,
                                     UCP_ATOMIC_POST_OP_XOR,
diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c
index fbf51822b0..39be7a4f14 100644
--- a/ompi/mca/pml/ucx/pml_ucx.c
+++ b/ompi/mca/pml/ucx/pml_ucx.c
@@ -422,6 +422,7 @@ int mca_pml_ucx_add_procs(struct ompi_proc_t **procs, size_t nprocs)
         }
     }
 
+    opal_common_ucx_mca_proc_added();
     return OMPI_SUCCESS;
 }
 
diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c
index 84e26b221d..f4d8366f7e 100644
--- a/opal/mca/common/ucx/common_ucx.c
+++ b/opal/mca/common/ucx/common_ucx.c
@@ -132,6 +132,39 @@ static void opal_common_ucx_mca_fence_complete_cb(int status, void *fenced)
     *(int*)fenced = 1;
 }
 
+/**
+ * Probe whether UCX can observe UCM_EVENT_VM_UNMAPPED events and emit a
+ * warning if it cannot.  Called by the UCX PML/SPML proc-management paths.
+ *
+ * Compiles to a no-op unless configure detected ucm_test_events()
+ * (HAVE_DECL_UCM_TEST_EVENTS).  The probe is repeated on every call until
+ * it fails for the first time; after that single warning the function
+ * stays silent for the rest of the process lifetime.
+ */
+void opal_common_ucx_mca_proc_added(void)
+{
+#if HAVE_DECL_UCM_TEST_EVENTS
+    static int warned = 0;
+    /* Appended to the warning only when opal_mem_hooks is not already on. */
+    static const char *mem_hooks_suggestion =
+        "Try to add command line argument "
+        "'--mca opal_common_ucx_opal_mem_hooks 1' to resolve "
+        "this issue.";
+    ucs_status_t status;
+
+    if (!warned) {
+        status = ucm_test_events(UCM_EVENT_VM_UNMAPPED);
+        if (status != UCS_OK) {
+            MCA_COMMON_UCX_WARN("UCX is unable to handle VM_UNMAP event. "
+                                "This may cause performance degradation or data "
+                                "corruption. %s",
+                                opal_common_ucx.opal_mem_hooks ? "" : mem_hooks_suggestion);
+            warned = 1; /* latch: report the problem only once */
+        }
+    }
+#endif
+}
+
 OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
 {
     volatile int fenced = 0;
diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h
index d6ad4d91b1..5730ccf313 100644
--- a/opal/mca/common/ucx/common_ucx.h
+++ b/opal/mca/common/ucx/common_ucx.h
@@ -49,6 +49,11 @@ BEGIN_C_DECLS
                         __FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \
                         " Error: " __VA_ARGS__)
 
+#define MCA_COMMON_UCX_WARN(...) \
+    opal_output_verbose(0, opal_common_ucx.output, \
+                        __FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \
+                        " Warning: " __VA_ARGS__)
+
 #define MCA_COMMON_UCX_VERBOSE(_level, ... ) \
     if (((_level) <= MCA_COMMON_UCX_MAX_VERBOSE) && \
         ((_level) <= opal_common_ucx.verbose)) { \
@@ -101,6 +106,7 @@ extern opal_common_ucx_module_t opal_common_ucx;
 
 OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
 OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
+OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void);
 OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
 OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
 OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c
index 256260b44c..8b27077e6d 100644
--- a/oshmem/mca/spml/ucx/spml_ucx.c
+++ b/oshmem/mca/spml/ucx/spml_ucx.c
@@ -135,6 +135,10 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
 
     mca_spml_ucx_ctx_default.ucp_peers = NULL;
 
+    /* NOTE(review): despite its name, the "proc_added" memory-hook check is
+     * invoked from del_procs here -- confirm this is the intended call site. */
+    opal_common_ucx_mca_proc_added();
+
     return ret;
 }
 