diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index 668b0ff147..044b599dc3 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -112,6 +112,9 @@ AC_DEFUN([OMPI_CHECK_UCX],[ ucp_request_check_status, ucp_put_nb, ucp_get_nb], [], [], [#include ]) + AC_CHECK_DECLS([ucm_test_events], + [], [], + [#include ]) AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND, UCP_ATOMIC_POST_OP_OR, UCP_ATOMIC_POST_OP_XOR, diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 00a95644c2..e2a614e242 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -422,6 +422,7 @@ int mca_pml_ucx_add_procs(struct ompi_proc_t **procs, size_t nprocs) } } + opal_common_ucx_mca_proc_added(); return OMPI_SUCCESS; } diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 2213f118eb..69f8b0c467 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -132,6 +132,27 @@ static void opal_common_ucx_mca_fence_complete_cb(int status, void *fenced) *(int*)fenced = 1; } +void opal_common_ucx_mca_proc_added(void) +{ +#if HAVE_DECL_UCM_TEST_EVENTS + static int warned = 0; + static char *mem_hooks_suggestion = "Pls try adding --mca opal_common_ucx_opal_mem_hooks 1 " + "to mpirun/oshrun command line to resolve this issue."; + ucs_status_t status; + + if (!warned) { + status = ucm_test_events(UCM_EVENT_VM_UNMAPPED); + if (status != UCS_OK) { + MCA_COMMON_UCX_WARN("UCX is unable to handle VM_UNMAP event. " + "This may cause performance degradation or data " + "corruption. %s", + opal_common_ucx.opal_mem_hooks ? "" : mem_hooks_suggestion); + warned = 1; + } + } +#endif +} + OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker) { volatile int fenced = 0; diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 0cf46e5c28..7db964447e 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -39,10 +39,11 @@ BEGIN_C_DECLS #define MCA_COMMON_UCX_QUOTE(_x) \ _MCA_COMMON_UCX_QUOTE(_x) -#define MCA_COMMON_UCX_ERROR(...) \ - opal_output_verbose(0, opal_common_ucx.output, \ - __FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \ - " Error: " __VA_ARGS__) +#define MCA_COMMON_UCX_ERROR(...) \ + MCA_COMMON_UCX_VERBOSE(0, " Error: " __VA_ARGS__) + +#define MCA_COMMON_UCX_WARN(...) \ + MCA_COMMON_UCX_VERBOSE(0, " Warning: " __VA_ARGS__) #define MCA_COMMON_UCX_VERBOSE(_level, ... ) \ if (((_level) <= MCA_COMMON_UCX_MAX_VERBOSE) && \ @@ -96,6 +97,7 @@ extern opal_common_ucx_module_t opal_common_ucx; OPAL_DECLSPEC void opal_common_ucx_mca_register(void); OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); +OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void); OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status); OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker); OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component); diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 67bfc7ceab..d20bfd9583 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -138,6 +138,8 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) mca_spml_ucx_ctx_default.ucp_peers = NULL; + opal_common_ucx_mca_proc_added(); + return ret; }