From 9d1994b90697a41b1782af50581d70d3386f5485 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Sun, 19 May 2019 19:08:11 +0300 Subject: [PATCH] OSC/UCX: Fix deadlock with atomic lock Atomic lock must progress local worker while obtaining the remote lock, otherwise an active message which actually releases the lock might not be processed while polling on local memory location. Signed-off-by: Yossi Itigin --- ompi/mca/osc/ucx/osc_ucx_active_target.c | 1 + ompi/mca/osc/ucx/osc_ucx_comm.c | 9 ++++++--- ompi/mca/osc/ucx/osc_ucx_passive_target.c | 10 +++++++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_active_target.c b/ompi/mca/osc/ucx/osc_ucx_active_target.c index 209a5f0787..723d40b662 100644 --- a/ompi/mca/osc/ucx/osc_ucx_active_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_active_target.c @@ -272,6 +272,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[j]), NULL, 0); } + ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker); usleep(100); } while (1); } diff --git a/ompi/mca/osc/ucx/osc_ucx_comm.c b/ompi/mca/osc/ucx/osc_ucx_comm.c index 388493f8e5..4b58031f4d 100644 --- a/ompi/mca/osc/ucx/osc_ucx_comm.c +++ b/ompi/mca/osc/ucx/osc_ucx_comm.c @@ -240,7 +240,7 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) { uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET; int ret = OMPI_SUCCESS; - while (result_value != TARGET_LOCK_UNLOCKED) { + for (;;) { ret = opal_common_ucx_wpmem_cmpswp(module->state_mem, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, target, &result_value, sizeof(result_value), @@ -249,9 +249,12 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) { OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_cmpswp failed: %d", ret); return OMPI_ERROR; } - } + if (result_value == TARGET_LOCK_UNLOCKED) { + return OMPI_SUCCESS; + } - return ret; + ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker); + } } static inline int end_atomicity(ompi_osc_ucx_module_t *module, int target) { diff --git a/ompi/mca/osc/ucx/osc_ucx_passive_target.c b/ompi/mca/osc/ucx/osc_ucx_passive_target.c index 04d6cb6e07..90d617bd89 100644 --- a/ompi/mca/osc/ucx/osc_ucx_passive_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_passive_target.c @@ -42,6 +42,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { } else { break; } + ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker); } return ret; @@ -58,7 +59,7 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) { uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; int ret = OMPI_SUCCESS; - while (result_value != TARGET_LOCK_UNLOCKED) { + for (;;) { ret = opal_common_ucx_wpmem_cmpswp(module->state_mem, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, target, &result_value, sizeof(result_value), @@ -66,9 +67,12 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) { if (OMPI_SUCCESS != ret) { return ret; } - } + if (result_value == TARGET_LOCK_UNLOCKED) { + return OMPI_SUCCESS; + } - return ret; + ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker); + } } static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {