OSC/UCX: Fix deadlock with atomic lock
The atomic lock must progress the local worker while obtaining the remote lock; otherwise, an active message that actually releases the lock might not be processed while polling on a local memory location.

Signed-off-by: Yossi Itigin <yosefe@mellanox.com>
Этот коммит содержится в:
родитель
61adcd9fc2
Коммит
9d1994b906
@@ -272,6 +272,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t
|
||||
ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[j]), NULL, 0);
|
||||
}
|
||||
|
||||
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
|
||||
usleep(100);
|
||||
} while (1);
|
||||
}
|
||||
|
@@ -240,7 +240,7 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
|
||||
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
while (result_value != TARGET_LOCK_UNLOCKED) {
|
||||
for (;;) {
|
||||
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
|
||||
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
|
||||
target, &result_value, sizeof(result_value),
|
||||
@@ -249,9 +249,12 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_cmpswp failed: %d", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
if (result_value == TARGET_LOCK_UNLOCKED) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
return ret;
|
||||
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
|
||||
}
|
||||
}
|
||||
|
||||
static inline int end_atomicity(ompi_osc_ucx_module_t *module, int target) {
|
||||
|
@@ -42,6 +42,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
|
||||
}
|
||||
|
||||
return ret;
|
||||
@@ -58,7 +59,7 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
|
||||
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
while (result_value != TARGET_LOCK_UNLOCKED) {
|
||||
for (;;) {
|
||||
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
|
||||
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
|
||||
target, &result_value, sizeof(result_value),
|
||||
@@ -66,9 +67,12 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
if (result_value == TARGET_LOCK_UNLOCKED) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
return ret;
|
||||
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
|
||||
}
|
||||
}
|
||||
|
||||
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user