1
1

OSC/UCX: Fix deadlock with atomic lock

Atomic lock must progress local worker while obtaining the remote lock,
otherwise an active message which actually releases the lock might not
be processed while polling on local memory location.

Signed-off-by: Yossi Itigin <yosefe@mellanox.com>
Этот коммит содержится в:
Yossi Itigin 2019-05-19 19:08:11 +03:00
родитель 61adcd9fc2
Коммит 9d1994b906
3 изменённых файлов: 14 добавлений и 6 удалений

Просмотреть файл

@ -272,6 +272,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t
ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[j]), NULL, 0);
}
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
usleep(100);
} while (1);
}

Просмотреть файл

@ -240,7 +240,7 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET;
int ret = OMPI_SUCCESS;
while (result_value != TARGET_LOCK_UNLOCKED) {
for (;;) {
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
target, &result_value, sizeof(result_value),
@ -249,9 +249,12 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_cmpswp failed: %d", ret);
return OMPI_ERROR;
}
}
if (result_value == TARGET_LOCK_UNLOCKED) {
return OMPI_SUCCESS;
}
return ret;
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
}
}
static inline int end_atomicity(ompi_osc_ucx_module_t *module, int target) {

Просмотреть файл

@ -42,6 +42,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
} else {
break;
}
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
}
return ret;
@ -58,7 +59,7 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET;
int ret = OMPI_SUCCESS;
while (result_value != TARGET_LOCK_UNLOCKED) {
for (;;) {
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
target, &result_value, sizeof(result_value),
@ -66,9 +67,12 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
if (OMPI_SUCCESS != ret) {
return ret;
}
}
if (result_value == TARGET_LOCK_UNLOCKED) {
return OMPI_SUCCESS;
}
return ret;
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
}
}
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {