1
1

OSC/UCX: Fix deadlock with atomic lock

The atomic lock must progress the local worker while acquiring the remote
lock; otherwise, an active message that actually releases the lock might not
be processed while polling on a local memory location.

(picked from master 9d1994b)

Signed-off-by: Yossi Itigin <yosefe@mellanox.com>
Этот коммит содержится в:
Yossi Itigin 2019-05-19 19:08:11 +03:00
родитель c22326e59a
Коммит 4f9fb3e9ce
3 изменённых файлов: 14 добавлений и 5 удалений

Просмотреть файл

@@ -276,6 +276,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t
ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[j]), NULL, 0);
}
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
usleep(100);
} while (1);
}

Просмотреть файл

@@ -281,7 +281,7 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, in
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_ACC_LOCK_OFFSET;
ucs_status_t status;
while (result_value != TARGET_LOCK_UNLOCKED) {
for (;;) {
status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
&result_value, sizeof(result_value),
remote_addr, rkey,
@@ -290,9 +290,13 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, in
OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status);
return OMPI_ERROR;
}
if (result_value == TARGET_LOCK_UNLOCKED) {
return OMPI_SUCCESS;
}
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
}
return OMPI_SUCCESS;
}
static inline int end_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) {

Просмотреть файл

@@ -44,6 +44,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
} else {
break;
}
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
}
return OMPI_SUCCESS;
@@ -72,7 +73,7 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
ucs_status_t status;
while (result_value != TARGET_LOCK_UNLOCKED) {
for (;;) {
status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
&result_value, sizeof(result_value),
remote_addr, rkey,
@@ -80,9 +81,12 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
if (status != UCS_OK) {
return OMPI_ERROR;
}
}
if (result_value == TARGET_LOCK_UNLOCKED) {
return OMPI_SUCCESS;
}
return OMPI_SUCCESS;
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
}
}
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {