OSC/UCX: Fix deadlock with atomic lock
Atomic lock must progress local worker while obtaining the remote lock, otherwise an active message which actually releases the lock might not be processed while polling on local memory location. (picked from master 9d1994b) Signed-off-by: Yossi Itigin <yosefe@mellanox.com>
Этот коммит содержится в:
родитель
c22326e59a
Коммит
4f9fb3e9ce
@ -276,6 +276,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t
|
||||
ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[j]), NULL, 0);
|
||||
}
|
||||
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
usleep(100);
|
||||
} while (1);
|
||||
}
|
||||
|
@ -281,7 +281,7 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, in
|
||||
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_ACC_LOCK_OFFSET;
|
||||
ucs_status_t status;
|
||||
|
||||
while (result_value != TARGET_LOCK_UNLOCKED) {
|
||||
for (;;) {
|
||||
status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
|
||||
&result_value, sizeof(result_value),
|
||||
remote_addr, rkey,
|
||||
@ -290,9 +290,13 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, in
|
||||
OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (result_value == TARGET_LOCK_UNLOCKED) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int end_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) {
|
||||
|
@ -44,6 +44,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -72,7 +73,7 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
|
||||
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
ucs_status_t status;
|
||||
|
||||
while (result_value != TARGET_LOCK_UNLOCKED) {
|
||||
for (;;) {
|
||||
status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
|
||||
&result_value, sizeof(result_value),
|
||||
remote_addr, rkey,
|
||||
@ -80,9 +81,12 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
|
||||
if (status != UCS_OK) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
if (result_value == TARGET_LOCK_UNLOCKED) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
}
|
||||
}
|
||||
|
||||
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user