SPML/UCX: fixed hang in SHMEM_FINALIZE
- used MPI_Barrier to synchronize processes
Signed-off-by: Sergey Oblomov <sergeyo@mellanox.com>
Этот коммит содержится в:
родитель
69bd9453e9
Коммит
182023febb
@@ -187,8 +187,11 @@ static void opal_common_ucx_wait_all_requests(void **reqs, int count, ucp_worker
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, size_t count,
|
||||
size_t my_rank, size_t max_disconnect, ucp_worker_h worker) {
|
||||
OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs,
|
||||
size_t count, size_t my_rank,
|
||||
size_t max_disconnect,
|
||||
ucp_worker_h worker)
|
||||
{
|
||||
size_t num_reqs;
|
||||
size_t max_reqs;
|
||||
void *dreq, **dreqs;
|
||||
|
@@ -124,18 +124,16 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
|
||||
mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn = NULL;
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(),
|
||||
mca_spml_ucx.num_disconnect,
|
||||
mca_spml_ucx_ctx_default.ucp_worker);
|
||||
|
||||
ret = opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(),
|
||||
mca_spml_ucx.num_disconnect,
|
||||
mca_spml_ucx_ctx_default.ucp_worker);
|
||||
/* Do not barrier here - barrier is called in _shmem_finalize */
|
||||
free(del_procs);
|
||||
free(mca_spml_ucx.remote_addrs_tbl);
|
||||
free(mca_spml_ucx_ctx_default.ucp_peers);
|
||||
|
||||
mca_spml_ucx_ctx_default.ucp_peers = NULL;
|
||||
|
||||
opal_common_ucx_mca_proc_added();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -323,6 +321,8 @@ int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs)
|
||||
free(wk_roffs);
|
||||
|
||||
SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***");
|
||||
|
||||
opal_common_ucx_mca_proc_added();
|
||||
return OSHMEM_SUCCESS;
|
||||
|
||||
error2:
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user