SPML/UCX: fixed hang in SHMEM_FINALIZE
- used MPI_Barrier to synchronize processes

Signed-off-by: Sergey Oblomov <sergeyo@mellanox.com>
Этот коммит содержится в:
родитель
69bd9453e9
Коммит
182023febb
@ -187,8 +187,11 @@ static void opal_common_ucx_wait_all_requests(void **reqs, int count, ucp_worker
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, size_t count,
|
OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs,
|
||||||
size_t my_rank, size_t max_disconnect, ucp_worker_h worker) {
|
size_t count, size_t my_rank,
|
||||||
|
size_t max_disconnect,
|
||||||
|
ucp_worker_h worker)
|
||||||
|
{
|
||||||
size_t num_reqs;
|
size_t num_reqs;
|
||||||
size_t max_reqs;
|
size_t max_reqs;
|
||||||
void *dreq, **dreqs;
|
void *dreq, **dreqs;
|
||||||
|
@ -124,18 +124,16 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
|
|||||||
mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn = NULL;
|
mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(),
|
ret = opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(),
|
||||||
mca_spml_ucx.num_disconnect,
|
mca_spml_ucx.num_disconnect,
|
||||||
mca_spml_ucx_ctx_default.ucp_worker);
|
mca_spml_ucx_ctx_default.ucp_worker);
|
||||||
|
/* Do not barrier here - barrier is called in _shmem_finalize */
|
||||||
free(del_procs);
|
free(del_procs);
|
||||||
free(mca_spml_ucx.remote_addrs_tbl);
|
free(mca_spml_ucx.remote_addrs_tbl);
|
||||||
free(mca_spml_ucx_ctx_default.ucp_peers);
|
free(mca_spml_ucx_ctx_default.ucp_peers);
|
||||||
|
|
||||||
mca_spml_ucx_ctx_default.ucp_peers = NULL;
|
mca_spml_ucx_ctx_default.ucp_peers = NULL;
|
||||||
|
|
||||||
opal_common_ucx_mca_proc_added();
|
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -323,6 +321,8 @@ int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs)
|
|||||||
free(wk_roffs);
|
free(wk_roffs);
|
||||||
|
|
||||||
SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***");
|
SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***");
|
||||||
|
|
||||||
|
opal_common_ucx_mca_proc_added();
|
||||||
return OSHMEM_SUCCESS;
|
return OSHMEM_SUCCESS;
|
||||||
|
|
||||||
error2:
|
error2:
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user