
SPML/UCX: fixed hang in SHMEM_FINALIZE

- used MPI_Barrier to synchronize processes

Signed-off-by: Sergey Oblomov <sergeyo@mellanox.com>
This commit is contained in:
Sergey Oblomov 2019-08-21 12:01:55 +03:00
parent 69bd9453e9
commit 182023febb
2 changed files with 11 additions and 8 deletions
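
Going by the commit message and the new comment in the second hunk, the hang was an ordering problem at finalize time: all PEs must be synchronized (via a barrier) before any of them starts disconnecting UCX endpoints, and the disconnect path itself no longer fences. The sketch below shows that ordering in isolation; it is not the actual OSHMEM finalize code, and every example_* helper is a made-up stand-in for the real runtime calls.

#include <stdio.h>

/* Stand-in for the barrier the commit message mentions ("used MPI_Barrier
 * to synchronize processes"); the real call is not shown in this diff. */
static void example_barrier(void)
{
    puts("barrier: every PE has reached finalize");
}

/* Stand-in for the disconnect path that ends up in
 * opal_common_ucx_del_procs_nofence(): no internal fence, because the
 * caller has already synchronized. */
static int example_del_procs(void)
{
    puts("del_procs_nofence: UCX endpoints disconnected");
    return 0;
}

/* The ordering the fix relies on: synchronize first, then disconnect. */
static int example_shmem_finalize(void)
{
    example_barrier();
    return example_del_procs();
}

int main(void)
{
    return example_shmem_finalize();
}

The "Do not barrier here" comment added in the second hunk records the other half of this contract: mca_spml_ucx_del_procs() itself no longer synchronizes, so the barrier has to happen earlier in the finalize path.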


@@ -187,8 +187,11 @@ static void opal_common_ucx_wait_all_requests(void **reqs, int count, ucp_worker
     }
 }
 
-OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, size_t count,
-                                                    size_t my_rank, size_t max_disconnect, ucp_worker_h worker) {
+OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs,
+                                                    size_t count, size_t my_rank,
+                                                    size_t max_disconnect,
+                                                    ucp_worker_h worker)
+{
     size_t num_reqs;
     size_t max_reqs;
     void *dreq, **dreqs;
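
The num_reqs, max_reqs and dreqs locals in this hunk, together with the max_disconnect argument, suggest that endpoint disconnects are issued as non-blocking requests and drained in bounded batches rather than all at once. Below is a generic sketch of that pattern under that assumption; it is not the opal_common_ucx_del_procs_nofence() implementation, and all names in it are invented for illustration.

#include <stdio.h>
#include <stdlib.h>

typedef void *example_req_t;    /* stands in for a UCX non-blocking request */

/* Pretend to start a non-blocking endpoint disconnect for one peer. */
static example_req_t example_disconnect_nb(size_t peer)
{
    printf("disconnect started for peer %zu\n", peer);
    return NULL;                /* NULL == completed immediately in this toy */
}

/* Pretend to progress the worker until all queued requests complete. */
static void example_wait_all(example_req_t *reqs, size_t n)
{
    (void)reqs;
    printf("drained %zu outstanding requests\n", n);
}

/* Disconnect every peer while keeping at most max_disconnect requests
 * in flight, so memory and progress overhead stay bounded at scale. */
static void example_del_procs(size_t nprocs, size_t max_disconnect)
{
    example_req_t *reqs = malloc(max_disconnect * sizeof(*reqs));
    size_t num_reqs = 0;

    for (size_t peer = 0; peer < nprocs; ++peer) {
        reqs[num_reqs++] = example_disconnect_nb(peer);
        if (num_reqs == max_disconnect) {       /* batch full: drain it */
            example_wait_all(reqs, num_reqs);
            num_reqs = 0;
        }
    }
    example_wait_all(reqs, num_reqs);           /* drain the remainder */
    free(reqs);
}

int main(void)
{
    example_del_procs(10, 4);                   /* 10 peers, at most 4 in flight */
    return 0;
}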


@@ -124,18 +124,16 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
         mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn = NULL;
     }
 
-    ret = opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(),
+    ret = opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(),
+                                            mca_spml_ucx.num_disconnect,
+                                            mca_spml_ucx_ctx_default.ucp_worker);
+    /* Do not barrier here - barrier is called in _shmem_finalize */
     free(del_procs);
     free(mca_spml_ucx.remote_addrs_tbl);
     free(mca_spml_ucx_ctx_default.ucp_peers);
     mca_spml_ucx_ctx_default.ucp_peers = NULL;
-    opal_common_ucx_mca_proc_added();
     return ret;
 }
@@ -323,6 +321,8 @@ int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs)
     free(wk_roffs);
 
     SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***");
+
+    opal_common_ucx_mca_proc_added();
     return OSHMEM_SUCCESS;
 
 error2: