From 182023febb6f8f31ce34dc54c8aa409ad7e44fa2 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Wed, 21 Aug 2019 12:01:55 +0300 Subject: [PATCH] SPML/UCX: fixed hang in SHMEM_FINALIZE - used MPI_Barrier to synchronize processes Signed-off-by: Sergey Oblomov --- opal/mca/common/ucx/common_ucx.c | 7 +++++-- oshmem/mca/spml/ucx/spml_ucx.c | 12 ++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 1cba713bb9..2482c01fee 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -187,8 +187,11 @@ static void opal_common_ucx_wait_all_requests(void **reqs, int count, ucp_worker } } -OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, size_t count, - size_t my_rank, size_t max_disconnect, ucp_worker_h worker) { +OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, + size_t count, size_t my_rank, + size_t max_disconnect, + ucp_worker_h worker) +{ size_t num_reqs; size_t max_reqs; void *dreq, **dreqs; diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 47e0ae5482..33893178ed 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -124,18 +124,16 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn = NULL; } - ret = opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(), - mca_spml_ucx.num_disconnect, - mca_spml_ucx_ctx_default.ucp_worker); - + ret = opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(), + mca_spml_ucx.num_disconnect, + mca_spml_ucx_ctx_default.ucp_worker); + /* Do not barrier here - barrier is called in _shmem_finalize */ free(del_procs); free(mca_spml_ucx.remote_addrs_tbl); free(mca_spml_ucx_ctx_default.ucp_peers); mca_spml_ucx_ctx_default.ucp_peers = NULL; - opal_common_ucx_mca_proc_added(); - return ret; } @@ -323,6 
+321,8 @@ int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs) free(wk_roffs); SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***"); + + opal_common_ucx_mca_proc_added(); return OSHMEM_SUCCESS; error2: