From 66e18563bf4867763b63e520fd355b9ed6a6d0b6 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Wed, 21 Aug 2019 12:01:55 +0300 Subject: [PATCH 1/2] SPML/UCX: fixed hang in SHMEM_FINALIZE - used MPI_Barrier to synchronize processes Signed-off-by: Sergey Oblomov (cherry picked from commit 182023febb6f8f31ce34dc54c8aa409ad7e44fa2) --- opal/mca/common/ucx/common_ucx.c | 7 +++++-- oshmem/mca/spml/ucx/spml_ucx.c | 12 ++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index a3a12a8fa8..bf5d6c0494 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -186,8 +186,11 @@ static void opal_common_ucx_wait_all_requests(void **reqs, int count, ucp_worker } } -OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, size_t count, - size_t my_rank, size_t max_disconnect, ucp_worker_h worker) { +OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, + size_t count, size_t my_rank, + size_t max_disconnect, + ucp_worker_h worker) +{ size_t num_reqs; size_t max_reqs; void *dreq, **dreqs; diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 4a0dd121d8..44ad1b4f09 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -127,18 +127,16 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn = NULL; } - ret = opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(), - mca_spml_ucx.num_disconnect, - mca_spml_ucx_ctx_default.ucp_worker); - + ret = opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(), + mca_spml_ucx.num_disconnect, + mca_spml_ucx_ctx_default.ucp_worker); + /* Do not barrier here - barrier is called in _shmem_finalize */ free(del_procs); free(mca_spml_ucx.remote_addrs_tbl); free(mca_spml_ucx_ctx_default.ucp_peers); 
mca_spml_ucx_ctx_default.ucp_peers = NULL; - opal_common_ucx_mca_proc_added(); - return ret; } @@ -326,6 +324,8 @@ int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs) free(wk_roffs); SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***"); + + opal_common_ucx_mca_proc_added(); return OSHMEM_SUCCESS; error2: From 1f9fce8955c3ba904daace618661e936c34f010b Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Wed, 21 Aug 2019 12:08:09 +0300 Subject: [PATCH 2/2] SPML/UCX: fixed comment Signed-off-by: Sergey Oblomov (cherry picked from commit 01dacaa6a42b35c1b7538d8ff0036bded913c847) --- oshmem/mca/spml/ucx/spml_ucx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 44ad1b4f09..36d3467bf5 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -130,7 +130,7 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) ret = opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(), mca_spml_ucx.num_disconnect, mca_spml_ucx_ctx_default.ucp_worker); - /* Do not barrier here - barrier is called in _shmem_finalize */ + /* No need to barrier here - barrier is called in _shmem_finalize */ free(del_procs); free(mca_spml_ucx.remote_addrs_tbl); free(mca_spml_ucx_ctx_default.ucp_peers);