From dc6809495d231b81471babe15ed1c4b9b74b5821 Mon Sep 17 00:00:00 2001
From: Yossi Itigin
Date: Wed, 10 Oct 2018 14:30:44 +0300
Subject: [PATCH] osc_ucx: fix hang/timeout in component finalize

Add barrier to make sure all endpoints are destroyed before destroying the worker.

Signed-off-by: Yossi Itigin
---
 ompi/mca/osc/ucx/osc_ucx_component.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c
index 4d0d28a6d2..ea01313751 100644
--- a/ompi/mca/osc/ucx/osc_ucx_component.c
+++ b/ompi/mca/osc/ucx/osc_ucx_component.c
@@ -127,6 +127,14 @@ static int component_init(bool enable_progress_threads, bool enable_mpi_threads)
     return OMPI_SUCCESS;
 }
 
+static void component_world_barrier(void)
+{
+    ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
+    opal_progress_register(progress_callback);
+    comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
+    opal_progress_unregister(progress_callback);
+}
+
 static int component_finalize(void) {
     int i;
     for (i = 0; i < ompi_proc_world_size(); i++) {
@@ -136,7 +144,9 @@ static int component_finalize(void) {
         }
     }
 
+    assert(mca_osc_ucx_component.num_modules == 0);
     if (mca_osc_ucx_component.ucp_worker != NULL) {
+        component_world_barrier();
         ucp_worker_destroy(mca_osc_ucx_component.ucp_worker);
     }
 