From cafd55f18ca030f03ece2b66e9ff75f6627ac781 Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Tue, 6 Oct 2015 22:07:23 +0300 Subject: [PATCH] HCOLL: fix hang in hcoll barrier called from finalize for MXM/yalla tear down HCOLL barrier may not complete if HCOLL progress is not called periodically, which is the case during HCOLL teardown progress in finalize. (cherry picked from commit 793244d75dd94d1d5e0243bcccf6d04318750f3f) --- ompi/mca/coll/hcoll/coll_hcoll_ops.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ompi/mca/coll/hcoll/coll_hcoll_ops.c b/ompi/mca/coll/hcoll/coll_hcoll_ops.c index 6d6756b1e7..8c9a43a5c5 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_ops.c +++ b/ompi/mca/coll/hcoll/coll_hcoll_ops.c @@ -18,14 +18,21 @@ int mca_coll_hcoll_barrier(struct ompi_communicator_t *comm, mca_coll_base_module_t *module){ int rc; - HCOL_VERBOSE(20,"RUNNING HCOL BARRIER"); mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; + HCOL_VERBOSE(20,"RUNNING HCOL BARRIER"); + + if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) { + HCOL_VERBOSE(5, "In finalize, reverting to previous barrier"); + goto orig_barrier; + } rc = hcoll_collectives.coll_barrier(hcoll_module->hcoll_context); if (HCOLL_SUCCESS != rc){ HCOL_VERBOSE(20,"RUNNING FALLBACK BARRIER"); rc = hcoll_module->previous_barrier(comm,hcoll_module->previous_barrier_module); } return rc; +orig_barrier: + return hcoll_module->previous_barrier(comm,hcoll_module->previous_barrier_module); } int mca_coll_hcoll_bcast(void *buff, int count,