From 5bebed45eb4223adb1c8db484c75202b7ddb67d7 Mon Sep 17 00:00:00 2001 From: Mike Dubman Date: Sun, 4 Oct 2015 09:39:37 +0300 Subject: [PATCH 1/2] OMPI: set "in finalize" indicator in finalize flow --- ompi/runtime/mpiruntime.h | 2 ++ ompi/runtime/ompi_mpi_finalize.c | 3 +-- ompi/runtime/ompi_mpi_init.c | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ompi/runtime/mpiruntime.h b/ompi/runtime/mpiruntime.h index ec02a36254..af050f66a3 100644 --- a/ompi/runtime/mpiruntime.h +++ b/ompi/runtime/mpiruntime.h @@ -55,6 +55,8 @@ OMPI_DECLSPEC extern bool ompi_mpi_initialized; OMPI_DECLSPEC extern bool ompi_mpi_finalized; /** Has the RTE been initialized? */ OMPI_DECLSPEC extern bool ompi_rte_initialized; +/** Did mpi start to finalize? */ +OMPI_DECLSPEC extern int32_t ompi_mpi_finalize_started; /** Do we have multiple threads? */ OMPI_DECLSPEC extern bool ompi_mpi_thread_multiple; diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index 51274992b2..7935ebeb60 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -93,7 +93,6 @@ extern bool ompi_enable_timing_ext; int ompi_mpi_finalize(void) { int ret; - static int32_t finalize_has_already_started = 0; opal_list_item_t *item; ompi_proc_t** procs; size_t nprocs; @@ -106,7 +105,7 @@ int ompi_mpi_finalize(void) ompi_comm_free() (or run into other nasty lions, tigers, or bears) */ - if (! opal_atomic_cmpset_32(&finalize_has_already_started, 0, 1)) { + if (! opal_atomic_cmpset_32(&ompi_mpi_finalize_started, 0, 1)) { /* Note that if we're already finalized, we cannot raise an MPI exception. The best that we can do is write something to stderr. 
*/ diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 9568568d28..44650ee985 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -128,6 +128,7 @@ bool ompi_mpi_init_started = false; bool ompi_mpi_initialized = false; bool ompi_mpi_finalized = false; bool ompi_rte_initialized = false; +int32_t ompi_mpi_finalize_started = 0; bool ompi_mpi_thread_multiple = false; int ompi_mpi_thread_requested = MPI_THREAD_SINGLE; From e8d7373b14909100cece81d21677cac4472acc27 Mon Sep 17 00:00:00 2001 From: Mike Dubman Date: Wed, 30 Sep 2015 12:23:23 +0300 Subject: [PATCH 2/2] COLL/FCA: revert to previous barrier if called from finalize The FCA barrier may not complete if FCA progress is not called periodically. The PMI/PMI2 API that can be used in the rte barrier has no provision for calling an external progress function. So it is possible that during finalize some ranks will be stuck in the FCA barrier while others are in the PMI barrier. --- ompi/mca/coll/fca/coll_fca_ops.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ompi/mca/coll/fca/coll_fca_ops.c b/ompi/mca/coll/fca/coll_fca_ops.c index 093bd46988..7d2711f15a 100644 --- a/ompi/mca/coll/fca/coll_fca_ops.c +++ b/ompi/mca/coll/fca/coll_fca_ops.c @@ -153,6 +153,10 @@ int mca_coll_fca_barrier(struct ompi_communicator_t *comm, int ret; FCA_VERBOSE(5,"Using FCA Barrier"); + if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) { FCA_VERBOSE(5, "In finalize, reverting to previous barrier"); goto orig_barrier; } ret = fca_do_barrier(fca_module->fca_comm); if (ret < 0) { if (ret == -EUSEMPI) {