From abedb97be4a3aba53ff6da253f5ca837261f73cf Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 8 Aug 2014 15:58:49 +0000 Subject: [PATCH] Resolve race condition when procs call MPI_Abort. Since we go through the errmgr instead of the normal proc termination routines, we need to ensure we mark that the proc's waitpid has fired and that the proc is no longer alive. Otherwise, the local daemon won't terminate because it thinks there is still a local proc alive, and we hang. Thanks to Gilles for tracking it down. cmr=v1.8.2:reviewer=rhc This commit was SVN r32460. --- orte/mca/odls/base/odls_base_default_fns.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 05847efd43..7a0fdcb93c 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -1876,6 +1876,14 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata) ORTE_NAME_PRINT(&proc->name))); state = ORTE_PROC_STATE_CALLED_ABORT; free(abortfile); + /* since we are going down a different code path, we need to + * flag that this proc has had its waitpid fired */ + ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID); + /* if IOF_COMPLETE has already been recvd, then we need + * to mark this proc as no longer alive */ + if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_IOF_COMPLETE)) { + ORTE_FLAG_UNSET(proc, ORTE_PROC_FLAG_ALIVE); + } goto MOVEON; } free(abortfile);