diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c index b3da90a9c2..d16328bb08 100644 --- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c +++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c @@ -149,12 +149,13 @@ static int update_state(orte_jobid_t job, } OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:default_orted:update_state() %s) " - "------- %s state updated for process %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ((NULL == proc) ? "App. Process" : - (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), - (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); + "errmgr:default_orted:update_state() %s) " + "------- %s state updated for process %s to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ((NULL == proc) ? "App. Process" : + (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), + (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || @@ -276,6 +277,20 @@ static int update_state(orte_jobid_t job, */ orte_quit(); } + /* was it a daemon that failed? (i.e. does its jobid match our own) NOTE(review): proc is dereferenced here with no NULL check visible in this hunk, while the logging earlier in this function allows NULL == proc - confirm an earlier guard exists 
*/ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { + /* terminate ourselves only when no routes remain AND no local children remain; otherwise stay up and report both counts */ + if (0 == orte_routed.num_routes() && + 0 == opal_list_get_size(&orte_local_children)) { + orte_quit(); + } else { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:orted not exiting, num_routes() == %d, num children == %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (int)orte_routed.num_routes(), + (int)opal_list_get_size(&orte_local_children))); + } + } /* if not, then indicate we can continue */ return ORTE_SUCCESS; } diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index 366a089459..1aa847abf4 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -32,7 +32,6 @@ #include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" -#include "orte/runtime/orte_quit.h" #include "orte/runtime/runtime.h" #include "orte/runtime/data_type_support/orte_dt_support.h" @@ -841,22 +840,15 @@ static int route_lost(const orte_process_name_t *route) item = opal_list_get_next(item)) { child = (orte_routed_tree_t*)item; if (child->vpid == route->vpid) { - OPAL_OUTPUT_VERBOSE((4, orte_routed_base_output, - "%s routed_binomial: removing route to child daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + OPAL_OUTPUT_VERBOSE((4, orte_routed_base_output, + "%s routed_binomial: removing route to child daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(route))); opal_list_remove_item(&my_children, item); OBJ_RELEASE(item); return ORTE_SUCCESS; } } - /* if we are the HNP or daemon, AND we are terminating, - * then we want to finalize if all our child daemons - * have left - */ - if (orte_terminating && 0 == opal_list_get_size(&my_children)) { - orte_quit(); - } } /* we don't care about this one, so return success */ diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index 82a4fa1d62..92bde82709 
100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -743,8 +743,6 @@ int orte_daemon_process_commands(orte_process_name_t* sender, opal_output(0, "%s orted_cmd: received exit cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } - /* flag that we are terminating */ - orte_terminating = true; /* kill the local procs */ orte_odls.kill_local_procs(NULL); /* trigger our appropriate exit procedure diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index 5a607e945d..ee9ad47705 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -51,7 +51,6 @@ */ bool orte_initialized = false; bool orte_finalizing = false; -bool orte_terminating = false; bool orte_debug_flag = false; int orte_debug_verbosity; char *orte_prohibited_session_dirs = NULL; diff --git a/orte/runtime/orte_quit.c b/orte/runtime/orte_quit.c index c0fa3dff18..5e3a711f1a 100644 --- a/orte/runtime/orte_quit.c +++ b/orte/runtime/orte_quit.c @@ -143,9 +143,6 @@ void orte_quit(void) return; } - /* flag that we are finalizing */ - orte_finalizing = true; - /* whack any lingering session directory files from our jobs */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); diff --git a/orte/runtime/runtime.h b/orte/runtime/runtime.h index 814fe2b3c0..60578d71d0 100644 --- a/orte/runtime/runtime.h +++ b/orte/runtime/runtime.h @@ -43,7 +43,6 @@ ORTE_DECLSPEC extern const char orte_version_string[]; */ ORTE_DECLSPEC extern bool orte_initialized; ORTE_DECLSPEC extern bool orte_finalizing; -ORTE_DECLSPEC extern bool orte_terminating; ORTE_DECLSPEC extern int orte_debug_output; ORTE_DECLSPEC extern bool orte_debug_flag;