From f56f37d3641ced41da64bfd7b22f808cf3a092b9 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 12 Mar 2014 16:49:58 +0000 Subject: [PATCH] Shifting to an event-driven RTE raises some interesting issues during shutdown. We want the last messages to get thru, but also need to correctly shutdown the virtual machine. This requires a delicate balancing act across event priorities, and the need to check for termination conditions in places where related events get processed. Change the priority of comm_failure and job_termination events to ensure we process final messages prior to terminating. Check for termination conditions when processing proc termination events as we may order proc termination when the daemon gets an exit command, but we can't see the proc actually terminate until we get out of that message event. Jeff: probably easiest to review this by testing. I tested it under both Slurm and rsh on v1.7.5 as well as trunk cmr=v1.7.5:reviewer=jsquyres:subject=resolve event priorities during VM shutdown This commit was SVN r31042. --- .../errmgr/default_hnp/errmgr_default_hnp.c | 5 +++ .../default_orted/errmgr_default_orted.c | 7 +++- orte/mca/ess/hnp/ess_hnp_module.c | 2 +- orte/mca/state/hnp/state_hnp.c | 2 +- orte/mca/state/orted/state_orted.c | 42 ++++++++++++++++++- orte/orted/orted_comm.c | 13 ++++++ orte/test/system/orte_exit.c | 8 +++- orte/test/system/orte_spin.c | 12 +++++- 8 files changed, 83 insertions(+), 8 deletions(-) diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index 4b5429222b..c6a6efbda2 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -109,6 +109,11 @@ static int init(void) /* setup state machine to trap job errors */ orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI); + /* set the lost connection state to run at MSG priority so + * we can process any last messages from the proc + */ + orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI); + /* setup state machine to trap proc errors */ orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c index f32311947c..3fb3997d51 100644 --- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c +++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c @@ -112,6 +112,11 @@ static int init(void) /* setup state machine to trap job errors */ orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI); + /* set the lost connection state to run at MSG priority so + * we can process any last messages from the proc + */ + orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI); + /* setup state machine to trap proc errors */ orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); @@ -290,7 +295,7 @@ static void proc_errors(int fd, short args, void *cbdata) OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - ORTE_FORCED_TERMINATE(0); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted not exiting, num_routes() == %d", diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index e9ea36fc31..280c7c2dd3 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -921,7 +921,7 @@ static void clean_abort(int fd, short flags, void *arg) (which is a Bad Thing), so we can't call it directly. Instead, we have to exit this handler and setup to call job_completed() after this. */ - ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + orte_plm.terminate_orteds();; } static struct timeval current, last={0,0}; diff --git a/orte/mca/state/hnp/state_hnp.c b/orte/mca/state/hnp/state_hnp.c index b93ac09d68..135c474623 100644 --- a/orte/mca/state/hnp/state_hnp.c +++ b/orte/mca/state/hnp/state_hnp.c @@ -152,7 +152,7 @@ static int init(void) } /* add the termination response */ if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED, - orte_quit, ORTE_MSG_PRI))) { + orte_quit, ORTE_SYS_PRI))) { ORTE_ERROR_LOG(rc); } /* add a default error response */ diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c index ff33a5558e..0a104a4877 100644 --- a/orte/mca/state/orted/state_orted.c +++ b/orte/mca/state/orted/state_orted.c @@ -23,6 +23,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/iof/iof.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/routed/routed.h" #include "orte/util/session_dir.h" #include "orte/runtime/orte_quit.h" @@ -108,7 +109,7 @@ static int init(void) } /* add a state for when we are ordered to terminate */ if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED, - orte_quit, ORTE_ERROR_PRI))) { + orte_quit, ORTE_SYS_PRI))) { ORTE_ERROR_LOG(rc); } if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) { @@ -326,8 +327,28 @@ static void track_procs(int fd, short argc, void *cbdata) orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } + /* if we are trying to terminate and our routes are + * gone, then terminate ourselves IF no local procs + * remain (might be some from another job) + */ + if (orte_orteds_term_ordered && + 0 == orte_routed.num_routes()) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + pptr->alive) { + /* at least one is still alive */ + goto moveon; + } + } + /* call our appropriate exit procedure */ + OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, + "%s orted:state: all routes and children gone - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } } } + moveon: /* Release the stdin IOF file descriptor for this child, if one * was defined. File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes @@ -377,6 +398,25 @@ static void track_procs(int fd, short argc, void *cbdata) orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } + /* if we are trying to terminate and our routes are + * gone, then terminate ourselves IF no local procs + * remain (might be some from another job) + */ + if (orte_orteds_term_ordered && + 0 == orte_routed.num_routes()) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + pptr->alive) { + /* at least one is still alive */ + goto moveon; + } + } + /* call our appropriate exit procedure */ + OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, + "%s orted:state: all routes and children gone - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } } } } diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index e662c7657f..48148b4e89 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -14,6 +14,7 @@ * reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2014 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -85,6 +86,11 @@ static char *get_orted_comm_cmd_str(int command); static opal_pointer_array_t *procs_prev_ordered_to_terminate = NULL; +static void suicide(int sd, short args, void *cbdata) +{ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); +} + void orte_daemon_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void* cbdata) @@ -434,6 +440,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, } /* kill the local procs */ orte_odls.kill_local_procs(NULL); + /* flag that orteds were ordered to terminate */ + orte_orteds_term_ordered = true; /* if all my routes and local children are gone, then terminate ourselves */ if (0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { @@ -449,6 +457,11 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } else { + /* set a timer to suicide, just in case one of our + * dependent routes fails to terminate + */ + ORTE_TIMER_EVENT(10, 0, suicide, ORTE_ERROR_PRI); } return; break; diff --git a/orte/test/system/orte_exit.c b/orte/test/system/orte_exit.c index e175618d14..d8ace53897 100644 --- a/orte/test/system/orte_exit.c +++ b/orte/test/system/orte_exit.c @@ -38,8 +38,12 @@ int main(int argc, char* argv[]) i++; pi = i / 3.14159256; if (i == 9995) { - orte_finalize(); - exit(ORTE_PROC_MY_NAME->vpid); + if (ORTE_PROC_MY_NAME->vpid == 1) { + orte_finalize(); + exit(77); + } else { + i=0; + } } } diff --git a/orte/test/system/orte_spin.c b/orte/test/system/orte_spin.c index ae85304294..703171442d 100644 --- a/orte/test/system/orte_spin.c +++ b/orte/test/system/orte_spin.c @@ -17,14 +17,22 @@ int main(int argc, char* argv[]) { + int i; + float pi; + if (ORTE_SUCCESS != orte_init(&argc, &argv, ORTE_PROC_NON_MPI)) { fprintf(stderr, "ORTE_INIT FAILED\n"); exit(1); } opal_output(0, "%s RUNNING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - while (orte_event_base_active) { - opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + i = 0; + while (1) { + i++; + pi = i / 3.14159256; + if (i == 9995) { + i=0; + } } orte_finalize();