diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index 4b5429222b..c6a6efbda2 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -109,6 +109,11 @@ static int init(void) /* setup state machine to trap job errors */ orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI); + /* set the lost connection state to run at MSG priority so + * we can process any last messages from the proc + */ + orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI); + /* setup state machine to trap proc errors */ orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c index f32311947c..3fb3997d51 100644 --- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c +++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c @@ -112,6 +112,11 @@ static int init(void) /* setup state machine to trap job errors */ orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI); + /* set the lost connection state to run at MSG priority so + * we can process any last messages from the proc + */ + orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI); + /* setup state machine to trap proc errors */ orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); @@ -290,7 +295,7 @@ static void proc_errors(int fd, short args, void *cbdata) OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - ORTE_FORCED_TERMINATE(0); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted not exiting, num_routes() == %d", diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index e9ea36fc31..280c7c2dd3 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -921,7 +921,7 @@ static void clean_abort(int fd, short flags, void *arg) (which is a Bad Thing), so we can't call it directly. Instead, we have to exit this handler and setup to call job_completed() after this. */ - ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + orte_plm.terminate_orteds();; } static struct timeval current, last={0,0}; diff --git a/orte/mca/state/hnp/state_hnp.c b/orte/mca/state/hnp/state_hnp.c index b93ac09d68..135c474623 100644 --- a/orte/mca/state/hnp/state_hnp.c +++ b/orte/mca/state/hnp/state_hnp.c @@ -152,7 +152,7 @@ static int init(void) } /* add the termination response */ if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED, - orte_quit, ORTE_MSG_PRI))) { + orte_quit, ORTE_SYS_PRI))) { ORTE_ERROR_LOG(rc); } /* add a default error response */ diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c index ff33a5558e..0a104a4877 100644 --- a/orte/mca/state/orted/state_orted.c +++ b/orte/mca/state/orted/state_orted.c @@ -23,6 +23,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/iof/iof.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/routed/routed.h" #include "orte/util/session_dir.h" #include "orte/runtime/orte_quit.h" @@ -108,7 +109,7 @@ static int init(void) } /* add a state for when we are ordered to terminate */ if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED, - orte_quit, ORTE_ERROR_PRI))) { + orte_quit, ORTE_SYS_PRI))) { ORTE_ERROR_LOG(rc); } if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) { @@ -326,8 +327,28 @@ static void track_procs(int fd, short argc, void *cbdata) orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } + /* if we are trying to terminate and our routes are + * gone, then terminate ourselves IF no local procs + * remain (might be some from another job) + */ + if (orte_orteds_term_ordered && + 0 == orte_routed.num_routes()) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + pptr->alive) { + /* at least one is still alive */ + goto moveon; + } + } + /* call our appropriate exit procedure */ + OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, + "%s orted:state: all routes and children gone - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } } } + moveon: /* Release the stdin IOF file descriptor for this child, if one * was defined. File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes @@ -377,6 +398,25 @@ static void track_procs(int fd, short argc, void *cbdata) orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } + /* if we are trying to terminate and our routes are + * gone, then terminate ourselves IF no local procs + * remain (might be some from another job) + */ + if (orte_orteds_term_ordered && + 0 == orte_routed.num_routes()) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + pptr->alive) { + /* at least one is still alive */ + goto moveon; + } + } + /* call our appropriate exit procedure */ + OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, + "%s orted:state: all routes and children gone - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } } } } diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index e662c7657f..48148b4e89 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -14,6 +14,7 @@ * reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2014 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -85,6 +86,11 @@ static char *get_orted_comm_cmd_str(int command); static opal_pointer_array_t *procs_prev_ordered_to_terminate = NULL; +static void suicide(int sd, short args, void *cbdata) +{ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); +} + void orte_daemon_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void* cbdata) @@ -434,6 +440,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, } /* kill the local procs */ orte_odls.kill_local_procs(NULL); + /* flag that orteds were ordered to terminate */ + orte_orteds_term_ordered = true; /* if all my routes and local children are gone, then terminate ourselves */ if (0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { @@ -449,6 +457,11 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } else { + /* set a timer to suicide, just in case one of our + * dependent routes fails to terminate + */ + ORTE_TIMER_EVENT(10, 0, suicide, ORTE_ERROR_PRI); } return; break; diff --git a/orte/test/system/orte_exit.c b/orte/test/system/orte_exit.c index e175618d14..d8ace53897 100644 --- a/orte/test/system/orte_exit.c +++ b/orte/test/system/orte_exit.c @@ -38,8 +38,12 @@ int main(int argc, char* argv[]) i++; pi = i / 3.14159256; if (i == 9995) { - orte_finalize(); - exit(ORTE_PROC_MY_NAME->vpid); + if (ORTE_PROC_MY_NAME->vpid == 1) { + orte_finalize(); + exit(77); + } else { + i=0; + } } } diff --git a/orte/test/system/orte_spin.c b/orte/test/system/orte_spin.c index ae85304294..703171442d 100644 --- a/orte/test/system/orte_spin.c +++ b/orte/test/system/orte_spin.c @@ -17,14 +17,22 @@ int main(int argc, char* argv[]) { + int i; + float pi; + if (ORTE_SUCCESS != orte_init(&argc, &argv, ORTE_PROC_NON_MPI)) { fprintf(stderr, "ORTE_INIT FAILED\n"); exit(1); } opal_output(0, "%s RUNNING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - while (orte_event_base_active) { - opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + i = 0; + while (1) { + i++; + pi = i / 3.14159256; + if (i == 9995) { + i=0; + } } orte_finalize();