From 4bd25f587c85b3f2e4d63cebec970f7258b5d88e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 11 May 2010 00:34:12 +0000 Subject: [PATCH] Begin handling the case of lost connections by having the OOB report it to the errmgr instead of the routed framework. Add an "app" component to the errmgr framework so that it can decide how to respond - which for now at least is just to check for lifeline and abort if so. Add a new error constant to indicate that the error is "unrecoverable" so the oob can know it needs to abort. This commit was SVN r23112. --- orte/include/orte/constants.h | 3 +- orte/mca/errmgr/app/.windows | 12 +++ orte/mca/errmgr/app/Makefile.am | 36 +++++++ orte/mca/errmgr/app/configure.params | 22 +++++ orte/mca/errmgr/app/errmgr_app.c | 108 +++++++++++++++++++++ orte/mca/errmgr/app/errmgr_app.h | 35 +++++++ orte/mca/errmgr/app/errmgr_app_component.c | 83 ++++++++++++++++ orte/mca/errmgr/base/errmgr_base_fns.c | 22 ++--- orte/mca/errmgr/base/errmgr_base_open.c | 6 -- orte/mca/errmgr/base/errmgr_private.h | 1 - orte/mca/errmgr/hnp/errmgr_hnp.c | 21 +++- orte/mca/errmgr/orted/errmgr_orted.c | 10 +- orte/mca/ess/base/ess_base_std_app.c | 13 +++ orte/mca/oob/tcp/oob_tcp_peer.c | 9 +- orte/tools/orterun/orterun.c | 3 +- orte/util/error_strings.c | 4 + 16 files changed, 355 insertions(+), 33 deletions(-) create mode 100644 orte/mca/errmgr/app/.windows create mode 100644 orte/mca/errmgr/app/Makefile.am create mode 100644 orte/mca/errmgr/app/configure.params create mode 100644 orte/mca/errmgr/app/errmgr_app.c create mode 100644 orte/mca/errmgr/app/errmgr_app.h create mode 100644 orte/mca/errmgr/app/errmgr_app_component.c diff --git a/orte/include/orte/constants.h b/orte/include/orte/constants.h index 1c593d8d2c..1f0fbd5b07 100644 --- a/orte/include/orte/constants.h +++ b/orte/include/orte/constants.h @@ -114,7 +114,8 @@ enum { ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31), ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32), 
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33), - ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34) + ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34), + ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35) }; #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100) diff --git a/orte/mca/errmgr/app/.windows b/orte/mca/errmgr/app/.windows new file mode 100644 index 0000000000..7a934e8f29 --- /dev/null +++ b/orte/mca/errmgr/app/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libopen-rte diff --git a/orte/mca/errmgr/app/Makefile.am b/orte/mca/errmgr/app/Makefile.am new file mode 100644 index 0000000000..f2857d29b0 --- /dev/null +++ b/orte/mca/errmgr/app/Makefile.am @@ -0,0 +1,36 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +sources = \ + errmgr_app.h \ + errmgr_app_component.c \ + errmgr_app.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). 
+ +if OMPI_BUILD_errmgr_app_DSO +component_noinst = +component_install = mca_errmgr_app.la +else +component_noinst = libmca_errmgr_app.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_errmgr_app_la_SOURCES = $(sources) +mca_errmgr_app_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_errmgr_app_la_SOURCES =$(sources) +libmca_errmgr_app_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/app/configure.params b/orte/mca/errmgr/app/configure.params new file mode 100644 index 0000000000..8fc44480a6 --- /dev/null +++ b/orte/mca/errmgr/app/configure.params @@ -0,0 +1,22 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007 Los Alamos National Security, LLC. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/errmgr/app/errmgr_app.c b/orte/mca/errmgr/app/errmgr_app.c new file mode 100644 index 0000000000..0dc43dd5fe --- /dev/null +++ b/orte/mca/errmgr/app/errmgr_app.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2009-2010 The Trustees of Indiana University. + * All rights reserved. + * + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/output.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/routed/routed.h" + +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" +#include "errmgr_app.h" + +/* + * Module functions: Global + */ +static int init(void); +static int finalize(void); + +static int update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + orte_exit_code_t exit_code, + orte_errmgr_stack_state_t *stack_state); + +/****************** + * APP module + ******************/ +orte_errmgr_base_module_t orte_errmgr_app_module = { + init, + finalize, + update_state, + NULL, + NULL, + NULL +}; + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + return ORTE_SUCCESS; +} + +static int update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + orte_exit_code_t exit_code, + orte_errmgr_stack_state_t *stack_state) +{ + /* indicate that this is the end of the line */ + *stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:app: job %s reported state %s" + " for proc %s state %s exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state), exit_code)); + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return ORTE_SUCCESS; + } + + if (ORTE_PROC_STATE_COMM_FAILED == state) { + /* delete the route */ + orte_routed.delete_route(proc); + /* see if this was a lifeline */ + if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { + return ORTE_ERR_UNRECOVERABLE; + } + } + return ORTE_SUCCESS; +} diff --git a/orte/mca/errmgr/app/errmgr_app.h b/orte/mca/errmgr/app/errmgr_app.h new file mode 100644 index 0000000000..1f3da658b7 --- /dev/null +++ b/orte/mca/errmgr/app/errmgr_app.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_ERRMGR_APP_EXPORT_H +#define MCA_ERRMGR_APP_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/errmgr/errmgr.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ + +ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_app_component; + +ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_app_module; + +END_C_DECLS + +#endif /* MCA_ERRMGR_APP_EXPORT_H */ diff --git a/orte/mca/errmgr/app/errmgr_app_component.c b/orte/mca/errmgr/app/errmgr_app_component.c new file mode 100644 index 0000000000..23a8fa1436 --- /dev/null +++ b/orte/mca/errmgr/app/errmgr_app_component.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "errmgr_app.h" + +/* + * Public string for version number + */ +const char *orte_errmgr_app_component_version_string = + "ORTE ERRMGR app MCA component version " ORTE_VERSION; + +/* + * Local functionality + */ +static int errmgr_app_open(void); +static int errmgr_app_close(void); +static int errmgr_app_component_query(mca_base_module_t **module, int *priority); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_errmgr_base_component_t mca_errmgr_app_component = +{ + /* Handle the general mca_component_t struct containing + * meta information about the component itself + */ + { + ORTE_ERRMGR_BASE_VERSION_3_0_0, + /* Component name and version */ + "app", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + errmgr_app_open, + errmgr_app_close, + errmgr_app_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } +}; + +static int errmgr_app_open(void) +{ + return ORTE_SUCCESS; +} + +static int errmgr_app_close(void) +{ + return ORTE_SUCCESS; +} + +static int errmgr_app_component_query(mca_base_module_t **module, int *priority) +{ + if (ORTE_PROC_IS_APP) { + /* keep our priority low so that other modules are higher + * and will run before us + */ + *priority = 10; + *module = (mca_base_module_t *)&orte_errmgr_app_module; + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index 89d0943a79..231d8d088b 100644 --- a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -68,19 +68,13 @@ int 
orte_errmgr_base_update_state(orte_jobid_t job, int i; orte_errmgr_stack_state_t stack_state; orte_errmgr_base_module_t *module; - - if( ORTE_PROC_IS_APP ) { - return ORTE_SUCCESS; - } - - if( !orte_errmgr_base.shutting_down ) { - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:base:update_state() %s) " - "------- %s state updated for process %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == name) ? "App. Process" : (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"), - (NULL == name) ? "NULL" : ORTE_NAME_PRINT(name))); - } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "errmgr:base:update_state() %s) " + "------- %s state updated for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == name) ? "App. Process" : (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"), + (NULL == name) ? "NULL" : ORTE_NAME_PRINT(name))); stack_state = ORTE_ERRMGR_STACK_STATE_NONE; stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT; @@ -101,7 +95,7 @@ int orte_errmgr_base_update_state(orte_jobid_t job, } } - return ORTE_SUCCESS; + return rc; } int orte_errmgr_base_abort(int error_code, char *fmt, ...) diff --git a/orte/mca/errmgr/base/errmgr_base_open.c b/orte/mca/errmgr/base/errmgr_base_open.c index a5d59ca027..c0321a0ab5 100644 --- a/orte/mca/errmgr/base/errmgr_base_open.c +++ b/orte/mca/errmgr/base/errmgr_base_open.c @@ -79,12 +79,6 @@ int orte_errmgr_base_open(void) orte_errmgr_base.output = opal_output_open(NULL); - /* - * A flag to indicate that orterun is shutting down, so skip the recovery - * logic. 
- */ - orte_errmgr_base.shutting_down = false; - /* * Open up all available components */ diff --git a/orte/mca/errmgr/base/errmgr_private.h b/orte/mca/errmgr/base/errmgr_private.h index 8b7d11465e..a71fbe2b36 100644 --- a/orte/mca/errmgr/base/errmgr_private.h +++ b/orte/mca/errmgr/base/errmgr_private.h @@ -42,7 +42,6 @@ BEGIN_C_DECLS /* define a struct to hold framework-global values */ typedef struct { int output; - bool shutting_down; opal_pointer_array_t modules; bool initialized; } orte_errmgr_base_t; diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index d06655ca45..e1af67e866 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -132,7 +132,7 @@ static int update_state(orte_jobid_t job, /* * if orte is trying to shutdown, just let it */ - if (orte_errmgr_base.shutting_down) { + if (orte_finalizing) { return ORTE_SUCCESS; } @@ -235,6 +235,18 @@ static int update_state(orte_jobid_t job, hnp_abort(jdata->jobid, exit_code); } break; + case ORTE_JOB_STATE_COMM_FAILED: + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; default: break; @@ -253,7 +265,6 @@ static int update_state(orte_jobid_t job, case ORTE_PROC_STATE_ABORTED: case ORTE_PROC_STATE_ABORTED_BY_SIG: case ORTE_PROC_STATE_TERM_WO_SYNC: - case ORTE_PROC_STATE_COMM_FAILED: if (jdata->enable_recovery) { /* is this a local proc */ if (NULL != (child = proc_is_local(proc))) { @@ -327,6 +338,12 @@ static int update_state(orte_jobid_t job, } break; + case ORTE_PROC_STATE_COMM_FAILED: + /* is this to a daemon? 
*/ + /* relocate its processes */ + /* attempt to restart? */ + break; + default: break; } diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index 0a9059aad3..e972cfc18d 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -78,7 +78,7 @@ static int ft_event(int state); /****************** - * ORCM module + * ORTED module ******************/ orte_errmgr_base_module_t orte_errmgr_orted_module = { init, @@ -123,7 +123,13 @@ static int update_state(orte_jobid_t job, /* * if orte is trying to shutdown, just let it */ - if (orte_errmgr_base.shutting_down) { + if (orte_finalizing) { + return ORTE_SUCCESS; + } + + /* if this is a heartbeat failure, let the HNP handle it */ + if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || + ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { return ORTE_SUCCESS; } diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 3ff23e3234..fe8659be26 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -44,6 +44,7 @@ #include "orte/mca/odls/odls_types.h" #include "orte/mca/plm/plm.h" #include "orte/mca/filem/base/base.h" +#include "orte/mca/errmgr/base/base.h" #if OPAL_ENABLE_FT_CR == 1 #include "orte/mca/snapc/base/base.h" #endif @@ -65,6 +66,18 @@ int orte_ess_base_app_setup(void) int ret; char *error = NULL; + /* setup the errmgr */ + if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { + ORTE_ERROR_LOG(ret); + error = "orte_errmgr_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_errmgr_base_select"; + goto error; + } + /* Setup the communication infrastructure */ /* Runtime Messaging Layer */ diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index 466fc1ddf7..d8567f72a5 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -60,7 +60,6 @@ #include 
"orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/routed/routed.h" #include "orte/mca/ess/ess.h" #include "orte/mca/notifier/notifier.h" #include "orte/runtime/orte_wait.h" @@ -597,12 +596,12 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer) peer->peer_state); } - mca_oob_tcp_peer_shutdown(peer); - - /* inform the routed framework that we have lost a connection so + /* inform the ERRMGR framework that we have lost a connection so * it can decide if this is important, what to do about it, etc. */ - if (ORTE_SUCCESS != orte_routed.route_lost(&peer->peer_name)) { + if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state(peer->peer_name.jobid, ORTE_JOB_STATE_COMM_FAILED, + &peer->peer_name, ORTE_PROC_STATE_COMM_FAILED, + ORTE_ERROR_DEFAULT_EXIT_CODE)) { /* Should free the peer lock before we abort so we don't * get stuck in the orte_wait_kill when receiving messages in the * tcp OOB diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index bee52a33f7..f04f6495f6 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -1228,8 +1228,7 @@ static void abort_exit_callback(int fd, short ign, void *arg) * procedure. */ orte_enable_recovery = false; - orte_errmgr_base.shutting_down = true; - + /* terminate the orteds - they will automatically kill * their local procs */ diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index f5d7ed3c3e..ab6b47c158 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -129,6 +129,10 @@ const char *orte_err2str(int errnum) case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED: retval = "Limit on number of process relocations was exceeded"; break; + case ORTE_ERR_UNRECOVERABLE: + retval = "Unrecoverable error"; + break; + default: retval = NULL;