Begin handling the case of lost connections by having the OOB report it to the errmgr instead of the routed framework. Add an "app" component to the errmgr framework
so that it can decide how to respond - which for now at least is just to check for lifeline and abort if so. Add a new error constant to indicate that the error is "unrecoverable" so the oob can know it needs to abort. This commit was SVN r23112.
Этот коммит содержится в:
родитель
5965d3e620
Коммит
4bd25f587c
@ -114,7 +114,8 @@ enum {
|
||||
ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31),
|
||||
ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
|
||||
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33),
|
||||
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34)
|
||||
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34),
|
||||
ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35)
|
||||
};
|
||||
|
||||
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)
|
||||
|
12
orte/mca/errmgr/app/.windows
Обычный файл
12
orte/mca/errmgr/app/.windows
Обычный файл
@ -0,0 +1,12 @@
|
||||
#
|
||||
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
mca_link_libraries=libopen-rte
|
36
orte/mca/errmgr/app/Makefile.am
Обычный файл
36
orte/mca/errmgr/app/Makefile.am
Обычный файл
@ -0,0 +1,36 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
EXTRA_DIST = .windows
|
||||
|
||||
sources = \
|
||||
errmgr_app.h \
|
||||
errmgr_app_component.c \
|
||||
errmgr_app.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_app_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_app.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_app.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_app_la_SOURCES = $(sources)
|
||||
mca_errmgr_app_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_app_la_SOURCES =$(sources)
|
||||
libmca_errmgr_app_la_LDFLAGS = -module -avoid-version
|
22
orte/mca/errmgr/app/configure.params
Обычный файл
22
orte/mca/errmgr/app/configure.params
Обычный файл
@ -0,0 +1,22 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
108
orte/mca/errmgr/app/errmgr_app.c
Обычный файл
108
orte/mca/errmgr/app/errmgr_app.c
Обычный файл
@ -0,0 +1,108 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_app.h"
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
/******************
|
||||
* APP module
|
||||
******************/
|
||||
/*
 * Function table for the "app" errmgr module.
 *
 * Entries are positional. This module supplies init, finalize and
 * update_state; the three trailing slots are left NULL, i.e. the module
 * provides no function for them.
 */
orte_errmgr_base_module_t orte_errmgr_app_module = {
|
||||
init,
|
||||
finalize,
|
||||
update_state,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), exit_code));
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* see is this was a lifeline */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
|
||||
return ORTE_ERR_UNRECOVERABLE;
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
35
orte/mca/errmgr/app/errmgr_app.h
Обычный файл
35
orte/mca/errmgr/app/errmgr_app.h
Обычный файл
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_APP_EXPORT_H
|
||||
#define MCA_ERRMGR_APP_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_app_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_app_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_APP_EXPORT_H */
|
83
orte/mca/errmgr/app/errmgr_app_component.c
Обычный файл
83
orte/mca/errmgr/app/errmgr_app_component.c
Обычный файл
@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "errmgr_app.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_app_component_version_string =
|
||||
"ORTE ERRMGR app MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_app_open(void);
|
||||
static int errmgr_app_close(void);
|
||||
static int errmgr_app_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
/*
 * Public component descriptor for the "app" errmgr component. Entries are
 * positional: version macro, component name and ORTE version numbers, the
 * open/close/query functions, then the component metadata.
 */
orte_errmgr_base_component_t mca_errmgr_app_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itself
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"app",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open, close and query functions */
|
||||
errmgr_app_open,
|
||||
errmgr_app_close,
|
||||
errmgr_app_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
};
|
||||
|
||||
static int errmgr_app_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_app_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Component query: offer the app module only to application processes.
 *
 * @param module   Out: set to &orte_errmgr_app_module when ORTE_PROC_IS_APP
 *                 is true, otherwise to NULL.
 * @param priority Out: set to 10 when ORTE_PROC_IS_APP is true, otherwise
 *                 to -1.
 *
 * @return ORTE_SUCCESS when the module is offered, ORTE_ERROR otherwise.
 */
static int errmgr_app_component_query(mca_base_module_t **module, int *priority)
{
    /* Start from the "not selectable" answer and upgrade it below. */
    int status = ORTE_ERROR;
    int chosen_priority = -1;
    mca_base_module_t *chosen_module = NULL;

    if (ORTE_PROC_IS_APP) {
        /* keep our priority low so that other modules are higher
         * and will run before us
         */
        chosen_priority = 10;
        chosen_module = (mca_base_module_t *)&orte_errmgr_app_module;
        status = ORTE_SUCCESS;
    }

    *priority = chosen_priority;
    *module = chosen_module;
    return status;
}
|
@ -68,19 +68,13 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
int i;
|
||||
orte_errmgr_stack_state_t stack_state;
|
||||
orte_errmgr_base_module_t *module;
|
||||
|
||||
if( ORTE_PROC_IS_APP ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if( !orte_errmgr_base.shutting_down ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:update_state() %s) "
|
||||
"------- %s state updated for process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == name) ? "App. Process" : (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
|
||||
(NULL == name) ? "NULL" : ORTE_NAME_PRINT(name)));
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:update_state() %s) "
|
||||
"------- %s state updated for process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == name) ? "App. Process" : (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
|
||||
(NULL == name) ? "NULL" : ORTE_NAME_PRINT(name)));
|
||||
|
||||
stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
|
||||
stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
@ -101,7 +95,7 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
|
@ -79,12 +79,6 @@ int orte_errmgr_base_open(void)
|
||||
|
||||
orte_errmgr_base.output = opal_output_open(NULL);
|
||||
|
||||
/*
|
||||
* A flag to indicate that orterun is shutting down, so skip the recovery
|
||||
* logic.
|
||||
*/
|
||||
orte_errmgr_base.shutting_down = false;
|
||||
|
||||
/*
|
||||
* Open up all available components
|
||||
*/
|
||||
|
@ -42,7 +42,6 @@ BEGIN_C_DECLS
|
||||
/* define a struct to hold framework-global values */
|
||||
typedef struct {
|
||||
int output;
|
||||
bool shutting_down;
|
||||
opal_pointer_array_t modules;
|
||||
bool initialized;
|
||||
} orte_errmgr_base_t;
|
||||
|
@ -132,7 +132,7 @@ static int update_state(orte_jobid_t job,
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_errmgr_base.shutting_down) {
|
||||
if (orte_finalizing) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -235,6 +235,18 @@ static int update_state(orte_jobid_t job,
|
||||
hnp_abort(jdata->jobid, exit_code);
|
||||
}
|
||||
break;
|
||||
case ORTE_JOB_STATE_COMM_FAILED:
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||
check_job_complete(jdata); /* set the local proc states */
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
* NULL, then we need to tell everyone else to die
|
||||
*/
|
||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||
hnp_abort(jdata->jobid, exit_code);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
@ -253,7 +265,6 @@ static int update_state(orte_jobid_t job,
|
||||
case ORTE_PROC_STATE_ABORTED:
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
case ORTE_PROC_STATE_COMM_FAILED:
|
||||
if (jdata->enable_recovery) {
|
||||
/* is this a local proc */
|
||||
if (NULL != (child = proc_is_local(proc))) {
|
||||
@ -327,6 +338,12 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_COMM_FAILED:
|
||||
/* is this to a daemon? */
|
||||
/* relocate its processes */
|
||||
/* attempt to restart? */
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -78,7 +78,7 @@ static int ft_event(int state);
|
||||
|
||||
|
||||
/******************
|
||||
* ORCM module
|
||||
* ORTED module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_orted_module = {
|
||||
init,
|
||||
@ -123,7 +123,13 @@ static int update_state(orte_jobid_t job,
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_errmgr_base.shutting_down) {
|
||||
if (orte_finalizing) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if this is a heartbeat failure, let the HNP handle it */
|
||||
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
|
||||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -44,6 +44,7 @@
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#endif
|
||||
@ -65,6 +66,18 @@ int orte_ess_base_app_setup(void)
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
|
||||
/* setup the errmgr */
|
||||
if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_errmgr_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_errmgr_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Setup the communication infrastructure */
|
||||
|
||||
/* Runtime Messaging Layer */
|
||||
|
@ -60,7 +60,6 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
@ -597,12 +596,12 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
peer->peer_state);
|
||||
}
|
||||
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
|
||||
/* inform the routed framework that we have lost a connection so
|
||||
/* inform the ERRMGR framework that we have lost a connection so
|
||||
* it can decide if this is important, what to do about it, etc.
|
||||
*/
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(&peer->peer_name)) {
|
||||
if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state(peer->peer_name.jobid, ORTE_JOB_STATE_COMM_FAILED,
|
||||
&peer->peer_name, ORTE_PROC_STATE_COMM_FAILED,
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE)) {
|
||||
/* Should free the peer lock before we abort so we don't
|
||||
* get stuck in the orte_wait_kill when receiving messages in the
|
||||
* tcp OOB
|
||||
|
@ -1228,8 +1228,7 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
* procedure.
|
||||
*/
|
||||
orte_enable_recovery = false;
|
||||
orte_errmgr_base.shutting_down = true;
|
||||
|
||||
|
||||
/* terminate the orteds - they will automatically kill
|
||||
* their local procs
|
||||
*/
|
||||
|
@ -129,6 +129,10 @@ const char *orte_err2str(int errnum)
|
||||
case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED:
|
||||
retval = "Limit on number of process relocations was exceeded";
|
||||
break;
|
||||
case ORTE_ERR_UNRECOVERABLE:
|
||||
retval = "Unrecoverable error";
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
retval = NULL;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user