1
1

Begin handling the case of lost connections by having the OOB report it to the errmgr instead of the routed framework. Add an "app" component to the errmgr framework so that it can decide how to respond - which for now at least is just to check for lifeline and abort if so.

Add a new error constant to indicate that the error is "unrecoverable" so the oob can know it needs to abort.

This commit was SVN r23112.
Этот коммит содержится в:
Ralph Castain 2010-05-11 00:34:12 +00:00
родитель 5965d3e620
Коммит 4bd25f587c
16 изменённых файлов: 355 добавлений и 33 удалений

Просмотреть файл

@ -114,7 +114,8 @@ enum {
ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31),
ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33),
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34)
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34),
ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35)
};
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)

12
orte/mca/errmgr/app/.windows Обычный файл
Просмотреть файл

@ -0,0 +1,12 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

36
orte/mca/errmgr/app/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,36 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
sources = \
errmgr_app.h \
errmgr_app_component.c \
errmgr_app.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_app_DSO
component_noinst =
component_install = mca_errmgr_app.la
else
component_noinst = libmca_errmgr_app.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_app_la_SOURCES = $(sources)
mca_errmgr_app_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_app_la_SOURCES =$(sources)
libmca_errmgr_app_la_LDFLAGS = -module -avoid-version

22
orte/mca/errmgr/app/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,22 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_CONFIG_FILES="Makefile"

108
orte/mca/errmgr/app/errmgr_app.c Обычный файл
Просмотреть файл

@ -0,0 +1,108 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_app.h"
/*
 * Module functions: file-local implementations that are exported only
 * through the orte_errmgr_app_module function table.
 */
static int init(void);
static int finalize(void);
/* Parameter names match the definition of update_state in this file. */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        orte_exit_code_t exit_code,
                        orte_errmgr_stack_state_t *stack_state);
/******************
* APP module
******************/
/*
 * Function table for the application-process error manager module.
 * Entries are positional: the first three slots hold this file's init,
 * finalize and update_state; the three trailing slots are left NULL
 * because this component supplies no implementation for them.
 */
orte_errmgr_base_module_t orte_errmgr_app_module = {
init,
finalize,
update_state,
NULL,
NULL,
NULL
};
/************************
* API Definitions
************************/
/**
 * Module initialization hook.
 *
 * No setup is performed.
 *
 * @return ORTE_SUCCESS always.
 */
static int init(void)
{
    int rc = ORTE_SUCCESS;

    return rc;
}
/**
 * Module shutdown hook.
 *
 * No teardown is performed.
 *
 * @return ORTE_SUCCESS always.
 */
static int finalize(void)
{
    int rc = ORTE_SUCCESS;

    return rc;
}
/**
 * React to a job/process state change reported to an application process.
 *
 * The only state acted upon is ORTE_PROC_STATE_COMM_FAILED: the route to
 * the failed peer is deleted and the routed framework is asked whether the
 * lost connection matters (e.g. it was our lifeline).
 *
 * @param job          Job the report refers to (used for trace output only).
 * @param jobstate     Reported job state (used for trace output only).
 * @param proc         Process the report refers to; may be NULL.
 * @param state        Reported process state.
 * @param exit_code    Exit code associated with the report (trace output only).
 * @param stack_state  In/out flags; ORTE_ERRMGR_STACK_STATE_COMPLETE is always
 *                     OR'ed in to mark this module as the end of the line.
 *
 * @return ORTE_ERR_UNRECOVERABLE if a communication failure was reported and
 *         orte_routed.route_lost() did not return ORTE_SUCCESS for the peer;
 *         ORTE_SUCCESS otherwise.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        orte_exit_code_t exit_code,
                        orte_errmgr_stack_state_t *stack_state)
{
    /* indicate that this is the end of the line */
    *stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:app: job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), exit_code));

    /* if orte is trying to shutdown, just let it */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    /* communication failures are the only state handled here */
    if (ORTE_PROC_STATE_COMM_FAILED != state) {
        return ORTE_SUCCESS;
    }

    /* The trace output above tolerates a NULL proc, so this path must too:
     * without a peer name there is no route to delete or to check, and the
     * routed framework must not be handed a NULL name.
     */
    if (NULL == proc) {
        return ORTE_SUCCESS;
    }

    /* delete the route */
    orte_routed.delete_route(proc);

    /* see if this was a lifeline */
    if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
        return ORTE_ERR_UNRECOVERABLE;
    }

    return ORTE_SUCCESS;
}

35
orte/mca/errmgr/app/errmgr_app.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Public declarations for the "app" error manager (errmgr) component:
 * the component descriptor and the module function table it hands out.
 */
#ifndef MCA_ERRMGR_APP_EXPORT_H
#define MCA_ERRMGR_APP_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/* Component descriptor; defined in errmgr_app_component.c */
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_app_component;
/* Module function table; defined in errmgr_app.c */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_app_module;
END_C_DECLS
#endif /* MCA_ERRMGR_APP_EXPORT_H */

83
orte/mca/errmgr/app/errmgr_app_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,83 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_app.h"
/*
* Public string for version number
*/
const char *orte_errmgr_app_component_version_string =
"ORTE ERRMGR app MCA component version " ORTE_VERSION;
/*
* Local functionality: the open, close and query callbacks that are
* registered in mca_errmgr_app_component below.
*/
static int errmgr_app_open(void);
static int errmgr_app_close(void);
/* On success, reports the module to use and its selection priority. */
static int errmgr_app_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
orte_errmgr_base_component_t mca_errmgr_app_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"app",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
errmgr_app_open,
errmgr_app_close,
/* Component query function: decides whether this component is usable */
errmgr_app_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/**
 * Component open callback.
 *
 * Nothing is opened or registered here.
 *
 * @return ORTE_SUCCESS always.
 */
static int errmgr_app_open(void)
{
    int rc = ORTE_SUCCESS;

    return rc;
}
/**
 * Component close callback.
 *
 * Nothing is released here.
 *
 * @return ORTE_SUCCESS always.
 */
static int errmgr_app_close(void)
{
    int rc = ORTE_SUCCESS;

    return rc;
}
/**
 * Component query callback: offer the app module to application processes.
 *
 * @param module    Out: set to orte_errmgr_app_module when this process is
 *                  an application process, NULL otherwise.
 * @param priority  Out: set to 10 for application processes, -1 otherwise.
 *
 * @return ORTE_SUCCESS for application processes, ORTE_ERROR for all others.
 */
static int errmgr_app_component_query(mca_base_module_t **module, int *priority)
{
    /* only application processes may use this component */
    if (!(ORTE_PROC_IS_APP)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 10;
    *module = (mca_base_module_t *)&orte_errmgr_app_module;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -69,18 +69,12 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
orte_errmgr_stack_state_t stack_state;
orte_errmgr_base_module_t *module;
if( ORTE_PROC_IS_APP ) {
return ORTE_SUCCESS;
}
if( !orte_errmgr_base.shutting_down ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:base:update_state() %s) "
"------- %s state updated for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == name) ? "App. Process" : (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
(NULL == name) ? "NULL" : ORTE_NAME_PRINT(name)));
}
stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
@ -101,7 +95,7 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
}
}
return ORTE_SUCCESS;
return rc;
}
int orte_errmgr_base_abort(int error_code, char *fmt, ...)

Просмотреть файл

@ -79,12 +79,6 @@ int orte_errmgr_base_open(void)
orte_errmgr_base.output = opal_output_open(NULL);
/*
* A flag to indicate that orterun is shutting down, so skip the recovery
* logic.
*/
orte_errmgr_base.shutting_down = false;
/*
* Open up all available components
*/

Просмотреть файл

@ -42,7 +42,6 @@ BEGIN_C_DECLS
/* define a struct to hold framework-global values */
typedef struct {
int output;
bool shutting_down;
opal_pointer_array_t modules;
bool initialized;
} orte_errmgr_base_t;

Просмотреть файл

@ -132,7 +132,7 @@ static int update_state(orte_jobid_t job,
/*
* if orte is trying to shutdown, just let it
*/
if (orte_errmgr_base.shutting_down) {
if (orte_finalizing) {
return ORTE_SUCCESS;
}
@ -235,6 +235,18 @@ static int update_state(orte_jobid_t job,
hnp_abort(jdata->jobid, exit_code);
}
break;
case ORTE_JOB_STATE_COMM_FAILED:
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die
*/
if (NULL != (jdata = orte_get_job_data_object(job))) {
hnp_abort(jdata->jobid, exit_code);
}
break;
default:
break;
@ -253,7 +265,6 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_ABORTED:
case ORTE_PROC_STATE_ABORTED_BY_SIG:
case ORTE_PROC_STATE_TERM_WO_SYNC:
case ORTE_PROC_STATE_COMM_FAILED:
if (jdata->enable_recovery) {
/* is this a local proc */
if (NULL != (child = proc_is_local(proc))) {
@ -327,6 +338,12 @@ static int update_state(orte_jobid_t job,
}
break;
case ORTE_PROC_STATE_COMM_FAILED:
/* is this to a daemon? */
/* relocate its processes */
/* attempt to restart? */
break;
default:
break;
}

Просмотреть файл

@ -78,7 +78,7 @@ static int ft_event(int state);
/******************
* ORCM module
* ORTED module
******************/
orte_errmgr_base_module_t orte_errmgr_orted_module = {
init,
@ -123,7 +123,13 @@ static int update_state(orte_jobid_t job,
/*
* if orte is trying to shutdown, just let it
*/
if (orte_errmgr_base.shutting_down) {
if (orte_finalizing) {
return ORTE_SUCCESS;
}
/* if this is a heartbeat failure, let the HNP handle it */
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
return ORTE_SUCCESS;
}

Просмотреть файл

@ -44,6 +44,7 @@
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/filem/base/base.h"
#include "orte/mca/errmgr/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#endif
@ -65,6 +66,18 @@ int orte_ess_base_app_setup(void)
int ret;
char *error = NULL;
/* setup the errmgr */
if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_select";
goto error;
}
/* Setup the communication infrastructure */
/* Runtime Messaging Layer */

Просмотреть файл

@ -60,7 +60,6 @@
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/notifier/notifier.h"
#include "orte/runtime/orte_wait.h"
@ -597,12 +596,12 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
peer->peer_state);
}
mca_oob_tcp_peer_shutdown(peer);
/* inform the routed framework that we have lost a connection so
/* inform the ERRMGR framework that we have lost a connection so
* it can decide if this is important, what to do about it, etc.
*/
if (ORTE_SUCCESS != orte_routed.route_lost(&peer->peer_name)) {
if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state(peer->peer_name.jobid, ORTE_JOB_STATE_COMM_FAILED,
&peer->peer_name, ORTE_PROC_STATE_COMM_FAILED,
ORTE_ERROR_DEFAULT_EXIT_CODE)) {
/* Should free the peer lock before we abort so we don't
* get stuck in the orte_wait_kill when receiving messages in the
* tcp OOB

Просмотреть файл

@ -1228,7 +1228,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
* procedure.
*/
orte_enable_recovery = false;
orte_errmgr_base.shutting_down = true;
/* terminate the orteds - they will automatically kill
* their local procs

Просмотреть файл

@ -129,6 +129,10 @@ const char *orte_err2str(int errnum)
case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED:
retval = "Limit on number of process relocations was exceeded";
break;
case ORTE_ERR_UNRECOVERABLE:
retval = "Unrecoverable error";
break;
default:
retval = NULL;