1
1

Per discussion with Josh, clean up the errmgr API by creating separate modules for the public vs. internal APIs. This mirrors the architecture used in other frameworks that had similar requirements.

Remove the orcm errmgr module - moving to the orcm code base so it can utilize orcm communications and not interfere with ompi-related operations.

This commit was SVN r22931.
Этот коммит содержится в:
Ralph Castain 2010-04-05 22:59:21 +00:00
родитель 1caba7af2f
Коммит a1e82e9d05
14 изменённых файлов: 161 добавлений и 685 удалений

Просмотреть файл

@ -43,8 +43,8 @@ int orte_errmgr_base_close(void)
if( NULL == module ) {
continue;
}
if( NULL != module->internal_errmgr_finalize ) {
module->internal_errmgr_finalize();
if( NULL != module->finalize ) {
module->finalize();
}
}

Просмотреть файл

@ -145,8 +145,8 @@ int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code)
if( NULL == module ) {
continue;
}
if( NULL != module->internal_process_fault ) {
module->internal_process_fault(jdata, name, state, &stack_state);
if( NULL != module->process_fault ) {
module->process_fault(jdata, name, state, &stack_state);
}
}
}
@ -284,8 +284,8 @@ int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code)
if( NULL == module ) {
continue;
}
if( NULL != module->internal_process_fault ) {
module->internal_process_fault(jdata, NULL, state, &stack_state);
if( NULL != module->process_fault ) {
module->process_fault(jdata, NULL, state, &stack_state);
}
}
}
@ -390,8 +390,8 @@ int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code)
if( NULL == module ) {
continue;
}
if( NULL != module->internal_process_fault ) {
module->internal_process_fault(jdata, name, state, &stack_state);
if( NULL != module->process_fault ) {
module->process_fault(jdata, name, state, &stack_state);
}
}
}
@ -462,6 +462,7 @@ int orte_errmgr_base_predicted_fault(char ***proc_list,
{
orte_errmgr_base_module_t *module = NULL;
int i;
int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
/*
* If the user did not ask for recovery, then do not process recovery events
@ -485,8 +486,8 @@ int orte_errmgr_base_predicted_fault(char ***proc_list,
if( NULL == module ) {
continue;
}
if( NULL != module->internal_predicted_fault ) {
module->internal_predicted_fault(proc_list, node_list, suggested_nodes);
if( NULL != module->predicted_fault ) {
module->predicted_fault(proc_list, node_list, suggested_nodes, &stack_state);
}
}
@ -499,6 +500,7 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
{
orte_errmgr_base_module_t *module = NULL;
int i;
int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
/*
* If the user did not ask for recovery, then do not process recovery events
@ -522,8 +524,8 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
if( NULL == module ) {
continue;
}
if( NULL != module->internal_suggest_map_targets ) {
module->internal_suggest_map_targets(proc, oldnode, node_list);
if( NULL != module->suggest_map_targets ) {
module->suggest_map_targets(proc, oldnode, node_list, &stack_state);
}
}
@ -546,8 +548,8 @@ int orte_errmgr_base_ft_event(int state)
if( NULL == module ) {
continue;
}
if( NULL != module->internal_ft_event ) {
module->internal_ft_event(state);
if( NULL != module->ft_event ) {
module->ft_event(state);
}
}

Просмотреть файл

@ -54,19 +54,14 @@ bool orte_errmgr_initialized = false;
opal_list_t orte_errmgr_base_components_available;
/* Public module provides a wrapper around previous functions */
orte_errmgr_base_module_t orte_errmgr = {
orte_errmgr_API_t orte_errmgr = {
orte_errmgr_base_log,
orte_errmgr_base_proc_aborted,
orte_errmgr_base_incomplete_start,
orte_errmgr_base_comm_failed,
orte_errmgr_base_abort,
/* Internal Interfaces */
NULL, /* internal_errmgr_init */
NULL, /* internal_errmgr_finalize */
NULL, /* internal_predicted_fault */
NULL, /* internal_process_fault */
NULL, /* internal_suggest_map_targets */
NULL /* internal_ft_event */
orte_errmgr_base_predicted_fault,
orte_errmgr_base_suggest_map_targets,
orte_errmgr_base_abort
};
/**

Просмотреть файл

@ -171,8 +171,8 @@ int orte_errmgr_base_select(void)
if( NULL == i_module ) {
continue;
}
if( NULL != i_module->internal_errmgr_init ) {
i_module->internal_errmgr_init();
if( NULL != i_module->init ) {
i_module->init();
}
}

Просмотреть файл

@ -60,13 +60,13 @@ ORTE_DECLSPEC int orte_errmgr_base_abort(int error_code, char *fmt, ...)
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
ORTE_DECLSPEC int orte_recos_base_predicted_fault(char ***proc_list,
ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes);
ORTE_DECLSPEC int orte_recos_base_suggest_map_targets(orte_proc_t *proc,
ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
ORTE_DECLSPEC int orte_recos_base_ft_event(int state);
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
/*
* Additional External API function declared in errmgr.h

Просмотреть файл

@ -86,13 +86,120 @@ BEGIN_C_DECLS
#define ORTE_ERROR_NAME(n) opal_strerror(n)
#define ORTE_ERROR_LOG(n) \
orte_errmgr_base_log(n, __FILE__, __LINE__)
orte_errmgr.log(n, __FILE__, __LINE__)
/**** FRAMEWORK API FUNCTIONS ****/
/**
* This is not part of any module so it can be used at any time!
*/
ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, int line);
typedef void (*orte_errmgr_base_API_log_fn_t)(int error_code, char *filename, int line);
/**
* Alert - process aborted
* This function is called by the PLM when a remote process aborts during execution. Actions taken
* in response to the abnormal termination of a remote application process will vary across
* the various errmgr components.
*
* NOTE: Local process errors should always be reported through the error_detected interface and
* NOT here.
*
* @param *name Pointer to the name of the proc that aborted
*
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
typedef int (*orte_errmgr_base_API_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
/**
* Alert - incomplete start of a job
* This function is called by the PLM when an attempted launch of a job encounters failure of
* one or more processes to start. The strategy for dealing
* with this "incomplete start" situation varies across the various errmgr components.
*
* This function is only called by the respective process launcher, which is responsible
* for detecting incomplete starts. If on a daemon, the function simply updates the
* process state to indicate failure to launch - this initiates a trigger that goes to
* the respective HNP for response.
*
* NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden
* from taking any action to this function call. Instead, they are restricted to simply
* returning.
*
* @param job Job that failed to start
*
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
typedef int (*orte_errmgr_base_API_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
/**
* If the communication link failed to a peer.
* This gives us a chance to recover from this error, or abort.
*/
typedef int (*orte_errmgr_base_API_comm_failed_fn_t)(orte_process_name_t *name,
int exit_code);
/**
* Predicted process/node failure notification
* Composite interface. Called in priority order.
*
* @param[in] proc_list List of processes (or NULL if none)
* @param[in] node_list List of nodes (or NULL if none)
* @param[in] suggested_nodes List of suggested nodes to use on recovery (or NULL if none)
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_API_predicted_fault_fn_t)(char ***proc_list,
char ***node_list,
char ***suggested_nodes);
/**
* Suggest a node to map a restarting process onto
*
* @param[in] proc Process that is being mapped
* @param[in] oldnode Previous node where this process resided
* @param[in,out] node_list List of nodes to select from
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_API_suggest_map_targets_fn_t)(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
/**
* Alert - self aborting
* This function is called when a process is aborting due to some internal error.
* It will finalize the process
* itself, and then exit - it takes no other actions. The intent here is to provide
* a last-ditch exit procedure that attempts to clean up a little.
*/
typedef int (*orte_errmgr_base_API_abort_fn_t)(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
/* global structure for accessing ERRMGR FRAMEWORK API's */
typedef struct {
orte_errmgr_base_API_log_fn_t log;
orte_errmgr_base_API_proc_aborted_fn_t proc_aborted;
orte_errmgr_base_API_incomplete_start_fn_t incomplete_start;
orte_errmgr_base_API_comm_failed_fn_t comm_failed;
orte_errmgr_base_API_predicted_fault_fn_t predicted_fault;
orte_errmgr_base_API_suggest_map_targets_fn_t suggest_map_targets;
orte_errmgr_base_API_abort_fn_t abort;
} orte_errmgr_API_t;
ORTE_DECLSPEC extern orte_errmgr_API_t orte_errmgr;
/**** INTERNAL MODULE FUNCTIONS ****/
/**
* Module initialization function.
@ -115,48 +222,20 @@ typedef int (*orte_errmgr_base_module_finalize_fn_t)
(void);
/*
* Internal Composite Interfaces
* Internal Composite Interfaces corresponding to API interfaces
*/
/**
* Predicted process/node failure notification
* Composite interface. Called in priority order.
*
* @param[in] proc_list List of processes (or NULL if none)
* @param[in] node_list List of nodes (or NULL if none)
* @param[in] suggested_nodes List of suggested nodes to use on recovery (or NULL if none)
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecifed error occurred
*/
typedef int (*orte_errmgr_base_predicted_fault_fn_t)
(char ***proc_list, char ***node_list, char ***suggested_nodes);
/**
* Actual process failure notification
* Composite interface. Called in priority order.
*
* @param[in] proc_name Name of the failed processes
* @param[in] state State of the failed process
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecifed error occurred
*/
typedef int (*orte_errmgr_base_process_fault_fn_t)
(orte_job_t *jdata, orte_process_name_t *proec_name, orte_proc_state_t state, int *stack_state);
/**
* Suggest a node to map a restarting process onto
* Composite interface. Called in priority order.
*
* @param[in] proc Process that is being mapped
* @param[in] oldnode Previous node where this process resided
* @param[in|out] node_list List of nodes to select from
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecifed error occurred
*/
typedef int (*orte_errmgr_base_suggest_map_targets_fn_t)
(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list);
typedef int (*orte_errmgr_base_module_process_fault_fn_t)(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state,
int *stack_state);
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(char ***proc_list,
char ***node_list,
char ***suggested_nodes,
int *stack_state);
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
int *stack_state);
/**
* Handle fault tolerance updates
@ -169,110 +248,25 @@ typedef int (*orte_errmgr_base_suggest_map_targets_fn_t)
typedef int (*orte_errmgr_base_ft_event_fn_t)(int state);
/*
* External API Functions - Implemented in errmgr/base/errmgr_base_fns.c
*/
ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes);
ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
/**
* Alert - process aborted
* This function is called by the PLM when a remote process aborts during execution. Actions taken
* in response to the abnormal termination of a remote application process will vary across
* the various errmgr components.
*
* NOTE: Local process errors should always be reported through the error_detected interface and
* NOT here.
*
* @param *name Pointer to the name of the proc that aborted
*
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
ORTE_DECLSPEC extern int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code);
typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
/**
* Alert - incomplete start of a job
* This function is called by the PLM when an attempted launch of a job encounters failure of
* one or more processes to start. The strategy for dealing
* with this "incomplete start" situation varies across the various errmgr components.
*
* This function is only called by the respective process launcher, which is responsible
* for detecting incomplete starts. If on a daemon, the function simply updates the
* process state to indicate failure to launch - this initiates a trigger that goes to
* the respective HNP for response.
*
* NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden
* from taking any action to this function call. Instead, they are restricted to simply
* returning.
*
* @param job Job that failed to start
*
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
ORTE_DECLSPEC extern int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code);
typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
/**
* Alert - self aborting
* This function is called when a process is aborting due to some internal error.
* It will finalize the process
* itself, and then exit - it takes no other actions. The intent here is to provide
* a last-ditch exit procedure that attempts to clean up a little.
*/
ORTE_DECLSPEC extern int orte_errmgr_base_abort(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
typedef int (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
/**
* If the communication link failed to a peer.
* This gives us a chance to recover from this error, or abort.
*/
ORTE_DECLSPEC extern int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code);
typedef int (*orte_errmgr_base_module_comm_failed_fn_t)(orte_process_name_t *name,
int exit_code);
/*
* Module Structure
*/
struct orte_errmgr_base_module_2_3_0_t {
/* ---- Previous Interfaces (Always call base) -- */
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
orte_errmgr_base_module_comm_failed_fn_t comm_failed;
orte_errmgr_base_module_abort_fn_t abort;
/** Initialization Function */
orte_errmgr_base_module_init_fn_t init;
/** Finalization Function */
orte_errmgr_base_module_finalize_fn_t finalize;
/* -------------- Internal Composite Interfaces -- */
/** Initialization Function */
orte_errmgr_base_module_init_fn_t internal_errmgr_init;
/** Finalization Function */
orte_errmgr_base_module_finalize_fn_t internal_errmgr_finalize;
/** Predicted process/node failure notification */
orte_errmgr_base_predicted_fault_fn_t internal_predicted_fault;
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
/** Actual process failure notification */
orte_errmgr_base_process_fault_fn_t internal_process_fault;
orte_errmgr_base_module_process_fault_fn_t process_fault;
/** Suggest a node to map a restarting process onto */
orte_errmgr_base_suggest_map_targets_fn_t internal_suggest_map_targets;
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
/** Handle any FT Notifications */
orte_errmgr_base_ft_event_fn_t internal_ft_event;
orte_errmgr_base_ft_event_fn_t ft_event;
};
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
@ -297,10 +291,6 @@ struct orte_errmgr_base_component_3_0_0_t {
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
/*
* Global structure for accessing previous error manager functions
*/
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
/*
* Macro for use in components that are of type errmgr

Просмотреть файл

@ -1,37 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-orte-errmgr-orcm.txt
sources = \
errmgr_orcm.h \
errmgr_orcm_component.c \
errmgr_orcm_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_orcm_DSO
component_noinst =
component_install = mca_errmgr_orcm.la
else
component_noinst = libmca_errmgr_orcm.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_orcm_la_SOURCES = $(sources)
mca_errmgr_orcm_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_orcm_la_SOURCES = $(sources)
libmca_errmgr_orcm_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,19 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_errmgr_orcm_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_errmgr_orcm_CONFIG],[
# Build this component only when orcm fault tolerance was requested,
# i.e. when opal_want_ft_orcm is set to 1. In that case the first macro
# argument is expanded, otherwise the second one is.
AS_IF([test "$opal_want_ft_orcm" = "1"],
[$1],
[$2])
])dnl

Просмотреть файл

@ -1,13 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_INIT_FILE=errmgr_orcm_component.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,35 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_ORCM_EXPORT_H
#define MCA_ERRMGR_ORCM_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_orcm_component;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orcm_module;
END_C_DECLS
#endif /* MCA_ERRMGR_ORCM_EXPORT_H */

Просмотреть файл

@ -1,86 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_orcm.h"
/*
* Public string for version number
*/
const char *orte_errmgr_orcm_component_version_string =
"ORTE ERRMGR orcm MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_orcm_open(void);
static int errmgr_orcm_close(void);
static int errmgr_orcm_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
 * Component descriptor for the "orcm" errmgr component. The initializer is
 * positional, so the order of the entries below must match the layout of
 * orte_errmgr_base_component_t.
 */
orte_errmgr_base_component_t mca_errmgr_orcm_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"orcm",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
errmgr_orcm_open,
errmgr_orcm_close,
errmgr_orcm_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority
 * NOTE(review): errmgr_orcm_component_query() reports a hard-coded
 * priority of 100 rather than this value - confirm which is intended.
 */
1
};
/**
 * Component open hook.
 *
 * Nothing needs to be set up for this component, so the call
 * unconditionally reports success.
 *
 * @retval ORTE_SUCCESS Always
 */
static int errmgr_orcm_open(void)
{
    return ORTE_SUCCESS;
}
/**
 * Component close hook.
 *
 * There is no component-level state to release, so the call
 * unconditionally reports success.
 *
 * @retval ORTE_SUCCESS Always
 */
static int errmgr_orcm_close(void)
{
    return ORTE_SUCCESS;
}
/**
 * Component query hook: hand back the orcm module together with its
 * selection priority.
 *
 * @param[out] module   Receives a pointer to orte_errmgr_orcm_module
 * @param[out] priority Receives the fixed priority value 100
 *
 * @retval ORTE_SUCCESS Always
 */
static int errmgr_orcm_component_query(mca_base_module_t **module, int *priority)
{
    /* Offer our module ... */
    *module = (mca_base_module_t *)&orte_errmgr_orcm_module;

    /* ... and, since the component was built at all, ask for a priority
     * high enough that we should probably become the default module.
     */
    *priority = 100;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,307 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/argv.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_orcm.h"
/*
* Module functions: Global
*/
static int init(void);
static int finalize(void);
static int predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes);
static int process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state,
int *stack_state);
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
static int ft_event(int state);
/******************
* ORCM module
******************/
/*
 * Function table exported by the orcm errmgr module. The initializer is
 * positional: the four leading "old interface" slots are deliberately left
 * NULL, and the remaining entries point at the static functions defined
 * in this file.
 */
orte_errmgr_base_module_t orte_errmgr_orcm_module = {
NULL, /* proc_aborted (old interface) */
NULL, /* incomplete_start (old interface) */
NULL, /* comm_failed (old interface) */
NULL, /* abort (old interface) */
init,
finalize,
predicted_fault,
process_fault,
suggest_map_targets,
ft_event
};
/************************
* API Definitions
************************/
/**
 * Module initialization hook.
 *
 * The module keeps no state of its own, so there is nothing to prepare.
 *
 * @retval ORTE_SUCCESS Always
 */
static int init(void)
{
    return ORTE_SUCCESS;
}
/**
 * Module finalization hook.
 *
 * The module keeps no state of its own, so there is nothing to tear down.
 *
 * @retval ORTE_SUCCESS Always
 */
static int finalize(void)
{
    return ORTE_SUCCESS;
}
/**
 * Predicted process/node failure notification.
 *
 * This module does not act on predicted faults; every argument is ignored.
 *
 * @param[in] proc_list       Unused
 * @param[in] node_list       Unused
 * @param[in] suggested_nodes Unused
 *
 * @retval ORTE_ERR_NOT_IMPLEMENTED Always
 */
static int predicted_fault(char ***proc_list,
                           char ***node_list,
                           char ***suggested_nodes)
{
    /* The arguments are intentionally unused. */
    (void)proc_list;
    (void)node_list;
    (void)suggested_nodes;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
/**
 * Actual process failure notification: try to restart whatever was lost.
 *
 * Two cases are distinguished by comparing the failed proc's jobid with
 * our own (ORTE_PROC_MY_NAME):
 *
 *  - Application process: unless it failed to start, was killed by
 *    command, or has exceeded jdata->max_restarts, the job is reset and
 *    handed to orte_plm.spawn() again.
 *  - Daemon: the daemon and every proc on its node are marked
 *    ORTE_PROC_STATE_ABORTED, and each distinct job that had procs on the
 *    node is reset and respawned.
 *
 * @param[in]     jdata       Job object in which the failed proc is looked up
 * @param[in]     proc        Name of the failed process (must not be NULL)
 * @param[in]     state       State reported for the failed process
 * @param[in,out] stack_state ORTE_ERRMGR_STACK_STATE_JOB_ABORT is toggled
 *                            (XOR) once the fault is accepted for processing
 *
 * @retval ORTE_SUCCESS       The fault was handled (restart done or
 *                            deliberately not attempted)
 * @retval ORTE_ERR_NOT_FOUND No proc object exists for the failed app proc
 * @retval ORTE_ERROR         No proc name was given, a required object could
 *                            not be found, or a restart failed
 */
static int process_fault(orte_job_t *jdata,
                         orte_process_name_t *proc,
                         orte_proc_state_t state,
                         int *stack_state)
{
    orte_job_t *jnew;
    orte_proc_t *pdata;
    orte_app_context_t *app = NULL;
    orte_node_t *node, *newnode;
    orte_proc_t *daemon, *nodeproc;
    opal_value_array_t jobs;
    bool found;
    int i;
    int rc;
    size_t j;

    /* The base "incomplete start" path invokes this hook with a NULL
     * name; everything below dereferences proc, so refuse such a call
     * instead of crashing. stack_state is left untouched in that case.
     * NOTE(review): confirm that leaving stack_state alone is the desired
     * signal to the caller for a fault we cannot process.
     */
    if (NULL == proc) {
        opal_output(0, "errmgr:orcm:process_fault() called without a process name");
        return ORTE_ERROR;
    }

    *stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                         "errmgr:orcm:process_fault() "
                         "------- %s fault reported! proc %s (0x%x)",
                         (proc->jobid == ORTE_PROC_MY_NAME->jobid ? "Daemon" : "App. Process"),
                         ORTE_NAME_PRINT(proc),
                         state ));

    /* get the app - just for output purposes in case of error
     * NOTE(review): app is dereferenced in the messages below without a
     * NULL check - confirm the job always has an app at index 0.
     */
    app = opal_pointer_array_get_item(jdata->apps, 0);

    /* Remove the route to this process since it is dead */
    orte_routed.delete_route(proc);

    /**** NON-DAEMON PROC FAILED ****/
    if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
        /* if the proc failed to start or we killed it by cmd,
         * don't attempt to restart it as this can lead to an
         * infinite loop
         */
        if (ORTE_PROC_STATE_FAILED_TO_START == state) {
            opal_output(0, "APPLICATION %s FAILED TO START", app->app);
            return ORTE_SUCCESS;
        }
        /* if the proc was terminated by cmd, then do nothing */
        if (ORTE_PROC_STATE_KILLED_BY_CMD == state) {
            opal_output(0, "APPLICATION %s KILLED BY COMMAND", app->app);
            return ORTE_SUCCESS;
        }

        /* get the proc_t object for this process */
        pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
        if (NULL == pdata) {
            opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc));
            return ORTE_ERR_NOT_FOUND;
        }

        /* proc just died - save the node where this proc was located */
        node = pdata->node;

        /* increment restarts */
        pdata->restarts++;

        /* have we exceeded #restarts? */
        if (jdata->max_restarts < pdata->restarts) {
            opal_output(0, "Max restarts for proc %s of app %s has been exceeded - process will not be restarted",
                        ORTE_NAME_PRINT(proc), app->app);
            return ORTE_SUCCESS;
        }

        /* reset the job params for restart */
        orte_plm_base_reset_job(jdata);

        /* restart the job - the spawn function will remap and
         * launch the replacement proc(s)
         */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                             "%s RESTARTING APP: %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (ORTE_SUCCESS != orte_plm.spawn(jdata)) {
            opal_output(0, "FAILED TO RESTART APP %s", app->app);
            orte_trigger_event(&orte_exit);
            return ORTE_ERROR;
        }

        /* get the new node */
        newnode = pdata->node;

        /* report what we did */
        opal_output(0, "Proc %s:%s aborted on node %s and was restarted on node %s\n\n",
                    app->app, ORTE_NAME_PRINT(proc), node->name, newnode->name);
        return ORTE_SUCCESS;
    }

    /**** DAEMON FAILED ****/
    /* The proc's jobid matches ours (the app case returned above), so it
     * was a daemon that failed and we have to treat it differently.
     */
    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                         "%s Daemon %s failed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_VPID_PRINT(proc->vpid)));

    /* need to relaunch all the apps that were on
     * the node where this daemon was running as
     * they either died along with the node, or will
     * have self-terminated when the daemon died
     */
    if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
        /* nothing we can do - abort things */
        opal_output(0, "FAILED TO GET DAEMON OBJECT");
        return ORTE_ERROR;
    }

    /* flag the daemon state to indicate it terminated - this will
     * cause the daemon to be restarted IF required for starting
     * procs on that node
     */
    daemon->state = ORTE_PROC_STATE_ABORTED;

    /* identify the node where the daemon was running */
    node = daemon->node;

    /* release the contact info, if not already done */
    if (NULL != daemon->rml_uri) {
        free(daemon->rml_uri);
        daemon->rml_uri = NULL;
    }

    /* setup to track the jobs on this node; from here on every exit
     * must go through the cleanup label so the array is destructed
     */
    OBJ_CONSTRUCT(&jobs, opal_value_array_t);
    opal_value_array_init(&jobs, sizeof(orte_jobid_t));

    /* cycle through the node's procs */
    for (i=0; i < node->procs->size; i++) {
        if (NULL == (nodeproc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
            continue;
        }
        /* set the proc to abnormally terminated */
        nodeproc->state = ORTE_PROC_STATE_ABORTED;
        /* increment restarts */
        nodeproc->restarts++;
        /* check if this proc's jobid is already in array */
        found = false;
        for (j=0; j < opal_value_array_get_size(&jobs); j++) {
            if (nodeproc->name.jobid == OPAL_VALUE_ARRAY_GET_ITEM(&jobs, orte_jobid_t, j)) {
                found = true;
                break;
            }
        }
        if (!found) {
            /* add it */
            opal_value_array_append_item(&jobs, &nodeproc->name.jobid);
        }
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                         "%s RESTARTING APPS FROM NODE: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         node->name));

    rc = ORTE_SUCCESS;
    for (j=0; j < opal_value_array_get_size(&jobs); j++) {
        if (NULL == (jnew = orte_get_job_data_object(OPAL_VALUE_ARRAY_GET_ITEM(&jobs, orte_jobid_t, j)))) {
            /* nothing we can do - abort things */
            opal_output(0, "FAILED TO GET JOB OBJECT TO BE RESTARTED");
            rc = ORTE_ERROR;
            goto cleanup;
        }

        /* reset the job params for restart */
        orte_plm_base_reset_job(jnew);

        /* restart the job - the spawn function will remap and
         * launch the replacement proc(s)
         */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                             "%s RESTARTING JOB %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jnew->jobid)));
        if (ORTE_SUCCESS != orte_plm.spawn(jnew)) {
            opal_output(0, "FAILED TO RESTART APPS FROM NODE: %s", node->name);
            rc = ORTE_ERROR;
            goto cleanup;
        }
    }

    /* every affected job was respawned - report success, matching what
     * the application-process path returns after a successful restart
     */
    opal_output(0, "Daemon %s on node %s aborted - procs were restarted elsewhere\n\n",
                ORTE_NAME_PRINT(proc), node->name);

cleanup:
    /* all done - cleanup and leave */
    OBJ_DESTRUCT(&jobs);
    return rc;
}
/**
 * Suggest a node to map a restarting process onto.
 *
 * This module offers no mapping suggestions; every argument is ignored
 * and node_list is left unmodified.
 *
 * @param[in]     proc      Unused
 * @param[in]     oldnode   Unused
 * @param[in,out] node_list Unused
 *
 * @retval ORTE_ERR_NOT_IMPLEMENTED Always
 */
static int suggest_map_targets(orte_proc_t *proc,
                               orte_node_t *oldnode,
                               opal_list_t *node_list)
{
    /* The arguments are intentionally unused. */
    (void)proc;
    (void)oldnode;
    (void)node_list;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
/**
 * Handle fault tolerance updates.
 *
 * This module has nothing to do on FT events; the state is ignored.
 *
 * Declared static here as well so the definition visibly agrees with the
 * static forward declaration near the top of the file.
 *
 * @param[in] state Fault tolerance state update (unused)
 *
 * @retval ORTE_SUCCESS Always
 */
static int ft_event(int state)
{
    (void)state;

    return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/

Просмотреть файл

@ -1,14 +0,0 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the ORTE ErrMgr orcm component.
#

Просмотреть файл

@ -282,7 +282,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
}
/* Ask the ErrMgr components if they have a suggestion for this process */
orte_errmgr_base_suggest_map_targets(proc, proc->node, &node_list);
orte_errmgr.suggest_map_targets(proc, proc->node, &node_list);
nd = (orte_node_t*)opal_list_get_first(&node_list);
if( NULL == nd ) {