1
1

Simplification of the ErrMgr framework by removing the 'stack'/composite functionality.

The composite functionality was becoming difficult to maintain, so we removed it for now, which simplifies the framework design considerably.

Since the 'crmig' and 'autor' components were -very- similar to the 'hnp' component, this commit also merges them together. By moving the 'crmig' and 'autor' to a separate file under the 'hnp' component we are able to isolate the C/R logic to a large extent, thus being only minimally hooked into the previous 'hnp' component.

So other than some name changes, the functionality is all still in place. I will update the C/R documentation later this morning.

This commit was SVN r23628.
Этот коммит содержится в:
Josh Hursey 2010-08-19 13:09:20 +00:00
родитель 77792c937d
Коммит fabd5cc153
38 изменённых файлов: 1030 добавлений и 2429 удалений

Просмотреть файл

@ -68,12 +68,11 @@ btl_openib_cpc_include=oob
orte_forward_job_control=1
#
# Use the C/R Error Management and Recovery Service
# Activate the Process Migration and Automatic Recovery services in the
# HNP ErrMgr component.
#
orte_enable_recovery=1
orte_max_global_restarts=10
errmgr_crmig_enable=1
errmgr_autor_enable=1
errmgr_hnp_crmig_enable=1
errmgr_hnp_autor_enable=1
#
# Additional constraints to be lifted in the future

Просмотреть файл

@ -43,8 +43,7 @@ static int update_state(orte_jobid_t job,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
orte_exit_code_t exit_code);
/******************
* HNP module
@ -52,6 +51,8 @@ static int update_state(orte_jobid_t job,
orte_errmgr_base_module_t orte_errmgr_app_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
update_state,
NULL,
NULL,
@ -76,12 +77,8 @@ static int update_state(orte_jobid_t job,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state)
orte_exit_code_t exit_code)
{
/* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app: job %s reported state %s"
" for proc %s state %s exit_code %d",

Просмотреть файл

@ -1,38 +0,0 @@
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-orte-errmgr-autor.txt
sources = \
errmgr_autor.h \
errmgr_autor_component.c \
errmgr_autor_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_autor_DSO
component_noinst =
component_install = mca_errmgr_autor.la
else
component_noinst = libmca_errmgr_autor.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_autor_la_SOURCES = $(sources)
mca_errmgr_autor_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_autor_la_SOURCES = $(sources)
libmca_errmgr_autor_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,20 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_errmgr_autor_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_errmgr_autor_CONFIG],[
# If we don't want FT, don't compile this component
AS_IF([test "$opal_want_ft_cr" = "1"],
[$1],
[$2])
])dnl

Просмотреть файл

@ -1,14 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_INIT_FILE=errmgr_autor_component.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,88 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Automatic Recovery Errmgr component
*
*/
#ifndef MCA_ERRMGR_AUTOR_EXPORT_H
#define MCA_ERRMGR_AUTOR_EXPORT_H
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "opal/event/event.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/*
 * Component descriptor for the Automatic Recovery (AutoR) ErrMgr component.
 * It embeds the base ErrMgr component and adds the AutoR-specific settings
 * that errmgr_autor_open() reads from MCA parameters.
 */
struct orte_errmgr_autor_component_t {
orte_errmgr_base_component_t super; /** Base Errmgr component */
/* Value of the "enable" MCA parameter (Enable Automatic Recovery; default 0/off) */
bool autor_enabled;
/* Value of the "timing" MCA parameter (Enable Automatic Recovery timer; default 0) */
bool timing_enabled;
/* Value of the "recovery_delay" MCA parameter: seconds to wait before
 * starting to recover the job after a failure (default 1) */
int recovery_delay;
/* Value of the "skip_oldnode" MCA parameter: skip the old node of a failed
 * proc even if it is still available (default enabled) */
bool skip_oldnode;
};
typedef struct orte_errmgr_autor_component_t orte_errmgr_autor_component_t;
OPAL_MODULE_DECLSPEC extern orte_errmgr_autor_component_t mca_errmgr_autor_component;
int orte_errmgr_autor_component_query(mca_base_module_t **module, int *priority);
/*
* Module functions: Global
*/
int orte_errmgr_autor_global_module_init(void);
int orte_errmgr_autor_global_module_finalize(void);
int orte_errmgr_autor_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_autor_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_autor_global_ft_event(int state);
/*
* Module functions: Local (Daemon)
*/
int orte_errmgr_autor_local_module_init(void);
int orte_errmgr_autor_local_module_finalize(void);
int orte_errmgr_autor_local_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_autor_local_ft_event(int state);
END_C_DECLS
#endif /* MCA_ERRMGR_AUTOR_EXPORT_H */

Просмотреть файл

@ -1,161 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_autor.h"
/*
* Public string for version number
*/
const char *orte_errmgr_autor_component_version_string =
"ORTE ERRMGR AutoR MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_autor_open(void);
static int errmgr_autor_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
 * Public AutoR component instance.
 *
 * Only the embedded base component ("super") is initialized explicitly here.
 * The AutoR-specific fields that follow it in the struct have no initializer
 * and are assigned in errmgr_autor_open().
 */
orte_errmgr_autor_component_t mca_errmgr_autor_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"autor",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close, and query functions */
errmgr_autor_open,
errmgr_autor_close,
orte_errmgr_autor_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level (default for the "verbose" MCA parameter) */
0,
/* opal_output handler (-1 until errmgr_autor_open() assigns one) */
-1,
/* Default priority (default for the "priority" MCA parameter) */
20
}
};
/*
 * Component open hook for the AutoR ErrMgr component.
 *
 * Registers the component's MCA parameters (priority, verbose, timing,
 * enable, recovery_delay, skip_oldnode), selects the output stream, and
 * stores the resulting values in mca_errmgr_autor_component.
 *
 * Returns ORTE_SUCCESS unconditionally; the results of the individual
 * registration calls are not inspected.
 */
static int errmgr_autor_open(void)
{
    int param_value;
    const char *timing_label;
    const char *autor_label;

    /* Selection priority: the value already in the struct is the default. */
    mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
                           "priority",
                           "Priority of the ERRMGR autor component",
                           false, false,
                           mca_errmgr_autor_component.super.priority,
                           &mca_errmgr_autor_component.super.priority);

    /* Component-specific verbosity. */
    mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
                           "verbose",
                           "Verbose level for the ERRMGR autor component",
                           false, false,
                           mca_errmgr_autor_component.super.verbose,
                           &mca_errmgr_autor_component.super.verbose);

    /*
     * Without a custom verbosity, share the framework's output stream;
     * otherwise open a dedicated stream at the requested level.
     */
    if (0 == mca_errmgr_autor_component.super.verbose) {
        mca_errmgr_autor_component.super.output_handle = orte_errmgr_base.output;
    } else {
        mca_errmgr_autor_component.super.output_handle = opal_output_open(NULL);
        opal_output_set_verbosity(mca_errmgr_autor_component.super.output_handle,
                                  mca_errmgr_autor_component.super.verbose);
    }

    /* Timing support (off by default). */
    mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
                           "timing",
                           "Enable Automatic Recovery timer",
                           false, false,
                           0, &param_value);
    mca_errmgr_autor_component.timing_enabled = OPAL_INT_TO_BOOL(param_value);

    /* Master switch for automatic recovery (off by default). */
    mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
                           "enable",
                           "Enable Automatic Recovery (Default: 0/off)",
                           false, false,
                           0, &param_value);
    mca_errmgr_autor_component.autor_enabled = OPAL_INT_TO_BOOL(param_value);

    /* Delay, in seconds, between a failure and the start of recovery. */
    mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
                           "recovery_delay",
                           "Number of seconds to wait before starting to recover the job after a failure"
                           " [Default: 1 sec]",
                           false, false,
                           1, &param_value);
    mca_errmgr_autor_component.recovery_delay = param_value;

    /* Whether the failed process's previous node is skipped (on by default). */
    mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
                           "skip_oldnode",
                           "Skip the old node from failed proc, even if it is still available"
                           " [Default: Enabled]",
                           false, false,
                           1, &param_value);
    mca_errmgr_autor_component.skip_oldnode = OPAL_INT_TO_BOOL(param_value);

    /* Report the resulting configuration at debug verbosity. */
    timing_label = mca_errmgr_autor_component.timing_enabled ? "Enabled" : "Disabled";
    autor_label  = mca_errmgr_autor_component.autor_enabled ? "Enabled" : "Disabled";

    opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor: open()");
    opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor: open: priority = %d",
                        mca_errmgr_autor_component.super.priority);
    opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor: open: verbosity = %d",
                        mca_errmgr_autor_component.super.verbose);
    opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor: open: timing = %s",
                        timing_label);
    opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor: open: Auto. Recover = %s",
                        autor_label);
    opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor: open: recover_delay = %d",
                        mca_errmgr_autor_component.recovery_delay);

    return ORTE_SUCCESS;
}
/*
 * Component close hook for the AutoR ErrMgr component.
 * Emits a debug trace on the component's output stream and
 * returns ORTE_SUCCESS.
 */
static int errmgr_autor_close(void)
{
    opal_output_verbose(10,
                        mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor: close()");

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,28 +0,0 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE ErrMgr AutoR framework.
#
[recovering_job]
Notice: The processes listed below failed unexpectedly.
Using the last checkpoint to recover the job.
Please standby.
%s
[recovery_complete]
Notice: The job has been successfully recovered from the
last checkpoint.
[failed_to_recover_proc]
Error: The process below has failed. There is no checkpoint available for
this job, so we are terminating the application since automatic
recovery cannot occur.
Internal Name: %s
MCW Rank: %d

Просмотреть файл

@ -43,20 +43,16 @@ ORTE_DECLSPEC int orte_errmgr_base_open(void);
ORTE_DECLSPEC int orte_errmgr_base_select(void);
ORTE_DECLSPEC int orte_errmgr_base_close(void);
/**
* Composite Stack states
*/
#define ORTE_ERRMGR_STACK_STATE_NONE 0x00 /* No actions have been performed */
#define ORTE_ERRMGR_STACK_STATE_UPDATED 0x01 /* Updated the runtime */
#define ORTE_ERRMGR_STACK_STATE_CONTINUE 0x02 /* Continue running without this process */
#define ORTE_ERRMGR_STACK_STATE_RECOVERED 0x04 /* Process has been recovered */
#define ORTE_ERRMGR_STACK_STATE_JOB_ABORT 0x08 /* Abort this job, cannot recover */
#define ORTE_ERRMGR_STACK_STATE_COMPLETE 0x10 /* done processing this command */
/**
* Output and component variables
*/
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
/**
* Internal module reference
*/
ORTE_DECLSPEC extern orte_errmgr_base_component_t orte_errmgr_base_selected_component;
/**
* Interfaces for orte-migrate tool
*/
@ -100,7 +96,7 @@ ORTE_DECLSPEC int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t
ORTE_DECLSPEC int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global_handle, int seq_num);
ORTE_DECLSPEC int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_t *datum);
#endif
#endif /* OPAL_ENABLE_FT_CR */
/*
* Additional External API function declared in errmgr.h

Просмотреть файл

@ -32,30 +32,25 @@
int orte_errmgr_base_close(void)
{
orte_errmgr_base_module_t *module = NULL;
int i;
OPAL_TRACE(5);
/* Close all selected components */
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == module ) {
continue;
}
if( NULL != module->finalize ) {
module->finalize();
}
/* if not initialized, then skip this action. */
if( !orte_errmgr_base.initialized ) {
return ORTE_SUCCESS;
}
/* Close selected component */
if( NULL != orte_errmgr.finalize ) {
orte_errmgr.finalize();
}
/* Close all remaining available components (may be one if this is a
OMPI RTE program, or [possibly] multiple if this is ompi_info) */
* OMPI RTE program, or [possibly] multiple if this is ompi_info)
*/
mca_base_components_close(orte_errmgr_base.output,
&orte_errmgr_base_components_available,
NULL);
OBJ_DESTRUCT(&orte_errmgr_base.modules);
orte_errmgr_base.initialized = false;
return ORTE_SUCCESS;

Просмотреть файл

@ -189,47 +189,6 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
ORTE_ERROR_NAME(error_code), filename, line);
}
/*
 * Forward a job/process state update to the registered ErrMgr modules.
 *
 * Modules are visited in the order they appear in orte_errmgr_base.modules.
 * Empty slots and modules without an update_state entry are skipped.
 * The walk stops as soon as a module returns something other than
 * ORTE_SUCCESS or sets ORTE_ERRMGR_STACK_STATE_COMPLETE in the shared
 * stack state.
 *
 * Returns the result of the last module called, or ORTE_SUCCESS when no
 * module was called.
 */
int orte_errmgr_base_update_state(orte_jobid_t job,
                                  orte_job_state_t jobstate,
                                  orte_process_name_t *name,
                                  orte_proc_state_t state,
                                  pid_t pid,
                                  orte_exit_code_t exit_code)
{
    orte_errmgr_base_module_t *module;
    orte_errmgr_stack_state_t stack_state;
    int idx;
    int rc = ORTE_SUCCESS;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "errmgr:base:update_state() %s) "
                         "------- %s state updated for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL != name && name->jobid == ORTE_PROC_MY_HNP->jobid) ? "Daemon" : "App. Process",
                         (NULL == name) ? "NULL" : ORTE_NAME_PRINT(name)));

    /* Every walk starts out marked as "abort the job". */
    stack_state = ORTE_ERRMGR_STACK_STATE_NONE | ORTE_ERRMGR_STACK_STATE_JOB_ABORT;

    /********************************
     * Call the active modules
     ********************************/
    for (idx = 0; idx < orte_errmgr_base.modules.size; ++idx) {
        module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, idx);

        if (NULL != module && NULL != module->update_state) {
            rc = module->update_state(job, jobstate, name, state, pid, exit_code, &stack_state);
            if (ORTE_SUCCESS != rc) {
                break;
            }
            if (stack_state & ORTE_ERRMGR_STACK_STATE_COMPLETE) {
                break;
            }
        }
    }

    return rc;
}
int orte_errmgr_base_abort(int error_code, char *fmt, ...)
{
va_list arglist;
@ -265,90 +224,6 @@ int orte_errmgr_base_abort(int error_code, char *fmt, ...)
return ORTE_SUCCESS;
}
/*
 * Notify the registered ErrMgr modules of a predicted fault.
 *
 * Modules are visited in array order; empty slots and modules without a
 * predicted_fault entry are skipped.  The walk ends early when a module
 * returns something other than ORTE_SUCCESS or marks the stack state
 * ORTE_ERRMGR_STACK_STATE_COMPLETE.
 *
 * Always returns ORTE_SUCCESS: a module's error code stops the walk but is
 * not passed back to the caller.
 */
int orte_errmgr_base_predicted_fault(opal_list_t *proc_list,
                                     opal_list_t *node_list,
                                     opal_list_t *suggested_map)
{
    orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
    orte_errmgr_base_module_t *module = NULL;
    int idx;
    int rc;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "errmgr:base:predicted_fault() %s) "
                         "------- Notifying components... (%3d active components)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_errmgr_base.modules.size));

    for (idx = 0; idx < orte_errmgr_base.modules.size; ++idx) {
        module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, idx);

        if (NULL != module && NULL != module->predicted_fault) {
            rc = module->predicted_fault(proc_list, node_list, suggested_map, &stack_state);
            if (ORTE_SUCCESS != rc) {
                break;
            }
            if (stack_state & ORTE_ERRMGR_STACK_STATE_COMPLETE) {
                break;
            }
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Ask the registered ErrMgr modules to suggest mapping targets for a process.
 *
 * Modules are visited in array order; empty slots and modules without a
 * suggest_map_targets entry are skipped.  The walk ends early when a module
 * returns something other than ORTE_SUCCESS or marks the stack state
 * ORTE_ERRMGR_STACK_STATE_COMPLETE.
 *
 * Always returns ORTE_SUCCESS: a module's error code stops the walk but is
 * not passed back to the caller.
 */
int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
                                         orte_node_t *oldnode,
                                         opal_list_t *node_list)
{
    orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
    orte_errmgr_base_module_t *module = NULL;
    int idx;
    int rc;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "errmgr:base:suggest_map_targets() %s) "
                         "------- Notifying components... (%3d active components)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_errmgr_base.modules.size));

    for (idx = 0; idx < orte_errmgr_base.modules.size; ++idx) {
        module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, idx);

        if (NULL != module && NULL != module->suggest_map_targets) {
            rc = module->suggest_map_targets(proc, oldnode, node_list, &stack_state);
            if (ORTE_SUCCESS != rc) {
                break;
            }
            if (stack_state & ORTE_ERRMGR_STACK_STATE_COMPLETE) {
                break;
            }
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Deliver a fault-tolerance event to every registered ErrMgr module.
 *
 * Each non-empty slot in orte_errmgr_base.modules that provides an ft_event
 * entry is called with the given state.  The modules' return values are
 * discarded and the walk never stops early.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_errmgr_base_ft_event(int state)
{
    orte_errmgr_base_module_t *module = NULL;
    int idx;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "errmgr:base:ft_event() %s) "
                         "------- Notifying components... (%3d active components)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_errmgr_base.modules.size));

    for (idx = 0; idx < orte_errmgr_base.modules.size; ++idx) {
        module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, idx);

        if (NULL != module && NULL != module->ft_event) {
            module->ft_event(state);
        }
    }

    return ORTE_SUCCESS;
}
/********************
* Utility functions
********************/
@ -619,9 +494,9 @@ int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global_handle, int s
orte_snapc_base_has_recovered = false;
loc_proc.jobid = jobid;
loc_proc.vpid = 0;
orte_errmgr_base_update_state(jobid, ORTE_JOB_STATE_RESTART,
&loc_proc, ORTE_PROC_STATE_KILLED_BY_CMD,
0, 0);
orte_errmgr.update_state(jobid, ORTE_JOB_STATE_RESTART,
&loc_proc, ORTE_PROC_STATE_KILLED_BY_CMD,
0, 0);
while( !orte_snapc_base_has_recovered ) {
opal_progress();
}
@ -678,7 +553,7 @@ int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_
opal_list_append(suggested_map_list, &(onto_map->super));
}
if( ORTE_SUCCESS != (ret = orte_errmgr_base_predicted_fault(proc_list, node_list, suggested_map_list)) ) {
if( ORTE_SUCCESS != (ret = orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;

Просмотреть файл

@ -52,13 +52,18 @@ opal_list_t orte_errmgr_base_components_available;
orte_errmgr_base_t orte_errmgr_base;
orte_errmgr_base_component_t orte_errmgr_base_selected_component;
/* Public module provides a wrapper around previous functions */
orte_errmgr_API_t orte_errmgr = {
orte_errmgr_base_module_t orte_errmgr = {
NULL, /* init */
NULL, /* finalize */
orte_errmgr_base_log,
orte_errmgr_base_update_state,
orte_errmgr_base_predicted_fault,
orte_errmgr_base_suggest_map_targets,
orte_errmgr_base_abort
orte_errmgr_base_abort,
NULL, /* update_state */
NULL, /* predicted_fault */
NULL, /* suggest_map_targets */
NULL /* ft_event */
};
/**
@ -74,9 +79,6 @@ int orte_errmgr_base_open(void)
return ORTE_SUCCESS;
}
OBJ_CONSTRUCT(&orte_errmgr_base.modules, opal_pointer_array_t);
opal_pointer_array_init(&orte_errmgr_base.modules, 3, INT_MAX, 1);
orte_errmgr_base.output = opal_output_open(NULL);
/*

Просмотреть файл

@ -33,145 +33,36 @@
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/*
 * Scratch record holding the result of querying one component during
 * orte_errmgr_base_select(); one is allocated per component that returns
 * a module.
 */
struct orte_errmgr_base_select_module_t {
/* Component that was queried */
mca_base_component_t *component;
/* Module the component's query function returned */
mca_base_module_t *module;
/* Priority the component's query function reported */
int priority;
};
typedef struct orte_errmgr_base_select_module_t orte_errmgr_base_select_module_t;
int orte_errmgr_base_select(void)
{
int exit_status = OPAL_SUCCESS;
mca_base_component_list_item_t *cli = NULL;
mca_base_component_t *component = NULL;
mca_base_module_t *module = NULL;
opal_list_item_t *item = NULL;
int priority = 0, i, j, low_i;
orte_errmgr_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
opal_pointer_array_t tmp_array;
orte_errmgr_base_module_t *i_module = NULL;
bool none_found;
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
opal_pointer_array_init(&tmp_array, 3, INT_MAX, 1);
opal_output_verbose(10, orte_errmgr_base.output,
"errmgr:base:select: Auto-selecting components");
orte_errmgr_base_component_t *best_component = NULL;
orte_errmgr_base_module_t *best_module = NULL;
/*
* Traverse the list of available components.
* For each call their 'query' functions to determine relative priority.
* Select the best component
*/
none_found = true;
for (item = opal_list_get_first(&orte_errmgr_base_components_available);
item != opal_list_get_end(&orte_errmgr_base_components_available);
item = opal_list_get_next(item) ) {
cli = (mca_base_component_list_item_t *) item;
component = (mca_base_component_t *) cli->cli_component;
/*
* If there is a query function then use it.
*/
if (NULL == component->mca_query_component) {
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Skipping component [%s]. It does not implement a query function",
component->mca_component_name );
continue;
}
/*
* Query this component for the module and priority
*/
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Querying component [%s]",
component->mca_component_name);
component->mca_query_component(&module, &priority);
/*
* If no module was returned or negative priority, then skip component
*/
if (NULL == module || priority < 0) {
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Skipping component [%s]. Query failed to return a module",
component->mca_component_name );
continue;
}
/*
* Append them to the temporary list, we will sort later
*/
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Query of component [%s] set priority to %d",
component->mca_component_name, priority);
tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t));
tmp_module->component = component;
tmp_module->module = module;
tmp_module->priority = priority;
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
none_found = false;
if( OPAL_SUCCESS != mca_base_select("errmgr", orte_errmgr_base.output,
&orte_errmgr_base_components_available,
(mca_base_module_t **) &best_module,
(mca_base_component_t **) &best_component) ) {
/* This will only happen if no component was selected */
exit_status = ORTE_ERROR;
goto cleanup;
}
if (none_found) {
/* must have at least one module */
return ORTE_ERR_MODULE_NOT_FOUND;
}
/*
* Sort the list by decending priority
*/
priority = 0;
for(j = 0; j < tmp_array.size; ++j) {
tmp_module_sw = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j);
if( NULL == tmp_module_sw ) {
continue;
}
/* Save the winner */
orte_errmgr_base_selected_component = *best_component;
orte_errmgr = *best_module;
low_i = -1;
priority = tmp_module_sw->priority;
for(i = 0; i < tmp_array.size; ++i) {
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i);
if( NULL == tmp_module ) {
continue;
}
if( tmp_module->priority > priority ) {
low_i = i;
priority = tmp_module->priority;
}
}
if( low_i >= 0 ) {
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
j--; /* Try this entry again, if it is not the lowest */
} else {
tmp_module = tmp_module_sw;
opal_pointer_array_set_item(&tmp_array, j, NULL);
}
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Add module with priority [%s] %d",
tmp_module->component->mca_component_name, tmp_module->priority);
opal_pointer_array_add(&orte_errmgr_base.modules, (void*)(tmp_module->module));
free(tmp_module);
}
OBJ_DESTRUCT(&tmp_array);
/*
* Initialize each of the Errmgr Modules
*/
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == i_module ) {
continue;
}
if( NULL != i_module->init ) {
i_module->init();
/* Initialize the winner */
if (NULL != best_module) {
if (OPAL_SUCCESS != orte_errmgr.init()) {
exit_status = OPAL_ERROR;
goto cleanup;
}
}
cleanup:
return exit_status;
}

Просмотреть файл

@ -455,7 +455,7 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
/*
* Pass to the predicted fault function to see how they would like to progress
*/
orte_errmgr_base_predicted_fault(proc_list, node_list, suggested_map_list);
orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list);
}
/*
* Unknown command

Просмотреть файл

@ -42,7 +42,6 @@ BEGIN_C_DECLS
/* define a struct to hold framework-global values */
typedef struct {
int output;
opal_pointer_array_t modules;
bool initialized;
} orte_errmgr_base_t;
@ -61,29 +60,11 @@ typedef uint8_t orte_errmgr_cmd_flag_t;
*/
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
ORTE_DECLSPEC int orte_errmgr_base_abort(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
/*
* Additional External API function declared in errmgr.h
*/
END_C_DECLS
#endif

Просмотреть файл

@ -1,38 +0,0 @@
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-orte-errmgr-crmig.txt
sources = \
errmgr_crmig.h \
errmgr_crmig_component.c \
errmgr_crmig_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_crmig_DSO
component_noinst =
component_install = mca_errmgr_crmig.la
else
component_noinst = libmca_errmgr_crmig.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_crmig_la_SOURCES = $(sources)
mca_errmgr_crmig_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_crmig_la_SOURCES = $(sources)
libmca_errmgr_crmig_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,20 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_errmgr_crmig_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_errmgr_crmig_CONFIG],[
# If we don't want FT, don't compile this component
AS_IF([test "$opal_want_ft_cr" = "1"],
[$1],
[$2])
])dnl

Просмотреть файл

@ -1,14 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_INIT_FILE=errmgr_crmig_component.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,93 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Checkpoint/Restart Process Migration (CRMIG) ErrMgr component
*
* Simple, braindead implementation.
*/
#ifndef MCA_ERRMGR_CRMIG_EXPORT_H
#define MCA_ERRMGR_CRMIG_EXPORT_H
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "opal/event/event.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/*
 * Component descriptor for the C/R Process Migration (CRMIG) ErrMgr
 * component.  It embeds the base ErrMgr component and adds the
 * CRMIG-specific settings that errmgr_crmig_open() reads from MCA parameters.
 */
struct orte_errmgr_crmig_component_t {
orte_errmgr_base_component_t super; /** Base Errmgr component */
/* Value of the "enable" MCA parameter (Enable Process Migration; default 0/off) */
bool crmig_enabled;
/* Value of the "timing" MCA parameter (Enable Process Migration timer; default 0) */
bool timing_enabled;
};
typedef struct orte_errmgr_crmig_component_t orte_errmgr_crmig_component_t;
OPAL_MODULE_DECLSPEC extern orte_errmgr_crmig_component_t mca_errmgr_crmig_component;
int orte_errmgr_crmig_component_query(mca_base_module_t **module, int *priority);
/*
* Module functions: Global
*/
int orte_errmgr_crmig_global_module_init(void);
int orte_errmgr_crmig_global_module_finalize(void);
int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_crmig_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_crmig_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_crmig_global_ft_event(int state);
/*
* Module functions: Local
*/
int orte_errmgr_crmig_local_module_init(void);
int orte_errmgr_crmig_local_module_finalize(void);
int orte_errmgr_crmig_local_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
int orte_errmgr_crmig_local_ft_event(int state);
END_C_DECLS
#endif /* MCA_ERRMGR_CRMIG_EXPORT_H */

Просмотреть файл

@ -1,142 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_crmig.h"
/*
* Public string for version number
*/
const char *orte_errmgr_crmig_component_version_string =
"ORTE ERRMGR crmig MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_crmig_open(void);
static int errmgr_crmig_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
 * Public CRMIG component instance.
 *
 * Only the embedded base component ("super") is initialized explicitly here.
 * The CRMIG-specific fields that follow it in the struct have no initializer
 * and are assigned in errmgr_crmig_open().
 */
orte_errmgr_crmig_component_t mca_errmgr_crmig_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"crmig",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close, and query functions */
errmgr_crmig_open,
errmgr_crmig_close,
orte_errmgr_crmig_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level (default for the "verbose" MCA parameter) */
0,
/* opal_output handler (-1 until errmgr_crmig_open() assigns one) */
-1,
/* Default priority (default for the "priority" MCA parameter) */
40
}
};
/*
 * Component open hook for the crmig ErrMgr component.
 *
 * Registers the "priority", "verbose", "timing" and "enable" MCA parameters,
 * selects the output stream used for diagnostics, and stores the timing and
 * migration switches in the component structure.
 *
 * Always returns ORTE_SUCCESS.
 */
static int errmgr_crmig_open(void)
{
    int param_value;
    int out_id;

    /* The compiled-in priority is both the default and the storage location */
    mca_base_param_reg_int(&mca_errmgr_crmig_component.super.base_version,
                           "priority",
                           "Priority of the ERRMGR crmig component",
                           false, false,
                           mca_errmgr_crmig_component.super.priority,
                           &mca_errmgr_crmig_component.super.priority);

    /* Same pattern for the verbosity level */
    mca_base_param_reg_int(&mca_errmgr_crmig_component.super.base_version,
                           "verbose",
                           "Verbose level for the ERRMGR crmig component",
                           false, false,
                           mca_errmgr_crmig_component.super.verbose,
                           &mca_errmgr_crmig_component.super.verbose);

    /*
     * Without a component-specific verbosity, share the framework's output
     * stream; otherwise open a private stream at the requested level.
     */
    if (0 == mca_errmgr_crmig_component.super.verbose) {
        mca_errmgr_crmig_component.super.output_handle = orte_errmgr_base.output;
    } else {
        mca_errmgr_crmig_component.super.output_handle = opal_output_open(NULL);
        opal_output_set_verbosity(mca_errmgr_crmig_component.super.output_handle,
                                  mca_errmgr_crmig_component.super.verbose);
    }

    /* Migration timer switch */
    mca_base_param_reg_int(&mca_errmgr_crmig_component.super.base_version,
                           "timing",
                           "Enable Process Migration timer",
                           false, false,
                           0, &param_value);
    mca_errmgr_crmig_component.timing_enabled = OPAL_INT_TO_BOOL(param_value);

    /* Master switch for process migration */
    mca_base_param_reg_int(&mca_errmgr_crmig_component.super.base_version,
                           "enable",
                           "Enable Process Migration (Default: 0/off)",
                           false, false,
                           0, &param_value);
    mca_errmgr_crmig_component.crmig_enabled = OPAL_INT_TO_BOOL(param_value);

    /* Report the resulting configuration on the selected stream */
    out_id = mca_errmgr_crmig_component.super.output_handle;
    opal_output_verbose(10, out_id,
                        "errmgr:crmig: open()");
    opal_output_verbose(20, out_id,
                        "errmgr:crmig: open: priority = %d",
                        mca_errmgr_crmig_component.super.priority);
    opal_output_verbose(20, out_id,
                        "errmgr:crmig: open: verbosity = %d",
                        mca_errmgr_crmig_component.super.verbose);
    opal_output_verbose(20, out_id,
                        "errmgr:crmig: open: Proc. Mig. = %s",
                        (mca_errmgr_crmig_component.crmig_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, out_id,
                        "errmgr:crmig: open: timing = %s",
                        (mca_errmgr_crmig_component.timing_enabled ? "Enabled" : "Disabled"));

    return ORTE_SUCCESS;
}
/*
 * Component close hook for the crmig ErrMgr component.
 *
 * Emits a level-10 trace message on the component's output stream and
 * returns ORTE_SUCCESS.
 */
static int errmgr_crmig_close(void)
{
    const int out_id = mca_errmgr_crmig_component.super.output_handle;

    opal_output_verbose(10, out_id, "errmgr:crmig: close()");
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,27 +0,0 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE ErrMgr CRMig framework.
#
[migrating_job]
Notice: A migration of this job has been requested.
The processes below will be migrated.
Please stand by.
%s
[migrated_job]
Notice: The processes have been successfully migrated to/from the specified
machines.
[no_migrating_procs]
Warning: Could not find any processes to migrate on the nodes specified.
You provided the following:
Nodes: %s
Procs: %s

Просмотреть файл

@ -20,36 +20,16 @@
*
* The Open RTE Error and Recovery Manager (ErrMgr)
*
* This framework is a composite framework in which multiple components
* are often active at the same time and may work on a single external call
* to the interface functions.
* This framework is the logically central clearing house for process/daemon
* state updates. In particular when a process fails and another process detects
* it, then that information is reported through this framework. This framework
* then (depending on the active component) decides how to handle the failure.
*
* This framework allows the user to compose a job recovery policy from multiple
* individual components. Each component will operate on the function call if it
* has a registered function. If no component registers a function then the base
* functionality/policy is used.
*
* For example, consider the 3 components on the left (C1, C2, C3), and the
* API function calls across the top:
* | Priority | Fn1 | Fn2 | Fn3 | Fn4 |
* -----+----------+------+------+------+------+
* base | --- | act0 | --- | --- | act6 |
* C1 | 10 | act1 | --- | act2 | --- |
* C2 | 20 | --- | act3 | --- | --- |
* C3 | 30 | act4 | act5 | --- | --- |
* -----+----------+------+------+------+------+
* A call to Fn1 will result in:
* act4, act1
* A call to Fn2 will result in:
* act5, act3
* A call to Fn3 will result in:
* act2
* A call to Fn4 will result in:
* act6
*
* Notice that when the base function is overridden it is not called. The base
* function is only called when the function has not been overridden by a
* component.
* For example, if a process fails this may activate an automatic recovery
* of the process from a previous checkpoint, or initial state. Conversely,
* the active component could decide not to continue the job, and request that
* it be terminated. The error and recovery policy is determined by individual
* components within this framework.
*
*/
@ -76,8 +56,6 @@
#include "orte/mca/plm/plm_types.h"
BEGIN_C_DECLS
/* type definition */
typedef uint8_t orte_errmgr_stack_state_t;
/*
* Structure to describe a predicted process fault.
@ -159,12 +137,43 @@ OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t);
OPAL_SOS_LOG(n); \
}
/**** FRAMEWORK API FUNCTIONS ****/
/*
* Framework Interfaces
*/
/**
* Module initialization function.
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_module_init_fn_t)(void);
/**
* Module finalization function.
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);
/**
* This is not part of any module so it can be used at any time!
*/
typedef void (*orte_errmgr_base_API_log_fn_t)(int error_code, char *filename, int line);
typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);
/**
* Alert - self aborting
* This function is called when a process is aborting due to some internal error.
* It will finalize the process
* itself, and then exit - it takes no other actions. The intent here is to provide
* a last-ditch exit procedure that attempts to clean up a little.
*/
typedef int (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
/**
* Alert - process aborted
@ -180,16 +189,15 @@ typedef void (*orte_errmgr_base_API_log_fn_t)(int error_code, char *filename, in
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
typedef int (*orte_errmgr_base_API_update_state_fn_t)(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
typedef int (*orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
/**
* Predicted process/node failure notification
* Composite interface. Called in priority order.
*
* @param[in] proc_list List of processes (or NULL if none)
* @param[in] node_list List of nodes (or NULL if none)
@ -198,9 +206,9 @@ typedef int (*orte_errmgr_base_API_update_state_fn_t)(orte_jobid_t job,
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_API_predicted_fault_fn_t)(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
/**
* Suggest a node to map a restarting process onto
@ -212,79 +220,9 @@ typedef int (*orte_errmgr_base_API_predicted_fault_fn_t)(opal_list_t *proc_list,
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_API_suggest_map_targets_fn_t)(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
/**
* Alert - self aborting
* This function is called when a process is aborting due to some internal error.
* It will finalize the process
* itself, and then exit - it takes no other actions. The intent here is to provide
* a last-ditch exit procedure that attempts to clean up a little.
*/
typedef int (*orte_errmgr_base_API_abort_fn_t)(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
/* global structure for accessing ERRMGR FRAMEWORK API's */
typedef struct {
orte_errmgr_base_API_log_fn_t log;
orte_errmgr_base_API_update_state_fn_t update_state;
orte_errmgr_base_API_predicted_fault_fn_t predicted_fault;
orte_errmgr_base_API_suggest_map_targets_fn_t suggest_map_targets;
orte_errmgr_base_API_abort_fn_t abort;
} orte_errmgr_API_t;
ORTE_DECLSPEC extern orte_errmgr_API_t orte_errmgr;
/**** INTERNAL MODULE FUNCTIONS ****/
/**
* Module initialization function.
* Public interface. Will be called in each of the active composite components
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_module_init_fn_t)
(void);
/**
* Module finalization function.
* Public interface. Will be called in each of the active composite components
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_module_finalize_fn_t)
(void);
/*
* Internal Composite Interfaces corresponding to API interfaces
*/
typedef int (*orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map,
orte_errmgr_stack_state_t *stack_state);
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state);
opal_list_t *node_list);
/**
* Handle fault tolerance updates
@ -294,8 +232,7 @@ typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *pro
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_errmgr_base_ft_event_fn_t)(int state);
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
/*
* Module Structure
@ -306,7 +243,9 @@ struct orte_errmgr_base_module_2_3_0_t {
/** Finalization Function */
orte_errmgr_base_module_finalize_fn_t finalize;
/* -------------- Internal Composite Interfaces -- */
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_abort_fn_t abort;
/** Actual process failure notification */
orte_errmgr_base_module_update_state_fn_t update_state;
/** Predicted process/node failure notification */
@ -315,11 +254,11 @@ struct orte_errmgr_base_module_2_3_0_t {
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
/** Handle any FT Notifications */
orte_errmgr_base_ft_event_fn_t ft_event;
orte_errmgr_base_module_ft_event_fn_t ft_event;
};
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
/*
* ErrMgr Component
@ -340,7 +279,6 @@ struct orte_errmgr_base_component_3_0_0_t {
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
/*
* Macro for use in components that are of type errmgr
*/

Просмотреть файл

Просмотреть файл

@ -1,38 +0,0 @@
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shipped alongside the component.
dist_pkgdata_DATA = help-orte-errmgr-example.txt
# Source files shared by the DSO and static variants of the component.
sources = \
errmgr_example.h \
errmgr_example_component.c \
errmgr_example_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the OMPI_BUILD_errmgr_example_DSO conditional.
if OMPI_BUILD_errmgr_example_DSO
component_noinst =
component_install = mca_errmgr_example.la
else
component_noinst = libmca_errmgr_example.la
component_install =
endif
# DSO variant: placed in $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_example_la_SOURCES = $(sources)
mca_errmgr_example_la_LDFLAGS = -module -avoid-version
# Static variant: built as a noinst libtool library.
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_example_la_SOURCES = $(sources)
libmca_errmgr_example_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,20 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_errmgr_example_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_errmgr_example_CONFIG],[
# If we don't want FT, don't compile this component
AS_IF([test "$opal_want_ft_cr" = "1"],
[$1],
[$2])
])dnl

Просмотреть файл

@ -1,14 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University.
# All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_INIT_FILE=errmgr_example_component.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,74 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Example Errmgr component
*
*/
#ifndef MCA_ERRMGR_EXAMPLE_EXPORT_H
#define MCA_ERRMGR_EXAMPLE_EXPORT_H
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "opal/event/event.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/*
 * Component descriptor for the example ErrMgr component.
 * It carries no state beyond the embedded base ErrMgr component.
 */
struct orte_errmgr_example_component_t {
orte_errmgr_base_component_t super; /** Base Errmgr component */
};
typedef struct orte_errmgr_example_component_t orte_errmgr_example_component_t;
OPAL_MODULE_DECLSPEC extern orte_errmgr_example_component_t mca_errmgr_example_component;
int orte_errmgr_example_component_query(mca_base_module_t **module, int *priority);
/*
* Module functions: Global
*/
/* Module setup hook; returns an int status code. */
int orte_errmgr_example_global_module_init(void);
/* Module teardown hook; returns an int status code. */
int orte_errmgr_example_global_module_finalize(void);
/*
 * State-update hook.
 * NOTE(review): this prototype has no `pid_t pid` argument, while
 * orte_errmgr_base_module_update_state_fn_t in errmgr.h takes one between
 * `state` and `exit_code` -- confirm the signature matches the module table.
 */
int orte_errmgr_example_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
/* Predicted process/node failure hook (lists may be NULL per errmgr.h). */
int orte_errmgr_example_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map,
orte_errmgr_stack_state_t *stack_state);
/* Process-fault hook; not referenced by the module function table. */
int orte_errmgr_example_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
/* Hook for suggesting nodes onto which a restarting process may be mapped. */
int orte_errmgr_example_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state);
/* Fault-tolerance event hook; `state` is the FT event state. */
int orte_errmgr_example_global_ft_event(int state);
END_C_DECLS
#endif /* MCA_ERRMGR_EXAMPLE_EXPORT_H */

Просмотреть файл

@ -1,120 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_example.h"
/*
* Public string for version number
*/
const char *orte_errmgr_example_component_version_string =
"ORTE ERRMGR Example MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_example_open(void);
static int errmgr_example_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
 * Public component descriptor for the example ErrMgr component.
 * The structure consists solely of the embedded base component (`super`).
 */
orte_errmgr_example_component_t mca_errmgr_example_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"example",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
errmgr_example_open,
errmgr_example_close,
orte_errmgr_example_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level (default for the "verbose" MCA parameter) */
0,
/* opal_output handler (-1 until errmgr_example_open() picks a stream) */
-1,
/* Default priority (default for the "priority" MCA parameter) */
0
}
};
/*
 * Component open hook for the example ErrMgr component.
 *
 * Registers the "priority" and "verbose" MCA parameters, selects the output
 * stream used for diagnostics, and traces the resulting settings.
 *
 * Always returns ORTE_SUCCESS.
 */
static int errmgr_example_open(void)
{
    int out_id;

    /*
     * The compiled-in priority (0) is the default, so this component is
     * selected last unless the parameter is overridden.
     */
    mca_base_param_reg_int(&mca_errmgr_example_component.super.base_version,
                           "priority",
                           "Priority of the ERRMGR example component",
                           false, false,
                           mca_errmgr_example_component.super.priority,
                           &mca_errmgr_example_component.super.priority);

    mca_base_param_reg_int(&mca_errmgr_example_component.super.base_version,
                           "verbose",
                           "Verbose level for the ERRMGR example component",
                           false, false,
                           mca_errmgr_example_component.super.verbose,
                           &mca_errmgr_example_component.super.verbose);

    /*
     * Without a component-specific verbosity, share the framework's output
     * stream; otherwise open a private stream at the requested level.
     */
    if (0 == mca_errmgr_example_component.super.verbose) {
        mca_errmgr_example_component.super.output_handle = orte_errmgr_base.output;
    } else {
        mca_errmgr_example_component.super.output_handle = opal_output_open(NULL);
        opal_output_set_verbosity(mca_errmgr_example_component.super.output_handle,
                                  mca_errmgr_example_component.super.verbose);
    }

    /* Report the resulting configuration on the selected stream */
    out_id = mca_errmgr_example_component.super.output_handle;
    opal_output_verbose(10, out_id,
                        "errmgr:example: open()");
    opal_output_verbose(20, out_id,
                        "errmgr:example: open: priority = %d",
                        mca_errmgr_example_component.super.priority);
    opal_output_verbose(20, out_id,
                        "errmgr:example: open: verbosity = %d",
                        mca_errmgr_example_component.super.verbose);

    return ORTE_SUCCESS;
}
/*
 * Component close hook for the example ErrMgr component.
 *
 * Emits a level-10 trace message on the component's output stream and
 * returns ORTE_SUCCESS.
 */
static int errmgr_example_close(void)
{
    const int out_id = mca_errmgr_example_component.super.output_handle;

    opal_output_verbose(10, out_id, "errmgr:example: close()");
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,187 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/argv.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
#include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/sstore.h"
#include "orte/mca/sstore/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_example.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Example module
******************/
/*
 * Function table handed back by orte_errmgr_example_component_query()
 * when running in the HNP. Entries are positional.
 */
static orte_errmgr_base_module_t global_module = {
/** Initialization Function */
orte_errmgr_example_global_module_init,
/** Finalization Function */
orte_errmgr_example_global_module_finalize,
/** Update State */
orte_errmgr_example_global_update_state,
/** Predicted Fault */
orte_errmgr_example_global_predicted_fault,
/* process_fault is intentionally left out of the table: */
/*orte_errmgr_example_global_process_fault,*/
/** Suggest proc to node mapping */
orte_errmgr_example_global_suggest_map_targets,
/** FT Event hook */
orte_errmgr_example_global_ft_event
};
/************************************
* Locally Global vars & functions
************************************/
/************************
* Function Definitions
************************/
/*
* MCA Functions
*/
/*
 * MCA query hook for the example ErrMgr component.
 *
 * When recovery is disabled (orte_enable_recovery is false) the component
 * opts out: *priority is set to -1 and *module to NULL. Otherwise *priority
 * is the component's configured priority, and *module is the global module
 * table in the HNP and NULL in every other process type.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_errmgr_example_component_query(mca_base_module_t **module, int *priority)
{
    const int out_id = mca_errmgr_example_component.super.output_handle;

    if (!orte_enable_recovery) {
        opal_output_verbose(10, out_id,
                            "errmgr:example:component_query() - Disabled: Recovery is not enabled");
        *priority = -1;
        *module = NULL;
        return ORTE_SUCCESS;
    }

    opal_output_verbose(10, out_id,
                        "errmgr:example:component_query()");

    *priority = mca_errmgr_example_component.super.priority;
    /* Only the HNP gets a module from this component */
    *module = ORTE_PROC_IS_HNP ? (mca_base_module_t *)&global_module : NULL;

    return ORTE_SUCCESS;
}
/************************
* Function Definitions
************************/
/*
 * Module initialization: traces the call at verbosity level 10 and
 * returns ORTE_SUCCESS. No state is set up.
 */
int orte_errmgr_example_global_module_init(void)
{
    const int out_id = mca_errmgr_example_component.super.output_handle;

    opal_output_verbose(10, out_id, "errmgr:example:init()");
    return ORTE_SUCCESS;
}
/*
 * Module finalization: traces the call at verbosity level 10 and
 * returns ORTE_SUCCESS. Nothing is torn down.
 */
int orte_errmgr_example_global_module_finalize(void)
{
    const int out_id = mca_errmgr_example_component.super.output_handle;

    opal_output_verbose(10, out_id, "errmgr:example:finalize()");
    return ORTE_SUCCESS;
}
/*
 * Predicted-fault hook: traces the call at verbosity level 10 and returns
 * ORTE_SUCCESS. None of the arguments are inspected or modified.
 */
int orte_errmgr_example_global_predicted_fault(opal_list_t *proc_list,
                                               opal_list_t *node_list,
                                               opal_list_t *suggested_map,
                                               orte_errmgr_stack_state_t *stack_state)
{
    const int out_id = mca_errmgr_example_component.super.output_handle;

    opal_output_verbose(10, out_id, "errmgr:example:predicted_fault()");
    return ORTE_SUCCESS;
}
/*
 * State-update hook: traces the reporting process name at verbosity level 10
 * and returns ORTE_SUCCESS. No argument other than proc_name is used, and
 * stack_state is left untouched.
 *
 * NOTE(review): there is no pid_t argument here, unlike the
 * orte_errmgr_base_module_update_state_fn_t typedef -- confirm.
 */
int orte_errmgr_example_global_update_state(orte_jobid_t job,
                                            orte_job_state_t jobstate,
                                            orte_process_name_t *proc_name,
                                            orte_proc_state_t state,
                                            orte_exit_code_t exit_code,
                                            orte_errmgr_stack_state_t *stack_state)
{
    const int out_id = mca_errmgr_example_component.super.output_handle;

    opal_output_verbose(10, out_id,
                        "errmgr:example:update_state(%s)",
                        ORTE_NAME_PRINT(proc_name));
    return ORTE_SUCCESS;
}
/*
 * Process-fault hook: traces the faulting process name at verbosity level 10
 * and returns ORTE_SUCCESS. jdata, state and stack_state are not used.
 */
int orte_errmgr_example_global_process_fault(orte_job_t *jdata,
                                             orte_process_name_t *proc_name,
                                             orte_proc_state_t state,
                                             orte_errmgr_stack_state_t *stack_state)
{
    const int out_id = mca_errmgr_example_component.super.output_handle;

    opal_output_verbose(10, out_id,
                        "errmgr:example:process_fault(%s)",
                        ORTE_NAME_PRINT(proc_name));
    return ORTE_SUCCESS;
}
/*
 * Map-target suggestion hook: traces the call at verbosity level 10 and
 * returns ORTE_SUCCESS. node_list is not modified.
 */
int orte_errmgr_example_global_suggest_map_targets(orte_proc_t *proc,
                                                   orte_node_t *oldnode,
                                                   opal_list_t *node_list,
                                                   orte_errmgr_stack_state_t *stack_state)
{
    const int out_id = mca_errmgr_example_component.super.output_handle;

    opal_output_verbose(10, out_id, "errmgr:example:suggest_map_targets()");
    return ORTE_SUCCESS;
}
/*
 * Fault-tolerance event hook. The example component takes no action for any
 * `state` value and simply reports success.
 */
int orte_errmgr_example_global_ft_event(int state)
{
return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/

Просмотреть файл

@ -1,14 +0,0 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE ErrMgr Example framework.
#

Просмотреть файл

@ -14,7 +14,9 @@ dist_pkgdata_DATA = help-orte-errmgr-hnp.txt
sources = \
errmgr_hnp.h \
errmgr_hnp_component.c \
errmgr_hnp.c
errmgr_hnp.c \
errmgr_hnp_autor.c \
errmgr_hnp_crmig.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

Просмотреть файл

@ -51,83 +51,305 @@
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnp.h"
/* Local functions */
/**********************
* C/R Mgr Components
* Global: HNP
**********************/
/*
 * Function table returned by orte_errmgr_hnp_component_query() when running
 * in the HNP. Entries are positional; log and abort use the framework's base
 * implementations, everything else is provided by this file.
 */
static orte_errmgr_base_module_t global_module = {
/** Initialization Function */
orte_errmgr_hnp_global_module_init,
/** Finalization Function */
orte_errmgr_hnp_global_module_finalize,
/** Error Log */
orte_errmgr_base_log,
/** Forced Abort */
orte_errmgr_base_abort,
/** Update State */
orte_errmgr_hnp_global_update_state,
/* Predicted Fault */
orte_errmgr_hnp_global_predicted_fault,
/* Suggest proc to node mapping */
orte_errmgr_hnp_global_suggest_map_targets,
/* FT Event hook */
orte_errmgr_hnp_global_ft_event
};
/*
* Local functions
*/
static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code);
static void failed_start(orte_job_t *jdata);
static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate,
orte_proc_state_t state, orte_exit_code_t exit_code);
static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static void check_job_complete(orte_job_t *jdata);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_exit_code_t exit_code);
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
orte_proc_state_t state, orte_exit_code_t exit_code);
/*
* Module functions: Global
*/
static int init(void);
static int finalize(void);
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map,
orte_errmgr_stack_state_t *stack_state);
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state);
static int ft_event(int state);
/******************
* HNP module
******************/
orte_errmgr_base_module_t orte_errmgr_hnp_module = {
init,
finalize,
update_state,
predicted_fault,
suggest_map_targets,
ft_event
};
/************************
* API Definitions
************************/
static int init(void)
/*
 * MCA query hook for the HNP ErrMgr component.
 *
 * In the HNP, *priority receives the component's configured priority and
 * *module the global module table. In any other process type the component
 * opts out (*module = NULL, *priority = -1), since daemons and applications
 * have their own components.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
{
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp:component_query()");

    /* Daemons and Apps have their own components */
    if (!ORTE_PROC_IS_HNP) {
        *module = NULL;
        *priority = -1;
        return ORTE_SUCCESS;
    }

    *priority = mca_errmgr_hnp_component.super.priority;
    *module = (mca_base_module_t *)&global_module;
    return ORTE_SUCCESS;
}
/*******************
* Global Functions
********************/
/*
 * Module initialization for the HNP ErrMgr component.
 *
 * With C/R support compiled in, first initializes process migration (crmig)
 * and automatic recovery (autor) when each is enabled, then the base HNP
 * logic. The first failing step's return code is returned and the remaining
 * steps are skipped.
 *
 * @retval ORTE_SUCCESS  every step succeeded
 * @retval other         return code of the first step that failed
 */
int orte_errmgr_hnp_global_module_init(void)
{
    int rc;

#if OPAL_ENABLE_FT_CR
    if (mca_errmgr_hnp_component.crmig_enabled) {
        rc = orte_errmgr_hnp_crmig_global_module_init();
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    if (mca_errmgr_hnp_component.autor_enabled) {
        rc = orte_errmgr_hnp_autor_global_module_init();
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }
#endif /* OPAL_ENABLE_FT_CR */

    rc = orte_errmgr_hnp_base_global_init();
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    return ORTE_SUCCESS;
}
/*
 * Module finalization for the HNP ErrMgr component.
 *
 * With C/R support compiled in, first finalizes process migration (crmig)
 * and automatic recovery (autor) when each is enabled, then the base HNP
 * logic. The first failing step's return code is returned and the remaining
 * steps are skipped.
 *
 * @retval ORTE_SUCCESS  every step succeeded
 * @retval other         return code of the first step that failed
 */
int orte_errmgr_hnp_global_module_finalize(void)
{
    int rc;

#if OPAL_ENABLE_FT_CR
    if (mca_errmgr_hnp_component.crmig_enabled) {
        rc = orte_errmgr_hnp_crmig_global_module_finalize();
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    if (mca_errmgr_hnp_component.autor_enabled) {
        rc = orte_errmgr_hnp_autor_global_module_finalize();
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }
#endif /* OPAL_ENABLE_FT_CR */

    rc = orte_errmgr_hnp_base_global_finalize();
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    return ORTE_SUCCESS;
}
/*
 * State-update entry point for the HNP ErrMgr component.
 *
 * Clears ignore_current_update, flags term_in_progress when ORTE is
 * finalizing, job termination was ordered, or the reported process state is
 * ORTE_PROC_STATE_TERMINATED, then dispatches the update:
 *   - to crmig, when enabled and no automatic recovery is in progress;
 *   - to autor, when enabled and no migration is in progress;
 *   - to the base HNP handler, unless ignore_current_update is set by the
 *     time the C/R handlers have run.
 * The C/R handlers are only compiled in with OPAL_ENABLE_FT_CR.
 *
 * @retval ORTE_SUCCESS  every handler that ran succeeded
 * @retval other         return code of the first handler that failed; later
 *                       handlers are not called
 */
int orte_errmgr_hnp_global_update_state(orte_jobid_t job,
                                        orte_job_state_t jobstate,
                                        orte_process_name_t *proc_name,
                                        orte_proc_state_t state,
                                        pid_t pid,
                                        orte_exit_code_t exit_code)
{
    int rc;

    mca_errmgr_hnp_component.ignore_current_update = false;

    if (orte_finalizing ||
        orte_job_term_ordered ||
        ORTE_PROC_STATE_TERMINATED == state) {
        mca_errmgr_hnp_component.term_in_progress = true;
    }

    /* A process in the HNP's own job is a daemon; anything else is an app */
    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "errmgr:hnp:update_state() %s) "
                         "------- %s state updated for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ((NULL != proc_name && proc_name->jobid == ORTE_PROC_MY_HNP->jobid)
                          ? "Daemon" : "App. Process"),
                         (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name)));

#if OPAL_ENABLE_FT_CR
    if (mca_errmgr_hnp_component.crmig_enabled &&
        !mca_errmgr_hnp_component.autor_in_progress) {
        rc = orte_errmgr_hnp_crmig_global_update_state(job, jobstate, proc_name,
                                                       state, pid, exit_code);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    if (mca_errmgr_hnp_component.autor_enabled &&
        !mca_errmgr_hnp_component.crmig_in_progress) {
        rc = orte_errmgr_hnp_autor_global_update_state(job, jobstate, proc_name,
                                                       state, pid, exit_code);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }
#endif /* OPAL_ENABLE_FT_CR */

    if (!mca_errmgr_hnp_component.ignore_current_update) {
        rc = orte_errmgr_hnp_base_global_update_state(job, jobstate, proc_name,
                                                      state, pid, exit_code);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Predicted-fault entry point for the HNP ErrMgr component.
 *
 * Predicted faults are only handled by process migration (crmig). Without
 * C/R support compiled in, or with crmig disabled, the call reports
 * ORTE_ERR_NOT_IMPLEMENTED; otherwise the crmig handler's result is returned.
 */
int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list,
                                           opal_list_t *node_list,
                                           opal_list_t *suggested_map)
{
#if OPAL_ENABLE_FT_CR
    int rc;

    if (!mca_errmgr_hnp_component.crmig_enabled) {
        return ORTE_ERR_NOT_IMPLEMENTED;
    }

    rc = orte_errmgr_hnp_crmig_global_predicted_fault(proc_list,
                                                      node_list,
                                                      suggested_map);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }
    return ORTE_SUCCESS;
#else
    return ORTE_ERR_NOT_IMPLEMENTED;
#endif /* OPAL_ENABLE_FT_CR */
}
/*
 * Map-target suggestion entry point for the HNP ErrMgr component.
 *
 * With C/R support compiled in, asks crmig (when enabled and no automatic
 * recovery is in progress) and then autor (when enabled and no migration is
 * in progress) for target nodes. If neither handler is eligible, or C/R
 * support is not compiled in, ORTE_ERR_NOT_IMPLEMENTED is returned.
 *
 * @retval ORTE_SUCCESS              every handler that ran succeeded
 * @retval ORTE_ERR_NOT_IMPLEMENTED  no handler ran
 * @retval other                     return code of the first failing handler
 */
int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
                                               orte_node_t *oldnode,
                                               opal_list_t *node_list)
{
#if OPAL_ENABLE_FT_CR
    int status = ORTE_ERR_NOT_IMPLEMENTED;

    if (mca_errmgr_hnp_component.crmig_enabled &&
        !mca_errmgr_hnp_component.autor_in_progress) {
        status = orte_errmgr_hnp_crmig_global_suggest_map_targets(proc,
                                                                  oldnode,
                                                                  node_list);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }

    if (mca_errmgr_hnp_component.autor_enabled &&
        !mca_errmgr_hnp_component.crmig_in_progress) {
        status = orte_errmgr_hnp_autor_global_suggest_map_targets(proc,
                                                                  oldnode,
                                                                  node_list);
    }

    return status;
#else
    return ORTE_ERR_NOT_IMPLEMENTED;
#endif /* OPAL_ENABLE_FT_CR */
}
/*
 * Fault-tolerance event entry point for the HNP ErrMgr component.
 *
 * With C/R support compiled in, forwards the event to process migration
 * (crmig) and automatic recovery (autor) when each is enabled, then to the
 * base HNP handler. The feature hooks are guarded by the same "enabled"
 * flags that guard their init/finalize/update_state calls, so a disabled
 * (and therefore never initialized) feature is not sent FT events.
 *
 * @param[in] state  FT event state, passed through unchanged
 *
 * @retval ORTE_SUCCESS  every handler that ran succeeded
 * @retval other         return code of the first handler that failed; later
 *                       handlers are not called
 */
int orte_errmgr_hnp_global_ft_event(int state)
{
    int ret, exit_status = ORTE_SUCCESS;

#if OPAL_ENABLE_FT_CR
    /* Only notify features that are actually active */
    if( mca_errmgr_hnp_component.crmig_enabled ) {
        if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_ft_event(state)) ) {
            exit_status = ret;
            goto cleanup;
        }
    }

    if( mca_errmgr_hnp_component.autor_enabled ) {
        if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_ft_event(state)) ) {
            exit_status = ret;
            goto cleanup;
        }
    }
#endif /* OPAL_ENABLE_FT_CR */

    if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_ft_event(state)) ) {
        exit_status = ret;
        goto cleanup;
    }

 cleanup:
    return exit_status;
}
/**********************
* From HNP
**********************/
/*
 * Initialization of the base HNP error-manager logic (called last by
 * orte_errmgr_hnp_global_module_init()). It sets up nothing and reports
 * success.
 */
int orte_errmgr_hnp_base_global_init(void)
{
return ORTE_SUCCESS;
}
static int finalize(void)
/*
 * Finalization of the base HNP error-manager logic (called last by
 * orte_errmgr_hnp_global_module_finalize()). It releases nothing and
 * reports success.
 */
int orte_errmgr_hnp_base_global_finalize(void)
{
return ORTE_SUCCESS;
}
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state)
int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code)
{
orte_job_t *jdata;
orte_exit_code_t sts;
@ -136,9 +358,6 @@ static int update_state(orte_jobid_t job,
orte_app_context_t *app;
orte_proc_t *pdat;
/* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported state %s"
" for proc %s state %s pid %d exit_code %d",
@ -148,18 +367,6 @@ static int update_state(orte_jobid_t job,
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state), pid, exit_code));
/********************************
* If the modules before us recovered from this error, then do not abort.
********************************/
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (*stack_state)) ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:hnp:update_proc() %s) "
"------- A previous component successfully recovered from the process fault of %s! Continuing...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
return ORTE_SUCCESS;
}
/*
* if orte is trying to shutdown, just let it
*/
@ -340,7 +547,7 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_ABORTED:
case ORTE_PROC_STATE_ABORTED_BY_SIG:
case ORTE_PROC_STATE_TERM_WO_SYNC:
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jdata->enable_recovery) {
if( jdata->enable_recovery ) {
/* is this a local proc */
if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its local restart limit */
@ -371,7 +578,7 @@ static int update_state(orte_jobid_t job,
/* guess not - let it fall thru to abort */
}
}
update_proc(jdata, proc, state, pid, exit_code);
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata); /* need to set the job state */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -384,7 +591,7 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_FAILED_TO_START:
case ORTE_PROC_STATE_CALLED_ABORT:
update_proc(jdata, proc, state, pid, exit_code);
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata);
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -397,22 +604,22 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_REGISTERED:
case ORTE_PROC_STATE_RUNNING:
update_proc(jdata, proc, state, pid, exit_code);
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
break;
case ORTE_PROC_STATE_LAUNCHED:
/* record the pid for this child */
update_proc(jdata, proc, state, pid, exit_code);
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
break;
case ORTE_PROC_STATE_TERMINATED:
case ORTE_PROC_STATE_KILLED_BY_CMD:
update_proc(jdata, proc, state, pid, exit_code);
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata);
break;
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
update_proc(jdata, proc, state, pid, exit_code);
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
killprocs(proc->jobid, proc->vpid);
check_job_complete(jdata); /* need to set the job state */
/* the job object for this job will have been NULL'd
@ -423,7 +630,7 @@ static int update_state(orte_jobid_t job,
hnp_abort(jdata->jobid, exit_code);
}
break;
case ORTE_PROC_STATE_COMM_FAILED:
/* is this to a daemon? */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
@ -442,7 +649,7 @@ static int update_state(orte_jobid_t job,
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
/* update daemon job */
record_dead_daemon(jdata, proc->vpid, state, 0);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
/* check for complete */
check_job_complete(jdata);
break;
@ -457,7 +664,7 @@ static int update_state(orte_jobid_t job,
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
/* update daemon job */
record_dead_daemon(jdata, proc->vpid, state, exit_code);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* check for complete */
check_job_complete(jdata);
break;
@ -468,7 +675,7 @@ static int update_state(orte_jobid_t job,
/* purge the oob */
orte_rml.purge(proc);
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
if( orte_enable_recovery ) {
/* relocate its processes */
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
/* unable to relocate for some reason */
@ -493,7 +700,7 @@ static int update_state(orte_jobid_t job,
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
}
/* remove this proc from the daemon job */
record_dead_daemon(jdata, proc->vpid, state, exit_code);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
@ -506,10 +713,10 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
/* heartbeats are only from daemons */
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
if( orte_enable_recovery ) {
/* relocate its processes */
} else {
record_dead_daemon(jdata, proc->vpid, state, exit_code);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
@ -525,23 +732,7 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
/*
 * Predicted-fault hook: not provided here.
 * None of the arguments (including stack_state) are read or modified.
 *
 * @return ORTE_ERR_NOT_IMPLEMENTED, always
 */
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map,
orte_errmgr_stack_state_t *stack_state)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/*
 * Suggest-map-targets hook: not provided here.
 * None of the arguments (including stack_state) are read or modified.
 *
 * @return ORTE_ERR_NOT_IMPLEMENTED, always
 */
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
int ft_event(int state)
int orte_errmgr_hnp_base_global_ft_event(int state)
{
return ORTE_SUCCESS;
}
@ -697,11 +888,11 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
}
}
static void update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code)
void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code)
{
opal_list_item_t *item, *next;
orte_odls_child_t *child;
@ -1230,7 +1421,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
/* remove this proc from the daemon job */
record_dead_daemon(jdata, proc->vpid, state, exit_code);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* check to see if any other nodes are "alive" */
if (!orte_hnp_is_allocated && jdata->num_procs == 1) {
return ORTE_ERR_FATAL;
@ -1355,8 +1546,10 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc)
return NULL;
}
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
orte_proc_state_t state, orte_exit_code_t exit_code)
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
orte_vpid_t vpid,
orte_proc_state_t state,
orte_exit_code_t exit_code)
{
orte_job_t *jdt;
orte_proc_t *pdat;
@ -1387,8 +1580,21 @@ static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
}
/* get the job data object for this process */
if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) {
/* major problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
/* It is possible that the process job finishes before the daemons.
* In that case the process state is set to normal termination, and
* the job data has already been cleared. So no need to throw an
* error.
*/
if( ORTE_PROC_STATE_TERMINATED != pdat->state ) {
opal_output(0,
"%s Error: Failed to find job_data for proc %s (%s) on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pdat->name),
orte_proc_state_to_str(pdat->state),
node->name );
/* major problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
}
continue;
}
pdat->state = ORTE_PROC_STATE_ABORTED;

Просмотреть файл

@ -25,10 +25,108 @@ BEGIN_C_DECLS
/*
* Local Component structures
*/
struct orte_errmgr_hnp_component_t {
orte_errmgr_base_component_t super; /** Base Errmgr component */
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_hnp_component;
bool ignore_current_update;
bool term_in_progress;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_hnp_module;
#if OPAL_ENABLE_FT_CR
/* State of the Recovery */
bool crmig_in_progress;
bool autor_in_progress;
/* CRMig Options */
bool crmig_enabled;
bool crmig_timing_enabled;
/* AutoR Options */
bool autor_enabled;
bool autor_timing_enabled;
int autor_recovery_delay;
bool autor_skip_oldnode;
#endif
};
typedef struct orte_errmgr_hnp_component_t orte_errmgr_hnp_component_t;
OPAL_MODULE_DECLSPEC extern orte_errmgr_hnp_component_t mca_errmgr_hnp_component;
int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority);
void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
orte_vpid_t vpid,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
***************************/
int orte_errmgr_hnp_global_module_init(void);
int orte_errmgr_hnp_global_module_finalize(void);
int orte_errmgr_hnp_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnp_global_ft_event(int state);
/* HNP Versions */
int orte_errmgr_hnp_base_global_init(void);
int orte_errmgr_hnp_base_global_finalize(void);
int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnp_base_global_ft_event(int state);
#if OPAL_ENABLE_FT_CR
/* CRMig Versions */
int orte_errmgr_hnp_crmig_global_module_init(void);
int orte_errmgr_hnp_crmig_global_module_finalize(void);
int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnp_crmig_global_ft_event(int state);
/* AutoR Versions */
int orte_errmgr_hnp_autor_global_module_init(void);
int orte_errmgr_hnp_autor_global_module_finalize(void);
int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnp_autor_global_ft_event(int state);
#endif
END_C_DECLS

Просмотреть файл

@ -55,40 +55,11 @@
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_autor.h"
#include "errmgr_hnp.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Automatic Recovery module
******************/
/*
 * Function table for the global Automatic Recovery module.
 * orte_errmgr_autor_component_query() hands this table out when the
 * calling process is the HNP.
 */
static orte_errmgr_base_module_t global_module = {
/** Initialization Function */
orte_errmgr_autor_global_module_init,
/** Finalization Function */
orte_errmgr_autor_global_module_finalize,
/** Update State */
orte_errmgr_autor_global_update_state,
NULL, /** predicted_fault */
/*orte_errmgr_autor_global_process_fault,*/
/** Suggest map targets */
orte_errmgr_autor_global_suggest_map_targets,
/** Fault-tolerance event */
orte_errmgr_autor_global_ft_event
};
/*
 * Function table for the local Automatic Recovery module.
 * orte_errmgr_autor_component_query() hands this table out when the
 * calling process is a daemon.
 */
static orte_errmgr_base_module_t local_module = {
/** Initialization Function */
orte_errmgr_autor_local_module_init,
/** Finalization Function */
orte_errmgr_autor_local_module_finalize,
/** Update State */
orte_errmgr_autor_local_update_state,
NULL, /** predicted_fault */
/*orte_errmgr_autor_local_process_fault,*/
NULL, /* suggest_map_targets */
/** Fault-tolerance event */
orte_errmgr_autor_local_ft_event
};
#if OPAL_ENABLE_FT_CR
/************************
* Work Pool structures
************************/
@ -132,22 +103,20 @@ static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name
static int display_procs(void );
static int autor_procs_sort_compare_fn(opal_list_item_t **a,
opal_list_item_t **b);
static int orte_errmgr_hnp_autor_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state);
static void errmgr_autor_process_fault_app(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
orte_proc_state_t state);
static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
orte_proc_state_t state);
static int check_if_terminated(opal_pointer_array_t *procs);
static int check_if_restarted(opal_pointer_array_t *procs);
static void update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/*
* Timer stuff
*/
@ -167,76 +136,34 @@ static double timer_start[OPAL_CR_TIMER_MAX];
#define ERRMGR_AUTOR_TIMER_FINISH 5
#define ERRMGR_AUTOR_TIMER_MAX 6
#define ERRMGR_AUTOR_CLEAR_TIMERS() \
#define ERRMGR_AUTOR_CLEAR_TIMERS() \
{ \
if(OPAL_UNLIKELY(mca_errmgr_autor_component.timing_enabled > 0)) { \
errmgr_autor_clear_timers(); \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \
errmgr_autor_clear_timers(); \
} \
}
#define ERRMGR_AUTOR_SET_TIMER(idx) \
#define ERRMGR_AUTOR_SET_TIMER(idx) \
{ \
if(OPAL_UNLIKELY(mca_errmgr_autor_component.timing_enabled > 0)) { \
errmgr_autor_set_time(idx); \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \
errmgr_autor_set_time(idx); \
} \
}
#define ERRMGR_AUTOR_DISPLAY_ALL_TIMERS() \
#define ERRMGR_AUTOR_DISPLAY_ALL_TIMERS() \
{ \
if(OPAL_UNLIKELY(mca_errmgr_autor_component.timing_enabled > 0)) { \
errmgr_autor_display_all_timers(); \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \
errmgr_autor_display_all_timers(); \
} \
}
/************************
* Function Definitions
************************/
/*
* MCA Functions
*/
/*
 * MCA query entry point for the Automatic Recovery component.
 *
 * The component declines selection (priority -1, no module) unless both
 * runtime recovery and the component's own enable flag are on. Otherwise
 * it reports the component priority and picks the module that matches the
 * role of the calling process: the global module on the HNP, the local
 * module on a daemon, and no module for anything else.
 *
 * @param module    [out] Selected module, or NULL
 * @param priority  [out] Component priority, or -1 when declining
 *
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_autor_component_query(mca_base_module_t **module, int *priority)
{
    /* Recovery must be switched on for the runtime as a whole ... */
    if (!orte_enable_recovery) {
        opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                            "errmgr:autor:component_query() - Disabled: Recovery is not enabled");
        goto declined;
    }

    /* ... and for this component in particular */
    if (!mca_errmgr_autor_component.autor_enabled) {
        opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                            "errmgr:autor: component_query() - Disabled: C/R Automatic Recovery "
                            "is not enabled via errmgr_autor_enable MCA parameter.");
        goto declined;
    }

    opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor:component_query()");

    *priority = mca_errmgr_autor_component.super.priority;
    *module   = ORTE_PROC_IS_HNP    ? (mca_base_module_t *)&global_module
              : ORTE_PROC_IS_DAEMON ? (mca_base_module_t *)&local_module
              : NULL;
    return ORTE_SUCCESS;

 declined:
    *priority = -1;
    *module   = NULL;
    return ORTE_SUCCESS;
}
/************************
* Function Definitions: Global
************************/
int orte_errmgr_autor_global_module_init(void)
int orte_errmgr_hnp_autor_global_module_init(void)
{
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
"errmgr:autor:init()");
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(autor):init()");
procs_pending_recovery = OBJ_NEW(opal_list_t);
autor_timer_event = (opal_event_t*)malloc(sizeof(opal_event_t));
@ -249,10 +176,10 @@ int orte_errmgr_autor_global_module_init(void)
return ORTE_SUCCESS;
}
int orte_errmgr_autor_global_module_finalize(void)
int orte_errmgr_hnp_autor_global_module_finalize(void)
{
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
"errmgr:autor:finalize()");
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(autor):finalize()");
if( NULL != procs_pending_recovery ) {
OBJ_RELEASE(procs_pending_recovery);
@ -313,20 +240,19 @@ static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name
}
if( NULL == current_global_jobdata ) {
opal_output(0, "errmgr:autor:process_fault(): Global) Error: Cannot find the jdata for the current job.");
opal_output(0, "errmgr:hnp(autor):process_fault(): Global) Error: Cannot find the jdata for the current job.");
return ORTE_ERROR;
}
return ORTE_SUCCESS;
}
int orte_errmgr_autor_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state)
int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code)
{
orte_proc_t *loc_proc = NULL;
orte_job_t *jdata = NULL;
@ -336,30 +262,20 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
if( mca_errmgr_hnp_component.term_in_progress ) {
return ORTE_SUCCESS;
}
if( NULL != proc_name &&
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:autor: Update reported on self (%s), state %s. Skip...",
"%s errmgr:hnp(autor): Update reported on self (%s), state %s. Skip...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc_name),
orte_proc_state_to_str(state) ));
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:autor: job %s reported state %s"
" for proc %s state %s exit_code %d (%c)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job),
orte_job_state_to_str(jobstate),
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
orte_proc_state_to_str(state), exit_code,
(orte_finalizing ? 'T' : 'F')));
/* get the job data object for this process */
if (NULL == (jdata = orte_get_job_data_object(job))) {
ret = ORTE_ERROR;
@ -369,15 +285,27 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
}
/*
* If this job opt'ed not to be recovered, then skip
* If this is a tool, ignore
*/
if( !(jdata->enable_recovery) ) {
if( jdata->num_apps == 0 &&
OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp(autor): An external tool disconnected. Ignore...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
exit_status = ORTE_SUCCESS;
goto cleanup;
}
if( ORTE_JOB_STATE_RESTART == jobstate ) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp(autor): job %s reported state %s"
" for proc %s state %s exit_code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job),
orte_job_state_to_str(jobstate),
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
orte_proc_state_to_str(state), exit_code));
if( ORTE_JOB_STATE_RESTART == jobstate ) {
for(i = 0; i < jdata->procs->size; ++i) {
if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
continue;
@ -385,8 +313,7 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
break;
}
/*state = ORTE_PROC_STATE_KILLED_BY_CMD;*/
if( ORTE_SUCCESS != (ret = orte_errmgr_autor_global_process_fault(jdata, &(loc_proc->name), state, stack_state)) ) {
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
@ -394,7 +321,7 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
}
else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state ||
ORTE_PROC_STATE_COMM_FAILED == state ) {
if( ORTE_SUCCESS != (ret = orte_errmgr_autor_global_process_fault(jdata, proc_name, state, stack_state)) ) {
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, proc_name, state)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
@ -402,9 +329,8 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
}
else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) {
if( autor_mask_faults ) {
update_proc(jdata, proc_name, state, exit_code);
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
mca_errmgr_hnp_component.ignore_current_update = true;
orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code);
}
}
@ -412,10 +338,9 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
return ret;
}
int orte_errmgr_autor_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state)
static int orte_errmgr_hnp_autor_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state)
{
int ret;
@ -431,19 +356,18 @@ int orte_errmgr_autor_global_process_fault(orte_job_t *jdata,
current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE;
if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) {
errmgr_autor_process_fault_daemon(jdata, proc_name, state, stack_state);
errmgr_autor_process_fault_daemon(jdata, proc_name, state);
} else {
update_proc(jdata, proc_name, state, 0);
errmgr_autor_process_fault_app(jdata, proc_name, state, stack_state);
orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, 0);
errmgr_autor_process_fault_app(jdata, proc_name, state);
}
return ORTE_SUCCESS;
}
int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state)
int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list)
{
opal_list_item_t *item = NULL;
errmgr_autor_wp_item_t *wp_item = NULL;
@ -463,7 +387,7 @@ int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
* Find this process in the known failures list
*/
found = false;
if( mca_errmgr_autor_component.skip_oldnode ) {
if( mca_errmgr_hnp_component.autor_skip_oldnode ) {
for(item = opal_list_get_first(procs_pending_recovery);
item != opal_list_get_end(procs_pending_recovery);
item = opal_list_get_next(item) ) {
@ -477,8 +401,8 @@ int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
}
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
"%s errmgr:autor: suggest_map() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"%s errmgr:hnp(autor): suggest_map() "
"Process remapping: %s oldnode %s, %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
@ -525,107 +449,47 @@ int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
return ORTE_SUCCESS;
}
int orte_errmgr_autor_global_ft_event(int state)
int orte_errmgr_hnp_autor_global_ft_event(int state)
{
return ORTE_SUCCESS;
}
/************************
* Function Definitions: Local
************************/
/*
 * Initialize the local Automatic Recovery module.
 * Emits a verbose trace and resets the tracked job id and job data.
 *
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_autor_local_module_init(void)
{
    opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor:init() Local");

    /* Start out with no job being tracked */
    current_global_jobdata = NULL;
    current_global_jobid   = ORTE_JOBID_INVALID;

    return ORTE_SUCCESS;
}
/*
 * Finalize the local Automatic Recovery module.
 * Emits a verbose trace and resets the tracked job id and job data.
 *
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_autor_local_module_finalize(void)
{
    opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor:finalize() Local");

    /* Forget whatever job was being tracked */
    current_global_jobdata = NULL;
    current_global_jobid   = ORTE_JOBID_INVALID;

    return ORTE_SUCCESS;
}
/*
 * State-update hook of the local Automatic Recovery module.
 *
 * No recovery work is done here. The hook only marks the update as handled
 * so that components further down the ErrMgr stack neither abort the job
 * nor try to recover the process locally, and then traces the update.
 *
 * @param job          Job the update refers to (unused)
 * @param jobstate     Reported job state (traced only)
 * @param proc_name    Process the update refers to; may be NULL
 * @param state        Reported process state (traced only)
 * @param pid          Process id (unused)
 * @param exit_code    Exit code (unused)
 * @param stack_state  [in,out] ErrMgr stack flags; JOB_ABORT is cleared
 *                     and RECOVERED is set
 *
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_autor_local_update_state(orte_jobid_t job,
                                         orte_job_state_t jobstate,
                                         orte_process_name_t *proc_name,
                                         orte_proc_state_t state,
                                         pid_t pid,
                                         orte_exit_code_t exit_code,
                                         orte_errmgr_stack_state_t *stack_state)
{
    /*
     * If this component is enabled, then the global version takes care of
     * recovery policy. Tell lower layers in the ErrMgr stack -not- to recover
     * locally.
     *
     * The abort flag is cleared with a mask rather than toggled: a component
     * earlier in the stack may already have cleared it, and toggling would
     * then switch the abort request back on.
     */
    *stack_state &= ~(ORTE_ERRMGR_STACK_STATE_JOB_ABORT);
    *stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:autor: update_state() (Local) job state %s"
                         " for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
                         orte_proc_state_to_str(state) ));

    return ORTE_SUCCESS;
}
/*
 * Fault-tolerance event handler of the local Automatic Recovery module.
 * The event is accepted without any action; 'state' is not read.
 *
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_autor_local_ft_event(int state)
{
return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/
static void errmgr_autor_process_fault_app(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state)
orte_proc_state_t state)
{
errmgr_autor_wp_item_t *wp_item = NULL;
struct timeval soon;
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
"%s errmgr:autor: process_fault() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"%s errmgr:hnp(autor): process_fault() "
"Process fault! proc %s (0x%x)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
state));
if( !orte_sstore_base_is_checkpoint_available ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
"%s errmgr:autor: process_fault() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"%s errmgr:hnp(autor): process_fault() "
"No checkpoints are available for this job! Cannot Automaticly Recover!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
*stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
opal_show_help("help-orte-errmgr-autor.txt", "failed_to_recover_proc", true,
opal_show_help("help-orte-errmgr-hnp.txt", "autor_failed_to_recover_proc", true,
ORTE_NAME_PRINT(proc), proc->vpid);
return;
}
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
mca_errmgr_hnp_component.ignore_current_update = true;
/*
* If we are already in the shutdown stage of the recovery, then just skip it
*/
if( autor_mask_faults ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
"%s errmgr:autor:process_fault() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"%s errmgr:hnp(autor):process_fault() "
"Currently recovering the job. Failure masked!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return;
@ -648,7 +512,7 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
autor_timer_active = true;
opal_evtimer_set(autor_timer_event, errmgr_autor_recover_processes, NULL);
soon.tv_sec = mca_errmgr_autor_component.recovery_delay;
soon.tv_sec = mca_errmgr_hnp_component.autor_recovery_delay;
soon.tv_usec = 0;
opal_evtimer_add(autor_timer_event, &soon);
}
@ -658,15 +522,14 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state)
orte_proc_state_t state)
{
orte_proc_t *loc_proc = NULL, *child_proc = NULL;
orte_std_cntr_t i_proc;
int32_t i;
OPAL_OUTPUT_VERBOSE((15, mca_errmgr_autor_component.super.output_handle,
"%s errmgr:autor: process_fault_daemon() "
OPAL_OUTPUT_VERBOSE((15, mca_errmgr_hnp_component.super.output_handle,
"%s errmgr:hnp(autor): process_fault_daemon() "
"------- Daemon fault reported! proc %s (0x%x)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
@ -704,28 +567,36 @@ static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
"------- Daemon lost with the following processes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) {
child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc);
if( NULL == child_proc ) {
continue;
}
for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) {
child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc);
if( NULL == child_proc ) {
continue;
}
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"%s errmgr:base: stabalize_runtime() "
"\t %s [0x%x]",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&child_proc->name),
child_proc->state));
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"%s errmgr:base: stabalize_runtime() "
"\t %s [0x%x]",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&child_proc->name),
child_proc->state));
if( child_proc->last_errmgr_state < child_proc->state ) {
child_proc->last_errmgr_state = child_proc->state;
orte_errmgr.update_state(child_proc->name.jobid, ORTE_JOB_STATE_COMM_FAILED,
&(child_proc->name), ORTE_PROC_STATE_COMM_FAILED,
0, 1);
/*orte_errmgr_base_proc_aborted(&child_proc->name, -1);*/
}
if( child_proc->last_errmgr_state < child_proc->state ) {
child_proc->last_errmgr_state = child_proc->state;
orte_errmgr.update_state(child_proc->name.jobid, ORTE_JOB_STATE_COMM_FAILED,
&(child_proc->name), ORTE_PROC_STATE_COMM_FAILED,
0, 1);
}
}
} else {
/* This daemon had no children, so just mask the failure */
mca_errmgr_hnp_component.ignore_current_update = true;
}
/*
* Record the dead daemon
*/
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
return;
}
@ -772,7 +643,7 @@ static int display_procs(void )
}
}
opal_show_help("help-orte-errmgr-autor.txt", "recovering_job", true,
opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovering_job", true,
proc_str);
if( NULL != tmp_str ) {
@ -824,8 +695,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
/*
* Display the processes that are to be recovered
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
"%s errmgr:autor:recover() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"%s errmgr:hnp(autor):recover() "
"------- Display known failed processes in the job %s -------",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(current_global_jobdata->jobid)));
@ -836,8 +707,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
/*
* Find the latest checkpoint
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
"%s errmgr:autor:recover() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"%s errmgr:hnp(autor):recover() "
"------- Find the latest checkpoint for the job %s -------",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(current_global_jobdata->jobid)));
@ -854,8 +725,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
/*
* Safely terminate the entire job
*/
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
"errmgr:autor:recover() "
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(autor):recover() "
"------- Safely terminate the job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
@ -883,8 +754,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM);
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
"errmgr:autor:recover() "
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(autor):recover() "
"------- Done waiting for termination of job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
current_global_jobdata->num_terminated = current_global_jobdata->num_procs;
@ -893,8 +764,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
/*
* Construct the app contexts to restart
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
"%s errmgr:autor:recover() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"%s errmgr:hnp(autor):recover() "
"------- Rebuild job %s app context -------",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(current_global_jobdata->jobid)));
@ -912,7 +783,7 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\tAdjusted: \"%s\" [0x%d] [%s]\n",
ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));
}
@ -922,8 +793,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
/*
* Spawn the restarted job
*/
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
"errmgr:autor:recover() "
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(autor):recover() "
"------- Respawning the job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
orte_snapc_base_has_recovered = false;
@ -933,8 +804,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
/*
* Wait for all the processes to restart
*/
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
"errmgr:autor:recover() "
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(autor):recover() "
"------- Waiting for restart -------");
while(!check_if_restarted(current_global_jobdata->procs) ) {
opal_progress();
@ -949,12 +820,12 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
opal_progress();
}
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
"errmgr:autor:recover() "
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(autor):recover() "
"------- Finished recovering job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
opal_show_help("help-orte-errmgr-autor.txt", "recovery_complete", true);
opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true);
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH);
@ -1002,7 +873,7 @@ static int check_if_terminated(opal_pointer_array_t *procs)
}
if( !is_done ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t Still waiting for termination: \"%s\" [0x%x] < [0x%x]\n",
ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_UNTERMINATED));
}
@ -1034,7 +905,7 @@ static int check_if_restarted(opal_pointer_array_t *procs)
}
if( !is_done ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t Still waiting for restart: \"%s\" [0x%x] != [0x%x]\n",
ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING));
}
@ -1042,64 +913,6 @@ static int check_if_restarted(opal_pointer_array_t *procs)
return is_done;
}
/*
 * Record a new state and exit code for the process named by 'proc'.
 *
 * The list of local children (orte_local_children) is searched first; on a
 * match the child entry is updated together with the entry stored in
 * jdata->procs at the child's vpid. If no local child matches, jdata->procs
 * is scanned for an entry with the same jobid/vpid. The function returns
 * after the first match, and does nothing when no match exists.
 *
 * jdata->num_terminated is deliberately never incremented here (see the
 * JJH note in the remote scan).
 *
 * @param jdata      Job object whose 'procs' array holds the process entries
 * @param proc       Name (jobid/vpid) of the process to update
 * @param state      New process state to record
 * @param exit_code  New exit code to record
 */
static void update_proc(orte_job_t *jdata,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        orte_exit_code_t exit_code)
{
    opal_list_item_t *item;
    orte_odls_child_t *child;
    orte_proc_t *proct;
    int i;

    /*** UPDATE LOCAL CHILD ***/
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        if (child->name->jobid != proc->jobid ||
            child->name->vpid != proc->vpid) {
            continue;
        }

        child->state = state;
        child->exit_code = exit_code;

        /* The array slot may be empty; the remote scan below guards against
         * a NULL item, so apply the same guard here instead of
         * dereferencing blindly.
         */
        proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid);
        if (NULL != proct) {
            proct->state = state;
            proct->exit_code = exit_code;
        }
        /* JJH: num_terminated is not incremented for states beyond
         * ORTE_PROC_STATE_UNTERMINATED; see the note in the remote scan.
         */
        return;
    }

    /*** UPDATE REMOTE CHILD ***/
    for (i = 0; i < jdata->procs->size; i++) {
        proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i);
        if (NULL == proct) {
            continue;
        }
        if (proct->name.jobid != proc->jobid ||
            proct->name.vpid != proc->vpid) {
            continue;
        }

        proct->state = state;
        proct->exit_code = exit_code;
        /* JJH: Do not increment jdata->num_terminated when the state is
         *      beyond ORTE_PROC_STATE_UNTERMINATED. Otherwise the 'hnp'
         *      component will try to terminate us after we request the job
         *      to terminate. So we fake it out by making sure that
         *      num_terminated never equals num_procs.
         *      There should be a better way though...
         */
        return;
    }
}
/************************
* Timing
************************/
@ -1192,3 +1005,5 @@ static void errmgr_autor_display_indv_timer_core(double diff, char *str)
perc);
return;
}
#endif /* OPAL_ENABLE_FT_CR */

Просмотреть файл

@ -13,71 +13,189 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnp.h"
/*
* Public string for version number
*/
const char *orte_errmgr_hnp_component_version_string =
"ORTE ERRMGR hnp MCA component version " ORTE_VERSION;
"ORTE ERRMGR Hnp MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_hnp_open(void);
static int errmgr_hnp_close(void);
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority);
static int orte_errmgr_hnp_open(void);
static int orte_errmgr_hnp_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
orte_errmgr_base_component_t mca_errmgr_hnp_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
orte_errmgr_hnp_component_t mca_errmgr_hnp_component = {
/* First do the base component stuff */
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"hnp",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Handle the general mca_component_t struct containing
* meta information about the component hnp
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"hnp",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
errmgr_hnp_open,
errmgr_hnp_close,
errmgr_hnp_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
/* Component open and close functions */
orte_errmgr_hnp_open,
orte_errmgr_hnp_close,
orte_errmgr_hnp_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
50
}
};
static int errmgr_hnp_open(void)
static int orte_errmgr_hnp_open(void)
{
return ORTE_SUCCESS;
}
int val;
static int errmgr_hnp_close(void)
{
return ORTE_SUCCESS;
}
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
{
if (ORTE_PROC_IS_HNP) {
/* keep our priority low so that other modules are higher
* and will run before us
*/
*priority = 10;
*module = (mca_base_module_t *)&orte_errmgr_hnp_module;
return ORTE_SUCCESS;
}
/*
* This should be the last component to ever get used since
* it doesn't do anything.
*/
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
"priority",
"Priority of the ERRMGR hnp component",
false, false,
mca_errmgr_hnp_component.super.priority,
&mca_errmgr_hnp_component.super.priority);
*priority = -1;
*module = NULL;
return ORTE_ERROR;
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
"verbose",
"Verbose level for the ERRMGR hnp component",
false, false,
mca_errmgr_hnp_component.super.verbose,
&mca_errmgr_hnp_component.super.verbose);
/* If there is a custom verbose level for this component then use it;
* otherwise take our parent's level and output channel
*/
if ( 0 != mca_errmgr_hnp_component.super.verbose) {
mca_errmgr_hnp_component.super.output_handle = opal_output_open(NULL);
opal_output_set_verbosity(mca_errmgr_hnp_component.super.output_handle,
mca_errmgr_hnp_component.super.verbose);
} else {
mca_errmgr_hnp_component.super.output_handle = orte_errmgr_base.output;
}
#if OPAL_ENABLE_FT_CR
/****************************
* CRMig (C/R Process Migration) MCA Options
****************************/
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
"crmig_timing",
"Enable Process Migration timer",
false, false,
0, &val);
mca_errmgr_hnp_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val);
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
"crmig_enable",
"Enable Process Migration (Default: 0/off)",
false, false,
0, &val);
mca_errmgr_hnp_component.crmig_enabled = OPAL_INT_TO_BOOL(val);
/****************************
* AutoR (Automatic Recovery) MCA Options
****************************/
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
"autor_timing",
"Enable Automatic Recovery timer",
false, false,
0, &val);
mca_errmgr_hnp_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val);
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
"autor_enable",
"Enable Automatic Recovery (Default: 0/off)",
false, false,
0, &val);
mca_errmgr_hnp_component.autor_enabled = OPAL_INT_TO_BOOL(val);
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
"recovery_delay",
"Number of seconds to wait before starting to recover the job after a failure"
" [Default: 1 sec]",
false, false,
1, &val);
mca_errmgr_hnp_component.autor_recovery_delay = val;
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
"skip_oldnode",
"Skip the old node from failed proc, even if it is still available"
" [Default: Enabled]",
false, false,
1, &val);
mca_errmgr_hnp_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val);
#else
val = 0; /* Silence compiler warning */
#endif /* OPAL_ENABLE_FT_CR */
/*
* Debug Output
*/
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open()");
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: priority = %d",
mca_errmgr_hnp_component.super.priority);
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: verbosity = %d",
mca_errmgr_hnp_component.super.verbose);
#if OPAL_ENABLE_FT_CR
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: --- CR Migration Options ---");
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: Process Migration = %s",
(mca_errmgr_hnp_component.crmig_enabled ? "Enabled" : "Disabled"));
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: timing = %s",
(mca_errmgr_hnp_component.crmig_timing_enabled ? "Enabled" : "Disabled"));
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: --- Auto. Recovery Options ---");
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: Auto. Recover = %s",
(mca_errmgr_hnp_component.autor_enabled ? "Enabled" : "Disabled"));
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: timing = %s",
(mca_errmgr_hnp_component.autor_timing_enabled ? "Enabled" : "Disabled"));
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp: open: recover_delay = %d",
mca_errmgr_hnp_component.autor_recovery_delay);
mca_errmgr_hnp_component.crmig_in_progress = false;
mca_errmgr_hnp_component.autor_in_progress = false;
mca_errmgr_hnp_component.term_in_progress = false;
#endif /* OPAL_ENABLE_FT_CR */
return ORTE_SUCCESS;
}
/*
 * Component close hook for the 'hnp' ErrMgr component.
 *
 * Emits a verbose (level 10) trace message on the component's output
 * handle and reports success; nothing else is done here.
 */
static int orte_errmgr_hnp_close(void)
{
    const int out_handle = mca_errmgr_hnp_component.super.output_handle;

    opal_output_verbose(10, out_handle, "errmgr:hnp: close()");

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -53,37 +53,11 @@
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_crmig.h"
#include "errmgr_hnp.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Crmig module
******************/
/*
 * Function table built from the orte_errmgr_crmig_global_* handlers.
 * The initializer is positional, so the order of the entries below is
 * significant.
 */
static orte_errmgr_base_module_t global_module = {
/** Initialization Function */
orte_errmgr_crmig_global_module_init,
/** Finalization Function */
orte_errmgr_crmig_global_module_finalize,
/** Update State */
orte_errmgr_crmig_global_update_state,
/** Predicted Fault */
orte_errmgr_crmig_global_predicted_fault,
/* Process-fault handler is not registered in this table (entry disabled):
 * orte_errmgr_crmig_global_process_fault,
 */
/** Suggest Map Targets */
orte_errmgr_crmig_global_suggest_map_targets,
/** Fault Tolerance Event */
orte_errmgr_crmig_global_ft_event
};
/*
 * Function table built from the orte_errmgr_crmig_local_* handlers.
 * The initializer is positional; the two NULL entries occupy the slots
 * that global_module fills with its predicted-fault and
 * suggest-map-targets handlers.
 */
static orte_errmgr_base_module_t local_module = {
/** Initialization Function */
orte_errmgr_crmig_local_module_init,
/** Finalization Function */
orte_errmgr_crmig_local_module_finalize,
/** Update State */
orte_errmgr_crmig_local_update_state,
/** Predicted Fault: no handler */
NULL,
/** Suggest Map Targets: no handler */
NULL,
/** Fault Tolerance Event */
orte_errmgr_crmig_local_ft_event
};
#if OPAL_ENABLE_FT_CR
/************************************
* Locally Global vars & functions :)
@ -103,14 +77,15 @@ static int current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_map);
static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state);
static void errmgr_crmig_process_fault_app(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
orte_proc_state_t state);
static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
orte_proc_state_t state);
static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs);
static int check_if_terminated(opal_pointer_array_t *migrating_procs);
@ -124,11 +99,6 @@ static void display_request(opal_list_t *off_procs,
opal_list_t *off_nodes,
orte_snapc_base_quiesce_t *cur_datum);
static void update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/*
* Timer stuff
*/
@ -149,78 +119,36 @@ static double timer_start[OPAL_CR_TIMER_MAX];
#define ERRMGR_CRMIG_TIMER_FINISH 6
#define ERRMGR_CRMIG_TIMER_MAX 7
#define ERRMGR_CRMIG_CLEAR_TIMERS() \
#define ERRMGR_CRMIG_CLEAR_TIMERS() \
{ \
if(OPAL_UNLIKELY(mca_errmgr_crmig_component.timing_enabled > 0)) { \
errmgr_crmig_clear_timers(); \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
errmgr_crmig_clear_timers(); \
} \
}
#define ERRMGR_CRMIG_SET_TIMER(idx) \
#define ERRMGR_CRMIG_SET_TIMER(idx) \
{ \
if(OPAL_UNLIKELY(mca_errmgr_crmig_component.timing_enabled > 0)) { \
errmgr_crmig_set_time(idx); \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
errmgr_crmig_set_time(idx); \
} \
}
#define ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \
#define ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \
{ \
if(OPAL_UNLIKELY(mca_errmgr_crmig_component.timing_enabled > 0)) { \
errmgr_crmig_display_all_timers(); \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
errmgr_crmig_display_all_timers(); \
} \
}
/************************
* Function Definitions
************************/
/*
* MCA Functions
*/
/*
 * MCA query hook for the 'crmig' ErrMgr component.
 *
 * Reports (*module = NULL, *priority = -1) when orte_enable_recovery is
 * off or when the component's crmig_enabled flag is off. Otherwise reports
 * the component's configured priority together with global_module on the
 * HNP, local_module on a daemon, or a NULL module on any other process.
 *
 * @param module    [out] selected module, or NULL
 * @param priority  [out] component priority, or -1 when disabled
 * @return ORTE_SUCCESS in every case
 */
int orte_errmgr_crmig_component_query(mca_base_module_t **module, int *priority)
{
    const int out_handle = mca_errmgr_crmig_component.super.output_handle;

    /* Start from the "not selectable" answer; it is only overwritten once
     * both enable checks have passed.
     */
    *module   = NULL;
    *priority = -1;

    if (!orte_enable_recovery) {
        opal_output_verbose(10, out_handle,
                            "errmgr:crmig: component_query() - Disabled: Recovery is not enabled");
        return ORTE_SUCCESS;
    }

    if (!mca_errmgr_crmig_component.crmig_enabled) {
        opal_output_verbose(10, out_handle,
                            "errmgr:crmig: component_query() - Disabled: Process Migration "
                            "is not enabled via errmgr_crmig_enable MCA parameter.");
        return ORTE_SUCCESS;
    }

    opal_output_verbose(10, out_handle,
                        "errmgr:crmig: component_query()");

    *priority = mca_errmgr_crmig_component.super.priority;

    /* Any process that is neither HNP nor daemon keeps the NULL module. */
    if (ORTE_PROC_IS_HNP) {
        *module = (mca_base_module_t *)&global_module;
    } else if (ORTE_PROC_IS_DAEMON) {
        *module = (mca_base_module_t *)&local_module;
    }

    return ORTE_SUCCESS;
}
/************************
* Function Definitions: Global
************************/
int orte_errmgr_crmig_global_module_init(void)
int orte_errmgr_hnp_crmig_global_module_init(void)
{
int ret;
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig: init()");
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig): init()");
migrating_underway = false;
@ -240,12 +168,12 @@ int orte_errmgr_crmig_global_module_init(void)
return ORTE_SUCCESS;
}
int orte_errmgr_crmig_global_module_finalize(void)
int orte_errmgr_hnp_crmig_global_module_finalize(void)
{
int ret;
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig: finalize()");
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig): finalize()");
/*
* Finalize the connection to the orte-migrate tool
@ -265,10 +193,9 @@ int orte_errmgr_crmig_global_module_finalize(void)
return ORTE_SUCCESS;
}
int orte_errmgr_crmig_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map,
orte_errmgr_stack_state_t *stack_state)
int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map)
{
int ret, exit_status = ORTE_SUCCESS;
orte_job_t *jdata = NULL;
@ -299,7 +226,7 @@ int orte_errmgr_crmig_global_predicted_fault(opal_list_t *proc_list,
break;
}
if( NULL == current_global_jobdata ) {
opal_output(0, "errmgr:crmig:predicted_fault(): Global) Error: Cannot find the jdata for the current job.");
opal_output(0, "errmgr:hnp(crmig):predicted_fault(): Global) Error: Cannot find the jdata for the current job.");
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
@ -331,19 +258,18 @@ int orte_errmgr_crmig_global_predicted_fault(opal_list_t *proc_list,
goto cleanup;
}
opal_show_help("help-orte-errmgr-crmig.txt", "migrated_job", true);
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true);
cleanup:
return exit_status;
}
int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state)
int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code)
{
orte_job_t *jdata = NULL;
int ret = ORTE_SUCCESS;
@ -351,19 +277,10 @@ int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
if( mca_errmgr_hnp_component.term_in_progress ) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:crmig: job %s reported state %s"
" for proc %s state %s exit_code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job),
orte_job_state_to_str(jobstate),
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
orte_proc_state_to_str(state), exit_code));
/* get the job data object for this process */
if (NULL == (jdata = orte_get_job_data_object(job))) {
ret = ORTE_ERROR;
@ -371,9 +288,29 @@ int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
return ret;
}
/*
* If this is a tool, ignore
*/
if( jdata->num_apps == 0 &&
OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp(crmig): An external tool disconnected. Ignore...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp(crmig): job %s reported state %s"
" for proc %s state %s exit_code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job),
orte_job_state_to_str(jobstate),
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
orte_proc_state_to_str(state), exit_code));
if( ORTE_PROC_STATE_ABORTED_BY_SIG == state ||
ORTE_PROC_STATE_COMM_FAILED == state ) {
if( ORTE_SUCCESS != (ret = orte_errmgr_crmig_global_process_fault(jdata, proc_name, state, stack_state)) ) {
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_process_fault(jdata, proc_name, state)) ) {
ORTE_ERROR_LOG(ret);
return ret;
}
@ -381,50 +318,17 @@ int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) {
if( migrating_underway ) {
/* If we are migrating, then we need to mask this to prevent the lower level from terminating us */
update_proc(jdata, proc_name, state, exit_code);
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
mca_errmgr_hnp_component.ignore_current_update = true;
orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code);
}
}
return ORTE_SUCCESS;
}
/*
 * Route a reported process fault to the matching handler.
 *
 * A process whose jobid equals that of ORTE_PROC_MY_NAME is passed to the
 * daemon fault handler; every other process is passed to the application
 * fault handler. Always returns ORTE_SUCCESS.
 */
int orte_errmgr_crmig_global_process_fault(orte_job_t *jdata,
                                           orte_process_name_t *proc_name,
                                           orte_proc_state_t state,
                                           orte_errmgr_stack_state_t *stack_state)
{
    /*
     * JJH: Todo
     * The expected logic here is:
     *   - a daemon with children fails:
     *       abort migration.
     *   - a daemon without children fails:
     *       continue. No processes lost.
     *   - an application process fails:
     *       abort migration. Might be a bad checkpoint, or a process that
     *       we were not migrating that died.
     *   - anything else:
     *       continue.
     */
    if (proc_name->jobid != ORTE_PROC_MY_NAME->jobid) {
        errmgr_crmig_process_fault_app(jdata, proc_name, state, stack_state);
        return ORTE_SUCCESS;
    }

    errmgr_crmig_process_fault_daemon(jdata, proc_name, state, stack_state);
    return ORTE_SUCCESS;
}
int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state)
int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list)
{
int exit_status = ORTE_SUCCESS;
opal_list_item_t *item = NULL, *m_item = NULL;
@ -482,8 +386,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
OBJ_RELEASE(item);
continue;
} else {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------",
ORTE_NAME_PRINT(&proc->name), oldnode->name,
current_proc_map->pre_map_fixed_node, node->name));
}
@ -498,8 +402,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
* If 'off_current_node' then exclude current node
*/
if( current_proc_map->off_current_node ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Remove old node (info) [%15s : %10s] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Remove old node (info) [%15s : %10s] -------",
ORTE_NAME_PRINT(&proc->name), oldnode->name));
for( item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
@ -537,8 +441,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
break;
}
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------",
ORTE_NAME_PRINT(&proc->name), ORTE_NAME_PRINT(&peer_proc->name),
oldnode->name, current_proc_map->map_node_name));
}
@ -563,8 +467,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
OBJ_RELEASE(item);
continue;
} else {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------",
ORTE_NAME_PRINT(&proc->name), oldnode->name,
current_proc_map->map_node_name, node->name));
}
@ -578,8 +482,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
/*
* Otherwise then map as if there was no exclusive mapping
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------",
ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
}
/*
@ -590,8 +494,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
* Remove the old node from the list, if there are more than 1 nodes available
*/
if(1 < opal_list_get_size(node_list) ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Remove old node [%15s : %10s] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Remove old node [%15s : %10s] -------",
ORTE_NAME_PRINT(&proc->name), oldnode->name));
for( item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
@ -612,8 +516,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
* If we do not have any general suggestions, then just return
*/
if( opal_list_get_size(current_onto_mapping_general) <= 0 ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------",
ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
exit_status = ORTE_SUCCESS;
goto cleanup;
@ -622,8 +526,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
/*
* Otherwise look through the general suggestions as an include list
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Suggest a target for [%15s : 0x%x : %10s] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Suggest a target for [%15s : 0x%x : %10s] -------",
ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
num_suggested = 0;
@ -653,87 +557,58 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
++num_suggested;
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------",
num_suggested, ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name, node->name));
}
cleanup:
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------",
(int)opal_list_get_size(node_list), ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
return exit_status;
}
int orte_errmgr_crmig_global_ft_event(int state)
int orte_errmgr_hnp_crmig_global_ft_event(int state)
{
return ORTE_SUCCESS;
}
/************************
* Function Definitions: Global
* Function Definitions: Static
************************/
/*
 * Initialize the local crmig module: log the call at verbosity 10 and
 * reset the file-level migration tracking state (no job selected, no
 * migration underway). Always returns ORTE_SUCCESS.
 */
int orte_errmgr_crmig_local_module_init(void)
{
    const int out_handle = mca_errmgr_crmig_component.super.output_handle;

    opal_output_verbose(10, out_handle, "errmgr:crmig: init() (Local)");

    current_global_jobdata = NULL;
    current_global_jobid   = ORTE_JOBID_INVALID;
    migrating_underway     = false;

    return ORTE_SUCCESS;
}
/*
 * Finalize the local crmig module: log the call at verbosity 10 and
 * reset the file-level migration tracking state (no job selected, no
 * migration underway). Always returns ORTE_SUCCESS.
 */
int orte_errmgr_crmig_local_module_finalize(void)
{
    const int out_handle = mca_errmgr_crmig_component.super.output_handle;

    opal_output_verbose(10, out_handle, "errmgr:crmig: finalize() (Local)");

    current_global_jobdata = NULL;
    current_global_jobid   = ORTE_JOBID_INVALID;
    migrating_underway     = false;

    return ORTE_SUCCESS;
}
int orte_errmgr_crmig_local_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state)
static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state)
{
/*
* If this component is enabled, then the global version takes care of
* recovery policy. Tell lower layers in the ErrMgr stack -not- to recover
* locally.
* JJH: Todo
* The expected logic here is:
* if( a daemon with children fails ) {
* abort migration.
* }
* if( a daemon without children fails ) {
* continue. No processes lost
* }
* if( an application process fails ) {
* abort migration. Might be a bad checkpoint, or a process that we were
* not migrating that died.
* }
* else {
* continue;
* }
*/
if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) {
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) {
errmgr_crmig_process_fault_daemon(jdata, proc_name, state);
} else {
errmgr_crmig_process_fault_app(jdata, proc_name, state);
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:crmig: update_state() (Local) job state %s"
" for proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_job_state_to_str(jobstate),
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
orte_proc_state_to_str(state) ));
return ORTE_SUCCESS;
}
/*
 * Fault-tolerance event hook for the local crmig module.
 * No action is taken for any event; 'state' is ignored and
 * ORTE_SUCCESS is returned.
 */
int orte_errmgr_crmig_local_ft_event(int state)
{
    (void)state;

    return ORTE_SUCCESS;
}
static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_maps)
{
int ret, exit_status = ORTE_SUCCESS;
@ -756,8 +631,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
ERRMGR_CRMIG_CLEAR_TIMERS();
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_START);
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Migrating (%3d, %3d, %3d) -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Migrating (%3d, %3d, %3d) -------",
(int)opal_list_get_size(off_procs),
(int)opal_list_get_size(off_nodes),
(int)opal_list_get_size(onto_maps)));
@ -836,13 +711,13 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
onto_map->proc_name.vpid == onto_map->map_proc_name.vpid ) &&
(NULL == onto_map->map_node_name ||
0 == strncmp(onto_map->map_node_name, proc->node->name, strlen(proc->node->name))) ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Process %15s does not wish to move -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Process %15s does not wish to move -------",
ORTE_NAME_PRINT(&proc->name)));
} else {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Process %15s will be moved -------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Process %15s will be moved -------",
ORTE_NAME_PRINT(&proc->name)));
/*
* Set the process to restarting
@ -999,7 +874,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
}
}
opal_show_help("help-orte-errmgr-crmig.txt", "no_migrating_procs", true,
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_no_migrating_procs", true,
err_str_nodes,
err_str_procs);
@ -1042,12 +917,12 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
goto cleanup;
}
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Starting the checkpoint of job %s -------",
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Starting the checkpoint of job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
if( ORTE_SUCCESS != (ret = orte_snapc.start_ckpt(cur_datum)) ) {
opal_output(0, "errmgr:crmig:migrate() Error: Unable to start the checkpoint.");
opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to start the checkpoint.");
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
@ -1058,8 +933,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
/*
* Terminate the migrating processes
*/
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Terminate old processes in job %s -------",
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Terminate old processes in job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
orte_plm.terminate_procs(&cur_datum->migrating_procs);
@ -1068,8 +943,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
* Clear the IOF stdin target if necessary
*/
if( close_iof_stdin ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Closing old STDIN target for job %s (%s)-------",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Closing old STDIN target for job %s (%s)-------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid),
ORTE_NAME_PRINT(&iof_name) ));
@ -1079,8 +954,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
/*
* Wait for the processes to finish terminating
*/
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Waiting for termination -------");
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Waiting for termination -------");
while( !migrating_terminated ) {
opal_progress();
@ -1092,8 +967,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
/*
* Start remapping the processes
*/
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Checkpoint finished, setting up job %s -------",
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Checkpoint finished, setting up job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_STARTUP;
@ -1126,7 +1001,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\tAdjusted: \"%s\" [0x%d] [%s]\n",
ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));
}
@ -1137,15 +1012,15 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
* Restart the job
* - spawn function will remap and launch the replacement proc(s)
*/
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Respawning migrating processes in job %s -------",
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Respawning migrating processes in job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
orte_plm.spawn(current_global_jobdata);
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Waiting for restart -------");
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Waiting for restart -------");
migrating_restarted = false;
while( !migrating_restarted ) {
@ -1158,12 +1033,12 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
/*
* Finish the checkpoint
*/
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Reconnecting processes in job %s -------",
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Reconnecting processes in job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
if( ORTE_SUCCESS != (ret = orte_snapc.end_ckpt(cur_datum)) ) {
opal_output(0, "errmgr:crmig:migrate() Error: Unable to end the checkpoint.");
opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to end the checkpoint.");
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
@ -1172,8 +1047,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
/*
* All done
*/
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() ------- Finished migrating processes in job %s -------",
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Finished migrating processes in job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
OBJ_RELEASE(cur_datum);
@ -1247,7 +1122,7 @@ static int check_if_terminated(opal_pointer_array_t *migrating_procs)
migrating_terminated = true;
}
else {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t Still waiting for termination: \"%s\" [0x%x] != [0x%x]\n",
ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_KILLED_BY_CMD));
}
@ -1279,7 +1154,7 @@ static int check_if_restarted(opal_pointer_array_t *migrating_procs)
migrating_restarted = true;
}
else {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\tStill waiting for restart: \"%s\" [0x%x] != [0x%x]\n",
ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING));
}
@ -1289,11 +1164,10 @@ static int check_if_restarted(opal_pointer_array_t *migrating_procs)
static void errmgr_crmig_process_fault_app(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state)
orte_proc_state_t state)
{
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:process_fault_app() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):process_fault_app() "
"------- Application fault reported! proc %s (0x%x) "
"- %s",
ORTE_NAME_PRINT(proc),
@ -1305,11 +1179,10 @@ static void errmgr_crmig_process_fault_app(orte_job_t *jdata,
static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state)
orte_proc_state_t state)
{
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:process_fault_daemon() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):process_fault_daemon() "
"------- Daemon fault reported! proc %s (0x%x) "
"- %s",
ORTE_NAME_PRINT(proc),
@ -1322,8 +1195,8 @@ static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata,
* JJH: Check to make sure this is not a new daemon loss.
*/
if( ORTE_PROC_STATE_COMM_FAILED == state ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:process_fault_daemon() "
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):process_fault_daemon() "
"------- Daemon fault reported! proc %s (0x%x) "
"- Communication failure, keep going",
ORTE_NAME_PRINT(proc),
@ -1373,8 +1246,8 @@ static void display_request(opal_list_t *off_procs,
/*
* Display all requested processes to migrate
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() Requested Processes to migrate: (%d procs)\n",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() Requested Processes to migrate: (%d procs)\n",
(int) opal_list_get_size(off_procs) ));
for(item = opal_list_get_first(off_procs);
item != opal_list_get_end(off_procs);
@ -1396,7 +1269,7 @@ static void display_request(opal_list_t *off_procs,
break;
}
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t%s (Rank %3d) on node %s\n",
ORTE_NAME_PRINT(&proc->name), (int)off_proc->proc_name.vpid, proc->node->name));
}
@ -1404,8 +1277,8 @@ static void display_request(opal_list_t *off_procs,
/*
* Display Off Nodes
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() Requested Nodes to migration: (%d nodes)\n",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() Requested Nodes to migration: (%d nodes)\n",
(int)opal_list_get_size(off_nodes) ));
for(item = opal_list_get_first(off_nodes);
@ -1426,7 +1299,7 @@ static void display_request(opal_list_t *off_procs,
}
}
if( found ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t\"%s\" \t%d\n",
node->name, node->num_procs));
for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) {
@ -1435,7 +1308,7 @@ static void display_request(opal_list_t *off_procs,
continue;
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t\t\"%s\" [0x%x]\n",
ORTE_NAME_PRINT(&proc->name), proc->state));
}
@ -1445,26 +1318,26 @@ static void display_request(opal_list_t *off_procs,
/*
* Suggested onto nodes
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() Suggested nodes to migration onto: (%d nodes)\n",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() Suggested nodes to migration onto: (%d nodes)\n",
(int)opal_list_get_size(current_onto_mapping_general) ));
for(item = opal_list_get_first(current_onto_mapping_general);
item != opal_list_get_end(current_onto_mapping_general);
item = opal_list_get_next(item) ) {
onto_map = (orte_errmgr_predicted_map_t*) item;
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t\"%s\"\n",
onto_map->map_node_name));
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n",
(int)opal_list_get_size(current_onto_mapping_exclusive) ));
for(item = opal_list_get_first(current_onto_mapping_exclusive);
item != opal_list_get_end(current_onto_mapping_exclusive);
item = opal_list_get_next(item) ) {
onto_map = (orte_errmgr_predicted_map_t*) item;
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t%d\t(%c)\t\"%s\"\n",
onto_map->proc_name.vpid,
(onto_map->off_current_node ? 'T' : 'F'),
@ -1474,8 +1347,8 @@ static void display_request(opal_list_t *off_procs,
/*
* Display all processes scheduled to migrate
*/
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
"errmgr:crmig:migrate() All Migrating Processes: (%d procs)\n",
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() All Migrating Processes: (%d procs)\n",
cur_datum->num_migrating));
for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) {
proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc);
@ -1483,7 +1356,7 @@ static void display_request(opal_list_t *off_procs,
continue;
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\t\"%s\" [0x%x] [%s]\n",
ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));
@ -1504,7 +1377,7 @@ static void display_request(opal_list_t *off_procs,
}
}
opal_show_help("help-orte-errmgr-crmig.txt", "migrating_job", true,
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrating_job", true,
status_str);
if( NULL != tmp_str ) {
@ -1520,64 +1393,6 @@ static void display_request(opal_list_t *off_procs,
return;
}
/*
 * Record a new state and exit code for the process named by 'proc'.
 *
 * The list of local children (orte_local_children) is searched first. On a
 * match the child entry is updated, together with the jdata->procs entry
 * indexed by the child's vpid (when that slot is populated). If no local
 * child matches, jdata->procs is scanned for an entry with the same
 * jobid/vpid. Only the first match is updated, then the function returns.
 *
 * JJH: jdata->num_terminated is deliberately NOT incremented here, even when
 * the new state is past ORTE_PROC_STATE_UNTERMINATED. Otherwise the 'hnp'
 * component will try to terminate us after we request the job to terminate.
 * So we fake it out by making sure that num_terminated never equals
 * num_procs. There should be a better way though...
 *
 * @param jdata      Job object whose 'procs' array holds the process entries
 * @param proc       Name (jobid/vpid) of the process to update
 * @param state      New process state to record
 * @param exit_code  Exit code to record
 */
static void update_proc(orte_job_t *jdata,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        orte_exit_code_t exit_code)
{
    opal_list_item_t *item;
    orte_odls_child_t *child;
    orte_proc_t *proct;
    int i;

    /*** UPDATE LOCAL CHILD ***/
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        if (child->name->jobid != proc->jobid ||
            child->name->vpid != proc->vpid) {
            continue;
        }

        child->state = state;
        child->exit_code = exit_code;

        /* The slot may be empty; guard the dereference the same way the
         * remote-child scan below does. */
        proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid);
        if (NULL != proct) {
            proct->state = state;
            proct->exit_code = exit_code;
        }
        return;
    }

    /*** UPDATE REMOTE CHILD ***/
    for (i = 0; i < jdata->procs->size; i++) {
        proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i);
        if (NULL == proct) {
            continue;
        }
        if (proct->name.jobid != proc->jobid ||
            proct->name.vpid != proc->vpid) {
            continue;
        }

        proct->state = state;
        proct->exit_code = exit_code;
        return;
    }
}
/************************
* Timing
************************/
@ -1676,3 +1491,5 @@ static void errmgr_crmig_display_indv_timer_core(double diff, char *str)
perc);
return;
}
#endif /* OPAL_ENABLE_FT_CR */

Просмотреть файл

@ -40,3 +40,32 @@ Process: %s
because the application for that process could not be found. This
appears to be a system error. Please report it to the ORTE
developers.
[autor_recovering_job]
Notice: The processes listed below failed unexpectedly.
Using the last checkpoint to recover the job.
Please stand by.
%s
[autor_recovery_complete]
Notice: The job has been successfully recovered from the
last checkpoint.
[autor_failed_to_recover_proc]
Error: The process below has failed. There is no checkpoint available for
this job, so we are terminating the application since automatic
recovery cannot occur.
Internal Name: %s
MCW Rank: %d
[crmig_migrating_job]
Notice: A migration of this job has been requested.
The processes below will be migrated.
Please stand by.
%s
[crmig_migrated_job]
Notice: The processes have been successfully migrated to/from the specified
machines.
[crmig_no_migrating_procs]
Warning: Could not find any processes to migrate on the nodes specified.
You provided the following:
Nodes: %s
Procs: %s

Просмотреть файл

@ -64,21 +64,18 @@ static int finalize(void);
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map,
orte_errmgr_stack_state_t *stack_state);
opal_list_t *suggested_map);
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
orte_exit_code_t exit_code);
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state);
opal_list_t *node_list);
static int ft_event(int state);
@ -90,6 +87,8 @@ static int ft_event(int state);
orte_errmgr_base_module_t orte_errmgr_orted_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
update_state,
predicted_fault,
suggest_map_targets,
@ -114,8 +113,7 @@ static int update_state(orte_jobid_t job,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state)
orte_exit_code_t exit_code)
{
opal_list_item_t *item, *next;
orte_odls_job_t *jobdat = NULL;
@ -126,9 +124,6 @@ static int update_state(orte_jobid_t job,
orte_vpid_t null=ORTE_VPID_INVALID;
orte_app_context_t *app;
/* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
/*
* if orte is trying to shutdown, just let it
*/
@ -315,8 +310,7 @@ static int update_state(orte_jobid_t job,
killprocs(proc->jobid, proc->vpid);
}
app = jobdat->apps[child->app_idx];
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) &&
jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
if( jobdat->enable_recovery && child->restarts < app->max_local_restarts ) {
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
@ -330,7 +324,7 @@ static int update_state(orte_jobid_t job,
}
if (ORTE_PROC_STATE_TERMINATED < state) {
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jobdat->enable_recovery) {
if( jobdat->enable_recovery ) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s RECOVERY ENABLED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -581,16 +575,14 @@ static int update_state(orte_jobid_t job,
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map,
orte_errmgr_stack_state_t *stack_state)
opal_list_t *suggested_map)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state)
opal_list_t *node_list)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
@ -600,9 +592,9 @@ int ft_event(int state)
return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/
/*****************
* Local Functions
*****************/
static bool any_live_children(orte_jobid_t job)
{
opal_list_item_t *item;