Simplification of the ErrMgr framework by removing the 'stack'/composite functionality.
The composite functionality was becoming difficult to maintain, so we removed it for now, which simplifies the framework design considerably. Since the 'crmig' and 'autor' components were very similar to the 'hnp' component, this commit also merges them into it. By moving the 'crmig' and 'autor' code to a separate file under the 'hnp' component, we are able to isolate the C/R logic to a large extent, so that it is only minimally hooked into the previous 'hnp' component. Other than some name changes, the functionality is all still in place. I will update the C/R documentation later this morning. This commit was SVN r23628.
Этот коммит содержится в:
родитель
77792c937d
Коммит
fabd5cc153
@ -68,12 +68,11 @@ btl_openib_cpc_include=oob
|
||||
orte_forward_job_control=1
|
||||
|
||||
#
|
||||
# Use the C/R Error Management and Recovery Service
|
||||
# Activate the Process Migration and Automatic Recovery services in the
|
||||
# HNP ErrMgr component.
|
||||
#
|
||||
orte_enable_recovery=1
|
||||
orte_max_global_restarts=10
|
||||
errmgr_crmig_enable=1
|
||||
errmgr_autor_enable=1
|
||||
errmgr_hnp_crmig_enable=1
|
||||
errmgr_hnp_autor_enable=1
|
||||
|
||||
#
|
||||
# Additional constraints to be lifted in the future
|
||||
|
@ -43,8 +43,7 @@ static int update_state(orte_jobid_t job,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/******************
|
||||
* APP module
|
||||
@ -52,6 +51,8 @@ static int update_state(orte_jobid_t job,
|
||||
orte_errmgr_base_module_t orte_errmgr_app_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
update_state,
|
||||
NULL,
|
||||
NULL,
|
||||
@ -76,12 +77,8 @@ static int update_state(orte_jobid_t job,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
|
@ -1,38 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-errmgr-autor.txt
|
||||
|
||||
sources = \
|
||||
errmgr_autor.h \
|
||||
errmgr_autor_component.c \
|
||||
errmgr_autor_module.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_autor_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_autor.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_autor.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_autor_la_SOURCES = $(sources)
|
||||
mca_errmgr_autor_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_autor_la_SOURCES = $(sources)
|
||||
libmca_errmgr_autor_la_LDFLAGS = -module -avoid-version
|
@ -1,20 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_errmgr_autor_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_errmgr_autor_CONFIG],[
|
||||
# If we don't want FT, don't compile this component
|
||||
AS_IF([test "$opal_want_ft_cr" = "1"],
|
||||
[$1],
|
||||
[$2])
|
||||
])dnl
|
@ -1,14 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_INIT_FILE=errmgr_autor_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,88 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Automatic Recovery Errmgr component
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_AUTOR_EXPORT_H
|
||||
#define MCA_ERRMGR_AUTOR_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/event/event.h"
|
||||
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
struct orte_errmgr_autor_component_t {
|
||||
orte_errmgr_base_component_t super; /** Base Errmgr component */
|
||||
bool autor_enabled;
|
||||
bool timing_enabled;
|
||||
int recovery_delay;
|
||||
bool skip_oldnode;
|
||||
};
|
||||
typedef struct orte_errmgr_autor_component_t orte_errmgr_autor_component_t;
|
||||
OPAL_MODULE_DECLSPEC extern orte_errmgr_autor_component_t mca_errmgr_autor_component;
|
||||
|
||||
int orte_errmgr_autor_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
int orte_errmgr_autor_global_module_init(void);
|
||||
int orte_errmgr_autor_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_autor_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_autor_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_autor_global_ft_event(int state);
|
||||
|
||||
/*
|
||||
* Module functions: Local (Daemon)
|
||||
*/
|
||||
int orte_errmgr_autor_local_module_init(void);
|
||||
int orte_errmgr_autor_local_module_finalize(void);
|
||||
|
||||
int orte_errmgr_autor_local_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_autor_local_ft_event(int state);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_AUTOR_EXPORT_H */
|
@ -1,161 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_autor.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_autor_component_version_string =
|
||||
"ORTE ERRMGR AutoR MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_autor_open(void);
|
||||
static int errmgr_autor_close(void);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_autor_component_t mca_errmgr_autor_component = {
|
||||
/* First do the base component stuff */
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itself
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"autor",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
errmgr_autor_open,
|
||||
errmgr_autor_close,
|
||||
orte_errmgr_autor_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
20
|
||||
}
|
||||
};
|
||||
|
||||
static int errmgr_autor_open(void)
|
||||
{
|
||||
int val;
|
||||
|
||||
/*
|
||||
* This should be the last component to ever get used since
|
||||
* it doesn't do anything.
|
||||
*/
|
||||
mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
|
||||
"priority",
|
||||
"Priority of the ERRMGR autor component",
|
||||
false, false,
|
||||
mca_errmgr_autor_component.super.priority,
|
||||
&mca_errmgr_autor_component.super.priority);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
|
||||
"verbose",
|
||||
"Verbose level for the ERRMGR autor component",
|
||||
false, false,
|
||||
mca_errmgr_autor_component.super.verbose,
|
||||
&mca_errmgr_autor_component.super.verbose);
|
||||
/* If there is a custom verbose level for this component then use it
|
||||
* otherwise take our parent's level and output channel
|
||||
*/
|
||||
if ( 0 != mca_errmgr_autor_component.super.verbose) {
|
||||
mca_errmgr_autor_component.super.output_handle = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_errmgr_autor_component.super.output_handle,
|
||||
mca_errmgr_autor_component.super.verbose);
|
||||
} else {
|
||||
mca_errmgr_autor_component.super.output_handle = orte_errmgr_base.output;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
|
||||
"timing",
|
||||
"Enable Automatic Recovery timer",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_autor_component.timing_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
|
||||
"enable",
|
||||
"Enable Automatic Recovery (Default: 0/off)",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_autor_component.autor_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
|
||||
"recovery_delay",
|
||||
"Number of seconds to wait before starting to recover the job after a failure"
|
||||
" [Default: 1 sec]",
|
||||
false, false,
|
||||
1, &val);
|
||||
mca_errmgr_autor_component.recovery_delay = val;
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_autor_component.super.base_version,
|
||||
"skip_oldnode",
|
||||
"Skip the old node from failed proc, even if it is still available"
|
||||
" [Default: Enabled]",
|
||||
false, false,
|
||||
1, &val);
|
||||
mca_errmgr_autor_component.skip_oldnode = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor: open()");
|
||||
opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor: open: priority = %d",
|
||||
mca_errmgr_autor_component.super.priority);
|
||||
opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor: open: verbosity = %d",
|
||||
mca_errmgr_autor_component.super.verbose);
|
||||
opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor: open: timing = %s",
|
||||
(mca_errmgr_autor_component.timing_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor: open: Auto. Recover = %s",
|
||||
(mca_errmgr_autor_component.autor_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor: open: recover_delay = %d",
|
||||
mca_errmgr_autor_component.recovery_delay);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_autor_close(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor: close()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,28 +0,0 @@
|
||||
-*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for ORTE ErrMgr AutoR framework.
|
||||
#
|
||||
[recovering_job]
|
||||
Notice: The processes listed below failed unexpectedly.
|
||||
Using the last checkpoint to recover the job.
|
||||
Please standby.
|
||||
%s
|
||||
[recovery_complete]
|
||||
Notice: The job has been successfully recovered from the
|
||||
last checkpoint.
|
||||
[failed_to_recover_proc]
|
||||
Error: The process below has failed. There is no checkpoint available for
|
||||
this job, so we are terminating the application since automatic
|
||||
recovery cannot occur.
|
||||
Internal Name: %s
|
||||
MCW Rank: %d
|
@ -43,20 +43,16 @@ ORTE_DECLSPEC int orte_errmgr_base_open(void);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_select(void);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_close(void);
|
||||
|
||||
/**
|
||||
* Composite Stack states
|
||||
*/
|
||||
#define ORTE_ERRMGR_STACK_STATE_NONE 0x00 /* No actions have been performed */
|
||||
#define ORTE_ERRMGR_STACK_STATE_UPDATED 0x01 /* Updated the runtime */
|
||||
#define ORTE_ERRMGR_STACK_STATE_CONTINUE 0x02 /* Continue running without this process */
|
||||
#define ORTE_ERRMGR_STACK_STATE_RECOVERED 0x04 /* Process has been recovered */
|
||||
#define ORTE_ERRMGR_STACK_STATE_JOB_ABORT 0x08 /* Abort this job, cannot recover */
|
||||
#define ORTE_ERRMGR_STACK_STATE_COMPLETE 0x10 /* done processing this command */
|
||||
/**
|
||||
* Output and component variables
|
||||
*/
|
||||
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
|
||||
|
||||
/**
|
||||
* Internal module reference
|
||||
*/
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_component_t orte_errmgr_base_selected_component;
|
||||
|
||||
/**
|
||||
* Interfaces for orte-migrate tool
|
||||
*/
|
||||
@ -100,7 +96,7 @@ ORTE_DECLSPEC int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t
|
||||
ORTE_DECLSPEC int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global_handle, int seq_num);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_t *datum);
|
||||
|
||||
#endif
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
/*
|
||||
* Additional External API function declared in errmgr.h
|
||||
|
@ -32,30 +32,25 @@
|
||||
|
||||
int orte_errmgr_base_close(void)
|
||||
{
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i;
|
||||
|
||||
OPAL_TRACE(5);
|
||||
|
||||
/* Close all selected components */
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->finalize ) {
|
||||
module->finalize();
|
||||
}
|
||||
/* if not initialized, then skip this action. */
|
||||
if( !orte_errmgr_base.initialized ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Close selected component */
|
||||
if( NULL != orte_errmgr.finalize ) {
|
||||
orte_errmgr.finalize();
|
||||
}
|
||||
|
||||
/* Close all remaining available components (may be one if this is a
|
||||
OMPI RTE program, or [possibly] multiple if this is ompi_info) */
|
||||
* OMPI RTE program, or [possibly] multiple if this is ompi_info)
|
||||
*/
|
||||
mca_base_components_close(orte_errmgr_base.output,
|
||||
&orte_errmgr_base_components_available,
|
||||
NULL);
|
||||
|
||||
OBJ_DESTRUCT(&orte_errmgr_base.modules);
|
||||
|
||||
orte_errmgr_base.initialized = false;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -189,47 +189,6 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
ORTE_ERROR_NAME(error_code), filename, line);
|
||||
}
|
||||
|
||||
int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
int rc=ORTE_SUCCESS;
|
||||
int i;
|
||||
orte_errmgr_stack_state_t stack_state;
|
||||
orte_errmgr_base_module_t *module;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:update_state() %s) "
|
||||
"------- %s state updated for process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == name) ? "App. Process" : (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
|
||||
(NULL == name) ? "NULL" : ORTE_NAME_PRINT(name)));
|
||||
|
||||
stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
|
||||
stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
|
||||
/********************************
|
||||
* Call the active modules
|
||||
********************************/
|
||||
for (i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->update_state ) {
|
||||
rc = module->update_state(job, jobstate, name, state, pid, exit_code, &stack_state);
|
||||
if (ORTE_SUCCESS != rc || ORTE_ERRMGR_STACK_STATE_COMPLETE & stack_state) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
@ -265,90 +224,6 @@ int orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map)
|
||||
{
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i, rc;
|
||||
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:predicted_fault() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base.modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->predicted_fault ) {
|
||||
rc = module->predicted_fault(proc_list, node_list, suggested_map, &stack_state);
|
||||
if (ORTE_SUCCESS != rc || ORTE_ERRMGR_STACK_STATE_COMPLETE & stack_state) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i, rc;
|
||||
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:suggest_map_targets() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base.modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->suggest_map_targets ) {
|
||||
rc = module->suggest_map_targets(proc, oldnode, node_list, &stack_state);
|
||||
if (ORTE_SUCCESS != rc || ORTE_ERRMGR_STACK_STATE_COMPLETE & stack_state) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_ft_event(int state)
|
||||
{
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:ft_event() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base.modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->ft_event ) {
|
||||
module->ft_event(state);
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/********************
|
||||
* Utility functions
|
||||
********************/
|
||||
@ -619,9 +494,9 @@ int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global_handle, int s
|
||||
orte_snapc_base_has_recovered = false;
|
||||
loc_proc.jobid = jobid;
|
||||
loc_proc.vpid = 0;
|
||||
orte_errmgr_base_update_state(jobid, ORTE_JOB_STATE_RESTART,
|
||||
&loc_proc, ORTE_PROC_STATE_KILLED_BY_CMD,
|
||||
0, 0);
|
||||
orte_errmgr.update_state(jobid, ORTE_JOB_STATE_RESTART,
|
||||
&loc_proc, ORTE_PROC_STATE_KILLED_BY_CMD,
|
||||
0, 0);
|
||||
while( !orte_snapc_base_has_recovered ) {
|
||||
opal_progress();
|
||||
}
|
||||
@ -678,7 +553,7 @@ int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_
|
||||
opal_list_append(suggested_map_list, &(onto_map->super));
|
||||
}
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_base_predicted_fault(proc_list, node_list, suggested_map_list)) ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
|
@ -52,13 +52,18 @@ opal_list_t orte_errmgr_base_components_available;
|
||||
|
||||
orte_errmgr_base_t orte_errmgr_base;
|
||||
|
||||
orte_errmgr_base_component_t orte_errmgr_base_selected_component;
|
||||
|
||||
/* Public module provides a wrapper around previous functions */
|
||||
orte_errmgr_API_t orte_errmgr = {
|
||||
orte_errmgr_base_module_t orte_errmgr = {
|
||||
NULL, /* init */
|
||||
NULL, /* finalize */
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_update_state,
|
||||
orte_errmgr_base_predicted_fault,
|
||||
orte_errmgr_base_suggest_map_targets,
|
||||
orte_errmgr_base_abort
|
||||
orte_errmgr_base_abort,
|
||||
NULL, /* update_state */
|
||||
NULL, /* predicted_fault */
|
||||
NULL, /* suggest_map_targets */
|
||||
NULL /* ft_event */
|
||||
};
|
||||
|
||||
/**
|
||||
@ -74,9 +79,6 @@ int orte_errmgr_base_open(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&orte_errmgr_base.modules, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&orte_errmgr_base.modules, 3, INT_MAX, 1);
|
||||
|
||||
orte_errmgr_base.output = opal_output_open(NULL);
|
||||
|
||||
/*
|
||||
|
@ -33,145 +33,36 @@
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
|
||||
struct orte_errmgr_base_select_module_t {
|
||||
mca_base_component_t *component;
|
||||
mca_base_module_t *module;
|
||||
int priority;
|
||||
};
|
||||
typedef struct orte_errmgr_base_select_module_t orte_errmgr_base_select_module_t;
|
||||
|
||||
int orte_errmgr_base_select(void)
|
||||
{
|
||||
int exit_status = OPAL_SUCCESS;
|
||||
mca_base_component_list_item_t *cli = NULL;
|
||||
mca_base_component_t *component = NULL;
|
||||
mca_base_module_t *module = NULL;
|
||||
opal_list_item_t *item = NULL;
|
||||
int priority = 0, i, j, low_i;
|
||||
orte_errmgr_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
|
||||
opal_pointer_array_t tmp_array;
|
||||
orte_errmgr_base_module_t *i_module = NULL;
|
||||
bool none_found;
|
||||
|
||||
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&tmp_array, 3, INT_MAX, 1);
|
||||
|
||||
opal_output_verbose(10, orte_errmgr_base.output,
|
||||
"errmgr:base:select: Auto-selecting components");
|
||||
orte_errmgr_base_component_t *best_component = NULL;
|
||||
orte_errmgr_base_module_t *best_module = NULL;
|
||||
|
||||
/*
|
||||
* Traverse the list of available components.
|
||||
* For each call their 'query' functions to determine relative priority.
|
||||
* Select the best component
|
||||
*/
|
||||
none_found = true;
|
||||
for (item = opal_list_get_first(&orte_errmgr_base_components_available);
|
||||
item != opal_list_get_end(&orte_errmgr_base_components_available);
|
||||
item = opal_list_get_next(item) ) {
|
||||
cli = (mca_base_component_list_item_t *) item;
|
||||
component = (mca_base_component_t *) cli->cli_component;
|
||||
|
||||
/*
|
||||
* If there is a query function then use it.
|
||||
*/
|
||||
if (NULL == component->mca_query_component) {
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Skipping component [%s]. It does not implement a query function",
|
||||
component->mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Query this component for the module and priority
|
||||
*/
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Querying component [%s]",
|
||||
component->mca_component_name);
|
||||
|
||||
component->mca_query_component(&module, &priority);
|
||||
|
||||
/*
|
||||
* If no module was returned or negative priority, then skip component
|
||||
*/
|
||||
if (NULL == module || priority < 0) {
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Skipping component [%s]. Query failed to return a module",
|
||||
component->mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append them to the temporary list, we will sort later
|
||||
*/
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Query of component [%s] set priority to %d",
|
||||
component->mca_component_name, priority);
|
||||
tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t));
|
||||
tmp_module->component = component;
|
||||
tmp_module->module = module;
|
||||
tmp_module->priority = priority;
|
||||
|
||||
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
|
||||
none_found = false;
|
||||
if( OPAL_SUCCESS != mca_base_select("errmgr", orte_errmgr_base.output,
|
||||
&orte_errmgr_base_components_available,
|
||||
(mca_base_module_t **) &best_module,
|
||||
(mca_base_component_t **) &best_component) ) {
|
||||
/* This will only happen if no component was selected */
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (none_found) {
|
||||
/* must have at least one module */
|
||||
return ORTE_ERR_MODULE_NOT_FOUND;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the list by descending priority
|
||||
*/
|
||||
priority = 0;
|
||||
for(j = 0; j < tmp_array.size; ++j) {
|
||||
tmp_module_sw = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j);
|
||||
if( NULL == tmp_module_sw ) {
|
||||
continue;
|
||||
}
|
||||
/* Save the winner */
|
||||
orte_errmgr_base_selected_component = *best_component;
|
||||
orte_errmgr = *best_module;
|
||||
|
||||
low_i = -1;
|
||||
priority = tmp_module_sw->priority;
|
||||
|
||||
for(i = 0; i < tmp_array.size; ++i) {
|
||||
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i);
|
||||
if( NULL == tmp_module ) {
|
||||
continue;
|
||||
}
|
||||
if( tmp_module->priority > priority ) {
|
||||
low_i = i;
|
||||
priority = tmp_module->priority;
|
||||
}
|
||||
}
|
||||
|
||||
if( low_i >= 0 ) {
|
||||
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
|
||||
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
|
||||
j--; /* Try this entry again, if it is not the lowest */
|
||||
} else {
|
||||
tmp_module = tmp_module_sw;
|
||||
opal_pointer_array_set_item(&tmp_array, j, NULL);
|
||||
}
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Add module with priority [%s] %d",
|
||||
tmp_module->component->mca_component_name, tmp_module->priority);
|
||||
opal_pointer_array_add(&orte_errmgr_base.modules, (void*)(tmp_module->module));
|
||||
free(tmp_module);
|
||||
}
|
||||
OBJ_DESTRUCT(&tmp_array);
|
||||
|
||||
/*
|
||||
* Initialize each of the Errmgr Modules
|
||||
*/
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == i_module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != i_module->init ) {
|
||||
i_module->init();
|
||||
/* Initialize the winner */
|
||||
if (NULL != best_module) {
|
||||
if (OPAL_SUCCESS != orte_errmgr.init()) {
|
||||
exit_status = OPAL_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
@ -455,7 +455,7 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
|
||||
/*
|
||||
* Pass to the predicted fault function to see how they would like to progress
|
||||
*/
|
||||
orte_errmgr_base_predicted_fault(proc_list, node_list, suggested_map_list);
|
||||
orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list);
|
||||
}
|
||||
/*
|
||||
* Unknown command
|
||||
|
@ -42,7 +42,6 @@ BEGIN_C_DECLS
|
||||
/* define a struct to hold framework-global values */
|
||||
typedef struct {
|
||||
int output;
|
||||
opal_pointer_array_t modules;
|
||||
bool initialized;
|
||||
} orte_errmgr_base_t;
|
||||
|
||||
@ -61,29 +60,11 @@ typedef uint8_t orte_errmgr_cmd_flag_t;
|
||||
*/
|
||||
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
# endif
|
||||
;
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
|
||||
|
||||
/*
|
||||
* Additional External API function declared in errmgr.h
|
||||
*/
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
@ -1,38 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-errmgr-crmig.txt
|
||||
|
||||
sources = \
|
||||
errmgr_crmig.h \
|
||||
errmgr_crmig_component.c \
|
||||
errmgr_crmig_module.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_crmig_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_crmig.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_crmig.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_crmig_la_SOURCES = $(sources)
|
||||
mca_errmgr_crmig_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_crmig_la_SOURCES = $(sources)
|
||||
libmca_errmgr_crmig_la_LDFLAGS = -module -avoid-version
|
@ -1,20 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_errmgr_crmig_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_errmgr_crmig_CONFIG],[
|
||||
# If we don't want FT, don't compile this component
|
||||
AS_IF([test "$opal_want_ft_cr" = "1"],
|
||||
[$1],
|
||||
[$2])
|
||||
])dnl
|
@ -1,14 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_INIT_FILE=errmgr_crmig_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,93 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Checkpoint/Restart Process Migration (CRMIG) ErrMgr component
|
||||
*
|
||||
* Simple, braindead implementation.
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_CRMIG_EXPORT_H
|
||||
#define MCA_ERRMGR_CRMIG_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/event/event.h"
|
||||
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
struct orte_errmgr_crmig_component_t {
|
||||
orte_errmgr_base_component_t super; /** Base Errmgr component */
|
||||
bool crmig_enabled;
|
||||
bool timing_enabled;
|
||||
};
|
||||
typedef struct orte_errmgr_crmig_component_t orte_errmgr_crmig_component_t;
|
||||
OPAL_MODULE_DECLSPEC extern orte_errmgr_crmig_component_t mca_errmgr_crmig_component;
|
||||
|
||||
int orte_errmgr_crmig_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
int orte_errmgr_crmig_global_module_init(void);
|
||||
int orte_errmgr_crmig_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
int orte_errmgr_crmig_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_crmig_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
int orte_errmgr_crmig_global_ft_event(int state);
|
||||
|
||||
/*
|
||||
* Module functions: Local
|
||||
*/
|
||||
int orte_errmgr_crmig_local_module_init(void);
|
||||
int orte_errmgr_crmig_local_module_finalize(void);
|
||||
|
||||
int orte_errmgr_crmig_local_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_crmig_local_ft_event(int state);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_CRMIG_EXPORT_H */
|
@ -1,142 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_crmig.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_crmig_component_version_string =
|
||||
"ORTE ERRMGR crmig MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_crmig_open(void);
|
||||
static int errmgr_crmig_close(void);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_crmig_component_t mca_errmgr_crmig_component = {
|
||||
/* First do the base component stuff */
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itself
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"crmig",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
errmgr_crmig_open,
|
||||
errmgr_crmig_close,
|
||||
orte_errmgr_crmig_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
40
|
||||
}
|
||||
};
|
||||
|
||||
static int errmgr_crmig_open(void)
|
||||
{
|
||||
int val;
|
||||
|
||||
/*
|
||||
* This should be the last componet to ever get used since
|
||||
* it doesn't do anything.
|
||||
*/
|
||||
mca_base_param_reg_int(&mca_errmgr_crmig_component.super.base_version,
|
||||
"priority",
|
||||
"Priority of the ERRMGR crmig component",
|
||||
false, false,
|
||||
mca_errmgr_crmig_component.super.priority,
|
||||
&mca_errmgr_crmig_component.super.priority);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_crmig_component.super.base_version,
|
||||
"verbose",
|
||||
"Verbose level for the ERRMGR crmig component",
|
||||
false, false,
|
||||
mca_errmgr_crmig_component.super.verbose,
|
||||
&mca_errmgr_crmig_component.super.verbose);
|
||||
/* If there is a custom verbose level for this component than use it
|
||||
* otherwise take our parents level and output channel
|
||||
*/
|
||||
if ( 0 != mca_errmgr_crmig_component.super.verbose) {
|
||||
mca_errmgr_crmig_component.super.output_handle = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_errmgr_crmig_component.super.output_handle,
|
||||
mca_errmgr_crmig_component.super.verbose);
|
||||
} else {
|
||||
mca_errmgr_crmig_component.super.output_handle = orte_errmgr_base.output;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_crmig_component.super.base_version,
|
||||
"timing",
|
||||
"Enable Process Migration timer",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_crmig_component.timing_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_crmig_component.super.base_version,
|
||||
"enable",
|
||||
"Enable Process Migration (Default: 0/off)",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_crmig_component.crmig_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: open()");
|
||||
opal_output_verbose(20, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: open: priority = %d",
|
||||
mca_errmgr_crmig_component.super.priority);
|
||||
opal_output_verbose(20, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: open: verbosity = %d",
|
||||
mca_errmgr_crmig_component.super.verbose);
|
||||
opal_output_verbose(20, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: open: Proc. Mig. = %s",
|
||||
(mca_errmgr_crmig_component.crmig_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: open: timing = %s",
|
||||
(mca_errmgr_crmig_component.timing_enabled ? "Enabled" : "Disabled"));
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_crmig_close(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: close()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
-*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for ORTE ErrMgr CRMig framework.
|
||||
#
|
||||
[migrating_job]
|
||||
Notice: A migration of this job has been requested.
|
||||
The processes below will be migrated.
|
||||
Please standby.
|
||||
%s
|
||||
[migrated_job]
|
||||
Notice: The processes have been successfully migrated to/from the specified
|
||||
machines.
|
||||
[no_migrating_procs]
|
||||
Warning: Could not find any processes to migrate on the nodes specified.
|
||||
You provided the following:
|
||||
Nodes: %s
|
||||
Procs: %s
|
@ -20,36 +20,16 @@
|
||||
*
|
||||
* The Open RTE Error and Recovery Manager (ErrMgr)
|
||||
*
|
||||
* This framework is a composite framework in which multiple components
|
||||
* are often active at the same time and may work on a single external call
|
||||
* to the interface functions.
|
||||
* This framework is the logically central clearing house for process/daemon
|
||||
* state updates. In particular when a process fails and another process detects
|
||||
* it, then that information is reported through this framework. This framework
|
||||
* then (depending on the active component) decides how to handle the failure.
|
||||
*
|
||||
* This framework allows the user to compose a job recovery policy from multiple
|
||||
* individual components. Each component will operate on the function call if it
|
||||
* has a registered function. If no component registers a function then the base
|
||||
* functionality/policy is used.
|
||||
*
|
||||
* For example, consider the 3 components on the left (C1, C2, C3), and the
|
||||
* API function calls across the top:
|
||||
* | Priority | Fn1 | Fn2 | Fn3 | Fn4 |
|
||||
* -----+----------+------+------+------+------+
|
||||
* base | --- | act0 | --- | --- | act6 |
|
||||
* C1 | 10 | act1 | --- | act2 | --- |
|
||||
* C2 | 20 | --- | act3 | --- | --- |
|
||||
* C3 | 30 | act4 | act5 | --- | --- |
|
||||
* -----+----------+------+------+------+------+
|
||||
* A call to Fn1 will result in:
|
||||
* act4, act1
|
||||
* A call to Fn2 will result in:
|
||||
* act5, act3
|
||||
* A call to Fn3 will result in:
|
||||
* act2
|
||||
* A call to Fn4 will result in:
|
||||
* act6
|
||||
*
|
||||
* Notice that when the base function is overridden it is not called. The base
|
||||
* function is only called when the function has not been overridden by a
|
||||
* component.
|
||||
* For example, if a process fails this may activate an automatic recovery
|
||||
* of the process from a previous checkpoint, or initial state. Conversely,
|
||||
* the active component could decide not to continue the job, and request that
|
||||
* it be terminated. The error and recovery policy is determined by individual
|
||||
* components within this framework.
|
||||
*
|
||||
*/
|
||||
|
||||
@ -76,8 +56,6 @@
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
/* type definition */
|
||||
typedef uint8_t orte_errmgr_stack_state_t;
|
||||
|
||||
/*
|
||||
* Structure to describe a predicted process fault.
|
||||
@ -159,12 +137,43 @@ OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t);
|
||||
OPAL_SOS_LOG(n); \
|
||||
}
|
||||
|
||||
/**** FRAMEWORK API FUNCTIONS ****/
|
||||
|
||||
/*
|
||||
* Framework Interfaces
|
||||
*/
|
||||
/**
|
||||
* Module initialization function.
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_init_fn_t)(void);
|
||||
|
||||
/**
|
||||
* Module finalization function.
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);
|
||||
|
||||
/**
|
||||
* This is not part of any module so it can be used at any time!
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_API_log_fn_t)(int error_code, char *filename, int line);
|
||||
typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);
|
||||
|
||||
/**
|
||||
* Alert - self aborting
|
||||
* This function is called when a process is aborting due to some internal error.
|
||||
* It will finalize the process
|
||||
* itself, and then exit - it takes no other actions. The intent here is to provide
|
||||
* a last-ditch exit procedure that attempts to clean up a little.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
|
||||
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
# endif
|
||||
;
|
||||
|
||||
/**
|
||||
* Alert - process aborted
|
||||
@ -180,16 +189,15 @@ typedef void (*orte_errmgr_base_API_log_fn_t)(int error_code, char *filename, in
|
||||
* @retval ORTE_SUCCESS Whatever action that was taken was successful
|
||||
* @retval ORTE_ERROR Appropriate error code
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_API_update_state_fn_t)(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
typedef int (*orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/**
|
||||
* Predicted process/node failure notification
|
||||
* Composite interface. Called in priority order.
|
||||
*
|
||||
* @param[in] proc_list List of processes (or NULL if none)
|
||||
* @param[in] node_list List of nodes (or NULL if none)
|
||||
@ -198,9 +206,9 @@ typedef int (*orte_errmgr_base_API_update_state_fn_t)(orte_jobid_t job,
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_API_predicted_fault_fn_t)(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
|
||||
/**
|
||||
* Suggest a node to map a restarting process onto
|
||||
@ -212,79 +220,9 @@ typedef int (*orte_errmgr_base_API_predicted_fault_fn_t)(opal_list_t *proc_list,
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_API_suggest_map_targets_fn_t)(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
|
||||
|
||||
/**
|
||||
* Alert - self aborting
|
||||
* This function is called when a process is aborting due to some internal error.
|
||||
* It will finalize the process
|
||||
* itself, and then exit - it takes no other actions. The intent here is to provide
|
||||
* a last-ditch exit procedure that attempts to clean up a little.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_API_abort_fn_t)(int error_code, char *fmt, ...)
|
||||
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
# endif
|
||||
;
|
||||
|
||||
/* global structure for accessing ERRMGR FRAMEWORK API's */
|
||||
typedef struct {
|
||||
orte_errmgr_base_API_log_fn_t log;
|
||||
orte_errmgr_base_API_update_state_fn_t update_state;
|
||||
orte_errmgr_base_API_predicted_fault_fn_t predicted_fault;
|
||||
orte_errmgr_base_API_suggest_map_targets_fn_t suggest_map_targets;
|
||||
orte_errmgr_base_API_abort_fn_t abort;
|
||||
|
||||
} orte_errmgr_API_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_API_t orte_errmgr;
|
||||
|
||||
|
||||
|
||||
|
||||
/**** INTERNAL MODULE FUNCTIONS ****/
|
||||
|
||||
/**
|
||||
* Module initialization function.
|
||||
* Public interface. Will be called in each of the active composite components
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_init_fn_t)
|
||||
(void);
|
||||
|
||||
/**
|
||||
* Module finalization function.
|
||||
* Public interface. Will be called in each of the active composite components
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_finalize_fn_t)
|
||||
(void);
|
||||
|
||||
/*
|
||||
* Internal Composite Interfaces corresponding to API interfaces
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
opal_list_t *node_list);
|
||||
|
||||
/**
|
||||
* Handle fault tolerance updates
|
||||
@ -294,8 +232,7 @@ typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *pro
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_ft_event_fn_t)(int state);
|
||||
|
||||
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
|
||||
|
||||
/*
|
||||
* Module Structure
|
||||
@ -306,7 +243,9 @@ struct orte_errmgr_base_module_2_3_0_t {
|
||||
/** Finalization Function */
|
||||
orte_errmgr_base_module_finalize_fn_t finalize;
|
||||
|
||||
/* -------------- Internal Composite Interfaces -- */
|
||||
orte_errmgr_base_module_log_fn_t log;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
|
||||
/** Actual process failure notification */
|
||||
orte_errmgr_base_module_update_state_fn_t update_state;
|
||||
/** Predicted process/node failure notification */
|
||||
@ -315,11 +254,11 @@ struct orte_errmgr_base_module_2_3_0_t {
|
||||
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
||||
|
||||
/** Handle any FT Notifications */
|
||||
orte_errmgr_base_ft_event_fn_t ft_event;
|
||||
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
||||
};
|
||||
|
||||
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
||||
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
|
||||
|
||||
/*
|
||||
* ErrMgr Component
|
||||
@ -340,7 +279,6 @@ struct orte_errmgr_base_component_3_0_0_t {
|
||||
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
|
||||
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
|
||||
|
||||
|
||||
/*
|
||||
* Macro for use in components that are of type errmgr
|
||||
*/
|
||||
|
@ -1,38 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-errmgr-example.txt
|
||||
|
||||
sources = \
|
||||
errmgr_example.h \
|
||||
errmgr_example_component.c \
|
||||
errmgr_example_module.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_example_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_example.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_example.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_example_la_SOURCES = $(sources)
|
||||
mca_errmgr_example_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_example_la_SOURCES = $(sources)
|
||||
libmca_errmgr_example_la_LDFLAGS = -module -avoid-version
|
@ -1,20 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_errmgr_example_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_errmgr_example_CONFIG],[
|
||||
# If we don't want FT, don't compile this component
|
||||
AS_IF([test "$opal_want_ft_cr" = "1"],
|
||||
[$1],
|
||||
[$2])
|
||||
])dnl
|
@ -1,14 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_INIT_FILE=errmgr_example_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,74 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Example ErrMgr component
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_EXAMPLE_EXPORT_H
|
||||
#define MCA_ERRMGR_EXAMPLE_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/event/event.h"
|
||||
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
struct orte_errmgr_example_component_t {
|
||||
orte_errmgr_base_component_t super; /** Base Errmgr component */
|
||||
};
|
||||
typedef struct orte_errmgr_example_component_t orte_errmgr_example_component_t;
|
||||
OPAL_MODULE_DECLSPEC extern orte_errmgr_example_component_t mca_errmgr_example_component;
|
||||
|
||||
int orte_errmgr_example_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
int orte_errmgr_example_global_module_init(void);
|
||||
int orte_errmgr_example_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_example_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
int orte_errmgr_example_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_example_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
int orte_errmgr_example_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
int orte_errmgr_example_global_ft_event(int state);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_EXAMPLE_EXPORT_H */
|
@ -1,120 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_example.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_example_component_version_string =
|
||||
"ORTE ERRMGR Example MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_example_open(void);
|
||||
static int errmgr_example_close(void);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_example_component_t mca_errmgr_example_component = {
|
||||
/* First do the base component stuff */
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itself
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"example",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
errmgr_example_open,
|
||||
errmgr_example_close,
|
||||
orte_errmgr_example_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
0
|
||||
}
|
||||
};
|
||||
|
||||
static int errmgr_example_open(void)
|
||||
{
|
||||
/*
|
||||
* This should be the last componet to ever get used since
|
||||
* it doesn't do anything.
|
||||
*/
|
||||
mca_base_param_reg_int(&mca_errmgr_example_component.super.base_version,
|
||||
"priority",
|
||||
"Priority of the ERRMGR example component",
|
||||
false, false,
|
||||
mca_errmgr_example_component.super.priority,
|
||||
&mca_errmgr_example_component.super.priority);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_example_component.super.base_version,
|
||||
"verbose",
|
||||
"Verbose level for the ERRMGR example component",
|
||||
false, false,
|
||||
mca_errmgr_example_component.super.verbose,
|
||||
&mca_errmgr_example_component.super.verbose);
|
||||
/* If there is a custom verbose level for this component than use it
|
||||
* otherwise take our parents level and output channel
|
||||
*/
|
||||
if ( 0 != mca_errmgr_example_component.super.verbose) {
|
||||
mca_errmgr_example_component.super.output_handle = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_errmgr_example_component.super.output_handle,
|
||||
mca_errmgr_example_component.super.verbose);
|
||||
} else {
|
||||
mca_errmgr_example_component.super.output_handle = orte_errmgr_base.output;
|
||||
}
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example: open()");
|
||||
opal_output_verbose(20, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example: open: priority = %d",
|
||||
mca_errmgr_example_component.super.priority);
|
||||
opal_output_verbose(20, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example: open: verbosity = %d",
|
||||
mca_errmgr_example_component.super.verbose);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_example_close(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example: close()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,187 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/sstore/sstore.h"
|
||||
#include "orte/mca/sstore/base/base.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_example.h"
|
||||
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
|
||||
|
||||
/******************
|
||||
* Example module
|
||||
******************/
|
||||
static orte_errmgr_base_module_t global_module = {
|
||||
/** Initialization Function */
|
||||
orte_errmgr_example_global_module_init,
|
||||
/** Finalization Function */
|
||||
orte_errmgr_example_global_module_finalize,
|
||||
/** Update State */
|
||||
orte_errmgr_example_global_update_state,
|
||||
orte_errmgr_example_global_predicted_fault,
|
||||
/*orte_errmgr_example_global_process_fault,*/
|
||||
orte_errmgr_example_global_suggest_map_targets,
|
||||
orte_errmgr_example_global_ft_event
|
||||
};
|
||||
|
||||
/************************************
|
||||
* Locally Global vars & functions
|
||||
************************************/
|
||||
|
||||
/************************
|
||||
* Function Definitions
|
||||
************************/
|
||||
/*
|
||||
* MCA Functions
|
||||
*/
|
||||
/*
 * Component query hook for the example ErrMgr component.
 *
 * When orte_enable_recovery is false the component reports itself as
 * unavailable (NULL module, priority -1).  Otherwise it reports the
 * component's configured priority and offers the module table only
 * when ORTE_PROC_IS_HNP holds.  Always returns ORTE_SUCCESS.
 */
int orte_errmgr_example_component_query(mca_base_module_t **module, int *priority)
{
    if (orte_enable_recovery) {
        opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
                            "errmgr:example:component_query()");

        *priority = mca_errmgr_example_component.super.priority;
        *module   = (ORTE_PROC_IS_HNP) ? (mca_base_module_t *)&global_module
                                       : NULL;
    } else {
        opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
                            "errmgr:example:component_query() - Disabled: Recovery is not enabled");

        *priority = -1;
        *module   = NULL;
    }

    return ORTE_SUCCESS;
}
|
||||
|
||||
/************************
|
||||
* Function Definitions
|
||||
************************/
|
||||
int orte_errmgr_example_global_module_init(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example:init()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_example_global_module_finalize(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example:finalize()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_example_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example:predicted_fault()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_example_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example:update_state(%s)",
|
||||
ORTE_NAME_PRINT(proc_name));
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_example_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example:process_fault(%s)",
|
||||
ORTE_NAME_PRINT(proc_name));
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_example_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_example_component.super.output_handle,
|
||||
"errmgr:example:suggest_map_targets()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_example_global_ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
@ -1,14 +0,0 @@
|
||||
-*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for ORTE ErrMgr Example framework.
|
||||
#
|
@ -14,7 +14,9 @@ dist_pkgdata_DATA = help-orte-errmgr-hnp.txt
|
||||
sources = \
|
||||
errmgr_hnp.h \
|
||||
errmgr_hnp_component.c \
|
||||
errmgr_hnp.c
|
||||
errmgr_hnp.c \
|
||||
errmgr_hnp_autor.c \
|
||||
errmgr_hnp_crmig.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
|
@ -51,83 +51,305 @@
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_hnp.h"
|
||||
|
||||
/* Local functions */
|
||||
/**********************
|
||||
* C/R Mgr Components
|
||||
* Global: HNP
|
||||
**********************/
|
||||
static orte_errmgr_base_module_t global_module = {
|
||||
/** Initialization Function */
|
||||
orte_errmgr_hnp_global_module_init,
|
||||
/** Finalization Function */
|
||||
orte_errmgr_hnp_global_module_finalize,
|
||||
/** Error Log */
|
||||
orte_errmgr_base_log,
|
||||
/** Forced Abort */
|
||||
orte_errmgr_base_abort,
|
||||
/** Update State */
|
||||
orte_errmgr_hnp_global_update_state,
|
||||
/* Predicted Fault */
|
||||
orte_errmgr_hnp_global_predicted_fault,
|
||||
/* Suggest proc to node mapping */
|
||||
orte_errmgr_hnp_global_suggest_map_targets,
|
||||
/* FT Event hook */
|
||||
orte_errmgr_hnp_global_ft_event
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code);
|
||||
static void failed_start(orte_job_t *jdata);
|
||||
static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code);
|
||||
static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
static void check_job_complete(orte_job_t *jdata);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
|
||||
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code);
|
||||
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
|
||||
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code);
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
|
||||
|
||||
/******************
|
||||
* HNP module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_hnp_module = {
|
||||
init,
|
||||
finalize,
|
||||
update_state,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
ft_event
|
||||
};
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
/*
 * Component query hook for the HNP ErrMgr component.
 *
 * Offers the module table with the component's configured priority
 * when ORTE_PROC_IS_HNP holds; otherwise reports a NULL module with
 * priority -1.  Always returns ORTE_SUCCESS.
 */
int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
{
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp:component_query()");

    /* Daemons and Apps have their own components */
    if (!(ORTE_PROC_IS_HNP)) {
        *module   = NULL;
        *priority = -1;
        return ORTE_SUCCESS;
    }

    *priority = mca_errmgr_hnp_component.super.priority;
    *module   = (mca_base_module_t *)&global_module;
    return ORTE_SUCCESS;
}
|
||||
|
||||
/*******************
|
||||
* Global Functions
|
||||
********************/
|
||||
int orte_errmgr_hnp_global_module_init(void)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
if( mca_errmgr_hnp_component.crmig_enabled ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_module_init()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
if( mca_errmgr_hnp_component.autor_enabled ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_module_init()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_init()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
int orte_errmgr_hnp_global_module_finalize(void)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
if( mca_errmgr_hnp_component.crmig_enabled ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_module_finalize()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
if( mca_errmgr_hnp_component.autor_enabled ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_module_finalize()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_finalize()) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
int orte_errmgr_hnp_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
mca_errmgr_hnp_component.ignore_current_update = false;
|
||||
|
||||
if (orte_finalizing ||
|
||||
orte_job_term_ordered ||
|
||||
ORTE_PROC_STATE_TERMINATED == state ) {
|
||||
mca_errmgr_hnp_component.term_in_progress = true;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:hnp:update_state() %s) "
|
||||
"------- %s state updated for process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
((NULL == proc_name) ? "App. Process" :
|
||||
(proc_name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name)));
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
if( mca_errmgr_hnp_component.crmig_enabled &&
|
||||
!mca_errmgr_hnp_component.autor_in_progress) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_update_state(job,
|
||||
jobstate,
|
||||
proc_name,
|
||||
state,
|
||||
pid,
|
||||
exit_code)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
if( mca_errmgr_hnp_component.autor_enabled &&
|
||||
!mca_errmgr_hnp_component.crmig_in_progress) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_update_state(job,
|
||||
jobstate,
|
||||
proc_name,
|
||||
state,
|
||||
pid,
|
||||
exit_code)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
if( !mca_errmgr_hnp_component.ignore_current_update ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_update_state(job,
|
||||
jobstate,
|
||||
proc_name,
|
||||
state,
|
||||
pid,
|
||||
exit_code)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
/*
 * Forward a predicted-fault notification to the crmig handler.
 *
 * Returns the crmig handler's return code when built with
 * OPAL_ENABLE_FT_CR and crmig is enabled; otherwise returns
 * ORTE_ERR_NOT_IMPLEMENTED.
 */
int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list,
                                           opal_list_t *node_list,
                                           opal_list_t *suggested_map)
{
#if OPAL_ENABLE_FT_CR
    if (!mca_errmgr_hnp_component.crmig_enabled) {
        return ORTE_ERR_NOT_IMPLEMENTED;
    }

    return orte_errmgr_hnp_crmig_global_predicted_fault(proc_list,
                                                        node_list,
                                                        suggested_map);
#else
    return ORTE_ERR_NOT_IMPLEMENTED;
#endif /* OPAL_ENABLE_FT_CR */
}
|
||||
|
||||
int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
int ret, exit_status = ORTE_ERR_NOT_IMPLEMENTED;
|
||||
|
||||
if( mca_errmgr_hnp_component.crmig_enabled &&
|
||||
!mca_errmgr_hnp_component.autor_in_progress ) {
|
||||
exit_status = ORTE_SUCCESS;
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_suggest_map_targets(proc,
|
||||
oldnode,
|
||||
node_list)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
if( mca_errmgr_hnp_component.autor_enabled &&
|
||||
!mca_errmgr_hnp_component.crmig_in_progress ) {
|
||||
exit_status = ORTE_SUCCESS;
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_suggest_map_targets(proc,
|
||||
oldnode,
|
||||
node_list)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
#else
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
}
|
||||
|
||||
int orte_errmgr_hnp_global_ft_event(int state)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
if( !mca_errmgr_hnp_component.crmig_enabled ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_ft_event(state)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
if( !mca_errmgr_hnp_component.autor_enabled ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_ft_event(state)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_ft_event(state)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
|
||||
/**********************
|
||||
* From HNP
|
||||
**********************/
|
||||
int orte_errmgr_hnp_base_global_init(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
int orte_errmgr_hnp_base_global_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
orte_exit_code_t sts;
|
||||
@ -136,9 +358,6 @@ static int update_state(orte_jobid_t job,
|
||||
orte_app_context_t *app;
|
||||
orte_proc_t *pdat;
|
||||
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp: job %s reported state %s"
|
||||
" for proc %s state %s pid %d exit_code %d",
|
||||
@ -148,18 +367,6 @@ static int update_state(orte_jobid_t job,
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), pid, exit_code));
|
||||
|
||||
/********************************
|
||||
* If the modules before us recovered from this error, then do not abort.
|
||||
********************************/
|
||||
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (*stack_state)) ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:hnp:update_proc() %s) "
|
||||
"------- A previous component successfully recovered from the process fault of %s! Continuing...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
@ -340,7 +547,7 @@ static int update_state(orte_jobid_t job,
|
||||
case ORTE_PROC_STATE_ABORTED:
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jdata->enable_recovery) {
|
||||
if( jdata->enable_recovery ) {
|
||||
/* is this a local proc */
|
||||
if (NULL != (child = proc_is_local(proc))) {
|
||||
/* local proc - see if it has reached its local restart limit */
|
||||
@ -371,7 +578,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* guess not - let it fall thru to abort */
|
||||
}
|
||||
}
|
||||
update_proc(jdata, proc, state, pid, exit_code);
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
check_job_complete(jdata); /* need to set the job state */
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
@ -384,7 +591,7 @@ static int update_state(orte_jobid_t job,
|
||||
|
||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||
update_proc(jdata, proc, state, pid, exit_code);
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
check_job_complete(jdata);
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
@ -397,22 +604,22 @@ static int update_state(orte_jobid_t job,
|
||||
|
||||
case ORTE_PROC_STATE_REGISTERED:
|
||||
case ORTE_PROC_STATE_RUNNING:
|
||||
update_proc(jdata, proc, state, pid, exit_code);
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_LAUNCHED:
|
||||
/* record the pid for this child */
|
||||
update_proc(jdata, proc, state, pid, exit_code);
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERMINATED:
|
||||
case ORTE_PROC_STATE_KILLED_BY_CMD:
|
||||
update_proc(jdata, proc, state, pid, exit_code);
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
update_proc(jdata, proc, state, pid, exit_code);
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
check_job_complete(jdata); /* need to set the job state */
|
||||
/* the job object for this job will have been NULL'd
|
||||
@ -423,7 +630,7 @@ static int update_state(orte_jobid_t job,
|
||||
hnp_abort(jdata->jobid, exit_code);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case ORTE_PROC_STATE_COMM_FAILED:
|
||||
/* is this to a daemon? */
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
@ -442,7 +649,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* update daemon job */
|
||||
record_dead_daemon(jdata, proc->vpid, state, 0);
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
|
||||
/* check for complete */
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
@ -457,7 +664,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* update daemon job */
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* check for complete */
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
@ -468,7 +675,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* purge the oob */
|
||||
orte_rml.purge(proc);
|
||||
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
|
||||
if( orte_enable_recovery ) {
|
||||
/* relocate its processes */
|
||||
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
|
||||
/* unable to relocate for some reason */
|
||||
@ -493,7 +700,7 @@ static int update_state(orte_jobid_t job,
|
||||
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
|
||||
}
|
||||
/* remove this proc from the daemon job */
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
/* kill all jobs */
|
||||
@ -506,10 +713,10 @@ static int update_state(orte_jobid_t job,
|
||||
|
||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||
/* heartbeats are only from daemons */
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
|
||||
if( orte_enable_recovery ) {
|
||||
/* relocate its processes */
|
||||
} else {
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
/* kill all jobs */
|
||||
@ -525,23 +732,7 @@ static int update_state(orte_jobid_t job,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
int ft_event(int state)
|
||||
int orte_errmgr_hnp_base_global_ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -697,11 +888,11 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
|
||||
}
|
||||
}
|
||||
|
||||
static void update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
opal_list_item_t *item, *next;
|
||||
orte_odls_child_t *child;
|
||||
@ -1230,7 +1421,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
/* remove this proc from the daemon job */
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* check to see if any other nodes are "alive" */
|
||||
if (!orte_hnp_is_allocated && jdata->num_procs == 1) {
|
||||
return ORTE_ERR_FATAL;
|
||||
@ -1355,8 +1546,10 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code)
|
||||
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
|
||||
orte_vpid_t vpid,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_job_t *jdt;
|
||||
orte_proc_t *pdat;
|
||||
@ -1387,8 +1580,21 @@ static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
|
||||
}
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) {
|
||||
/* major problem */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
/* It is possible that the process job finishes before the daemons.
|
||||
* In that case the process state is set to normal termination, and
|
||||
* the job data has already been cleared. So no need to throw an
|
||||
* error.
|
||||
*/
|
||||
if( ORTE_PROC_STATE_TERMINATED != pdat->state ) {
|
||||
opal_output(0,
|
||||
"%s Error: Failed to find job_data for proc %s (%s) on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pdat->name),
|
||||
orte_proc_state_to_str(pdat->state),
|
||||
node->name );
|
||||
/* major problem */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
pdat->state = ORTE_PROC_STATE_ABORTED;
|
||||
|
@ -25,10 +25,108 @@ BEGIN_C_DECLS
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
struct orte_errmgr_hnp_component_t {
|
||||
orte_errmgr_base_component_t super; /** Base Errmgr component */
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_hnp_component;
|
||||
bool ignore_current_update;
|
||||
bool term_in_progress;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_hnp_module;
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
/* State of the Recovery */
|
||||
bool crmig_in_progress;
|
||||
bool autor_in_progress;
|
||||
|
||||
/* CRMig Options */
|
||||
bool crmig_enabled;
|
||||
bool crmig_timing_enabled;
|
||||
|
||||
/* AutoR Options */
|
||||
bool autor_enabled;
|
||||
bool autor_timing_enabled;
|
||||
int autor_recovery_delay;
|
||||
bool autor_skip_oldnode;
|
||||
#endif
|
||||
};
|
||||
typedef struct orte_errmgr_hnp_component_t orte_errmgr_hnp_component_t;
|
||||
OPAL_MODULE_DECLSPEC extern orte_errmgr_hnp_component_t mca_errmgr_hnp_component;
|
||||
|
||||
int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
|
||||
orte_vpid_t vpid,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/***************************
|
||||
* Module functions: Global
|
||||
***************************/
|
||||
int orte_errmgr_hnp_global_module_init(void);
|
||||
int orte_errmgr_hnp_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_hnp_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnp_global_ft_event(int state);
|
||||
|
||||
/* HNP Versions */
|
||||
int orte_errmgr_hnp_base_global_init(void);
|
||||
int orte_errmgr_hnp_base_global_finalize(void);
|
||||
int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
int orte_errmgr_hnp_base_global_ft_event(int state);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
/* CRMig Versions */
|
||||
int orte_errmgr_hnp_crmig_global_module_init(void);
|
||||
int orte_errmgr_hnp_crmig_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnp_crmig_global_ft_event(int state);
|
||||
|
||||
/* AutoR Versions */
|
||||
int orte_errmgr_hnp_autor_global_module_init(void);
|
||||
int orte_errmgr_hnp_autor_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnp_autor_global_ft_event(int state);
|
||||
#endif
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -55,40 +55,11 @@
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_autor.h"
|
||||
#include "errmgr_hnp.h"
|
||||
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
|
||||
|
||||
/******************
|
||||
* Automatic Recovery module
|
||||
******************/
|
||||
static orte_errmgr_base_module_t global_module = {
|
||||
/** Initialization Function */
|
||||
orte_errmgr_autor_global_module_init,
|
||||
/** Finalization Function */
|
||||
orte_errmgr_autor_global_module_finalize,
|
||||
/** Update State */
|
||||
orte_errmgr_autor_global_update_state,
|
||||
NULL, /** predicted_fault */
|
||||
/*orte_errmgr_autor_global_process_fault,*/
|
||||
orte_errmgr_autor_global_suggest_map_targets,
|
||||
orte_errmgr_autor_global_ft_event
|
||||
};
|
||||
|
||||
static orte_errmgr_base_module_t local_module = {
|
||||
/** Initialization Function */
|
||||
orte_errmgr_autor_local_module_init,
|
||||
/** Finalization Function */
|
||||
orte_errmgr_autor_local_module_finalize,
|
||||
/** Update State */
|
||||
orte_errmgr_autor_local_update_state,
|
||||
NULL, /** predicted_fault */
|
||||
/*orte_errmgr_autor_local_process_fault,*/
|
||||
NULL, /* suggest_map_targets */
|
||||
orte_errmgr_autor_local_ft_event
|
||||
};
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
/************************
|
||||
* Work Pool structures
|
||||
************************/
|
||||
@ -132,22 +103,20 @@ static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name
|
||||
static int display_procs(void );
|
||||
static int autor_procs_sort_compare_fn(opal_list_item_t **a,
|
||||
opal_list_item_t **b);
|
||||
|
||||
static int orte_errmgr_hnp_autor_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state);
|
||||
static void errmgr_autor_process_fault_app(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
orte_proc_state_t state);
|
||||
static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
orte_proc_state_t state);
|
||||
|
||||
static int check_if_terminated(opal_pointer_array_t *procs);
|
||||
static int check_if_restarted(opal_pointer_array_t *procs);
|
||||
|
||||
static void update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/*
|
||||
* Timer stuff
|
||||
*/
|
||||
@ -167,76 +136,34 @@ static double timer_start[OPAL_CR_TIMER_MAX];
|
||||
#define ERRMGR_AUTOR_TIMER_FINISH 5
|
||||
#define ERRMGR_AUTOR_TIMER_MAX 6
|
||||
|
||||
#define ERRMGR_AUTOR_CLEAR_TIMERS() \
|
||||
#define ERRMGR_AUTOR_CLEAR_TIMERS() \
|
||||
{ \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_autor_component.timing_enabled > 0)) { \
|
||||
errmgr_autor_clear_timers(); \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \
|
||||
errmgr_autor_clear_timers(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define ERRMGR_AUTOR_SET_TIMER(idx) \
|
||||
#define ERRMGR_AUTOR_SET_TIMER(idx) \
|
||||
{ \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_autor_component.timing_enabled > 0)) { \
|
||||
errmgr_autor_set_time(idx); \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \
|
||||
errmgr_autor_set_time(idx); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define ERRMGR_AUTOR_DISPLAY_ALL_TIMERS() \
|
||||
#define ERRMGR_AUTOR_DISPLAY_ALL_TIMERS() \
|
||||
{ \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_autor_component.timing_enabled > 0)) { \
|
||||
errmgr_autor_display_all_timers(); \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \
|
||||
errmgr_autor_display_all_timers(); \
|
||||
} \
|
||||
}
|
||||
|
||||
/************************
|
||||
* Function Definitions
|
||||
************************/
|
||||
/*
|
||||
* MCA Functions
|
||||
*/
|
||||
/**
 * MCA query entry point for the autor component.
 *
 * The component declines selection (priority -1, NULL module) when either
 * orte_enable_recovery is false or the component's autor_enabled flag is
 * false.  Otherwise it reports the component priority and hands back the
 * global module on the HNP, the local module on a daemon, and NULL for
 * any other process type.
 *
 * @param module    [out] selected module table, or NULL
 * @param priority  [out] selection priority, or -1 when disabled
 * @return ORTE_SUCCESS on every path
 */
int orte_errmgr_autor_component_query(mca_base_module_t **module, int *priority)
{
    if (!(orte_enable_recovery)) {
        opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                            "errmgr:autor:component_query() - Disabled: Recovery is not enabled");
        goto disabled;
    }

    if (!mca_errmgr_autor_component.autor_enabled) {
        opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                            "errmgr:autor: component_query() - Disabled: C/R Automatic Recovery "
                            "is not enabled via errmgr_autor_enable MCA parameter.");
        goto disabled;
    }

    opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
                        "errmgr:autor:component_query()");

    *priority = mca_errmgr_autor_component.super.priority;

    /* Pick the module table that matches the role of this process. */
    if (ORTE_PROC_IS_HNP) {
        *module = (mca_base_module_t *)&global_module;
    } else if (ORTE_PROC_IS_DAEMON) {
        *module = (mca_base_module_t *)&local_module;
    } else {
        *module = NULL;
    }
    return ORTE_SUCCESS;

disabled:
    /* Shared exit for both "not enabled" cases: decline, but do not fail. */
    *priority = -1;
    *module = NULL;
    return ORTE_SUCCESS;
}
|
||||
|
||||
/************************
|
||||
* Function Definitions: Global
|
||||
************************/
|
||||
int orte_errmgr_autor_global_module_init(void)
|
||||
int orte_errmgr_hnp_autor_global_module_init(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:init()");
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(autor):init()");
|
||||
|
||||
procs_pending_recovery = OBJ_NEW(opal_list_t);
|
||||
autor_timer_event = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
@ -249,10 +176,10 @@ int orte_errmgr_autor_global_module_init(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_autor_global_module_finalize(void)
|
||||
int orte_errmgr_hnp_autor_global_module_finalize(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:finalize()");
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(autor):finalize()");
|
||||
|
||||
if( NULL != procs_pending_recovery ) {
|
||||
OBJ_RELEASE(procs_pending_recovery);
|
||||
@ -313,20 +240,19 @@ static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name
|
||||
}
|
||||
|
||||
if( NULL == current_global_jobdata ) {
|
||||
opal_output(0, "errmgr:autor:process_fault(): Global) Error: Cannot find the jdata for the current job.");
|
||||
opal_output(0, "errmgr:hnp(autor):process_fault(): Global) Error: Cannot find the jdata for the current job.");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_autor_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_proc_t *loc_proc = NULL;
|
||||
orte_job_t *jdata = NULL;
|
||||
@ -336,30 +262,20 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
if( mca_errmgr_hnp_component.term_in_progress ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if( NULL != proc_name &&
|
||||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:autor: Update reported on self (%s), state %s. Skip...",
|
||||
"%s errmgr:hnp(autor): Update reported on self (%s), state %s. Skip...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc_name),
|
||||
orte_proc_state_to_str(state) ));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:autor: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d (%c)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
|
||||
orte_proc_state_to_str(state), exit_code,
|
||||
(orte_finalizing ? 'T' : 'F')));
|
||||
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
ret = ORTE_ERROR;
|
||||
@ -369,15 +285,27 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
|
||||
}
|
||||
|
||||
/*
|
||||
* If this job opt'ed not to be recovered, then skip
|
||||
* If this is a tool, ignore
|
||||
*/
|
||||
if( !(jdata->enable_recovery) ) {
|
||||
if( jdata->num_apps == 0 &&
|
||||
OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp(autor): An external tool disconnected. Ignore...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( ORTE_JOB_STATE_RESTART == jobstate ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp(autor): job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
|
||||
orte_proc_state_to_str(state), exit_code));
|
||||
|
||||
if( ORTE_JOB_STATE_RESTART == jobstate ) {
|
||||
for(i = 0; i < jdata->procs->size; ++i) {
|
||||
if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
@ -385,8 +313,7 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
|
||||
break;
|
||||
}
|
||||
|
||||
/*state = ORTE_PROC_STATE_KILLED_BY_CMD;*/
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_autor_global_process_fault(jdata, &(loc_proc->name), state, stack_state)) ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -394,7 +321,7 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
|
||||
}
|
||||
else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state ||
|
||||
ORTE_PROC_STATE_COMM_FAILED == state ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_autor_global_process_fault(jdata, proc_name, state, stack_state)) ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, proc_name, state)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -402,9 +329,8 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
|
||||
}
|
||||
else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) {
|
||||
if( autor_mask_faults ) {
|
||||
update_proc(jdata, proc_name, state, exit_code);
|
||||
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
|
||||
mca_errmgr_hnp_component.ignore_current_update = true;
|
||||
orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code);
|
||||
}
|
||||
}
|
||||
|
||||
@ -412,10 +338,9 @@ int orte_errmgr_autor_global_update_state(orte_jobid_t job,
|
||||
return ret;
|
||||
}
|
||||
|
||||
int orte_errmgr_autor_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
static int orte_errmgr_hnp_autor_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@ -431,19 +356,18 @@ int orte_errmgr_autor_global_process_fault(orte_job_t *jdata,
|
||||
current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE;
|
||||
|
||||
if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) {
|
||||
errmgr_autor_process_fault_daemon(jdata, proc_name, state, stack_state);
|
||||
errmgr_autor_process_fault_daemon(jdata, proc_name, state);
|
||||
} else {
|
||||
update_proc(jdata, proc_name, state, 0);
|
||||
errmgr_autor_process_fault_app(jdata, proc_name, state, stack_state);
|
||||
orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, 0);
|
||||
errmgr_autor_process_fault_app(jdata, proc_name, state);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
opal_list_item_t *item = NULL;
|
||||
errmgr_autor_wp_item_t *wp_item = NULL;
|
||||
@ -463,7 +387,7 @@ int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
* Find this process in the known failures list
|
||||
*/
|
||||
found = false;
|
||||
if( mca_errmgr_autor_component.skip_oldnode ) {
|
||||
if( mca_errmgr_hnp_component.autor_skip_oldnode ) {
|
||||
for(item = opal_list_get_first(procs_pending_recovery);
|
||||
item != opal_list_get_end(procs_pending_recovery);
|
||||
item = opal_list_get_next(item) ) {
|
||||
@ -477,8 +401,8 @@ int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
"%s errmgr:autor: suggest_map() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"%s errmgr:hnp(autor): suggest_map() "
|
||||
"Process remapping: %s oldnode %s, %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name),
|
||||
@ -525,107 +449,47 @@ int orte_errmgr_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_autor_global_ft_event(int state)
|
||||
int orte_errmgr_hnp_autor_global_ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/************************
|
||||
* Function Definitions: Local
|
||||
************************/
|
||||
int orte_errmgr_autor_local_module_init(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:init() Local");
|
||||
|
||||
current_global_jobid = ORTE_JOBID_INVALID;
|
||||
current_global_jobdata = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_autor_local_module_finalize(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:finalize() Local");
|
||||
|
||||
current_global_jobid = ORTE_JOBID_INVALID;
|
||||
current_global_jobdata = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_autor_local_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
/*
|
||||
* If this component is enabled, then the global version takes care of
|
||||
* recovery policy. Tell lower layers in the ErrMgr stack -not- to recover
|
||||
* locally.
|
||||
*/
|
||||
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:autor: update_state() (Local) job state %s"
|
||||
" for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
|
||||
orte_proc_state_to_str(state) ));
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_autor_local_ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
||||
static void errmgr_autor_process_fault_app(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
orte_proc_state_t state)
|
||||
{
|
||||
errmgr_autor_wp_item_t *wp_item = NULL;
|
||||
struct timeval soon;
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
"%s errmgr:autor: process_fault() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"%s errmgr:hnp(autor): process_fault() "
|
||||
"Process fault! proc %s (0x%x)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
state));
|
||||
|
||||
if( !orte_sstore_base_is_checkpoint_available ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
"%s errmgr:autor: process_fault() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"%s errmgr:hnp(autor): process_fault() "
|
||||
"No checkpoints are available for this job! Cannot Automaticly Recover!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
opal_show_help("help-orte-errmgr-autor.txt", "failed_to_recover_proc", true,
|
||||
opal_show_help("help-orte-errmgr-hnp.txt", "autor_failed_to_recover_proc", true,
|
||||
ORTE_NAME_PRINT(proc), proc->vpid);
|
||||
return;
|
||||
}
|
||||
|
||||
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
|
||||
mca_errmgr_hnp_component.ignore_current_update = true;
|
||||
|
||||
/*
|
||||
* If we are already in the shutdown stage of the recovery, then just skip it
|
||||
*/
|
||||
if( autor_mask_faults ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
"%s errmgr:autor:process_fault() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"%s errmgr:hnp(autor):process_fault() "
|
||||
"Currently recovering the job. Failure masked!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return;
|
||||
@ -648,7 +512,7 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
|
||||
autor_timer_active = true;
|
||||
|
||||
opal_evtimer_set(autor_timer_event, errmgr_autor_recover_processes, NULL);
|
||||
soon.tv_sec = mca_errmgr_autor_component.recovery_delay;
|
||||
soon.tv_sec = mca_errmgr_hnp_component.autor_recovery_delay;
|
||||
soon.tv_usec = 0;
|
||||
opal_evtimer_add(autor_timer_event, &soon);
|
||||
}
|
||||
@ -658,15 +522,14 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
|
||||
|
||||
static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
orte_proc_state_t state)
|
||||
{
|
||||
orte_proc_t *loc_proc = NULL, *child_proc = NULL;
|
||||
orte_std_cntr_t i_proc;
|
||||
int32_t i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((15, mca_errmgr_autor_component.super.output_handle,
|
||||
"%s errmgr:autor: process_fault_daemon() "
|
||||
OPAL_OUTPUT_VERBOSE((15, mca_errmgr_hnp_component.super.output_handle,
|
||||
"%s errmgr:hnp(autor): process_fault_daemon() "
|
||||
"------- Daemon fault reported! proc %s (0x%x)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
@ -704,28 +567,36 @@ static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
|
||||
"------- Daemon lost with the following processes",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) {
|
||||
child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc);
|
||||
if( NULL == child_proc ) {
|
||||
continue;
|
||||
}
|
||||
for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) {
|
||||
child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc);
|
||||
if( NULL == child_proc ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"%s errmgr:base: stabalize_runtime() "
|
||||
"\t %s [0x%x]",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child_proc->name),
|
||||
child_proc->state));
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"%s errmgr:base: stabalize_runtime() "
|
||||
"\t %s [0x%x]",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child_proc->name),
|
||||
child_proc->state));
|
||||
|
||||
if( child_proc->last_errmgr_state < child_proc->state ) {
|
||||
child_proc->last_errmgr_state = child_proc->state;
|
||||
orte_errmgr.update_state(child_proc->name.jobid, ORTE_JOB_STATE_COMM_FAILED,
|
||||
&(child_proc->name), ORTE_PROC_STATE_COMM_FAILED,
|
||||
0, 1);
|
||||
/*orte_errmgr_base_proc_aborted(&child_proc->name, -1);*/
|
||||
}
|
||||
if( child_proc->last_errmgr_state < child_proc->state ) {
|
||||
child_proc->last_errmgr_state = child_proc->state;
|
||||
orte_errmgr.update_state(child_proc->name.jobid, ORTE_JOB_STATE_COMM_FAILED,
|
||||
&(child_proc->name), ORTE_PROC_STATE_COMM_FAILED,
|
||||
0, 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* This daemon had no children, so just mask the failure */
|
||||
mca_errmgr_hnp_component.ignore_current_update = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Record the dead daemon
|
||||
*/
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@ -772,7 +643,7 @@ static int display_procs(void )
|
||||
}
|
||||
}
|
||||
|
||||
opal_show_help("help-orte-errmgr-autor.txt", "recovering_job", true,
|
||||
opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovering_job", true,
|
||||
proc_str);
|
||||
|
||||
if( NULL != tmp_str ) {
|
||||
@ -824,8 +695,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
/*
|
||||
* Display the processes that are to be recovered
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
"%s errmgr:autor:recover() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"%s errmgr:hnp(autor):recover() "
|
||||
"------- Display known failed processes in the job %s -------",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid)));
|
||||
@ -836,8 +707,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
/*
|
||||
* Find the latest checkpoint
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
"%s errmgr:autor:recover() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"%s errmgr:hnp(autor):recover() "
|
||||
"------- Find the latest checkpoint for the job %s -------",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid)));
|
||||
@ -854,8 +725,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
/*
|
||||
* Safely terminate the entire job
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:recover() "
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(autor):recover() "
|
||||
"------- Safely terminate the job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
|
||||
@ -883,8 +754,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
|
||||
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM);
|
||||
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:recover() "
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(autor):recover() "
|
||||
"------- Done waiting for termination of job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
current_global_jobdata->num_terminated = current_global_jobdata->num_procs;
|
||||
@ -893,8 +764,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
/*
|
||||
* Construct the app contexts to restart
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
"%s errmgr:autor:recover() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"%s errmgr:hnp(autor):recover() "
|
||||
"------- Rebuild job %s app context -------",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid)));
|
||||
@ -912,7 +783,7 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\tAdjusted: \"%s\" [0x%d] [%s]\n",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));
|
||||
}
|
||||
@ -922,8 +793,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
/*
|
||||
* Spawn the restarted job
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:recover() "
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(autor):recover() "
|
||||
"------- Respawning the job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
orte_snapc_base_has_recovered = false;
|
||||
@ -933,8 +804,8 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
/*
|
||||
* Wait for all the processes to restart
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:recover() "
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(autor):recover() "
|
||||
"------- Waiting for restart -------");
|
||||
while(!check_if_restarted(current_global_jobdata->procs) ) {
|
||||
opal_progress();
|
||||
@ -949,12 +820,12 @@ static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
|
||||
opal_progress();
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_errmgr_autor_component.super.output_handle,
|
||||
"errmgr:autor:recover() "
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(autor):recover() "
|
||||
"------- Finished recovering job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
|
||||
opal_show_help("help-orte-errmgr-autor.txt", "recovery_complete", true);
|
||||
opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true);
|
||||
|
||||
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH);
|
||||
|
||||
@ -1002,7 +873,7 @@ static int check_if_terminated(opal_pointer_array_t *procs)
|
||||
}
|
||||
|
||||
if( !is_done ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t Still waiting for termination: \"%s\" [0x%x] < [0x%x]\n",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_UNTERMINATED));
|
||||
}
|
||||
@ -1034,7 +905,7 @@ static int check_if_restarted(opal_pointer_array_t *procs)
|
||||
}
|
||||
|
||||
if( !is_done ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_autor_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t Still waiting for restart: \"%s\" [0x%x] != [0x%x]\n",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING));
|
||||
}
|
||||
@ -1042,64 +913,6 @@ static int check_if_restarted(opal_pointer_array_t *procs)
|
||||
return is_done;
|
||||
}
|
||||
|
||||
static void update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
opal_list_item_t *item, *next;
|
||||
orte_odls_child_t *child;
|
||||
orte_proc_t *proct;
|
||||
int i;
|
||||
|
||||
/*** UPDATE LOCAL CHILD ***/
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = next) {
|
||||
next = opal_list_get_next(item);
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid) {
|
||||
if (child->name->vpid == proc->vpid) {
|
||||
child->state = state;
|
||||
child->exit_code = exit_code;
|
||||
proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid);
|
||||
proct->state = state;
|
||||
proct->exit_code = exit_code;
|
||||
/* (JJH: See note below)
|
||||
if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
||||
jdata->num_terminated++;
|
||||
}
|
||||
*/
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*** UPDATE REMOTE CHILD ***/
|
||||
for (i=0; i < jdata->procs->size; i++) {
|
||||
if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (proct->name.jobid != proc->jobid ||
|
||||
proct->name.vpid != proc->vpid) {
|
||||
continue;
|
||||
}
|
||||
proct->state = state;
|
||||
proct->exit_code = exit_code;
|
||||
if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
||||
/* JJH: Do not increment this value. Otherwise the 'hnp' component
|
||||
* will try to terminate us after we request the job to
|
||||
* termiante. So we fake it out by making sure that
|
||||
* num_terminated never equals num_procs.
|
||||
* There should be a better way though...
|
||||
*/
|
||||
/* update the counter so we can terminate */
|
||||
/*jdata->num_terminated++;*/
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/************************
|
||||
* Timing
|
||||
************************/
|
||||
@ -1192,3 +1005,5 @@ static void errmgr_autor_display_indv_timer_core(double diff, char *str)
|
||||
perc);
|
||||
return;
|
||||
}
|
||||
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
@ -13,71 +13,189 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_hnp.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_hnp_component_version_string =
|
||||
"ORTE ERRMGR hnp MCA component version " ORTE_VERSION;
|
||||
"ORTE ERRMGR Hnp MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_hnp_open(void);
|
||||
static int errmgr_hnp_close(void);
|
||||
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority);
|
||||
static int orte_errmgr_hnp_open(void);
|
||||
static int orte_errmgr_hnp_close(void);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_base_component_t mca_errmgr_hnp_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itself
|
||||
*/
|
||||
orte_errmgr_hnp_component_t mca_errmgr_hnp_component = {
|
||||
/* First do the base component stuff */
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"hnp",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component hnp
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"hnp",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
errmgr_hnp_open,
|
||||
errmgr_hnp_close,
|
||||
errmgr_hnp_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
/* Component open and close functions */
|
||||
orte_errmgr_hnp_open,
|
||||
orte_errmgr_hnp_close,
|
||||
orte_errmgr_hnp_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
50
|
||||
}
|
||||
};
|
||||
|
||||
static int errmgr_hnp_open(void)
|
||||
static int orte_errmgr_hnp_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
int val;
|
||||
|
||||
static int errmgr_hnp_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* keep our priority low so that other modules are higher
|
||||
* and will run before us
|
||||
*/
|
||||
*priority = 10;
|
||||
*module = (mca_base_module_t *)&orte_errmgr_hnp_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/*
|
||||
* This should be the last component to ever get used since
|
||||
* it doesn't do anything.
|
||||
*/
|
||||
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
|
||||
"priority",
|
||||
"Priority of the ERRMGR hnp component",
|
||||
false, false,
|
||||
mca_errmgr_hnp_component.super.priority,
|
||||
&mca_errmgr_hnp_component.super.priority);
|
||||
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
|
||||
"verbose",
|
||||
"Verbose level for the ERRMGR hnp component",
|
||||
false, false,
|
||||
mca_errmgr_hnp_component.super.verbose,
|
||||
&mca_errmgr_hnp_component.super.verbose);
|
||||
/* If there is a custom verbose level for this component then use it
|
||||
* otherwise take our parent's level and output channel
|
||||
*/
|
||||
if ( 0 != mca_errmgr_hnp_component.super.verbose) {
|
||||
mca_errmgr_hnp_component.super.output_handle = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_errmgr_hnp_component.super.output_handle,
|
||||
mca_errmgr_hnp_component.super.verbose);
|
||||
} else {
|
||||
mca_errmgr_hnp_component.super.output_handle = orte_errmgr_base.output;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
/****************************
|
||||
* CRMig (C/R Process Migration) MCA Options
|
||||
****************************/
|
||||
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
|
||||
"crmig_timing",
|
||||
"Enable Process Migration timer",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_hnp_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
|
||||
"crmig_enable",
|
||||
"Enable Process Migration (Default: 0/off)",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_hnp_component.crmig_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
/****************************
|
||||
* AutoR (Automatic Recovery) MCA Options
|
||||
****************************/
|
||||
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
|
||||
"autor_timing",
|
||||
"Enable Automatic Recovery timer",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_hnp_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
|
||||
"autor_enable",
|
||||
"Enable Automatic Recovery (Default: 0/off)",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_hnp_component.autor_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
|
||||
"recovery_delay",
|
||||
"Number of seconds to wait before starting to recover the job after a failure"
|
||||
" [Default: 1 sec]",
|
||||
false, false,
|
||||
1, &val);
|
||||
mca_errmgr_hnp_component.autor_recovery_delay = val;
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
|
||||
"skip_oldnode",
|
||||
"Skip the old node from failed proc, even if it is still available"
|
||||
" [Default: Enabled]",
|
||||
false, false,
|
||||
1, &val);
|
||||
mca_errmgr_hnp_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val);
|
||||
#else
|
||||
val = 0; /* Silence compiler warning */
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open()");
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: priority = %d",
|
||||
mca_errmgr_hnp_component.super.priority);
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: verbosity = %d",
|
||||
mca_errmgr_hnp_component.super.verbose);
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: --- CR Migration Options ---");
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: Process Migration = %s",
|
||||
(mca_errmgr_hnp_component.crmig_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: timing = %s",
|
||||
(mca_errmgr_hnp_component.crmig_timing_enabled ? "Enabled" : "Disabled"));
|
||||
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: --- Auto. Recovery Options ---");
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: Auto. Recover = %s",
|
||||
(mca_errmgr_hnp_component.autor_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: timing = %s",
|
||||
(mca_errmgr_hnp_component.autor_timing_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: open: recover_delay = %d",
|
||||
mca_errmgr_hnp_component.autor_recovery_delay);
|
||||
|
||||
mca_errmgr_hnp_component.crmig_in_progress = false;
|
||||
mca_errmgr_hnp_component.autor_in_progress = false;
|
||||
mca_errmgr_hnp_component.term_in_progress = false;
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_errmgr_hnp_close(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp: close()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -53,37 +53,11 @@
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_crmig.h"
|
||||
#include "errmgr_hnp.h"
|
||||
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
|
||||
/******************
|
||||
* Crmig module
|
||||
******************/
|
||||
static orte_errmgr_base_module_t global_module = {
|
||||
/** Initialization Function */
|
||||
orte_errmgr_crmig_global_module_init,
|
||||
/** Finalization Function */
|
||||
orte_errmgr_crmig_global_module_finalize,
|
||||
/** Update State */
|
||||
orte_errmgr_crmig_global_update_state,
|
||||
orte_errmgr_crmig_global_predicted_fault,
|
||||
/*orte_errmgr_crmig_global_process_fault,*/
|
||||
orte_errmgr_crmig_global_suggest_map_targets,
|
||||
orte_errmgr_crmig_global_ft_event
|
||||
};
|
||||
|
||||
static orte_errmgr_base_module_t local_module = {
|
||||
/** Initialization Function */
|
||||
orte_errmgr_crmig_local_module_init,
|
||||
/** Finalization Function */
|
||||
orte_errmgr_crmig_local_module_finalize,
|
||||
/** Update State */
|
||||
orte_errmgr_crmig_local_update_state,
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_crmig_local_ft_event
|
||||
};
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
|
||||
/************************************
|
||||
* Locally Global vars & functions :)
|
||||
@ -103,14 +77,15 @@ static int current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
|
||||
|
||||
static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_map);
|
||||
|
||||
static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state);
|
||||
static void errmgr_crmig_process_fault_app(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
orte_proc_state_t state);
|
||||
static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
orte_proc_state_t state);
|
||||
|
||||
static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs);
|
||||
static int check_if_terminated(opal_pointer_array_t *migrating_procs);
|
||||
@ -124,11 +99,6 @@ static void display_request(opal_list_t *off_procs,
|
||||
opal_list_t *off_nodes,
|
||||
orte_snapc_base_quiesce_t *cur_datum);
|
||||
|
||||
static void update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/*
|
||||
* Timer stuff
|
||||
*/
|
||||
@ -149,78 +119,36 @@ static double timer_start[OPAL_CR_TIMER_MAX];
|
||||
#define ERRMGR_CRMIG_TIMER_FINISH 6
|
||||
#define ERRMGR_CRMIG_TIMER_MAX 7
|
||||
|
||||
#define ERRMGR_CRMIG_CLEAR_TIMERS() \
|
||||
#define ERRMGR_CRMIG_CLEAR_TIMERS() \
|
||||
{ \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_crmig_component.timing_enabled > 0)) { \
|
||||
errmgr_crmig_clear_timers(); \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
|
||||
errmgr_crmig_clear_timers(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define ERRMGR_CRMIG_SET_TIMER(idx) \
|
||||
#define ERRMGR_CRMIG_SET_TIMER(idx) \
|
||||
{ \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_crmig_component.timing_enabled > 0)) { \
|
||||
errmgr_crmig_set_time(idx); \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
|
||||
errmgr_crmig_set_time(idx); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \
|
||||
#define ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \
|
||||
{ \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_crmig_component.timing_enabled > 0)) { \
|
||||
errmgr_crmig_display_all_timers(); \
|
||||
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
|
||||
errmgr_crmig_display_all_timers(); \
|
||||
} \
|
||||
}
|
||||
|
||||
/************************
|
||||
* Function Definitions
|
||||
************************/
|
||||
/*
|
||||
* MCA Functions
|
||||
*/
|
||||
int orte_errmgr_crmig_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
if( !(orte_enable_recovery) ) {
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: component_query() - Disabled: Recovery is not enabled");
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if( !mca_errmgr_crmig_component.crmig_enabled ) {
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: component_query() - Disabled: Process Migration "
|
||||
"is not enabled via errmgr_crmig_enable MCA parameter.");
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: component_query()");
|
||||
|
||||
*priority = mca_errmgr_crmig_component.super.priority;
|
||||
if( ORTE_PROC_IS_HNP ) {
|
||||
*module = (mca_base_module_t *)&global_module;
|
||||
}
|
||||
else if (ORTE_PROC_IS_DAEMON) {
|
||||
*module = (mca_base_module_t *)&local_module;
|
||||
}
|
||||
else {
|
||||
*module = NULL;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/************************
|
||||
* Function Definitions: Global
|
||||
************************/
|
||||
int orte_errmgr_crmig_global_module_init(void)
|
||||
int orte_errmgr_hnp_crmig_global_module_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: init()");
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig): init()");
|
||||
|
||||
migrating_underway = false;
|
||||
|
||||
@ -240,12 +168,12 @@ int orte_errmgr_crmig_global_module_init(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_global_module_finalize(void)
|
||||
int orte_errmgr_hnp_crmig_global_module_finalize(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: finalize()");
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig): finalize()");
|
||||
|
||||
/*
|
||||
* Finalize the connection to the orte-migrate tool
|
||||
@ -265,10 +193,9 @@ int orte_errmgr_crmig_global_module_finalize(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_job_t *jdata = NULL;
|
||||
@ -299,7 +226,7 @@ int orte_errmgr_crmig_global_predicted_fault(opal_list_t *proc_list,
|
||||
break;
|
||||
}
|
||||
if( NULL == current_global_jobdata ) {
|
||||
opal_output(0, "errmgr:crmig:predicted_fault(): Global) Error: Cannot find the jdata for the current job.");
|
||||
opal_output(0, "errmgr:hnp(crmig):predicted_fault(): Global) Error: Cannot find the jdata for the current job.");
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
@ -331,19 +258,18 @@ int orte_errmgr_crmig_global_predicted_fault(opal_list_t *proc_list,
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
opal_show_help("help-orte-errmgr-crmig.txt", "migrated_job", true);
|
||||
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true);
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_job_t *jdata = NULL;
|
||||
int ret = ORTE_SUCCESS;
|
||||
@ -351,19 +277,10 @@ int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
if( mca_errmgr_hnp_component.term_in_progress ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:crmig: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
|
||||
orte_proc_state_to_str(state), exit_code));
|
||||
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
ret = ORTE_ERROR;
|
||||
@ -371,9 +288,29 @@ int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is a tool, ignore
|
||||
*/
|
||||
if( jdata->num_apps == 0 &&
|
||||
OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp(crmig): An external tool disconnected. Ignore...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp(crmig): job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
|
||||
orte_proc_state_to_str(state), exit_code));
|
||||
|
||||
if( ORTE_PROC_STATE_ABORTED_BY_SIG == state ||
|
||||
ORTE_PROC_STATE_COMM_FAILED == state ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_crmig_global_process_fault(jdata, proc_name, state, stack_state)) ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_process_fault(jdata, proc_name, state)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
@ -381,50 +318,17 @@ int orte_errmgr_crmig_global_update_state(orte_jobid_t job,
|
||||
else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) {
|
||||
if( migrating_underway ) {
|
||||
/* If we are migrating, then we need to mask this to prevent the lower level from terminating us */
|
||||
update_proc(jdata, proc_name, state, exit_code);
|
||||
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
|
||||
mca_errmgr_hnp_component.ignore_current_update = true;
|
||||
orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code);
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
/*
|
||||
* JJH: Todo
|
||||
* The expected logic here is:
|
||||
* if( a daemon with children fails ) {
|
||||
* abort migration.
|
||||
* }
|
||||
* if( a daemon without children fails ) {
|
||||
* continue. No processes lost
|
||||
* }
|
||||
* if( an application process fails ) {
|
||||
* abort migration. Might be a bad checkpoint, or a process that we were
|
||||
* not migrating that died.
|
||||
* }
|
||||
* else {
|
||||
* continue;
|
||||
* }
|
||||
*/
|
||||
if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) {
|
||||
errmgr_crmig_process_fault_daemon(jdata, proc_name, state, stack_state);
|
||||
} else {
|
||||
errmgr_crmig_process_fault_app(jdata, proc_name, state, stack_state);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
int exit_status = ORTE_SUCCESS;
|
||||
opal_list_item_t *item = NULL, *m_item = NULL;
|
||||
@ -482,8 +386,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
OBJ_RELEASE(item);
|
||||
continue;
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------",
|
||||
ORTE_NAME_PRINT(&proc->name), oldnode->name,
|
||||
current_proc_map->pre_map_fixed_node, node->name));
|
||||
}
|
||||
@ -498,8 +402,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
* If 'off_current_node' then exclude current node
|
||||
*/
|
||||
if( current_proc_map->off_current_node ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Remove old node (info) [%15s : %10s] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Remove old node (info) [%15s : %10s] -------",
|
||||
ORTE_NAME_PRINT(&proc->name), oldnode->name));
|
||||
for( item = opal_list_get_first(node_list);
|
||||
item != opal_list_get_end(node_list);
|
||||
@ -537,8 +441,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
break;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------",
|
||||
ORTE_NAME_PRINT(&proc->name), ORTE_NAME_PRINT(&peer_proc->name),
|
||||
oldnode->name, current_proc_map->map_node_name));
|
||||
}
|
||||
@ -563,8 +467,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
OBJ_RELEASE(item);
|
||||
continue;
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------",
|
||||
ORTE_NAME_PRINT(&proc->name), oldnode->name,
|
||||
current_proc_map->map_node_name, node->name));
|
||||
}
|
||||
@ -578,8 +482,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
/*
|
||||
* Otherwise, map as if there were no exclusive mapping
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
|
||||
}
|
||||
/*
|
||||
@ -590,8 +494,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
* Remove the old node from the list, if there is more than 1 node available
|
||||
*/
|
||||
if(1 < opal_list_get_size(node_list) ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Remove old node [%15s : %10s] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Remove old node [%15s : %10s] -------",
|
||||
ORTE_NAME_PRINT(&proc->name), oldnode->name));
|
||||
for( item = opal_list_get_first(node_list);
|
||||
item != opal_list_get_end(node_list);
|
||||
@ -612,8 +516,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
* If we do not have any general suggestions, then just return
|
||||
*/
|
||||
if( opal_list_get_size(current_onto_mapping_general) <= 0 ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
@ -622,8 +526,8 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
/*
|
||||
* Otherwise look through the general suggestions as an include list
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Suggest a target for [%15s : 0x%x : %10s] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Suggest a target for [%15s : 0x%x : %10s] -------",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
|
||||
|
||||
num_suggested = 0;
|
||||
@ -653,87 +557,58 @@ int orte_errmgr_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
|
||||
++num_suggested;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------",
|
||||
num_suggested, ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name, node->name));
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------",
|
||||
(int)opal_list_get_size(node_list), ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_global_ft_event(int state)
|
||||
int orte_errmgr_hnp_crmig_global_ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/************************
|
||||
* Function Definitions: Global
|
||||
* Function Definitions: Static
|
||||
************************/
|
||||
int orte_errmgr_crmig_local_module_init(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: init() (Local)");
|
||||
|
||||
migrating_underway = false;
|
||||
current_global_jobid = ORTE_JOBID_INVALID;
|
||||
current_global_jobdata = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_local_module_finalize(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig: finalize() (Local)");
|
||||
|
||||
migrating_underway = false;
|
||||
current_global_jobid = ORTE_JOBID_INVALID;
|
||||
current_global_jobdata = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_local_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state)
|
||||
{
|
||||
/*
|
||||
* If this component is enabled, then the global version takes care of
|
||||
* recovery policy. Tell lower layers in the ErrMgr stack -not- to recover
|
||||
* locally.
|
||||
* JJH: Todo
|
||||
* The expected logic here is:
|
||||
* if( a daemon with children fails ) {
|
||||
* abort migration.
|
||||
* }
|
||||
* if( a daemon without children fails ) {
|
||||
* continue. No processes lost
|
||||
* }
|
||||
* if( an application process fails ) {
|
||||
* abort migration. Might be a bad checkpoint, or a process that we were
|
||||
* not migrating that died.
|
||||
* }
|
||||
* else {
|
||||
* continue;
|
||||
* }
|
||||
*/
|
||||
if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) {
|
||||
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_RECOVERED;
|
||||
if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) {
|
||||
errmgr_crmig_process_fault_daemon(jdata, proc_name, state);
|
||||
} else {
|
||||
errmgr_crmig_process_fault_app(jdata, proc_name, state);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:crmig: update_state() (Local) job state %s"
|
||||
" for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
|
||||
orte_proc_state_to_str(state) ));
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_crmig_local_ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_maps)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
@ -756,8 +631,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
ERRMGR_CRMIG_CLEAR_TIMERS();
|
||||
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_START);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Migrating (%3d, %3d, %3d) -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Migrating (%3d, %3d, %3d) -------",
|
||||
(int)opal_list_get_size(off_procs),
|
||||
(int)opal_list_get_size(off_nodes),
|
||||
(int)opal_list_get_size(onto_maps)));
|
||||
@ -836,13 +711,13 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
onto_map->proc_name.vpid == onto_map->map_proc_name.vpid ) &&
|
||||
(NULL == onto_map->map_node_name ||
|
||||
0 == strncmp(onto_map->map_node_name, proc->node->name, strlen(proc->node->name))) ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Process %15s does not wish to move -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Process %15s does not wish to move -------",
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Process %15s will be moved -------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Process %15s will be moved -------",
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
/*
|
||||
* Set the process to restarting
|
||||
@ -999,7 +874,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
}
|
||||
}
|
||||
|
||||
opal_show_help("help-orte-errmgr-crmig.txt", "no_migrating_procs", true,
|
||||
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_no_migrating_procs", true,
|
||||
err_str_nodes,
|
||||
err_str_procs);
|
||||
|
||||
@ -1042,12 +917,12 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Starting the checkpoint of job %s -------",
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Starting the checkpoint of job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.start_ckpt(cur_datum)) ) {
|
||||
opal_output(0, "errmgr:crmig:migrate() Error: Unable to start the checkpoint.");
|
||||
opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to start the checkpoint.");
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -1058,8 +933,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
/*
|
||||
* Terminate the migrating processes
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Terminate old processes in job %s -------",
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Terminate old processes in job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
|
||||
orte_plm.terminate_procs(&cur_datum->migrating_procs);
|
||||
@ -1068,8 +943,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
* Clear the IOF stdin target if necessary
|
||||
*/
|
||||
if( close_iof_stdin ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Closing old STDIN target for job %s (%s)-------",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Closing old STDIN target for job %s (%s)-------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid),
|
||||
ORTE_NAME_PRINT(&iof_name) ));
|
||||
|
||||
@ -1079,8 +954,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
/*
|
||||
* Wait for the processes to finish terminating
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Waiting for termination -------");
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Waiting for termination -------");
|
||||
|
||||
while( !migrating_terminated ) {
|
||||
opal_progress();
|
||||
@ -1092,8 +967,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
/*
|
||||
* Start remapping the processes
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Checkpoint finished, setting up job %s -------",
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Checkpoint finished, setting up job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
|
||||
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_STARTUP;
|
||||
@ -1126,7 +1001,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\tAdjusted: \"%s\" [0x%d] [%s]\n",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));
|
||||
}
|
||||
@ -1137,15 +1012,15 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
* Restart the job
|
||||
* - spawn function will remap and launch the replacement proc(s)
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Respawning migrating processes in job %s -------",
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Respawning migrating processes in job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
|
||||
orte_plm.spawn(current_global_jobdata);
|
||||
|
||||
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Waiting for restart -------");
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Waiting for restart -------");
|
||||
|
||||
migrating_restarted = false;
|
||||
while( !migrating_restarted ) {
|
||||
@ -1158,12 +1033,12 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
/*
|
||||
* Finish the checkpoint
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Reconnecting processes in job %s -------",
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Reconnecting processes in job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.end_ckpt(cur_datum)) ) {
|
||||
opal_output(0, "errmgr:crmig:migrate() Error: Unable to end the checkpoint.");
|
||||
opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to end the checkpoint.");
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -1172,8 +1047,8 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
/*
|
||||
* All done
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() ------- Finished migrating processes in job %s -------",
|
||||
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() ------- Finished migrating processes in job %s -------",
|
||||
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
|
||||
|
||||
OBJ_RELEASE(cur_datum);
|
||||
@ -1247,7 +1122,7 @@ static int check_if_terminated(opal_pointer_array_t *migrating_procs)
|
||||
migrating_terminated = true;
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t Still waiting for termination: \"%s\" [0x%x] != [0x%x]\n",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_KILLED_BY_CMD));
|
||||
}
|
||||
@ -1279,7 +1154,7 @@ static int check_if_restarted(opal_pointer_array_t *migrating_procs)
|
||||
migrating_restarted = true;
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\tStill waiting for restart: \"%s\" [0x%x] != [0x%x]\n",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING));
|
||||
}
|
||||
@ -1289,11 +1164,10 @@ static int check_if_restarted(opal_pointer_array_t *migrating_procs)
|
||||
|
||||
static void errmgr_crmig_process_fault_app(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
orte_proc_state_t state)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:process_fault_app() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):process_fault_app() "
|
||||
"------- Application fault reported! proc %s (0x%x) "
|
||||
"- %s",
|
||||
ORTE_NAME_PRINT(proc),
|
||||
@ -1305,11 +1179,10 @@ static void errmgr_crmig_process_fault_app(orte_job_t *jdata,
|
||||
|
||||
static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
orte_proc_state_t state)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:process_fault_daemon() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):process_fault_daemon() "
|
||||
"------- Daemon fault reported! proc %s (0x%x) "
|
||||
"- %s",
|
||||
ORTE_NAME_PRINT(proc),
|
||||
@ -1322,8 +1195,8 @@ static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata,
|
||||
* JJH: Check to make sure this is not a new daemon loss.
|
||||
*/
|
||||
if( ORTE_PROC_STATE_COMM_FAILED == state ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:process_fault_daemon() "
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):process_fault_daemon() "
|
||||
"------- Daemon fault reported! proc %s (0x%x) "
|
||||
"- Communication failure, keep going",
|
||||
ORTE_NAME_PRINT(proc),
|
||||
@ -1373,8 +1246,8 @@ static void display_request(opal_list_t *off_procs,
|
||||
/*
|
||||
* Display all requested processes to migrate
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() Requested Processes to migrate: (%d procs)\n",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() Requested Processes to migrate: (%d procs)\n",
|
||||
(int) opal_list_get_size(off_procs) ));
|
||||
for(item = opal_list_get_first(off_procs);
|
||||
item != opal_list_get_end(off_procs);
|
||||
@ -1396,7 +1269,7 @@ static void display_request(opal_list_t *off_procs,
|
||||
break;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t%s (Rank %3d) on node %s\n",
|
||||
ORTE_NAME_PRINT(&proc->name), (int)off_proc->proc_name.vpid, proc->node->name));
|
||||
}
|
||||
@ -1404,8 +1277,8 @@ static void display_request(opal_list_t *off_procs,
|
||||
/*
|
||||
* Display Off Nodes
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() Requested Nodes to migration: (%d nodes)\n",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() Requested Nodes to migration: (%d nodes)\n",
|
||||
(int)opal_list_get_size(off_nodes) ));
|
||||
|
||||
for(item = opal_list_get_first(off_nodes);
|
||||
@ -1426,7 +1299,7 @@ static void display_request(opal_list_t *off_procs,
|
||||
}
|
||||
}
|
||||
if( found ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t\"%s\" \t%d\n",
|
||||
node->name, node->num_procs));
|
||||
for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) {
|
||||
@ -1435,7 +1308,7 @@ static void display_request(opal_list_t *off_procs,
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t\t\"%s\" [0x%x]\n",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state));
|
||||
}
|
||||
@ -1445,26 +1318,26 @@ static void display_request(opal_list_t *off_procs,
|
||||
/*
|
||||
* Suggested onto nodes
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() Suggested nodes to migration onto: (%d nodes)\n",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() Suggested nodes to migration onto: (%d nodes)\n",
|
||||
(int)opal_list_get_size(current_onto_mapping_general) ));
|
||||
for(item = opal_list_get_first(current_onto_mapping_general);
|
||||
item != opal_list_get_end(current_onto_mapping_general);
|
||||
item = opal_list_get_next(item) ) {
|
||||
onto_map = (orte_errmgr_predicted_map_t*) item;
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t\"%s\"\n",
|
||||
onto_map->map_node_name));
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n",
|
||||
(int)opal_list_get_size(current_onto_mapping_exclusive) ));
|
||||
for(item = opal_list_get_first(current_onto_mapping_exclusive);
|
||||
item != opal_list_get_end(current_onto_mapping_exclusive);
|
||||
item = opal_list_get_next(item) ) {
|
||||
onto_map = (orte_errmgr_predicted_map_t*) item;
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t%d\t(%c)\t\"%s\"\n",
|
||||
onto_map->proc_name.vpid,
|
||||
(onto_map->off_current_node ? 'T' : 'F'),
|
||||
@ -1474,8 +1347,8 @@ static void display_request(opal_list_t *off_procs,
|
||||
/*
|
||||
* Display all processes scheduled to migrate
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
"errmgr:crmig:migrate() All Migrating Processes: (%d procs)\n",
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"errmgr:hnp(crmig):migrate() All Migrating Processes: (%d procs)\n",
|
||||
cur_datum->num_migrating));
|
||||
for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) {
|
||||
proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc);
|
||||
@ -1483,7 +1356,7 @@ static void display_request(opal_list_t *off_procs,
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_crmig_component.super.output_handle,
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
|
||||
"\t\"%s\" [0x%x] [%s]\n",
|
||||
ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));
|
||||
|
||||
@ -1504,7 +1377,7 @@ static void display_request(opal_list_t *off_procs,
|
||||
}
|
||||
}
|
||||
|
||||
opal_show_help("help-orte-errmgr-crmig.txt", "migrating_job", true,
|
||||
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrating_job", true,
|
||||
status_str);
|
||||
|
||||
if( NULL != tmp_str ) {
|
||||
@ -1520,64 +1393,6 @@ static void display_request(opal_list_t *off_procs,
|
||||
return;
|
||||
}
|
||||
|
||||
static void update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
opal_list_item_t *item, *next;
|
||||
orte_odls_child_t *child;
|
||||
orte_proc_t *proct;
|
||||
int i;
|
||||
|
||||
/*** UPDATE LOCAL CHILD ***/
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = next) {
|
||||
next = opal_list_get_next(item);
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid) {
|
||||
if (child->name->vpid == proc->vpid) {
|
||||
child->state = state;
|
||||
child->exit_code = exit_code;
|
||||
proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid);
|
||||
proct->state = state;
|
||||
proct->exit_code = exit_code;
|
||||
/* (JJH: See note below)
|
||||
if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
||||
jdata->num_terminated++;
|
||||
}
|
||||
*/
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*** UPDATE REMOTE CHILD ***/
|
||||
for (i=0; i < jdata->procs->size; i++) {
|
||||
if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (proct->name.jobid != proc->jobid ||
|
||||
proct->name.vpid != proc->vpid) {
|
||||
continue;
|
||||
}
|
||||
proct->state = state;
|
||||
proct->exit_code = exit_code;
|
||||
if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
||||
/* JJH: Do not increment this value. Otherwise the 'hnp' component
|
||||
* will try to terminate us after we request the job to
|
||||
* terminate. So we fake it out by making sure that
|
||||
* num_terminated never equals num_procs.
|
||||
* There should be a better way though...
|
||||
*/
|
||||
/* update the counter so we can terminate */
|
||||
/*jdata->num_terminated++;*/
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/************************
|
||||
* Timing
|
||||
************************/
|
||||
@ -1676,3 +1491,5 @@ static void errmgr_crmig_display_indv_timer_core(double diff, char *str)
|
||||
perc);
|
||||
return;
|
||||
}
|
||||
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
@ -40,3 +40,32 @@ Process: %s
|
||||
because the application for that process could not be found. This
|
||||
appears to be a system error. Please report it to the ORTE
|
||||
developers.
|
||||
|
||||
[autor_recovering_job]
|
||||
Notice: The processes listed below failed unexpectedly.
|
||||
Using the last checkpoint to recover the job.
|
||||
Please stand by.
|
||||
%s
|
||||
[autor_recovery_complete]
|
||||
Notice: The job has been successfully recovered from the
|
||||
last checkpoint.
|
||||
[autor_failed_to_recover_proc]
|
||||
Error: The process below has failed. There is no checkpoint available for
|
||||
this job, so we are terminating the application since automatic
|
||||
recovery cannot occur.
|
||||
Internal Name: %s
|
||||
MCW Rank: %d
|
||||
|
||||
[crmig_migrating_job]
|
||||
Notice: A migration of this job has been requested.
|
||||
The processes below will be migrated.
|
||||
Please stand by.
|
||||
%s
|
||||
[crmig_migrated_job]
|
||||
Notice: The processes have been successfully migrated to/from the specified
|
||||
machines.
|
||||
[crmig_no_migrating_procs]
|
||||
Warning: Could not find any processes to migrate on the nodes specified.
|
||||
You provided the following:
|
||||
Nodes: %s
|
||||
Procs: %s
|
||||
|
@ -64,21 +64,18 @@ static int finalize(void);
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
opal_list_t *suggested_map);
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
opal_list_t *node_list);
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
@ -90,6 +87,8 @@ static int ft_event(int state);
|
||||
orte_errmgr_base_module_t orte_errmgr_orted_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
update_state,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
@ -114,8 +113,7 @@ static int update_state(orte_jobid_t job,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
opal_list_item_t *item, *next;
|
||||
orte_odls_job_t *jobdat = NULL;
|
||||
@ -126,9 +124,6 @@ static int update_state(orte_jobid_t job,
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
orte_app_context_t *app;
|
||||
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
@ -315,8 +310,7 @@ static int update_state(orte_jobid_t job,
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
}
|
||||
app = jobdat->apps[child->app_idx];
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) &&
|
||||
jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
|
||||
if( jobdat->enable_recovery && child->restarts < app->max_local_restarts ) {
|
||||
child->restarts++;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted restarting proc %s for the %d time",
|
||||
@ -330,7 +324,7 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_TERMINATED < state) {
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jobdat->enable_recovery) {
|
||||
if( jobdat->enable_recovery ) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s RECOVERY ENABLED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -581,16 +575,14 @@ static int update_state(orte_jobid_t job,
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
opal_list_t *suggested_map)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
@ -600,9 +592,9 @@ int ft_event(int state)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
||||
static bool any_live_children(orte_jobid_t job)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user