1
1

ErrMgr Framework redesign to better support fault tolerance development activities.

Explained in more detail in the following RFC:
  http://www.open-mpi.org/community/lists/devel/2010/03/7589.php

This commit was SVN r22872.
Этот коммит содержится в:
Josh Hursey 2010-03-23 21:28:02 +00:00
родитель 0b9552cd4e
Коммит e4f2d03d28
44 изменённых файлов: 1989 добавлений и 708 удалений

5
NEWS
Просмотреть файл

@ -1,4 +1,4 @@
Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
University Research and Technology
Corporation. All rights reserved.
Copyright (c) 2004-2006 The University of Tennessee and The University
@ -29,6 +29,9 @@ version 1.0.
Trunk (not on release branches yet)
-----------------------------------
- ErrMgr framework redesigned to better support fault tolerance development
activities. See the following RFC for details:
http://www.open-mpi.org/community/lists/devel/2010/03/7589.php
- Add pkg-config(1) configuration files for ompi, ompi-c, ompi-cxx,
ompi-f77, ompi-f90. See the README for more details.

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -33,32 +33,37 @@
#include "orte/mca/errmgr/errmgr.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
/*
* Internal definitions
*/
/*
* function definitions
* MCA Framework functions
*/
ORTE_DECLSPEC int orte_errmgr_base_open(void);
ORTE_DECLSPEC int orte_errmgr_base_select(void);
ORTE_DECLSPEC int orte_errmgr_base_close(void);
/*
* globals that might be needed
/**
* Composite Stack states
*/
#define ORTE_ERRMGR_STACK_STATE_NONE 0x00 /* No actions have been performed */
#define ORTE_ERRMGR_STACK_STATE_STABLIZED 0x01 /* Stabalized the runtime */
#define ORTE_ERRMGR_STACK_STATE_CONTINUE 0x02 /* Continue running without this process */
#define ORTE_ERRMGR_STACK_STATE_RECOVERED 0x04 /* Process has been recovered */
#define ORTE_ERRMGR_STACK_STATE_JOB_ABORT 0x08 /* Abort this job, cannot recover */
extern bool orte_errmgr_base_selected;
extern bool orte_errmgr_initialized;
/**
* Output and component variables
*/
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
ORTE_DECLSPEC extern mca_errmgr_base_component_t orte_errmgr_base_selected_component;
ORTE_DECLSPEC extern int orte_errmgr_base_output;
ORTE_DECLSPEC extern bool orte_errmgr_base_shutting_down;
ORTE_DECLSPEC extern bool orte_errmgr_base_enable_recovery;
extern opal_pointer_array_t orte_errmgr_base_modules;
extern bool orte_errmgr_initialized;
/*
* external API functions will be documented in the mca/errmgr/errmgr.h file
* Additional External API function declared in errmgr.h
*/
END_C_DECLS

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -32,17 +32,31 @@
/**
 * Shut down the ErrMgr framework.
 *
 * Runs the finalize hook of every module stored in
 * orte_errmgr_base_modules, closes the available components, destructs the
 * module array and clears orte_errmgr_initialized.
 *
 * @return ORTE_SUCCESS in all cases.
 */
int orte_errmgr_base_close(void)
{
    orte_errmgr_base_module_t *module = NULL;
    int i;

    OPAL_TRACE(5);

    /* Close all selected components; the array may contain empty slots */
    for (i = 0; i < orte_errmgr_base_modules.size; ++i) {
        module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
        if (NULL == module) {
            continue;
        }
        if (NULL != module->internal_errmgr_finalize) {
            module->internal_errmgr_finalize();
        }
    }

    /* Close all remaining available components (may be one if this is a
       OMPI RTE program, or [possibly] multiple if this is ompi_info) */
    mca_base_components_close(orte_errmgr_base_output,
                              &orte_errmgr_base_components_available,
                              NULL);

    OBJ_DESTRUCT(&orte_errmgr_base_modules);

    /* Allow a later orte_errmgr_base_open() to run again */
    orte_errmgr_initialized = false;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -24,20 +24,37 @@
#include <unistd.h>
#endif
#include <stdlib.h>
#include <stdarg.h>
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_locks.h"
#include "opal/util/trace.h"
#include "opal/util/output.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/session_dir.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/*
* Local Function Declaration
*/
static int orte_errmgr_base_stabalize_runtime(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state);
/*
* Public interfaces
*/
void orte_errmgr_base_log(int error_code, char *filename, int line)
{
OPAL_TRACE(1);
@ -52,17 +69,363 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
ORTE_ERROR_NAME(error_code), filename, line);
}
void orte_errmgr_base_proc_aborted_not_avail(orte_process_name_t *name, int exit_code)
/**
 * React to the report that a process has aborted.
 *
 * Application processes return immediately.  Otherwise the runtime is
 * stabilized, the active ErrMgr modules are offered the fault (only when
 * recovery is enabled and we are not shutting down), and, unless a module
 * clears ORTE_ERRMGR_STACK_STATE_JOB_ABORT from the stack state, all jobs
 * are ordered to terminate.
 *
 * @param name       Name of the process that aborted.
 * @param exit_code  Exit status to record for the run.
 *
 * @return ORTE_SUCCESS in all cases.
 */
int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code)
{
    int rc;
    orte_job_t *jdata;
    orte_proc_t *proc;
    int i;
    orte_proc_state_t state = ORTE_PROC_STATE_ABORTED;
    int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
    orte_errmgr_base_module_t *module = NULL;

    if( ORTE_PROC_IS_APP ) {
        return ORTE_SUCCESS;
    }

    /* Abort is the default verdict; it stands unless a module clears the bit */
    stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;

    /********************************
     * Stabilize the runtime
     ********************************/
    if( !orte_errmgr_base_shutting_down ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- %s fault reported! Process %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
                             ORTE_NAME_PRINT(name)));
    }

    /* get the job data object for this process */
    if (NULL == (jdata = orte_get_job_data_object(name->jobid))) {
        /* nothing we can do - abort things */
        goto PROCESS;
    }

    /* look up the proc object; without it we can only abort */
    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) {
        /* nothing we can do */
        goto PROCESS;
    }

    if( !orte_errmgr_base_shutting_down ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- %s fault reported! Process %s, state (0x%x)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
                             ORTE_NAME_PRINT(name),
                             proc->state ));
    }

    /* if the proc was terminated by cmd, ignore it */
    if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
        /* don't do anything or else we can enter an infinite loop */
        return ORTE_SUCCESS;
    }

    if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, name, state)) {
        goto PROCESS;
    }

    /********************************
     * Call the active modules
     ********************************/
    if( orte_errmgr_base_enable_recovery && !orte_errmgr_base_shutting_down) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- Attempting recovery... (%3d active components)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_errmgr_base_modules.size));

        stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED;

        for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
            module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
            if( NULL == module ) {
                continue;
            }
            if( NULL != module->internal_process_fault ) {
                module->internal_process_fault(jdata, name, state, &stack_state);
            }
        }
    }

    /********************************
     * If the active modules still need us to abort, then do so
     ********************************/
    if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- Successfully recovered from process %s fault! Continuing...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name)));
        return ORTE_SUCCESS;
    }

 PROCESS:
    if( !orte_errmgr_base_shutting_down ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- Not able to recover from process %s fault! Aborting...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name)));
    }

    /* if we are already in progress, then ignore this call */
    if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:base: abort in progress, ignoring proc %s aborted with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name), exit_code));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:base: proc %s aborted with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name), exit_code));

    orte_job_term_ordered = true;

    /* if the proc is a daemon, then we are abnormally terminating */
    if (ORTE_PROC_MY_NAME->jobid == name->jobid) {
        orte_abnormal_term_ordered = true;
    }

    /* Mark every job that is not already in an aborted state as having
     * been ordered to abort - this is necessary to avoid duplicate
     * ordering of "abort".
     *
     * NOTE: be sure to not include the 0 job data location as this
     * contains the daemons!
     */
    for (i = 1; i < orte_job_data->size; i++) {
        /* the array may have holes in it as we are recovering
         * jobids as they complete, so check everything.  Index the
         * array by i: looking up name->jobid here would revisit the
         * same job on every pass and never touch the others.
         */
        if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        if (ORTE_JOB_STATE_ABORTED != jdata->state &&
            ORTE_JOB_STATE_ABORTED_BY_SIG != jdata->state &&
            ORTE_JOB_STATE_ABORTED_WO_SYNC != jdata->state) {
            jdata->state = ORTE_JOB_STATE_ABORT_ORDERED;
        }
    }

    /* tell the plm to terminate all jobs */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
        ORTE_ERROR_LOG(rc);
    }

    /* set the exit status, just in case whomever called us failed
     * to do so - it can only be done once, so we are protected
     * from overwriting it
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* just return - let the daemons report back so we can properly
     * know when to actually exit
     */
    return ORTE_SUCCESS;
}
void orte_errmgr_base_incomplete_start_not_avail(orte_jobid_t job, int exit_code)
/**
 * React to the report that a job failed to start completely.
 *
 * Application processes return immediately.  Module-driven recovery for
 * this fault is compiled out (see the "#if 0" section), so with the
 * current code the abort verdict is never cleared and every call from a
 * non-application process ends in the abort sequence.
 *
 * @param job        Job that reported the incomplete start.
 * @param exit_code  Exit status to record for the run.
 *
 * @return ORTE_SUCCESS in all cases.
 */
int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code)
{
    int rc;
    orte_job_t *jdata;
    orte_proc_state_t state = ORTE_PROC_STATE_FAILED_TO_START;
    int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;

    if( ORTE_PROC_IS_APP ) {
        return ORTE_SUCCESS;
    }

    /* Abort is the default verdict; it stands unless the bit is cleared */
    stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;

    /********************************
     * Stabilize the runtime
     ********************************/
    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:incomplete_start() %s) "
                         "------- Incomplete start of job %s!",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job)));

    /* get the job data object for this process */
    if (NULL == (jdata = orte_get_job_data_object(job))) {
        /* nothing we can do - abort things */
        goto PROCESS;
    }

    /* NULL process name: the fault is not tied to a single process */
    if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, NULL, state)) {
        goto PROCESS;
    }

    /********************************
     * Call the active modules
     * JJH: Currently, if we cannot launch the job, then we should just abort.
     * JJH: Add job launch recovery logic...
     ********************************/
#if 0
    if( orte_errmgr_base_enable_recovery ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:incomplete_start() %s) "
                             "------- Attempting recovery... (%3d active components)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_errmgr_base_modules.size));

        stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED;

        for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
            module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
            if( NULL == module ) {
                continue;
            }
            if( NULL != module->internal_process_fault ) {
                module->internal_process_fault(jdata, NULL, state, &stack_state);
            }
        }
    }
#endif

    /********************************
     * If the active modules still need us to abort, then do so
     ********************************/
    if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:incomplete_start() %s) "
                             "------- Successfully recovered from incomplete start of job %s! Continuing...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job) ));
        return ORTE_SUCCESS;
    }

 PROCESS:
    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:incomplete_start() %s) "
                         "------- Not able to recover from incomplete start of job %s! Aborting...",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job) ));

    /* if we are already in progress, then ignore this call */
    if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:base: abort in progress, ignoring incomplete start on job %s with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job), exit_code));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:base: job %s reported incomplete start with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), exit_code));

    orte_job_term_ordered = true;

    /* tell the plm to terminate all jobs */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
        ORTE_ERROR_LOG(rc);
    }

    /* set the exit status, just in case whomever called us failed
     * to do so - it can only be done once, so we are protected
     * from overwriting it
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* just return - let the daemons report back so we can properly
     * know when to actually exit
     */
    return ORTE_SUCCESS;
}
void orte_errmgr_base_error_abort(int error_code, char *fmt, ...)
/**
 * React to a communication failure with another process.
 *
 * The runtime is stabilized and, when recovery is enabled, every active
 * module is offered the fault.  If the abort bit is still set in the
 * stack state afterwards (or stabilization was not possible), the exit
 * status is recorded and the orte_exit event is triggered.
 *
 * @param name       Process we lost communication with.
 * @param exit_code  Exit status to record if we end up aborting.
 *
 * @return ORTE_SUCCESS in all cases.
 */
int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code)
{
    orte_proc_state_t fault_state = ORTE_PROC_STATE_COMM_FAILED;
    /* start from "abort the job"; modules may clear the bit */
    int verdict = ORTE_ERRMGR_STACK_STATE_NONE | ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
    orte_job_t *job = NULL;
    orte_errmgr_base_module_t *handler = NULL;
    int slot;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:comm_failed() %s) "
                         "------- Communication to Process %s failed!",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name) ));

    /* Recovery is only attempted when the job is known and the runtime
     * could be stabilized; otherwise fall straight through to the abort.
     */
    job = orte_get_job_data_object(name->jobid);
    if( NULL != job &&
        ORTE_SUCCESS == orte_errmgr_base_stabalize_runtime(job, name, fault_state) ) {

        if( orte_errmgr_base_enable_recovery ) {
            OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                                 "errmgr:base:comm_failed() %s) "
                                 "------- Attempting recovery... (%3d active components)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 orte_errmgr_base_modules.size));

            verdict |= ORTE_ERRMGR_STACK_STATE_STABLIZED;

            /* hand the fault to each registered module in array order */
            for(slot = 0; slot < orte_errmgr_base_modules.size; ++slot) {
                handler = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, slot);
                if( NULL != handler && NULL != handler->internal_process_fault ) {
                    handler->internal_process_fault(job, name, fault_state, &verdict);
                }
            }
        }

        /* a cleared abort bit means the fault was handled */
        if( 0 == (verdict & ORTE_ERRMGR_STACK_STATE_JOB_ABORT) ) {
            OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                                 "errmgr:base:comm_failed() %s) "
                                 "------- Successfully recovered from communication fault with process %s! Continuing...",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(name) ));
            return ORTE_SUCCESS;
        }
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:comm_failed() %s) "
                         "------- Not able to recover from communication fault with process %s! Aborting...",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name) ));

    /* no recovery: record the status and trigger the exit event */
    ORTE_UPDATE_EXIT_STATUS(exit_code);
    orte_abnormal_term_ordered = true;
    orte_trigger_event(&orte_exit);

    return ORTE_SUCCESS;
}
int orte_errmgr_base_abort(int error_code, char *fmt, ...)
{
va_list arglist;
@ -89,12 +452,191 @@ void orte_errmgr_base_error_abort(int error_code, char *fmt, ...)
/* abnormal exit */
orte_ess.abort(error_code, false);
return ORTE_SUCCESS;
}
int orte_errmgr_base_register_cb_not_avail(orte_jobid_t job,
orte_job_state_t state,
orte_err_cb_fn_t cbfunc,
void *cbdata)
/**
 * Forward a predicted-fault notification to every active ErrMgr module.
 *
 * Does nothing when recovery is disabled.  The three arguments are passed
 * through unchanged to each module's internal_predicted_fault hook.
 *
 * @param proc_list        Passed through to the modules.
 * @param node_list        Passed through to the modules.
 * @param suggested_nodes  Passed through to the modules.
 *
 * @return ORTE_SUCCESS in all cases; module return values are not inspected.
 */
int orte_errmgr_base_predicted_fault(char ***proc_list,
                                     char ***node_list,
                                     char ***suggested_nodes)
{
    orte_errmgr_base_module_t *module = NULL;
    int i;

    /*
     * If the user did not ask for recovery, then do not process recovery events
     */
    if( !orte_errmgr_base_enable_recovery ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:predicted_fault() %s) "
                             "------- Recovery currently disabled! Skipping...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:predicted_fault() %s) "
                         "------- Notifying components... (%3d active components)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_errmgr_base_modules.size));

    /* the module array may contain empty slots and modules without the hook */
    for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
        module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
        if( NULL == module ) {
            continue;
        }
        if( NULL != module->internal_predicted_fault ) {
            module->internal_predicted_fault(proc_list, node_list, suggested_nodes);
        }
    }

    return ORTE_SUCCESS;
}
/**
 * Let every active ErrMgr module weigh in on where a process may be mapped.
 *
 * A no-op (apart from a verbose message) when recovery is disabled.
 * Otherwise each module's internal_suggest_map_targets hook, if present,
 * is called with the arguments unchanged.
 *
 * @param proc       Passed through to the modules.
 * @param oldnode    Passed through to the modules.
 * @param node_list  Passed through to the modules.
 *
 * @return ORTE_SUCCESS in all cases.
 */
int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
                                         orte_node_t *oldnode,
                                         opal_list_t *node_list)
{
    int slot = 0;

    /* recovery events are only processed on request */
    if( !orte_errmgr_base_enable_recovery ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:suggest_map_targets() %s) "
                             "------- Recovery currently disabled! Skipping...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:suggest_map_targets() %s) "
                         "------- Notifying components... (%3d active components)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_errmgr_base_modules.size));

    while( slot < orte_errmgr_base_modules.size ) {
        orte_errmgr_base_module_t *handler =
            (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, slot);

        ++slot;

        /* skip empty slots and modules that do not provide the hook */
        if( NULL != handler && NULL != handler->internal_suggest_map_targets ) {
            handler->internal_suggest_map_targets(proc, oldnode, node_list);
        }
    }

    return ORTE_SUCCESS;
}
/**
 * Deliver a fault-tolerance event to every active ErrMgr module.
 *
 * @param state  Event state, handed unchanged to each module's
 *               internal_ft_event hook.
 *
 * @return ORTE_SUCCESS in all cases.
 */
int orte_errmgr_base_ft_event(int state)
{
    int slot = 0;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:ft_event() %s) "
                         "------- Notifying components... (%3d active components)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_errmgr_base_modules.size));

    while( slot < orte_errmgr_base_modules.size ) {
        orte_errmgr_base_module_t *handler =
            (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, slot);

        ++slot;

        /* skip empty slots and modules that do not provide the hook */
        if( NULL != handler && NULL != handler->internal_ft_event ) {
            handler->internal_ft_event(state);
        }
    }

    return ORTE_SUCCESS;
}
/*
* Local functions
*/
/**
 * Bring the runtime bookkeeping in line with a reported fault.
 *
 * Records the new state on the matching process object of the job.  If the
 * failed process belongs to our own job (the control plane), its route is
 * deleted and every process recorded on its node is reported as aborted.
 *
 * @param jdata  Job the failed process belongs to.
 * @param proc   Name of the failed process, or NULL when the fault is not
 *               tied to one process (nothing is done in that case).
 * @param state  State to record for the process.
 *
 * @return ORTE_SUCCESS in all cases.
 */
static int orte_errmgr_base_stabalize_runtime(orte_job_t *jdata,
                                              orte_process_name_t *proc,
                                              orte_proc_state_t state)
{
    orte_proc_t *loc_proc = NULL, *candidate = NULL, *child_proc = NULL;
    orte_std_cntr_t i_proc;
    int32_t i;

    /*
     * orterun is trying to shutdown, so just let it
     */
    if( orte_errmgr_base_shutting_down ) {
        return ORTE_SUCCESS;
    }

    /*
     * orte_errmgr_base_incomplete_start() will pass a NULL since all processes
     * are affected by this fault.
     * JJH: Since we do not handle the recovery from such errors yet, just
     *      skip processing, and go to the abort sequence.
     */
    if( NULL == proc ) {
        return ORTE_SUCCESS;
    }

    /*
     * Set the process state in the job data structure.  loc_proc stays
     * NULL unless a process with the requested vpid is actually found.
     */
    for(i = 0; i < jdata->procs->size; ++i) {
        candidate = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i);
        if( NULL == candidate || candidate->name.vpid != proc->vpid ) {
            continue;
        }
        loc_proc = candidate;
        loc_proc->state = state;
        break;
    }

    /*
     * Everything below only applies to the control plane (HNP/orted)
     */
    if( proc->jobid != ORTE_PROC_MY_NAME->jobid ) {
        return ORTE_SUCCESS;
    }

    /*
     * Remove the route to this process
     */
    orte_routed.delete_route(proc);

    /*
     * Without a matching process object (or a node attached to it) there
     * are no children to walk.
     */
    if( NULL == loc_proc || NULL == loc_proc->node ) {
        return ORTE_SUCCESS;
    }

    /*
     * If the aborted daemon had active processes on its node, then we should
     * make sure to signal that all the children are gone.
     */
    if( loc_proc->node->num_procs > 0 ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:stabalize_runtime() %s) "
                             "------- Daemon lost with the following processes",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) {
            child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc);
            if( NULL == child_proc ) {
                continue;
            }

            OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                                 "errmgr:base:stabalize_runtime() %s) "
                                 "\t %s [0x%x]",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child_proc->name),
                                 child_proc->state));

            /* report each child only when its state has advanced since
             * the last report
             */
            if( child_proc->last_errmgr_state < child_proc->state ) {
                child_proc->last_errmgr_state = child_proc->state;
                orte_errmgr_base_proc_aborted(&child_proc->name, -1);
            }
        }
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -20,50 +20,54 @@
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/errmgr/base/static-components.h"
/*
* globals
* Globals
*/
/*
* Global variables
*/
int orte_errmgr_base_output = -1;
/*
* define a default module that all application procs
* can use without having to open the framework. The
* decision on whether or not to open the framework is
* made in orte_init
*/
orte_errmgr_base_module_t orte_errmgr = {
orte_errmgr_base_proc_aborted_not_avail,
orte_errmgr_base_incomplete_start_not_avail,
orte_errmgr_base_register_cb_not_avail,
orte_errmgr_base_error_abort
};
bool orte_errmgr_base_selected = false;
opal_list_t orte_errmgr_base_components_available;
mca_errmgr_base_component_t orte_errmgr_base_selected_component;
int orte_errmgr_base_output = -1;
bool orte_errmgr_base_enable_recovery = false;
bool orte_errmgr_base_shutting_down = false;
bool orte_errmgr_initialized = false;
opal_list_t orte_errmgr_base_components_available;
/*
 * Public module: its four public entry points are wired to the base
 * implementations, which in turn call the modules stored in
 * orte_errmgr_base_modules.  The internal hooks are left NULL here.
 */
orte_errmgr_base_module_t orte_errmgr = {
orte_errmgr_base_proc_aborted,
orte_errmgr_base_incomplete_start,
orte_errmgr_base_comm_failed,
orte_errmgr_base_abort,
/* Internal Interfaces */
NULL, /* internal_errmgr_init */
NULL, /* internal_errmgr_finalize */
NULL, /* internal_predicted_fault */
NULL, /* internal_process_fault */
NULL, /* internal_suggest_map_targets */
NULL /* internal_ft_event */
};
/**
* Function for finding and opening either all MCA components, or the one
@ -71,25 +75,46 @@ bool orte_errmgr_initialized = false;
*/
int orte_errmgr_base_open(void)
{
    int value;

    OPAL_TRACE(5);

    /* Only pass this way once */
    if( orte_errmgr_initialized ) {
        return ORTE_SUCCESS;
    }

    /* Array that will hold the selected modules */
    OBJ_CONSTRUCT(&orte_errmgr_base_modules, opal_pointer_array_t);

    orte_errmgr_base_output = opal_output_open(NULL);

    /* Recovery is opt-in: the parameter defaults to 0 (disabled) */
    mca_base_param_reg_int_name("errmgr",
                                "base_enable_recovery",
                                "If the ErrMgr recovery components should be enabled."
                                " [Default = disabled]",
                                false, false,
                                0, &value);
    orte_errmgr_base_enable_recovery = OPAL_INT_TO_BOOL(value);

    /*
     * A flag to indicate that orterun is shutting down, so skip the recovery
     * logic.
     */
    orte_errmgr_base_shutting_down = false;

    /*
     * Open up all available components
     */
    if (ORTE_SUCCESS !=
        mca_base_components_open("errmgr",
                                 orte_errmgr_base_output,
                                 mca_errmgr_base_static_components,
                                 &orte_errmgr_base_components_available,
                                 true)) {
        return ORTE_ERROR;
    }

    /* Mark the framework open only after the components opened cleanly */
    orte_errmgr_initialized = true;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -18,38 +18,163 @@
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/**
* Function for selecting one component from all those that are
* available.
/*
* List of composite modules, ordered by priority
*/
opal_pointer_array_t orte_errmgr_base_modules;
/*
 * Temporary record built for each component during selection; the records
 * are sorted by priority before the modules are added to
 * orte_errmgr_base_modules.
 */
struct orte_errmgr_base_select_module_t {
mca_base_component_t *component; /* component that was queried */
mca_base_module_t *module; /* module returned by the component's query */
int priority; /* priority returned by the component's query */
};
typedef struct orte_errmgr_base_select_module_t orte_errmgr_base_select_module_t;
int orte_errmgr_base_select(void)
{
mca_errmgr_base_component_t *best_component = NULL;
orte_errmgr_base_module_t *best_module = NULL;
int exit_status = OPAL_SUCCESS;
mca_base_component_list_item_t *cli = NULL;
mca_base_component_t *component = NULL;
mca_base_module_t *module = NULL;
opal_list_item_t *item = NULL;
int priority = 0, i, j, low_i;
orte_errmgr_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
opal_pointer_array_t tmp_array;
orte_errmgr_base_module_t *i_module = NULL;
/*
* Select the best component
* If the user does not want the recovery features, then do not select any.
*/
if( OPAL_SUCCESS != mca_base_select("errmgr", orte_errmgr_base_output,
&orte_errmgr_base_components_available,
(mca_base_module_t **) &best_module,
(mca_base_component_t **) &best_component) ) {
/* This will only happen if no component was selected */
return ORTE_ERR_NOT_FOUND;
if( !orte_errmgr_base_enable_recovery ) {
goto INIT;
}
/* Save the winner */
orte_errmgr = *best_module;
orte_errmgr_base_selected_component = *best_component;
orte_errmgr_base_selected = true;
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
return ORTE_SUCCESS;
opal_output_verbose(10, orte_errmgr_base_output,
"errmgr:base:select: Auto-selecting components");
/*
* Traverse the list of available components.
* For each call their 'query' functions to determine relative priority.
*/
for (item = opal_list_get_first(&orte_errmgr_base_components_available);
item != opal_list_get_end(&orte_errmgr_base_components_available);
item = opal_list_get_next(item) ) {
cli = (mca_base_component_list_item_t *) item;
component = (mca_base_component_t *) cli->cli_component;
/*
* If there is a query function then use it.
*/
if (NULL == component->mca_query_component) {
opal_output_verbose(5, orte_errmgr_base_output,
"errmgr:base:select Skipping component [%s]. It does not implement a query function",
component->mca_component_name );
continue;
}
/*
* Query this component for the module and priority
*/
opal_output_verbose(5, orte_errmgr_base_output,
"errmgr:base:select Querying component [%s]",
component->mca_component_name);
component->mca_query_component(&module, &priority);
/*
* If no module was returned or negative priority, then skip component
*/
if (NULL == module || priority < 0) {
opal_output_verbose(5, orte_errmgr_base_output,
"errmgr:base:select Skipping component [%s]. Query failed to return a module",
component->mca_component_name );
continue;
}
/*
* Append them to the temporary list, we will sort later
*/
opal_output_verbose(5, orte_errmgr_base_output,
"errmgr:base:select Query of component [%s] set priority to %d",
component->mca_component_name, priority);
tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t));
tmp_module->component = component;
tmp_module->module = module;
tmp_module->priority = priority;
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
}
/*
* Sort the list by decending priority
*/
priority = 0;
for(j = 0; j < tmp_array.size; ++j) {
tmp_module_sw = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j);
if( NULL == tmp_module_sw ) {
continue;
}
low_i = -1;
priority = tmp_module_sw->priority;
for(i = 0; i < tmp_array.size; ++i) {
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i);
if( NULL == tmp_module ) {
continue;
}
if( tmp_module->priority > priority ) {
low_i = i;
priority = tmp_module->priority;
}
}
if( low_i >= 0 ) {
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
j--; /* Try this entry again, if it is not the lowest */
} else {
tmp_module = tmp_module_sw;
opal_pointer_array_set_item(&tmp_array, j, NULL);
}
opal_output_verbose(5, orte_errmgr_base_output,
"errmgr:base:select Add module with priority [%s] %d",
tmp_module->component->mca_component_name, tmp_module->priority);
opal_pointer_array_add(&orte_errmgr_base_modules, (void*)(tmp_module->module));
free(tmp_module);
}
OBJ_DESTRUCT(&tmp_array);
INIT:
/*
* Initialize each of the Errmgr Modules
*/
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
if( NULL == i_module ) {
continue;
}
if( NULL != i_module->internal_errmgr_init ) {
i_module->internal_errmgr_init();
}
}
return exit_status;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -34,7 +34,6 @@
#include "orte/mca/errmgr/errmgr.h"
/*
* Functions for use solely within the ERRMGR framework
*/
@ -48,29 +47,29 @@ typedef uint8_t orte_errmgr_cmd_flag_t;
#define ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD 0x01
#define ORTE_ERRMGR_REGISTER_CALLBACK_CMD 0x02
/* provide access to verbose output channel */
ORTE_DECLSPEC extern int orte_errmgr_base_output;
/*
* Base functions
*/
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
ORTE_DECLSPEC void orte_errmgr_base_proc_aborted_not_avail(orte_process_name_t *name, int exit_code);
ORTE_DECLSPEC void orte_errmgr_base_incomplete_start_not_avail(orte_jobid_t job, int exit_code);
ORTE_DECLSPEC void orte_errmgr_base_error_abort(int error_code, char *fmt, ...) __opal_attribute_format__(__printf__, 2, 3) __opal_attribute_noreturn__;
ORTE_DECLSPEC int orte_errmgr_base_register_cb_not_avail(orte_jobid_t job,
orte_job_state_t state,
orte_err_cb_fn_t cbfunc,
void *cbdata);
ORTE_DECLSPEC int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code);
ORTE_DECLSPEC int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code);
ORTE_DECLSPEC int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code);
ORTE_DECLSPEC int orte_errmgr_base_abort(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
ORTE_DECLSPEC int orte_recos_base_predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes);
ORTE_DECLSPEC int orte_recos_base_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
ORTE_DECLSPEC int orte_recos_base_ft_event(int state);
/*
* external API functions will be documented in the mca/errmgr/errmgr.h file
* Additional External API function declared in errmgr.h
*/
END_C_DECLS

Просмотреть файл

@ -1,12 +0,0 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

Просмотреть файл

@ -1,45 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
sources = \
errmgr_default.h \
errmgr_default_component.c \
errmgr_default.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_default_DSO
component_noinst =
component_install = mca_errmgr_default.la
else
component_noinst = libmca_errmgr_default.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_default_la_SOURCES = $(sources)
mca_errmgr_default_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_default_la_SOURCES =$(sources)
libmca_errmgr_default_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,220 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdlib.h>
#include <stdarg.h>
#include "opal/util/trace.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_locks.h"
#include "orte/mca/plm/plm.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_default.h"
/*
 * This function gets called by the PLM when an orted notifies us
 * that a process has aborted.
 *
 * Various components will follow their own strategy for dealing with
 * this situation. For this component, we call the job's registered
 * err_cbfunc if it requested notification on proc aborted. Otherwise,
 * we mark the known jobs as "abort ordered" and ask the PLM to
 * terminate everything.
 *
 * name      - name of the process that aborted
 * exit_code - status handed to ORTE_UPDATE_EXIT_STATUS
 */
void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
{
    int rc;
    orte_job_t *jdata;
    orte_proc_t *proc;
    int i;

    /* get the job data object for this process */
    if (NULL == (jdata = orte_get_job_data_object(name->jobid))) {
        /* nothing we can do - abort things */
        goto PROCESS;
    }

    /* look up the proc object so we can see how it terminated */
    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) {
        /* nothing we can do */
        goto PROCESS;
    }

    /* if the proc was terminated by cmd, ignore it */
    if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
        /* don't do anything or else we can enter an infinite loop */
        return;
    }

    /* hand the event to the job's callback if one was registered for it */
    if (NULL != jdata->err_cbfunc && (ORTE_PROC_STATE_ABORTED & jdata->err_cbstates)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:default: proc %s aborted with status %d - calling cbfunc",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name), exit_code));
        jdata->err_cbfunc(name, ORTE_PROC_STATE_ABORTED, jdata->err_cbdata);
        return;
    }

PROCESS:
    /* if we are already in progress, then ignore this call.
     * NOTE(review): the original inline comment here said trylock
     * "returns 1 if already locked", which reads opposite to this
     * test - confirm against the opal_atomic_trylock implementation.
     */
    if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:default: abort in progress, ignoring proc %s aborted with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name), exit_code));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:default: proc %s aborted with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name), exit_code));

    orte_job_term_ordered = true;

    /* if the proc is a daemon, then we are abnormally terminating */
    if (ORTE_PROC_MY_NAME->jobid == name->jobid) {
        orte_abnormal_term_ordered = true;
    }

    /* indicate that the known jobs have been ordered to abort - this
     * is necessary to avoid duplicate ordering of "abort".
     *
     * NOTE: be sure to not include the 0 job data location as this
     * contains the daemons!
     */
    for (i = 1; i < orte_job_data->size; i++) {
        /* the array may have holes in it as we are recovering
         * jobids as they complete, so check everything.
         *
         * Fetch the i-th entry of the job array: looking the job up
         * by name->jobid here would revisit the failed proc's job on
         * every pass and never touch the others.
         */
        if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        /* leave jobs that already recorded an abort state untouched */
        if (ORTE_JOB_STATE_ABORTED != jdata->state &&
            ORTE_JOB_STATE_ABORTED_BY_SIG != jdata->state &&
            ORTE_JOB_STATE_ABORTED_WO_SYNC != jdata->state) {
            jdata->state = ORTE_JOB_STATE_ABORT_ORDERED;
        }
    }

    /* tell the plm to terminate all jobs */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
        ORTE_ERROR_LOG(rc);
    }

    /* set the exit status, just in case whomever called us failed
     * to do so - it can only be done once, so we are protected
     * from overwriting it
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* just return - let the daemons report back so we can properly
     * know when to actually exit
     */
}
/*
 * This function gets called by the PLM when an orted notifies us that
 * a job failed to start.
 *
 * Various components will follow their own strategy for dealing with
 * this situation. For this component, we call the job's registered
 * err_cbfunc if it requested notification of failed-to-start;
 * otherwise we simply kill the job.
 *
 * job       - id of the job that failed to start
 * exit_code - status handed to ORTE_UPDATE_EXIT_STATUS
 */
void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code)
{
    int rc;
    orte_job_t *jdata;
    orte_process_name_t name;

    /* get the job data object for this process */
    if (NULL == (jdata = orte_get_job_data_object(job))) {
        /* nothing we can do - abort things */
        goto PROCESS;
    }

    if (NULL != jdata->err_cbfunc && (ORTE_PROC_STATE_FAILED_TO_START & jdata->err_cbstates)) {
        /* log under this component's own tag (was "errmgr:cm") so the
         * message matches the rest of the output from this file
         */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:default: job %s reported incomplete start with status %d - calling cbfunc",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job), exit_code));
        /* the callback takes a process name: report the whole job by
         * using the wildcard vpid
         */
        name.jobid = job;
        name.vpid = ORTE_VPID_WILDCARD;
        jdata->err_cbfunc(&name, ORTE_PROC_STATE_FAILED_TO_START, jdata->err_cbdata);
        return;
    }

PROCESS:
    /* if we are already in progress, then ignore this call */
    if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:default: abort in progress, ignoring incomplete start on job %s with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job), exit_code));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:default: job %s reported incomplete start with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), exit_code));

    orte_job_term_ordered = true;

    /* tell the plm to terminate all jobs */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
        ORTE_ERROR_LOG(rc);
    }

    /* set the exit status, just in case whomever called us failed
     * to do so - it can only be done once, so we are protected
     * from overwriting it
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* just return - let the daemons report back so we can properly
     * know when to actually exit
     */
}
/*
 * Record an error callback on a job.
 *
 * The callback, the state mask it is interested in and its opaque
 * data pointer are stored on the job data object, replacing whatever
 * was stored there before.
 *
 * Returns ORTE_ERR_NOT_FOUND when no job data object exists for the
 * given jobid, ORTE_SUCCESS otherwise.
 */
int orte_errmgr_default_register_callback(orte_jobid_t job,
                                          orte_proc_state_t state,
                                          orte_err_cb_fn_t cbfunc,
                                          void *cbdata)
{
    orte_job_t *job_data = orte_get_job_data_object(job);

    if (NULL == job_data) {
        /* unknown job - there is nowhere to store the callback */
        return ORTE_ERR_NOT_FOUND;
    }

    /* overwrite the job's error callback settings */
    job_data->err_cbfunc   = cbfunc;
    job_data->err_cbstates = state;
    job_data->err_cbdata   = cbdata;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,57 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
/* Guard named after this header (it was ORTE_ERRMGR_HNP_H), so it cannot
 * collide with another header that legitimately uses that name.
 */
#ifndef ORTE_ERRMGR_DEFAULT_H
#define ORTE_ERRMGR_DEFAULT_H

#include "orte_config.h"
#include "orte/types.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"

BEGIN_C_DECLS

/*
 * Component open / close / query
 */
int orte_errmgr_default_component_open(void);
int orte_errmgr_default_component_close(void);
int orte_errmgr_default_component_query(mca_base_module_t **module, int *priority);

/*
 * Component API functions
 */

/* Called when a process of a job has aborted */
void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code);

/* Called when a job failed to start */
void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code);

/* Store an error callback on a job. The state argument is declared as
 * orte_proc_state_t to agree with the function's definition.
 */
int orte_errmgr_default_register_callback(orte_jobid_t job,
                                          orte_proc_state_t state,
                                          orte_err_cb_fn_t cbfunc,
                                          void *cbdata);

ORTE_MODULE_DECLSPEC extern mca_errmgr_base_component_t mca_errmgr_default_component;

END_C_DECLS

#endif /* ORTE_ERRMGR_DEFAULT_H */

Просмотреть файл

@ -1,108 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open RTE Error Manager (ErrMgr) - default component
*
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "orte/util/proc_info.h"
#include "errmgr_default.h"
/*
 * Public component descriptor for the "default" errmgr component.
 * The first member carries the framework version macro, the component
 * name and version numbers, and the open/close/query entry points; the
 * second member carries the component metadata.
 */
mca_errmgr_base_component_t mca_errmgr_default_component = {
{
ORTE_ERRMGR_BASE_VERSION_2_0_0,
"default", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_errmgr_default_component_open, /* component open */
orte_errmgr_default_component_close, /* component close */
orte_errmgr_default_component_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/*
 * Module instance handed out by orte_errmgr_default_component_query:
 * the function pointers that implement this component's behavior.
 */
orte_errmgr_base_module_t orte_errmgr_default = {
orte_errmgr_default_proc_aborted, /* a process aborted */
orte_errmgr_default_incomplete_start, /* a job failed to start */
orte_errmgr_default_register_callback, /* store an error callback on a job */
orte_errmgr_base_error_abort /* self-abort: provided by the errmgr base */
};
/*
 * Open the component.
 * This component has nothing to set up; always returns ORTE_SUCCESS.
 */
int orte_errmgr_default_component_open(void)
{
return ORTE_SUCCESS;
}
/*
 * Close the component.
 * This component has nothing to tear down; always returns ORTE_SUCCESS.
 */
int orte_errmgr_default_component_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query: offer the default module only to HNP and CM processes.
 *
 * On success *module points at orte_errmgr_default and *priority is set
 * to 100. In every other process *module is set to NULL, *priority is
 * left untouched and ORTE_ERROR is returned.
 */
int orte_errmgr_default_component_query(mca_base_module_t **module, int *priority)
{
    /* anyone who is neither an HNP nor a CM must not take this component */
    if (!(ORTE_PROC_IS_HNP || ORTE_PROC_IS_CM)) {
        *module = NULL;
        return ORTE_ERROR;
    }

    /* The priority is an arbitrary positive value - it only matters
     * relative to the priorities of the other components.
     */
    *priority = 100;
    *module = (mca_base_module_t *)&orte_errmgr_default;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -18,7 +18,38 @@
*/
/** @file:
*
* The Open RTE Error Manager
* The Open RTE Error and Recovery Manager (ErrMgr)
*
* This framework is a composite framework in which multiple components
* are often active at the same time and may work on a single external call
* to the interface functions.
*
* This framework allows the user to compose a job recovery policy from multiple
* individual components. Each component will operate on the function call if it
* has a registered function. If no component registers a function then the base
* functionality/policy is used.
*
* For example, consider the 3 components on the left (C1, C2, C3), and the
* API function calls across the top:
* | Priority | Fn1 | Fn2 | Fn3 | Fn4 |
* -----+----------+------+------+------+------+
* base | --- | act0 | --- | --- | act6 |
* C1 | 10 | act1 | --- | act2 | --- |
* C2 | 20 | --- | act3 | --- | --- |
* C3 | 30 | act4 | act5 | --- | --- |
* -----+----------+------+------+------+------+
* A call to Fn1 will result in:
* act4, act1
* A call to Fn2 will result in:
* act5, act3
* A call to Fn3 will result in:
* act2
* A call to Fn4 will result in:
* act6
*
* Notice that when the base function is overridden it is not called. The base
* function is only called when the function has not been overridden by a
* component.
*
*/
@ -34,6 +65,10 @@
#include "orte/types.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"
#include "opal/util/output.h"
#include "opal/util/error.h"
#include "orte/runtime/orte_globals.h"
@ -54,16 +89,98 @@ BEGIN_C_DECLS
orte_errmgr_base_log(n, __FILE__, __LINE__)
/**
* This is not part of any
* module so it can be used at any time!
* This is not part of any module so it can be used at any time!
*/
ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, int line);
/**
 * Module initialization function.
 * Public interface. Will be called in each of the active composite components.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_init_fn_t)
(void);
/**
 * Module finalization function.
 * Public interface. Will be called in each of the active composite components.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_finalize_fn_t)
(void);
/*
* Component functions - all MUST be provided!
* Internal Composite Interfaces
*/
/**
 * Predicted process/node failure notification
 * Composite interface. Called in priority order.
 *
 * @param[in] proc_list       List of processes (or NULL if none)
 * @param[in] node_list       List of nodes (or NULL if none)
 * @param[in] suggested_nodes List of suggested nodes to use on recovery (or NULL if none)
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_predicted_fault_fn_t)
(char ***proc_list, char ***node_list, char ***suggested_nodes);
/**
 * Actual process failure notification
 * Composite interface. Called in priority order.
 *
 * @param[in]     jdata       Job data object for the job of the failed process
 * @param[in]     proc_name   Name of the failed process
 * @param[in]     state       State of the failed process
 * @param[in,out] stack_state Composite stack state, passed by reference so a
 *                            component can update it (presumably a bitmask of
 *                            the ORTE_ERRMGR_STACK_STATE_* values - confirm
 *                            against errmgr/base/base.h)
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_process_fault_fn_t)
    (orte_job_t *jdata, orte_process_name_t *proc_name, orte_proc_state_t state, int *stack_state);
/**
 * Suggest a node to map a restarting process onto
 * Composite interface. Called in priority order.
 *
 * @param[in]     proc      Process that is being mapped
 * @param[in]     oldnode   Previous node where this process resided
 * @param[in,out] node_list List of nodes to select from
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_suggest_map_targets_fn_t)
(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list);
/**
 * Handle fault tolerance updates
 *
 * @param[in] state Fault tolerance state update
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_ft_event_fn_t)(int state);
/*
* External API Functions - Implemented in errmgr/base/errmgr_base_fns.c
*/
ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes);
ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
/**
* Alert - process aborted
@ -79,7 +196,8 @@ ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, i
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
ORTE_DECLSPEC extern int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code);
typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
/**
* Alert - incomplete start of a job
@ -101,28 +219,8 @@ typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *n
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
typedef void (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
/*
* Register a job with the error manager
* When a job is launched, this function is called so the error manager can register
* subscriptions on the job segment so that the error manager will be notified when
* problems occur - i.e., when process status entries change to abnormal termination
* values. Process status entries are changed by the appropriate state monitor
* and/or the process launcher, depending upon the stage at which the problem occurs.
*
* Monitoring of the job begins once the job has reached the "executing" stage. Prior
* to that time, failure of processes to start are the responsibility of the respective
* process launcher - which is expected to call the error manager via the "incomplete
* start" interface to report any problems prior to the job beginning "execution".
*
* NOTE: ONLY HNPs are allowed to register for trigger reports. All other components
* MUST do nothing but return ORTE_SUCCESS.
*/
typedef int (*orte_errmgr_base_module_register_cb_fn_t)(orte_jobid_t job,
orte_proc_state_t state,
orte_err_cb_fn_t cbfunc,
void *cbdata);
ORTE_DECLSPEC extern int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code);
typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
/**
* Alert - self aborting
@ -131,48 +229,85 @@ typedef int (*orte_errmgr_base_module_register_cb_fn_t)(orte_jobid_t job,
* itself, and then exit - it takes no other actions. The intent here is to provide
* a last-ditch exit procedure that attempts to clean up a little.
*/
typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...) __opal_attribute_noreturn__
ORTE_DECLSPEC extern int orte_errmgr_base_abort(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
typedef int (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
# endif
;
/**
* If the communication link failed to a peer.
* This gives us a chance to recover from this error, or abort.
*/
ORTE_DECLSPEC extern int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code);
typedef int (*orte_errmgr_base_module_comm_failed_fn_t)(orte_process_name_t *name,
int exit_code);
/*
*
* Module Structure
*/
struct orte_errmgr_base_module_2_3_0_t {
/* ---- Previous Interfaces (Always call base) -- */
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
orte_errmgr_base_module_register_cb_fn_t register_callback;
orte_errmgr_base_module_comm_failed_fn_t comm_failed;
orte_errmgr_base_module_abort_fn_t abort;
/* -------------- Internal Composite Interfaces -- */
/** Initialization Function */
orte_errmgr_base_module_init_fn_t internal_errmgr_init;
/** Finalization Function */
orte_errmgr_base_module_finalize_fn_t internal_errmgr_finalize;
/** Predicted process/node failure notification */
orte_errmgr_base_predicted_fault_fn_t internal_predicted_fault;
/** Actual process failure notification */
orte_errmgr_base_process_fault_fn_t internal_process_fault;
/** Suggest a node to map a restarting process onto */
orte_errmgr_base_suggest_map_targets_fn_t internal_suggest_map_targets;
/** Handle any FT Notifications */
orte_errmgr_base_ft_event_fn_t internal_ft_event;
};
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
/*
* ERRMGR Component
* the standard component data structure
* ErrMgr Component
*/
struct mca_errmgr_base_component_2_0_0_t {
struct orte_errmgr_base_component_3_0_0_t {
/** MCA base component */
mca_base_component_t base_version;
/** MCA base data */
mca_base_component_data_t base_data;
/** Verbosity Level */
int verbose;
/** Output Handle for opal_output */
int output_handle;
/** Default Priority */
int priority;
};
typedef struct mca_errmgr_base_component_2_0_0_t mca_errmgr_base_component_2_0_0_t;
typedef mca_errmgr_base_component_2_0_0_t mca_errmgr_base_component_t;
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
/*
* Global structure for accessing previous error manager functions
*/
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
/*
* Macro for use in components that are of type errmgr
*/
#define ORTE_ERRMGR_BASE_VERSION_2_0_0 \
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
MCA_BASE_VERSION_2_0_0, \
"errmgr", 2, 0, 0
/* Global structure for accessing error manager functions
*/
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* holds selected module's function pointers */
"errmgr", 3, 0, 0
END_C_DECLS

37
orte/mca/errmgr/orcm/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-orte-errmgr-orcm.txt
sources = \
errmgr_orcm.h \
errmgr_orcm_component.c \
errmgr_orcm_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_orcm_DSO
component_noinst =
component_install = mca_errmgr_orcm.la
else
component_noinst = libmca_errmgr_orcm.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_orcm_la_SOURCES = $(sources)
mca_errmgr_orcm_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_orcm_la_SOURCES = $(sources)
libmca_errmgr_orcm_la_LDFLAGS = -module -avoid-version

19
orte/mca/errmgr/orcm/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,19 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_errmgr_orcm_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_errmgr_orcm_CONFIG],[
# If we don't want FT, don't compile this component
AS_IF([test "$ompi_want_ft" = "1"],
[$1],
[$2])
])dnl

13
orte/mca/errmgr/orcm/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,13 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_INIT_FILE=errmgr_orcm_component.c
PARAM_CONFIG_FILES="Makefile"

35
orte/mca/errmgr/orcm/errmgr_orcm.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_ORCM_EXPORT_H
#define MCA_ERRMGR_ORCM_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_orcm_component;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orcm_module;
END_C_DECLS
#endif /* MCA_ERRMGR_ORCM_EXPORT_H */

Просмотреть файл

@ -0,0 +1,99 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_orcm.h"
/*
* Public string for version number
*/
const char *orte_errmgr_orcm_component_version_string =
"ORTE ERRMGR orcm MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_orcm_open(void);
static int errmgr_orcm_close(void);
static int errmgr_orcm_component_query(mca_base_module_t **module, int *priority);
/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */
orte_errmgr_base_component_t mca_errmgr_orcm_component =
{
/* Handle the general mca_component_t struct containing
 * meta information about the component itself
 */
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"orcm",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
errmgr_orcm_open,
errmgr_orcm_close,
errmgr_orcm_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority (errmgr_orcm_component_query reports its own value
 * when the component is selected)
 */
1
};
/* Component open: nothing to set up, always returns ORTE_SUCCESS. */
static int errmgr_orcm_open(void)
{
return ORTE_SUCCESS;
}
/* Component close: nothing to tear down, always returns ORTE_SUCCESS. */
static int errmgr_orcm_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query.
 *
 * This component is selected only when requested - and if so, then
 * it MUST be used exclusively. When it was requested, the orcm module
 * is returned with priority 1000; otherwise *module is NULL, *priority
 * is 0 and ORTE_ERROR is returned.
 */
static int errmgr_orcm_component_query(mca_base_module_t **module, int *priority)
{
    /* stays false unless the MCA base reports that we were asked for */
    bool requested = false;

    mca_base_is_component_required(&orte_errmgr_base_components_available,
                                   &mca_errmgr_orcm_component.base_version,
                                   true,
                                   &requested);

    if (requested) {
        *priority = 1000;
        *module = (mca_base_module_t *)&orte_errmgr_orcm_module;
        return ORTE_SUCCESS;
    }

    /* not requested - decline */
    *priority = 0;
    *module = NULL;
    return ORTE_ERROR;
}

307
orte/mca/errmgr/orcm/errmgr_orcm_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,307 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/argv.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_orcm.h"
/*
* Module functions: Global
*/
static int init(void);
static int finalize(void);
static int predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes);
static int process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state,
int *stack_state);
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
static int ft_event(int state);
/******************
 * ORCM module
 *
 * The old-interface slots are left NULL; only the internal composite
 * interface functions are provided by this component.
 ******************/
orte_errmgr_base_module_t orte_errmgr_orcm_module = {
NULL, /* proc_aborted (old interface) */
NULL, /* incomplete_start (old interface) */
NULL, /* comm_failed (old interface) */
NULL, /* abort (old interface) */
init, /* internal_errmgr_init */
finalize, /* internal_errmgr_finalize */
predicted_fault, /* internal_predicted_fault */
process_fault, /* internal_process_fault */
suggest_map_targets, /* internal_suggest_map_targets */
ft_event /* internal_ft_event */
};
/************************
* API Definitions
************************/
/* Module initialization: nothing to set up, always returns ORTE_SUCCESS. */
static int init(void)
{
return ORTE_SUCCESS;
}
/* Module finalization: nothing to release, always returns ORTE_SUCCESS. */
static int finalize(void)
{
return ORTE_SUCCESS;
}
/* Predicted fault notification: not supported by this component.
 * The arguments are ignored and ORTE_ERR_NOT_IMPLEMENTED is returned.
 */
static int predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/*
 * Handle a reported failure of an application process or of a daemon by
 * attempting to restart what was lost.
 *
 * jdata        job object owning the failed process; its first app context
 *              is used for diagnostic output and its procs array is indexed
 *              with proc->vpid
 * proc         name of the failed process; it is treated as a daemon when
 *              its jobid equals ORTE_PROC_MY_NAME->jobid
 * state        state reported for the failed process
 * stack_state  in/out flag word; the ORTE_ERRMGR_STACK_STATE_JOB_ABORT bit
 *              is toggled on entry
 *
 * Returns:
 *   ORTE_SUCCESS        the failure was handled: the proc or the jobs of the
 *                       failed daemon's node were respawned, or the proc was
 *                       deliberately not restarted (failed to start, killed
 *                       by command, restart limit exceeded)
 *   ORTE_ERR_NOT_FOUND  no proc object exists for the failed application proc
 *   ORTE_ERROR          the daemon object or a job object could not be
 *                       found, or orte_plm.spawn() failed
 */
static int process_fault(orte_job_t *jdata,
                         orte_process_name_t *proc,
                         orte_proc_state_t state,
                         int *stack_state)
{
    orte_job_t *jnew;
    orte_proc_t *pdata;
    orte_app_context_t *app = NULL;
    const char *app_name;
    orte_node_t *node, *newnode;
    orte_proc_t *daemon, *nodeproc;
    opal_value_array_t jobs;
    bool found;
    int i;
    size_t j;
    int rc = ORTE_SUCCESS;

    /* NOTE(review): XOR toggles the bit rather than clearing it. This only
     * removes JOB_ABORT if the caller set it beforehand - confirm against
     * the caller before changing to "&= ~". */
    *stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                         "errmgr:orcm:process_fault() "
                         "------- %s fault reported! proc %s (0x%x)",
                         (proc->jobid == ORTE_PROC_MY_NAME->jobid ? "Daemon" : "App. Process"),
                         ORTE_NAME_PRINT(proc),
                         state ));

    /* get the app - just for output purposes in case of error.  The lookup
     * can come back NULL, so never dereference it directly in a message. */
    app = opal_pointer_array_get_item(jdata->apps, 0);
    app_name = (NULL != app && NULL != app->app) ? app->app : "(unknown)";

    /* Remove the route to this process since it is dead */
    orte_routed.delete_route(proc);

    /**** NON-DAEMON PROC FAILED ****/
    if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
        /* if the proc failed to start or we killed it by cmd,
         * don't attempt to restart it as this can lead to an
         * infinite loop
         */
        if (ORTE_PROC_STATE_FAILED_TO_START == state) {
            opal_output(0, "APPLICATION %s FAILED TO START", app_name);
            return ORTE_SUCCESS;
        }
        /* if the proc was terminated by cmd, then do nothing */
        if (ORTE_PROC_STATE_KILLED_BY_CMD == state) {
            opal_output(0, "APPLICATION %s KILLED BY COMMAND", app_name);
            return ORTE_SUCCESS;
        }

        /* get the proc_t object for this process */
        pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
        if (NULL == pdata) {
            opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc));
            return ORTE_ERR_NOT_FOUND;
        }

        /* proc just died - save the node where this proc was located */
        node = pdata->node;

        /* increment restarts and give up once the job's limit is passed */
        pdata->restarts++;
        if (jdata->max_restarts < pdata->restarts) {
            opal_output(0, "Max restarts for proc %s of app %s has been exceeded - process will not be restarted",
                        ORTE_NAME_PRINT(proc), app_name);
            return ORTE_SUCCESS;
        }

        /* reset the job params for restart */
        orte_plm_base_reset_job(jdata);

        /* restart the job - the spawn function will remap and
         * launch the replacement proc(s)
         */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                             "%s RESTARTING APP: %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (ORTE_SUCCESS != orte_plm.spawn(jdata)) {
            opal_output(0, "FAILED TO RESTART APP %s", app_name);
            orte_trigger_event(&orte_exit);
            return ORTE_ERROR;
        }

        /* get the new node */
        newnode = pdata->node;

        /* report what we did */
        opal_output(0, "Proc %s:%s aborted on node %s and was restarted on node %s\n\n",
                    app_name, ORTE_NAME_PRINT(proc), node->name, newnode->name);
        return ORTE_SUCCESS;
    }

    /**** DAEMON FAILED ****/
    /* Reaching this point means proc->jobid == ORTE_PROC_MY_NAME->jobid,
     * i.e. it was a daemon that failed, which has to be treated differently.
     */
    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                         "%s Daemon %s failed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_VPID_PRINT(proc->vpid)));

    /* need to relaunch all the apps that were on
     * the node where this daemon was running as
     * they either died along with the node, or will
     * have self-terminated when the daemon died
     */
    daemon = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
    if (NULL == daemon) {
        /* nothing we can do - abort things */
        opal_output(0, "FAILED TO GET DAEMON OBJECT");
        return ORTE_ERROR;
    }

    /* flag the daemon state to indicate it terminated - this will
     * cause the daemon to be restarted IF required for starting
     * procs on that node
     */
    daemon->state = ORTE_PROC_STATE_ABORTED;

    /* identify the node where the daemon was running */
    node = daemon->node;

    /* release the contact info, if not already done */
    if (NULL != daemon->rml_uri) {
        free(daemon->rml_uri);
        daemon->rml_uri = NULL;
    }

    /* setup to track the jobs on this node; from here on every exit must
     * go through the cleanup label so the array is destructed */
    OBJ_CONSTRUCT(&jobs, opal_value_array_t);
    opal_value_array_init(&jobs, sizeof(orte_jobid_t));

    /* cycle through the node's procs */
    for (i = 0; i < node->procs->size; i++) {
        nodeproc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i);
        if (NULL == nodeproc) {
            continue;
        }
        /* set the proc to abnormally terminated */
        nodeproc->state = ORTE_PROC_STATE_ABORTED;
        /* increment restarts */
        nodeproc->restarts++;

        /* record this proc's jobid unless it is already in the array */
        found = false;
        for (j = 0; j < opal_value_array_get_size(&jobs); j++) {
            if (nodeproc->name.jobid == OPAL_VALUE_ARRAY_GET_ITEM(&jobs, orte_jobid_t, j)) {
                found = true;
                break;
            }
        }
        if (!found) {
            opal_value_array_append_item(&jobs, &nodeproc->name.jobid);
        }
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                         "%s RESTARTING APPS FROM NODE: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         node->name));

    /* respawn every job that had a proc on the failed node */
    for (j = 0; j < opal_value_array_get_size(&jobs); j++) {
        jnew = orte_get_job_data_object(OPAL_VALUE_ARRAY_GET_ITEM(&jobs, orte_jobid_t, j));
        if (NULL == jnew) {
            /* nothing we can do - abort things */
            opal_output(0, "FAILED TO GET JOB OBJECT TO BE RESTARTED");
            rc = ORTE_ERROR;
            goto cleanup;
        }

        /* reset the job params for restart */
        orte_plm_base_reset_job(jnew);

        /* restart the job - the spawn function will remap and
         * launch the replacement proc(s)
         */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
                             "%s RESTARTING JOB %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jnew->jobid)));
        if (ORTE_SUCCESS != orte_plm.spawn(jnew)) {
            opal_output(0, "FAILED TO RESTART APPS FROM NODE: %s", node->name);
            rc = ORTE_ERROR;
            goto cleanup;
        }
    }

    /* every affected job was respawned - this is the success path */
    opal_output(0, "Daemon %s on node %s aborted - procs were restarted elsewhere\n\n",
                ORTE_NAME_PRINT(proc), node->name);

cleanup:
    /* all done - cleanup and leave */
    OBJ_DESTRUCT(&jobs);
    return rc;
}
/*
 * Mapping-suggestion hook.
 *
 * This module offers no placement suggestions: none of the arguments is
 * read or modified and ORTE_ERR_NOT_IMPLEMENTED is always returned.
 */
static int
suggest_map_targets(orte_proc_t *proc,
                    orte_node_t *oldnode,
                    opal_list_t *node_list)
{
    /* arguments are intentionally unused */
    (void)proc;
    (void)oldnode;
    (void)node_list;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
/*
 * Fault-tolerance event hook.
 *
 * The state argument is ignored and ORTE_SUCCESS is always returned.
 *
 * The definition carries the same "static" storage class as its forward
 * declaration and as every other handler in this file, so the linkage is
 * obvious at the point of definition.
 */
static int ft_event(int state)
{
    (void)state;   /* intentionally unused */
    return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/

Просмотреть файл

@ -0,0 +1,14 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE RecoS IGNORE framework.
#

Просмотреть файл

@ -3,6 +3,9 @@
* Copyright (c) 2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
*
* $COPYRIGHT$
*
@ -72,7 +75,6 @@ orte_ess_base_module_t orte_ess_cm_module = {
proc_get_node_rank,
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
NULL /* ft_event */
};

21
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -447,6 +447,10 @@ static int rte_ft_event(int state)
}
/******** Continue Recovery ********/
else if (OPAL_CRS_CONTINUE == state ) {
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:env ft_event(%2d) - %s is Continuing",
state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/*
* Notify RML -> OOB
*/
@ -476,6 +480,10 @@ static int rte_ft_event(int state)
}
/******** Restart Recovery ********/
else if (OPAL_CRS_RESTART == state ) {
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:env ft_event(%2d) - %s is Restarting",
state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/*
* This should follow the ess init() function
*/
@ -583,6 +591,13 @@ static int rte_ft_event(int state)
goto cleanup;
}
/* if one was provided, build my nidmap */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/*
* Notify SnapC
*/
@ -592,12 +607,6 @@ static int rte_ft_event(int state)
goto cleanup;
}
/* if one was provided, build my nidmap */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
}
else if (OPAL_CRS_TERM == state ) {
/* Nothing */

Просмотреть файл

@ -1873,7 +1873,37 @@ int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri)
struct sockaddr_storage inaddr;
mca_oob_tcp_addr_t* addr = NULL;
mca_oob_tcp_peer_t* peer = NULL;
opal_list_item_t *item;
int rc;
if (NULL == uri) {
/* purge the hash table entry for this proc */
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
/* get the peer object */
opal_hash_table_get_value_uint64(&mca_oob_tcp_component.tcp_peers,
orte_util_hash_name(name),
(void**)&peer);
if (NULL != peer) {
OPAL_THREAD_LOCK(&peer->peer_lock);
/* flag the state as closed */
peer->peer_state = MCA_OOB_TCP_CLOSED;
/* clear any pending sends */
while (NULL != (item = opal_list_remove_first(&peer->peer_send_queue))) {
OBJ_RELEASE(item);
}
peer->peer_send_msg = NULL;
/* clear any pending recvs */
peer->peer_recv_msg = NULL;
OPAL_THREAD_UNLOCK(&peer->peer_lock);
}
/* delete the entry from the hash table */
opal_hash_table_set_value_uint64(&mca_oob_tcp_component.tcp_peer_names,
orte_util_hash_name(name), NULL);
/* all done */
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
return ORTE_SUCCESS;
}
if((rc = mca_oob_tcp_parse_uri(uri, (struct sockaddr*) &inaddr)) != ORTE_SUCCESS) {
return rc;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -532,7 +532,9 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
so_error);
}
mca_oob_tcp_peer_shutdown(peer);
opal_evtimer_add(&peer->peer_timer_event, &tv);
if( MCA_OOB_TCP_FAILED != peer->peer_state ) {
opal_evtimer_add(&peer->peer_timer_event, &tv);
}
return;
} else if(so_error != 0) {
/* No need to worry about the return code here - we return regardless
@ -595,6 +597,8 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
peer->peer_state);
}
mca_oob_tcp_peer_shutdown(peer);
/* inform the routed framework that we have lost a connection so
* it can decide if this is important, what to do about it, etc.
*/
@ -606,8 +610,6 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
OPAL_THREAD_UNLOCK(&peer->peer_lock);
orte_errmgr.abort(1, NULL);
}
mca_oob_tcp_peer_shutdown(peer);
}
void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
@ -646,18 +648,6 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
not likely to suddenly become successful, so abort the
whole thing */
peer->peer_state = MCA_OOB_TCP_FAILED;
/* since we cannot communicate, and the system obviously needed
* to do so, let's abort so we don't just hang here
*/
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
/* just wake us up */
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
orte_abnormal_term_ordered = true;
orte_trigger_event(&orte_exit);
} else {
orte_errmgr.abort(1, NULL);
}
}
if (peer->peer_sd >= 0) {
@ -669,7 +659,9 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
}
opal_event_del(&peer->peer_timer_event);
peer->peer_state = MCA_OOB_TCP_CLOSED;
if( MCA_OOB_TCP_FAILED != peer->peer_state ) {
peer->peer_state = MCA_OOB_TCP_CLOSED;
}
}
/*

Просмотреть файл

@ -47,7 +47,7 @@
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/odls/odls.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#endif
#include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h"
@ -217,12 +217,15 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
***/
#if OPAL_ENABLE_FT_CR == 1
/* JJH: Would it be useful to let the errmgr know what we are doing here? */
/*
* Notify the Global SnapC component regarding new job
*/
if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(jdata->jobid) ) ) {
/* Silent Failure :/ JJH */
ORTE_ERROR_LOG(rc);
if (ORTE_JOB_STATE_RESTART != jdata->state) {
if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(jdata->jobid) ) ) {
/* Silent Failure :/ JJH */
ORTE_ERROR_LOG(rc);
}
}
#endif
@ -1388,7 +1391,8 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
* an error unless it was specifically commanded
*/
if (jdata->state < ORTE_JOB_STATE_TERMINATED ||
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP ||
jdata->controls & ORTE_JOB_CONTROL_RECOVERABLE) {
for (i=0; i < jdata->procs->size; i++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
/* the proc array may no longer be left justified, so
@ -1396,6 +1400,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
*/
continue;
}
/*
* Determine how the process state affects the job state
*/
if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) {
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
if (!jdata->abort) {
@ -1406,7 +1414,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
} else if (ORTE_PROC_STATE_ABORTED == proc->state) {
jdata->state = ORTE_JOB_STATE_ABORTED;
if (!jdata->abort) {
@ -1417,7 +1424,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) {
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
if (!jdata->abort) {
@ -1428,7 +1434,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
if (!jdata->abort) {
@ -1445,7 +1450,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
*/
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
break;
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
/* we ordered this proc to die, so it isn't an abnormal termination
* and we don't flag it as such - just check the remaining jobs to
@ -1471,6 +1475,30 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
}
/*
* Call the errmgr for this process, if necessary
*/
if (ORTE_PROC_STATE_ABORTED == proc->state ||
ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state ||
ORTE_PROC_STATE_TERM_WO_SYNC == proc->state ||
ORTE_PROC_STATE_KILLED_BY_CMD == proc->state ) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed "
"Declared job %s %s by proc %s with code %d (0x%x vs 0x%x)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
(jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ?
"killed by cmd" : "aborted"),
ORTE_NAME_PRINT(&(proc->name)),
proc->exit_code,
proc->last_errmgr_state, proc->state));
/* Only report escalations in the fault state */
if( proc->last_errmgr_state < proc->state ) {
proc->last_errmgr_state = proc->state;
orte_errmgr.proc_aborted(&(proc->name), proc->exit_code);
}
}
}
}
@ -1490,21 +1518,16 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
}
goto CHECK_ALL_JOBS;
} else if (ORTE_JOB_STATE_ABORTED == jdata->state ||
ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state ||
ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed declared job %s aborted by proc %s with code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
(NULL == jdata->aborted_proc) ? "unknown" : ORTE_NAME_PRINT(&(jdata->aborted_proc->name)),
(NULL == jdata->aborted_proc) ? ORTE_ERROR_DEFAULT_EXIT_CODE : jdata->aborted_proc->exit_code));
/* report this to the errmgr */
} else if (ORTE_JOB_STATE_ABORTED == jdata->state ||
ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state ||
ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state ||
ORTE_JOB_STATE_KILLED_BY_CMD == jdata->state ) {
/* report this to the errmgr
* (if we know which process caused this, then it was reported above)
*/
if (NULL == jdata->aborted_proc) {
/* we don't know who caused us to abort */
orte_errmgr.proc_aborted(ORTE_NAME_INVALID, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.proc_aborted(&(jdata->aborted_proc->name), jdata->aborted_proc->exit_code);
}
goto CHECK_ALL_JOBS;
} else if (jdata->num_terminated >= jdata->num_procs) {
@ -1521,7 +1544,9 @@ CHECK_ALL_JOBS:
/* if this job is a continuously operating one, then don't do
* anything further - just return here
*/
if (NULL != jdata && ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls) {
if (NULL != jdata &&
(ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls ||
ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls) ) {
goto CHECK_ALIVE;
}
@ -1634,6 +1659,13 @@ CHECK_ALIVE:
ORTE_JOBID_PRINT(job->jobid)));
one_still_alive = true;
}
else {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed job %s is terminated (%d vs %d [0x%x])",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job->jobid),
job->num_terminated, job->num_procs, jdata->state ));
}
}
/* if a job is still alive, we just return */
if (one_still_alive) {

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -1343,6 +1343,8 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
int n, i, j;
orte_proc_t *proc, *proc_from_node;
orte_node_t *node_from_map, *node;
orte_odls_job_t *jobdat = NULL;
opal_list_item_t *item = NULL;
/* set the state to restart */
jdata->state = ORTE_JOB_STATE_RESTART;
@ -1354,6 +1356,7 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
if (ORTE_PROC_STATE_TERMINATED < proc->state) {
/* this proc abnormally terminated */
proc->state = ORTE_PROC_STATE_RESTART;
proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
proc->pid = 0;
/* remove the proc from the node upon which it was mapped
*
@ -1394,7 +1397,13 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
}
}
/* adjust job accounting */
jdata->num_terminated--;
if( jdata->num_terminated > 0 ) {
jdata->num_terminated--;
}
else {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"plm:base:reset_job() WARNING: Prevented num_terminated from becoming < 0!"));
}
}
}
/* clear the info on who aborted */
@ -1406,6 +1415,18 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
/* since every daemon will be reporting status for every proc, reset these to zero */
jdata->num_launched = 0;
jdata->num_reported = 0;
/* Clean up the orte_odls_job_t structure for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
if (jobdat->jobid == jdata->jobid) {
jobdat->num_participating = -1;
}
}
/* since we are restarting the failed proc, reset the exit status */
ORTE_RESET_EXIT_STATUS();
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
@ -58,6 +58,7 @@ typedef uint16_t orte_proc_state_t;
#define ORTE_PROC_STATE_ABORTED_BY_SIG 0x0400 /* process aborted by signal */
#define ORTE_PROC_STATE_TERM_WO_SYNC 0x0800 /* process exit'd w/o required sync */
#define ORTE_PROC_STATE_KILLED_BY_CMD 0x1000 /* process was killed by ORTE cmd */
#define ORTE_PROC_STATE_COMM_FAILED 0x2000 /* process communication has failed */
/*

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
@ -323,8 +323,16 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
/* increment the #daemons terminated so we will exit properly */
jdata->num_terminated++;
#if 0
/* report that the daemon has failed so we can exit */
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
#else
/* JJH: Look into a better way of doing this. If we let the daemon
* know, then it kills the job when we are trying to restart.. */
opal_output(0, "%s daemon %s failed. SKIPPING orte_plm_base_launch_failed()",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&daemon->name));
#endif
}
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
@ -19,6 +19,10 @@
#include "orte_config.h"
#include "orte/constants.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <string.h>
#include "opal/util/if.h"
@ -708,7 +712,55 @@ int orte_rmaps_base_define_daemons(orte_job_map_t *map)
if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
map->daemon_vpid_start = proc->name.vpid;
}
} else {
}
/*
* If we are launching on a node where there used to be a daemon, but
* it had previously failed, try to relaunch it. (Daemon Recovery) Do
* this ONLY if there are procs mapped to that daemon!
*/
else if(node->daemon->state > ORTE_PROC_STATE_UNTERMINATED ) {
/* If no processes are to be launched on this node, then exclude it */
if( 0 >= node->num_procs ) {
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons Skipping the Recovery of daemon %s [0x%x] Launched: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name),
node->daemon->state,
(node->daemon_launched ? "T" : "F")
));
/* since this daemon exists but is not needed, then flag it
* as "launched" to avoid relaunching it for no reason
*/
node->daemon_launched = true;
continue;
}
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons RECOVERING daemon %s [0x%x] Launched: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name),
node->daemon->state,
(node->daemon_launched ? "T" : "F")
));
/* flag that the daemon is no longer launched */
node->daemon_launched = false;
/* set the state to indicate launch is in progress */
node->daemon->state = ORTE_PROC_STATE_RESTART;
free(node->daemon->rml_uri);
node->daemon->rml_uri = NULL;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons add new daemon %s (Recovering old daemon)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name)));
/* track number of daemons to be launched */
++map->num_new_daemons;
}
else {
/* this daemon was previously defined - flag it */
node->daemon_launched = true;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
*
* $COPYRIGHT$
*
@ -116,7 +119,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
int rc;
float avgload, minload;
orte_node_t *node, *nd=NULL, *oldnode;
orte_rmaps_res_ftgrp_t *ftgrp, *target;
orte_rmaps_res_ftgrp_t *ftgrp, *target = NULL;
orte_vpid_t totprocs, lowprocs, num_assigned;
FILE *fp;
char *ftinput;
@ -195,6 +198,11 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
oldnode = proc->node;
/* point to the app */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx);
if( NULL == app ) {
ORTE_ERROR_LOG(ORTE_ERROR);
goto error;
}
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: proc %s from node %s is to be restarted",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -257,18 +265,39 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
* and -host options
*/
if (NULL == target) {
nd = oldnode; /* put it back where it was if nothing else is found */
totprocs = 1000000;
nd = NULL;
/*
* Get a list of all nodes
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
map = jdata->map;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, map->policy))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
&num_slots,
app,
map->policy))) {
ORTE_ERROR_LOG(rc);
return rc;
goto error;
}
/* Ask the ErrMgr components if they have a suggestion for this process */
orte_errmgr_base_suggest_map_targets(proc, proc->node, &node_list);
nd = (orte_node_t*)opal_list_get_first(&node_list);
if( NULL == nd ) {
ORTE_ERROR_LOG(ORTE_ERROR);
goto error;
}
/*
* Look through the list for the least loaded machine.
*/
nd = oldnode; /* Put it back where it was if nothing else is found */
totprocs = 1000000;
/* find the lightest loaded node while deconstructing the list */
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
if (node->num_procs < totprocs) {
if( node->num_procs < totprocs) {
nd = node;
totprocs = node->num_procs;
}
@ -280,9 +309,18 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
"%s rmaps:resilient: no avail fault groups found - placing proc on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
nd->name));
/* put proc on the found node */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
NULL, jdata->map->oversubscribe, false, &proc))) {
/*
* Put the process on the found node (add it if not already in the map)
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata,
nd,
jdata->map->cpus_per_rank,
proc->app_idx,
NULL,
jdata->map->oversubscribe,
false,
&proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error
*/
@ -291,12 +329,15 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
goto error;
}
}
/* flag the proc state as non-launched so we'll know to launch it */
proc->state = ORTE_PROC_STATE_INIT;
/* update the node and local ranks so static ports can
* be properly selected if active
*/
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
continue;
}
/* if we did find a target, re-map the proc to the lightest loaded

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -163,6 +163,8 @@ BEGIN_C_DECLS
*/
int orte_rml_ftrm_ft_event(int state);
int orte_rml_ftrm_purge(orte_process_name_t *peer);
END_C_DECLS
#endif

Просмотреть файл

@ -81,7 +81,9 @@ orte_rml_module_t orte_rml_ftrm_module = {
orte_rml_ftrm_add_exception_handler,
orte_rml_ftrm_del_exception_handler,
orte_rml_ftrm_ft_event
orte_rml_ftrm_ft_event,
orte_rml_ftrm_purge
};
int rml_ftrm_output_handle;

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -454,3 +454,18 @@ int orte_rml_ftrm_ft_event(int state)
return ORTE_SUCCESS;
}
/*
 * Forward a purge request for the given peer to the wrapped RML module.
 *
 * A verbose trace line is emitted first.  If the wrapped module supplies no
 * purge function, ORTE_SUCCESS is returned; otherwise the wrapped module's
 * return code is passed back to the caller.
 */
int orte_rml_ftrm_purge(orte_process_name_t *peer)
{
    opal_output_verbose(20, rml_ftrm_output_handle,
                        "orte_rml_ftrm: purge()");

    /* no wrapped handler - nothing to forward */
    if( NULL == orte_rml_ftrm_wrapped_module.purge ) {
        return ORTE_SUCCESS;
    }

    /* success and failure codes alike are handed straight back */
    return orte_rml_ftrm_wrapped_module.purge(peer);
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -179,6 +179,8 @@ void orte_rml_oob_exception_callback(const orte_process_name_t *peer,
orte_rml_exception_t exception);
int orte_rml_oob_purge(orte_process_name_t *peer);
END_C_DECLS
#endif

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -100,10 +100,14 @@ orte_rml_oob_module_t orte_rml_oob_module = {
orte_rml_oob_add_exception,
orte_rml_oob_del_exception,
orte_rml_oob_ft_event
orte_rml_oob_ft_event,
orte_rml_oob_purge
}
};
/* Local variables */
static bool init_done = false;
static int
rml_oob_open(void)
@ -134,6 +138,11 @@ rml_oob_close(void)
static orte_rml_module_t*
rml_oob_init(int* priority)
{
if (init_done) {
*priority = 1;
return &orte_rml_oob_module.super;
}
if (mca_oob_base_init() != ORTE_SUCCESS)
return NULL;
*priority = 1;
@ -156,6 +165,7 @@ rml_oob_init(int* priority)
orte_rml_oob_module.active_oob->oob_exception_callback =
orte_rml_oob_exception_callback;
init_done = true;
return &orte_rml_oob_module.super;
}

Просмотреть файл

@ -1,4 +1,7 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -12,6 +15,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
@ -75,3 +79,37 @@ orte_rml_oob_get_new_name(orte_process_name_t *name)
return orte_rml_oob_module.active_oob->oob_get_new_name(name);
}
/*
 * Drop the OOB contact information for a peer and discard queued routing
 * messages associated with it.
 *
 * The active OOB module is first given a NULL address for the peer.  Then,
 * with the queue lock held, every queued routing message is examined and
 * removed/released when either
 *   - its header destination is the peer itself, or
 *   - the route returned for its destination equals that destination.
 *
 * Always returns ORTE_SUCCESS.
 */
int
orte_rml_oob_purge(orte_process_name_t *peer)
{
    opal_list_item_t *cur, *following;
    orte_rml_oob_queued_msg_t *queued;
    orte_rml_oob_msg_header_t *header;
    orte_process_name_t hop;
    int addressed_to_peer, hop_is_destination;

    /* clear the oob contact info and pending messages */
    orte_rml_oob_module.active_oob->oob_set_addr(peer, NULL);

    /* clear our message queue */
    OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
    for (cur = opal_list_get_first(&orte_rml_oob_module.queued_routing_messages);
         cur != opal_list_get_end(&orte_rml_oob_module.queued_routing_messages);
         cur = following) {
        /* fetch the successor first: cur may be released below */
        following = opal_list_get_next(cur);

        queued = (orte_rml_oob_queued_msg_t*)cur;
        header = (orte_rml_oob_msg_header_t*) queued->payload[0].iov_base;
        hop = orte_routed.get_route(&header->destination);

        addressed_to_peer = (peer->jobid == header->destination.jobid &&
                             peer->vpid == header->destination.vpid);
        hop_is_destination = (hop.jobid == header->destination.jobid &&
                              hop.vpid == header->destination.vpid);

        if (addressed_to_peer || hop_is_destination) {
            opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, cur);
            OBJ_RELEASE(cur);
        }
    }
    OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -575,6 +575,12 @@ typedef int (*orte_rml_module_exception_fn_t)(orte_rml_exception_callback_t cbfu
*/
typedef int (*orte_rml_module_ft_event_fn_t)(int state);
/**
 * Purge the RML/OOB of contact info and pending messages
 * to/from a specified process. Used when a process aborts
 * and is to be restarted
 *
 * @param[in] peer  Name of the process whose contact info and
 *                  pending messages are to be purged
 *
 * @return Integer status code of the purge operation
 */
typedef int (*orte_rml_module_purge_fn_t)(struct orte_process_name_t *peer);
/* ******************************************************************** */
@ -629,6 +635,9 @@ struct orte_rml_module_t {
/** Fault tolerance handler */
orte_rml_module_ft_event_fn_t ft_event;
/** Purge information */
orte_rml_module_purge_fn_t purge;
};
/** Convenience typedef */
typedef struct orte_rml_module_t orte_rml_module_t;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* $COPYRIGHT$
*
@ -58,12 +58,19 @@ opal_list_t orte_routed_base_components;
static orte_routed_component_t *active_component = NULL;
static bool component_open_called = false;
/* Set by orte_routed_base_open() so that repeated calls return
 * ORTE_SUCCESS immediately; reset to false in orte_routed_base_close() */
static bool opened = false;
/* Set by orte_routed_base_select() so that repeated calls return
 * ORTE_SUCCESS immediately; reset to false in orte_routed_base_close() */
static bool selected = false;
int
orte_routed_base_open(void)
{
int ret;
if (opened) {
return ORTE_SUCCESS;
}
opened = true;
/* setup the output stream */
orte_routed_base_output = opal_output_open(NULL);
@ -88,6 +95,11 @@ orte_routed_base_select(void)
orte_routed_component_t *best_component = NULL;
orte_routed_module_t *best_module = NULL;
if (selected) {
return ORTE_SUCCESS;
}
selected = true;
/*
* Select the best component
*/
@ -134,6 +146,9 @@ orte_routed_base_close(void)
OBJ_DESTRUCT(&orte_routed_base_components);
opened = false;
selected = false;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -184,6 +187,8 @@ static int delete_route(orte_process_name_t *proc)
* the routing tree
*/
/* remove any entries in the RML for this process */
rc = orte_rml.purge(proc);
return ORTE_SUCCESS;
}
@ -279,6 +284,9 @@ static orte_process_name_t get_route(orte_process_name_t *target)
{
orte_process_name_t *ret, daemon;
int rc;
int32_t i;
orte_job_t *jdata;
orte_proc_t *proc;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
@ -342,7 +350,37 @@ static orte_process_name_t get_route(orte_process_name_t *target)
} else {
/* otherwise, if I am the HNP, send to the daemon */
if (ORTE_PROC_IS_HNP) {
ret = &daemon;
/*
* Check to make sure the daemon is active, if not then return an INVALID name
* JJH: There should be a faster way to do this check, but for now just iterate...
*/
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ret = ORTE_NAME_INVALID;
goto found;
}
for(i = 0; i < jdata->procs->size; ++i) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
continue;
}
if( proc->name.vpid != daemon.vpid) {
continue;
}
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s routed_cm_get: Checking process %15s state 0x%x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(proc->name)),
proc->state));
if( proc->state <= ORTE_PROC_STATE_UNTERMINATED ) {
ret = &daemon;
} else {
ret = ORTE_NAME_INVALID;
}
goto found;
}
} else {
/* send to the HNP for routing */
ret = ORTE_PROC_MY_HNP;
@ -727,7 +765,9 @@ static int update_routing_tree(void)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
orte_routed_tree_t *nm;
orte_vpid_t i;
int32_t i;
orte_job_t *jdata;
orte_proc_t *proc;
/* if I am anything other than a daemon or the HNP, this
* is a meaningless command as I am not allowed to route
@ -741,12 +781,41 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
return ORTE_PROC_MY_HNP->vpid;
}
/* for the HNP, the cm routing tree is direct to all known daemons */
/* for the HNP, the cm routing tree is direct to all known alive daemons */
if (NULL != children) {
for (i=1; i < orte_process_info.num_procs; i++) {
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = i;
opal_list_append(children, &nm->super);
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
for(i = 0; i < jdata->procs->size; ++i) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
continue;
}
if( proc->name.vpid == 0) {
continue;
}
if( proc->state <= ORTE_PROC_STATE_UNTERMINATED &&
NULL != proc->rml_uri ) {
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s get_routing_tree: Adding process %15s state 0x%x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(proc->name)),
proc->state));
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = proc->name.vpid;
opal_bitmap_clear_all_bits(&nm->relatives);
opal_list_append(children, &nm->super);
}
else {
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s get_routing_tree: Skipped process %15s state 0x%x (non functional daemon)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(proc->name)),
proc->state));
}
}
}

Просмотреть файл

@ -781,6 +781,7 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->pid = 0;
proc->local_rank = ORTE_LOCAL_RANK_INVALID;
proc->node_rank = ORTE_NODE_RANK_INVALID;
proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
proc->state = ORTE_PROC_STATE_UNDEF;
proc->app_idx = 0;
proc->slot_list = NULL;

Просмотреть файл

@ -285,6 +285,7 @@ typedef uint8_t orte_job_controls_t;
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x10
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x20
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40
#define ORTE_JOB_CONTROL_RECOVERABLE 0x80
#define ORTE_MAPPING_POLICY OPAL_UINT16
/* put the rank assignment method in the upper 8 bits */
@ -419,6 +420,8 @@ struct orte_proc_t {
* know which static IP port to use
*/
orte_node_rank_t node_rank;
/* Last state used to trigger the errmgr for this proc */
orte_proc_state_t last_errmgr_state;
/* process state */
orte_proc_state_t state;
/* exit code */

Просмотреть файл

@ -5,9 +5,12 @@
* A program that just spins - provides mechanism for testing user-driven
* abnormal program termination
*/
#include "opal_config.h"
#include <stdio.h>
#include "opal/runtime/opal_progress.h"
#include "orte/runtime/runtime.h"
int main(int argc, char* argv[])
@ -22,7 +25,15 @@ int main(int argc, char* argv[])
while (1) {
i++;
pi = i / 3.14159256;
if (i > 100) i = 0;
if (i > 100) {
/* need to progress so we can
* wake up if our daemon goes
* away!
*/
opal_progress();
/* reset the counter so we loop */
i = 0;
}
}
orte_finalize();

Просмотреть файл

@ -81,6 +81,7 @@
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/runtime/runtime.h"
@ -1127,6 +1128,15 @@ static void abort_exit_callback(int fd, short ign, void *arg)
!orte_never_launched) {
/* if the debuggers were run, clean up */
orte_debugger_finalize();
/*
* Turn off the errmgr recovery functionality, if it was enabled.
* This keeps the errmgr from trying to recover from the shutdown
* procedure.
*/
orte_errmgr_base_enable_recovery = false;
orte_errmgr_base_shutting_down = true;
/* terminate the orteds - they will automatically kill
* their local procs
*/