ErrMgr Framework redesign to better support fault tolerance development activities.
Explained in more detail in the following RFC: http://www.open-mpi.org/community/lists/devel/2010/03/7589.php This commit was SVN r22872.
Этот коммит содержится в:
родитель
0b9552cd4e
Коммит
e4f2d03d28
5
NEWS
5
NEWS
@ -1,4 +1,4 @@
|
||||
Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
University Research and Technology
|
||||
Corporation. All rights reserved.
|
||||
Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
@ -29,6 +29,9 @@ version 1.0.
|
||||
Trunk (not on release branches yet)
|
||||
-----------------------------------
|
||||
|
||||
- ErrMgr framework redesigned to better support fault tolerance development
|
||||
activities. See the following RFC for details:
|
||||
http://www.open-mpi.org/community/lists/devel/2010/03/7589.php
|
||||
- Add pkg-config(1) configuration files for ompi, ompi-c, ompi-cxx,
|
||||
ompi-f77, ompi-f90. See the README for more details.
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -33,32 +33,37 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
|
||||
/*
|
||||
* Global functions for MCA overall collective open and close
|
||||
*/
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Internal definitions
|
||||
*/
|
||||
/*
|
||||
* function definitions
|
||||
* MCA Framework functions
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_errmgr_base_open(void);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_select(void);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_close(void);
|
||||
|
||||
/*
|
||||
* globals that might be needed
|
||||
/**
|
||||
* Composite Stack states
|
||||
*/
|
||||
#define ORTE_ERRMGR_STACK_STATE_NONE 0x00 /* No actions have been performed */
|
||||
#define ORTE_ERRMGR_STACK_STATE_STABLIZED 0x01 /* Stabalized the runtime */
|
||||
#define ORTE_ERRMGR_STACK_STATE_CONTINUE 0x02 /* Continue running without this process */
|
||||
#define ORTE_ERRMGR_STACK_STATE_RECOVERED 0x04 /* Process has been recovered */
|
||||
#define ORTE_ERRMGR_STACK_STATE_JOB_ABORT 0x08 /* Abort this job, cannot recover */
|
||||
|
||||
extern bool orte_errmgr_base_selected;
|
||||
extern bool orte_errmgr_initialized;
|
||||
/**
|
||||
* Output and component variables
|
||||
*/
|
||||
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
|
||||
ORTE_DECLSPEC extern mca_errmgr_base_component_t orte_errmgr_base_selected_component;
|
||||
ORTE_DECLSPEC extern int orte_errmgr_base_output;
|
||||
ORTE_DECLSPEC extern bool orte_errmgr_base_shutting_down;
|
||||
ORTE_DECLSPEC extern bool orte_errmgr_base_enable_recovery;
|
||||
|
||||
extern opal_pointer_array_t orte_errmgr_base_modules;
|
||||
extern bool orte_errmgr_initialized;
|
||||
|
||||
/*
|
||||
* external API functions will be documented in the mca/errmgr/errmgr.h file
|
||||
* Additional External API function declared in errmgr.h
|
||||
*/
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -32,17 +32,31 @@
|
||||
|
||||
int orte_errmgr_base_close(void)
|
||||
{
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i;
|
||||
|
||||
OPAL_TRACE(5);
|
||||
|
||||
/* Close all selected components */
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->internal_errmgr_finalize ) {
|
||||
module->internal_errmgr_finalize();
|
||||
}
|
||||
}
|
||||
|
||||
/* Close all remaining available components (may be one if this is a
|
||||
OMPI RTE program, or [possibly] multiple if this is ompi_info) */
|
||||
|
||||
mca_base_components_close(orte_errmgr_base_output,
|
||||
&orte_errmgr_base_components_available, NULL);
|
||||
&orte_errmgr_base_components_available,
|
||||
NULL);
|
||||
|
||||
OBJ_DESTRUCT(&orte_errmgr_base_modules);
|
||||
|
||||
orte_errmgr_initialized = false;
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -24,20 +24,37 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
/*
|
||||
* Local Function Declaration
|
||||
*/
|
||||
static int orte_errmgr_base_stabalize_runtime(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state);
|
||||
|
||||
|
||||
/*
|
||||
* Public interfaces
|
||||
*/
|
||||
void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
@ -52,17 +69,363 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
ORTE_ERROR_NAME(error_code), filename, line);
|
||||
}
|
||||
|
||||
void orte_errmgr_base_proc_aborted_not_avail(orte_process_name_t *name, int exit_code)
|
||||
/*
 * Handle a report that process `name` aborted with `exit_code`.
 *
 * Application processes return immediately. Otherwise the runtime is
 * stabilized, the registered ErrMgr modules are given a chance to recover
 * (only when recovery is enabled and we are not shutting down), and if the
 * JOB_ABORT bit is still set in the composite stack state afterwards, all
 * jobs are ordered to terminate through the PLM.
 *
 * Returns ORTE_SUCCESS on every path; failures of the PLM terminate call are
 * logged with ORTE_ERROR_LOG rather than returned.
 */
int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code)
{
    int rc;
    orte_job_t *jdata;
    orte_proc_t *proc;
    int i;
    orte_proc_state_t state = ORTE_PROC_STATE_ABORTED;
    int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
    orte_errmgr_base_module_t *module = NULL;

    if( ORTE_PROC_IS_APP ) {
        return ORTE_SUCCESS;
    }

    /* Start from "abort the job"; a module must clear this bit to signal
     * that the fault was handled. */
    stack_state  = ORTE_ERRMGR_STACK_STATE_NONE;
    stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;

    /********************************
     * Stabilize the runtime
     ********************************/
    if( !orte_errmgr_base_shutting_down ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- %s fault reported! Process %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
                             ORTE_NAME_PRINT(name)));
    }

    /* get the job data object for this process */
    if (NULL == (jdata = orte_get_job_data_object(name->jobid))) {
        /* nothing we can do - abort things */
        goto PROCESS;
    }

    /* look up the proc object, indexed by vpid within its job */
    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) {
        /* nothing we can do */
        goto PROCESS;
    }

    if( !orte_errmgr_base_shutting_down ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- %s fault reported! Process %s, state (0x%x)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
                             ORTE_NAME_PRINT(name),
                             proc->state ));
    }

    /* if the proc was terminated by cmd, ignore it */
    if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
        /* don't do anything or else we can enter an infinite loop */
        return ORTE_SUCCESS;
    }

    if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, name, state)) {
        goto PROCESS;
    }

    /********************************
     * Call the active modules
     ********************************/
    if( orte_errmgr_base_enable_recovery && !orte_errmgr_base_shutting_down) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- Attempting recovery... (%3d active components)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_errmgr_base_modules.size));

        stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED;
        for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
            module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
            if( NULL == module ) {
                continue;
            }
            if( NULL != module->internal_process_fault ) {
                /* modules update stack_state in place */
                module->internal_process_fault(jdata, name, state, &stack_state);
            }
        }
    }

    /********************************
     * If the active modules still need us to abort, then do so
     ********************************/
    if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- Successfully recovered from process %s fault! Continuing...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name)));
        return ORTE_SUCCESS;
    }

 PROCESS:
    if( !orte_errmgr_base_shutting_down ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                             "errmgr:base:proc_aborted() %s) "
                             "------- Not able to recover from process %s fault! Aborting...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name)));
    }

    /* if an abort is already in progress, then ignore this call.
     * NOTE(review): this treats a zero return from opal_atomic_trylock() as
     * "lock already held" - confirm against the opal atomic implementation
     * in use. */
    if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:base: abort in progress, ignoring proc %s aborted with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name), exit_code));

        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:base: proc %s aborted with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name), exit_code));

    orte_job_term_ordered = true;

    /* if the proc is a daemon, then we are abnormally terminating */
    if (ORTE_PROC_MY_NAME->jobid == name->jobid) {
        orte_abnormal_term_ordered = true;
    }

    /* mark every job that has not already aborted as having been ordered
     * to abort - this is necessary to avoid duplicate ordering of "abort".
     *
     * NOTE: be sure to not include the 0 job data location as this
     * contains the daemons!
     */
    for (i=1; i < orte_job_data->size; i++) {
        /* the array may have holes in it as we are recovering
         * jobids as they complete, so check everything.
         * Fetch the job stored in slot i; looking up name->jobid here would
         * revisit the same job on every iteration and never touch the others.
         */
        if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        if (ORTE_JOB_STATE_ABORTED != jdata->state &&
            ORTE_JOB_STATE_ABORTED_BY_SIG != jdata->state &&
            ORTE_JOB_STATE_ABORTED_WO_SYNC != jdata->state) {
            jdata->state = ORTE_JOB_STATE_ABORT_ORDERED;
        }
    }

    /* tell the plm to terminate all jobs */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
        ORTE_ERROR_LOG(rc);
    }

    /* set the exit status, just in case whomever called us failed
     * to do so - it can only be done once, so we are protected
     * from overwriting it
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* just return - let the daemons report back so we can properly
     * know when to actually exit
     */
    return ORTE_SUCCESS;
}
|
||||
|
||||
/*
 * React to a job that failed to start completely.
 *
 * Application processes return at once. No module recovery is attempted
 * (that code is compiled out below), so unless the stack state loses its
 * JOB_ABORT bit the PLM is told to terminate every job and the exit status
 * is recorded. ORTE_SUCCESS is returned on all paths.
 */
int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code)
{
    int status;
    orte_job_t *jdata;
    orte_proc_state_t state = ORTE_PROC_STATE_FAILED_TO_START;
    /* abort is the default outcome */
    int stack_state = ORTE_ERRMGR_STACK_STATE_NONE | ORTE_ERRMGR_STACK_STATE_JOB_ABORT;

    if( ORTE_PROC_IS_APP ) {
        return ORTE_SUCCESS;
    }

    /* ---- Stabilize the runtime ---- */
    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:incomplete_start() %s) "
                         "------- Incomplete start of job %s!",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job)));

    /* Without a job data object, or if stabilization fails, fall straight
     * through to the abort sequence. */
    jdata = orte_get_job_data_object(job);
    if( NULL != jdata &&
        ORTE_SUCCESS == orte_errmgr_base_stabalize_runtime(jdata, NULL, state) ) {

        /* ---- Call the active modules ----
         * JJH: Currently, if we cannot launch the job, then we should just abort.
         * JJH: Add job launch recovery logic...
         */
#if 0
        if( orte_errmgr_base_enable_recovery ) {
            OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                                 "errmgr:base:incomplete_start() %s) "
                                 "------- Attempting recovery... (%3d active components)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 orte_errmgr_base_modules.size));
            stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED;
            for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
                module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
                if( NULL == module ) {
                    continue;
                }
                if( NULL != module->internal_process_fault ) {
                    module->internal_process_fault(jdata, NULL, state, &stack_state);
                }
            }
        }
#endif

        /* ---- Abort only if the JOB_ABORT bit is still set ---- */
        if( 0 == (stack_state & ORTE_ERRMGR_STACK_STATE_JOB_ABORT) ) {
            OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                                 "errmgr:base:incomplete_start() %s) "
                                 "------- Successfully recovered from incomplete start of job %s! Continuing...",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job) ));
            return ORTE_SUCCESS;
        }
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:incomplete_start() %s) "
                         "------- Not able to recover from incomplete start of job %s! Aborting...",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job) ));

    /* a zero result from the trylock is treated as "abort already underway" */
    if( !opal_atomic_trylock(&orte_abort_inprogress_lock) ) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:base: abort in progress, ignoring incomplete start on job %s with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job), exit_code));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:base: job %s reported incomplete start with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), exit_code));

    orte_job_term_ordered = true;

    /* order termination of every job; a failure is logged, not returned */
    status = orte_plm.terminate_job(ORTE_JOBID_WILDCARD);
    if( ORTE_SUCCESS != status ) {
        ORTE_ERROR_LOG(status);
    }

    /* record the exit status in case the caller did not; it can only be
     * set once, so an earlier value is not overwritten */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* return and let the daemons report back so we know when to exit */
    return ORTE_SUCCESS;
}
|
||||
|
||||
/*
 * React to a communication failure with process `name`.
 *
 * The runtime is stabilized and, when recovery is enabled, each registered
 * module's internal_process_fault hook is invoked. If the JOB_ABORT bit is
 * still set afterwards (or stabilization was not possible) the exit status
 * is recorded, abnormal termination is flagged and the orte_exit event is
 * triggered. ORTE_SUCCESS is returned on all paths.
 */
int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code)
{
    orte_job_t *job_data = NULL;
    orte_errmgr_base_module_t *mod = NULL;
    orte_proc_state_t state = ORTE_PROC_STATE_COMM_FAILED;
    /* abort is the default outcome */
    int stack_state = ORTE_ERRMGR_STACK_STATE_NONE | ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
    int idx;

    /* ---- Stabilize the runtime ---- */
    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:comm_failed() %s) "
                         "------- Communication to Process %s failed!",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name) ));

    /* Without a job data object, or if stabilization fails, fall straight
     * through to the default abort action. */
    job_data = orte_get_job_data_object(name->jobid);
    if( NULL != job_data &&
        ORTE_SUCCESS == orte_errmgr_base_stabalize_runtime(job_data, name, state) ) {

        /* ---- Call the active modules ---- */
        if( orte_errmgr_base_enable_recovery ) {
            OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                                 "errmgr:base:comm_failed() %s) "
                                 "------- Attempting recovery... (%3d active components)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 orte_errmgr_base_modules.size));

            stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED;
            for( idx = 0; idx < orte_errmgr_base_modules.size; ++idx ) {
                mod = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, idx);
                if( NULL != mod && NULL != mod->internal_process_fault ) {
                    mod->internal_process_fault(job_data, name, state, &stack_state);
                }
            }
        }

        /* ---- Abort only if the JOB_ABORT bit is still set ---- */
        if( 0 == (stack_state & ORTE_ERRMGR_STACK_STATE_JOB_ABORT) ) {
            OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                                 "errmgr:base:comm_failed() %s) "
                                 "------- Successfully recovered from communication fault with process %s! Continuing...",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(name) ));
            return ORTE_SUCCESS;
        }
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
                         "errmgr:base:comm_failed() %s) "
                         "------- Not able to recover from communication fault with process %s! Aborting...",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name) ));

    /* Default action is to abort */
    ORTE_UPDATE_EXIT_STATUS(exit_code);
    orte_abnormal_term_ordered = true;
    orte_trigger_event(&orte_exit);

    return ORTE_SUCCESS;
}
|
||||
|
||||
int orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
|
||||
@ -89,12 +452,191 @@ void orte_errmgr_base_error_abort(int error_code, char *fmt, ...)
|
||||
|
||||
/* abnormal exit */
|
||||
orte_ess.abort(error_code, false);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_register_cb_not_avail(orte_jobid_t job,
|
||||
orte_job_state_t state,
|
||||
orte_err_cb_fn_t cbfunc,
|
||||
void *cbdata)
|
||||
int orte_errmgr_base_predicted_fault(char ***proc_list,
|
||||
char ***node_list,
|
||||
char ***suggested_nodes)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* If the user did not ask for recovery, then do not process recovery events
|
||||
*/
|
||||
if( !orte_errmgr_base_enable_recovery ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
"errmgr:base:predicted_fault() %s) "
|
||||
"------- Recovery currently disabled! Skipping...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
"errmgr:base:predicted_fault() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base_modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->internal_predicted_fault ) {
|
||||
module->internal_predicted_fault(proc_list, node_list, suggested_nodes);
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* If the user did not ask for recovery, then do not process recovery events
|
||||
*/
|
||||
if( !orte_errmgr_base_enable_recovery ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
"errmgr:base:suggest_map_targets() %s) "
|
||||
"------- Recovery currently disabled! Skipping...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
"errmgr:base:suggest_map_targets() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base_modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->internal_suggest_map_targets ) {
|
||||
module->internal_suggest_map_targets(proc, oldnode, node_list);
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_ft_event(int state)
|
||||
{
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
"errmgr:base:ft_event() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base_modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != module->internal_ft_event ) {
|
||||
module->internal_ft_event(state);
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_errmgr_base_stabalize_runtime(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state)
|
||||
{
|
||||
orte_proc_t *loc_proc, *child_proc;
|
||||
orte_std_cntr_t i_proc;
|
||||
int32_t i;
|
||||
|
||||
/*
|
||||
* orterun is trying to shutdown, so just let it
|
||||
*/
|
||||
if( orte_errmgr_base_shutting_down ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* orte_errmgr_base_incomplete_start() will pass a NULL since all processes
|
||||
* are effected by this fault.
|
||||
* JJH: Since we do not handle the recovery from such errors yet, just
|
||||
* skip processing, and go to the abort sequence.
|
||||
*/
|
||||
if( NULL == proc ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the process state in the job data structure
|
||||
*/
|
||||
for(i = 0; i < jdata->procs->size; ++i) {
|
||||
if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if( loc_proc->name.vpid != proc->vpid) {
|
||||
continue;
|
||||
}
|
||||
|
||||
loc_proc->state = state;
|
||||
|
||||
break;
|
||||
}
|
||||
/*
|
||||
* If this is a part of the control plane (HNP/orted)
|
||||
*/
|
||||
if( proc->jobid == ORTE_PROC_MY_NAME->jobid ) {
|
||||
/*
|
||||
* Remove the route to this process
|
||||
*/
|
||||
orte_routed.delete_route(proc);
|
||||
|
||||
/*
|
||||
* If the aborted daemon had active processes on its node, then we should
|
||||
* make sure to signal that all the children are gone.
|
||||
*/
|
||||
if( loc_proc->node->num_procs > 0 ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
"errmgr:base:stabalize_runtime() %s) "
|
||||
"------- Daemon lost with the following processes",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) {
|
||||
child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc);
|
||||
if( NULL == child_proc ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
"errmgr:base:stabalize_runtime() %s) "
|
||||
"\t %s [0x%x]",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child_proc->name),
|
||||
child_proc->state));
|
||||
|
||||
if( child_proc->last_errmgr_state < child_proc->state ) {
|
||||
child_proc->last_errmgr_state = child_proc->state;
|
||||
orte_errmgr_base_proc_aborted(&child_proc->name, -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -20,50 +20,54 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
* component's public mca_base_component_t struct.
|
||||
*/
|
||||
|
||||
#include "orte/mca/errmgr/base/static-components.h"
|
||||
|
||||
/*
|
||||
* globals
|
||||
*/
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
* Globals
|
||||
*/
|
||||
int orte_errmgr_base_output = -1;
|
||||
/*
|
||||
* define a default module that all application procs
|
||||
* can use without having to open the framework. The
|
||||
* decision on whether or not to open the framework is
|
||||
* made in orte_init
|
||||
*/
|
||||
orte_errmgr_base_module_t orte_errmgr = {
|
||||
orte_errmgr_base_proc_aborted_not_avail,
|
||||
orte_errmgr_base_incomplete_start_not_avail,
|
||||
orte_errmgr_base_register_cb_not_avail,
|
||||
orte_errmgr_base_error_abort
|
||||
};
|
||||
|
||||
bool orte_errmgr_base_selected = false;
|
||||
opal_list_t orte_errmgr_base_components_available;
|
||||
mca_errmgr_base_component_t orte_errmgr_base_selected_component;
|
||||
bool orte_errmgr_base_enable_recovery = false;
|
||||
bool orte_errmgr_base_shutting_down = false;
|
||||
bool orte_errmgr_initialized = false;
|
||||
opal_list_t orte_errmgr_base_components_available;
|
||||
|
||||
/* Public module provides a wrapper around previous functions */
|
||||
orte_errmgr_base_module_t orte_errmgr = {
|
||||
orte_errmgr_base_proc_aborted,
|
||||
orte_errmgr_base_incomplete_start,
|
||||
orte_errmgr_base_comm_failed,
|
||||
orte_errmgr_base_abort,
|
||||
|
||||
/* Internal Interfaces */
|
||||
NULL, /* internal_errmgr_init */
|
||||
NULL, /* internal_errmgr_finalize */
|
||||
NULL, /* internal_predicted_fault */
|
||||
NULL, /* internal_process_fault */
|
||||
NULL, /* internal_suggest_map_targets */
|
||||
NULL /* internal_ft_event */
|
||||
};
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components, or the one
|
||||
@ -71,25 +75,46 @@ bool orte_errmgr_initialized = false;
|
||||
*/
|
||||
int orte_errmgr_base_open(void)
|
||||
{
|
||||
int value;
|
||||
|
||||
OPAL_TRACE(5);
|
||||
|
||||
if (!orte_errmgr_initialized) { /* ensure we only do this once */
|
||||
/* Only pass this way once */
|
||||
if( orte_errmgr_initialized ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&orte_errmgr_base_modules, opal_pointer_array_t);
|
||||
|
||||
orte_errmgr_base_output = opal_output_open(NULL);
|
||||
|
||||
/* Open up all available components */
|
||||
mca_base_param_reg_int_name("errmgr",
|
||||
"base_enable_recovery",
|
||||
"If the ErrMgr recovery components should be enabled."
|
||||
" [Default = disabled]",
|
||||
false, false,
|
||||
0, &value);
|
||||
orte_errmgr_base_enable_recovery = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/*
|
||||
* A flag to indicate that orterun is shutting down, so skip the recovery
|
||||
* logic.
|
||||
*/
|
||||
orte_errmgr_base_shutting_down = false;
|
||||
|
||||
/*
|
||||
* Open up all available components
|
||||
*/
|
||||
if (ORTE_SUCCESS !=
|
||||
mca_base_components_open("errmgr", orte_errmgr_base_output,
|
||||
mca_base_components_open("errmgr",
|
||||
orte_errmgr_base_output,
|
||||
mca_errmgr_base_static_components,
|
||||
&orte_errmgr_base_components_available, true)) {
|
||||
&orte_errmgr_base_components_available,
|
||||
true)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
orte_errmgr_initialized = true;
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -18,38 +18,163 @@
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
|
||||
/**
|
||||
* Function for selecting one component from all those that are
|
||||
* available.
|
||||
/*
|
||||
* List of composite modules, ordered by priority
|
||||
*/
|
||||
opal_pointer_array_t orte_errmgr_base_modules;
|
||||
|
||||
struct orte_errmgr_base_select_module_t {
|
||||
mca_base_component_t *component;
|
||||
mca_base_module_t *module;
|
||||
int priority;
|
||||
};
|
||||
typedef struct orte_errmgr_base_select_module_t orte_errmgr_base_select_module_t;
|
||||
|
||||
int orte_errmgr_base_select(void)
|
||||
{
|
||||
mca_errmgr_base_component_t *best_component = NULL;
|
||||
orte_errmgr_base_module_t *best_module = NULL;
|
||||
int exit_status = OPAL_SUCCESS;
|
||||
mca_base_component_list_item_t *cli = NULL;
|
||||
mca_base_component_t *component = NULL;
|
||||
mca_base_module_t *module = NULL;
|
||||
opal_list_item_t *item = NULL;
|
||||
int priority = 0, i, j, low_i;
|
||||
orte_errmgr_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
|
||||
opal_pointer_array_t tmp_array;
|
||||
orte_errmgr_base_module_t *i_module = NULL;
|
||||
|
||||
/*
|
||||
* Select the best component
|
||||
* If the user does not want the recovery features, then do not select any.
|
||||
*/
|
||||
if( OPAL_SUCCESS != mca_base_select("errmgr", orte_errmgr_base_output,
|
||||
&orte_errmgr_base_components_available,
|
||||
(mca_base_module_t **) &best_module,
|
||||
(mca_base_component_t **) &best_component) ) {
|
||||
/* This will only happen if no component was selected */
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
if( !orte_errmgr_base_enable_recovery ) {
|
||||
goto INIT;
|
||||
}
|
||||
|
||||
/* Save the winner */
|
||||
orte_errmgr = *best_module;
|
||||
orte_errmgr_base_selected_component = *best_component;
|
||||
orte_errmgr_base_selected = true;
|
||||
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
opal_output_verbose(10, orte_errmgr_base_output,
|
||||
"errmgr:base:select: Auto-selecting components");
|
||||
|
||||
/*
|
||||
* Traverse the list of available components.
|
||||
* For each one, call its 'query' function to determine its relative priority.
|
||||
*/
|
||||
for (item = opal_list_get_first(&orte_errmgr_base_components_available);
|
||||
item != opal_list_get_end(&orte_errmgr_base_components_available);
|
||||
item = opal_list_get_next(item) ) {
|
||||
cli = (mca_base_component_list_item_t *) item;
|
||||
component = (mca_base_component_t *) cli->cli_component;
|
||||
|
||||
/*
|
||||
* If there is a query function then use it.
|
||||
*/
|
||||
if (NULL == component->mca_query_component) {
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
"errmgr:base:select Skipping component [%s]. It does not implement a query function",
|
||||
component->mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Query this component for the module and priority
|
||||
*/
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
"errmgr:base:select Querying component [%s]",
|
||||
component->mca_component_name);
|
||||
|
||||
component->mca_query_component(&module, &priority);
|
||||
|
||||
/*
|
||||
* If no module was returned or negative priority, then skip component
|
||||
*/
|
||||
if (NULL == module || priority < 0) {
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
"errmgr:base:select Skipping component [%s]. Query failed to return a module",
|
||||
component->mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append them to the temporary list, we will sort later
|
||||
*/
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
"errmgr:base:select Query of component [%s] set priority to %d",
|
||||
component->mca_component_name, priority);
|
||||
tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t));
|
||||
tmp_module->component = component;
|
||||
tmp_module->module = module;
|
||||
tmp_module->priority = priority;
|
||||
|
||||
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the list by descending priority
|
||||
*/
|
||||
priority = 0;
|
||||
for(j = 0; j < tmp_array.size; ++j) {
|
||||
tmp_module_sw = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j);
|
||||
if( NULL == tmp_module_sw ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
low_i = -1;
|
||||
priority = tmp_module_sw->priority;
|
||||
|
||||
for(i = 0; i < tmp_array.size; ++i) {
|
||||
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i);
|
||||
if( NULL == tmp_module ) {
|
||||
continue;
|
||||
}
|
||||
if( tmp_module->priority > priority ) {
|
||||
low_i = i;
|
||||
priority = tmp_module->priority;
|
||||
}
|
||||
}
|
||||
|
||||
if( low_i >= 0 ) {
|
||||
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
|
||||
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
|
||||
j--; /* Try this entry again, if it is not the lowest */
|
||||
} else {
|
||||
tmp_module = tmp_module_sw;
|
||||
opal_pointer_array_set_item(&tmp_array, j, NULL);
|
||||
}
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
"errmgr:base:select Add module with priority [%s] %d",
|
||||
tmp_module->component->mca_component_name, tmp_module->priority);
|
||||
opal_pointer_array_add(&orte_errmgr_base_modules, (void*)(tmp_module->module));
|
||||
free(tmp_module);
|
||||
}
|
||||
OBJ_DESTRUCT(&tmp_array);
|
||||
|
||||
INIT:
|
||||
/*
|
||||
* Initialize each of the Errmgr Modules
|
||||
*/
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
if( NULL == i_module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != i_module->internal_errmgr_init ) {
|
||||
i_module->internal_errmgr_init();
|
||||
}
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -34,7 +34,6 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
|
||||
/*
|
||||
* Functions for use solely within the ERRMGR framework
|
||||
*/
|
||||
@ -48,29 +47,29 @@ typedef uint8_t orte_errmgr_cmd_flag_t;
|
||||
#define ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD 0x01
|
||||
#define ORTE_ERRMGR_REGISTER_CALLBACK_CMD 0x02
|
||||
|
||||
/* provide access to verbose output channel */
|
||||
ORTE_DECLSPEC extern int orte_errmgr_base_output;
|
||||
|
||||
|
||||
/*
|
||||
* Base functions
|
||||
*/
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_proc_aborted_not_avail(orte_process_name_t *name, int exit_code);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_incomplete_start_not_avail(orte_jobid_t job, int exit_code);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_error_abort(int error_code, char *fmt, ...) __opal_attribute_format__(__printf__, 2, 3) __opal_attribute_noreturn__;
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_register_cb_not_avail(orte_jobid_t job,
|
||||
orte_job_state_t state,
|
||||
orte_err_cb_fn_t cbfunc,
|
||||
void *cbdata);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
# endif
|
||||
;
|
||||
ORTE_DECLSPEC int orte_recos_base_predicted_fault(char ***proc_list,
|
||||
char ***node_list,
|
||||
char ***suggested_nodes);
|
||||
ORTE_DECLSPEC int orte_recos_base_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
ORTE_DECLSPEC int orte_recos_base_ft_event(int state);
|
||||
|
||||
/*
|
||||
* external API functions will be documented in the mca/errmgr/errmgr.h file
|
||||
* Additional External API function declared in errmgr.h
|
||||
*/
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -1,12 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
mca_link_libraries=libopen-rte
|
@ -1,45 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
EXTRA_DIST = .windows
|
||||
|
||||
sources = \
|
||||
errmgr_default.h \
|
||||
errmgr_default_component.c \
|
||||
errmgr_default.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_default_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_default.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_default.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_default_la_SOURCES = $(sources)
|
||||
mca_errmgr_default_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_default_la_SOURCES =$(sources)
|
||||
libmca_errmgr_default_la_LDFLAGS = -module -avoid-version
|
@ -1,24 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,220 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include "opal/util/trace.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_default.h"
|
||||
|
||||
/*
|
||||
* This function gets called by the PLM when an orted notifies us
|
||||
* that a process has aborted
|
||||
* Various components will follow their own strategy for dealing with
|
||||
* this situation. For this component, we call the provided
|
||||
* err_cbfunc if they requested notification on proc aborted.
|
||||
* Otherwise, we simply kill the job.
|
||||
*/
|
||||
void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
int i;
|
||||
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdata = orte_get_job_data_object(name->jobid))) {
|
||||
/* nothing we can do - abort things */
|
||||
goto PROCESS;
|
||||
}
|
||||
|
||||
/* if the proc was terminated by cmd, ignore it */
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) {
|
||||
/* nothing we can do */
|
||||
goto PROCESS;
|
||||
}
|
||||
if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
||||
/* don't do anything or else we can enter an infinite loop */
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL != jdata->err_cbfunc && (ORTE_PROC_STATE_ABORTED & jdata->err_cbstates)) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
"%s errmgr:default: proc %s aborted with status %d - calling cbfunc",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(name), exit_code));
|
||||
jdata->err_cbfunc(name, ORTE_PROC_STATE_ABORTED, jdata->err_cbdata);
|
||||
return;
|
||||
}
|
||||
|
||||
PROCESS:
|
||||
/* if we are already in progress, then ignore this call */
|
||||
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
"%s errmgr:default: abort in progress, ignoring proc %s aborted with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(name), exit_code));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
"%s errmgr:default: proc %s aborted with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(name), exit_code));
|
||||
|
||||
orte_job_term_ordered = true;
|
||||
|
||||
/* if the proc is a daemon, then we are abnormally terminating */
|
||||
if (ORTE_PROC_MY_NAME->jobid == name->jobid) {
|
||||
orte_abnormal_term_ordered = true;
|
||||
}
|
||||
|
||||
/* indicate that all jobs other than the one containing this
|
||||
* proc have been ordered to abort - this is necessary to avoid
|
||||
* duplicate ordering of "abort".
|
||||
*
|
||||
* NOTE: be sure to not include the 0 job data location as this
|
||||
* contains the daemons!
|
||||
*/
|
||||
for (i=1; i < orte_job_data->size; i++) {
|
||||
/* the array may have holes in it as we are recovering
|
||||
* jobids as they complete, so check everything
|
||||
*/
|
||||
if (NULL == (jdata = orte_get_job_data_object(name->jobid))) {
|
||||
continue;
|
||||
}
|
||||
if (ORTE_JOB_STATE_ABORTED != jdata->state &&
|
||||
ORTE_JOB_STATE_ABORTED_BY_SIG != jdata->state &&
|
||||
ORTE_JOB_STATE_ABORTED_WO_SYNC != jdata->state) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORT_ORDERED;
|
||||
}
|
||||
}
|
||||
|
||||
/* tell the plm to terminate all jobs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* set the exit status, just in case whoever called us failed
|
||||
* to do so - it can only be done once, so we are protected
|
||||
* from overwriting it
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(exit_code);
|
||||
|
||||
/* just return - let the daemons report back so we can properly
|
||||
* know when to actually exit
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called by the PLM when an orted notifies us that
|
||||
* a job failed to start.
|
||||
* Various components will follow their own strategy for dealing with
|
||||
* this situation. For this component, we simply kill the job.
|
||||
*/
|
||||
void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata;
|
||||
orte_process_name_t name;
|
||||
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
/* nothing we can do - abort things */
|
||||
goto PROCESS;
|
||||
}
|
||||
|
||||
if (NULL != jdata->err_cbfunc && (ORTE_PROC_STATE_FAILED_TO_START & jdata->err_cbstates)) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
"%s errmgr:cm: job %s reported incomplete start with status %d - calling cbfunc",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), exit_code));
|
||||
name.jobid = job;
|
||||
name.vpid = ORTE_VPID_WILDCARD;
|
||||
jdata->err_cbfunc(&name, ORTE_PROC_STATE_FAILED_TO_START, jdata->err_cbdata);
|
||||
return;
|
||||
}
|
||||
|
||||
PROCESS:
|
||||
/* if we are already in progress, then ignore this call */
|
||||
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
"%s errmgr:default: abort in progress, ignoring incomplete start on job %s with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), exit_code));
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
"%s errmgr:default: job %s reported incomplete start with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), exit_code));
|
||||
|
||||
orte_job_term_ordered = true;
|
||||
|
||||
/* tell the plm to terminate all jobs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* set the exit status, just in case whoever called us failed
|
||||
* to do so - it can only be done once, so we are protected
|
||||
* from overwriting it
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(exit_code);
|
||||
|
||||
/* just return - let the daemons report back so we can properly
|
||||
* know when to actually exit
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
* Register a callback function upon a change to a specified job state.
|
||||
*/
|
||||
int orte_errmgr_default_register_callback(orte_jobid_t job,
|
||||
orte_proc_state_t state,
|
||||
orte_err_cb_fn_t cbfunc,
|
||||
void *cbdata)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
/* nothing we can do - abort things */
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* update the error callback data */
|
||||
jdata->err_cbfunc = cbfunc;
|
||||
jdata->err_cbstates = state;
|
||||
jdata->err_cbdata = cbdata;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,57 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#ifndef ORTE_ERRMGR_HNP_H
|
||||
#define ORTE_ERRMGR_HNP_H
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_errmgr_default_component_open(void);
|
||||
int orte_errmgr_default_component_close(void);
|
||||
int orte_errmgr_default_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
|
||||
/*
|
||||
* Component API functions
|
||||
*/
|
||||
void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code);
|
||||
|
||||
void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code);
|
||||
|
||||
int orte_errmgr_default_register_callback(orte_jobid_t job,
|
||||
orte_job_state_t state,
|
||||
orte_err_cb_fn_t cbfunc,
|
||||
void *cbdata);
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern mca_errmgr_base_component_t mca_errmgr_default_component;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,108 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
* The Open MPI General Purpose Registry - Proxy component
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "errmgr_default.h"
|
||||
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
mca_errmgr_base_component_t mca_errmgr_default_component = {
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_2_0_0,
|
||||
|
||||
"default", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_errmgr_default_component_open, /* component open */
|
||||
orte_errmgr_default_component_close, /* component close */
|
||||
orte_errmgr_default_component_query /* component query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* setup the function pointers for the module
|
||||
*/
|
||||
orte_errmgr_base_module_t orte_errmgr_default = {
|
||||
orte_errmgr_default_proc_aborted,
|
||||
orte_errmgr_default_incomplete_start,
|
||||
orte_errmgr_default_register_callback,
|
||||
orte_errmgr_base_error_abort
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Open the component
|
||||
*/
|
||||
int orte_errmgr_default_component_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the component
|
||||
*/
|
||||
int orte_errmgr_default_component_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_default_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* If we are an HNP or a CM, then pick us! */
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_CM) {
|
||||
/* Return a module (choose an arbitrary, positive priority --
|
||||
it's only relevant compared to other components). */
|
||||
|
||||
*priority = 100;
|
||||
*module = (mca_base_module_t *)&orte_errmgr_default;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* otherwise, don't take me! */
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -18,7 +18,38 @@
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
* The Open RTE Error Manager
|
||||
* The Open RTE Error and Recovery Manager (ErrMgr)
|
||||
*
|
||||
* This framework is a composite framework in which multiple components
|
||||
* are often active at the same time and may work on a single external call
|
||||
* to the interface functions.
|
||||
*
|
||||
* This framework allows the user to compose a job recovery policy from multiple
|
||||
* individual components. Each component will operate on the function call if it
|
||||
* has a registered function. If no component registers a function then the base
|
||||
* functionality/policy is used.
|
||||
*
|
||||
* For example, consider the 3 components on the left (C1, C2, C3), and the
|
||||
* API function calls across the top:
|
||||
* | Priority | Fn1 | Fn2 | Fn3 | Fn4 |
|
||||
* -----+----------+------+------+------+------+
|
||||
* base | --- | act0 | --- | --- | act6 |
|
||||
* C1 | 10 | act1 | --- | act2 | --- |
|
||||
* C2 | 20 | --- | act3 | --- | --- |
|
||||
* C3 | 30 | act4 | act5 | --- | --- |
|
||||
* -----+----------+------+------+------+------+
|
||||
* A call to Fn1 will result in:
|
||||
* act4, act1
|
||||
* A call to Fn2 will result in:
|
||||
* act5, act3
|
||||
* A call to Fn3 will result in:
|
||||
* act2
|
||||
* A call to Fn4 will result in:
|
||||
* act6
|
||||
*
|
||||
* Notice that when the base function is overridden it is not called. The base
|
||||
* function is only called when the function has not been overridden by a
|
||||
* component.
|
||||
*
|
||||
*/
|
||||
|
||||
@ -34,6 +65,10 @@
|
||||
#include "orte/types.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/error.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
@ -54,16 +89,98 @@ BEGIN_C_DECLS
|
||||
orte_errmgr_base_log(n, __FILE__, __LINE__)
|
||||
|
||||
/**
|
||||
* This is not part of any
|
||||
* module so it can be used at any time!
|
||||
* This is not part of any module so it can be used at any time!
|
||||
*/
|
||||
ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, int line);
|
||||
|
||||
|
||||
/**
|
||||
* Module initialization function.
|
||||
* Public interface. Will be called in each of the active composite components
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_init_fn_t)
|
||||
(void);
|
||||
|
||||
/**
|
||||
* Module finalization function.
|
||||
* Public interface. Will be called in each of the active composite components
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_finalize_fn_t)
|
||||
(void);
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
* Internal Composite Interfaces
|
||||
*/
|
||||
/**
|
||||
* Predicted process/node failure notification
|
||||
* Composite interface. Called in priority order.
|
||||
*
|
||||
* @param[in] proc_list List of processes (or NULL if none)
|
||||
* @param[in] node_list List of nodes (or NULL if none)
|
||||
* @param[in] suggested_nodes List of suggested nodes to use on recovery (or NULL if none)
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_predicted_fault_fn_t)
|
||||
(char ***proc_list, char ***node_list, char ***suggested_nodes);
|
||||
|
||||
/**
|
||||
* Actual process failure notification
|
||||
* Composite interface. Called in priority order.
|
||||
*
|
||||
* @param[in] proc_name Name of the failed process
|
||||
* @param[in] state State of the failed process
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_process_fault_fn_t)
|
||||
(orte_job_t *jdata, orte_process_name_t *proec_name, orte_proc_state_t state, int *stack_state);
|
||||
|
||||
/**
|
||||
* Suggest a node to map a restarting process onto
|
||||
* Composite interface. Called in priority order.
|
||||
*
|
||||
* @param[in] proc Process that is being mapped
|
||||
* @param[in] oldnode Previous node where this process resided
|
||||
* @param[in,out] node_list List of nodes to select from
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_suggest_map_targets_fn_t)
|
||||
(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list);
|
||||
|
||||
/**
|
||||
* Handle fault tolerance updates
|
||||
*
|
||||
* @param[in] state Fault tolerance state update
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_ft_event_fn_t)(int state);
|
||||
|
||||
|
||||
/*
|
||||
* External API Functions - Implemented in errmgr/base/errmgr_base_fns.c
|
||||
*/
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(char ***proc_list,
|
||||
char ***node_list,
|
||||
char ***suggested_nodes);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
|
||||
|
||||
|
||||
/**
|
||||
* Alert - process aborted
|
||||
@ -79,7 +196,8 @@ ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, i
|
||||
* @retval ORTE_SUCCESS Whatever action that was taken was successful
|
||||
* @retval ORTE_ERROR Appropriate error code
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
|
||||
ORTE_DECLSPEC extern int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code);
|
||||
typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
|
||||
|
||||
/**
|
||||
* Alert - incomplete start of a job
|
||||
@ -101,28 +219,8 @@ typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *n
|
||||
* @retval ORTE_SUCCESS Whatever action that was taken was successful
|
||||
* @retval ORTE_ERROR Appropriate error code
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
|
||||
|
||||
/*
|
||||
* Register a job with the error manager
|
||||
* When a job is launched, this function is called so the error manager can register
|
||||
* subscriptions on the job segment so that the error manager will be notified when
|
||||
* problems occur - i.e., when process status entries change to abnormal termination
|
||||
* values. Process status entries are changed by the appropriate state monitor
|
||||
* and/or the process launcher, depending upon the stage at which the problem occurs.
|
||||
*
|
||||
* Monitoring of the job begins once the job has reached the "executing" stage. Prior
|
||||
* to that time, failure of processes to start are the responsibility of the respective
|
||||
* process launcher - which is expected to call the error manager via the "incomplete
|
||||
* start" interface to report any problems prior to the job beginning "execution".
|
||||
*
|
||||
* NOTE: ONLY HNPs are allowed to register for trigger reports. All other components
|
||||
* MUST do nothing but return ORTE_SUCCESS.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_register_cb_fn_t)(orte_jobid_t job,
|
||||
orte_proc_state_t state,
|
||||
orte_err_cb_fn_t cbfunc,
|
||||
void *cbdata);
|
||||
ORTE_DECLSPEC extern int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code);
|
||||
typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
|
||||
|
||||
/**
|
||||
* Alert - self aborting
|
||||
@ -131,48 +229,85 @@ typedef int (*orte_errmgr_base_module_register_cb_fn_t)(orte_jobid_t job,
|
||||
* itself, and then exit - it takes no other actions. The intent here is to provide
|
||||
* a last-ditch exit procedure that attempts to clean up a little.
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...) __opal_attribute_noreturn__
|
||||
ORTE_DECLSPEC extern int orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
# endif
|
||||
;
|
||||
typedef int (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
|
||||
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
# endif
|
||||
;
|
||||
|
||||
/**
|
||||
* If the communication link failed to a peer.
|
||||
* This gives us a chance to recover from this error, or abort.
|
||||
*/
|
||||
ORTE_DECLSPEC extern int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code);
|
||||
typedef int (*orte_errmgr_base_module_comm_failed_fn_t)(orte_process_name_t *name,
|
||||
int exit_code);
|
||||
|
||||
/*
|
||||
*
|
||||
* Module Structure
|
||||
*/
|
||||
struct orte_errmgr_base_module_2_3_0_t {
|
||||
/* ---- Previous Interfaces (Always call base) -- */
|
||||
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
|
||||
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
|
||||
orte_errmgr_base_module_register_cb_fn_t register_callback;
|
||||
orte_errmgr_base_module_comm_failed_fn_t comm_failed;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
|
||||
/* -------------- Internal Composite Interfaces -- */
|
||||
/** Initialization Function */
|
||||
orte_errmgr_base_module_init_fn_t internal_errmgr_init;
|
||||
/** Finalization Function */
|
||||
orte_errmgr_base_module_finalize_fn_t internal_errmgr_finalize;
|
||||
|
||||
/** Predicted process/node failure notification */
|
||||
orte_errmgr_base_predicted_fault_fn_t internal_predicted_fault;
|
||||
/** Actual process failure notification */
|
||||
orte_errmgr_base_process_fault_fn_t internal_process_fault;
|
||||
/** Suggest a node to map a restarting process onto */
|
||||
orte_errmgr_base_suggest_map_targets_fn_t internal_suggest_map_targets;
|
||||
|
||||
/** Handle any FT Notifications */
|
||||
orte_errmgr_base_ft_event_fn_t internal_ft_event;
|
||||
};
|
||||
|
||||
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
||||
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
||||
|
||||
/*
|
||||
* ERRMGR Component
|
||||
* the standard component data structure
|
||||
* ErrMgr Component
|
||||
*/
|
||||
struct mca_errmgr_base_component_2_0_0_t {
|
||||
struct orte_errmgr_base_component_3_0_0_t {
|
||||
/** MCA base component */
|
||||
mca_base_component_t base_version;
|
||||
/** MCA base data */
|
||||
mca_base_component_data_t base_data;
|
||||
|
||||
/** Verbosity Level */
|
||||
int verbose;
|
||||
/** Output Handle for opal_output */
|
||||
int output_handle;
|
||||
/** Default Priority */
|
||||
int priority;
|
||||
};
|
||||
typedef struct mca_errmgr_base_component_2_0_0_t mca_errmgr_base_component_2_0_0_t;
|
||||
typedef mca_errmgr_base_component_2_0_0_t mca_errmgr_base_component_t;
|
||||
|
||||
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
|
||||
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
|
||||
|
||||
/*
|
||||
* Global structure for accessing previous error manager functions
|
||||
*/
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
|
||||
|
||||
/*
|
||||
* Macro for use in components that are of type errmgr
|
||||
*/
|
||||
#define ORTE_ERRMGR_BASE_VERSION_2_0_0 \
|
||||
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
|
||||
MCA_BASE_VERSION_2_0_0, \
|
||||
"errmgr", 2, 0, 0
|
||||
|
||||
/* Global structure for accessing error manager functions
|
||||
*/
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* holds selected module's function pointers */
|
||||
"errmgr", 3, 0, 0
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
37
orte/mca/errmgr/orcm/Makefile.am
Обычный файл
37
orte/mca/errmgr/orcm/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-errmgr-orcm.txt
|
||||
|
||||
sources = \
|
||||
errmgr_orcm.h \
|
||||
errmgr_orcm_component.c \
|
||||
errmgr_orcm_module.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_orcm_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_orcm.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_orcm.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_orcm_la_SOURCES = $(sources)
|
||||
mca_errmgr_orcm_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_orcm_la_SOURCES = $(sources)
|
||||
libmca_errmgr_orcm_la_LDFLAGS = -module -avoid-version
|
19
orte/mca/errmgr/orcm/configure.m4
Обычный файл
19
orte/mca/errmgr/orcm/configure.m4
Обычный файл
@ -0,0 +1,19 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_errmgr_orcm_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_errmgr_orcm_CONFIG],[
|
||||
# If we don't want FT, don't compile this component
|
||||
AS_IF([test "$ompi_want_ft" = "1"],
|
||||
[$1],
|
||||
[$2])
|
||||
])dnl
|
13
orte/mca/errmgr/orcm/configure.params
Обычный файл
13
orte/mca/errmgr/orcm/configure.params
Обычный файл
@ -0,0 +1,13 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_INIT_FILE=errmgr_orcm_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
35
orte/mca/errmgr/orcm/errmgr_orcm.h
Обычный файл
35
orte/mca/errmgr/orcm/errmgr_orcm.h
Обычный файл
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_ORCM_EXPORT_H
|
||||
#define MCA_ERRMGR_ORCM_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_orcm_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orcm_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_ORCM_EXPORT_H */
|
99
orte/mca/errmgr/orcm/errmgr_orcm_component.c
Обычный файл
99
orte/mca/errmgr/orcm/errmgr_orcm_component.c
Обычный файл
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "errmgr_orcm.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_orcm_component_version_string =
|
||||
"ORTE ERRMGR orcm MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_orcm_open(void);
|
||||
static int errmgr_orcm_close(void);
|
||||
static int errmgr_orcm_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_base_component_t mca_errmgr_orcm_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itself
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"orcm",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
errmgr_orcm_open,
|
||||
errmgr_orcm_close,
|
||||
errmgr_orcm_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
1
|
||||
};
|
||||
|
||||
static int errmgr_orcm_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_orcm_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_orcm_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/*
|
||||
* This component is selected only when requested - and if so, then
|
||||
* it MUST be used exclusively
|
||||
*/
|
||||
bool is_required = false;
|
||||
|
||||
mca_base_is_component_required(&orte_errmgr_base_components_available,
|
||||
&mca_errmgr_orcm_component.base_version,
|
||||
true,
|
||||
&is_required);
|
||||
|
||||
if( !is_required ) {
|
||||
*priority = 0;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
*priority = 1000;
|
||||
*module = (mca_base_module_t *)&orte_errmgr_orcm_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
307
orte/mca/errmgr/orcm/errmgr_orcm_module.c
Обычный файл
307
orte/mca/errmgr/orcm/errmgr_orcm_module.c
Обычный файл
@ -0,0 +1,307 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
|
||||
#include "errmgr_orcm.h"
|
||||
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int predicted_fault(char ***proc_list,
|
||||
char ***node_list,
|
||||
char ***suggested_nodes);
|
||||
|
||||
static int process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
int *stack_state);
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
|
||||
|
||||
/******************
|
||||
* ORCM module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_orcm_module = {
|
||||
NULL, /* proc_aborted (old interface) */
|
||||
NULL, /* incomplete_start (old interface) */
|
||||
NULL, /* comm_failed (old interface) */
|
||||
NULL, /* abort (old interface) */
|
||||
init,
|
||||
finalize,
|
||||
predicted_fault,
|
||||
process_fault,
|
||||
suggest_map_targets,
|
||||
ft_event
|
||||
};
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int predicted_fault(char ***proc_list,
|
||||
char ***node_list,
|
||||
char ***suggested_nodes)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int process_fault(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
int *stack_state)
|
||||
{
|
||||
orte_job_t *jnew;
|
||||
orte_proc_t *pdata;
|
||||
orte_app_context_t *app=NULL;
|
||||
orte_node_t *node, *newnode;
|
||||
orte_proc_t *daemon, *nodeproc;
|
||||
opal_value_array_t jobs;
|
||||
bool found;
|
||||
int i;
|
||||
size_t j;
|
||||
|
||||
*stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
|
||||
"errmgr:orcm:process_fault() "
|
||||
"------- %s fault reported! proc %s (0x%x)",
|
||||
(proc->jobid == ORTE_PROC_MY_NAME->jobid ? "Daemon" : "App. Process"),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
state ));
|
||||
/* get the app - just for output purposes in case of error */
|
||||
app = opal_pointer_array_get_item(jdata->apps, 0);
|
||||
|
||||
/* Remove the route to this process since it is dead */
|
||||
orte_routed.delete_route(proc);
|
||||
|
||||
/**** NON-DAEMON PROC FAILED ****/
|
||||
if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
|
||||
/* if the proc failed to start or we killed it by cmd,
|
||||
* don't attempt to restart it as this can lead to an
|
||||
* infinite loop
|
||||
*/
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START == state) {
|
||||
opal_output(0, "APPLICATION %s FAILED TO START", app->app);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if the proc was terminated by cmd, then do nothing */
|
||||
if (ORTE_PROC_STATE_KILLED_BY_CMD == state) {
|
||||
opal_output(0, "APPLICATION %s KILLED BY COMMAND", app->app);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* get the proc_t object for this process */
|
||||
pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
|
||||
if (NULL == pdata) {
|
||||
opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
/* proc just died - save the node where this proc was located */
|
||||
node = pdata->node;
|
||||
/* increment restarts */
|
||||
pdata->restarts++;
|
||||
/* have we exceeded #restarts? */
|
||||
if (jdata->max_restarts < pdata->restarts) {
|
||||
opal_output(0, "Max restarts for proc %s of app %s has been exceeded - process will not be restarted",
|
||||
ORTE_NAME_PRINT(proc), app->app);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* reset the job params for restart */
|
||||
orte_plm_base_reset_job(jdata);
|
||||
|
||||
/* restart the job - the spawn function will remap and
|
||||
* launch the replacement proc(s)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
|
||||
"%s RESTARTING APP: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
|
||||
if (ORTE_SUCCESS != orte_plm.spawn(jdata)) {
|
||||
opal_output(0, "FAILED TO RESTART APP %s", app->app);
|
||||
orte_trigger_event(&orte_exit);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* get the new node */
|
||||
newnode = pdata->node;
|
||||
/* report what we did */
|
||||
opal_output(0, "Proc %s:%s aborted on node %s and was restarted on node %s\n\n",
|
||||
app->app, ORTE_NAME_PRINT(proc), node->name, newnode->name);
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if it was a daemon that failed, then we have to
|
||||
* treat it differently
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
|
||||
"%s Daemon %s failed",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_VPID_PRINT(proc->vpid)));
|
||||
/* need to relaunch all the apps that were on
|
||||
* the node where this daemon was running as
|
||||
* they either died along with the node, or will
|
||||
* have self-terminated when the daemon died
|
||||
*/
|
||||
if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
|
||||
/* nothing we can do - abort things */
|
||||
opal_output(0, "FAILED TO GET DAEMON OBJECT");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* flag the daemon state to indicate it terminated - this will
|
||||
* cause the daemon to be restarted IF required for starting
|
||||
* procs on that node
|
||||
*/
|
||||
daemon->state = ORTE_PROC_STATE_ABORTED;
|
||||
/* identify the node where the daemon was running */
|
||||
node = daemon->node;
|
||||
/* release the contact info, if not already done */
|
||||
if (NULL != daemon->rml_uri) {
|
||||
free(daemon->rml_uri);
|
||||
daemon->rml_uri = NULL;
|
||||
}
|
||||
/* setup to track the jobs on this node */
|
||||
OBJ_CONSTRUCT(&jobs, opal_value_array_t);
|
||||
opal_value_array_init(&jobs, sizeof(orte_jobid_t));
|
||||
/* cycle through the node's procs */
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (nodeproc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
/* set the proc to abnormally terminated */
|
||||
nodeproc->state = ORTE_PROC_STATE_ABORTED;
|
||||
/* increment restarts */
|
||||
nodeproc->restarts++;
|
||||
/* check if this proc's jobid is already in array */
|
||||
found = false;
|
||||
for (j=0; j < opal_value_array_get_size(&jobs); j++) {
|
||||
if (nodeproc->name.jobid == OPAL_VALUE_ARRAY_GET_ITEM(&jobs, orte_jobid_t, j)) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
/* add it */
|
||||
opal_value_array_append_item(&jobs, &nodeproc->name.jobid);
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
|
||||
"%s RESTARTING APPS FROM NODE: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name));
|
||||
for (j=0; j < opal_value_array_get_size(&jobs); j++) {
|
||||
if (NULL == (jnew = orte_get_job_data_object(OPAL_VALUE_ARRAY_GET_ITEM(&jobs, orte_jobid_t, j)))) {
|
||||
/* nothing we can do - abort things */
|
||||
opal_output(0, "FAILED TO GET JOB OBJECT TO BE RESTARTED");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* reset the job params for restart */
|
||||
orte_plm_base_reset_job(jnew);
|
||||
/* restart the job - the spawn function will remap and
|
||||
* launch the replacement proc(s)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
|
||||
"%s RESTARTING JOB %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jnew->jobid)));
|
||||
if (ORTE_SUCCESS != orte_plm.spawn(jnew)) {
|
||||
opal_output(0, "FAILED TO RESTART APPS FROM NODE: %s", node->name);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
opal_output(0, "Daemon %s on node %s aborted - procs were restarted elsewhere\n\n",
|
||||
ORTE_NAME_PRINT(proc), node->name);
|
||||
/* all done - cleanup and leave */
|
||||
OBJ_DESTRUCT(&jobs);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* save */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
int ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
14
orte/mca/errmgr/orcm/help-orte-errmgr-orcm.txt
Обычный файл
14
orte/mca/errmgr/orcm/help-orte-errmgr-orcm.txt
Обычный файл
@ -0,0 +1,14 @@
|
||||
-*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the ORTE ErrMgr orcm component.
|
||||
#
|
@ -3,6 +3,9 @@
|
||||
* Copyright (c) 2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -72,7 +75,6 @@ orte_ess_base_module_t orte_ess_cm_module = {
|
||||
proc_get_node_rank,
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
NULL /* ft_event */
|
||||
};
|
||||
|
||||
|
21
orte/mca/ess/env/ess_env_module.c
поставляемый
21
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -447,6 +447,10 @@ static int rte_ft_event(int state)
|
||||
}
|
||||
/******** Continue Recovery ********/
|
||||
else if (OPAL_CRS_CONTINUE == state ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:env ft_event(%2d) - %s is Continuing",
|
||||
state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/*
|
||||
* Notify RML -> OOB
|
||||
*/
|
||||
@ -476,6 +480,10 @@ static int rte_ft_event(int state)
|
||||
}
|
||||
/******** Restart Recovery ********/
|
||||
else if (OPAL_CRS_RESTART == state ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:env ft_event(%2d) - %s is Restarting",
|
||||
state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/*
|
||||
* This should follow the ess init() function
|
||||
*/
|
||||
@ -583,6 +591,13 @@ static int rte_ft_event(int state)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if one was provided, build my nidmap */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify SnapC
|
||||
*/
|
||||
@ -592,12 +607,6 @@ static int rte_ft_event(int state)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if one was provided, build my nidmap */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else if (OPAL_CRS_TERM == state ) {
|
||||
/* Nothing */
|
||||
|
@ -1873,7 +1873,37 @@ int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri)
|
||||
struct sockaddr_storage inaddr;
|
||||
mca_oob_tcp_addr_t* addr = NULL;
|
||||
mca_oob_tcp_peer_t* peer = NULL;
|
||||
opal_list_item_t *item;
|
||||
int rc;
|
||||
|
||||
if (NULL == uri) {
|
||||
/* purge the hash table entry for this proc */
|
||||
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
/* get the peer object */
|
||||
opal_hash_table_get_value_uint64(&mca_oob_tcp_component.tcp_peers,
|
||||
orte_util_hash_name(name),
|
||||
(void**)&peer);
|
||||
if (NULL != peer) {
|
||||
OPAL_THREAD_LOCK(&peer->peer_lock);
|
||||
/* flag the state as closed */
|
||||
peer->peer_state = MCA_OOB_TCP_CLOSED;
|
||||
/* clear any pending sends */
|
||||
while (NULL != (item = opal_list_remove_first(&peer->peer_send_queue))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
peer->peer_send_msg = NULL;
|
||||
/* clear any pending recvs */
|
||||
peer->peer_recv_msg = NULL;
|
||||
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
||||
}
|
||||
/* delete the entry from the hash table */
|
||||
opal_hash_table_set_value_uint64(&mca_oob_tcp_component.tcp_peer_names,
|
||||
orte_util_hash_name(name), NULL);
|
||||
/* all done */
|
||||
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if((rc = mca_oob_tcp_parse_uri(uri, (struct sockaddr*) &inaddr)) != ORTE_SUCCESS) {
|
||||
return rc;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
@ -532,7 +532,9 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
|
||||
so_error);
|
||||
}
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
if( MCA_OOB_TCP_FAILED != peer->peer_state ) {
|
||||
opal_evtimer_add(&peer->peer_timer_event, &tv);
|
||||
}
|
||||
return;
|
||||
} else if(so_error != 0) {
|
||||
/* No need to worry about the return code here - we return regardless
|
||||
@ -595,6 +597,8 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
peer->peer_state);
|
||||
}
|
||||
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
|
||||
/* inform the routed framework that we have lost a connection so
|
||||
* it can decide if this is important, what to do about it, etc.
|
||||
*/
|
||||
@ -606,8 +610,6 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
||||
orte_errmgr.abort(1, NULL);
|
||||
}
|
||||
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
}
|
||||
|
||||
void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
||||
@ -646,18 +648,6 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
||||
not likely to suddenly become successful, so abort the
|
||||
whole thing */
|
||||
peer->peer_state = MCA_OOB_TCP_FAILED;
|
||||
|
||||
/* since we cannot communicate, and the system obviously needed
|
||||
* to do so, let's abort so we don't just hang here
|
||||
*/
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
||||
/* just wake us up */
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
orte_abnormal_term_ordered = true;
|
||||
orte_trigger_event(&orte_exit);
|
||||
} else {
|
||||
orte_errmgr.abort(1, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
if (peer->peer_sd >= 0) {
|
||||
@ -669,8 +659,10 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
||||
}
|
||||
|
||||
opal_event_del(&peer->peer_timer_event);
|
||||
if( MCA_OOB_TCP_FAILED != peer->peer_state ) {
|
||||
peer->peer_state = MCA_OOB_TCP_CLOSED;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Send the globally unique identifier for this process to a peer on
|
||||
|
@ -47,7 +47,7 @@
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
@ -217,13 +217,16 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
***/
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* JJH: Would it be useful to let the errmgr know what we are doing here? */
|
||||
/*
|
||||
* Notify the Global SnapC component regarding new job
|
||||
*/
|
||||
if (ORTE_JOB_STATE_RESTART != jdata->state) {
|
||||
if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(jdata->jobid) ) ) {
|
||||
/* Silent Failure :/ JJH */
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
@ -1388,7 +1391,8 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
* an error unless it was specifically commanded
|
||||
*/
|
||||
if (jdata->state < ORTE_JOB_STATE_TERMINATED ||
|
||||
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
|
||||
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP ||
|
||||
jdata->controls & ORTE_JOB_CONTROL_RECOVERABLE) {
|
||||
for (i=0; i < jdata->procs->size; i++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
/* the proc array may no longer be left justified, so
|
||||
@ -1396,6 +1400,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine how the process state affects the job state
|
||||
*/
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) {
|
||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
if (!jdata->abort) {
|
||||
@ -1406,7 +1414,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
break;
|
||||
} else if (ORTE_PROC_STATE_ABORTED == proc->state) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED;
|
||||
if (!jdata->abort) {
|
||||
@ -1417,7 +1424,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
break;
|
||||
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
|
||||
if (!jdata->abort) {
|
||||
@ -1428,7 +1434,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
break;
|
||||
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
|
||||
if (!jdata->abort) {
|
||||
@ -1445,7 +1450,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
break;
|
||||
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
||||
/* we ordered this proc to die, so it isn't an abnormal termination
|
||||
* and we don't flag it as such - just check the remaining jobs to
|
||||
@ -1471,6 +1475,30 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the errmgr for this process, if necessary
|
||||
*/
|
||||
if (ORTE_PROC_STATE_ABORTED == proc->state ||
|
||||
ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state ||
|
||||
ORTE_PROC_STATE_TERM_WO_SYNC == proc->state ||
|
||||
ORTE_PROC_STATE_KILLED_BY_CMD == proc->state ) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:check_job_completed "
|
||||
"Declared job %s %s by proc %s with code %d (0x%x vs 0x%x)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
(jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ?
|
||||
"killed by cmd" : "aborted"),
|
||||
ORTE_NAME_PRINT(&(proc->name)),
|
||||
proc->exit_code,
|
||||
proc->last_errmgr_state, proc->state));
|
||||
/* Only report escalations in the fault state */
|
||||
if( proc->last_errmgr_state < proc->state ) {
|
||||
proc->last_errmgr_state = proc->state;
|
||||
orte_errmgr.proc_aborted(&(proc->name), proc->exit_code);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1492,19 +1520,14 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
goto CHECK_ALL_JOBS;
|
||||
} else if (ORTE_JOB_STATE_ABORTED == jdata->state ||
|
||||
ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state ||
|
||||
ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:check_job_completed declared job %s aborted by proc %s with code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
(NULL == jdata->aborted_proc) ? "unknown" : ORTE_NAME_PRINT(&(jdata->aborted_proc->name)),
|
||||
(NULL == jdata->aborted_proc) ? ORTE_ERROR_DEFAULT_EXIT_CODE : jdata->aborted_proc->exit_code));
|
||||
/* report this to the errmgr */
|
||||
ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state ||
|
||||
ORTE_JOB_STATE_KILLED_BY_CMD == jdata->state ) {
|
||||
/* report this to the errmgr
|
||||
* (if we know which process caused this, then it was reported above)
|
||||
*/
|
||||
if (NULL == jdata->aborted_proc) {
|
||||
/* we don't know who caused us to abort */
|
||||
orte_errmgr.proc_aborted(ORTE_NAME_INVALID, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
} else {
|
||||
orte_errmgr.proc_aborted(&(jdata->aborted_proc->name), jdata->aborted_proc->exit_code);
|
||||
}
|
||||
goto CHECK_ALL_JOBS;
|
||||
} else if (jdata->num_terminated >= jdata->num_procs) {
|
||||
@ -1521,7 +1544,9 @@ CHECK_ALL_JOBS:
|
||||
/* if this job is a continuously operating one, then don't do
|
||||
* anything further - just return here
|
||||
*/
|
||||
if (NULL != jdata && ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls) {
|
||||
if (NULL != jdata &&
|
||||
(ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls ||
|
||||
ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls) ) {
|
||||
goto CHECK_ALIVE;
|
||||
}
|
||||
|
||||
@ -1634,6 +1659,13 @@ CHECK_ALIVE:
|
||||
ORTE_JOBID_PRINT(job->jobid)));
|
||||
one_still_alive = true;
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:check_job_completed job %s is terminated (%d vs %d [0x%x])",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job->jobid),
|
||||
job->num_terminated, job->num_procs, jdata->state ));
|
||||
}
|
||||
}
|
||||
/* if a job is still alive, we just return */
|
||||
if (one_still_alive) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -1343,6 +1343,8 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
|
||||
int n, i, j;
|
||||
orte_proc_t *proc, *proc_from_node;
|
||||
orte_node_t *node_from_map, *node;
|
||||
orte_odls_job_t *jobdat = NULL;
|
||||
opal_list_item_t *item = NULL;
|
||||
|
||||
/* set the state to restart */
|
||||
jdata->state = ORTE_JOB_STATE_RESTART;
|
||||
@ -1354,6 +1356,7 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
|
||||
if (ORTE_PROC_STATE_TERMINATED < proc->state) {
|
||||
/* this proc abnormally terminated */
|
||||
proc->state = ORTE_PROC_STATE_RESTART;
|
||||
proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
|
||||
proc->pid = 0;
|
||||
/* remove the proc from the node upon which it was mapped
|
||||
*
|
||||
@ -1394,8 +1397,14 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
/* adjust job accounting */
|
||||
if( jdata->num_terminated > 0 ) {
|
||||
jdata->num_terminated--;
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"plm:base:reset_job() WARNING: Prevented num_terminated from becoming < 0!"));
|
||||
}
|
||||
}
|
||||
}
|
||||
/* clear the info on who aborted */
|
||||
jdata->abort = false;
|
||||
@ -1406,6 +1415,18 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
|
||||
/* since every daemon will be reporting status for every proc, reset these to zero */
|
||||
jdata->num_launched = 0;
|
||||
jdata->num_reported = 0;
|
||||
|
||||
/* Clean up the orte_odls_job_t structure for this job */
|
||||
jobdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
if (jobdat->jobid == jdata->jobid) {
|
||||
jobdat->num_participating = -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* since we are restarting the failed proc, reset the exit status */
|
||||
ORTE_RESET_EXIT_STATUS();
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
@ -58,6 +58,7 @@ typedef uint16_t orte_proc_state_t;
|
||||
#define ORTE_PROC_STATE_ABORTED_BY_SIG 0x0400 /* process aborted by signal */
|
||||
#define ORTE_PROC_STATE_TERM_WO_SYNC 0x0800 /* process exit'd w/o required sync */
|
||||
#define ORTE_PROC_STATE_KILLED_BY_CMD 0x1000 /* process was killed by ORTE cmd */
|
||||
#define ORTE_PROC_STATE_COMM_FAILED 0x2000 /* process communication has failed */
|
||||
|
||||
|
||||
/*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
@ -323,8 +323,16 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
/* increment the #daemons terminated so we will exit properly */
|
||||
jdata->num_terminated++;
|
||||
#if 0
|
||||
/* report that the daemon has failed so we can exit */
|
||||
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
#else
|
||||
/* JJH: Look into a better way of doing this. If we let the daemon
|
||||
* know, then it kills the job when we are trying to restart.. */
|
||||
opal_output(0, "%s daemon %s failed. SKIPPING orte_plm_base_launch_failed()",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&daemon->name));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
@ -19,6 +19,10 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/util/if.h"
|
||||
@ -708,7 +712,55 @@ int orte_rmaps_base_define_daemons(orte_job_map_t *map)
|
||||
if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
|
||||
map->daemon_vpid_start = proc->name.vpid;
|
||||
}
|
||||
} else {
|
||||
}
|
||||
/*
|
||||
* If we are launching on a node where there used to be a daemon, but
|
||||
* it had previously failed, try to relaunch it. (Daemon Recovery) Do
|
||||
* this ONLY if there are procs mapped to that daemon!
|
||||
*/
|
||||
else if(node->daemon->state > ORTE_PROC_STATE_UNTERMINATED ) {
|
||||
/* If no processes are to be launched on this node, then exclude it */
|
||||
if( 0 >= node->num_procs ) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:base:define_daemons Skipping the Recovery of daemon %s [0x%x] Launched: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&node->daemon->name),
|
||||
node->daemon->state,
|
||||
(node->daemon_launched ? "T" : "F")
|
||||
));
|
||||
/* since this daemon exists but is not needed, then flag it
|
||||
* as "launched" to avoid relaunching it for no reason
|
||||
*/
|
||||
node->daemon_launched = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:base:define_daemons RECOVERING daemon %s [0x%x] Launched: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&node->daemon->name),
|
||||
node->daemon->state,
|
||||
(node->daemon_launched ? "T" : "F")
|
||||
));
|
||||
|
||||
/* flag that the daemon is no longer launched */
|
||||
node->daemon_launched = false;
|
||||
|
||||
/* set the state to indicate launch is in progress */
|
||||
node->daemon->state = ORTE_PROC_STATE_RESTART;
|
||||
|
||||
free(node->daemon->rml_uri);
|
||||
node->daemon->rml_uri = NULL;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:base:define_daemons add new daemon %s (Recovering old daemon)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&node->daemon->name)));
|
||||
|
||||
/* track number of daemons to be launched */
|
||||
++map->num_new_daemons;
|
||||
}
|
||||
else {
|
||||
/* this daemon was previously defined - flag it */
|
||||
node->daemon_launched = true;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -116,7 +119,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
int rc;
|
||||
float avgload, minload;
|
||||
orte_node_t *node, *nd=NULL, *oldnode;
|
||||
orte_rmaps_res_ftgrp_t *ftgrp, *target;
|
||||
orte_rmaps_res_ftgrp_t *ftgrp, *target = NULL;
|
||||
orte_vpid_t totprocs, lowprocs, num_assigned;
|
||||
FILE *fp;
|
||||
char *ftinput;
|
||||
@ -195,6 +198,11 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
oldnode = proc->node;
|
||||
/* point to the app */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx);
|
||||
if( NULL == app ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
goto error;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: proc %s from node %s is to be restarted",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -257,14 +265,35 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
* and -host options
|
||||
*/
|
||||
if (NULL == target) {
|
||||
nd = oldnode; /* put it back where it was if nothing else is found */
|
||||
totprocs = 1000000;
|
||||
nd = NULL;
|
||||
|
||||
/*
|
||||
* Get a list of all nodes
|
||||
*/
|
||||
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||
map = jdata->map;
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, map->policy))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
|
||||
&num_slots,
|
||||
app,
|
||||
map->policy))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Ask the ErrMgr components if they have a suggestion for this process */
|
||||
orte_errmgr_base_suggest_map_targets(proc, proc->node, &node_list);
|
||||
|
||||
nd = (orte_node_t*)opal_list_get_first(&node_list);
|
||||
if( NULL == nd ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Look through the list for the least loaded machine.
|
||||
*/
|
||||
nd = oldnode; /* Put it back where it was if nothing else is found */
|
||||
totprocs = 1000000;
|
||||
/* find the lightest loaded node while deconstructing the list */
|
||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
node = (orte_node_t*)item;
|
||||
@ -280,9 +309,18 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
"%s rmaps:resilient: no avail fault groups found - placing proc on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
nd->name));
|
||||
/* put proc on the found node */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
|
||||
NULL, jdata->map->oversubscribe, false, &proc))) {
|
||||
|
||||
/*
|
||||
* Put the process on the found node (add it if not already in the map)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata,
|
||||
nd,
|
||||
jdata->map->cpus_per_rank,
|
||||
proc->app_idx,
|
||||
NULL,
|
||||
jdata->map->oversubscribe,
|
||||
false,
|
||||
&proc))) {
|
||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||
* really isn't an error
|
||||
*/
|
||||
@ -291,12 +329,15 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
/* flag the proc state as non-launched so we'll know to launch it */
|
||||
proc->state = ORTE_PROC_STATE_INIT;
|
||||
|
||||
/* update the node and local ranks so static ports can
|
||||
* be properly selected if active
|
||||
*/
|
||||
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
|
||||
|
||||
continue;
|
||||
}
|
||||
/* if we did find a target, re-map the proc to the lightest loaded
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
@ -163,6 +163,8 @@ BEGIN_C_DECLS
|
||||
*/
|
||||
int orte_rml_ftrm_ft_event(int state);
|
||||
|
||||
int orte_rml_ftrm_purge(orte_process_name_t *peer);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -81,7 +81,9 @@ orte_rml_module_t orte_rml_ftrm_module = {
|
||||
orte_rml_ftrm_add_exception_handler,
|
||||
orte_rml_ftrm_del_exception_handler,
|
||||
|
||||
orte_rml_ftrm_ft_event
|
||||
orte_rml_ftrm_ft_event,
|
||||
|
||||
orte_rml_ftrm_purge
|
||||
};
|
||||
|
||||
int rml_ftrm_output_handle;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -454,3 +454,18 @@ int orte_rml_ftrm_ft_event(int state)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Purge hook of the ftrm RML module.
 *
 * Emits a verbose trace message, then forwards the request to
 * orte_rml_ftrm_wrapped_module.purge when that function pointer is set.
 *
 * Returns the error code reported by the wrapped purge if it fails;
 * ORTE_SUCCESS otherwise (including when there is no wrapped purge to call).
 */
int orte_rml_ftrm_purge(orte_process_name_t *peer)
{
    int rc;

    opal_output_verbose(20, rml_ftrm_output_handle,
                        "orte_rml_ftrm: purge()");

    /* a wrapped module without a purge function is not treated as an error */
    if (NULL == orte_rml_ftrm_wrapped_module.purge) {
        return ORTE_SUCCESS;
    }

    rc = orte_rml_ftrm_wrapped_module.purge(peer);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    return ORTE_SUCCESS;
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
@ -179,6 +179,8 @@ void orte_rml_oob_exception_callback(const orte_process_name_t *peer,
|
||||
orte_rml_exception_t exception);
|
||||
|
||||
|
||||
int orte_rml_oob_purge(orte_process_name_t *peer);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -100,10 +100,14 @@ orte_rml_oob_module_t orte_rml_oob_module = {
|
||||
orte_rml_oob_add_exception,
|
||||
orte_rml_oob_del_exception,
|
||||
|
||||
orte_rml_oob_ft_event
|
||||
orte_rml_oob_ft_event,
|
||||
|
||||
orte_rml_oob_purge
|
||||
}
|
||||
};
|
||||
|
||||
/* Local variables */
|
||||
static bool init_done = false;
|
||||
|
||||
static int
|
||||
rml_oob_open(void)
|
||||
@ -134,6 +138,11 @@ rml_oob_close(void)
|
||||
static orte_rml_module_t*
|
||||
rml_oob_init(int* priority)
|
||||
{
|
||||
if (init_done) {
|
||||
*priority = 1;
|
||||
return &orte_rml_oob_module.super;
|
||||
}
|
||||
|
||||
if (mca_oob_base_init() != ORTE_SUCCESS)
|
||||
return NULL;
|
||||
*priority = 1;
|
||||
@ -156,6 +165,7 @@ rml_oob_init(int* priority)
|
||||
orte_rml_oob_module.active_oob->oob_exception_callback =
|
||||
orte_rml_oob_exception_callback;
|
||||
|
||||
init_done = true;
|
||||
return &orte_rml_oob_module.super;
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -12,6 +15,7 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
@ -75,3 +79,37 @@ orte_rml_oob_get_new_name(orte_process_name_t *name)
|
||||
return orte_rml_oob_module.active_oob->oob_get_new_name(name);
|
||||
|
||||
}
|
||||
|
||||
int
|
||||
orte_rml_oob_purge(orte_process_name_t *peer)
|
||||
{
|
||||
opal_list_item_t *item, *next;
|
||||
orte_rml_oob_queued_msg_t *qmsg;
|
||||
orte_rml_oob_msg_header_t *hdr;
|
||||
orte_process_name_t step;
|
||||
|
||||
/* clear the oob contact info and pending messages */
|
||||
orte_rml_oob_module.active_oob->oob_set_addr(peer, NULL);
|
||||
|
||||
/* clear our message queue */
|
||||
OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
|
||||
item = opal_list_get_first(&orte_rml_oob_module.queued_routing_messages);
|
||||
while (item != opal_list_get_end(&orte_rml_oob_module.queued_routing_messages)) {
|
||||
next = opal_list_get_next(item);
|
||||
qmsg = (orte_rml_oob_queued_msg_t*)item;
|
||||
hdr = (orte_rml_oob_msg_header_t*) qmsg->payload[0].iov_base;
|
||||
step = orte_routed.get_route(&hdr->destination);
|
||||
if (peer->jobid == hdr->destination.jobid &&
|
||||
peer->vpid == hdr->destination.vpid) {
|
||||
opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, item);
|
||||
OBJ_RELEASE(item);
|
||||
} else if (step.jobid == hdr->destination.jobid &&
|
||||
step.vpid == hdr->destination.vpid) {
|
||||
opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, item);
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
item = next;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -575,6 +575,12 @@ typedef int (*orte_rml_module_exception_fn_t)(orte_rml_exception_callback_t cbfu
|
||||
*/
|
||||
typedef int (*orte_rml_module_ft_event_fn_t)(int state);
|
||||
|
||||
/**
|
||||
* Purge the RML/OOB of contact info and pending messages
|
||||
* to/from a specified process. Used when a process aborts
|
||||
* and is to be restarted
|
||||
*/
|
||||
typedef int (*orte_rml_module_purge_fn_t)(struct orte_process_name_t *peer);
|
||||
|
||||
/* ******************************************************************** */
|
||||
|
||||
@ -629,6 +635,9 @@ struct orte_rml_module_t {
|
||||
|
||||
/** Fault tolerance handler */
|
||||
orte_rml_module_ft_event_fn_t ft_event;
|
||||
|
||||
/** Purge information */
|
||||
orte_rml_module_purge_fn_t purge;
|
||||
};
|
||||
/** Convenience typedef */
|
||||
typedef struct orte_rml_module_t orte_rml_module_t;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -58,12 +58,19 @@ opal_list_t orte_routed_base_components;
|
||||
|
||||
static orte_routed_component_t *active_component = NULL;
|
||||
static bool component_open_called = false;
|
||||
static bool opened = false;
|
||||
static bool selected = false;
|
||||
|
||||
int
|
||||
orte_routed_base_open(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (opened) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
opened = true;
|
||||
|
||||
/* setup the output stream */
|
||||
orte_routed_base_output = opal_output_open(NULL);
|
||||
|
||||
@ -88,6 +95,11 @@ orte_routed_base_select(void)
|
||||
orte_routed_component_t *best_component = NULL;
|
||||
orte_routed_module_t *best_module = NULL;
|
||||
|
||||
if (selected) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
selected = true;
|
||||
|
||||
/*
|
||||
* Select the best component
|
||||
*/
|
||||
@ -134,6 +146,9 @@ orte_routed_base_close(void)
|
||||
|
||||
OBJ_DESTRUCT(&orte_routed_base_components);
|
||||
|
||||
opened = false;
|
||||
selected = false;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -184,6 +187,8 @@ static int delete_route(orte_process_name_t *proc)
|
||||
* the routing tree
|
||||
*/
|
||||
|
||||
/* remove any entries in the RML for this process */
|
||||
rc = orte_rml.purge(proc);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -279,6 +284,9 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
{
|
||||
orte_process_name_t *ret, daemon;
|
||||
int rc;
|
||||
int32_t i;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
@ -342,7 +350,37 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
} else {
|
||||
/* otherwise, if I am the HNP, send to the daemon */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/*
|
||||
* Check to make sure the daemon is active, if not then return an INVALID name
|
||||
* JJH: There should be a faster way to do this check, but for now just iterate...
|
||||
*/
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
ret = ORTE_NAME_INVALID;
|
||||
goto found;
|
||||
}
|
||||
|
||||
for(i = 0; i < jdata->procs->size; ++i) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if( proc->name.vpid != daemon.vpid) {
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
||||
"%s routed_cm_get: Checking process %15s state 0x%x",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(proc->name)),
|
||||
proc->state));
|
||||
|
||||
if( proc->state <= ORTE_PROC_STATE_UNTERMINATED ) {
|
||||
ret = &daemon;
|
||||
} else {
|
||||
ret = ORTE_NAME_INVALID;
|
||||
}
|
||||
goto found;
|
||||
}
|
||||
} else {
|
||||
/* send to the HNP for routing */
|
||||
ret = ORTE_PROC_MY_HNP;
|
||||
@ -727,7 +765,9 @@ static int update_routing_tree(void)
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
{
|
||||
orte_routed_tree_t *nm;
|
||||
orte_vpid_t i;
|
||||
int32_t i;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
|
||||
/* if I am anything other than a daemon or the HNP, this
|
||||
* is a meaningless command as I am not allowed to route
|
||||
@ -741,13 +781,42 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
return ORTE_PROC_MY_HNP->vpid;
|
||||
}
|
||||
|
||||
/* for the HNP, the cm routing tree is direct to all known daemons */
|
||||
/* for the HNP, the cm routing tree is direct to all known alive daemons */
|
||||
if (NULL != children) {
|
||||
for (i=1; i < orte_process_info.num_procs; i++) {
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
for(i = 0; i < jdata->procs->size; ++i) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if( proc->name.vpid == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if( proc->state <= ORTE_PROC_STATE_UNTERMINATED &&
|
||||
NULL != proc->rml_uri ) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
||||
"%s get_routing_tree: Adding process %15s state 0x%x",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(proc->name)),
|
||||
proc->state));
|
||||
|
||||
nm = OBJ_NEW(orte_routed_tree_t);
|
||||
nm->vpid = i;
|
||||
nm->vpid = proc->name.vpid;
|
||||
opal_bitmap_clear_all_bits(&nm->relatives);
|
||||
opal_list_append(children, &nm->super);
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
||||
"%s get_routing_tree: Skipped process %15s state 0x%x (non functional daemon)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(proc->name)),
|
||||
proc->state));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* I have no parent */
|
||||
|
@ -781,6 +781,7 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
proc->pid = 0;
|
||||
proc->local_rank = ORTE_LOCAL_RANK_INVALID;
|
||||
proc->node_rank = ORTE_NODE_RANK_INVALID;
|
||||
proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
|
||||
proc->state = ORTE_PROC_STATE_UNDEF;
|
||||
proc->app_idx = 0;
|
||||
proc->slot_list = NULL;
|
||||
|
@ -285,6 +285,7 @@ typedef uint8_t orte_job_controls_t;
|
||||
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x10
|
||||
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x20
|
||||
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40
|
||||
#define ORTE_JOB_CONTROL_RECOVERABLE 0x80
|
||||
|
||||
#define ORTE_MAPPING_POLICY OPAL_UINT16
|
||||
/* put the rank assignment method in the upper 8 bits */
|
||||
@ -419,6 +420,8 @@ struct orte_proc_t {
|
||||
* know which static IP port to use
|
||||
*/
|
||||
orte_node_rank_t node_rank;
|
||||
/* Last state used to trigger the errmgr for this proc */
|
||||
orte_proc_state_t last_errmgr_state;
|
||||
/* process state */
|
||||
orte_proc_state_t state;
|
||||
/* exit code */
|
||||
|
@ -5,9 +5,12 @@
|
||||
* A program that just spins - provides mechanism for testing user-driven
|
||||
* abnormal program termination
|
||||
*/
|
||||
#include "opal_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
@ -22,7 +25,15 @@ int main(int argc, char* argv[])
|
||||
while (1) {
|
||||
i++;
|
||||
pi = i / 3.14159256;
|
||||
if (i > 100) i = 0;
|
||||
if (i > 100) {
|
||||
/* need to progress so we can
|
||||
* wake up if our daemon goes
|
||||
* away!
|
||||
*/
|
||||
opal_progress();
|
||||
/* reset the counter so we loop */
|
||||
i = 0;
|
||||
}
|
||||
}
|
||||
|
||||
orte_finalize();
|
||||
|
@ -81,6 +81,7 @@
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
@ -1127,6 +1128,15 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
!orte_never_launched) {
|
||||
/* if the debuggers were run, clean up */
|
||||
orte_debugger_finalize();
|
||||
|
||||
/*
|
||||
* Turn off the errmgr recovery functionality, if it was enabled.
|
||||
* This keeps the errmgr from trying to recover from the shutdown
|
||||
* procedure.
|
||||
*/
|
||||
orte_errmgr_base_enable_recovery = false;
|
||||
orte_errmgr_base_shutting_down = true;
|
||||
|
||||
/* terminate the orteds - they will automatically kill
|
||||
* their local procs
|
||||
*/
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user