2005-01-21 17:49:14 +00:00
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
2005-11-05 19:57:48 +00:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2005-09-20 17:09:11 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-01-21 17:49:14 +00:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 12:43:37 +00:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2009-05-20 13:16:31 +00:00
|
|
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
2005-01-21 17:49:14 +00:00
|
|
|
* $COPYRIGHT$
|
2005-09-20 17:09:11 +00:00
|
|
|
*
|
2005-01-21 17:49:14 +00:00
|
|
|
* Additional copyrights may follow
|
2005-09-20 17:09:11 +00:00
|
|
|
*
|
2005-01-21 17:49:14 +00:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/** @file:
|
|
|
|
*
|
2010-03-23 21:28:02 +00:00
|
|
|
* The Open RTE Error and Recovery Manager (ErrMgr)
|
|
|
|
*
|
|
|
|
* This framework is a composite framework in which multiple components
|
|
|
|
* are often active at the same time and may work on a single external call
|
|
|
|
* to the interface functions.
|
|
|
|
*
|
|
|
|
* This framework allows the user to compose a job recovery policy from multiple
|
|
|
|
* individual components. Each component will operate on the function call if it
|
|
|
|
* has a registered function. If no component registers a function then the base
|
|
|
|
* functionality/policy is used.
|
|
|
|
*
|
|
|
|
* For example, consider the 3 components on the left (C1, C2, C3), and the
|
|
|
|
* API function calls across the top:
|
|
|
|
* | Priority | Fn1 | Fn2 | Fn3 | Fn4 |
|
|
|
|
* -----+----------+------+------+------+------+
|
|
|
|
* base | --- | act0 | --- | --- | act6 |
|
|
|
|
* C1 | 10 | act1 | --- | act2 | --- |
|
|
|
|
* C2 | 20 | --- | act3 | --- | --- |
|
|
|
|
* C3 | 30 | act4 | act5 | --- | --- |
|
|
|
|
* -----+----------+------+------+------+------+
|
|
|
|
* A call to Fn1 will result in:
|
|
|
|
* act4, act1
|
|
|
|
* A call to Fn2 will result in:
|
|
|
|
* act5, act3
|
|
|
|
* A call to Fn3 will result in:
|
|
|
|
* act2
|
|
|
|
* A call to Fn4 will result in:
|
|
|
|
* act6
|
|
|
|
*
|
|
|
|
* Notice that when the base function is overridden it is not called. The base
|
|
|
|
* function is only called when the function has not been overridden by a
|
|
|
|
* component.
|
2005-01-21 17:49:14 +00:00
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
#ifndef ORTE_MCA_ERRMGR_H
|
|
|
|
#define ORTE_MCA_ERRMGR_H
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* includes
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
#include "orte_config.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "orte/constants.h"
|
|
|
|
#include "orte/types.h"
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2005-12-08 00:05:26 +00:00
|
|
|
#include "opal/mca/mca.h"
|
2010-03-23 21:28:02 +00:00
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
|
|
|
|
#include "opal/class/opal_object.h"
|
|
|
|
#include "opal/util/output.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "opal/util/error.h"
|
2005-03-14 20:57:21 +00:00
|
|
|
|
The current errmgr.register_callback API takes a jobid as one of its arguments. The intent was to have the errmgr check the jobid of the job being reported to it and, if it matches the jobid that was registered, call the specified callback function.
Unfortunately, we assign the jobid during the plm.spawn procedure - which means it happens -after- control of the job has passed out of the range of mpirun (or whatever program is spawning the job), so it is too late for that main program to register a callback function. If the main program registers the callback -after- we return from plm.spawn, then it (a) cannot get a callback for failed-to-start, and (b) will miss the callback if a proc aborts in the time between job launch and the call to errmgr.register_callback.
This commit fixes the problem by adding callback-related fields to the orte_job_t object. Thus, the main program can specify what job states should initiate a callback, what function is to be called, and what data is to be passed back by simply filling in the orte_job_t fields prior to calling plm.spawn.
Also, fully implement the "copy" function for the orte_job_t object.
NOTE: as a result of this change, the errmgr.register_callback API may no longer be of any value.
This commit was SVN r21200.
2009-05-11 03:38:15 +00:00
|
|
|
#include "orte/runtime/orte_globals.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "orte/mca/plm/plm_types.h"
|
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
2005-05-11 20:21:10 +00:00
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
/*
|
|
|
|
* Macro definitions
|
|
|
|
*/
|
2005-09-20 17:09:11 +00:00
|
|
|
/*
|
|
|
|
* These macros and associated error name array are used to output intelligible error
|
|
|
|
* messages.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Expands to opal_strerror(n); intended for obtaining a readable name for error code n. */
#define ORTE_ERROR_NAME(n) opal_strerror(n)
|
2005-03-14 20:57:21 +00:00
|
|
|
/*
 * Log error code n from the current call site.
 *
 * Forwards to orte_errmgr_base_log(), passing the call site's __FILE__ and
 * __LINE__ as the filename and line arguments.  The macro body must sit on
 * the line immediately following the continuation backslash.
 */
#define ORTE_ERROR_LOG(n) \
    orte_errmgr_base_log(n, __FILE__, __LINE__)
|
|
|
|
|
|
|
|
/**
|
2010-03-23 21:28:02 +00:00
|
|
|
* This is not part of any module so it can be used at any time!
|
2008-02-28 01:57:57 +00:00
|
|
|
*/
|
|
|
|
/*
 * error_code: error value to report.
 * filename, line: source location of the report; the ORTE_ERROR_LOG macro
 *                 supplies __FILE__ and __LINE__ for these.
 */
ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, int line);
|
|
|
|
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2010-03-23 21:28:02 +00:00
|
|
|
/**
|
|
|
|
* Module initialization function.
|
|
|
|
* Public interface. Will be called in each of the active composite components
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
|
|
|
/* Pointer type for a module initialization function: takes no arguments, returns an int status. */
typedef int (*orte_errmgr_base_module_init_fn_t)(void);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Module finalization function.
|
|
|
|
* Public interface. Will be called in each of the active composite components
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
|
|
|
/* Pointer type for a module finalization function: takes no arguments, returns an int status. */
typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Internal Composite Interfaces
|
|
|
|
*/
|
|
|
|
/**
|
|
|
|
* Predicted process/node failure notification
|
|
|
|
* Composite interface. Called in priority order.
|
|
|
|
*
|
|
|
|
* @param[in] proc_list List of processes (or NULL if none)
|
|
|
|
* @param[in] node_list List of nodes (or NULL if none)
|
|
|
|
* @param[in] suggested_nodes List of suggested nodes to use on recovery (or NULL if none)
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
|
|
|
/*
 * Pointer type for a predicted-fault handler.  Each argument is a pointer to
 * an array of strings (char ***): the processes, the nodes, and the nodes
 * suggested for use on recovery.  Returns an int status.
 */
typedef int (*orte_errmgr_base_predicted_fault_fn_t)
    (char ***proc_list, char ***node_list, char ***suggested_nodes);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Actual process failure notification
|
|
|
|
* Composite interface. Called in priority order.
|
|
|
|
*
|
|
|
|
* @param[in] proc_name Name of the failed process
|
|
|
|
* @param[in] state State of the failed process
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
2005-01-21 17:49:14 +00:00
|
|
|
*/
|
2010-03-23 21:28:02 +00:00
|
|
|
typedef int (*orte_errmgr_base_process_fault_fn_t)
|
|
|
|
(orte_job_t *jdata, orte_process_name_t *proec_name, orte_proc_state_t state, int *stack_state);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Suggest a node to map a restarting process onto
|
|
|
|
* Composite interface. Called in priority order.
|
|
|
|
*
|
|
|
|
* @param[in] proc Process that is being mapped
|
|
|
|
* @param[in] oldnode Previous node where this process resided
|
|
|
|
* @param[in|out] node_list List of nodes to select from
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
|
|
|
typedef int (*orte_errmgr_base_suggest_map_targets_fn_t)
|
|
|
|
(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Handle fault tolerance updates
|
|
|
|
*
|
|
|
|
* @param[in] state Fault tolerance state update
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
|
|
|
/* Pointer type for a fault-tolerance event handler: receives the FT state update, returns an int status. */
typedef int (*orte_errmgr_base_ft_event_fn_t)(int state);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* External API Functions - Implemented in errmgr/base/errmgr_base_fns.c
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * External API entry point for predicted process/node failure notification.
 * Takes the same arguments as orte_errmgr_base_predicted_fault_fn_t:
 * pointers to the process list, the node list, and the list of nodes
 * suggested for recovery.  Returns an int status.
 */
ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(char ***proc_list,
                                                   char ***node_list,
                                                   char ***suggested_nodes);
|
|
|
|
ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
|
|
|
orte_node_t *oldnode,
|
|
|
|
opal_list_t *node_list);
|
|
|
|
/* External API entry point for fault-tolerance updates; same signature as orte_errmgr_base_ft_event_fn_t. */
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
|
|
|
|
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
/**
|
|
|
|
* Alert - process aborted
|
2008-02-28 01:57:57 +00:00
|
|
|
* This function is called by the PLM when a remote process aborts during execution. Actions taken
|
|
|
|
* in response to the abnormal termination of a remote application process will vary across
|
2006-09-14 21:29:51 +00:00
|
|
|
* the various errmgr components.
|
2008-02-28 01:57:57 +00:00
|
|
|
*
|
2006-09-14 21:29:51 +00:00
|
|
|
* NOTE: Local process errors should always be reported through the error_detected interface and
|
|
|
|
* NOT here.
|
2008-02-28 01:57:57 +00:00
|
|
|
*
|
|
|
|
* @param *name Pointer to the name of the proc that aborted
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS Whatever action that was taken was successful
|
|
|
|
* @retval ORTE_ERROR Appropriate error code
|
2005-03-14 20:57:21 +00:00
|
|
|
*/
|
2010-03-23 21:28:02 +00:00
|
|
|
/*
 * name:      pointer to the name of the proc that aborted.
 * exit_code: NOTE(review): not described in this header; presumably the exit
 *            status associated with the aborted proc - confirm.
 */
ORTE_DECLSPEC extern int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code);
|
|
|
|
/* Pointer type for a module's proc_aborted entry; same signature as orte_errmgr_base_proc_aborted(). */
typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
|
2005-03-14 20:57:21 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - incomplete start of a job
|
2008-02-28 01:57:57 +00:00
|
|
|
* This function is called by the PLM when an attempted launch of a job encounters failure of
|
2006-09-14 21:29:51 +00:00
|
|
|
* one or more processes to start. The strategy for dealing
|
|
|
|
* with this "incomplete start" situation varies across the various errmgr components.
|
2005-09-20 17:09:11 +00:00
|
|
|
*
|
2005-03-14 20:57:21 +00:00
|
|
|
* This function is only called by the respective process launcher, which is responsible
|
2006-09-14 21:29:51 +00:00
|
|
|
* for detecting incomplete starts. If on a daemon, the function simply updates the
|
|
|
|
* process state to indicate failure to launch - this initiates a trigger that goes to
|
|
|
|
* the respective HNP for response.
|
|
|
|
*
|
|
|
|
* NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden
|
|
|
|
* from taking any action to this function call. Instead, they are restricted to simply
|
|
|
|
* returning.
|
|
|
|
*
|
2008-02-28 01:57:57 +00:00
|
|
|
* @param job Job that failed to start
|
2006-09-14 21:29:51 +00:00
|
|
|
*
|
2008-02-28 01:57:57 +00:00
|
|
|
* @retval ORTE_SUCCESS Whatever action that was taken was successful
|
|
|
|
* @retval ORTE_ERROR Appropriate error code
|
2005-03-14 20:57:21 +00:00
|
|
|
*/
|
2010-03-23 21:28:02 +00:00
|
|
|
/*
 * job:       job that failed to start.
 * exit_code: NOTE(review): not described in this header; presumably the exit
 *            status associated with the failed launch - confirm.
 */
ORTE_DECLSPEC extern int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code);
|
|
|
|
/* Pointer type for a module's incomplete_start entry; same signature as orte_errmgr_base_incomplete_start(). */
typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
|
2005-03-14 20:57:21 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - self aborting
|
2008-02-28 01:57:57 +00:00
|
|
|
* This function is called when a process is aborting due to some internal error.
|
|
|
|
* It will finalize the process
|
|
|
|
* itself, and then exit - it takes no other actions. The intent here is to provide
|
2006-09-14 21:29:51 +00:00
|
|
|
* a last-ditch exit procedure that attempts to clean up a little.
|
2005-03-14 20:57:21 +00:00
|
|
|
*/
|
2010-03-23 21:28:02 +00:00
|
|
|
/*
 * error_code: error value for the abort.
 * fmt, ...:   printf-style format string and its arguments.  When the
 *             attribute below is enabled, it marks parameter 2 as the format
 *             and parameter 3 as the first variadic argument.
 *
 * NOTE(review): the attribute is guarded by the _FUNCPTR feature macro even
 * though this is a plain function declaration - confirm this is intended.
 */
ORTE_DECLSPEC extern int orte_errmgr_base_abort(int error_code, char *fmt, ...)
#   if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
    __opal_attribute_format__(__printf__, 2, 3)
#   endif
    ;
|
2010-03-23 21:28:02 +00:00
|
|
|
/*
 * Pointer type for a module's abort entry; same signature as
 * orte_errmgr_base_abort().  The printf-format attribute (format string at
 * parameter 2, variadic arguments from parameter 3) is applied only when the
 * compiler supports format attributes on function pointers.
 */
typedef int (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
#   if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
    __opal_attribute_format__(__printf__, 2, 3)
#   endif
    ;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* If the communication link failed to a peer.
|
|
|
|
* This gives us a chance to recover from this error, or abort.
|
|
|
|
*/
|
|
|
|
/*
 * name:      NOTE(review): presumably the name of the peer whose communication
 *            link failed - confirm.
 * exit_code: NOTE(review): not described in this header - confirm meaning.
 */
ORTE_DECLSPEC extern int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code);
|
|
|
|
/* Pointer type for a module's comm_failed entry; same signature as orte_errmgr_base_comm_failed(). */
typedef int (*orte_errmgr_base_module_comm_failed_fn_t)(orte_process_name_t *name,
                                                        int exit_code);
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Module Structure
|
2005-01-21 17:49:14 +00:00
|
|
|
*/
|
2008-07-28 22:40:57 +00:00
|
|
|
struct orte_errmgr_base_module_2_3_0_t {
|
2010-03-23 21:28:02 +00:00
|
|
|
/* ---- Previous Interfaces (Always call base) -- */
|
2006-09-14 21:29:51 +00:00
|
|
|
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
|
|
|
|
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
|
2010-03-23 21:28:02 +00:00
|
|
|
orte_errmgr_base_module_comm_failed_fn_t comm_failed;
|
2006-09-14 21:29:51 +00:00
|
|
|
orte_errmgr_base_module_abort_fn_t abort;
|
2010-03-23 21:28:02 +00:00
|
|
|
|
|
|
|
/* -------------- Internal Composite Interfaces -- */
|
|
|
|
/** Initialization Function */
|
|
|
|
orte_errmgr_base_module_init_fn_t internal_errmgr_init;
|
|
|
|
/** Finalization Function */
|
|
|
|
orte_errmgr_base_module_finalize_fn_t internal_errmgr_finalize;
|
|
|
|
|
|
|
|
/** Predicted process/node failure notification */
|
|
|
|
orte_errmgr_base_predicted_fault_fn_t internal_predicted_fault;
|
|
|
|
/** Actual process failure notification */
|
|
|
|
orte_errmgr_base_process_fault_fn_t internal_process_fault;
|
|
|
|
/** Suggest a node to map a restarting process onto */
|
|
|
|
orte_errmgr_base_suggest_map_targets_fn_t internal_suggest_map_targets;
|
|
|
|
|
|
|
|
/** Handle any FT Notifications */
|
|
|
|
orte_errmgr_base_ft_event_fn_t internal_ft_event;
|
2005-01-21 17:49:14 +00:00
|
|
|
};
|
|
|
|
|
2008-07-28 22:40:57 +00:00
|
|
|
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
|
|
|
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* ErrMgr Component
|
2005-01-21 17:49:14 +00:00
|
|
|
*/
|
2010-03-23 21:28:02 +00:00
|
|
|
struct orte_errmgr_base_component_3_0_0_t {
|
|
|
|
/** MCA base component */
|
2008-05-06 18:08:45 +00:00
|
|
|
mca_base_component_t base_version;
|
2010-03-23 21:28:02 +00:00
|
|
|
/** MCA base data */
|
2008-07-28 22:40:57 +00:00
|
|
|
mca_base_component_data_t base_data;
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2010-03-23 21:28:02 +00:00
|
|
|
/** Verbosity Level */
|
|
|
|
int verbose;
|
|
|
|
/** Output Handle for opal_output */
|
|
|
|
int output_handle;
|
|
|
|
/** Default Priority */
|
|
|
|
int priority;
|
|
|
|
};
|
|
|
|
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
|
|
|
|
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2010-03-23 21:28:02 +00:00
|
|
|
/*
|
|
|
|
* Global structure for accessing previous error manager functions
|
|
|
|
*/
|
|
|
|
/* Declaration only (extern): the orte_errmgr module object is defined in another translation unit. */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
|
2008-07-28 22:40:57 +00:00
|
|
|
* Macro for use in components that are of type errmgr
|
2005-01-21 17:49:14 +00:00
|
|
|
*/
|
2010-03-23 21:28:02 +00:00
|
|
|
/*
 * Initializer fragment for errmgr components: expands to
 * MCA_BASE_VERSION_2_0_0 followed by the framework name "errmgr" and the
 * framework version numbers 3, 0, 0.  Each continuation backslash must be
 * followed directly by the next line of the macro body.
 */
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
    MCA_BASE_VERSION_2_0_0, \
    "errmgr", 3, 0, 0
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
END_C_DECLS
|
2005-05-11 20:21:10 +00:00
|
|
|
|
2005-01-21 17:49:14 +00:00
|
|
|
#endif
|