/* * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ /** @file: * * The Open RTE Error and Recovery Manager (ErrMgr) * * This framework is a composite framework in which multiple components * are often active at the same time and may work on a single external call * to the interface functions. * * This framework allows the user to compose a job recovery policy from multiple * individual components. Each component will operate on the function call if it * has a registered function. If no component registers a function then the base * functionality/policy is used. * * For example, consider the 3 components on the left (C1, C2, C3), and the * API function calls across the top: * | Priority | Fn1 | Fn2 | Fn3 | Fn4 | * -----+----------+------+------+------+------+ * base | --- | act0 | --- | --- | act6 | * C1 | 10 | act1 | --- | act2 | --- | * C2 | 20 | --- | act3 | --- | --- | * C3 | 30 | act4 | act5 | --- | --- | * -----+----------+------+------+------+------+ * A call to Fn1 will result in: * act4, act1 * A call to Fn2 will result in: * act5, act3 * A call to Fn3 will result in: * act2 * A call to Fn4 will result in: * act6 * * Notice that when the base function is overridden it is not called. The base * function is only called when the function has not been overridden by a * component. * */ #ifndef ORTE_MCA_ERRMGR_H #define ORTE_MCA_ERRMGR_H /* * includes */ #include "orte_config.h" #include "orte/constants.h" #include "orte/types.h" #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/class/opal_object.h" #include "opal/util/output.h" #include "opal/util/error.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/plm/plm_types.h" BEGIN_C_DECLS /* * Macro definitions */ /* * Thess macros and associated error name array are used to output intelligible error * messages. */ #define ORTE_ERROR_NAME(n) opal_strerror(n) #define ORTE_ERROR_LOG(n) \ orte_errmgr_base_log(n, __FILE__, __LINE__) /** * This is not part of any module so it can be used at any time! */ ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, int line); /** * Module initialization function. * Public interface. Will be call in each of the active composite components * * @retval ORTE_SUCCESS The operation completed successfully * @retval ORTE_ERROR An unspecifed error occurred */ typedef int (*orte_errmgr_base_module_init_fn_t) (void); /** * Module finalization function. * Public interface. Will be call in each of the active composite components * * @retval ORTE_SUCCESS The operation completed successfully * @retval ORTE_ERROR An unspecifed error occurred */ typedef int (*orte_errmgr_base_module_finalize_fn_t) (void); /* * Internal Composite Interfaces */ /** * Predicted process/node failure notification * Composite interface. Called in priority order. * * @param[in] proc_list List of processes (or NULL if none) * @param[in] node_list List of nodes (or NULL if none) * @param[in] suggested_nodes List of suggested nodes to use on recovery (or NULL if none) * * @retval ORTE_SUCCESS The operation completed successfully * @retval ORTE_ERROR An unspecifed error occurred */ typedef int (*orte_errmgr_base_predicted_fault_fn_t) (char ***proc_list, char ***node_list, char ***suggested_nodes); /** * Actual process failure notification * Composite interface. Called in priority order. * * @param[in] proc_name Name of the failed processes * @param[in] state State of the failed process * * @retval ORTE_SUCCESS The operation completed successfully * @retval ORTE_ERROR An unspecifed error occurred */ typedef int (*orte_errmgr_base_process_fault_fn_t) (orte_job_t *jdata, orte_process_name_t *proec_name, orte_proc_state_t state, int *stack_state); /** * Suggest a node to map a restarting process onto * Composite interface. Called in priority order. * * @param[in] proc Process that is being mapped * @param[in] oldnode Previous node where this process resided * @param[in|out] node_list List of nodes to select from * * @retval ORTE_SUCCESS The operation completed successfully * @retval ORTE_ERROR An unspecifed error occurred */ typedef int (*orte_errmgr_base_suggest_map_targets_fn_t) (orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); /** * Handle fault tolerance updates * * @param[in] state Fault tolerance state update * * @retval ORTE_SUCCESS The operation completed successfully * @retval ORTE_ERROR An unspecifed error occurred */ typedef int (*orte_errmgr_base_ft_event_fn_t)(int state); /* * External API Functions - Implemented in errmgr/base/errmgr_base_fns.c */ ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(char ***proc_list, char ***node_list, char ***suggested_nodes); ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state); /** * Alert - process aborted * This function is called by the PLM when a remote process aborts during execution. Actions taken * in response to the abnormal termination of a remote application process will vary across * the various errmgr components. * * NOTE: Local process errors should always be reported through the error_detected interface and * NOT here. * * @param *name Pointer to the name of the proc that aborted * * @retval ORTE_SUCCESS Whatever action that was taken was successful * @retval ORTE_ERROR Appropriate error code */ ORTE_DECLSPEC extern int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code); typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code); /** * Alert - incomplete start of a job * This function is called by the PLM when an attempted launch of a job encounters failure of * one or more processes to start. The strategy for dealing * with this "incomplete start" situation varies across the various errmgr components. * * This function is only called by the respective process launcher, which is responsible * for detecting incomplete starts. If on a daemon, the function simply updates the * process state to indicate failure to launch - this initiates a trigger that goes to * the respective HNP for response. * * NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden * from taking any action to this function call. Instead, they are restricted to simply * returning. * * @param job Job that failed to start * * @retval ORTE_SUCCESS Whatever action that was taken was successful * @retval ORTE_ERROR Appropriate error code */ ORTE_DECLSPEC extern int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code); typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code); /** * Alert - self aborting * This function is called when a process is aborting due to some internal error. * It will finalize the process * itself, and then exit - it takes no other actions. The intent here is to provide * a last-ditch exit procedure that attempts to clean up a little. */ ORTE_DECLSPEC extern int orte_errmgr_base_abort(int error_code, char *fmt, ...) # if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR __opal_attribute_format__(__printf__, 2, 3) # endif ; typedef int (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...) # if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR __opal_attribute_format__(__printf__, 2, 3) # endif ; /** * If the communication link failed to a peer. * This gives us a chance to recover from this error, or abort. */ ORTE_DECLSPEC extern int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code); typedef int (*orte_errmgr_base_module_comm_failed_fn_t)(orte_process_name_t *name, int exit_code); /* * Module Structure */ struct orte_errmgr_base_module_2_3_0_t { /* ---- Previous Interfaces (Always call base) -- */ orte_errmgr_base_module_proc_aborted_fn_t proc_aborted; orte_errmgr_base_module_incomplete_start_fn_t incomplete_start; orte_errmgr_base_module_comm_failed_fn_t comm_failed; orte_errmgr_base_module_abort_fn_t abort; /* -------------- Internal Composite Interfaces -- */ /** Initialization Function */ orte_errmgr_base_module_init_fn_t internal_errmgr_init; /** Finalization Function */ orte_errmgr_base_module_finalize_fn_t internal_errmgr_finalize; /** Predicted process/node failure notification */ orte_errmgr_base_predicted_fault_fn_t internal_predicted_fault; /** Actual process failure notification */ orte_errmgr_base_process_fault_fn_t internal_process_fault; /** Suggest a node to map a restarting process onto */ orte_errmgr_base_suggest_map_targets_fn_t internal_suggest_map_targets; /** Handle any FT Notifications */ orte_errmgr_base_ft_event_fn_t internal_ft_event; }; typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t; typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t; /* * ErrMgr Component */ struct orte_errmgr_base_component_3_0_0_t { /** MCA base component */ mca_base_component_t base_version; /** MCA base data */ mca_base_component_data_t base_data; /** Verbosity Level */ int verbose; /** Output Handle for opal_output */ int output_handle; /** Default Priority */ int priority; }; typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t; typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t; /* * Global structure for accessing previous error manager functions */ ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* * Macro for use in components that are of type errmgr */ #define ORTE_ERRMGR_BASE_VERSION_3_0_0 \ MCA_BASE_VERSION_2_0_0, \ "errmgr", 3, 0, 0 END_C_DECLS #endif