e12ca48cd9
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php Documentation: http://osl.iu.edu/research/ft/ Major Changes: -------------- * Added C/R-enabled Debugging support. Enabled with the --enable-crdebug flag. See the following website for more information: http://osl.iu.edu/research/ft/crdebug/ * Added Stable Storage (SStore) framework for checkpoint storage * 'central' component does a direct to central storage save * 'stage' component stages checkpoints to central storage while the application continues execution. * 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress) * 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching) * Added Compression (compress) framework to support * Add two new ErrMgr recovery policies * {{{crmig}}} C/R Process Migration * {{{autor}}} C/R Automatic Recovery * Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component * Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option) * {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342) * {{{OMPI_CR_Restart}}} * {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules) * {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192) * {{{OMPI_CR_Quiesce_start}}} * {{{OMPI_CR_Quiesce_checkpoint}}} * {{{OMPI_CR_Quiesce_end}}} * {{{OMPI_CR_self_register_checkpoint_callback}}} * {{{OMPI_CR_self_register_restart_callback}}} * {{{OMPI_CR_self_register_continue_callback}}} * The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future. * Add a progress meter to: * FileM rsh (filem_rsh_process_meter) * SnapC full (snapc_full_progress_meter) * SStore stage (sstore_stage_progress_meter) * Added 2 new command line options to ompi-restart * --showme : Display the full command line that would have been exec'ed. 
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413) * Deprecated some MCA params: * crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir * snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir * snapc_base_global_shared deprecated, use sstore_stage_global_is_shared * snapc_base_store_in_place deprecated, replaced with different components of SStore * snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref * snapc_base_establish_global_snapshot_dir deprecated, never well supported * snapc_full_skip_filem deprecated, use sstore_stage_skip_filem Minor Changes: -------------- * Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing. * Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components * Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it. * Fixes trac:2208 : Honor various TMPDIR variables instead of forcing {{{/tmp}}} * Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set. * opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality. * Cleanup the CRS framework and components to work with the SStore framework. * Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably). * Add 'quiesce' hook to CRCP for a future enhancement. * We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}. * Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive. * {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked. * {{{opal-restart}}} also supports local decompression before restarting * {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata * {{{orte-restart}}} now uses the SStore framework to work with the metadata * Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality. * Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}. * Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped. * Make sure to decrement the number of 'num_local_procs' in the orted when one goes away. * odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options. * Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities. * Improve the checks for 'already checkpointing' error path. * Add a recovery output timer, to show how long it takes to restart a job * Do a better job of cleaning up the old session directory on restart. * Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment) * Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize. This commit was SVN r23587.
The following Trac tickets were found above: Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924 Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097 Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161 Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192 Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208 Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342 Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
354 строки
12 KiB
C
354 строки
12 KiB
C
/*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/** @file:
 *
 * The Open RTE Error and Recovery Manager (ErrMgr)
 *
 * This framework is a composite framework in which multiple components
 * are often active at the same time and may work on a single external call
 * to the interface functions.
 *
 * This framework allows the user to compose a job recovery policy from multiple
 * individual components. Each component will operate on the function call if it
 * has a registered function. If no component registers a function then the base
 * functionality/policy is used.
 *
 * For example, consider the 3 components on the left (C1, C2, C3), and the
 * API function calls across the top:
 *      | Priority | Fn1  | Fn2  | Fn3  | Fn4  |
 * -----+----------+------+------+------+------+
 * base |   ---    | act0 | ---  | ---  | act6 |
 * C1   |    10    | act1 | ---  | act2 | ---  |
 * C2   |    20    | ---  | act3 | ---  | ---  |
 * C3   |    30    | act4 | act5 | ---  | ---  |
 * -----+----------+------+------+------+------+
 * A call to Fn1 will result in:
 *   act4, act1
 * A call to Fn2 will result in:
 *   act5, act3
 * A call to Fn3 will result in:
 *   act2
 * A call to Fn4 will result in:
 *   act6
 *
 * Notice that when the base function is overridden it is not called. The base
 * function is only called when the function has not been overridden by a
 * component.
 *
 */
|
|
|
|
#ifndef ORTE_MCA_ERRMGR_H
|
|
#define ORTE_MCA_ERRMGR_H
|
|
|
|
/*
|
|
* includes
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
#include "orte/types.h"
|
|
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/class/opal_object.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/error.h"
|
|
#include "opal/util/opal_sos.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/mca/plm/plm_types.h"
|
|
|
|
BEGIN_C_DECLS
|
|
/*
 * Stack state type.
 *
 * An 8-bit value that the internal composite module interfaces declared
 * later in this header receive by pointer as their trailing `stack_state`
 * argument.
 * NOTE(review): the values this type may hold are not defined in this
 * header - confirm against the errmgr base.
 */
typedef uint8_t orte_errmgr_stack_state_t;
|
|
|
|
/*
|
|
* Structure to describe a predicted process fault.
|
|
*
|
|
* This can be expanded in the future to support assurance levels, and
|
|
* additional information that may wish to be conveyed.
|
|
*/
|
|
struct orte_errmgr_predicted_proc_t {
|
|
/** This is an object, so must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** Process Name */
|
|
orte_process_name_t proc_name;
|
|
};
|
|
typedef struct orte_errmgr_predicted_proc_t orte_errmgr_predicted_proc_t;
|
|
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_proc_t);
|
|
|
|
/*
|
|
* Structure to describe a predicted node fault.
|
|
*
|
|
* This can be expanded in the future to support assurance levels, and
|
|
* additional information that may wish to be conveyed.
|
|
*/
|
|
struct orte_errmgr_predicted_node_t {
|
|
/** This is an object, so must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** Node Name */
|
|
char * node_name;
|
|
};
|
|
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
|
|
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
|
|
|
|
/*
|
|
* Structure to describe a suggested remapping element for a predicted fault.
|
|
*
|
|
* This can be expanded in the future to support weights , and
|
|
* additional information that may wish to be conveyed.
|
|
*/
|
|
struct orte_errmgr_predicted_map_t {
|
|
/** This is an object, so must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** Process Name (predicted to fail) */
|
|
orte_process_name_t proc_name;
|
|
|
|
/** Node Name (predicted to fail) */
|
|
char * node_name;
|
|
|
|
/** Process Name (Map to) */
|
|
orte_process_name_t map_proc_name;
|
|
|
|
/** Node Name (Map to) */
|
|
char * map_node_name;
|
|
|
|
/** Just off current node */
|
|
bool off_current_node;
|
|
|
|
/** Pre-map fixed node assignment */
|
|
char * pre_map_fixed_node;
|
|
};
|
|
typedef struct orte_errmgr_predicted_map_t orte_errmgr_predicted_map_t;
|
|
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t);
|
|
|
|
|
|
/*
|
|
* Macro definitions
|
|
*/
|
|
/*
 * These macros and associated error name array are used to output
 * intelligible error messages.
 *
 * ORTE_ERROR_NAME(n)
 *     Expands to opal_strerror(n).
 *
 * ORTE_ERROR_LOG(n)
 *     Reports error code n. When OPAL_SOS_IS_NATIVE(n) is true the code is
 *     passed to orte_errmgr.log() together with the __FILE__ and __LINE__
 *     of the call site; otherwise it is handed to OPAL_SOS_LOG(n).
 *
 *     The body is wrapped in do { ... } while (0) so that the macro acts as
 *     a single statement: it must be followed by a semicolon and can be used
 *     safely as the unbraced body of an if/else.
 *
 *     NOTE: n is expanded more than once, so do not pass an expression with
 *     side effects.
 */
#define ORTE_ERROR_NAME(n) opal_strerror(n)
#define ORTE_ERROR_LOG(n)                               \
    do {                                                \
        if (true == OPAL_SOS_IS_NATIVE(n)) {            \
            orte_errmgr.log(n, __FILE__, __LINE__);     \
        } else {                                        \
            OPAL_SOS_LOG(n);                            \
        }                                               \
    } while (0)
|
|
|
|
/**** FRAMEWORK API FUNCTIONS ****/
|
|
|
|
/**
 * Log an error code together with a source location.
 *
 * This is not part of any module so it can be used at any time!
 *
 * @param[in] error_code Error code to report
 * @param[in] filename   File name to report (ORTE_ERROR_LOG passes __FILE__)
 * @param[in] line       Line number to report (ORTE_ERROR_LOG passes __LINE__)
 */
typedef void (*orte_errmgr_base_API_log_fn_t)(
    int error_code,
    char *filename,
    int line);
|
|
|
|
/**
|
|
* Alert - process aborted
|
|
* This function is called by the PLM when a remote process aborts during execution. Actions taken
|
|
* in response to the abnormal termination of a remote application process will vary across
|
|
* the various errmgr components.
|
|
*
|
|
* NOTE: Local process errors should always be reported through the error_detected interface and
|
|
* NOT here.
|
|
*
|
|
* @param *name Pointer to the name of the proc that aborted
|
|
*
|
|
* @retval ORTE_SUCCESS Whatever action that was taken was successful
|
|
* @retval ORTE_ERROR Appropriate error code
|
|
*/
|
|
typedef int (*orte_errmgr_base_API_update_state_fn_t)(orte_jobid_t job,
|
|
orte_job_state_t jobstate,
|
|
orte_process_name_t *proc_name,
|
|
orte_proc_state_t state,
|
|
pid_t pid,
|
|
orte_exit_code_t exit_code);
|
|
|
|
/**
|
|
* Predicted process/node failure notification
|
|
* Composite interface. Called in priority order.
|
|
*
|
|
* @param[in] proc_list List of processes (or NULL if none)
|
|
* @param[in] node_list List of nodes (or NULL if none)
|
|
* @param[in] suggested_map List of mapping suggestions to use on recovery (or NULL if none)
|
|
*
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
* @retval ORTE_ERROR An unspecifed error occurred
|
|
*/
|
|
typedef int (*orte_errmgr_base_API_predicted_fault_fn_t)(opal_list_t *proc_list,
|
|
opal_list_t *node_list,
|
|
opal_list_t *suggested_map);
|
|
|
|
/**
|
|
* Suggest a node to map a restarting process onto
|
|
*
|
|
* @param[in] proc Process that is being mapped
|
|
* @param[in] oldnode Previous node where this process resided
|
|
* @param[in|out] node_list List of nodes to select from
|
|
*
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
* @retval ORTE_ERROR An unspecifed error occurred
|
|
*/
|
|
typedef int (*orte_errmgr_base_API_suggest_map_targets_fn_t)(orte_proc_t *proc,
|
|
orte_node_t *oldnode,
|
|
opal_list_t *node_list);
|
|
|
|
|
|
/**
 * Alert - self aborting
 *
 * Called when a process is aborting because of some internal error. The
 * function finalizes the process itself and then exits; it takes no other
 * actions. It is meant as a last-ditch exit procedure that attempts to clean
 * up a little.
 *
 * @param[in] error_code Error code associated with the abort
 * @param[in] fmt        printf-style format string, followed by its
 *                       arguments (checked as such when the compiler
 *                       supports the format attribute on function pointers)
 */
typedef int (*orte_errmgr_base_API_abort_fn_t)(int error_code,
                                               char *fmt,
                                               ...)
#if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
    __opal_attribute_format__(__printf__, 2, 3)
#endif
    ;
|
|
|
|
/* global structure for accessing ERRMGR FRAMEWORK API's */
|
|
typedef struct {
|
|
orte_errmgr_base_API_log_fn_t log;
|
|
orte_errmgr_base_API_update_state_fn_t update_state;
|
|
orte_errmgr_base_API_predicted_fault_fn_t predicted_fault;
|
|
orte_errmgr_base_API_suggest_map_targets_fn_t suggest_map_targets;
|
|
orte_errmgr_base_API_abort_fn_t abort;
|
|
|
|
} orte_errmgr_API_t;
|
|
|
|
ORTE_DECLSPEC extern orte_errmgr_API_t orte_errmgr;
|
|
|
|
|
|
|
|
|
|
/**** INTERNAL MODULE FUNCTIONS ****/
|
|
|
|
/**
 * Module initialization function.
 * Public interface. Will be called in each of the active composite components.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_init_fn_t)(void);
|
|
|
|
/**
 * Module finalization function.
 * Public interface. Will be called in each of the active composite components.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);
|
|
|
|
/*
|
|
* Internal Composite Interfaces corresponding to API interfaces
|
|
*/
|
|
typedef int (*orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job,
|
|
orte_job_state_t jobstate,
|
|
orte_process_name_t *proc_name,
|
|
orte_proc_state_t state,
|
|
pid_t pid,
|
|
orte_exit_code_t exit_code,
|
|
orte_errmgr_stack_state_t *stack_state);
|
|
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list,
|
|
opal_list_t *node_list,
|
|
opal_list_t *suggested_map,
|
|
orte_errmgr_stack_state_t *stack_state);
|
|
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
|
|
orte_node_t *oldnode,
|
|
opal_list_t *node_list,
|
|
orte_errmgr_stack_state_t *stack_state);
|
|
|
|
/**
 * Handle a fault tolerance update.
 *
 * @param[in] state Fault tolerance state update
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_ft_event_fn_t)(
    int state);
|
|
|
|
|
|
/*
|
|
* Module Structure
|
|
*/
|
|
struct orte_errmgr_base_module_2_3_0_t {
|
|
/** Initialization Function */
|
|
orte_errmgr_base_module_init_fn_t init;
|
|
/** Finalization Function */
|
|
orte_errmgr_base_module_finalize_fn_t finalize;
|
|
|
|
/* -------------- Internal Composite Interfaces -- */
|
|
/** Actual process failure notification */
|
|
orte_errmgr_base_module_update_state_fn_t update_state;
|
|
/** Predicted process/node failure notification */
|
|
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
|
|
/** Suggest a node to map a restarting process onto */
|
|
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
|
|
|
/** Handle any FT Notifications */
|
|
orte_errmgr_base_ft_event_fn_t ft_event;
|
|
};
|
|
|
|
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
|
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
|
|
|
/*
|
|
* ErrMgr Component
|
|
*/
|
|
struct orte_errmgr_base_component_3_0_0_t {
|
|
/** MCA base component */
|
|
mca_base_component_t base_version;
|
|
/** MCA base data */
|
|
mca_base_component_data_t base_data;
|
|
|
|
/** Verbosity Level */
|
|
int verbose;
|
|
/** Output Handle for opal_output */
|
|
int output_handle;
|
|
/** Default Priority */
|
|
int priority;
|
|
};
|
|
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
|
|
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
|
|
|
|
|
|
/*
 * Version macro for components of type "errmgr".
 *
 * Expands to MCA_BASE_VERSION_2_0_0 followed by the framework name "errmgr"
 * and the framework version numbers 3, 0, 0, as a comma-separated list.
 */
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
    MCA_BASE_VERSION_2_0_0, "errmgr", 3, 0, 0
|
|
|
|
END_C_DECLS
|
|
|
|
#endif
|