2005-01-21 20:49:14 +03:00
|
|
|
/*
|
2010-03-24 00:28:02 +03:00
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
2005-11-05 22:57:48 +03:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2011-06-24 00:38:02 +04:00
|
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
2005-11-05 22:57:48 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2005-09-20 21:09:11 +04:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-01-21 20:49:14 +03:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2009-05-20 17:16:31 +04:00
|
|
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
2012-06-27 05:28:28 +04:00
|
|
|
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
2012-04-06 18:23:13 +04:00
|
|
|
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
|
|
|
* All rights reserved.
|
2005-01-21 20:49:14 +03:00
|
|
|
* $COPYRIGHT$
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-01-21 20:49:14 +03:00
|
|
|
* Additional copyrights may follow
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-01-21 20:49:14 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/** @file:
|
|
|
|
*
|
2010-03-24 00:28:02 +03:00
|
|
|
* The Open RTE Error and Recovery Manager (ErrMgr)
|
|
|
|
*
|
2010-08-19 17:09:20 +04:00
|
|
|
* This framework is the logically central clearing house for process/daemon
|
|
|
|
* state updates. In particular when a process fails and another process detects
|
|
|
|
* it, then that information is reported through this framework. This framework
|
|
|
|
* then (depending on the active component) decides how to handle the failure.
|
2010-03-24 00:28:02 +03:00
|
|
|
*
|
2010-08-19 17:09:20 +04:00
|
|
|
* For example, if a process fails this may activate an automatic recovery
|
|
|
|
* of the process from a previous checkpoint, or initial state. Conversely,
|
|
|
|
* the active component could decide not to continue the job, and request that
|
|
|
|
* it be terminated. The error and recovery policy is determined by individual
|
|
|
|
* components within this framework.
|
2005-01-21 20:49:14 +03:00
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#ifndef ORTE_MCA_ERRMGR_H
|
|
|
|
#define ORTE_MCA_ERRMGR_H
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* includes
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#include "orte_config.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/constants.h"
|
|
|
|
#include "orte/types.h"
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2005-12-08 03:05:26 +03:00
|
|
|
#include "opal/mca/mca.h"
|
2010-03-24 00:28:02 +03:00
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
|
|
|
|
#include "opal/class/opal_object.h"
|
2011-06-24 00:38:02 +04:00
|
|
|
#include "opal/class/opal_pointer_array.h"
|
2010-03-24 00:28:02 +03:00
|
|
|
#include "opal/util/output.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "opal/util/error.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
The current errmgr.register_callback API takes a jobid as one of its argument. The intent was to have the errmgr check the jobid of the job being reported to it and, if it matches the jobid that was registered, call the specified callback function.
Unfortunately, we assign the jobid during the plm.spawn procedure - which means it happens -after- control of the job has passed out of the range of mpirun (or whatever program is spawning the job), so it is too late for that main program to register a callback function. If the main program registers the callback -after- we return from plm.spawn, then it (a) cannot get a callback for failed-to-start, and (b) will miss the callback if a proc aborts in the time between job launch and the call to errmgr.register_callback.
This commit fixes the problem by adding callback-related fields to the orte_job_t object. Thus, the main program can specify what job states should initiate a callback, what function is to be called, and what data is to be passed back by simply filling in the orte_job_t fields prior to calling plm.spawn.
Also, fully implement the "copy" function for the orte_job_t object.
NOTE: as a result of this change, the errmgr.register_callback API may no longer be of any value.
This commit was SVN r21200.
2009-05-11 07:38:15 +04:00
|
|
|
#include "orte/runtime/orte_globals.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/mca/plm/plm_types.h"
|
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
2005-05-12 00:21:10 +04:00
|
|
|
|
A number of C/R enhancements per RFC below:
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php
Documentation:
http://osl.iu.edu/research/ft/
Major Changes:
--------------
* Added C/R-enabled Debugging support.
Enabled with the --enable-crdebug flag. See the following website for more information:
http://osl.iu.edu/research/ft/crdebug/
* Added Stable Storage (SStore) framework for checkpoint storage
* 'central' component does a direct to central storage save
* 'stage' component stages checkpoints to central storage while the application continues execution.
* 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress)
* 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching)
* Added Compression (compress) framework to support
* Add two new ErrMgr recovery policies
* {{{crmig}}} C/R Process Migration
* {{{autor}}} C/R Automatic Recovery
* Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component
* Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option)
* {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342)
* {{{OMPI_CR_Restart}}}
* {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules)
* {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192)
* {{{OMPI_CR_Quiesce_start}}}
* {{{OMPI_CR_Quiesce_checkpoint}}}
* {{{OMPI_CR_Quiesce_end}}}
* {{{OMPI_CR_self_register_checkpoint_callback}}}
* {{{OMPI_CR_self_register_restart_callback}}}
* {{{OMPI_CR_self_register_continue_callback}}}
* The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future.
* Add a progress meter to:
* FileM rsh (filem_rsh_process_meter)
* SnapC full (snapc_full_progress_meter)
* SStore stage (sstore_stage_progress_meter)
* Added 2 new command line options to ompi-restart
* --showme : Display the full command line that would have been exec'ed.
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413)
* Deprecated some MCA params:
* crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir
* snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir
* snapc_base_global_shared deprecated, use sstore_stage_global_is_shared
* snapc_base_store_in_place deprecated, replaced with different components of SStore
* snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref
* snapc_base_establish_global_snapshot_dir deprecated, never well supported
* snapc_full_skip_filem deprecated, use sstore_stage_skip_filem
Minor Changes:
--------------
* Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing.
* Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components
* Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it.
* Fixes trac:2208 : Honor various TMPDIR varaibles instead of forcing {{{/tmp}}}
* Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set.
* opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality.
* Cleanup the CRS framework and components to work with the SStore framework.
* Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably).
* Add 'quiesce' hook to CRCP for a future enhancement.
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}.
* Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive.
* {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked.
* {{{opal-restart}}} also support local decompression before restarting
* {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata
* {{{orte-restart}}} now uses the SStore framework to work with the metadata
* Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality.
* Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}.
* Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped.
* Make sure to decrement the number of 'num_local_procs' in the orted when one goes away.
* odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options.
* Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities.
* Improve the checks for 'already checkpointing' error path.
* A a recovery output timer, to show how long it takes to restart a job
* Do a better job of cleaning up the old session directory on restart.
* Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment)
* Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize.
This commit was SVN r23587.
The following Trac tickets were found above:
Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924
Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097
Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161
Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192
Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208
Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342
Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-11 00:51:11 +04:00
|
|
|
/*
|
|
|
|
* Structure to describe a predicted process fault.
|
|
|
|
*
|
|
|
|
* This can be expanded in the future to support assurance levels, and
|
|
|
|
* additional information that may wish to be conveyed.
|
|
|
|
*/
|
|
|
|
struct orte_errmgr_predicted_proc_t {
|
|
|
|
/** This is an object, so must have a super */
|
|
|
|
opal_list_item_t super;
|
|
|
|
|
|
|
|
/** Process Name */
|
|
|
|
orte_process_name_t proc_name;
|
|
|
|
};
|
|
|
|
typedef struct orte_errmgr_predicted_proc_t orte_errmgr_predicted_proc_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_proc_t);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Structure to describe a predicted node fault.
|
|
|
|
*
|
|
|
|
* This can be expanded in the future to support assurance levels, and
|
|
|
|
* additional information that may wish to be conveyed.
|
|
|
|
*/
|
|
|
|
struct orte_errmgr_predicted_node_t {
|
|
|
|
/** This is an object, so must have a super */
|
|
|
|
opal_list_item_t super;
|
|
|
|
|
|
|
|
/** Node Name */
|
|
|
|
char * node_name;
|
|
|
|
};
|
|
|
|
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
|
|
|
|
|
2011-06-24 00:38:02 +04:00
|
|
|
/*
|
|
|
|
* Callback function that should be called when there is a fault.
|
|
|
|
*
|
|
|
|
* This callback function will be used anytime (other than during finalize) the
|
|
|
|
* runtime detects and handles a process failure. The runtime will complete all
|
|
|
|
* its stabilization before alerting the callback function. The parameter to the
|
|
|
|
* callback function will be an opal_pointer_array_t identifying the process(es) that failed.
|
|
|
|
* It will not alert the application to failures that are not in the same job as
|
|
|
|
* the alerted process, only failures within the same jobid.
|
|
|
|
*
|
|
|
|
* @param[in] procs The names of the processes that failed
|
|
|
|
*/
|
|
|
|
/* Function type (not a pointer type): handlers are referred to through
 * `orte_errmgr_fault_callback_t *`. */
typedef void orte_errmgr_fault_callback_t(opal_pointer_array_t *procs);
|
|
|
|
|
|
|
|
/* Declaration only (extern): pointer to a fault callback function.
 * Presumably this is the handler invoked after a process failure has been
 * handled -- confirm where it is defined and assigned.
 * NOTE(review): the exported symbol carries no orte_ prefix. */
ORTE_DECLSPEC extern orte_errmgr_fault_callback_t *fault_cbfunc;
|
|
|
|
|
A number of C/R enhancements per RFC below:
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php
Documentation:
http://osl.iu.edu/research/ft/
Major Changes:
--------------
* Added C/R-enabled Debugging support.
Enabled with the --enable-crdebug flag. See the following website for more information:
http://osl.iu.edu/research/ft/crdebug/
* Added Stable Storage (SStore) framework for checkpoint storage
* 'central' component does a direct to central storage save
* 'stage' component stages checkpoints to central storage while the application continues execution.
* 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress)
* 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching)
* Added Compression (compress) framework to support
* Add two new ErrMgr recovery policies
* {{{crmig}}} C/R Process Migration
* {{{autor}}} C/R Automatic Recovery
* Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component
* Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option)
* {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342)
* {{{OMPI_CR_Restart}}}
* {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules)
* {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192)
* {{{OMPI_CR_Quiesce_start}}}
* {{{OMPI_CR_Quiesce_checkpoint}}}
* {{{OMPI_CR_Quiesce_end}}}
* {{{OMPI_CR_self_register_checkpoint_callback}}}
* {{{OMPI_CR_self_register_restart_callback}}}
* {{{OMPI_CR_self_register_continue_callback}}}
* The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future.
* Add a progress meter to:
* FileM rsh (filem_rsh_process_meter)
* SnapC full (snapc_full_progress_meter)
* SStore stage (sstore_stage_progress_meter)
* Added 2 new command line options to ompi-restart
* --showme : Display the full command line that would have been exec'ed.
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413)
* Deprecated some MCA params:
* crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir
* snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir
* snapc_base_global_shared deprecated, use sstore_stage_global_is_shared
* snapc_base_store_in_place deprecated, replaced with different components of SStore
* snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref
* snapc_base_establish_global_snapshot_dir deprecated, never well supported
* snapc_full_skip_filem deprecated, use sstore_stage_skip_filem
Minor Changes:
--------------
* Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing.
* Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components
* Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it.
* Fixes trac:2208 : Honor various TMPDIR varaibles instead of forcing {{{/tmp}}}
* Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set.
* opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality.
* Cleanup the CRS framework and components to work with the SStore framework.
* Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably).
* Add 'quiesce' hook to CRCP for a future enhancement.
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}.
* Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive.
* {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked.
* {{{opal-restart}}} also support local decompression before restarting
* {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata
* {{{orte-restart}}} now uses the SStore framework to work with the metadata
* Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality.
* Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}.
* Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped.
* Make sure to decrement the number of 'num_local_procs' in the orted when one goes away.
* odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options.
* Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities.
* Improve the checks for 'already checkpointing' error path.
* A a recovery output timer, to show how long it takes to restart a job
* Do a better job of cleaning up the old session directory on restart.
* Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment)
* Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize.
This commit was SVN r23587.
The following Trac tickets were found above:
Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924
Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097
Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161
Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192
Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208
Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342
Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-11 00:51:11 +04:00
|
|
|
/*
|
|
|
|
* Structure to describe a suggested remapping element for a predicted fault.
|
|
|
|
*
|
|
|
|
* This can be expanded in the future to support weights, and
|
|
|
|
* additional information that may wish to be conveyed.
|
|
|
|
*/
|
|
|
|
struct orte_errmgr_predicted_map_t {
|
|
|
|
/** This is an object, so must have a super */
|
|
|
|
opal_list_item_t super;
|
|
|
|
|
|
|
|
/** Process Name (predicted to fail) */
|
|
|
|
orte_process_name_t proc_name;
|
|
|
|
|
|
|
|
/** Node Name (predicted to fail) */
|
|
|
|
char * node_name;
|
|
|
|
|
|
|
|
/** Process Name (Map to) */
|
|
|
|
orte_process_name_t map_proc_name;
|
|
|
|
|
|
|
|
/** Node Name (Map to) */
|
|
|
|
char * map_node_name;
|
|
|
|
|
|
|
|
/** Just off current node */
|
|
|
|
bool off_current_node;
|
|
|
|
|
|
|
|
/** Pre-map fixed node assignment */
|
|
|
|
char * pre_map_fixed_node;
|
|
|
|
};
|
|
|
|
typedef struct orte_errmgr_predicted_map_t orte_errmgr_predicted_map_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t);
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
|
|
|
|
* Macro definitions
|
|
|
|
*/
|
2005-09-20 21:09:11 +04:00
|
|
|
/*
|
|
|
|
* These macros and associated error name array are used to output intelligible error
|
|
|
|
* messages.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Thin alias: expands to opal_strerror(n) for the given error code n. */
#define ORTE_ERROR_NAME(n) opal_strerror(n)
|
2010-05-18 03:02:13 +04:00
|
|
|
/*
 * Report error code n through orte_errmgr.log, tagged with the file and line
 * of the call site.
 *
 * The expansion is a single expression with no trailing semicolon, so the
 * macro behaves like an ordinary function call: write `ORTE_ERROR_LOG(rc);`.
 * This keeps it usable as the sole body of an if/else and avoids emitting a
 * stray empty statement after every use.
 */
#define ORTE_ERROR_LOG(n) \
    orte_errmgr.log((n), __FILE__, __LINE__)
|
2010-04-06 02:59:21 +04:00
|
|
|
|
2011-10-21 08:54:38 +04:00
|
|
|
#if WANT_PMI_SUPPORT
|
|
|
|
/*
 * Print a diagnostic for a failed PMI call on output stream 0.
 *
 * pmi_err  - PMI error code; converted to text by orte_errmgr_base_pmi_error()
 * pmi_func - string naming the PMI function that failed (printed with %s)
 *
 * The message also carries this process's name and the call site's file,
 * line and function. The do/while(0) wrapper deliberately has no trailing
 * semicolon: the caller supplies it, which keeps the macro safe inside
 * un-braced if/else statements.
 */
#define ORTE_PMI_ERROR(pmi_err, pmi_func)                                   \
    do {                                                                    \
        opal_output(0, "%s[%s:%d:%s] %s: %s\n",                             \
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                     \
                    __FILE__, __LINE__, __func__,                           \
                    (pmi_func), orte_errmgr_base_pmi_error((pmi_err)));     \
    } while (0)
|
|
|
|
/* Returns the string for a PMI error code; ORTE_PMI_ERROR prints the result
 * with %s. Ownership of the returned buffer is not visible here -- confirm
 * against the definition before freeing or modifying it.
 * NOTE(review): declared with OPAL_DECLSPEC although the symbol is orte_*;
 * other declarations in this header use ORTE_DECLSPEC -- confirm intent. */
OPAL_DECLSPEC char* orte_errmgr_base_pmi_error(int pmi_err);
|
|
|
|
#endif
|
2010-08-19 17:09:20 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Framework Interfaces
|
|
|
|
*/
|
|
|
|
/**
|
|
|
|
* Module initialization function.
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
|
|
|
/* Pointer type for a module's init entry point: takes no arguments and
 * returns an int status code. */
typedef int (*orte_errmgr_base_module_init_fn_t)(void);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Module finalization function.
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
|
|
|
/* Pointer type for a module's finalize entry point: takes no arguments and
 * returns an int status code. */
typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
/**
|
2010-03-24 00:28:02 +03:00
|
|
|
* This is not part of any module so it can be used at any time!
|
2008-02-28 04:57:57 +03:00
|
|
|
*/
|
2010-08-19 17:09:20 +04:00
|
|
|
/* Pointer type for the error-logging entry point: receives the error code
 * plus the source file name and line number of the report site.
 * Presumably this is the type of orte_errmgr.log as used by ORTE_ERROR_LOG --
 * confirm against the module struct.
 * NOTE(review): filename is a non-const char * even though ORTE_ERROR_LOG
 * passes __FILE__; implementations should treat it as read-only. */
typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - self aborting
|
|
|
|
* This function is called when a process is aborting due to some internal error.
|
|
|
|
* It will finalize the process
|
|
|
|
* itself, and then exit - it takes no other actions. The intent here is to provide
|
|
|
|
* a last-ditch exit procedure that attempts to clean up a little.
|
|
|
|
*/
|
2010-08-31 18:51:19 +04:00
|
|
|
typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
|
2010-08-31 14:28:51 +04:00
|
|
|
__opal_attribute_format_funcptr__(__printf__, 2, 3);
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2011-06-15 17:10:13 +04:00
|
|
|
/**
|
|
|
|
* Alert - abort peers
|
|
|
|
* This function is called when a process wants to abort one or more peer processes.
|
|
|
|
* For example, MPI_Abort(comm) will use this function to terminate peers in the
|
|
|
|
* communicator group before aborting itself.
|
|
|
|
*/
|
|
|
|
typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
|
|
|
|
orte_std_cntr_t num_procs);
|
|
|
|
|
2010-03-24 00:28:02 +03:00
|
|
|
/**
|
|
|
|
* Predicted process/node failure notification
|
|
|
|
*
|
|
|
|
* @param[in] proc_list List of processes (or NULL if none)
|
|
|
|
* @param[in] node_list List of nodes (or NULL if none)
|
A number of C/R enhancements per RFC below:
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php
Documentation:
http://osl.iu.edu/research/ft/
Major Changes:
--------------
* Added C/R-enabled Debugging support.
Enabled with the --enable-crdebug flag. See the following website for more information:
http://osl.iu.edu/research/ft/crdebug/
* Added Stable Storage (SStore) framework for checkpoint storage
* 'central' component does a direct to central storage save
* 'stage' component stages checkpoints to central storage while the application continues execution.
* 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress)
* 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching)
* Added Compression (compress) framework to support
* Add two new ErrMgr recovery policies
* {{{crmig}}} C/R Process Migration
* {{{autor}}} C/R Automatic Recovery
* Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component
* Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option)
* {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342)
* {{{OMPI_CR_Restart}}}
* {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules)
* {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192)
* {{{OMPI_CR_Quiesce_start}}}
* {{{OMPI_CR_Quiesce_checkpoint}}}
* {{{OMPI_CR_Quiesce_end}}}
* {{{OMPI_CR_self_register_checkpoint_callback}}}
* {{{OMPI_CR_self_register_restart_callback}}}
* {{{OMPI_CR_self_register_continue_callback}}}
* The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future.
* Add a progress meter to:
* FileM rsh (filem_rsh_process_meter)
* SnapC full (snapc_full_progress_meter)
* SStore stage (sstore_stage_progress_meter)
* Added 2 new command line options to ompi-restart
* --showme : Display the full command line that would have been exec'ed.
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413)
* Deprecated some MCA params:
* crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir
* snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir
* snapc_base_global_shared deprecated, use sstore_stage_global_is_shared
* snapc_base_store_in_place deprecated, replaced with different components of SStore
* snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref
* snapc_base_establish_global_snapshot_dir deprecated, never well supported
* snapc_full_skip_filem deprecated, use sstore_stage_skip_filem
Minor Changes:
--------------
* Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing.
* Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components
* Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it.
* Fixes trac:2208 : Honor various TMPDIR varaibles instead of forcing {{{/tmp}}}
* Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set.
* opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality.
* Cleanup the CRS framework and components to work with the SStore framework.
* Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably).
* Add 'quiesce' hook to CRCP for a future enhancement.
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}.
* Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive.
* {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked.
* {{{opal-restart}}} also support local decompression before restarting
* {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata
* {{{orte-restart}}} now uses the SStore framework to work with the metadata
* Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality.
* Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}.
* Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped.
* Make sure to decrement the number of 'num_local_procs' in the orted when one goes away.
* odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options.
* Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities.
* Improve the checks for 'already checkpointing' error path.
* A a recovery output timer, to show how long it takes to restart a job
* Do a better job of cleaning up the old session directory on restart.
* Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment)
* Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize.
This commit was SVN r23587.
The following Trac tickets were found above:
Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924
Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097
Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161
Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192
Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208
Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342
Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-11 00:51:11 +04:00
|
|
|
* @param[in] suggested_map List of mapping suggestions to use on recovery (or NULL if none)
|
2010-03-24 00:28:02 +03:00
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
2010-08-19 17:09:20 +04:00
|
|
|
/* Pointer type for the predicted-fault notification entry point. Takes three
 * lists (processes, nodes, suggested mappings) and returns an int status. */
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(
    opal_list_t *proc_list,
    opal_list_t *node_list,
    opal_list_t *suggested_map);
|
A number of C/R enhancements per RFC below:
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php
Documentation:
http://osl.iu.edu/research/ft/
Major Changes:
--------------
* Added C/R-enabled Debugging support.
Enabled with the --enable-crdebug flag. See the following website for more information:
http://osl.iu.edu/research/ft/crdebug/
* Added Stable Storage (SStore) framework for checkpoint storage
* 'central' component does a direct to central storage save
* 'stage' component stages checkpoints to central storage while the application continues execution.
* 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress)
* 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching)
* Added Compression (compress) framework to support
* Add two new ErrMgr recovery policies
* {{{crmig}}} C/R Process Migration
* {{{autor}}} C/R Automatic Recovery
* Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component
* Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option)
* {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342)
* {{{OMPI_CR_Restart}}}
* {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules)
* {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192)
* {{{OMPI_CR_Quiesce_start}}}
* {{{OMPI_CR_Quiesce_checkpoint}}}
* {{{OMPI_CR_Quiesce_end}}}
* {{{OMPI_CR_self_register_checkpoint_callback}}}
* {{{OMPI_CR_self_register_restart_callback}}}
* {{{OMPI_CR_self_register_continue_callback}}}
* The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future.
* Add a progress meter to:
* FileM rsh (filem_rsh_process_meter)
* SnapC full (snapc_full_progress_meter)
* SStore stage (sstore_stage_progress_meter)
* Added 2 new command line options to ompi-restart
* --showme : Display the full command line that would have been exec'ed.
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413)
* Deprecated some MCA params:
* crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir
* snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir
* snapc_base_global_shared deprecated, use sstore_stage_global_is_shared
* snapc_base_store_in_place deprecated, replaced with different components of SStore
* snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref
* snapc_base_establish_global_snapshot_dir deprecated, never well supported
* snapc_full_skip_filem deprecated, use sstore_stage_skip_filem
Minor Changes:
--------------
* Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing.
* Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components
* Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it.
* Fixes trac:2208 : Honor various TMPDIR variables instead of forcing {{{/tmp}}}
* Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set.
* opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality.
* Cleanup the CRS framework and components to work with the SStore framework.
* Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably).
* Add 'quiesce' hook to CRCP for a future enhancement.
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}.
* Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive.
* {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked.
* {{{opal-restart}}} also support local decompression before restarting
* {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata
* {{{orte-restart}}} now uses the SStore framework to work with the metadata
* Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality.
* Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}.
* Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped.
* Make sure to decrement the number of 'num_local_procs' in the orted when one goes away.
* odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options.
* Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities.
* Improve the checks for 'already checkpointing' error path.
* Add a recovery output timer, to show how long it takes to restart a job
* Do a better job of cleaning up the old session directory on restart.
* Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment)
* Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize.
This commit was SVN r23587.
The following Trac tickets were found above:
Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924
Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097
Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161
Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192
Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208
Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342
Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-11 00:51:11 +04:00
|
|
|
|
2010-03-24 00:28:02 +03:00
|
|
|
/**
|
|
|
|
* Suggest a node to map a restarting process onto
|
|
|
|
*
|
|
|
|
* @param[in] proc Process that is being mapped
|
|
|
|
* @param[in] oldnode Previous node where this process resided
|
|
|
|
* @param[in,out] node_list List of nodes to select from
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
|
|
|
*/
|
2010-04-06 02:59:21 +04:00
|
|
|
/* Hook signature for suggesting map targets: receives the process being mapped, the node it previously resided on, and the list of nodes to select from; returns an int. */
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
|
|
|
|
orte_node_t *oldnode,
|
2010-08-19 17:09:20 +04:00
|
|
|
opal_list_t *node_list);
|
2010-03-24 00:28:02 +03:00
|
|
|
|
|
|
|
/**
|
2010-04-06 02:59:21 +04:00
|
|
|
* Handle fault tolerance updates
|
|
|
|
*
|
|
|
|
* @param[in] state Fault tolerance state update
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
2010-03-24 00:28:02 +03:00
|
|
|
*/
|
2010-08-19 17:09:20 +04:00
|
|
|
/* Hook signature for fault tolerance updates: the state is passed as a plain int and the hook returns an int. */
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2011-02-18 05:48:12 +03:00
|
|
|
/**
|
2011-06-24 00:38:02 +04:00
|
|
|
* Function to perform actions that require the rest of the ORTE layer to be up
|
|
|
|
* and running.
|
2011-02-18 05:48:12 +03:00
|
|
|
*
|
2011-06-24 00:38:02 +04:00
|
|
|
* @retval ORTE_SUCCESS The operation completed successfully
|
|
|
|
* @retval ORTE_ERROR An unspecified error occurred
|
2011-02-18 05:48:12 +03:00
|
|
|
*/
|
|
|
|
/* Hook signature used to register for a warning of impending migration; takes a struct timeval pointer and returns nothing. */
/* NOTE(review): the preceding doc comment lists @retval values although this hook returns void, and it does not describe 'tv' -- it looks copied from another function; confirm and rewrite. */
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
|
|
|
|
|
2011-06-24 00:38:02 +04:00
|
|
|
/**
|
|
|
|
* Set the callback function for faults.
|
|
|
|
*
|
|
|
|
* @param[in] cbfunc The callback function.
|
|
|
|
*
|
|
|
|
* @retval The previous fault callback function.
|
|
|
|
*/
|
|
|
|
/* Hook signature for installing a fault callback: both the argument and the return value are pointers to orte_errmgr_fault_callback_t. */
typedef orte_errmgr_fault_callback_t *(*orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc);
|
|
|
|
|
2005-01-21 20:49:14 +03:00
|
|
|
/*
|
2010-03-24 00:28:02 +03:00
|
|
|
* Module Structure
|
2005-01-21 20:49:14 +03:00
|
|
|
*/
|
2008-07-29 02:40:57 +04:00
|
|
|
/* Table of hooks that make up an ErrMgr module; each member holds one of the orte_errmgr_base_module_* hook types. */
struct orte_errmgr_base_module_2_3_0_t {
|
2010-03-24 00:28:02 +03:00
|
|
|
/** Initialization Function */
|
2011-06-24 00:38:02 +04:00
|
|
|
orte_errmgr_base_module_init_fn_t init;
|
2010-03-24 00:28:02 +03:00
|
|
|
/** Finalization Function */
|
2011-06-24 00:38:02 +04:00
|
|
|
orte_errmgr_base_module_finalize_fn_t finalize;
|
2010-03-24 00:28:02 +03:00
|
|
|
|
2011-06-24 00:38:02 +04:00
|
|
|
/** Logging hook (see orte_errmgr_base_module_log_fn_t) */
orte_errmgr_base_module_log_fn_t log;
|
|
|
|
/** Abort hook (see orte_errmgr_base_module_abort_fn_t) */
orte_errmgr_base_module_abort_fn_t abort;
|
|
|
|
/** Abort-peers hook (see orte_errmgr_base_module_abort_peers_fn_t) */
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
2010-08-19 17:09:20 +04:00
|
|
|
|
2010-03-24 00:28:02 +03:00
|
|
|
/** Predicted process/node failure notification */
|
2011-06-24 00:38:02 +04:00
|
|
|
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
|
2010-03-24 00:28:02 +03:00
|
|
|
/** Suggest a node to map a restarting process onto */
|
2011-06-24 00:38:02 +04:00
|
|
|
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
2010-03-24 00:28:02 +03:00
|
|
|
|
|
|
|
/** Handle any FT Notifications */
|
2011-06-24 00:38:02 +04:00
|
|
|
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
2011-02-18 05:48:12 +03:00
|
|
|
|
2011-06-24 00:38:02 +04:00
|
|
|
/** Register to be warned of impending migration */
|
2011-02-18 05:48:12 +03:00
|
|
|
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
|
2011-06-24 00:38:02 +04:00
|
|
|
|
|
|
|
/** Set the callback function for faults */
|
|
|
|
orte_errmgr_base_module_set_fault_callback_t set_fault_callback;
|
2005-01-21 20:49:14 +03:00
|
|
|
};
|
2008-07-29 02:40:57 +04:00
|
|
|
/* Versioned alias for the module struct, followed by the unversioned name orte_errmgr_base_module_t. */
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
|
|
|
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
2010-08-19 17:09:20 +04:00
|
|
|
/* Declaration of the global ErrMgr module instance 'orte_errmgr'; it is only declared here (extern), not defined. */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/*
|
2010-03-24 00:28:02 +03:00
|
|
|
* ErrMgr Component
|
2005-01-21 20:49:14 +03:00
|
|
|
*/
|
2010-03-24 00:28:02 +03:00
|
|
|
/* ErrMgr component descriptor: the MCA base component and base data, followed by three int settings (verbose, output_handle, priority). */
struct orte_errmgr_base_component_3_0_0_t {
|
|
|
|
/** MCA base component (mca_base_component_t) */
|
2008-05-06 22:08:45 +04:00
|
|
|
mca_base_component_t base_version;
|
2010-03-24 00:28:02 +03:00
|
|
|
/** MCA base data (mca_base_component_data_t) */
|
2008-07-29 02:40:57 +04:00
|
|
|
mca_base_component_data_t base_data;
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2010-03-24 00:28:02 +03:00
|
|
|
/** Verbosity Level */
|
|
|
|
int verbose;
|
|
|
|
/** Output Handle for opal_output */
|
|
|
|
int output_handle;
|
|
|
|
/** Default Priority */
|
|
|
|
int priority;
|
|
|
|
};
|
|
|
|
/* Versioned alias for the component struct, followed by the unversioned name orte_errmgr_base_component_t. */
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
|
|
|
|
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/*
|
2008-07-29 02:40:57 +04:00
|
|
|
* Macro for use in components that are of type errmgr
|
2005-01-21 20:49:14 +03:00
|
|
|
*/
|
2010-03-24 00:28:02 +03:00
|
|
|
/* Expands to MCA_BASE_VERSION_2_0_0 followed by the string "errmgr" and the integers 3, 0, 0. */
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
|
2008-07-29 02:40:57 +04:00
|
|
|
MCA_BASE_VERSION_2_0_0, \
|
2010-03-24 00:28:02 +03:00
|
|
|
"errmgr", 3, 0, 0
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
END_C_DECLS
|
2005-05-12 00:21:10 +04:00
|
|
|
|
2005-01-21 20:49:14 +03:00
|
|
|
#endif
|