2015-03-05 20:50:44 -07:00
|
|
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
2005-01-21 17:49:14 +00:00
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
2005-11-05 19:57:48 +00:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2011-06-23 20:38:02 +00:00
|
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
2005-11-05 19:57:48 +00:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2005-09-20 17:09:11 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-01-21 17:49:14 +00:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 12:43:37 +00:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2009-05-20 13:16:31 +00:00
|
|
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
2012-06-27 01:28:28 +00:00
|
|
|
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
2015-03-05 20:50:44 -07:00
|
|
|
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
|
|
|
* reserved.
|
2017-06-05 15:22:28 -07:00
|
|
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
2014-12-12 09:46:44 -05:00
|
|
|
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
2005-01-21 17:49:14 +00:00
|
|
|
* $COPYRIGHT$
|
2005-09-20 17:09:11 +00:00
|
|
|
*
|
2005-01-21 17:49:14 +00:00
|
|
|
* Additional copyrights may follow
|
2005-09-20 17:09:11 +00:00
|
|
|
*
|
2005-01-21 17:49:14 +00:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/** @file:
|
|
|
|
*
|
2010-03-23 21:28:02 +00:00
|
|
|
* The Open RTE Error and Recovery Manager (ErrMgr)
|
|
|
|
*
|
2010-08-19 13:09:20 +00:00
|
|
|
* This framework is the logically central clearing house for process/daemon
|
|
|
|
* state updates. In particular when a process fails and another process detects
|
|
|
|
* it, then that information is reported through this framework. This framework
|
|
|
|
* then (depending on the active component) decides how to handle the failure.
|
2010-03-23 21:28:02 +00:00
|
|
|
*
|
2010-08-19 13:09:20 +00:00
|
|
|
* For example, if a process fails this may activate an automatic recovery
|
|
|
|
* of the process from a previous checkpoint, or initial state. Conversely,
|
|
|
|
* the active component could decide not to continue the job, and request that
|
|
|
|
* it be terminated. The error and recovery policy is determined by individual
|
|
|
|
* components within this framework.
|
2005-01-21 17:49:14 +00:00
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
#ifndef ORTE_MCA_ERRMGR_H
|
|
|
|
#define ORTE_MCA_ERRMGR_H
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* includes
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
#include "orte_config.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "orte/constants.h"
|
|
|
|
#include "orte/types.h"
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2015-03-05 20:50:44 -07:00
|
|
|
#include "orte/mca/mca.h"
|
2010-03-23 21:28:02 +00:00
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
|
|
|
|
#include "opal/class/opal_object.h"
|
2011-06-23 20:38:02 +00:00
|
|
|
#include "opal/class/opal_pointer_array.h"
|
2010-03-23 21:28:02 +00:00
|
|
|
#include "opal/util/output.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "opal/util/error.h"
|
2005-03-14 20:57:21 +00:00
|
|
|
|
The current errmgr.register_callback API takes a jobid as one of its arguments. The intent was to have the errmgr check the jobid of the job being reported to it and, if it matches the jobid that was registered, call the specified callback function.
Unfortunately, we assign the jobid during the plm.spawn procedure - which means it happens -after- control of the job has passed out of the range of mpirun (or whatever program is spawning the job), so it is too late for that main program to register a callback function. If the main program registers the callback -after- we return from plm.spawn, then it (a) cannot get a callback for failed-to-start, and (b) will miss the callback if a proc aborts in the time between job launch and the call to errmgr.register_callback.
This commit fixes the problem by adding callback-related fields to the orte_job_t object. Thus, the main program can specify what job states should initiate a callback, what function is to be called, and what data is to be passed back by simply filling in the orte_job_t fields prior to calling plm.spawn.
Also, fully implement the "copy" function for the orte_job_t object.
NOTE: as a result of this change, the errmgr.register_callback API may no longer be of any value.
This commit was SVN r21200.
2009-05-11 03:38:15 +00:00
|
|
|
#include "orte/runtime/orte_globals.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "orte/mca/plm/plm_types.h"
|
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
2005-05-11 20:21:10 +00:00
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
/*
|
|
|
|
* Macro definitions
|
|
|
|
*/
|
2005-09-20 17:09:11 +00:00
|
|
|
/*
|
|
|
|
* These macros and associated error name array are used to output intelligible error
|
|
|
|
* messages.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Expands to opal_strerror() applied to the given error code. */
#define ORTE_ERROR_NAME(errcode) opal_strerror(errcode)
|
2010-05-17 23:02:13 +00:00
|
|
|
/*
 * Report error code n through the errmgr logging hook, tagging it with the
 * source file and line of the call site.
 *
 * The expansion deliberately does NOT end in a semicolon: the caller
 * supplies it.  A trailing ';' inside the macro turns
 *     if (cond) ORTE_ERROR_LOG(rc); else ...
 * into two statements and orphans the 'else'.
 */
#define ORTE_ERROR_LOG(n) \
    orte_errmgr.logfn((n), __FILE__, __LINE__)
|
2010-04-05 22:59:21 +00:00
|
|
|
|
2010-08-19 13:09:20 +00:00
|
|
|
/*
|
|
|
|
* Framework Interfaces
|
|
|
|
*/
|
|
|
|
/**
 * Module initialization function.
 *
 * Takes no arguments and reports its outcome through the return value.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_init_fn_t)(void);
|
|
|
|
|
|
|
|
/**
 * Module finalization function.
 *
 * Takes no arguments and reports its outcome through the return value.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);
|
2008-02-28 01:57:57 +00:00
|
|
|
|
|
|
|
/**
 * Error logging function.
 *
 * This is not part of any module so it can be used at any time!
 *
 * @param error_code  The error code being reported
 * @param filename    Source file reporting the error (ORTE_ERROR_LOG passes __FILE__)
 * @param line        Source line reporting the error (ORTE_ERROR_LOG passes __LINE__)
 */
typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - self aborting
|
|
|
|
* This function is called when a process is aborting due to some internal error.
|
|
|
|
* It will finalize the process
|
|
|
|
* itself, and then exit - it takes no other actions. The intent here is to provide
|
|
|
|
* a last-ditch exit procedure that attempts to clean up a little.
|
|
|
|
*/
|
2010-08-31 14:51:19 +00:00
|
|
|
typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
|
2010-08-31 10:28:51 +00:00
|
|
|
__opal_attribute_format_funcptr__(__printf__, 2, 3);
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2011-06-15 13:10:13 +00:00
|
|
|
/**
|
|
|
|
* Alert - abort peers
|
|
|
|
* This function is called when a process wants to abort one or more peer processes.
|
|
|
|
* For example, MPI_Abort(comm) will use this function to terminate peers in the
|
|
|
|
* communicator group before aborting itself.
|
|
|
|
*/
|
|
|
|
typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
|
2013-10-08 18:37:59 +00:00
|
|
|
orte_std_cntr_t num_procs,
|
|
|
|
int error_code);
|
2011-06-15 13:10:13 +00:00
|
|
|
|
2005-01-21 17:49:14 +00:00
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Module Structure
|
2005-01-21 17:49:14 +00:00
|
|
|
*/
|
2008-07-28 22:40:57 +00:00
|
|
|
struct orte_errmgr_base_module_2_3_0_t {
|
2010-03-23 21:28:02 +00:00
|
|
|
/** Initialization Function */
|
Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup
George and I were talking about ORTE's error handling the other day in regards to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to be aborting the job on its own prerogative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.
The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.
In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)
The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.
Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.
This commit was SVN r28852.
2013-07-19 01:08:53 +00:00
|
|
|
orte_errmgr_base_module_init_fn_t init;
|
2010-03-23 21:28:02 +00:00
|
|
|
/** Finalization Function */
|
Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup
George and I were talking about ORTE's error handling the other day in regards to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to be aborting the job on its own prerogative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.
The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.
In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)
The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.
Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.
This commit was SVN r28852.
2013-07-19 01:08:53 +00:00
|
|
|
orte_errmgr_base_module_finalize_fn_t finalize;
|
2010-03-23 21:28:02 +00:00
|
|
|
|
2014-12-12 09:46:44 -05:00
|
|
|
orte_errmgr_base_module_log_fn_t logfn;
|
Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup
George and I were talking about ORTE's error handling the other day in regards to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to be aborting the job on its own prerogative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.
The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.
In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)
The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.
Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.
This commit was SVN r28852.
2013-07-19 01:08:53 +00:00
|
|
|
orte_errmgr_base_module_abort_fn_t abort;
|
|
|
|
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
2005-01-21 17:49:14 +00:00
|
|
|
};
|
2008-07-28 22:40:57 +00:00
|
|
|
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
|
|
|
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
2010-08-19 13:09:20 +00:00
|
|
|
/* The errmgr module instance; ORTE_ERROR_LOG calls through its logfn member. */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* ErrMgr Component
|
2005-01-21 17:49:14 +00:00
|
|
|
*/
|
2010-03-23 21:28:02 +00:00
|
|
|
struct orte_errmgr_base_component_3_0_0_t {
|
|
|
|
/** MCA base component */
|
2008-05-06 18:08:45 +00:00
|
|
|
mca_base_component_t base_version;
|
2010-03-23 21:28:02 +00:00
|
|
|
/** MCA base data */
|
2008-07-28 22:40:57 +00:00
|
|
|
mca_base_component_data_t base_data;
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2010-03-23 21:28:02 +00:00
|
|
|
/** Verbosity Level */
|
|
|
|
int verbose;
|
|
|
|
/** Output Handle for opal_output */
|
|
|
|
int output_handle;
|
|
|
|
/** Default Priority */
|
|
|
|
int priority;
|
|
|
|
};
|
|
|
|
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
|
|
|
|
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
 * Macro for use in components that are of type errmgr
 *
 * Expands to ORTE_MCA_BASE_VERSION_2_1_0 with the framework name "errmgr"
 * and framework version 3.0.0.
 */
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
    ORTE_MCA_BASE_VERSION_2_1_0("errmgr", 3, 0, 0)
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
END_C_DECLS
|
2005-05-11 20:21:10 +00:00
|
|
|
|
2005-01-21 17:49:14 +00:00
|
|
|
#endif
|