2005-01-21 17:49:14 +00:00
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
2005-11-05 19:57:48 +00:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2005-01-21 17:49:14 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 12:43:37 +00:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2012-06-27 01:28:28 +00:00
|
|
|
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
2013-03-27 21:14:43 +00:00
|
|
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
2012-04-06 14:23:13 +00:00
|
|
|
* All rights reserved.
|
2013-07-14 18:57:20 +00:00
|
|
|
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
2015-05-08 09:17:00 +09:00
|
|
|
* Copyright (c) 2014-2015 Research Organization for Information Science
|
2014-11-11 17:00:42 -08:00
|
|
|
* and Technology (RIST). All rights reserved.
|
2005-01-21 17:49:14 +00:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
#include "orte_config.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "orte/constants.h"
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2010-03-23 21:28:02 +00:00
|
|
|
#ifdef HAVE_STRING_H
|
|
|
|
#include <string.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
|
|
#include <sys/types.h>
|
|
|
|
#endif
|
|
|
|
|
2015-03-05 20:50:44 -07:00
|
|
|
#include "orte/mca/mca.h"
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "opal/mca/base/base.h"
|
2010-03-23 21:28:02 +00:00
|
|
|
|
|
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
#include "opal/util/output.h"
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2010-04-28 04:06:57 +00:00
|
|
|
#include "orte/util/show_help.h"
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/mca/errmgr/base/base.h"
|
2006-09-14 21:29:51 +00:00
|
|
|
#include "orte/mca/errmgr/base/errmgr_private.h"
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2005-07-04 18:24:58 +00:00
|
|
|
#include "orte/mca/errmgr/base/static-components.h"
|
2005-01-21 17:49:14 +00:00
|
|
|
|
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Globals
|
2005-01-21 17:49:14 +00:00
|
|
|
*/
|
2015-05-08 09:17:00 +09:00
|
|
|
orte_errmgr_base_t orte_errmgr_base = {{{0}}};
|
2011-06-23 20:38:02 +00:00
|
|
|
|
2010-03-23 21:28:02 +00:00
|
|
|
/* Public module provides a wrapper around previous functions */
|
2011-08-18 16:24:45 +00:00
|
|
|
orte_errmgr_base_module_t orte_errmgr_default_fns = {
|
2010-08-19 13:09:20 +00:00
|
|
|
NULL, /* init */
|
|
|
|
NULL, /* finalize */
|
2010-04-05 22:59:21 +00:00
|
|
|
orte_errmgr_base_log,
|
2010-08-19 13:09:20 +00:00
|
|
|
orte_errmgr_base_abort,
|
2011-06-15 13:10:13 +00:00
|
|
|
orte_errmgr_base_abort_peers,
|
2010-08-19 13:09:20 +00:00
|
|
|
NULL, /* predicted_fault */
|
|
|
|
NULL, /* suggest_map_targets */
|
2011-02-18 02:48:12 +00:00
|
|
|
NULL, /* ft_event */
|
Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup
George and I were talking about ORTE's error handling the other day in regards to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to be aborting the job on its own prerogative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.
The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.
In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)
The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.
Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.
This commit was SVN r28852.
2013-07-19 01:08:53 +00:00
|
|
|
orte_errmgr_base_register_migration_warning,
|
|
|
|
orte_errmgr_base_register_error_callback,
|
|
|
|
orte_errmgr_base_execute_error_callbacks
|
2010-03-23 21:28:02 +00:00
|
|
|
};
|
2011-10-14 18:45:11 +00:00
|
|
|
/* NOTE: ABSOLUTELY MUST initialize this
|
|
|
|
* struct to include the log function as it
|
|
|
|
* gets called even if the errmgr hasn't been
|
|
|
|
* opened yet due to error
|
|
|
|
*/
|
|
|
|
orte_errmgr_base_module_t orte_errmgr = {
|
|
|
|
NULL,
|
|
|
|
NULL,
|
|
|
|
orte_errmgr_base_log,
|
|
|
|
NULL,
|
|
|
|
NULL,
|
|
|
|
NULL,
|
|
|
|
NULL,
|
Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup
George and I were talking about ORTE's error handling the other day in regards to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to be aborting the job on its own prerogative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.
The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.
In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)
The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.
Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.
This commit was SVN r28852.
2013-07-19 01:08:53 +00:00
|
|
|
NULL,
|
|
|
|
NULL,
|
2011-10-14 18:45:11 +00:00
|
|
|
NULL
|
|
|
|
};
|
2005-01-21 17:49:14 +00:00
|
|
|
|
2013-03-27 21:14:43 +00:00
|
|
|
static int orte_errmgr_base_close(void)
|
2005-01-21 17:49:14 +00:00
|
|
|
{
|
2013-03-27 21:14:43 +00:00
|
|
|
/* Close selected component */
|
|
|
|
if (NULL != orte_errmgr.finalize) {
|
|
|
|
orte_errmgr.finalize();
|
2010-03-23 21:28:02 +00:00
|
|
|
}
|
|
|
|
|
2013-03-27 21:14:43 +00:00
|
|
|
/* always leave a default set of fn pointers */
|
|
|
|
orte_errmgr = orte_errmgr_default_fns;
|
|
|
|
|
Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup
George and I were talking about ORTE's error handling the other day in regards to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to be aborting the job on its own prerogative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.
The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.
In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)
The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.
Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.
This commit was SVN r28852.
2013-07-19 01:08:53 +00:00
|
|
|
/* destruct the callback list */
|
|
|
|
OPAL_LIST_DESTRUCT(&orte_errmgr_base.error_cbacks);
|
|
|
|
|
2013-03-27 21:14:43 +00:00
|
|
|
return mca_base_framework_components_close(&orte_errmgr_base_framework, NULL);
|
|
|
|
}
|
2010-03-23 21:28:02 +00:00
|
|
|
|
2013-03-27 21:14:43 +00:00
|
|
|
/**
|
|
|
|
* * Function for finding and opening either all MCA components, or the one
|
|
|
|
* * that was specifically requested via a MCA parameter.
|
|
|
|
* */
|
|
|
|
static int orte_errmgr_base_open(mca_base_open_flag_t flags)
|
|
|
|
{
|
2011-08-18 16:24:45 +00:00
|
|
|
/* load the default fns */
|
|
|
|
orte_errmgr = orte_errmgr_default_fns;
|
|
|
|
|
Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup
George and I were talking about ORTE's error handling the other day in regards to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to be aborting the job on its own prerogative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.
The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.
In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)
The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.
Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.
This commit was SVN r28852.
2013-07-19 01:08:53 +00:00
|
|
|
/* initialize the error callback list */
|
|
|
|
OBJ_CONSTRUCT(&orte_errmgr_base.error_cbacks, opal_list_t);
|
|
|
|
|
2013-03-27 21:14:43 +00:00
|
|
|
/* Open up all available components */
|
|
|
|
return mca_base_framework_components_open(&orte_errmgr_base_framework, flags);
|
2005-01-21 17:49:14 +00:00
|
|
|
}
|
2013-03-27 21:14:43 +00:00
|
|
|
|
|
|
|
/*
 * Declare the errmgr framework for the orte project, wiring in the open
 * and close functions defined above and the list of statically built
 * components.
 * NOTE(review): presumably this macro is what defines
 * orte_errmgr_base_framework, which open/close reference - confirm
 * against the MCA base headers.
 */
MCA_BASE_FRAMEWORK_DECLARE(orte, errmgr, "ORTE Error Manager", NULL,
                           orte_errmgr_base_open,
                           orte_errmgr_base_close,
                           mca_errmgr_base_static_components,
                           0);
|
|
|
|
|
Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup
George and I were talking about ORTE's error handling the other day in regards to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to be aborting the job on its own prerogative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.
The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.
In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)
The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.
Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.
This commit was SVN r28852.
2013-07-19 01:08:53 +00:00
|
|
|
OBJ_CLASS_INSTANCE(orte_errmgr_cback_t,
|
|
|
|
opal_list_item_t,
|
|
|
|
NULL, NULL);
|