1
1

Per the RFC and discussion on the devel list, update the RTE-MPI error handling interface. There are a few differences in the code from the original RFC that came out of the discussion - I've captured those in the following writeup.

George and I were talking about ORTE's error handling the other day with regard to the right way to deal with errors in the updated OOB. Specifically, it seemed a bad idea for a library such as ORTE to abort the job on its own initiative. If we lose a connection or cannot send a message, then we really should just report it upwards and let the application and/or upper layers decide what to do about it.

The current code base only allows a single error callback to exist, which seemed unduly limiting. So, based on the conversation, I've modified the errmgr interface to provide a mechanism for registering any number of error handlers (this replaces the current "set_fault_callback" API). When an error occurs, these handlers will be called in order until one responds that the error has been "resolved" - i.e., no further action is required - by returning OMPI_SUCCESS. The default MPI layer error handler is specified to go "last" and calls mpi_abort, so the current "abort" behavior is preserved unless other error handlers are registered.

In the register_callback function, I provide an "order" param so you can specify "this callback must come first" or "this callback must come last". Seemed to me that we will probably have different code areas registering callbacks, and one might require it go first (the default "abort" will always require it go last). So you can append and prepend, or go first. Note that only one registration can declare itself "first" or "last", and since the default "abort" callback automatically takes "last", that one isn't available. :-)

The errhandler callback function passes an opal_pointer_array of structs, each of which contains the name of the proc involved (which can be yourself for internal errors) and the error code. This is a change from the current fault callback which returned an opal_pointer_array of just process names. Rationale is that you might need to see the cause of the error to decide what action to take. I realize that isn't a requirement for remote procs, but remember that we will use the SAME interface to report RTE errors internal to the proc itself. In those cases, you really do need to see the error code. It is legal to pass a NULL for the pointer array (e.g., when reporting an internal failure without error code), so handlers must be prepared for that possibility. If people find that too burdensome, we can remove it.

Should we ever decide to create a separate callback path for internal errors vs remote process failures, or if we decide to do something different based on experience, then we can adjust this API.

This commit was SVN r28852.
Этот коммит содержится в:
Ralph Castain 2013-07-19 01:08:53 +00:00
родитель 6c50c8167c
Коммит e4e678e234
12 изменённых файлов: 201 добавлений и 81 удалений

Просмотреть файл

@ -214,10 +214,19 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
}
/**
* Runtime errhandler callback
* Default runtime errhandler callback
*/
void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs) {
ompi_mpi_abort(MPI_COMM_WORLD, 1, false);
/**
 * Default runtime errhandler callback.
 *
 * Aborts MPI_COMM_WORLD. The abort code is the errcode of the first entry
 * in the error array when one is available, and 1 otherwise.
 *
 * @param errors  Pointer array of ompi_rte_error_report_t entries. May be
 *                NULL, or may hold no entry at index 0.
 *
 * @return OMPI_SUCCESS after ompi_mpi_abort() has been invoked.
 */
int ompi_errhandler_runtime_callback(opal_pointer_array_t *errors) {
    ompi_rte_error_report_t *err = NULL;
    int errcode = 1;

    /* Both tests must pass before err is dereferenced: the array must
     * exist AND its first slot must hold a report. Using "||" here would
     * either skip the lookup (leaving err unset) or look up into a NULL
     * array.
     */
    if (NULL != errors &&
        (NULL != (err = (ompi_rte_error_report_t*)opal_pointer_array_get_item(errors, 0)))) {
        errcode = err->errcode;
    }

    ompi_mpi_abort(MPI_COMM_WORLD, errcode, false);
    return OMPI_SUCCESS;
}
/**************************************************************************

Просмотреть файл

@ -31,6 +31,7 @@
#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/errhandler/errhandler_predefined.h"
#include "ompi/errhandler/errcode-internal.h"
@ -366,14 +367,25 @@ struct ompi_request_t;
* Callback function from runtime layer to alert the MPI layer of an error at
* the runtime layer.
*
* @param procs The names of the processes that have failed.
* @param errors A pointer array containing structs of type
* ompi_rte_error_report_t that consists of at least
* {
* ompi_process_name_t proc;
* int errcode;
* }
* Each RTE is allowed to add additional information
* as required
*
* This function is used to alert the MPI layer to a specific fault at the
* runtime layer. Currently, the only faults reported using this method are
* process failures. The MPI layer has the option to perform whatever actions it
* needs to stabalize itself and continue running, abort, etc.
* This function is used to alert the MPI layer to a specific fault detected by the
* runtime layer. This could be a process failure, a lost connection, or the inability
* to send an OOB message. The MPI layer has the option to perform whatever actions it
* needs to stabilize itself and continue running, abort, etc.
*
* Upon completion, the error handler should return OMPI_SUCCESS if the error has
* been resolved and no further callbacks are to be executed. Return of any other
* value will cause the RTE to continue executing error callbacks.
*/
OMPI_DECLSPEC void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs);
OMPI_DECLSPEC int ompi_errhandler_runtime_callback(opal_pointer_array_t *errors);
/**
* Check to see if an errhandler is intrinsic.

Просмотреть файл

@ -69,7 +69,12 @@ typedef orte_local_rank_t ompi_local_rank_t;
/* Error handling objects and operations */
OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);
#define ompi_rte_abort_peers(a, b) orte_errmgr.abort_peers(a, b)
#define ompi_rte_set_fault_callback(a)
#define OMPI_RTE_ERRHANDLER_FIRST ORTE_ERRMGR_CALLBACK_FIRST
#define OMPI_RTE_ERRHANDLER_LAST ORTE_ERRMGR_CALLBACK_LAST
#define OMPI_RTE_ERRHANDLER_PREPEND ORTE_ERRMGR_CALLBACK_PREPEND
#define OMPI_RTE_ERRHANDLER_APPEND ORTE_ERRMGR_CALLBACK_APPEND
typedef orte_error_t ompi_rte_error_report_t;
#define ompi_rte_register_errhandler(a, b) orte_errmgr.register_error_callback(a, b)
#define OMPI_ERROR_LOG ORTE_ERROR_LOG
/* Init and finalize objects and operations */

Просмотреть файл

@ -482,10 +482,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
}
#endif
/* Register errhandler callback - RTE will ignore if it
/* Register the default errhandler callback - RTE will ignore if it
* doesn't support this capability
*/
ompi_rte_set_fault_callback(ompi_errhandler_runtime_callback);
ompi_rte_register_errhandler(ompi_errhandler_runtime_callback,
OMPI_RTE_ERRHANDLER_LAST);
/* Figure out the final MPI thread levels. If we were not
compiled for support for MPI threads, then don't allow

Просмотреть файл

@ -89,15 +89,10 @@ ORTE_DECLSPEC int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global
ORTE_DECLSPEC int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_t *datum);
/* Interface to report process state to the notifier */
ORTE_DECLSPEC void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, orte_process_name_t *proc);
ORTE_DECLSPEC int orte_errmgr_base_proc_state_notify(orte_proc_state_t state, orte_process_name_t *proc);
#endif /* OPAL_ENABLE_FT_CR */
/*
* Additional External API function declared in errmgr.h
*/
ORTE_DECLSPEC orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc);
END_C_DECLS
#endif

Просмотреть файл

@ -245,6 +245,95 @@ int orte_errmgr_base_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num
return ORTE_ERR_NOT_IMPLEMENTED;
}
/**
 * Register an error callback on the framework-wide callback list.
 *
 * The position of the new entry in orte_errmgr_base.error_cbacks is
 * determined by the requested order:
 *   - ORTE_ERRMGR_CALLBACK_FIRST:   placed at the head of the list
 *   - ORTE_ERRMGR_CALLBACK_LAST:    placed at the tail of the list
 *   - ORTE_ERRMGR_CALLBACK_PREPEND: placed at the head, or directly behind
 *                                   an existing FIRST entry
 *   - ORTE_ERRMGR_CALLBACK_APPEND:  placed at the tail, or directly ahead
 *                                   of an existing LAST entry
 *
 * @param cbfunc  The callback to register.
 * @param order   Where the callback is to be placed.
 *
 * @retval ORTE_SUCCESS            The callback was added to the list.
 * @retval ORTE_ERR_NOT_SUPPORTED  A FIRST (or LAST) callback is already
 *                                 registered, or the order value is not
 *                                 one of the recognized constants.
 */
int orte_errmgr_base_register_error_callback(orte_errmgr_error_callback_fn_t *cbfunc,
                                             orte_errmgr_error_order_t order)
{
    orte_errmgr_cback_t *cb, *cbcur;

    /* check the order to see what to do */
    switch(order) {
    case ORTE_ERRMGR_CALLBACK_FIRST:
        /* only one can be so designated */
        if (NULL != (cb = (orte_errmgr_cback_t*)opal_list_get_first(&orte_errmgr_base.error_cbacks))) {
            if (ORTE_ERRMGR_CALLBACK_FIRST == cb->order) {
                return ORTE_ERR_NOT_SUPPORTED;
            }
        }
        cb = OBJ_NEW(orte_errmgr_cback_t);
        cb->order = order;
        cb->callback = cbfunc;
        opal_list_prepend(&orte_errmgr_base.error_cbacks, &cb->super);
        break;

    case ORTE_ERRMGR_CALLBACK_LAST:
        /* only one can be so designated */
        if (NULL != (cb = (orte_errmgr_cback_t*)opal_list_get_last(&orte_errmgr_base.error_cbacks))) {
            if (ORTE_ERRMGR_CALLBACK_LAST == cb->order) {
                return ORTE_ERR_NOT_SUPPORTED;
            }
        }
        cb = OBJ_NEW(orte_errmgr_cback_t);
        cb->order = order;
        cb->callback = cbfunc;
        opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
        break;

    case ORTE_ERRMGR_CALLBACK_PREPEND:
        cb = OBJ_NEW(orte_errmgr_cback_t);
        cb->order = order;
        cb->callback = cbfunc;
        if (NULL != (cbcur = (orte_errmgr_cback_t*)opal_list_get_first(&orte_errmgr_base.error_cbacks)) &&
            ORTE_ERRMGR_CALLBACK_FIRST == cbcur->order) {
            /* keep the designated FIRST entry at the head */
            opal_list_insert(&orte_errmgr_base.error_cbacks, &cb->super, 1);
        } else {
            opal_list_prepend(&orte_errmgr_base.error_cbacks, &cb->super);
        }
        break;

    case ORTE_ERRMGR_CALLBACK_APPEND:
        cb = OBJ_NEW(orte_errmgr_cback_t);
        cb->order = order;
        cb->callback = cbfunc;
        if (NULL != (cbcur = (orte_errmgr_cback_t*)opal_list_get_last(&orte_errmgr_base.error_cbacks)) &&
            ORTE_ERRMGR_CALLBACK_LAST == cbcur->order) {
            /* keep the designated LAST entry at the tail */
            opal_list_insert_pos(&orte_errmgr_base.error_cbacks, &cbcur->super, &cb->super);
        } else {
            opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
        }
        /* The item is linked exactly once by the branch above; it must
         * not be appended a second time, as a list item can only belong
         * to the list once.
         */
        break;

    default:
        /* An unrecognized order registers nothing, so do not report
         * success - the caller's handler would silently never run.
         */
        return ORTE_ERR_NOT_SUPPORTED;
    }

    return ORTE_SUCCESS;
}
/**
 * Run the registered error callbacks for a set of reported errors.
 *
 * If no callback has been registered, the process is aborted through
 * orte_errmgr.abort(). The exit code and message are taken from the first
 * entry of the error array when available; otherwise
 * ORTE_ERROR_DEFAULT_EXIT_CODE is used and no message is given.
 *
 * Otherwise the callbacks are invoked in list order until the list is
 * exhausted or one of them returns ORTE_SUCCESS.
 *
 * @param errors  Pointer array of orte_error_t entries. May be NULL.
 */
void orte_errmgr_base_execute_error_callbacks(opal_pointer_array_t *errors)
{
    orte_errmgr_cback_t *cb;
    /* must start out NULL: it is tested below even when no error entry
     * was available to fill it in
     */
    char *errstring = NULL;
    orte_error_t *err;
    int errcode = ORTE_ERROR_DEFAULT_EXIT_CODE;

    /* if no callbacks have been provided, then we abort */
    if (0 == opal_list_get_size(&orte_errmgr_base.error_cbacks)) {
        /* take the first entry, if available */
        if (NULL != errors &&
            (NULL != (err = (orte_error_t*)opal_pointer_array_get_item(errors, 0)))) {
            errstring = (char*)ORTE_ERROR_NAME(err->errcode);
            errcode = err->errcode;
        }
        if (NULL == errstring) {
            /* if the error is silent, say nothing */
            orte_errmgr.abort(errcode, NULL);
        } else {
            orte_errmgr.abort(errcode, "Executing default error callback: %s", errstring);
        }
        /* nothing to iterate over should abort ever return */
        return;
    }

    /* cycle across the provided callbacks until we complete the list
     * or one reports that no further action is required
     */
    OPAL_LIST_FOREACH(cb, &orte_errmgr_base.error_cbacks, orte_errmgr_cback_t) {
        if (ORTE_SUCCESS == cb->callback(errors)) {
            break;
        }
    }
}
/********************
* Utility functions
@ -651,18 +740,6 @@ int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_
#endif
orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc) {
orte_errmgr_fault_callback_t *temp_cbfunc = fault_cbfunc;
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
"%s errmgr:base Called set_fault_callback",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
fault_cbfunc = cbfunc;
return temp_cbfunc;
}
/********************
* Local Functions
********************/

Просмотреть файл

@ -50,7 +50,7 @@
/*
* Globals
*/
orte_errmgr_fault_callback_t *fault_cbfunc;
orte_errmgr_base_t orte_errmgr_base;
/* Public module provides a wrapper around previous functions */
orte_errmgr_base_module_t orte_errmgr_default_fns = {
@ -62,7 +62,9 @@ orte_errmgr_base_module_t orte_errmgr_default_fns = {
NULL, /* predicted_fault */
NULL, /* suggest_map_targets */
NULL, /* ft_event */
orte_errmgr_base_register_migration_warning
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_register_error_callback,
orte_errmgr_base_execute_error_callbacks
};
/* NOTE: ABSOLUTELY MUST initialize this
* struct to include the log function as it
@ -77,6 +79,8 @@ orte_errmgr_base_module_t orte_errmgr = {
NULL,
NULL,
NULL,
NULL,
NULL,
NULL
};
@ -90,6 +94,9 @@ static int orte_errmgr_base_close(void)
/* always leave a default set of fn pointers */
orte_errmgr = orte_errmgr_default_fns;
/* destruct the callback list */
OPAL_LIST_DESTRUCT(&orte_errmgr_base.error_cbacks);
return mca_base_framework_components_close(&orte_errmgr_base_framework, NULL);
}
@ -102,6 +109,9 @@ static int orte_errmgr_base_open(mca_base_open_flag_t flags)
/* load the default fns */
orte_errmgr = orte_errmgr_default_fns;
/* initialize the error callback list */
OBJ_CONSTRUCT(&orte_errmgr_base.error_cbacks, opal_list_t);
/* Open up all available components */
return mca_base_framework_components_open(&orte_errmgr_base_framework, flags);
}
@ -110,3 +120,6 @@ MCA_BASE_FRAMEWORK_DECLARE(orte, errmgr, "ORTE Error Manager", NULL,
orte_errmgr_base_open, orte_errmgr_base_close,
mca_errmgr_base_static_components, 0);
OBJ_CLASS_INSTANCE(orte_errmgr_cback_t,
opal_list_item_t,
NULL, NULL);

Просмотреть файл

@ -48,19 +48,18 @@ BEGIN_C_DECLS
/* define a struct to hold framework-global values */
typedef struct {
int output;
bool initialized;
opal_list_t error_cbacks;
} orte_errmgr_base_t;
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
/* Define the ERRMGR command flag */
typedef uint8_t orte_errmgr_cmd_flag_t;
#define ORTE_ERRMGR_CMD OPAL_UINT8
/* define some commands */
#define ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD 0x01
#define ORTE_ERRMGR_REGISTER_CALLBACK_CMD 0x02
/* One registered error callback. Instances are kept as items on the
 * framework-global list orte_errmgr_base.error_cbacks.
 */
typedef struct {
/* list linkage - lets the entry be stored on an opal_list_t */
opal_list_item_t super;
/* ordering that was requested when the callback was registered */
orte_errmgr_error_order_t order;
/* function to invoke when errors are reported */
orte_errmgr_error_callback_fn_t *callback;
} orte_errmgr_cback_t;
OBJ_CLASS_DECLARATION(orte_errmgr_cback_t);
/* declare the base default module */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_fns;
@ -77,5 +76,10 @@ ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);
ORTE_DECLSPEC int orte_errmgr_base_register_error_callback(orte_errmgr_error_callback_fn_t *cbfunc,
orte_errmgr_error_order_t order);
ORTE_DECLSPEC void orte_errmgr_base_execute_error_callbacks(opal_pointer_array_t *errors);
END_C_DECLS
#endif

Просмотреть файл

@ -49,7 +49,6 @@ static int finalize(void);
static int abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
static orte_errmgr_fault_callback_t* set_fault_callback(orte_errmgr_fault_callback_t *cbfunc);
/******************
* HNP module
@ -64,7 +63,8 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = {
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
set_fault_callback
orte_errmgr_base_register_error_callback,
orte_errmgr_base_execute_error_callbacks
};
static void proc_errors(int fd, short args, void *cbdata);
@ -135,8 +135,3 @@ static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
}
return ORTE_SUCCESS;
}
static orte_errmgr_fault_callback_t* set_fault_callback(orte_errmgr_fault_callback_t *cbfunc)
{
return NULL;
}

Просмотреть файл

@ -89,7 +89,8 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
NULL
NULL,
orte_errmgr_base_execute_error_callbacks
};

Просмотреть файл

@ -85,7 +85,8 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
NULL
NULL,
orte_errmgr_base_execute_error_callbacks
};
/* Local functions */

Просмотреть файл

@ -92,22 +92,6 @@ struct orte_errmgr_predicted_node_t {
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
/*
* Callback function that should be called when there is a fault.
*
* This callback function will be used anytime (other than during finalize) the
* runtime detects and handles a process failure. The runtime will complete all
* its stabilization before alerting the callback function. The parameter to the
* callback function will be the orte_process_name_t of the process that failed.
* It will not alert the application to failures that are not in the same job as
* the alerted process, only failures within the same jobid.
*
* @param[in] proc The names of the process that failed
*/
typedef void (orte_errmgr_fault_callback_t)(opal_pointer_array_t *procs);
ORTE_DECLSPEC extern orte_errmgr_fault_callback_t *fault_cbfunc;
/*
* Structure to describe a suggested remapping element for a predicted fault.
*
@ -242,41 +226,64 @@ typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
*/
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
/* Placement requested for an error callback at registration time.
 * The base registration function places entries as follows:
 *   FIRST   - head of the callback list; rejected if the current head
 *             is already marked FIRST
 *   LAST    - tail of the callback list; rejected if the current tail
 *             is already marked LAST
 *   PREPEND - head of the list, or directly behind a FIRST entry
 *   APPEND  - tail of the list, or directly ahead of a LAST entry
 */
typedef enum {
ORTE_ERRMGR_CALLBACK_FIRST,
ORTE_ERRMGR_CALLBACK_LAST,
ORTE_ERRMGR_CALLBACK_PREPEND,
ORTE_ERRMGR_CALLBACK_APPEND
} orte_errmgr_error_order_t;
/**
* Set the callback function for faults.
*
* Register a callback function for faults.
*
* This callback function will be used anytime (other than during finalize) the
* runtime detects and handles a critical failure. The runtime will complete all
* its stabilization before cycling thru all registered callbacks. The order of
* the callbacks will proceed in the indicated order with which they were registered.
*
* The parameter to the callback function will be the orte_process_name_t
* of the process involved in the error.
*
* @param[in] cbfunc The callback function.
*
* @retval The previous fault callback function.
*/
typedef orte_errmgr_fault_callback_t *(*orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc);
/* A single error report handed to the error callbacks. The callbacks
 * receive an opal_pointer_array_t whose items are of this type.
 */
typedef struct {
/* name of the process involved in the error */
orte_process_name_t proc;
/* error code describing what went wrong */
int errcode;
} orte_error_t;
typedef int (orte_errmgr_error_callback_fn_t)(opal_pointer_array_t *errors);
typedef int (*orte_errmgr_base_module_register_error_callback_fn_t)(orte_errmgr_error_callback_fn_t *cbfunc,
orte_errmgr_error_order_t order);
typedef void (*orte_errmgr_base_module_execute_error_callbacks_fn_t)(opal_pointer_array_t *errors);
/*
* Module Structure
*/
struct orte_errmgr_base_module_2_3_0_t {
/** Initialization Function */
orte_errmgr_base_module_init_fn_t init;
orte_errmgr_base_module_init_fn_t init;
/** Finalization Function */
orte_errmgr_base_module_finalize_fn_t finalize;
orte_errmgr_base_module_finalize_fn_t finalize;
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
/** Predicted process/node failure notification */
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
/** Suggest a node to map a restarting process onto */
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
/** Handle any FT Notifications */
orte_errmgr_base_module_ft_event_fn_t ft_event;
orte_errmgr_base_module_ft_event_fn_t ft_event;
/* Register to be warned of impending migration */
/* Register to be warned of impending migration */
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
/* Set the callback function */
orte_errmgr_base_module_set_fault_callback_t set_fault_callback;
/* Register a callback function */
orte_errmgr_base_module_register_error_callback_fn_t register_error_callback;
orte_errmgr_base_module_execute_error_callbacks_fn_t execute_error_callbacks;
};
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;