2005-01-21 20:49:14 +03:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2005-09-20 21:09:11 +04:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-01-21 20:49:14 +03:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2005-01-21 20:49:14 +03:00
|
|
|
* $COPYRIGHT$
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-01-21 20:49:14 +03:00
|
|
|
* Additional copyrights may follow
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-01-21 20:49:14 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/** @file:
|
|
|
|
*
|
|
|
|
* The Open RTE Error Manager
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#ifndef ORTE_MCA_ERRMGR_H
|
|
|
|
#define ORTE_MCA_ERRMGR_H
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* includes
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#include "orte_config.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/orte_constants.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
|
2005-12-08 03:05:26 +03:00
|
|
|
#include "orte/mca/schema/schema.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
#include "orte/mca/gpr/gpr_types.h"
|
|
|
|
#include "orte/mca/ns/ns_types.h"
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2005-12-08 03:05:26 +03:00
|
|
|
#include "opal/mca/mca.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2005-05-12 00:21:10 +04:00
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
|
|
|
|
* Macro definitions
|
|
|
|
*/
|
2005-09-20 21:09:11 +04:00
|
|
|
/*
|
|
|
|
* These macros and associated error name array are used to output intelligible error
|
|
|
|
* messages.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Expands to opal_strerror(n) for the error code n.
 * NOTE(review): none of the includes visible in this header obviously
 * declares opal_strerror; users of this macro presumably need to include
 * the OPAL header that does - confirm. */
#define ORTE_ERROR_NAME(n) opal_strerror(n)
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/* Report error code n through the global error manager module's log entry,
 * tagging the report with the source file and line of the call site. */
#define ORTE_ERROR_LOG(n) orte_errmgr.log((n), __FILE__, __LINE__)
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Component functions - all MUST be provided!
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Log an error
|
2006-09-15 01:29:51 +04:00
|
|
|
* Log an error that occurred in the runtime environment
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-01-21 20:49:14 +03:00
|
|
|
* @code
|
2005-03-14 23:57:21 +03:00
|
|
|
* orte_errmgr.log(error_code, __FILE__, __LINE__);
|
2005-01-21 20:49:14 +03:00
|
|
|
* @endcode
|
|
|
|
*/
|
2005-03-14 23:57:21 +03:00
|
|
|
/* Signature of a module's log entry point: takes the integer error code and
 * the source file name and line number of the report (the ORTE_ERROR_LOG
 * macro supplies __FILE__ and __LINE__); returns nothing. */
typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - process aborted
|
2006-09-15 01:29:51 +04:00
|
|
|
* This function is called when a remote process aborts during execution. The function
|
|
|
|
* is called via the GPR's trigger notification system. Actions taken in response
|
|
|
|
* to the abnormal termination of a remote application process will vary across
|
|
|
|
* the various errmgr components.
|
|
|
|
|
|
|
|
* NOTE: Local process errors should always be reported through the error_detected interface and
|
|
|
|
* NOT here.
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
2006-09-15 01:29:51 +04:00
|
|
|
/* Signature of a module's proc_aborted handler: receives a pointer to the
 * GPR notify message and returns an int status (presumably an ORTE_*
 * constant - confirm). */
typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_gpr_notify_message_t *msg);
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - incomplete start of a job
|
|
|
|
* This function is called when an attempted launch of a job encounters failure of
|
2006-09-15 01:29:51 +04:00
|
|
|
* one or more processes to start. The strategy for dealing
|
|
|
|
* with this "incomplete start" situation varies across the various errmgr components.
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-03-14 23:57:21 +03:00
|
|
|
* This function is only called by the respective process launcher, which is responsible
|
2006-09-15 01:29:51 +04:00
|
|
|
* for detecting incomplete starts. If on a daemon, the function simply updates the
|
|
|
|
* process state to indicate failure to launch - this initiates a trigger that goes to
|
|
|
|
* the respective HNP for response.
|
|
|
|
*
|
|
|
|
* NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden
|
|
|
|
* from taking any action to this function call. Instead, they are restricted to simply
|
|
|
|
* returning.
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
2006-09-15 01:29:51 +04:00
|
|
|
/* Signature of a module's incomplete_start handler: receives a pointer to a
 * GPR notify message and returns an int status (presumably an ORTE_*
 * constant - confirm). */
typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_gpr_notify_message_t *msg);
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - internal error detected
|
2006-09-15 01:29:51 +04:00
|
|
|
* This function is called when an internal error is detected within a local process.
|
|
|
|
* It decides what to do about the error. In the case of application processes, it simply
|
|
|
|
* orders the local process to finalize and terminate. The abnormal termination will be
|
|
|
|
* detected and dealt with by the daemon/HNP system.
|
|
|
|
*
|
|
|
|
* HNPs, of course, cannot simply exit - they must first cleanup their running jobs if at
|
|
|
|
* all possible. In some cases, this cannot be done - e.g., if the error detected would
|
|
|
|
* prevent operation of the registry or has corrupted memory. In these extreme cases,
|
|
|
|
* nothing can really be done.
|
|
|
|
*
|
|
|
|
* Likewise, orteds have responsibility towards their local application processes and
|
|
|
|
* must make some attempt to clean them up before exiting.
|
|
|
|
*
|
|
|
|
* The function pretty prints an error message if possible. Error message should be
|
|
|
|
* specified using the standard \c printf() format.
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
2006-09-15 01:29:51 +04:00
|
|
|
/* Signature of a module's error_detected handler: takes the integer error
 * code, a printf-style format string and that format's variadic arguments;
 * returns nothing. */
typedef void (*orte_errmgr_base_module_error_detected_fn_t)(int error_code, char *fmt, ...);
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Register a job with the error manager
|
|
|
|
* When a job is launched, this function is called so the error manager can register
|
|
|
|
* subscriptions on the job segment so that the error manager will be notified when
|
|
|
|
* problems occur - i.e., when process status entries change to abnormal termination
|
2006-09-15 01:29:51 +04:00
|
|
|
* values. Process status entries are changed by the appropriate state monitor
|
2005-03-14 23:57:21 +03:00
|
|
|
* and/or the process launcher, depending upon the stage at which the problem occurs.
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-03-14 23:57:21 +03:00
|
|
|
* Monitoring of the job begins once the job has reached the "executing" stage. Prior
|
|
|
|
* to that time, failure of processes to start are the responsibility of the respective
|
|
|
|
* process launcher - which is expected to call the error manager via the "incomplete
|
|
|
|
* start" interface to report any problems prior to the job beginning "execution".
|
2006-09-15 01:29:51 +04:00
|
|
|
*
|
|
|
|
* NOTE: ONLY HNPs are allowed to register for trigger reports. All other components
|
|
|
|
* MUST do nothing but return ORTE_SUCCESS.
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
|
|
|
/* Signature of a module's register_job entry point: takes the jobid of the
 * job being registered and returns an int status (ORTE_SUCCESS for
 * components that take no action). */
typedef int (*orte_errmgr_base_module_register_job_fn_t)(orte_jobid_t job);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - self aborting
|
2006-09-15 01:29:51 +04:00
|
|
|
* This function is called when a process is aborting. It will finalize the process
|
|
|
|
* itself, and then exits - it takes no other actions. The intent here is to provide
|
|
|
|
* a last-ditch exit procedure that attempts to clean up a little.
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
|
|
|
/* Signature of a module's abort entry point: takes no arguments and has no
 * return value. */
typedef void (*orte_errmgr_base_module_abort_fn_t)(void);
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/**
|
|
|
|
* Request that the system abort processes other than myself
|
|
|
|
* The possibility exists that a process will decide that ONLY a small subset of a job
|
|
|
|
* must be aborted. This function allows a process to request that the identified
|
|
|
|
* processes be aborted. The "request" portion of the function's name is not
|
|
|
|
* by accident - this function specifically does NOT perform the abort process
|
|
|
|
* itself, but simply requests that it be done.
|
|
|
|
*
|
|
|
|
* NOTE: Please ensure that you do NOT include your own process name in the
|
|
|
|
* array or else you will be ordered to "die" before you complete this function
|
|
|
|
* (i.e., you will be held in a blocking receive pending an answer from the
|
|
|
|
* HNP, which won't come before you receive your own "die" command). If you need
|
|
|
|
* to die too, then call "abort" after completing this function call.
|
|
|
|
*/
|
|
|
|
/* Signature of a module's abort_procs_request entry point: procs is the
 * array of process names to be aborted and num_procs is presumably the
 * number of entries in that array (confirm); returns an int status. */
typedef int (*orte_errmgr_base_module_abort_procs_request_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs);
|
|
|
|
|
2005-01-21 20:49:14 +03:00
|
|
|
/*
|
|
|
|
* Ver 1.3.0
|
|
|
|
*/
|
2006-09-15 01:29:51 +04:00
|
|
|
struct orte_errmgr_base_module_1_3_0_t {
|
|
|
|
orte_errmgr_base_module_log_fn_t log;
|
|
|
|
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
|
|
|
|
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
|
|
|
|
orte_errmgr_base_module_error_detected_fn_t error_detected;
|
|
|
|
orte_errmgr_base_module_register_job_fn_t register_job;
|
|
|
|
orte_errmgr_base_module_abort_fn_t abort;
|
|
|
|
orte_errmgr_base_module_abort_procs_request_fn_t abort_procs_request;
|
2005-01-21 20:49:14 +03:00
|
|
|
};
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
typedef struct orte_errmgr_base_module_1_3_0_t orte_errmgr_base_module_1_3_0_t;
|
|
|
|
typedef orte_errmgr_base_module_1_3_0_t orte_errmgr_base_module_t;
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ERRMGR Component
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
 * Signature of a component's init entry point. It returns a pointer to the
 * component's error manager module.
 * NOTE(review): the three pointer arguments look like outputs through which
 * the component reports its threading properties and selection priority,
 * and a NULL return presumably means "do not select me" - confirm against
 * the MCA base before relying on either.
 */
typedef orte_errmgr_base_module_t *
(*orte_errmgr_base_component_init_fn_t)(bool *allow_multi_user_threads,
                                        bool *have_hidden_threads,
                                        int *priority);
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/* Signature of a component's finalize entry point: takes no arguments and
 * returns an int status. */
typedef int (*orte_errmgr_base_component_finalize_fn_t)(void);
|
2005-09-20 21:09:11 +04:00
|
|
|
|
2005-01-21 20:49:14 +03:00
|
|
|
/*
|
|
|
|
* the standard component data structure
|
|
|
|
*/
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
struct mca_errmgr_base_component_1_3_0_t {
|
2005-01-21 20:49:14 +03:00
|
|
|
mca_base_component_t errmgr_version;
|
|
|
|
mca_base_component_data_1_0_0_t errmgr_data;
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_errmgr_base_component_init_fn_t errmgr_init;
|
|
|
|
orte_errmgr_base_component_finalize_fn_t errmgr_finalize;
|
2005-01-21 20:49:14 +03:00
|
|
|
};
|
2006-09-15 01:29:51 +04:00
|
|
|
typedef struct mca_errmgr_base_component_1_3_0_t mca_errmgr_base_component_1_3_0_t;
|
|
|
|
typedef mca_errmgr_base_component_1_3_0_t mca_errmgr_base_component_t;
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Macro for use in components that are of type errmgr v1.3.0
|
|
|
|
*/
|
2006-09-15 01:29:51 +04:00
|
|
|
/* errmgr v1.3 is chained to MCA v1.0: the expansion is the MCA v1.0.0
 * base version initializers followed by the framework name "errmgr" and
 * the errmgr version numbers 1, 3, 0. */
#define ORTE_ERRMGR_BASE_VERSION_1_3_0 \
    MCA_BASE_VERSION_1_0_0,            \
    "errmgr", 1, 3, 0
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/* Global structure for accessing error manager functions
|
|
|
|
*/
|
2006-08-20 19:54:04 +04:00
|
|
|
/* Declaration only; the object itself is defined elsewhere. The
 * ORTE_ERROR_LOG macro calls through this structure's log member. */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* holds selected module's function pointers */
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2005-05-12 00:21:10 +04:00
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-01-21 20:49:14 +03:00
|
|
|
#endif
|