2005-01-21 20:49:14 +03:00
|
|
|
/*
|
2008-05-06 22:08:45 +04:00
|
|
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
2005-11-05 22:57:48 +03:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2005-09-20 21:09:11 +04:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-01-21 20:49:14 +03:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2009-05-20 17:16:31 +04:00
|
|
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
2005-01-21 20:49:14 +03:00
|
|
|
* $COPYRIGHT$
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-01-21 20:49:14 +03:00
|
|
|
* Additional copyrights may follow
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-01-21 20:49:14 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/** @file:
|
|
|
|
*
|
|
|
|
* The Open RTE Error Manager
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#ifndef ORTE_MCA_ERRMGR_H
|
|
|
|
#define ORTE_MCA_ERRMGR_H
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* includes
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#include "orte_config.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/constants.h"
|
|
|
|
#include "orte/types.h"
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2005-12-08 03:05:26 +03:00
|
|
|
#include "opal/mca/mca.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "opal/util/error.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
The current errmgr.register_callback API takes a jobid as one of its argument. The intent was to have the errmgr check the jobid of the job being reported to it and, if it matches the jobid that was registered, call the specified callback function.
Unfortunately, we assign the jobid during the plm.spawn procedure - which means it happens -after- control of the job has passed out of the range of mpirun (or whatever program is spawning the job), so it is too late for that main program to register a callback function. If the main program registers tha callback -after- we return from plm.spawn, then it (a) cannot get a callback for failed-to-start, and (b) will miss the callback if a proc aborts in the time between job launch and the call to errmgr.register_callback.
This commit fixes the problem by adding callback-related fields to the orte_job_t object. Thus, the main program can specify what job states should initiate a callback, what function is to be called, and what data is to be passed back by simply filling in the orte_job_t fields prior to calling plm.spawn.
Also, fully implement the "copy" function for the orte_job_t object.
NOTE: as a result of this change, the errmgr.register_callback API may no longer be of any value.
This commit was SVN r21200.
2009-05-11 07:38:15 +04:00
|
|
|
#include "orte/runtime/orte_globals.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/mca/plm/plm_types.h"
|
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
2005-05-12 00:21:10 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
|
|
|
|
* Macro definitions
|
|
|
|
*/
|
2005-09-20 21:09:11 +04:00
|
|
|
/*
|
|
|
|
* These macros and associated error name array are used to output intelligible error
|
|
|
|
* messages.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Map error code rc to whatever opal_strerror() returns for it. */
#define ORTE_ERROR_NAME(rc) opal_strerror(rc)
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
 * Report error code n together with the source location of the call site:
 * expands to orte_errmgr_base_log(n, __FILE__, __LINE__). The argument n is
 * evaluated exactly once.
 */
#define ORTE_ERROR_LOG(n) \
|
2008-02-28 04:57:57 +03:00
|
|
|
orte_errmgr_base_log(n, __FILE__, __LINE__)
|
|
|
|
|
|
|
|
/**
|
|
|
|
* This is not part of any
|
|
|
|
* module so it can be used at any time!
|
|
|
|
*/
|
|
|
|
/*
 * Logging routine behind ORTE_ERROR_LOG.
 *
 * error_code  the error code being reported
 * filename    source file of the report (ORTE_ERROR_LOG passes __FILE__)
 * line        source line of the report (ORTE_ERROR_LOG passes __LINE__)
 *
 * Returns nothing.
 *
 * NOTE(review): filename is a non-const char * although ORTE_ERROR_LOG hands
 * it the string literal __FILE__; const char * would describe the contract
 * better, but the definition elsewhere would have to change with it.
 */
ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, int line);
|
|
|
|
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Component functions - all MUST be provided!
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/**
|
|
|
|
* Alert - process aborted
|
2008-02-28 04:57:57 +03:00
|
|
|
* This function is called by the PLM when a remote process aborts during execution. Actions taken
|
|
|
|
* in response to the abnormal termination of a remote application process will vary across
|
2006-09-15 01:29:51 +04:00
|
|
|
* the various errmgr components.
|
2008-02-28 04:57:57 +03:00
|
|
|
*
|
2006-09-15 01:29:51 +04:00
|
|
|
* NOTE: Local process errors should always be reported through the error_detected interface and
|
|
|
|
* NOT here.
|
2008-02-28 04:57:57 +03:00
|
|
|
*
|
|
|
|
* @param name Pointer to the name of the proc that aborted
|
|
|
|
*
|
|
|
|
* @retval ORTE_SUCCESS Whatever action that was taken was successful
|
|
|
|
* @retval ORTE_ERROR Appropriate error code
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
2008-02-28 04:57:57 +03:00
|
|
|
/*
 * Function-pointer type for the proc_aborted slot of the errmgr module table.
 *
 * name       pointer to the name of the proc that aborted
 * exit_code  presumably the exit status associated with the aborted proc
 *            -- TODO confirm; it is not described in the comment preceding
 *            this typedef
 *
 * NOTE(review): the type is declared void, so the ORTE_SUCCESS / ORTE_ERROR
 * @retval entries in the preceding comment cannot be delivered through this
 * signature.
 */
typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - incomplete start of a job
|
2008-02-28 04:57:57 +03:00
|
|
|
* This function is called by the PLM when an attempted launch of a job encounters failure of
|
2006-09-15 01:29:51 +04:00
|
|
|
* one or more processes to start. The strategy for dealing
|
|
|
|
* with this "incomplete start" situation varies across the various errmgr components.
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-03-14 23:57:21 +03:00
|
|
|
* This function is only called by the respective process launcher, which is responsible
|
2006-09-15 01:29:51 +04:00
|
|
|
* for detecting incomplete starts. If on a daemon, the function simply updates the
|
|
|
|
* process state to indicate failure to launch - this initiates a trigger that goes to
|
|
|
|
* the respective HNP for response.
|
|
|
|
*
|
|
|
|
* NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden
|
|
|
|
* from taking any action in response to this function call. Instead, they are restricted to simply
|
|
|
|
* returning.
|
|
|
|
*
|
2008-02-28 04:57:57 +03:00
|
|
|
* @param job Job that failed to start
|
2006-09-15 01:29:51 +04:00
|
|
|
*
|
2008-02-28 04:57:57 +03:00
|
|
|
* @retval ORTE_SUCCESS Whatever action that was taken was successful
|
|
|
|
* @retval ORTE_ERROR Appropriate error code
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
2008-02-28 04:57:57 +03:00
|
|
|
/*
 * Function-pointer type for the incomplete_start slot of the errmgr module
 * table.
 *
 * job        jobid of the job that failed to start
 * exit_code  presumably the exit status to associate with the failed launch
 *            -- TODO confirm; it is not described in the comment preceding
 *            this typedef
 *
 * NOTE(review): the type is declared void, so the ORTE_SUCCESS / ORTE_ERROR
 * @retval entries in the preceding comment cannot be delivered through this
 * signature.
 */
typedef void (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/**
|
|
|
|
* Register a job with the error manager
|
|
|
|
* When a job is launched, this function is called so the error manager can register
|
|
|
|
* subscriptions on the job segment so that the error manager will be notified when
|
|
|
|
* problems occur - i.e., when process status entries change to abnormal termination
|
2006-09-15 01:29:51 +04:00
|
|
|
* values. Process status entries are changed by the appropriate state monitor
|
2005-03-14 23:57:21 +03:00
|
|
|
* and/or the process launcher, depending upon the stage at which the problem occurs.
|
2005-09-20 21:09:11 +04:00
|
|
|
*
|
2005-03-14 23:57:21 +03:00
|
|
|
* Monitoring of the job begins once the job has reached the "executing" stage. Prior
|
|
|
|
* to that time, failure of processes to start is the responsibility of the respective
|
|
|
|
* process launcher - which is expected to call the error manager via the "incomplete
|
|
|
|
* start" interface to report any problems prior to the job beginning "execution".
|
2006-09-15 01:29:51 +04:00
|
|
|
*
|
|
|
|
* NOTE: ONLY HNPs are allowed to register for trigger reports. All other components
|
|
|
|
* MUST do nothing but return ORTE_SUCCESS.
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
2008-02-28 04:57:57 +03:00
|
|
|
/*
 * Function-pointer type for the register_callback slot of the errmgr module
 * table.
 *
 * job     jobid the callback is being registered for
 * state   job state of interest; presumably the state that should trigger
 *         the callback -- TODO confirm
 * cbfunc  callback function to register
 * cbdata  caller-supplied pointer; presumably handed back to cbfunc
 *         unchanged -- TODO confirm
 *
 * Returns an int status code (the preceding comment names ORTE_SUCCESS as the
 * value to return when a component has nothing to do).
 */
typedef int (*orte_errmgr_base_module_register_cb_fn_t)(orte_jobid_t job,
|
|
|
|
orte_job_state_t state,
|
The current errmgr.register_callback API takes a jobid as one of its argument. The intent was to have the errmgr check the jobid of the job being reported to it and, if it matches the jobid that was registered, call the specified callback function.
Unfortunately, we assign the jobid during the plm.spawn procedure - which means it happens -after- control of the job has passed out of the range of mpirun (or whatever program is spawning the job), so it is too late for that main program to register a callback function. If the main program registers tha callback -after- we return from plm.spawn, then it (a) cannot get a callback for failed-to-start, and (b) will miss the callback if a proc aborts in the time between job launch and the call to errmgr.register_callback.
This commit fixes the problem by adding callback-related fields to the orte_job_t object. Thus, the main program can specify what job states should initiate a callback, what function is to be called, and what data is to be passed back by simply filling in the orte_job_t fields prior to calling plm.spawn.
Also, fully implement the "copy" function for the orte_job_t object.
NOTE: as a result of this change, the errmgr.register_callback API may no longer be of any value.
This commit was SVN r21200.
2009-05-11 07:38:15 +04:00
|
|
|
orte_err_cb_fn_t cbfunc,
|
2008-02-28 04:57:57 +03:00
|
|
|
void *cbdata);
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Alert - self aborting
|
2008-02-28 04:57:57 +03:00
|
|
|
* This function is called when a process is aborting due to some internal error.
|
|
|
|
* It will finalize the process
|
|
|
|
* itself, and then exit - it takes no other actions. The intent here is to provide
|
2006-09-15 01:29:51 +04:00
|
|
|
* a last-ditch exit procedure that attempts to clean up a little.
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
2009-05-20 04:39:22 +04:00
|
|
|
/*
 * Function-pointer type for the abort slot of the errmgr module table.
 *
 * error_code  code describing why the process is aborting
 * fmt, ...    printf-style format string and its arguments
 *
 * The type carries __opal_attribute_noreturn__, i.e. an implementation is
 * not expected to return to its caller. When
 * OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR is non-zero the format attribute is
 * added as well, declaring argument 2 to be a printf format whose values
 * start at argument 3.
 */
typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...) __opal_attribute_noreturn__
|
|
|
|
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
|
2009-05-20 17:16:31 +04:00
|
|
|
__opal_attribute_format__(__printf__, 2, 3)
|
2009-05-20 04:39:22 +04:00
|
|
|
# endif
|
|
|
|
;
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/*
|
2008-07-29 02:40:57 +04:00
|
|
|
*
|
2005-01-21 20:49:14 +03:00
|
|
|
*/
|
2008-07-29 02:40:57 +04:00
|
|
|
/*
 * Error manager module, version 2.3.0: a table of four function pointers,
 * one per errmgr entry point declared earlier in this header.
 */
struct orte_errmgr_base_module_2_3_0_t {
|
2006-09-15 01:29:51 +04:00
|
|
|
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted; /* alert: a process aborted */
|
|
|
|
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start; /* alert: a job failed to start completely */
|
2008-02-28 04:57:57 +03:00
|
|
|
orte_errmgr_base_module_register_cb_fn_t register_callback; /* register a callback for a job and state */
|
2006-09-15 01:29:51 +04:00
|
|
|
orte_errmgr_base_module_abort_fn_t abort; /* abort the calling process; declared noreturn */
|
2005-01-21 20:49:14 +03:00
|
|
|
};
|
|
|
|
|
2008-07-29 02:40:57 +04:00
|
|
|
/*
 * Give the versioned module struct a typedef name, then alias that name to
 * the unversioned orte_errmgr_base_module_t used by the rest of this header.
 */
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
|
|
|
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ERRMGR Component
|
|
|
|
* the standard component data structure
|
|
|
|
*/
|
2008-07-29 02:40:57 +04:00
|
|
|
/*
 * ERRMGR component descriptor, version 2.0.0. It holds only the two generic
 * MCA members below and adds no errmgr-specific fields.
 */
struct mca_errmgr_base_component_2_0_0_t {
|
2008-05-06 22:08:45 +04:00
|
|
|
mca_base_component_t base_version; /* generic MCA component header */
|
2008-07-29 02:40:57 +04:00
|
|
|
mca_base_component_data_t base_data; /* generic MCA component data */
|
2005-01-21 20:49:14 +03:00
|
|
|
};
|
2008-07-29 02:40:57 +04:00
|
|
|
/*
 * Give the versioned component struct a typedef name, then alias that name
 * to the unversioned mca_errmgr_base_component_t.
 */
typedef struct mca_errmgr_base_component_2_0_0_t mca_errmgr_base_component_2_0_0_t;
|
|
|
|
typedef mca_errmgr_base_component_2_0_0_t mca_errmgr_base_component_t;
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
2008-07-29 02:40:57 +04:00
|
|
|
* Macro for use in components that are of type errmgr
|
2005-01-21 20:49:14 +03:00
|
|
|
*/
|
2008-07-29 02:40:57 +04:00
|
|
|
/*
 * Expands to a comma-separated initializer fragment: MCA_BASE_VERSION_2_0_0
 * followed by the framework name "errmgr" and the version triple 2, 0, 0.
 * Presumably meant to open the mca_base_component_t initializer of an errmgr
 * component -- TODO confirm against a component source file.
 */
#define ORTE_ERRMGR_BASE_VERSION_2_0_0 \
|
|
|
|
MCA_BASE_VERSION_2_0_0, \
|
|
|
|
"errmgr", 2, 0, 0
|
2005-01-21 20:49:14 +03:00
|
|
|
|
|
|
|
/* Global structure for accessing error manager functions
|
|
|
|
*/
|
2006-08-20 19:54:04 +04:00
|
|
|
/*
 * Declaration only; the object itself is defined elsewhere. Callers reach
 * the error manager through the members of this table (proc_aborted,
 * incomplete_start, register_callback, abort).
 */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* holds selected module's function pointers */
|
2005-01-21 20:49:14 +03:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
END_C_DECLS
|
2005-05-12 00:21:10 +04:00
|
|
|
|
2005-01-21 20:49:14 +03:00
|
|
|
#endif
|