2006-09-14 21:29:51 +00:00
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
2006-09-14 21:29:51 +00:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/** @file:
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef ORTE_MCA_ERRMGR_PRIVATE_H
|
|
|
|
#define ORTE_MCA_ERRMGR_PRIVATE_H
|
|
|
|
|
|
|
|
/*
|
|
|
|
* includes
|
|
|
|
*/
|
|
|
|
#include "orte_config.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "orte/constants.h"
|
|
|
|
#include "orte/types.h"
|
2006-09-14 21:29:51 +00:00
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "opal/dss/dss_types.h"
|
|
|
|
#include "orte/mca/plm/plm_types.h"
|
The current errmgr.register_callback API takes a jobid as one of its arguments. The intent was to have the errmgr check the jobid of the job being reported to it and, if it matches the jobid that was registered, call the specified callback function.
Unfortunately, we assign the jobid during the plm.spawn procedure - which means it happens -after- control of the job has passed out of the range of mpirun (or whatever program is spawning the job), so it is too late for that main program to register a callback function. If the main program registers the callback -after- we return from plm.spawn, then it (a) cannot get a callback for failed-to-start, and (b) will miss the callback if a proc aborts in the time between job launch and the call to errmgr.register_callback.
This commit fixes the problem by adding callback-related fields to the orte_job_t object. Thus, the main program can specify what job states should initiate a callback, what function is to be called, and what data is to be passed back by simply filling in the orte_job_t fields prior to calling plm.spawn.
Also, fully implement the "copy" function for the orte_job_t object.
NOTE: as a result of this change, the errmgr.register_callback API may no longer be of any value.
This commit was SVN r21200.
2009-05-11 03:38:15 +00:00
|
|
|
#include "orte/runtime/orte_globals.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
2006-09-14 21:29:51 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Functions for use solely within the ERRMGR framework
|
|
|
|
*/
|
2008-02-28 01:57:57 +00:00
|
|
|
BEGIN_C_DECLS
|
2006-09-14 21:29:51 +00:00
|
|
|
|
2010-04-26 22:15:57 +00:00
|
|
|
/* define a struct to hold framework-global values */
|
|
|
|
typedef struct {
|
|
|
|
int output;
|
|
|
|
opal_pointer_array_t modules;
|
|
|
|
bool initialized;
|
|
|
|
} orte_errmgr_base_t;
|
|
|
|
|
|
|
|
/* The framework-global ERRMGR state object; only declared here (extern),
 * the definition lives in another translation unit. */
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
/* Define the ERRMGR command flag */
|
|
|
|
/* An ERRMGR command flag is a single unsigned byte.
 * Presumably holds one of the ORTE_ERRMGR_*_CMD values -- confirm. */
typedef uint8_t orte_errmgr_cmd_flag_t;
|
2008-02-28 01:57:57 +00:00
|
|
|
/* Type tag paired with orte_errmgr_cmd_flag_t; OPAL_UINT8 matches the
 * typedef's uint8_t width. NOTE(review): presumably the opal/dss data type
 * used when packing/unpacking a command flag -- confirm. */
#define ORTE_ERRMGR_CMD OPAL_UINT8
|
2006-09-14 21:29:51 +00:00
|
|
|
|
|
|
|
/* define some commands */
|
|
|
|
/* Command value 0x01. By its name, a request to abort a set of processes --
 * confirm against the code that handles it. */
#define ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD 0x01
|
2008-02-28 01:57:57 +00:00
|
|
|
/* Command value 0x02. By its name, a request to register a callback --
 * confirm against the code that handles it. */
#define ORTE_ERRMGR_REGISTER_CALLBACK_CMD 0x02
|
2010-04-26 22:15:57 +00:00
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
/*
|
|
|
|
* Base functions
|
|
|
|
*/
|
2010-03-23 21:28:02 +00:00
|
|
|
/**
 * Base error-logging function.
 *
 * @param error_code  error code to log
 * @param filename    file name associated with the error
 *                    (presumably the source file reporting it -- confirm)
 * @param line        line number associated with the error
 */
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
|
2006-09-14 21:29:51 +00:00
|
|
|
|
2010-04-23 04:44:41 +00:00
|
|
|
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
|
|
|
|
orte_job_state_t jobstate,
|
|
|
|
orte_process_name_t *proc_name,
|
|
|
|
orte_proc_state_t state,
|
|
|
|
orte_exit_code_t exit_code);
|
2010-03-23 21:28:02 +00:00
|
|
|
/**
 * Base abort function.
 *
 * @param error_code  error code supplied by the caller
 * @param fmt         printf-style format string (argument 2); the variadic
 *                    arguments start at argument 3, as stated by the format
 *                    attribute below
 *
 * @return int status; whether the call actually returns is not visible
 *         from this header.
 *
 * NOTE(review): the attribute is guarded by the *_FORMAT_FUNCPTR feature
 * macro although this is a plain function declaration, not a function
 * pointer -- confirm this is the intended guard.
 */
ORTE_DECLSPEC int orte_errmgr_base_abort(int error_code, char *fmt, ...)
#   if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
    __opal_attribute_format__(__printf__, 2, 3)
#   endif
    ;
|
2010-04-05 22:59:21 +00:00
|
|
|
/**
 * Base "predicted fault" function.
 *
 * All three arguments are pointers to arrays of C strings (char ***).
 * Whether they are inputs, outputs or both, and who owns the memory, is
 * not visible from this header -- confirm against the definition.
 *
 * @param proc_list        process list
 * @param node_list        node list
 * @param suggested_nodes  suggested nodes
 *
 * @return int status; presumably an ORTE_* constant -- confirm.
 */
ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(char ***proc_list,
                                                   char ***node_list,
                                                   char ***suggested_nodes);
|
|
|
|
ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
|
|
|
orte_node_t *oldnode,
|
|
|
|
opal_list_t *node_list);
|
|
|
|
/**
 * Base "ft_event" function.
 *
 * @param state  integer state value passed by the caller
 *               (NOTE(review): presumably a fault-tolerance /
 *               checkpoint-restart phase identifier -- confirm)
 *
 * @return int status; presumably an ORTE_* constant -- confirm.
 */
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
|
2006-09-14 21:29:51 +00:00
|
|
|
|
|
|
|
/*
|
2010-03-23 21:28:02 +00:00
|
|
|
* Additional external API functions are declared in errmgr.h
|
2006-09-14 21:29:51 +00:00
|
|
|
*/
|
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
END_C_DECLS
|
2006-09-14 21:29:51 +00:00
|
|
|
#endif
|