1
1
openmpi/orte/mca/notifier/notifier.h
2015-03-05 19:48:27 -08:00

221 строка
8.9 KiB
C

/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The OpenRTE Notifier Framework
*
* The OpenRTE Notifier framework provides a mechanism for notifying
* system administrators or other fault monitoring systems that a
* problem with the underlying cluster has been detected - e.g., a
* failed connection in a network fabric
*/
#ifndef MCA_NOTIFIER_H
#define MCA_NOTIFIER_H
/*
* includes
*/
#include "orte_config.h"
#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#ifdef HAVE_SYSLOG_H
#include <syslog.h>
#endif
#include "opal/mca/mca.h"
#include "orte/constants.h"
#include "orte/types.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS
/* Output channel passed to opal_output_verbose() by the ORTE_NOTIFIER_*
 * macros in this header.  It is declared here so that code using those
 * macros doesn't also have to include notifier/base/base.h */
ORTE_DECLSPEC extern int orte_notifier_debug_output;
/* The maximum size of any on-stack buffer used in the notifier, so we
 * can try to avoid calling malloc in OUT_OF_RESOURCE conditions.
 * The code has NOT been audited for use of malloc, so it may still
 * fail to get the "OUT_OF_RESOURCE" message out. Oh well.
 */
#define ORTE_NOTIFIER_MAX_BUF 512
/*
 * Severities
 *
 * When <syslog.h> is available the values are taken directly from the
 * syslog LOG_* priority macros, so a severity can be handed to syslog
 * unchanged.  <syslog.h> is only included when HAVE_SYSLOG_H is defined,
 * so on platforms without it we fall back to the conventional syslog
 * numbering (0 = most severe ... 7 = debug) instead of failing to
 * compile on the undefined LOG_* names.
 */
typedef enum {
#ifdef HAVE_SYSLOG_H
    ORTE_NOTIFIER_EMERG  = LOG_EMERG,
    ORTE_NOTIFIER_ALERT  = LOG_ALERT,
    ORTE_NOTIFIER_CRIT   = LOG_CRIT,
    ORTE_NOTIFIER_ERROR  = LOG_ERR,
    ORTE_NOTIFIER_WARN   = LOG_WARNING,
    ORTE_NOTIFIER_NOTICE = LOG_NOTICE,
    ORTE_NOTIFIER_INFO   = LOG_INFO,
    ORTE_NOTIFIER_DEBUG  = LOG_DEBUG
#else
    ORTE_NOTIFIER_EMERG  = 0,
    ORTE_NOTIFIER_ALERT  = 1,
    ORTE_NOTIFIER_CRIT   = 2,
    ORTE_NOTIFIER_ERROR  = 3,
    ORTE_NOTIFIER_WARN   = 4,
    ORTE_NOTIFIER_NOTICE = 5,
    ORTE_NOTIFIER_INFO   = 6,
    ORTE_NOTIFIER_DEBUG  = 7
#endif
} orte_notifier_severity_t;
/*
 * A single notification request.  The ORTE_NOTIFIER_* macros in this
 * header allocate one of these with OBJ_NEW, fill it in, and hand it
 * to the notifier event base as the callback argument.
 */
typedef struct {
/* base object - this type is declared as a class below */
opal_object_t super;
/* event armed by the ORTE_NOTIFIER_* macros via opal_event_set /
 * opal_event_active */
opal_event_t ev;
/* job the request refers to; the system-event macro sets this to NULL */
orte_job_t *jdata;
/* job state supplied by the caller of the internal-error and
 * job-state macros */
orte_job_state_t state;
/* severity; assigned by the internal-error and system-event macros
 * (the job-state macro does not assign it) */
orte_notifier_severity_t severity;
/* error code; assigned only by the internal-error macro */
int errcode;
/* caller-supplied message; the macros store the pointer, they do
 * not copy the string */
const char *msg;
/* caller-supplied action string for system events; the other
 * macros set it to NULL */
const char *action;
/* time(NULL) captured when the macro created the request */
time_t t;
} orte_notifier_request_t;
OBJ_CLASS_DECLARATION(orte_notifier_request_t);
/*
* Module function types - all MUST be provided!
*
* These are the types of the entries in orte_notifier_base_module_t.
* The three reporting functions each receive a single
* orte_notifier_request_t describing what is to be reported.
*/
/* initialize the selected module; returns an int status code */
typedef int (*orte_notifier_base_module_init_fn_t)(void);
/* finalize the selected module */
typedef void (*orte_notifier_base_module_finalize_fn_t)(void);
/* Log an internal error - this will include the job that caused the
* error to occur */
typedef void (*orte_notifier_base_module_log_fn_t)(orte_notifier_request_t *req);
/* Report a system event - e.g., a temperature out-of-bound */
typedef void (*orte_notifier_base_module_event_fn_t)(orte_notifier_request_t *req);
/* Report a job state */
typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *req);
/*
 * Queue an internal-error notification.
 *
 *   j  - job the error relates to (dereferenced as (j)->jobid for the
 *        debug output, and stored in the request)
 *   st - job state to record in the request
 *   s  - severity (orte_notifier_severity_t)
 *   e  - error code
 *   m  - message string; the pointer is stored, the string is not copied
 *
 * The macro emits a verbose debug line, allocates a request object,
 * fills it in, and activates an event on orte_notifier_base.ev_base at
 * ORTE_ERROR_PRI with orte_notifier_base_log as the callback and the
 * request as its argument.
 *
 * Several arguments are expanded more than once, so do not pass
 * expressions with side effects.
 *
 * The expansion deliberately does NOT end in a semicolon: the caller
 * writes it, which keeps the macro usable as the body of an unbraced
 * if/else.
 */
#define ORTE_NOTIFIER_INTERNAL_ERROR(j, st, s, e, m)                     \
    do {                                                                 \
        orte_notifier_request_t *_n;                                     \
        opal_output_verbose(2, orte_notifier_debug_output,               \
                            "%s notifier:internal:error[%s:%d] "         \
                            "job %s error %s severity %s",               \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),          \
                            __FILE__, __LINE__,                          \
                            ORTE_JOBID_PRINT((j)->jobid),                \
                            ORTE_ERROR_NAME((e)),                        \
                            orte_notifier_base_sev2str(s));              \
        _n = OBJ_NEW(orte_notifier_request_t);                           \
        _n->jdata = (j);                                                 \
        _n->state = (st);                                                \
        _n->severity = (s);                                              \
        _n->errcode = (e);                                               \
        _n->msg = (m);                                                   \
        _n->t = time(NULL);                                              \
        _n->action = (NULL);                                             \
        /* add the event */                                              \
        opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1,        \
                       OPAL_EV_WRITE, orte_notifier_base_log, (_n));     \
        opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI);              \
        opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1);                  \
    } while(0)
/*
 * Queue a job-state notification.
 *
 *   j  - job whose state is being reported (dereferenced as (j)->jobid
 *        for the debug output, and stored in the request)
 *   st - the job state
 *   m  - message string; the pointer is stored, the string is not copied
 *
 * The macro emits a verbose debug line, allocates a request object,
 * fills it in, and activates an event on orte_notifier_base.ev_base at
 * ORTE_ERROR_PRI with orte_notifier_base_report as the callback and the
 * request as its argument.  The severity and errcode fields of the
 * request are not assigned here.
 *
 * Arguments are expanded more than once, so do not pass expressions
 * with side effects.
 *
 * The expansion deliberately does NOT end in a semicolon: the caller
 * writes it, which keeps the macro usable as the body of an unbraced
 * if/else.
 */
#define ORTE_NOTIFIER_JOB_STATE(j, st, m)                                \
    do {                                                                 \
        orte_notifier_request_t *_n;                                     \
        opal_output_verbose(2, orte_notifier_debug_output,               \
                            "%s notifier[%s:%d] job %s state %s",        \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),          \
                            __FILE__, __LINE__,                          \
                            ORTE_JOBID_PRINT((j)->jobid),                \
                            orte_job_state_to_str(st));                  \
        _n = OBJ_NEW(orte_notifier_request_t);                           \
        _n->jdata = (j);                                                 \
        _n->state = (st);                                                \
        _n->msg = (m);                                                   \
        _n->t = time(NULL);                                              \
        _n->action = (NULL);                                             \
        /* add the event */                                              \
        opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1,        \
                       OPAL_EV_WRITE, orte_notifier_base_report, (_n));  \
        opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI);              \
        opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1);                  \
    } while(0)
/*
 * Queue a system-event notification (one that is not tied to a job).
 *
 *   s - severity (orte_notifier_severity_t)
 *   m - message string; the pointer is stored, the string is not copied
 *   a - action string; the pointer is stored, the string is not copied
 *
 * The macro emits a verbose debug line, allocates a request object,
 * fills it in, and activates an event on orte_notifier_base.ev_base at
 * ORTE_ERROR_PRI with orte_notifier_base_event as the callback and the
 * request as its argument.  The request carries no job (jdata is NULL)
 * and a zero job state.
 *
 * The severity argument is expanded more than once, so do not pass an
 * expression with side effects.
 *
 * The expansion deliberately does NOT end in a semicolon: the caller
 * writes it, which keeps the macro usable as the body of an unbraced
 * if/else.
 */
#define ORTE_NOTIFIER_SYSTEM_EVENT(s, m, a)                              \
    do {                                                                 \
        orte_notifier_request_t *_n;                                     \
        opal_output_verbose(2, orte_notifier_debug_output,               \
                            "%s notifier:sys:event[%s:%d] event %s",     \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),          \
                            __FILE__, __LINE__,                          \
                            orte_notifier_base_sev2str(s));              \
        _n = OBJ_NEW(orte_notifier_request_t);                           \
        _n->jdata = NULL;                                                \
        /* state is not a pointer: use a typed zero rather than NULL */  \
        _n->state = (orte_job_state_t)0;                                 \
        _n->msg = (m);                                                   \
        _n->t = time(NULL);                                              \
        _n->severity = (s);                                              \
        _n->action = (a);                                                \
        /* add the event */                                              \
        opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1,        \
                       OPAL_EV_WRITE, orte_notifier_base_event, (_n));   \
        opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI);              \
        opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1);                  \
    } while(0)
/*
* Ver 1.0
*
* Function table exported by a notifier module: one entry for each of
* the module function types declared earlier in this header.
*/
typedef struct {
/* initialize the module */
orte_notifier_base_module_init_fn_t init;
/* finalize the module */
orte_notifier_base_module_finalize_fn_t finalize;
/* log an internal error */
orte_notifier_base_module_log_fn_t log;
/* report a system event */
orte_notifier_base_module_event_fn_t event;
/* report a job state */
orte_notifier_base_module_report_fn_t report;
} orte_notifier_base_module_t;
/*
* the standard component data structure: the MCA base component
* descriptor followed by the MCA base component data
*/
typedef struct {
mca_base_component_t base_version;
mca_base_component_data_t base_data;
} orte_notifier_base_component_t;
/*
* Macro for use in components that are of type notifier v1.0.0
*
* Expands to a comma-separated initializer fragment: the MCA v2.0.0
* base version fields, followed by the framework name "notifier" and
* the three numbers 1, 0, 0 of the notifier version.
*/
#define ORTE_NOTIFIER_BASE_VERSION_1_0_0 \
/* notifier v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* notifier v1.0 */ \
"notifier", 1, 0, 0
END_C_DECLS
#endif /* MCA_NOTIFIER_H */