221 строка
8.9 KiB
C
221 строка
8.9 KiB
C
/*
|
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved.
|
|
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
|
* All rights reserved.
|
|
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/** @file:
|
|
*
|
|
* The OpenRTE Notifier Framework
|
|
*
|
|
* The OpenRTE Notifier framework provides a mechanism for notifying
|
|
* system administrators or other fault monitoring systems that a
|
|
* problem with the underlying cluster has been detected - e.g., a
|
|
* failed connection in a network fabric
|
|
*/
|
|
|
|
#ifndef MCA_NOTIFIER_H
|
|
#define MCA_NOTIFIER_H
|
|
|
|
/*
|
|
* includes
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#ifdef HAVE_STDARG_H
|
|
#include <stdarg.h>
|
|
#endif
|
|
#ifdef HAVE_LIMITS_H
|
|
#include <limits.h>
|
|
#endif
|
|
#ifdef HAVE_SYSLOG_H
|
|
#include <syslog.h>
|
|
#endif
|
|
|
|
#include "opal/mca/mca.h"
|
|
|
|
#include "orte/constants.h"
|
|
#include "orte/types.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/* make the verbose channel visible here so everyone
|
|
* doesn't have to include notifier/base/base.h */
|
|
ORTE_DECLSPEC extern int orte_notifier_debug_output;
|
|
|
|
/* The maximum size of any on-stack buffers used in the notifier
 * so we can try to avoid calling malloc in OUT_OF_RESOURCES conditions.
 * The code has NOT been audited for use of malloc, so this still
 * may fail to get the "OUT_OF_RESOURCE" message out. Oh Well.
 */
#define ORTE_NOTIFIER_MAX_BUF 512
|
|
|
|
/* Severities */
|
|
/**
 * Notification severities.
 *
 * Each enumerator is defined as the syslog priority of the same meaning,
 * so a severity value can be used wherever a syslog LOG_* priority is
 * expected. The LOG_* constants come from <syslog.h>.
 */
typedef enum {
    ORTE_NOTIFIER_EMERG  = LOG_EMERG,
    ORTE_NOTIFIER_ALERT  = LOG_ALERT,
    ORTE_NOTIFIER_CRIT   = LOG_CRIT,
    ORTE_NOTIFIER_ERROR  = LOG_ERR,
    ORTE_NOTIFIER_WARN   = LOG_WARNING,
    ORTE_NOTIFIER_NOTICE = LOG_NOTICE,
    ORTE_NOTIFIER_INFO   = LOG_INFO,
    ORTE_NOTIFIER_DEBUG  = LOG_DEBUG
} orte_notifier_severity_t;
|
|
|
|
typedef struct {
|
|
opal_object_t super;
|
|
opal_event_t ev;
|
|
orte_job_t *jdata;
|
|
orte_job_state_t state;
|
|
orte_notifier_severity_t severity;
|
|
int errcode;
|
|
const char *msg;
|
|
const char *action;
|
|
time_t t;
|
|
} orte_notifier_request_t;
|
|
OBJ_CLASS_DECLARATION(orte_notifier_request_t);
|
|
|
|
/*
|
|
* Component functions - all MUST be provided!
|
|
*/
|
|
|
|
/**
 * Initialize the selected module.
 *
 * Takes no arguments and returns an int.
 * NOTE(review): the return value is presumably an ORTE status code -
 * confirm against the base framework that calls it.
 */
typedef int (*orte_notifier_base_module_init_fn_t)(void);
|
|
|
|
/**
 * Finalize the selected module.
 *
 * Takes no arguments and returns nothing.
 */
typedef void (*orte_notifier_base_module_finalize_fn_t)(void);
|
|
|
|
/* Log an internal error - this will include the job that caused the
|
|
* error to occur */
|
|
typedef void (*orte_notifier_base_module_log_fn_t)(orte_notifier_request_t *req);
|
|
|
|
/* Report a system event - e.g., a temperature out-of-bound */
|
|
typedef void (*orte_notifier_base_module_event_fn_t)(orte_notifier_request_t *req);
|
|
|
|
/* Report a job state */
|
|
typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *req);
|
|
|
|
|
|
/**
 * Queue an internal-error notification.
 *
 * Emits a verbose trace on orte_notifier_debug_output, allocates an
 * orte_notifier_request_t, fills it in, and activates its event so that
 * orte_notifier_base_log is invoked with the request from the notifier
 * event base at ORTE_ERROR_PRI priority.
 *
 * @param j   job the error belongs to; it is dereferenced (j->jobid) for
 *            the trace, so it must not be NULL
 * @param st  job state to record in the request
 * @param s   severity (orte_notifier_severity_t)
 * @param e   error code
 * @param m   message string; only the pointer is stored
 *
 * Arguments j, s and e are evaluated more than once - do not pass
 * expressions with side effects.
 *
 * The expansion is a single statement WITHOUT a trailing semicolon, so
 * the caller supplies the ';' and the macro is safe inside an unbraced
 * if/else. If the request cannot be allocated the notification is
 * dropped rather than dereferencing a NULL pointer.
 */
#define ORTE_NOTIFIER_INTERNAL_ERROR(j, st, s, e, m)                        \
    do {                                                                    \
        orte_notifier_request_t *_n;                                        \
        opal_output_verbose(2, orte_notifier_debug_output,                  \
                            "%s notifier:internal:error[%s:%d] "            \
                            "job %s error %s severity %s",                  \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),             \
                            __FILE__, __LINE__,                             \
                            ORTE_JOBID_PRINT((j)->jobid),                   \
                            ORTE_ERROR_NAME((e)),                           \
                            orte_notifier_base_sev2str(s));                 \
        _n = OBJ_NEW(orte_notifier_request_t);                              \
        if (NULL != _n) {                                                   \
            _n->jdata = (j);                                                \
            _n->state = (st);                                               \
            _n->severity = (s);                                             \
            _n->errcode = (e);                                              \
            _n->msg = (m);                                                  \
            _n->t = time(NULL);                                             \
            _n->action = (NULL);                                            \
            /* add the event */                                             \
            opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1,       \
                           OPAL_EV_WRITE, orte_notifier_base_log, (_n));    \
            opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI);             \
            opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1);                 \
        }                                                                   \
    } while (0)
|
|
|
|
/**
 * Queue a job-state notification.
 *
 * Emits a verbose trace on orte_notifier_debug_output, allocates an
 * orte_notifier_request_t, fills it in, and activates its event so that
 * orte_notifier_base_report is invoked with the request from the
 * notifier event base at ORTE_ERROR_PRI priority.
 *
 * @param j   job being reported; it is dereferenced (j->jobid) for the
 *            trace, so it must not be NULL
 * @param st  job state to report
 * @param m   message string; only the pointer is stored
 *
 * Arguments j and st are evaluated more than once - do not pass
 * expressions with side effects.
 *
 * The severity and errcode fields are not assigned here; they keep
 * whatever OBJ_NEW left in them.
 * NOTE(review): confirm the request constructor initializes them.
 *
 * The expansion is a single statement WITHOUT a trailing semicolon, so
 * the caller supplies the ';' and the macro is safe inside an unbraced
 * if/else. If the request cannot be allocated the notification is
 * dropped rather than dereferencing a NULL pointer.
 */
#define ORTE_NOTIFIER_JOB_STATE(j, st, m)                                   \
    do {                                                                    \
        orte_notifier_request_t *_n;                                        \
        opal_output_verbose(2, orte_notifier_debug_output,                  \
                            "%s notifier[%s:%d] job %s state %s",           \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),             \
                            __FILE__, __LINE__,                             \
                            ORTE_JOBID_PRINT((j)->jobid),                   \
                            orte_job_state_to_str(st));                     \
        _n = OBJ_NEW(orte_notifier_request_t);                              \
        if (NULL != _n) {                                                   \
            _n->jdata = (j);                                                \
            _n->state = (st);                                               \
            _n->msg = (m);                                                  \
            _n->t = time(NULL);                                             \
            _n->action = (NULL);                                            \
            /* add the event */                                             \
            opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1,       \
                           OPAL_EV_WRITE, orte_notifier_base_report, (_n)); \
            opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI);             \
            opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1);                 \
        }                                                                   \
    } while (0)
|
|
|
|
/**
 * Queue a system-event notification (one that is not tied to a job).
 *
 * Emits a verbose trace on orte_notifier_debug_output, allocates an
 * orte_notifier_request_t, fills it in, and activates its event so that
 * orte_notifier_base_event is invoked with the request from the
 * notifier event base at ORTE_ERROR_PRI priority.
 *
 * @param s  severity (orte_notifier_severity_t)
 * @param m  message string; only the pointer is stored
 * @param a  action string; only the pointer is stored
 *
 * Argument s is evaluated more than once - do not pass an expression
 * with side effects.
 *
 * The request carries no job (jdata is NULL). The state field is set to
 * 0: the previous code assigned NULL to this non-pointer field, which is
 * a pointer-to-integer conversion; a plain 0 keeps the stored value.
 * NOTE(review): 0 is presumably the "undefined" job state - confirm
 * against the orte_job_state_t definitions.
 *
 * The expansion is a single statement WITHOUT a trailing semicolon, so
 * the caller supplies the ';' and the macro is safe inside an unbraced
 * if/else. If the request cannot be allocated the notification is
 * dropped rather than dereferencing a NULL pointer.
 */
#define ORTE_NOTIFIER_SYSTEM_EVENT(s, m, a)                                 \
    do {                                                                    \
        orte_notifier_request_t *_n;                                        \
        opal_output_verbose(2, orte_notifier_debug_output,                  \
                            "%s notifier:sys:event[%s:%d] event %s",        \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),             \
                            __FILE__, __LINE__,                             \
                            orte_notifier_base_sev2str(s));                 \
        _n = OBJ_NEW(orte_notifier_request_t);                              \
        if (NULL != _n) {                                                   \
            _n->jdata = NULL;                                               \
            _n->state = 0;                                                  \
            _n->msg = (m);                                                  \
            _n->t = time(NULL);                                             \
            _n->severity = (s);                                             \
            _n->action = (a);                                               \
            /* add the event */                                             \
            opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1,       \
                           OPAL_EV_WRITE, orte_notifier_base_event, (_n));  \
            opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI);             \
            opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1);                 \
        }                                                                   \
    } while (0)
|
|
|
|
/*
|
|
* Ver 1.0
|
|
*/
|
|
typedef struct {
|
|
orte_notifier_base_module_init_fn_t init;
|
|
orte_notifier_base_module_finalize_fn_t finalize;
|
|
orte_notifier_base_module_log_fn_t log;
|
|
orte_notifier_base_module_event_fn_t event;
|
|
orte_notifier_base_module_report_fn_t report;
|
|
} orte_notifier_base_module_t;
|
|
|
|
|
|
/*
|
|
* the standard component data structure
|
|
*/
|
|
typedef struct {
|
|
mca_base_component_t base_version;
|
|
mca_base_component_data_t base_data;
|
|
} orte_notifier_base_component_t;
|
|
|
|
|
|
/*
|
|
* Macro for use in components that are of type notifier v1.0.0
|
|
*/
|
|
/* Expands to a comma-separated initializer fragment: the MCA v2.0
 * version fields followed by the framework name "notifier" and the
 * version triple 1, 0, 0. There is no trailing comma.
 */
#define ORTE_NOTIFIER_BASE_VERSION_1_0_0                \
    /* notifier v1.0 is chained to MCA v2.0 */          \
    MCA_BASE_VERSION_2_0_0,                             \
    /* notifier v1.0 */                                 \
    "notifier", 1, 0, 0
|
|
|
|
END_C_DECLS
|
|
|
|
#endif /* MCA_NOTIFIER_H */
|