1
1

Merge improvements to the "notifier" framework from the OPAL SOS and the ORTE WDC mercurial branches into the SVN trunk.

A brief description of the improvements can be found at
https://svn.open-mpi.org/trac/ompi/wiki/ORTEWDC#ChangesdonetotheORTEnotifier

This commit was SVN r23157.
Этот коммит содержится в:
Abhishek Kulkarni 2010-05-17 22:48:05 +00:00
родитель f5b9bc4ff1
Коммит 9c5860706f
29 изменённых файлов: 1543 добавлений и 610 удалений

Просмотреть файл

@ -24,7 +24,7 @@ libmca_notifier_la_SOURCES =
nobase_orte_HEADERS =
# local files
headers = notifier.h
headers = notifier.h notifier_event_types.h notifier_event_calls.h
libmca_notifier_la_SOURCES += $(headers)

Просмотреть файл

@ -9,6 +9,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -22,4 +23,6 @@ headers += \
libmca_notifier_la_SOURCES += \
base/notifier_base_close.c \
base/notifier_base_select.c \
base/notifier_base_wrappers.c \
base/notifier_base_events.c \
base/notifier_base_open.c

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -26,39 +27,95 @@
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_list.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/notifier/notifier.h"
BEGIN_C_DECLS
/*
* Global functions for MCA overall collective open and close
* Type for holding selected module / component pairs
*/
BEGIN_C_DECLS
/* List item that pairs a selected notifier component with its module */
typedef struct {
/* List linkage; first member, so list items can be cast to this type */
opal_list_item_t super;
/* Component that provided the module */
orte_notifier_base_component_t *onbsp_component;
/* Module returned by the component's query function */
orte_notifier_base_module_t *onbsp_module;
/* Priority reported with the module; the constructor initializes it to -1 */
int onbsp_priority;
} orte_notifier_base_selected_pair_t;
/* Class declaration; the matching OBJ_CLASS_INSTANCE lives in the base select code */
OBJ_CLASS_DECLARATION(orte_notifier_base_selected_pair_t);
#if !ORTE_DISABLE_FULL_SUPPORT
/*
* function definitions
*/
ORTE_DECLSPEC int orte_notifier_base_open(void);
ORTE_DECLSPEC int orte_notifier_base_select(void);
ORTE_DECLSPEC int orte_notifier_base_close(void);
ORTE_DECLSPEC int orte_notifier_base_open(void);
ORTE_DECLSPEC int orte_notifier_base_select(void);
ORTE_DECLSPEC void orte_notifier_log(orte_notifier_base_severity_t severity,
int errcode,
const char *msg, ...);
ORTE_DECLSPEC void orte_notifier_show_help(orte_notifier_base_severity_t severity,
int errcode,
const char *file,
const char *topic, ...);
ORTE_DECLSPEC void orte_notifier_log_peer(orte_notifier_base_severity_t severity,
int errcode,
orte_process_name_t *peer_proc,
const char *msg, ...);
ORTE_DECLSPEC const char* orte_notifier_base_sev2str(orte_notifier_base_severity_t severity);
ORTE_DECLSPEC char *orte_notifier_base_peer_log(int errcode,
orte_process_name_t *peer_proc,
const char *msg, va_list ap);
ORTE_DECLSPEC int orte_notifier_base_close(void);
#if ORTE_WANT_NOTIFIER_LOG_EVENT
ORTE_DECLSPEC int orte_notifier_base_events_init(void);
ORTE_DECLSPEC void orte_notifier_base_events_finalize(void);
#else /* ORTE_WANT_NOTIFIER_LOG_EVENT */
#define orte_notifier_base_events_init() do {} while (0)
#define orte_notifier_base_events_finalize() do {} while (0)
#endif /* ORTE_WANT_NOTIFIER_LOG_EVENT */
/*
* globals that might be needed
* global variables in the base
* Needs to be declspec'ed for ompi_info and others
*/
/*
* Indication of whether a component was successfully selected or not
* (1 component per interface)
*/
ORTE_DECLSPEC extern bool orte_notifier_base_log_selected;
ORTE_DECLSPEC extern bool orte_notifier_base_help_selected;
ORTE_DECLSPEC extern bool orte_notifier_base_log_peer_selected;
ORTE_DECLSPEC extern bool orte_notifier_base_log_event_selected;
/*
* Lists of selected modules (1 per interface)
*/
ORTE_DECLSPEC extern opal_list_t orte_notifier_log_selected_modules;
ORTE_DECLSPEC extern opal_list_t orte_notifier_help_selected_modules;
ORTE_DECLSPEC extern opal_list_t orte_notifier_log_peer_selected_modules;
ORTE_DECLSPEC extern opal_list_t orte_notifier_log_event_selected_modules;
/*
* Merge of the per-interface lists above.
* It is used during the finalize phase so that each selected module is
* finalized only once.
*/
ORTE_DECLSPEC extern opal_list_t orte_notifier_base_selected_modules;
ORTE_DECLSPEC extern int orte_notifier_base_output;
ORTE_DECLSPEC extern int orte_notifier_threshold_severity;
ORTE_DECLSPEC extern bool mca_notifier_base_selected;
ORTE_DECLSPEC extern opal_list_t mca_notifier_base_components_available;
ORTE_DECLSPEC extern orte_notifier_base_component_t mca_notifier_base_selected_component;
ORTE_DECLSPEC extern orte_notifier_base_severity_t orte_notifier_threshold_severity;
ORTE_DECLSPEC extern opal_list_t orte_notifier_base_components_available;
#if !ORTE_DISABLE_FULL_SUPPORT
/* no base functions to protect at this time */
#endif /* ORTE_DISABLE_FULL_SUPPORT */
#endif /* !ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS
#endif

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,19 +28,37 @@
int orte_notifier_base_close(void)
{
/* If we have a selected component and module, then finalize it */
if (NULL != orte_notifier.finalize) {
orte_notifier.finalize();
opal_list_item_t *item;
orte_notifier_base_selected_pair_t *pair;
if (orte_notifier_base_log_event_selected) {
orte_notifier_base_events_finalize();
}
/* Close all remaining available components (may be one if this is a
OpenRTE program, or [possibly] multiple if this is ompi_info) */
/* Finalize all the selected modules
* orte_notifier_base_selected_modules has been built as a merge of the
* per interface selected modules lists, so only going through that list
* to invoke the finalize routines is enough.
*/
for (item = opal_list_remove_first(&orte_notifier_base_selected_modules);
NULL != item;
item = opal_list_remove_first(&orte_notifier_base_selected_modules)) {
pair = (orte_notifier_base_selected_pair_t*) item;
if (NULL != pair->onbsp_module->finalize) {
pair->onbsp_module->finalize();
}
free(pair);
}
OBJ_DESTRUCT(&orte_notifier_base_selected_modules);
OBJ_DESTRUCT(&orte_notifier_log_selected_modules);
OBJ_DESTRUCT(&orte_notifier_help_selected_modules);
OBJ_DESTRUCT(&orte_notifier_log_peer_selected_modules);
OBJ_DESTRUCT(&orte_notifier_log_event_selected_modules);
/* Close all remaining available components */
mca_base_components_close(orte_notifier_base_output,
&mca_notifier_base_components_available, NULL);
&orte_notifier_base_components_available, NULL);
/* All done */
return ORTE_SUCCESS;
}

197
orte/mca/notifier/base/notifier_base_events.c Обычный файл
Просмотреть файл

@ -0,0 +1,197 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Bull SAS. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/types.h"
#include "orte/constants.h"
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STDIO_H
#include <stdio.h>
#endif /* HAVE_STDIO_H */
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/class/opal_object.h"
#include "opal/class/opal_list.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/notifier/base/base.h"
#if ORTE_WANT_NOTIFIER_LOG_EVENT
/*
* Definitions for the events that are accounted for before being logged.
* They are stored in a list to ensure they are all unconditionally traced
* out during finalize.
*/
opal_list_t orte_notifier_events_list;
/*
* Log format differs depending on the phase we are in.
*/
#define ORTE_NOTIFIER_LOG_FORMAT_0 "TIME=%ld MPI_NOTIFIER_EVENT FAMILY=%u JOB=%u VPID=%u HOST=%s EVENT=%d COUNT=%u: %s"
#define ORTE_NOTIFIER_LOG_FORMAT_1 "TIME=%ld MPI_NOTIFIER_EVENT FAMILY=%u JOB=%u VPID=%u HOST=%s EVENT=%d COUNT=%u (in %ld seconds): %s"
#define ORTE_NOTIFIER_LOG_FORMAT_2 "TIME=%ld MPI_NOTIFIER_EVENT FAMILY=%u JOB=%u VPID=%u HOST=%s EVENT=%d COUNT=%u (Finalize): %s"
/*
 * Constructor for orte_notifier_event_t: a fresh event has no message,
 * has not been traced yet and has a zero occurrence count.
 */
static void orte_notifier_event_construct(orte_notifier_event_t *ev)
{
    ev->ev_msg = NULL;
    ev->ev_already_traced = 0;
    ev->ev_cnt = 0;
}
/*
 * Destructor for orte_notifier_event_t: releases the event message, if any.
 */
static void orte_notifier_event_destruct(orte_notifier_event_t *ev)
{
    /* free(NULL) is a no-op, so an unset message needs no special case */
    free(ev->ev_msg);
}
/*
 * Class instance for orte_notifier_event_t: derives from opal_list_item_t
 * (events are kept in orte_notifier_events_list) and uses the constructor /
 * destructor defined above.
 */
OBJ_CLASS_INSTANCE(orte_notifier_event_t,
opal_list_item_t,
orte_notifier_event_construct,
orte_notifier_event_destruct);
/*
 * Set up the list of accounted events.
 * The list is only constructed on processes that are not the HNP.
 *
 * @return ORTE_SUCCESS in all cases
 */
int orte_notifier_base_events_init(void)
{
    if (ORTE_PROC_IS_HNP) {
        return ORTE_SUCCESS;
    }

    OBJ_CONSTRUCT(&orte_notifier_events_list, opal_list_t);
    return ORTE_SUCCESS;
}
/*
 * Drain the list of accounted events, tracing each one that still has a
 * non-zero count (when event logging is enabled), then destroy the list.
 * Does nothing on the HNP.
 */
void orte_notifier_base_events_finalize(void)
{
    opal_list_item_t *entry;

    if (ORTE_PROC_IS_HNP) {
        return;
    }

    /*
     * Unconditionally trace any event that has been accounted for
     */
    while (NULL != (entry = opal_list_remove_first(&orte_notifier_events_list))) {
        orte_notifier_event_t *event = (orte_notifier_event_t *) entry;
        int32_t pending = event->ev_cnt;

        if (0 != pending && notifier_log_event_enabled()) {
            notifier_trace_event(ORTE_NOTIFIER_LOG_2, event->ev_id, pending,
                                 time(NULL), 0, event->ev_msg);
        }
        OBJ_RELEASE(event);
    }

    OBJ_DESTRUCT(&orte_notifier_events_list);
}
/*
 * Format one event trace line and pass it to every module selected for the
 * log_event interface.
 *
 * log_type indicates whether we are tracing one of the following:
 *   . ORTE_NOTIFIER_LOG_0 --> Very first trace
 *   . ORTE_NOTIFIER_LOG_1 --> Intermediate trace
 *   . ORTE_NOTIFIER_LOG_2 --> during finalize
 * Depending on the log_type the output format is different.
 *
 * @param log_type  one of the ORTE_NOTIFIER_LOG_x values above
 * @param ev_id     event identifier printed in the EVENT= field
 * @param count     number of occurrences printed in the COUNT= field
 * @param t         timestamp printed in the TIME= field
 * @param delay     elapsed seconds; only printed for ORTE_NOTIFIER_LOG_1
 * @param msg       event message (may be NULL)
 */
void notifier_trace_event(int log_type, int ev_id, int32_t count, time_t t,
                          time_t delay, const char *msg)
{
    opal_list_item_t *item;
    orte_notifier_base_selected_pair_t *pair;
    orte_process_name_t *pname = ORTE_PROC_MY_NAME;
    /* Passing NULL to %s is undefined; print a visible placeholder instead */
    const char *text = (NULL != msg) ? msg : "(null)";
    char *out = NULL;
    int rc;

    switch (log_type) {
    case ORTE_NOTIFIER_LOG_0:
        rc = asprintf(&out, ORTE_NOTIFIER_LOG_FORMAT_0, (long) t,
                      ORTE_JOB_FAMILY(pname->jobid),
                      ORTE_LOCAL_JOBID(pname->jobid),
                      pname->vpid,
                      orte_process_info.nodename,
                      ev_id,
                      (unsigned int) count,
                      text);
        break;

    case ORTE_NOTIFIER_LOG_1:
        rc = asprintf(&out, ORTE_NOTIFIER_LOG_FORMAT_1, (long) t,
                      ORTE_JOB_FAMILY(pname->jobid),
                      ORTE_LOCAL_JOBID(pname->jobid),
                      pname->vpid,
                      orte_process_info.nodename,
                      ev_id,
                      (unsigned int) count,
                      (long) delay,
                      text);
        break;

    case ORTE_NOTIFIER_LOG_2:
        rc = asprintf(&out, ORTE_NOTIFIER_LOG_FORMAT_2, (long) t,
                      ORTE_JOB_FAMILY(pname->jobid),
                      ORTE_LOCAL_JOBID(pname->jobid),
                      pname->vpid,
                      orte_process_info.nodename,
                      ev_id,
                      (unsigned int) count,
                      text);
        break;

    default:
        rc = asprintf(&out, "UNKNOWN!!!!!!!!!");
        break;
    }

    /*
     * On failure asprintf returns -1 and the content of 'out' is not
     * guaranteed on every platform, so rely on the return code first.
     */
    if (rc < 0 || NULL == out) {
        return;
    }

    for (item = opal_list_get_first(&orte_notifier_log_event_selected_modules);
         opal_list_get_end(&orte_notifier_log_event_selected_modules) != item;
         item = opal_list_get_next(item)) {
        pair = (orte_notifier_base_selected_pair_t*) item;
        if (NULL != pair->onbsp_module->log_event) {
            pair->onbsp_module->log_event(out);
        }
    }

    /*
     * The same buffer is handed to every module in turn, so it is released
     * here once all of them have been called.
     */
    free(out);
}
/*
 * Append an event to the list of accounted events so that it is visited
 * when the list is drained during finalize.
 */
void notifier_event_store(orte_notifier_event_t *ev)
{
    opal_list_item_t *entry = &ev->super;

    opal_list_append(&orte_notifier_events_list, entry);
}
/*
 * Tell whether event logging is active: at least one module must have been
 * selected for the log_event interface and the severity threshold must be
 * at or beyond ORTE_NOTIFIER_NOTICE.
 */
bool notifier_log_event_enabled(void)
{
    if (!orte_notifier_base_log_event_selected) {
        return false;
    }
    return ORTE_NOTIFIER_NOTICE <= orte_notifier_threshold_severity;
}
#endif /* ORTE_WANT_NOTIFIER_LOG_EVENT */

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,7 +32,6 @@
#include "orte/mca/notifier/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
@ -41,25 +40,24 @@
#include "orte/mca/notifier/base/static-components.h"
static void orte_base_log(int severity, int errcode, const char *msg, ...) __opal_attribute_format__(__printf__, 3, 4);
static void orte_log_show_help(int severity, int errcode, const char *file, const char *topic, ...);
static void orte_log_peer(int severity, int errcode, orte_process_name_t *peer_proc, const char *msg, ...) __opal_attribute_format__(__printf__, 4, 5);
/*
* Global variables
*/
int orte_notifier_base_output = -1;
int orte_notifier_threshold_severity = ORTE_NOTIFIER_INFRA;
orte_notifier_base_module_t orte_notifier = {
NULL,
NULL,
orte_base_log,
orte_log_show_help,
orte_log_peer
};
opal_list_t mca_notifier_base_components_available;
orte_notifier_base_component_t mca_notifier_base_selected_component;
orte_notifier_base_severity_t orte_notifier_threshold_severity =
ORTE_NOTIFIER_ERROR;
opal_list_t orte_notifier_base_components_available;
opal_list_t orte_notifier_base_selected_modules;
opal_list_t orte_notifier_log_selected_modules;
opal_list_t orte_notifier_help_selected_modules;
opal_list_t orte_notifier_log_peer_selected_modules;
opal_list_t orte_notifier_log_event_selected_modules;
orte_notifier_API_module_t orte_notifier = {
orte_notifier_log,
orte_notifier_show_help,
orte_notifier_log_peer,
};
/**
* Function for finding and opening either all MCA components, or the one
@ -75,20 +73,40 @@ int orte_notifier_base_open(void)
/* let the user define a base level of severity to report */
mca_base_param_reg_string_name("notifier", "threshold_severity",
"Report all events at or above this severity [default: critical]",
false, false, "critical", &level);
if (0 == strcmp(level, "warning")) {
orte_notifier_threshold_severity = ORTE_NOTIFIER_WARNING;
} else if (0 == strcmp(level, "notice")) {
"Report all events at or above this severity [default: error]",
false, false, "error", &level);
if (0 == strncasecmp(level, "emerg", strlen("emerg"))) {
orte_notifier_threshold_severity = ORTE_NOTIFIER_EMERG;
} else if (0 == strncasecmp(level, "alert", strlen("alert"))) {
orte_notifier_threshold_severity = ORTE_NOTIFIER_ALERT;
} else if (0 == strncasecmp(level, "crit", strlen("crit"))) {
orte_notifier_threshold_severity = ORTE_NOTIFIER_CRIT;
} else if (0 == strncasecmp(level, "warn", strlen("warn"))) {
orte_notifier_threshold_severity = ORTE_NOTIFIER_WARN;
} else if (0 == strncasecmp(level, "notice", strlen("notice"))) {
orte_notifier_threshold_severity = ORTE_NOTIFIER_NOTICE;
} else if (0 == strncasecmp(level, "info", strlen("info"))) {
orte_notifier_threshold_severity = ORTE_NOTIFIER_INFO;
} else if (0 == strncasecmp(level, "debug", strlen("debug"))) {
orte_notifier_threshold_severity = ORTE_NOTIFIER_DEBUG;
} else if (0 != strncasecmp(level, "error", strlen("error"))) {
opal_output(0, "Unknown notifier level");
return ORTE_ERROR;
}
OBJ_CONSTRUCT(&orte_notifier_base_selected_modules, opal_list_t);
OBJ_CONSTRUCT(&orte_notifier_log_selected_modules, opal_list_t);
OBJ_CONSTRUCT(&orte_notifier_help_selected_modules, opal_list_t);
OBJ_CONSTRUCT(&orte_notifier_log_peer_selected_modules, opal_list_t);
OBJ_CONSTRUCT(&orte_notifier_log_event_selected_modules, opal_list_t);
/* Open up all available components */
if (ORTE_SUCCESS !=
mca_base_components_open("notifier", orte_notifier_base_output,
mca_notifier_base_static_components,
&mca_notifier_base_components_available, true)) {
&orte_notifier_base_components_available,
true)) {
return ORTE_ERROR;
}
@ -96,36 +114,3 @@ int orte_notifier_base_open(void)
return ORTE_SUCCESS;
}
static void orte_base_log(int severity, int errcode, const char *msg, ...)
{
/* just do nothing - it is here just so someone calling it won't
* segv. Put in va_start/va_end just so that compilers won't
* complain.
*/
va_list ap;
va_start(ap, msg);
va_end(ap);
}
static void orte_log_show_help(int severity, int errcode, const char *file, const char *topic, ...)
{
/* just do nothing - it is here just so someone calling it won't
* segv. Put in va_start/va_end just so that compilers won't
* complain.
*/
va_list ap;
va_start(ap, topic);
va_end(ap);
}
static void orte_log_peer(int severity, int errcode, orte_process_name_t *peer_proc, const char *msg, ...)
{
/* just do nothing - it is here just so someone calling it won't
* segv. Put in va_start/va_end just so that compilers won't
* complain.
*/
va_list ap;
va_start(ap, msg);
va_end(ap);
}

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -19,67 +20,376 @@
#include "orte_config.h"
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/mca/mca.h"
#include "opal/util/argv.h"
#include "opal/util/opal_sos.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "orte/mca/notifier/base/base.h"
/* Global variables */
/*
* orte_notifier_base_XXX_selected is set to true if at least 1 module has
* been selected for the notifier XXX API interface.
*/
bool orte_notifier_base_log_selected = false;
bool orte_notifier_base_help_selected = false;
bool orte_notifier_base_log_peer_selected = false;
bool orte_notifier_base_log_event_selected = false;
static opal_sos_reporter_callback_fn_t prev_reporter_callback;
static inline char **orte_notifier_get_include_list(const char *,
const char *,
char **);
static bool orte_notifier_add_module(mca_base_component_t *component,
orte_notifier_base_module_t *module,
int priority,
char **include_list,
opal_list_t *selected_modules);
/*
 * Constructor for a selected pair: no component, no module, and a
 * priority of -1.
 */
static void onbsp_construct(orte_notifier_base_selected_pair_t *obj)
{
    obj->onbsp_priority = -1;
    obj->onbsp_module = NULL;
    obj->onbsp_component = NULL;
}
/*
 * Destructor for a selected pair: the pair does not release anything, it
 * simply resets its fields to the same values the constructor sets.
 */
static void onbsp_destruct(orte_notifier_base_selected_pair_t *obj)
{
    obj->onbsp_component = NULL;
    obj->onbsp_module = NULL;
    obj->onbsp_priority = -1;
}
/*
 * Class instance for orte_notifier_base_selected_pair_t: a list item
 * (parent class opal_list_item_t) built and reset by onbsp_construct /
 * onbsp_destruct.
 */
OBJ_CLASS_INSTANCE(orte_notifier_base_selected_pair_t,
opal_list_item_t,
onbsp_construct,
onbsp_destruct);
/**
* Function for selecting one component from all those that are
* Function for selecting a set of components from all those that are
* available.
*
* It is possible to select a subset of these components for any interface.
* The syntax is the following:
* [ -mca notifier <list0> ] [ -mca notifier_log <list1> ]
* [ -mca notifier_help <list2> ]
* [ -mca notifier_log_peer <list3> ]
* [ -mca notifier_log_event <list4> ]
* Rules:
* . <list0> empty means nothing selected
* . <list0> to <list4> = comma separated lists of component names
* . <list1> to <list4> may be one of:
* . subsets of <list0>
* . "none" keyword (means empty)
* . 1 of <list1> to <list4> empty means = <list0>
* Last point makes it possible to preserve the way it works today
*
* Examples:
* 1)
* -mca notifier syslog,smtp
* --> syslog and smtp are selected for the log, show_help, log_peer and
* log_event interfaces.
* 2)
* -mca notifier_log syslog
* --> no interface is activated, no component is selected
* 3)
* -mca notifier syslog -mca notifier_help none
* -mca notifier_log_peer none
* -mca notifier_log_event none
* --> only the log interface is activated, with the syslog component
* 4)
* -mca notifier syslog,smtp,hnp -mca notifier_help syslog
* -mca notifier_log_peer smtp
* -mca notifier_log_event none
* --> the log interface is activated, with the syslog, smtp and hnp
* components
* the log_help interface is activated, with the syslog component
* the log_peer interface is activated, with the smtp component
* the log_event interface is not activated
*/
int orte_notifier_base_select(void)
{
int ret, exit_status = ORTE_SUCCESS;
orte_notifier_base_component_t *best_component = NULL;
orte_notifier_base_module_t *best_module = NULL;
char *include_list = NULL;
mca_base_component_list_item_t *cli = NULL;
mca_base_component_t *component = NULL;
mca_base_module_t *module = NULL;
int i, ret, priority, exit_status = ORTE_SUCCESS;
opal_list_item_t *item;
orte_notifier_base_module_t *nmodule;
char **imodules;
char **imodules_log, **imodules_help, **imodules_log_peer;
char **imodules_log_event = NULL;
bool module_needed;
/*
* Register the framework MCA param and look up include list
*/
mca_base_param_reg_string_name("notifier", NULL,
"Which notifier component to use (empty = none)",
false, false,
NULL, &include_list);
/* If we do not have any components to select this is ok. Just use the default
* "no-op" component and move on.
*/
if( 0 >= opal_list_get_size(&mca_notifier_base_components_available) || NULL == include_list) {
/* Close all components since none will be used */
mca_base_components_close(0, /* Pass 0 to keep this from closing the output handle */
&mca_notifier_base_components_available,
NULL);
goto cleanup;
}
/*
* Select the best component
*/
if( OPAL_SUCCESS != mca_base_select("notifier", orte_notifier_base_output,
&mca_notifier_base_components_available,
(mca_base_module_t **) &best_module,
(mca_base_component_t **) &best_component) ) {
/* It is okay if no component was selected - we just leave
* the orte_notifier module as the default
*/
exit_status = ORTE_SUCCESS;
goto cleanup;
imodules = orte_notifier_get_include_list("notifier",
"Comma-delimisted list of notifier components to use "
"(empty = none)", NULL);
if (NULL == imodules) {
return ORTE_SUCCESS;
}
if (NULL != orte_notifier.init) {
/* if an init function is provided, use it */
if (ORTE_SUCCESS != (ret = orte_notifier.init()) ) {
exit_status = ret;
goto cleanup;
/*
* Also get the include lists for each interface
*/
imodules_log = orte_notifier_get_include_list("notifier_log",
"Comma-delimisted list of notifier components to use "
"for orte_notifier_log (empty = all selected)",
imodules);
imodules_help = orte_notifier_get_include_list("notifier_help",
"Comma-delimisted list of notifier components to use "
"for orte_notifier_show_help (empty = all selected)",
imodules);
imodules_log_peer = orte_notifier_get_include_list("notifier_log_peer",
"Comma-delimisted list of notifier components to "
"use for orte_notifier_log_peer (empty = all "
"selected)", imodules);
#if ORTE_WANT_NOTIFIER_LOG_EVENT
imodules_log_event = orte_notifier_get_include_list("notifier_log_event",
"Comma-delimisted list of notifier components to "
"use for ORTE_NOTIFIER_LOG_EVENT (empty = all "
"selected)",
imodules);
#endif /* ORTE_WANT_NOTIFIER_LOG_EVENT */
/* Query all available components and ask if they have a module */
for (item = opal_list_get_first(&orte_notifier_base_components_available);
opal_list_get_end(&orte_notifier_base_components_available) != item;
item = opal_list_get_next(item)) {
cli = (mca_base_component_list_item_t *) item;
component = (mca_base_component_t *) cli->cli_component;
/* If this component is not in the include list, skip it */
for (i = 0; NULL != imodules[i]; ++i) {
if (0 == strcmp(imodules[i], component->mca_component_name)) {
break;
}
}
if (NULL == imodules[i]) {
continue;
}
/* If there's no query function, skip it */
if (NULL == component->mca_query_component) {
opal_output_verbose(5, orte_notifier_base_output,
"mca:notify:select: Skipping component [%s]. It does not implement a query function",
component->mca_component_name );
continue;
}
/* Query the component */
opal_output_verbose(5, orte_notifier_base_output,
"mca:notify:select: Querying component [%s]",
component->mca_component_name);
ret = component->mca_query_component(&module, &priority);
/* If no module was returned, then skip component */
if (ORTE_SUCCESS != ret || NULL == module) {
opal_output_verbose(5, orte_notifier_base_output,
"mca:notify:select: Skipping component [%s]. Query failed to return a module",
component->mca_component_name );
continue;
}
/* If we got a module, initialize it */
nmodule = (orte_notifier_base_module_t*) module;
if (NULL != nmodule->init) {
/* If the module doesn't want to be used, skip it */
if (ORTE_SUCCESS != (ret = nmodule->init()) ) {
if (ORTE_ERR_NOT_SUPPORTED != OPAL_SOS_GET_ERROR_CODE(ret) &&
ORTE_ERR_NOT_IMPLEMENTED != OPAL_SOS_GET_ERROR_CODE(ret)) {
exit_status = ret;
goto cleanup;
}
if (NULL != nmodule->finalize) {
nmodule->finalize();
}
continue;
}
}
/*
* OK, one module has been selected for the notifier framework, and
* successfully initialized.
* Now we have to include it in the per interface selected modules
* lists if needed.
*/
ret = orte_notifier_add_module(component,
nmodule,
priority,
imodules_log,
&orte_notifier_log_selected_modules);
orte_notifier_base_log_selected = orte_notifier_base_log_selected
|| ret;
/*
* This variable is set to check if the module is needed by at least
* one interface.
*/
module_needed = ret;
ret = orte_notifier_add_module(component,
nmodule,
priority,
imodules_help,
&orte_notifier_help_selected_modules);
orte_notifier_base_help_selected = orte_notifier_base_help_selected
|| ret;
module_needed = module_needed || ret;
ret = orte_notifier_add_module(component,
nmodule,
priority,
imodules_log_peer,
&orte_notifier_log_peer_selected_modules);
orte_notifier_base_log_peer_selected =
orte_notifier_base_log_peer_selected || ret;
module_needed = module_needed || ret;
ret = orte_notifier_add_module(component,
nmodule,
priority,
imodules_log_event,
&orte_notifier_log_event_selected_modules);
orte_notifier_base_log_event_selected =
orte_notifier_base_log_event_selected || ret;
module_needed = module_needed || ret;
/*
* If the module is needed by at least one interface:
* Unconditionally update the global list that will be used during
* the close step. Else unload it.
*/
if (module_needed) {
orte_notifier_add_module(component,
nmodule,
priority,
imodules,
&orte_notifier_base_selected_modules);
} else {
if (NULL != nmodule->finalize) {
nmodule->finalize();
}
}
}
/* Save the winner */
orte_notifier = *best_module;
if (orte_notifier_base_log_event_selected) {
/*
* This has to be done whatever the selected module. That's why it's
* done here.
*/
orte_notifier_base_events_init();
}
/* Register a callback with OPAL SOS so that we can intercept
* error messages */
opal_sos_reg_reporter_callback((opal_sos_reporter_callback_fn_t) orte_notifier_log,
&prev_reporter_callback);
cleanup:
return exit_status;
}
/**
 * Register an mca param that represents an include list and build that list.
 *
 * The parameter value is interpreted as follows:
 *   - unset or empty: the default list is inherited (returned as is, not
 *     copied)
 *   - "none":         nothing is selected, NULL is returned
 *   - anything else:  the value is split on ',' into a newly allocated,
 *                     NULL-terminated array of names
 *
 * @param param_name      (IN) param name to be registered
 * @param help_message    (IN) help message for that param
 * @param default_modules (IN) list of module names to be inherited if an
 *                             empty include list is provided
 * @return                list of modules names, or NULL if none
 */
static inline char **orte_notifier_get_include_list(const char *param_name,
                                                    const char *help_message,
                                                    char **default_modules)
{
    char *include_list = NULL;
    char **imodules;

    mca_base_param_reg_string_name(param_name, NULL, help_message,
                                   false, false, NULL, &include_list);
    if (NULL == include_list) {
        /*
         * Inherit the default list if nothing specified
         */
        return default_modules;
    }

    /* Check the keyword before splitting so nothing is allocated for it */
    if (0 == strcmp(include_list, "none")) {
        free(include_list);
        return NULL;
    }

    imodules = opal_argv_split(include_list, ',');
    /*
     * The string handed back by the parameter registration is released
     * here, once the names have been split out of it.
     * NOTE(review): assumes the string is ours to free and that the split
     * result holds its own copies of the tokens -- confirm against the
     * mca_base_param and opal_argv_split documentation.
     */
    free(include_list);

    if (NULL == imodules) {
        /* Empty value: same as unset, inherit the default list */
        return default_modules;
    }
    return imodules;
}
/**
 * Check if a component name belongs to an include list and add it to the
 * list of selected modules.
 *
 * The new entry is inserted in front of the first entry whose priority is
 * strictly lower, so the list stays sorted by decreasing priority; entries
 * with equal priority keep their insertion order.
 *
 * @param component         (IN)  component to be included
 * @param module            (IN)  module to be included
 * @param priority          (IN)  module priority
 * @param include_list      (IN)  list of module names to go through
 *                                (NULL means nothing is included)
 * @param selected_modules  (OUT) list of selected modules to be updated
 * @return                  true/false depending on whether the module
 *                          has been added or not
 */
static bool orte_notifier_add_module(mca_base_component_t *component,
                                     orte_notifier_base_module_t *module,
                                     int priority,
                                     char **include_list,
                                     opal_list_t *selected_modules)
{
    orte_notifier_base_selected_pair_t *pair, *pair2;
    const char *module_name;
    opal_list_item_t *item;
    int i;

    if (NULL == include_list) {
        return false;
    }

    module_name = component->mca_component_name;

    /* If this component is not in the include list, skip it */
    for (i = 0; NULL != include_list[i]; i++) {
        if (0 == strcmp(include_list[i], module_name)) {
            break;
        }
    }
    if (NULL == include_list[i]) {
        return false;
    }

    /* Make an item for the list */
    pair = OBJ_NEW(orte_notifier_base_selected_pair_t);
    if (NULL == pair) {
        /* Allocation failed: report the module as not added */
        return false;
    }
    pair->onbsp_component = (orte_notifier_base_component_t*) component;
    pair->onbsp_module = module;
    pair->onbsp_priority = priority;

    /* Put it in the list in priority order */
    for (item = opal_list_get_first(selected_modules);
         opal_list_get_end(selected_modules) != item;
         item = opal_list_get_next(item)) {
        pair2 = (orte_notifier_base_selected_pair_t*) item;
        if (priority > pair2->onbsp_priority) {
            opal_list_insert_pos(selected_modules, item, &(pair->super));
            break;
        }
    }

    /* No lower-priority entry was found: the new pair goes last */
    if (opal_list_get_end(selected_modules) == item) {
        opal_list_append(selected_modules, &(pair->super));
    }

    return true;
}

Просмотреть файл

@ -0,0 +1,190 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/opal_sos.h"
#include "orte/constants.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/notifier/base/base.h"
/*
 * Forward a log message to every module selected for the log interface.
 *
 * Nothing is done when no module was selected for this interface, or when
 * the severity is numerically above the threshold (lower severity values
 * are considered "more severe").
 *
 * @param severity  severity of the message
 * @param errcode   error code associated with the message
 * @param msg       printf-style format, followed by its arguments
 */
void orte_notifier_log(orte_notifier_base_severity_t severity,
                       int errcode, const char *msg, ...)
{
    opal_list_t *targets = &orte_notifier_log_selected_modules;
    opal_list_item_t *cur;

    if (!orte_notifier_base_log_selected ||
        severity > orte_notifier_threshold_severity) {
        return;
    }

    cur = opal_list_get_first(targets);
    while (opal_list_get_end(targets) != cur) {
        orte_notifier_base_selected_pair_t *sel =
            (orte_notifier_base_selected_pair_t*) cur;

        if (NULL != sel->onbsp_module->log) {
            /* The argument list is restarted for each module */
            va_list args;

            va_start(args, msg);
            sel->onbsp_module->log(severity, errcode, msg, args);
            va_end(args);
        }
        cur = opal_list_get_next(cur);
    }
}
/*
 * Forward a show_help request to every module selected for the help
 * interface.
 *
 * Nothing is done when no module was selected for this interface, or when
 * the severity is numerically above the threshold (lower severity values
 * are considered "more severe").
 *
 * @param severity  severity of the message
 * @param errcode   error code associated with the message
 * @param file      help file name, passed through to the modules
 * @param topic     help topic, followed by the variable arguments
 */
void orte_notifier_show_help(orte_notifier_base_severity_t severity,
                             int errcode, const char *file,
                             const char *topic, ...)
{
    opal_list_t *targets = &orte_notifier_help_selected_modules;
    opal_list_item_t *cur;

    if (!orte_notifier_base_help_selected ||
        severity > orte_notifier_threshold_severity) {
        return;
    }

    cur = opal_list_get_first(targets);
    while (opal_list_get_end(targets) != cur) {
        orte_notifier_base_selected_pair_t *sel =
            (orte_notifier_base_selected_pair_t*) cur;

        if (NULL != sel->onbsp_module->help) {
            /* The argument list is restarted for each module */
            va_list args;

            va_start(args, topic);
            sel->onbsp_module->help(severity, errcode, file, topic, args);
            va_end(args);
        }
        cur = opal_list_get_next(cur);
    }
}
/*
 * Forward a peer-related log message to every module selected for the
 * log_peer interface.
 *
 * Nothing is done when no module was selected for this interface, or when
 * the severity is numerically above the threshold (lower severity values
 * are considered "more severe").
 *
 * @param severity   severity of the message
 * @param errcode    error code associated with the message
 * @param peer_proc  name of the peer process, passed through to the modules
 * @param msg        printf-style format, followed by its arguments
 */
void orte_notifier_log_peer(orte_notifier_base_severity_t severity,
                            int errcode,
                            orte_process_name_t *peer_proc,
                            const char *msg, ...)
{
    opal_list_t *targets = &orte_notifier_log_peer_selected_modules;
    opal_list_item_t *cur;

    if (!orte_notifier_base_log_peer_selected ||
        severity > orte_notifier_threshold_severity) {
        return;
    }

    cur = opal_list_get_first(targets);
    while (opal_list_get_end(targets) != cur) {
        orte_notifier_base_selected_pair_t *sel =
            (orte_notifier_base_selected_pair_t*) cur;

        if (NULL != sel->onbsp_module->peer) {
            /* The argument list is restarted for each module */
            va_list args;

            va_start(args, msg);
            sel->onbsp_module->peer(severity, errcode, peer_proc, msg, args);
            va_end(args);
        }
        cur = opal_list_get_next(cur);
    }
}
/*
 * Map a notifier severity to its printable name.
 *
 * Returns a pointer to a string literal (never NULL, must not be freed);
 * any value that is not one of the known severities yields "UNKNOWN".
 */
const char* orte_notifier_base_sev2str(orte_notifier_base_severity_t severity)
{
    const char *label;

    switch (severity) {
    case ORTE_NOTIFIER_EMERG:
        label = "EMERG";
        break;
    case ORTE_NOTIFIER_ALERT:
        label = "ALERT";
        break;
    case ORTE_NOTIFIER_CRIT:
        label = "CRIT";
        break;
    case ORTE_NOTIFIER_ERROR:
        label = "ERROR";
        break;
    case ORTE_NOTIFIER_WARN:
        label = "WARN";
        break;
    case ORTE_NOTIFIER_NOTICE:
        label = "NOTICE";
        break;
    case ORTE_NOTIFIER_INFO:
        label = "INFO";
        break;
    case ORTE_NOTIFIER_DEBUG:
        label = "DEBUG";
        break;
    default:
        label = "UNKNOWN";
        break;
    }

    return label;
}
/*
 * Build the text of a peer-related failure message.
 *
 * The result is made of up to three fragments written back to back:
 *   1. a header naming the peer process/node and the local process/node,
 *   2. the error string for errcode (or the numeric code when no string
 *      is available),
 *   3. the caller's printf-style message (msg / ap).
 * At most ORTE_NOTIFIER_MAX_BUF characters are produced; once a fragment
 * has been truncated the remaining fragments are dropped.
 *
 * errcode    error code to describe
 * peer_proc  peer process, may be NULL (reported as "UNKNOWN")
 * msg, ap    printf-style format and its arguments
 *
 * Returns a malloc'ed, NUL-terminated string that the caller must free(),
 * or NULL if the buffer could not be allocated.
 */
char *orte_notifier_base_peer_log(int errcode, orte_process_name_t *peer_proc,
                                  const char *msg, va_list ap)
{
    char *buf = malloc(ORTE_NOTIFIER_MAX_BUF + 1);
    char *peer_host = NULL, *peer_name = NULL;
    char *pos = buf;
    char *errstr = (char*)orte_err2str(errcode);
    int len, space = ORTE_NOTIFIER_MAX_BUF;

    if (NULL == buf) {
        return NULL;
    }

    if (peer_proc) {
        peer_host = orte_ess.proc_get_hostname(peer_proc);
        peer_name = ORTE_NAME_PRINT(peer_proc);
    }

    len = snprintf(pos, space,
                   "While communicating to proc %s on node %s,"
                   " proc %s on node %s encountered an error ",
                   peer_name ? peer_name : "UNKNOWN",
                   peer_host ? peer_host : "UNKNOWN",
                   ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                   orte_process_info.nodename);
    if (len < 0) {
        /* Output error: the buffer content is unspecified.  Discard the
         * fragment instead of moving pos *before* the buffer (which is
         * what blindly adding a negative length would do). */
        *pos = '\0';
    } else if (len >= space) {
        /* Truncated: the buffer is full.  Do not advance pos past the end
         * of the allocation; just record that no room is left. */
        space = 0;
    } else {
        space -= len;
        pos += len;
    }

    if (0 < space) {
        if (errstr) {
            len = snprintf(pos, space, "'%s':", errstr);
        } else {
            len = snprintf(pos, space, "(%d):", errcode);
        }
        /* Same bookkeeping as for the header fragment above */
        if (len < 0) {
            *pos = '\0';
        } else if (len >= space) {
            space = 0;
        } else {
            space -= len;
            pos += len;
        }
    }

    if (0 < space) {
        if (vsnprintf(pos, space, msg, ap) < 0) {
            /* Keep what was successfully written so far */
            *pos = '\0';
        }
    }

    /* Belt and braces: the buffer is one byte larger than 'space' */
    buf[ORTE_NOTIFIER_MAX_BUF] = '\0';
    return buf;
}

Просмотреть файл

@ -12,7 +12,21 @@
# MCA_notifier_command_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_notifier_command_CONFIG], [
OMPI_VAR_SCOPE_PUSH(notifier_happy)
notifier_happy=no
# We need fork() and pipe()
AC_CHECK_FUNC([fork],
[AC_CHECK_FUNC([pipe], [$1], [$2])], [$2])
[AC_CHECK_FUNC([pipe], [notifier_happy=yes])])
# We also need thread support
AS_IF([test "$notifier_happy" = "yes"],
[AC_MSG_CHECKING([for thread support])
AC_MSG_RESULT([$THREAD_TYPE])
AS_IF([test "$THREAD_TYPE" != "none"],
[notifier_happy=yes])])
AS_IF([test "$notifier_happy" = "yes"], [$1], [$2])
OMPI_VAR_SCOPE_POP
])

Просмотреть файл

@ -51,6 +51,9 @@ typedef struct {
/* Pipe to the parent */
int to_parent[2];
/* Do we want data sent to child via stdin? */
bool pass_via_stdin;
} orte_notifier_command_component_t;

Просмотреть файл

@ -42,11 +42,21 @@
#include <signal.h>
#include "opal/util/argv.h"
#include "opal/threads/threads.h"
#include "orte/constants.h"
#include "orte/mca/notifier/base/base.h"
#include "notifier_command.h"
/* Structure for holding the argument to stdin_main(); do_exec() fills
   one in and hands it to the stdin thread through t_arg. */
typedef struct {
/* File descriptor that stdin_main() writes the message to and then
   closes (do_exec() passes the write end of its stdin pipe) */
int sat_pipe_fd;
/* Severity of the notification; printed both as an integer and, via
   orte_notifier_base_sev2str(), as a string */
int sat_severity;
/* Error code of the notification */
int sat_errcode;
/* Message text; only read by stdin_main(), never freed there */
char *sat_msg;
} stdin_arg_t;
int orte_notifier_command_split(const char *cmd_arg, char ***argv_arg)
{
@ -171,6 +181,30 @@ static void diediedie(int status)
_exit(status);
}
/*
 * Main entry point for stdin thread
 *
 * Formats the notification described by the stdin_arg_t found in the
 * thread's t_arg as a small XML document, writes it (including its
 * terminating NUL byte) to arg->sat_pipe_fd and then closes that
 * descriptor.
 *
 * obj  the opal_thread_t running this function
 *
 * Always returns NULL.  Failures are not reported: if the message cannot
 * be formatted nothing is written and the descriptor is left open.
 */
static void *stdin_main(opal_object_t *obj)
{
    char *data = NULL;
    opal_thread_t *t = (opal_thread_t*) obj;
    stdin_arg_t *arg = (stdin_arg_t*) t->t_arg;

    /* asprintf() leaves the output pointer undefined on failure, so the
       return value (not the pointer) must be used to detect an error. */
    if (asprintf(&data, "<stdin>\n<notifier severity_int=\"%d\" severity_str=\"%s\" errcode=\"%d\">\n<message>%s</message>\n</notifier>\n</stdin>\n",
                 arg->sat_severity,
                 orte_notifier_base_sev2str(arg->sat_severity),
                 arg->sat_errcode,
                 arg->sat_msg) < 0) {
        data = NULL;
    }

    if (NULL != data) {
        /* Best effort: the result of the write is deliberately ignored */
        orte_notifier_command_write_fd(arg->sat_pipe_fd,
                                       strlen(data) + 1, data);
        free(data);
        close(arg->sat_pipe_fd);
    }

    return NULL;
}
/*
* Loop over waiting for a child to die
*/
@ -212,8 +246,11 @@ static void do_exec(void)
pid_t pid;
bool exited, killed;
int sel[3], status;
int pipe_to_stdin[2];
char *msg, *p, *cmd, **argv = NULL;
orte_notifier_command_component_t *c = &mca_notifier_command_component;
opal_thread_t stdin_thread;
stdin_arg_t arg;
/* First three items on the pipe are: severity, errcode, and
string length (sel = Severity, Errcode, string Length). */
@ -250,10 +287,7 @@ static void do_exec(void)
while (NULL != (p = strstr(cmd, "$S"))) {
*p = '\0';
asprintf(&temp, "%s%s%s", cmd,
((ORTE_NOTIFIER_INFRA == sel[0]) ? "INFRA" :
((ORTE_NOTIFIER_WARNING == sel[0]) ? "WARNING" :
((ORTE_NOTIFIER_NOTICE == sel[0]) ? "NOTICE" :
"UNKNOWN"))), p + 2);
orte_notifier_base_sev2str(sel[0]), p + 2);
free(cmd);
cmd = temp;
}
@ -279,6 +313,13 @@ static void do_exec(void)
/* What else can we do? */
}
/* Do we need a stdin pipe? */
if (mca_notifier_command_component.pass_via_stdin) {
if (0 != pipe(pipe_to_stdin)) {
diediedie(8);
}
}
/* Fork off the child and run the command */
pid = fork();
if (pid < 0) {
@ -286,8 +327,23 @@ static void do_exec(void)
} else if (pid == 0) {
int i;
int fdmax = sysconf(_SC_OPEN_MAX);
close(0);
for (i = 3; i < fdmax; ++i) {
close(i);
if (!mca_notifier_command_component.pass_via_stdin ||
pipe_to_stdin[0] != i) {
close(i);
}
}
/* If we have a pipe to stdin, dup it */
if (mca_notifier_command_component.pass_via_stdin) {
close(pipe_to_stdin[1]);
if (0 != pipe_to_stdin[0]) {
if (dup2(pipe_to_stdin[0], 0) < 0) {
diediedie(13);
}
close(pipe_to_stdin[0]);
}
}
/* Run it! */
@ -295,14 +351,28 @@ static void do_exec(void)
/* If we get here, bad */
diediedie(9);
}
free(cmd);
free(msg);
opal_argv_free(argv);
/* Write down stdin. Start a thread because this has to run in
parallel to the timer to kill the grandchild if it runs too
long. */
if (mca_notifier_command_component.pass_via_stdin) {
close(pipe_to_stdin[0]);
OBJ_CONSTRUCT(&stdin_thread, opal_thread_t);
stdin_thread.t_run = stdin_main;
arg.sat_pipe_fd = pipe_to_stdin[1];
arg.sat_severity = sel[0];
arg.sat_errcode = sel[1];
arg.sat_msg = msg;
stdin_thread.t_arg = (void *) &arg;
if (OPAL_SUCCESS != opal_thread_start(&stdin_thread)) {
diediedie(9);
}
}
/* Parent: wait for / reap the child. */
do_wait(pid, mca_notifier_command_component.timeout, &status, &exited);
/* If it didn't die, try killing it nicely. If that fails, kill
/* If the child didn't die, try killing it nicely. If that fails, kill
it dead. */
killed = false;
if (!exited) {
@ -316,6 +386,20 @@ static void do_exec(void)
}
}
/* Wait for the thread to complete */
if (mca_notifier_command_component.pass_via_stdin) {
void *ret;
close(pipe_to_stdin[1]);
opal_thread_join(&stdin_thread, &ret);
OBJ_DESTRUCT(&stdin_thread);
}
/* Free stuff */
free(cmd);
free(msg);
opal_argv_free(argv);
/* Handshake back up to the parent: just send the status value
back up to the parent and let all interpretation occur up
there. */

Просмотреть файл

@ -46,11 +46,11 @@
#include "notifier_command.h"
static int command_open(void);
static int command_component_query(mca_base_module_t **module, int *priority);
static int command_close(void);
static int command_register(void);
/*
* Struct of function pointers that need to be initialized
*/
@ -65,7 +65,7 @@ orte_notifier_command_component_t mca_notifier_command_component = {
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
command_open,
NULL,
command_close,
command_component_query,
command_register,
@ -93,6 +93,9 @@ orte_notifier_command_component_t mca_notifier_command_component = {
/* To-parent pipe FDs */
{ -1, -1 },
/* Pass via stdin? */
true,
};
/* Safety to ensure we don't try to write down a dead pipe */
@ -107,6 +110,8 @@ static void child_death_cb(pid_t pid, int status, void *data)
static int command_register(void)
{
int val;
mca_base_param_reg_string(&mca_notifier_command_component.super.base_version,
"cmd",
"Command to execute, with substitution. $s = integer severity; $S = string severity; $e = integer error code; $m = string message",
@ -121,6 +126,14 @@ static int command_register(void)
mca_notifier_command_component.timeout,
&mca_notifier_command_component.timeout);
mca_base_param_reg_int(&mca_notifier_command_component.super.base_version,
"use_stdin",
"If true, pass parameters to the command via stdin, formatted with trivial XML",
false, false,
(int) mca_notifier_command_component.pass_via_stdin,
&val);
mca_notifier_command_component.pass_via_stdin = OPAL_INT_TO_BOOL(val);
/* Priority */
mca_base_param_reg_int(&mca_notifier_command_component.super.base_version,
"priority",
@ -132,11 +145,6 @@ static int command_register(void)
return ORTE_SUCCESS;
}
static int command_open(void)
{
/* Nothing to do */
return ORTE_SUCCESS;
}
static int command_close(void)
{

Просмотреть файл

@ -55,12 +55,14 @@
#include "notifier_command.h"
static void command_log(int severity, int errcode, const char *msg, ...);
static void command_help(int severity, int errcode, const char *filename,
const char *topic, ...);
static void command_peer(int severity, int errcode,
orte_process_name_t *peer_proc,
const char *msg, ...);
static void command_log(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap);
static void command_help(orte_notifier_base_severity_t severity, int errcode,
const char *filename,
const char *topic, va_list ap);
static void command_peer(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc,
const char *msg, va_list ap);
/* Module */
orte_notifier_base_module_t orte_notifier_command_module = {
@ -68,16 +70,20 @@ orte_notifier_base_module_t orte_notifier_command_module = {
NULL,
command_log,
command_help,
command_peer
command_peer,
NULL
};
/*
* Back-end function to actually tell the child to fork the command
*/
static int send_command(int severity, int errcode, char *msg)
static int send_command(orte_notifier_base_severity_t severity, int errcode,
char *msg)
{
/* csel = Command, Severity, Errcode, string Length */
int rc, csel[4];
char *errmsg = NULL;
csel[0] = CMD_EXEC;
csel[1] = severity;
csel[2] = errcode;
@ -87,6 +93,7 @@ static int send_command(int severity, int errcode, char *msg)
if (ORTE_SUCCESS !=
(rc = orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
sizeof(csel), csel))) {
errmsg = "write";
goto error;
}
@ -94,6 +101,7 @@ static int send_command(int severity, int errcode, char *msg)
if (ORTE_SUCCESS !=
(rc = orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
csel[3] + 1, msg))) {
errmsg = "write";
goto error;
}
@ -104,6 +112,7 @@ static int send_command(int severity, int errcode, char *msg)
if (ORTE_SUCCESS !=
(rc = orte_notifier_command_read_fd(mca_notifier_command_component.to_parent[0],
sizeof(int) * 3, csel))) {
errmsg = "read";
goto error;
}
/* Did the grandchild exit? */
@ -144,27 +153,17 @@ static int send_command(int severity, int errcode, char *msg)
error:
orte_show_help("help-orte-notifier-command.txt",
"system call fail", true, orte_process_info.nodename,
"write", opal_strerror(rc), rc);
errmsg, opal_strerror(rc), rc);
return rc;
}
static void command_log(int severity, int errcode, const char *msg, ...)
static void command_log(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap)
{
char *output;
va_list arglist;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
/* If there was a message, output it */
va_start(arglist, msg);
vasprintf(&output, msg, arglist);
va_end(arglist);
vasprintf(&output, msg, ap);
if (NULL != output) {
send_command(severity, errcode, output);
@ -172,23 +171,11 @@ static void command_log(int severity, int errcode, const char *msg, ...)
}
}
static void command_help(int severity, int errcode, const char *filename,
const char *topic, ...)
static void command_help(orte_notifier_base_severity_t severity, int errcode,
const char *filename,
const char *topic, va_list ap)
{
va_list arglist;
char *output;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
va_start(arglist, topic);
output = opal_show_help_vstring(filename, topic, false, arglist);
va_end(arglist);
char *output = opal_show_help_vstring(filename, topic, false, ap);
if (NULL != output) {
send_command(severity, errcode, output);
@ -196,55 +183,14 @@ static void command_help(int severity, int errcode, const char *filename,
}
}
static void command_peer(int severity, int errcode,
orte_process_name_t *peer_proc, const char *msg, ...)
static void command_peer(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap)
{
va_list arglist;
char buf[ORTE_NOTIFIER_MAX_BUF + 1];
char *peer_host = NULL, *peer_name = NULL;
char *pos = buf;
char *errstr = (char*)orte_err2str(errcode);
int len, space = ORTE_NOTIFIER_MAX_BUF;
char *buf = orte_notifier_base_peer_log(errcode, peer_proc, msg, ap);
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
if (NULL != buf) {
send_command(severity, errcode, buf);
free(buf);
}
if (peer_proc) {
peer_host = orte_ess.proc_get_hostname(peer_proc);
peer_name = ORTE_NAME_PRINT(peer_proc);
}
len = snprintf(pos, space,
"While communicating to proc %s on node %s,"
" proc %s on node %s encountered an error ",
peer_name ? peer_name : "UNKNOWN",
peer_host ? peer_host : "UNKNOWN",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_process_info.nodename);
space -= len;
pos += len;
if (0 < space) {
if (errstr) {
len = snprintf(pos, space, "'%s':", errstr);
} else {
len = snprintf(pos, space, "(%d):", errcode);
}
space -= len;
pos += len;
}
if (0 < space) {
va_start(arglist, msg);
vsnprintf(pos, space, msg, arglist);
va_end(arglist);
}
buf[ORTE_NOTIFIER_MAX_BUF] = '\0';
send_command(severity, errcode, buf);
}

35
orte/mca/notifier/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
dnl University of Stuttgart. All rights reserved.
dnl Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2009 Bull SAS. All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl
dnl MCA_notifier_CONFIG(arg1, arg2)
dnl -----------------------------------------------------------
dnl Framework-level configuration for the notifier framework.
dnl Adds the --enable-notifier-log-event configure switch and publishes
dnl the outcome twice under the name ORTE_WANT_NOTIFIER_LOG_EVENT: as a
dnl preprocessor define set to 1 or 0, and as an Automake conditional.
dnl Both macro arguments are forwarded unchanged to MCA_CONFIGURE_FRAMEWORK.
AC_DEFUN([MCA_notifier_CONFIG],[
ompi_show_subsubtitle "Pre-configuring the framework notifier"
AC_MSG_CHECKING([if --enable-notifier-log-event was specified])
AC_ARG_ENABLE(notifier-log-event,
AC_HELP_STRING([--enable-notifier-log-event],
[Enable unusual events notification. (default: disabled)]))
dnl Opt-in feature: anything other than an explicit yes leaves it disabled.
if test "$enable_notifier_log_event" = "yes"; then
AC_MSG_RESULT([yes])
WANT_NOTIFIER_LOG_EVENT=1
else
AC_MSG_RESULT([no (disabling "notifier-log-event")])
WANT_NOTIFIER_LOG_EVENT=0
fi
dnl Make the choice visible to the C sources and to the Makefiles.
AC_DEFINE_UNQUOTED([ORTE_WANT_NOTIFIER_LOG_EVENT],
[$WANT_NOTIFIER_LOG_EVENT],
[if the notifier_log_event should be enabled])
AM_CONDITIONAL([ORTE_WANT_NOTIFIER_LOG_EVENT],
[test "$WANT_NOTIFIER_LOG_EVENT" = "1"])
dnl Hand over to the generic framework configuration.
dnl NOTE(review): the meaning of the trailing 1 is not visible here - confirm.
MCA_CONFIGURE_FRAMEWORK($1, $2, 1)
])

Просмотреть файл

@ -18,6 +18,9 @@
AM_CPPFLAGS = $(notifier_ftb_CPPFLAGS)
dist_pkgdata_DATA = \
help-orte-notifier-ftb.txt
sources = \
notifier_ftb.h \
notifier_ftb_module.c \

Просмотреть файл

@ -1,5 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2009 The Trustees of Indiana University and Indiana
# University Research and Technology Corporation.
# All rights reserved.
@ -11,6 +12,27 @@
#
# This is the US/English help file for Open MPI's FTB notifier support
#
[ftb connect failed]
Open MPI's FTB notifier component failed to connect to the FTB server.
Check whether the FTB bootstrap server is running. For further help,
refer to the FTB documentation (Section 4.0: RUNNING FTB).
Reason: %s (errno: %d)
#
[declare events failed]
The Open MPI FTB notifier component failed to declare publishable events
to the FTB.
Reason: %s (errno: %d)
#
[publish failed]
Sorry, Open MPI's FTB component failed to publish the following event to
the FTB.
Reason: %s (errno: %d)
Event info: [%s] %s
Event properties: %s (errno: %d)
#
[invalid subscription style]
Error: the Open MPI FTB component tried to register with an invalid
FTB client subscription style.
@ -21,7 +43,3 @@ FTB client subscription style.
Error: the Open MPI FTB notifier component tried to register with an
invalid value in the FTB client information.
#
[unable to connect]
Open MPI's FTB notifier component was unable to establish a connection
with the FTB backplane.
#

Просмотреть файл

@ -10,6 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

Просмотреть файл

@ -10,6 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -21,7 +22,7 @@
* This component proxies notification events to the Fault Tolerant
* Backplane (See http://www.mcs.anl.gov/research/cifts/).
* The ORTE notifier severity is translated to the corresponding
* FTB severity before the event is published to the FTB.
* FTB severity before the event is published to the FTB.
*/
/*
@ -48,7 +49,7 @@ orte_notifier_ftb_component_t mca_notifier_ftb_component = {
{
{
ORTE_NOTIFIER_BASE_VERSION_1_0_0,
"ftb", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
@ -87,7 +88,8 @@ static int orte_notifier_ftb_close(void)
return ORTE_SUCCESS;
}
static int orte_notifier_ftb_component_query(mca_base_module_t **module, int *priority)
static int orte_notifier_ftb_component_query(mca_base_module_t **module,
int *priority)
{
int ret;
*priority = 0;
@ -100,8 +102,7 @@ static int orte_notifier_ftb_component_query(mca_base_module_t **module, int *pr
/* We represent each client with a client name of the form
openmpi/<hostname>/<PID> as a unique identifier in the
FTB client namespace */
sprintf(ftb_client_info.client_name, "openmpi/%s/%u",
orte_process_info.nodename, orte_process_info.pid);
sprintf(ftb_client_info.client_name, "ompi%u", orte_process_info.pid);
sprintf(ftb_client_info.client_jobid, "%u", ORTE_PROC_MY_NAME->jobid);
@ -117,18 +118,20 @@ static int orte_notifier_ftb_component_query(mca_base_module_t **module, int *pr
orte_show_help("help-orte-notifier-ftb.txt",
"invalid subscription style",
true, ftb_client_info.client_subscription_style);
break;
case FTB_ERR_INVALID_VALUE:
orte_show_help("help-orte-notifier-ftb.txt",
"invalid value",
true);
break;
default:
orte_show_help("help-orte-notifier-ftb.txt",
"unable to connect",
"ftb connect failed",
true);
}
return ORTE_ERR_NOT_FOUND;
}
@ -154,7 +157,7 @@ static int orte_notifier_ftb_register(void)
mca_base_param_reg_int(&mca_notifier_ftb_component.super.base_version,
"priority",
"Priority of this component",
false, false,
false, false,
mca_notifier_ftb_component.priority,
&mca_notifier_ftb_component.priority);

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -20,6 +21,7 @@
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <string.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
@ -32,6 +34,7 @@
#include "orte/util/error_strings.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/show_help.h"
#include "orte/mca/notifier/base/base.h"
#include "notifier_ftb.h"
@ -39,17 +42,22 @@
/* Static API's */
static int init(void);
static void finalize(void);
static void mylog(int severity, int errcode, const char *msg, ...);
static void myhelplog(int severity, int errcode, const char *filename, const char *topic, ...);
static void mypeerlog(int severity, int errcode, orte_process_name_t *peer_proc, const char *msg, ...);
static void ftb_log(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap);
static void ftb_help(orte_notifier_base_severity_t severity, int errcode,
const char *filename, const char *topic, va_list ap);
static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap);
/* Module def */
orte_notifier_base_module_t orte_notifier_ftb_module = {
init,
finalize,
mylog,
myhelplog,
mypeerlog
ftb_log,
ftb_help,
ftb_peer,
NULL
};
/* FTB client information */
@ -71,7 +79,7 @@ static int orte_err2ftb(int errnum)
{
int retval;
switch (errnum) {
switch (OPAL_SOS_GET_ERROR_CODE(errnum)) {
case ORTE_ERR_OUT_OF_RESOURCE:
case ORTE_ERR_TEMP_OUT_OF_RESOURCE:
retval = 1;
@ -102,8 +110,10 @@ static int init(void) {
ret = FTB_Declare_publishable_events(ftb_client_handle, 0, ftb_event_info, ftb_event_info_count);
if (FTB_SUCCESS != ret) {
opal_output(orte_notifier_base_output,
"notifier:ftb:init FTB_Declare_publishable_events failed ret=%d\n", ret);
orte_show_help("help-orte-notifier-ftb.txt",
"declare events failed",
true, "FTB_Declare_publishable_events() failed", ret);
FTB_Disconnect(ftb_client_handle);
return ORTE_ERROR;
}
@ -115,7 +125,7 @@ static void finalize(void) {
FTB_Disconnect(ftb_client_handle);
}
static void convert2ftb(int errcode, char *payload)
static void send_to_ftb(int errcode, char *payload)
{
int ret, event_id;
FTB_event_handle_t ehandle;
@ -126,73 +136,48 @@ static void convert2ftb(int errcode, char *payload)
event_id = orte_err2ftb(errcode);
ret = FTB_Publish(ftb_client_handle, ftb_event_info[event_id].event_name, &eprop, &ehandle);
if (FTB_SUCCESS != ret) {
opal_output(orte_notifier_base_output,
"notifier:ftb:convert2ftb(%d,'%s') FTB_Publish failed ret=%d\n", errcode, eprop.event_payload, ret);
orte_show_help("help-orte-notifier-ftb.txt",
"publish failed",
true, "FTB_Publish() failed", ret,
ftb_event_info[event_id].severity,
ftb_event_info[event_id].event_name,
eprop.event_payload, errcode);
}
}
static void mylog(int severity, int errcode, const char *msg, ...)
static void ftb_log(orte_notifier_base_severity_t severity, int errcode, const char *msg,
va_list ap)
{
va_list arglist;
char payload[FTB_MAX_PAYLOAD_DATA + 1];
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
char *payload;
/* If there was a message, output it */
va_start(arglist, msg);
vsnprintf(payload, FTB_MAX_PAYLOAD_DATA, msg, arglist);
payload[FTB_MAX_PAYLOAD_DATA] = '\0'; /* not needed? */
va_end(arglist);
convert2ftb(errcode, payload);
vasprintf(&payload, msg, ap);
if (NULL != payload) {
send_to_ftb(errcode, payload);
free(payload);
}
}
static void myhelplog(int severity, int errcode, const char *filename, const char *topic, ...)
static void ftb_help(orte_notifier_base_severity_t severity, int errcode,
const char *filename, const char *topic, va_list ap)
{
va_list arglist;
char *output;
char *output = opal_show_help_vstring(filename, topic, false, ap);
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
va_start(arglist, topic);
output = opal_show_help_vstring(filename, topic, false, arglist);
va_end(arglist);
convert2ftb(errcode, output);
if (NULL != output) {
send_to_ftb(errcode, output);
free(output);
}
}
static void mypeerlog(int severity, int errcode, orte_process_name_t *peer_proc, const char *msg, ...)
static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap)
{
va_list arglist;
char payload[FTB_MAX_PAYLOAD_DATA + 1];
char *peer_host = NULL;
char *pos = payload;
int len, space = FTB_MAX_PAYLOAD_DATA;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
if (peer_proc) {
peer_host = orte_ess.proc_get_hostname(peer_proc);
}
@ -202,11 +187,9 @@ static void mypeerlog(int severity, int errcode, orte_process_name_t *peer_proc,
/* If there was a message, and space left, output it */
if (0 < space) {
va_start(arglist, msg);
vsnprintf(pos, space, msg, arglist);
va_end(arglist);
vsnprintf(pos, space, msg, ap);
}
payload[FTB_MAX_PAYLOAD_DATA] = '\0'; /* not needed? */
convert2ftb(errcode, payload);
payload[FTB_MAX_PAYLOAD_DATA] = '\0';
send_to_ftb(errcode, payload);
}

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -33,14 +34,18 @@
*/
#include "orte_config.h"
#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif
#include "opal/mca/mca.h"
#include "opal/util/opal_sos.h"
#include "orte/constants.h"
#include "orte/types.h"
#ifdef HAVE_SYSLOG_H
#include <syslog.h>
#endif /* HAVE_SYSLOG_H */
#include "opal/mca/mca.h"
#include "notifier_event_types.h"
BEGIN_C_DECLS
@ -51,13 +56,17 @@ BEGIN_C_DECLS
*/
#define ORTE_NOTIFIER_MAX_BUF 512
/* define severities - this will eventually be replaced by OPAL_SOS
priorities */
enum {
ORTE_NOTIFIER_INFRA = LOG_CRIT,
ORTE_NOTIFIER_WARNING = LOG_WARNING,
ORTE_NOTIFIER_NOTICE = LOG_NOTICE
};
/* Severities, based on OPAL SOS.
 *
 * Each notifier severity is an alias for the OPAL SOS severity of the
 * same name.  Events are compared against a threshold severity before
 * being forwarded to the notifier modules.
 * NOTE(review): the notifier wrappers treat numerically lower values as
 * "more severe"; this presumably follows from the OPAL_SOS_SEVERITY_*
 * values, which are not visible here - confirm.
 */
typedef enum {
ORTE_NOTIFIER_EMERG = OPAL_SOS_SEVERITY_EMERG,
ORTE_NOTIFIER_ALERT = OPAL_SOS_SEVERITY_ALERT,
ORTE_NOTIFIER_CRIT = OPAL_SOS_SEVERITY_CRIT,
ORTE_NOTIFIER_ERROR = OPAL_SOS_SEVERITY_ERROR,
ORTE_NOTIFIER_WARN = OPAL_SOS_SEVERITY_WARN,
ORTE_NOTIFIER_NOTICE = OPAL_SOS_SEVERITY_NOTICE,
ORTE_NOTIFIER_INFO = OPAL_SOS_SEVERITY_INFO,
ORTE_NOTIFIER_DEBUG = OPAL_SOS_SEVERITY_DEBUG
} orte_notifier_base_severity_t;
/*
* Component functions - all MUST be provided!
@ -70,22 +79,25 @@ typedef int (*orte_notifier_base_module_init_fn_t)(void);
typedef void (*orte_notifier_base_module_finalize_fn_t)(void);
/* Log a failure message */
typedef void (*orte_notifier_base_module_log_fn_t)(int severity, int errcode, const char *msg, ...)
typedef void (*orte_notifier_base_module_log_fn_t)(orte_notifier_base_severity_t severity, int errcode, const char *msg, va_list ap)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 3, 4)
__opal_attribute_format__(__printf__, 3, 0)
# endif
;
/* Log a failure that is based upon a show_help message */
typedef void (*orte_notifier_base_module_log_show_help_fn_t)(int severity, int errcode, const char *file, const char *topic, ...);
typedef void (*orte_notifier_base_module_log_show_help_fn_t)(orte_notifier_base_severity_t severity, int errcode, const char *file, const char *topic, va_list ap);
/* Log a failure related to a peer */
typedef void (*orte_notifier_base_module_log_peer_fn_t)(int severity, int errcode, orte_process_name_t *peer_proc, const char *msg, ...)
typedef void (*orte_notifier_base_module_log_peer_fn_t)(orte_notifier_base_severity_t severity, int errcode, orte_process_name_t *peer_proc, const char *msg, va_list ap)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 4, 5)
__opal_attribute_format__(__printf__, 4, 0)
# endif
;
/* Log an unusual event message */
typedef void (*orte_notifier_base_module_log_event_fn_t)(const char *msg);
/*
* Ver 1.0
*/
@ -95,11 +107,37 @@ struct orte_notifier_base_module_1_0_0_t {
orte_notifier_base_module_log_fn_t log;
orte_notifier_base_module_log_show_help_fn_t help;
orte_notifier_base_module_log_peer_fn_t peer;
orte_notifier_base_module_log_event_fn_t log_event;
};
typedef struct orte_notifier_base_module_1_0_0_t orte_notifier_base_module_1_0_0_t;
typedef orte_notifier_base_module_1_0_0_t orte_notifier_base_module_t;
/*
* API functions
*/
/* Log a failure message */
typedef void (*orte_notifier_base_API_log_fn_t)(orte_notifier_base_severity_t severity, int errcode, const char *msg, ...);
/* Log a failure that is based upon a show_help message */
typedef void (*orte_notifier_base_API_log_show_help_fn_t)(orte_notifier_base_severity_t severity, int errcode, const char *file, const char *topic, ...);
/* Log a failure related to a peer */
typedef void (*orte_notifier_base_API_log_peer_fn_t)(orte_notifier_base_severity_t severity, int errcode, orte_process_name_t *peer_proc, const char *msg, ...);
/*
* Define a struct to hold the API functions that users will call
*/
struct orte_notifier_API_module_1_0_0_t {
orte_notifier_base_API_log_fn_t log;
orte_notifier_base_API_log_show_help_fn_t show_help;
orte_notifier_base_API_log_peer_fn_t log_peer;
};
typedef struct orte_notifier_API_module_1_0_0_t orte_notifier_API_module_1_0_0_t;
typedef orte_notifier_API_module_1_0_0_t orte_notifier_API_module_t;
ORTE_DECLSPEC extern orte_notifier_API_module_t orte_notifier;
/*
* the standard component data structure
*/
@ -111,7 +149,6 @@ typedef struct orte_notifier_base_component_1_0_0_t orte_notifier_base_component
typedef orte_notifier_base_component_1_0_0_t orte_notifier_base_component_t;
/*
* Macro for use in components that are of type notifier v1.0.0
*/
@ -121,9 +158,21 @@ typedef orte_notifier_base_component_1_0_0_t orte_notifier_base_component_t;
/* notifier v1.0 */ \
"notifier", 1, 0, 0
/* Global structure for accessing notifier functions
/*
* To manage unusual events notifications
* Set to noop if not wanted
*/
ORTE_DECLSPEC extern orte_notifier_base_module_t orte_notifier; /* holds selected module's function pointers */
#if ORTE_WANT_NOTIFIER_LOG_EVENT
#include "notifier_event_calls.h"
#else /* ORTE_WANT_NOTIFIER_LOG_EVENT */
#define ORTE_NOTIFIER_DEFINE_EVENT(i, m)
#define ORTE_NOTIFIER_LOG_EVENT(i, c, t) do {} while (0)
#endif /* ORTE_WANT_NOTIFIER_LOG_EVENT */
END_C_DECLS

170
orte/mca/notifier/notifier_event_calls.h Обычный файл
Просмотреть файл

@ -0,0 +1,170 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Bull SAS. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_NOTIFIER_EVENTS_CALLS_H
#define ORTE_NOTIFIER_EVENTS_CALLS_H
#include "orte_config.h"
#ifdef HAVE_STDIO_H
#include <stdio.h>
#endif /* HAVE_STDIO_H */
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/class/opal_object.h"
BEGIN_C_DECLS
#define ORTE_NOTIFIER_LOG_0 0 /* Initial log format needed (no delay) */
#define ORTE_NOTIFIER_LOG_1 1 /* Intermediate log format needed (delay) */
#define ORTE_NOTIFIER_LOG_2 2 /* Final log format needed (at finalize) */
ORTE_DECLSPEC bool notifier_log_event_enabled(void);
ORTE_DECLSPEC void notifier_event_store(orte_notifier_event_t *);
ORTE_DECLSPEC void notifier_trace_event(int, int, int32_t, time_t, time_t,
const char *);
/*
 * Allocate a new event object, give it a private copy of msg, set its
 * id and register it through notifier_event_store().
 *
 * ev_id  identifier stored in the event
 * msg    message text; it is copied verbatim (it is NOT interpreted as a
 *        printf format)
 *
 * Returns the new event, or NULL if the object or the copy of the
 * message could not be allocated.
 *
 * Do not use this function directly: use ORTE_NOTIFIER_DEFINE_EVENT() instead
 */
static inline orte_notifier_event_t *notifier_alloc_event(int ev_id,
                                                          const char *msg)
{
    orte_notifier_event_t *ev;

    ev = OBJ_NEW(orte_notifier_event_t);
    if (NULL == ev) {
        return NULL;
    }

    /*
     * Never pass msg as the format string: a '%' in it would make
     * asprintf() read arguments that were not supplied.  Also rely on the
     * return value to detect failure, because the output pointer is
     * undefined in that case.
     */
    if (asprintf(&ev->ev_msg, "%s", msg) < 0) {
        /* Do not leave an undefined pointer in the object being released */
        ev->ev_msg = NULL;
        OBJ_RELEASE(ev);
        return NULL;
    }
    ev->ev_id = ev_id;

    /*
     * Store the allocated event into a list to be able to manage the
     * unconditional event tracing and freeing during finalize.
     */
    notifier_event_store(ev);

    return ev;
}
/*
 * Account for one more occurrence of an event and trace it when due.
 *
 * ev           event being counted
 * ev_id        event identifier passed on to notifier_trace_event()
 * cnt_thresh   nothing is traced while the counter is <= this value
 * time_thresh  once the event has been traced a first time, further
 *              traces are emitted only when more than this many seconds
 *              have passed since the previous trace
 *
 * Whenever a trace is emitted the counter is reset to 0 and the trace
 * time is updated.
 */
static inline void notifier_count_and_log_event(orte_notifier_event_t *ev,
                                                int ev_id,
                                                int cnt_thresh,
                                                int time_thresh)
{
    int32_t occurrences;
    time_t current, elapsed;

    opal_atomic_add_32(&ev->ev_cnt, 1);

    /* Still under the count threshold: nothing to report yet */
    if (ev->ev_cnt <= cnt_thresh) {
        return;
    }

    occurrences = ev->ev_cnt;
    current = time(NULL);

    if (!ev->ev_already_traced) {
        /* Very first trace of this event */
        ev->ev_already_traced = 1;
        ev->ev_cnt = 0;
        ev->ev_time_trc = current;
        /* We don't care about the delay for the very 1st trace */
        notifier_trace_event(ORTE_NOTIFIER_LOG_0, ev_id, occurrences,
                             current, current, ev->ev_msg);
        return;
    }

    /* Already traced before: rate-limit on the time threshold */
    if (current > ev->ev_time_trc + time_thresh) {
        elapsed = current - ev->ev_time_trc;
        ev->ev_cnt = 0;
        ev->ev_time_trc = current;
        notifier_trace_event(ORTE_NOTIFIER_LOG_1, ev_id, occurrences,
                             current, elapsed, ev->ev_msg);
    }
}
/*
 * Build the name of the per-event logging function by pasting the event
 * number after "notifier_log_event_".
 */
#define notifier_event_fn_prefix(i) notifier_log_event_ ## i
/*
 * This macro should be called each time a new event will be traced.
 * It expands to a static inline function suffixed by the event number,
 * taking the counter threshold (c_thr) and the time threshold (t_thr).
 *
 * i: event number, used both in the generated names and as the event id.
 * m: message associated with the event, handed to notifier_alloc_event().
 *
 * The generated function returns immediately when
 * notifier_log_event_enabled() is false. Otherwise it allocates the event
 * object on its first call, keeps it in a function-local static pointer, and
 * forwards to notifier_count_and_log_event(). If the allocation fails the
 * occurrence is dropped and the allocation is attempted again on the next
 * call.
 *
 * NOTE(review): the lazy initialization of the static pointer is not
 * synchronized; confirm whether concurrent first calls are possible.
 */
#define ORTE_NOTIFIER_DEFINE_EVENT(i, m) \
static inline void notifier_event_fn_prefix(i) (int c_thr, int t_thr) \
{ \
static orte_notifier_event_t *prefix_ ## i = NULL; \
if (!notifier_log_event_enabled()) { \
return; \
} \
if (NULL == prefix_ ## i) { \
prefix_ ## i = notifier_alloc_event(i, m); \
if (NULL == prefix_ ## i) { \
return; \
} \
} \
notifier_count_and_log_event(prefix_ ## i, i, c_thr, t_thr); \
}
/*
 * This is the log interface that should be called whenever an unusual event
 * needs to be reported.
 * The event must have been defined beforehand, using
 * ORTE_NOTIFIER_DEFINE_EVENT():
 *
 * (1) Event definition:
 *
 * Typically, in a header file, call the following:
 * ORTE_NOTIFIER_DEFINE_EVENT(0, "message 0")
 * This macro expands to
 * static inline void notifier_log_event_0(int c_thr, int t_thr)
 * {
 * static orte_notifier_event_t *prefix_0 = NULL;
 * if (!notifier_log_event_enabled()) {
 * return;
 * }
 * if (NULL == prefix_0) {
 * prefix_0 = notifier_alloc_event(0, "message 0");
 * if (NULL == prefix_0) {
 * return;
 * }
 * }
 * notifier_count_and_log_event(prefix_0, 0, c_thr, t_thr);
 * }
 *
 * (2) Event accounting and tracing:
 *
 * Whenever you want to trace the unusual event whose id is 0, just call:
 * ORTE_NOTIFIER_LOG_EVENT(0, 100, 1);
 * 100 and 1 are respectively the counter and time thresholds.
 * This actually expands to
 * notifier_log_event_0(100, 1);
 */
#define ORTE_NOTIFIER_LOG_EVENT(i, c, t) notifier_event_fn_prefix(i) (c, t)
END_C_DECLS
#endif /* ORTE_NOTIFIER_EVENTS_CALLS_H */

49
orte/mca/notifier/notifier_event_types.h Обычный файл
Просмотреть файл

@ -0,0 +1,49 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Bull SAS. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_NOTIFIER_BASE_EVENTS_H
#define ORTE_NOTIFIER_BASE_EVENTS_H
#include "orte_config.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/sys/atomic.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_object.h"
BEGIN_C_DECLS
/*
 * Per-event accounting record. One object is created per defined event by
 * notifier_alloc_event() and updated by notifier_count_and_log_event()
 * (both in notifier_event_calls.h).
 */
typedef struct {
/* Base class: allows the event to be kept in an opal list */
opal_list_item_t super;
/* Occurrence counter: incremented atomically, reset to 0 at each trace */
volatile int32_t ev_cnt;
/* Event identifier given when the event was allocated */
int ev_id;
/* Set to 1 once the event has been traced for the first time */
int ev_already_traced;
/* Time (as returned by time()) of the most recent trace */
time_t ev_time_trc;
/* Heap-allocated copy of the event message */
char *ev_msg;
} orte_notifier_event_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_notifier_event_t);
END_C_DECLS
#endif /* ORTE_NOTIFIER_BASE_EVENTS_H */

Просмотреть файл

@ -31,7 +31,6 @@
#include "notifier_smtp.h"
static int smtp_open(void);
static int smtp_component_query(mca_base_module_t **module, int *priority);
static int smtp_close(void);
static int smtp_register(void);
@ -50,7 +49,7 @@ orte_notifier_smtp_component_t mca_notifier_smtp_component = {
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
smtp_open,
NULL,
smtp_close,
smtp_component_query,
smtp_register,
@ -160,12 +159,6 @@ static int smtp_register(void)
return ORTE_SUCCESS;
}
static int smtp_open(void)
{
/* Nothing to do */
return ORTE_SUCCESS;
}
static int smtp_close(void)
{
if (NULL != mca_notifier_smtp_component.server) {

Просмотреть файл

@ -51,11 +51,14 @@
/* Static API's */
static void mylog(int severity, int errcode, const char *msg, ...);
static void myhelplog(int severity, int errcode, const char *filename,
const char *topic, ...);
static void mypeerlog(int severity, int errcode, orte_process_name_t *peer_proc,
const char *msg, ...);
static void mylog(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap);
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
const char *filename,
const char *topic, va_list ap);
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc,
const char *msg, va_list ap);
/* Module */
orte_notifier_base_module_t orte_notifier_smtp_module = {
@ -63,7 +66,8 @@ orte_notifier_base_module_t orte_notifier_smtp_module = {
NULL,
mylog,
myhelplog,
mypeerlog
mypeerlog,
NULL
};
typedef enum {
@ -311,23 +315,13 @@ static int send_email(char *msg)
return err;
}
static void mylog(int severity, int errcode, const char *msg, ...)
static void mylog(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap)
{
char *output;
va_list arglist;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
/* If there was a message, output it */
va_start(arglist, msg);
vasprintf(&output, msg, arglist);
va_end(arglist);
vasprintf(&output, msg, ap);
if (NULL != output) {
send_email(output);
@ -335,23 +329,11 @@ static void mylog(int severity, int errcode, const char *msg, ...)
}
}
static void myhelplog(int severity, int errcode, const char *filename,
const char *topic, ...)
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
const char *filename,
const char *topic, va_list ap)
{
va_list arglist;
char *output;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
va_start(arglist, topic);
output = opal_show_help_vstring(filename, topic, false, arglist);
va_end(arglist);
char *output = opal_show_help_vstring(filename, topic, false, ap);
if (NULL != output) {
send_email(output);
@ -359,55 +341,14 @@ static void myhelplog(int severity, int errcode, const char *filename,
}
}
static void mypeerlog(int severity, int errcode,
orte_process_name_t *peer_proc, const char *msg, ...)
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap)
{
va_list arglist;
char buf[ORTE_NOTIFIER_MAX_BUF + 1];
char *peer_host = NULL, *peer_name = NULL;
char *pos = buf;
char *errstr = (char*)orte_err2str(errcode);
int len, space = ORTE_NOTIFIER_MAX_BUF;
char *buf = orte_notifier_base_peer_log(errcode, peer_proc, msg, ap);
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
if (NULL != buf) {
send_email(buf);
free(buf);
}
if (peer_proc) {
peer_host = orte_ess.proc_get_hostname(peer_proc);
peer_name = ORTE_NAME_PRINT(peer_proc);
}
len = snprintf(pos, space,
"While communicating to proc %s on node %s,"
" proc %s on node %s encountered an error ",
peer_name ? peer_name : "UNKNOWN",
peer_host ? peer_host : "UNKNOWN",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_process_info.nodename);
space -= len;
pos += len;
if (0 < space) {
if (errstr) {
len = snprintf(pos, space, "'%s':", errstr);
} else {
len = snprintf(pos, space, "(%d):", errcode);
}
space -= len;
pos += len;
}
if (0 < space) {
va_start(arglist, msg);
vsnprintf(pos, space, msg, arglist);
va_end(arglist);
}
buf[ORTE_NOTIFIER_MAX_BUF] = '\0';
send_email(buf);
}

Просмотреть файл

@ -10,6 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -26,14 +27,6 @@
BEGIN_C_DECLS
/*
* Component open / close
*/
int orte_notifier_syslog_open(void);
int orte_notifier_syslog_close(void);
int orte_notifier_syslog_component_query(mca_base_module_t **module, int *priority);
/*
* Notifier interfaces
*/

Просмотреть файл

@ -10,6 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -28,6 +29,10 @@
#include "notifier_syslog.h"
static int orte_notifier_syslog_component_query(mca_base_module_t **module,
int *priority);
/*
* Struct of function pointers that need to be initialized
*/
@ -39,8 +44,8 @@ orte_notifier_base_component_t mca_notifier_syslog_component = {
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
orte_notifier_syslog_open, /* module open */
orte_notifier_syslog_close, /* module close */
NULL,
NULL,
orte_notifier_syslog_component_query /* module query */
},
{
@ -49,20 +54,9 @@ orte_notifier_base_component_t mca_notifier_syslog_component = {
}
};
/* Open the component */
int orte_notifier_syslog_open(void)
static int orte_notifier_syslog_component_query(mca_base_module_t **module,
int *priority)
{
return ORTE_SUCCESS;
}
int orte_notifier_syslog_close(void)
{
return ORTE_SUCCESS;
}
int orte_notifier_syslog_component_query(mca_base_module_t **module, int *priority)
{
/* we are a lower-level default, so set a low priority so we can be overridden */
*priority = 1;
*module = (mca_base_module_t *)&orte_notifier_syslog_module;
return ORTE_SUCCESS;

Просмотреть файл

@ -33,10 +33,9 @@
#include "opal/util/show_help.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/notifier/base/base.h"
#include "notifier_syslog.h"
@ -44,9 +43,14 @@
/* Static API's */
static int init(void);
static void finalize(void);
static void mylog(int severity, int errcode, const char *msg, ...);
static void myhelplog(int severity, int errcode, const char *filename, const char *topic, ...);
static void mypeerlog(int severity, int errcode, orte_process_name_t *peer_proc, const char *msg, ...);
static void mylog(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap);
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
const char *filename, const char *topic, va_list ap);
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap);
static void myeventlog(const char *msg);
/* Module def */
orte_notifier_base_module_t orte_notifier_syslog_module = {
@ -54,11 +58,13 @@ orte_notifier_base_module_t orte_notifier_syslog_module = {
finalize,
mylog,
myhelplog,
mypeerlog
mypeerlog,
myeventlog
};
static int init(void) {
static int init(void)
{
int opts;
opts = LOG_CONS | LOG_PID | LOG_SYSLOG;
@ -67,44 +73,22 @@ static int init(void) {
return ORTE_SUCCESS;
}
static void finalize(void) {
static void finalize(void)
{
closelog();
}
static void mylog(int severity, int errcode, const char *msg, ...)
static void mylog(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap)
{
va_list arglist;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
/* If there was a message, output it */
va_start(arglist, msg);
vsyslog(severity, msg, arglist);
va_end(arglist);
vsyslog(severity, msg, ap);
}
static void myhelplog(int severity, int errcode, const char *filename, const char *topic, ...)
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
const char *filename, const char *topic, va_list ap)
{
va_list arglist;
char *output;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
va_start(arglist, topic);
output = opal_show_help_vstring(filename, topic, false, arglist);
va_end(arglist);
char *output = opal_show_help_vstring(filename, topic, false, ap);
/* if nothing came back, then nothing to do */
if (NULL == output) {
@ -116,54 +100,21 @@ static void myhelplog(int severity, int errcode, const char *filename, const cha
free(output);
}
static void mypeerlog(int severity, int errcode, orte_process_name_t *peer_proc, const char *msg, ...)
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap)
{
va_list arglist;
char buf[ORTE_NOTIFIER_MAX_BUF + 1];
char *peer_host = NULL, *peer_name = NULL;
char *pos = buf;
char *errstr = (char*)orte_err2str(errcode);
int len, space = ORTE_NOTIFIER_MAX_BUF;
char *buf = orte_notifier_base_peer_log(errcode, peer_proc, msg, ap);
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
if (NULL != buf) {
syslog(severity, buf);
free(buf);
}
if (peer_proc) {
peer_host = orte_ess.proc_get_hostname(peer_proc);
peer_name = ORTE_NAME_PRINT(peer_proc);
}
len = snprintf(pos, space,
"While communicating to proc %s on node %s,"
" proc %s on node %s encountered an error ",
peer_name ? peer_name : "UNKNOWN",
peer_host ? peer_host : "UNKNOWN",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_process_info.nodename);
space -= len;
pos += len;
if (0 < space) {
if (errstr) {
len = snprintf(pos, space, "'%s':", errstr);
} else {
len = snprintf(pos, space, "(%d):", errcode);
}
space -= len;
pos += len;
}
if (0 < space) {
va_start(arglist, msg);
vsnprintf(pos, space, msg, arglist);
va_end(arglist);
}
buf[ORTE_NOTIFIER_MAX_BUF] = '\0'; /* not needed? */
syslog(severity, buf);
}
/*
 * Event-logging entry point of the syslog notifier module: write msg to
 * syslog with the LOG_LOCAL0 facility at LOG_NOTICE level.
 *
 * msg: complete, already formatted message text.
 */
static void myeventlog(const char *msg)
{
    /*
     * The message is data, not a format: route it through "%s" so that any
     * '%' it contains is logged literally instead of being interpreted as a
     * conversion specification.
     */
    syslog(LOG_LOCAL0 | LOG_NOTICE, "%s", msg);
}

Просмотреть файл

@ -35,7 +35,6 @@
#include "notifier_twitter.h"
static int twitter_open(void);
static int twitter_component_query(mca_base_module_t **module, int *priority);
static int twitter_close(void);
static int twitter_register(void);
@ -54,7 +53,7 @@ orte_notifier_twitter_component_t mca_notifier_twitter_component = {
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
twitter_open,
NULL,
twitter_close,
twitter_component_query,
twitter_register,
@ -108,12 +107,6 @@ static int twitter_register(void)
return ORTE_SUCCESS;
}
static int twitter_open(void)
{
/* Nothing to do */
return ORTE_SUCCESS;
}
static int twitter_close(void)
{
if (NULL != mca_notifier_twitter_component.url) {
@ -134,9 +127,6 @@ static int twitter_component_query(mca_base_module_t **module,
{
char *str;
*priority = 10;
*module = (mca_base_module_t *)&orte_notifier_twitter_module;
/* If we have no username or password, there's no love */
if (NULL == mca_notifier_twitter_component.username ||
NULL == mca_notifier_twitter_component.password) {
@ -195,5 +185,7 @@ static int twitter_component_query(mca_base_module_t **module,
return ORTE_ERR_NOT_FOUND;
}
*priority = 10;
*module = (mca_base_module_t *)&orte_notifier_twitter_module;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -56,11 +56,14 @@
/* Static API's */
static void mylog(int severity, int errcode, const char *msg, ...);
static void myhelplog(int severity, int errcode, const char *filename,
const char *topic, ...);
static void mypeerlog(int severity, int errcode, orte_process_name_t *peer_proc,
const char *msg, ...);
static void mylog(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap);
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
const char *filename,
const char *topic, va_list ap);
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc,
const char *msg, va_list ap);
/* Module */
orte_notifier_base_module_t orte_notifier_twitter_module = {
@ -68,7 +71,8 @@ orte_notifier_base_module_t orte_notifier_twitter_module = {
NULL,
mylog,
myhelplog,
mypeerlog
mypeerlog,
NULL
};
static char base64_convert(uint8_t i)
@ -265,23 +269,13 @@ static void tweet(char *msg)
close(fd);
}
static void mylog(int severity, int errcode, const char *msg, ...)
static void mylog(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap)
{
char *output;
va_list arglist;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
/* If there was a message, output it */
va_start(arglist, msg);
vasprintf(&output, msg, arglist);
va_end(arglist);
vasprintf(&output, msg, ap);
if (NULL != output) {
tweet(output);
@ -289,23 +283,11 @@ static void mylog(int severity, int errcode, const char *msg, ...)
}
}
static void myhelplog(int severity, int errcode, const char *filename,
const char *topic, ...)
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
const char *filename,
const char *topic, va_list ap)
{
va_list arglist;
char *output;
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
}
va_start(arglist, topic);
output = opal_show_help_vstring(filename, topic, false, arglist);
va_end(arglist);
char *output = opal_show_help_vstring(filename, topic, false, ap);
if (NULL != output) {
tweet(output);
@ -313,55 +295,15 @@ static void myhelplog(int severity, int errcode, const char *filename,
}
}
static void mypeerlog(int severity, int errcode,
orte_process_name_t *peer_proc, const char *msg, ...)
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap)
{
va_list arglist;
char buf[ORTE_NOTIFIER_MAX_BUF + 1];
char *peer_host = NULL, *peer_name = NULL;
char *pos = buf;
char *errstr = (char*)orte_err2str(errcode);
int len, space = ORTE_NOTIFIER_MAX_BUF;
char *buf = orte_notifier_base_peer_log(errcode, peer_proc, msg, ap);
/* is the severity value above the threshold - I know
* this seems backward, but lower severity values are
* considered "more severe"
*/
if (severity > orte_notifier_threshold_severity) {
return;
if (NULL != buf) {
tweet(buf);
free(buf);
}
if (peer_proc) {
peer_host = orte_ess.proc_get_hostname(peer_proc);
peer_name = ORTE_NAME_PRINT(peer_proc);
}
len = snprintf(pos, space,
"While communicating to proc %s on node %s,"
" proc %s on node %s encountered an error ",
peer_name ? peer_name : "UNKNOWN",
peer_host ? peer_host : "UNKNOWN",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_process_info.nodename);
space -= len;
pos += len;
if (0 < space) {
if (errstr) {
len = snprintf(pos, space, "'%s':", errstr);
} else {
len = snprintf(pos, space, "(%d):", errcode);
}
space -= len;
pos += len;
}
if (0 < space) {
va_start(arglist, msg);
vsnprintf(pos, space, msg, arglist);
va_end(arglist);
}
buf[ORTE_NOTIFIER_MAX_BUF] = '\0';
tweet(buf);
}