1
1
* register FTB events from an event schema file
  * define more FTB events
  * minor fixes

This commit was SVN r23180.
Этот коммит содержится в:
Abhishek Kulkarni 2010-05-19 22:05:06 +00:00
родитель b56ab33ff6
Коммит 118ce0e166
4 изменённых файлов: 145 добавлений и 78 удалений

Просмотреть файл

@ -19,7 +19,8 @@
AM_CPPFLAGS = $(notifier_ftb_CPPFLAGS)
dist_pkgdata_DATA = \
help-orte-notifier-ftb.txt
help-orte-notifier-ftb.txt \
help-ftb-event-schema.txt
sources = \
notifier_ftb.h \

Просмотреть файл

@ -0,0 +1,37 @@
# -*- text -*-
#
# Copyright (c) 2010 The Trustees of Indiana University and Indiana
# University Research and Technology Corporation.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the FTB event schema file for Open MPI's FTB notifier
#
start
ftb.mpi.openmpi
MPI_INIT INFO
MPI_FINALIZE INFO
MPI_NODE_DEAD ERROR
MPI_NODE_RESTORED INFO
MPI_RANK_DEAD ERROR
MPI_RANK_RESTORED INFO
MPI_NODE_MIGRATE_DONE INFO
MPI_JOB_ABORT_CMD ERROR
MPI_JOB_RESUME_CMD INFO
MPI_JOB_ABORTED ERROR
MPI_JOB_RESUMED INFO
MPI_MSG_CORRUPT ERROR
MPI_IFACE_DEAD ERROR
MPI_IFACE_RESTORED ERROR
MPI_UNKNOWN_ERROR ERROR
MPI_OUT_OF_RESOURCE ERROR
MPI_NODE_UNREACHABLE ERROR
MPI_COMM_FAILURE ERROR
end

Просмотреть файл

@ -39,20 +39,23 @@ typedef struct {
int priority;
} orte_notifier_ftb_component_t;
/*
* Notifier interfaces
*/
/* Notifier interfaces */
ORTE_MODULE_DECLSPEC extern orte_notifier_ftb_component_t mca_notifier_ftb_component;
extern orte_notifier_base_module_t orte_notifier_ftb_module;
/*
* FTB client information
*/
/* FTB client information */
extern FTB_client_t ftb_client_info;
extern FTB_client_handle_t ftb_client_handle;
/*
 * FTB event types.
 *
 * One of these values is stored (cast to int) in the event_type field of
 * the event properties handed to FTB_Publish().
 */
typedef enum {
    FTB_EVENT_NORMAL   = 1,  /* regular event; the only type this notifier publishes */
    FTB_EVENT_RESPONSE = 2   /* NOTE(review): presumably a reply to an earlier event - confirm against the FTB API */
} ftb_event_type_t;
/*
 * Turn an FTB event identifier into its name as a string literal, e.g.
 * FTB_ERROR(MPI_NODE_DEAD) expands to "MPI_NODE_DEAD".  The argument is
 * stringified as written; it is not macro-expanded first.
 */
#define FTB_ERROR(event_id) #event_id
END_C_DECLS
#endif

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -30,15 +30,15 @@
#include <stdarg.h>
#endif
#include "opal/mca/installdirs/installdirs.h"
#include "opal/util/show_help.h"
#include "opal/util/os_path.h"
#include "orte/util/error_strings.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/show_help.h"
#include "orte/mca/notifier/base/base.h"
#include "notifier_ftb.h"
/* Static API's */
static int init(void);
static void finalize(void);
@ -66,53 +66,23 @@ FTB_client_t ftb_client_info;
/* FTB client handle */
FTB_client_handle_t ftb_client_handle;
/*
 * Table of the events this notifier knows about.  Each entry is an
 * {event name, severity} pair.  The position of an entry matters: the
 * numbered comments below are the indices that orte_err2ftb() returns,
 * with entry 0 acting as the catch-all.
 */
static FTB_event_info_t ftb_event_info[] = {
/* 0 */ {"UNKNOWN_ERROR", "error"},
/* 1 */ {"OUT_OF_RESOURCES", "error"},
/* 2 */ {"UNREACHABLE", "error"},
/* 3 */ {"COMM_FAILURE", "error"},
/* 4 */ {"FATAL", "fatal"},
};
/* Number of entries in ftb_event_info[]; used to range-check event indices */
static const int ftb_event_info_count = sizeof(ftb_event_info)/sizeof(FTB_event_info_t);
/*
 * Map an ORTE error code onto an index into ftb_event_info[].
 *
 * The error code is first reduced with OPAL_SOS_GET_ERROR_CODE().  Codes
 * without a dedicated entry, and any index that would fall outside the
 * table, map to entry 0.
 *
 * @param errnum  ORTE error code to translate
 * @return        index into ftb_event_info[], always within the table
 */
static int orte_err2ftb(int errnum)
{
    const int code = OPAL_SOS_GET_ERROR_CODE(errnum);
    int event_idx = 0;

    if (ORTE_ERR_OUT_OF_RESOURCE == code ||
        ORTE_ERR_TEMP_OUT_OF_RESOURCE == code) {
        event_idx = 1;
    } else if (ORTE_ERR_CONNECTION_REFUSED == code ||
               ORTE_ERR_CONNECTION_FAILED == code ||
               ORTE_ERR_UNREACH == code) {
        event_idx = 2;
    } else if (ORTE_ERR_COMM_FAILURE == code) {
        event_idx = 3;
    } else if (ORTE_ERR_FATAL == code) {
        event_idx = 4;
    }

    /* Guard against the table being shorter than the mapping above expects */
    if (event_idx < 0 || event_idx >= ftb_event_info_count) {
        event_idx = 0;
    }

    return event_idx;
}
static int init(void) {
int ret;
char *schema_file;
/* Locate the FTB events schema file */
if (NULL == (schema_file = opal_os_path(false, opal_install_dirs.pkgdatadir,
"help-ftb-event-schema.txt", NULL))) {
schema_file = strdup("help-ftb-event-schema.txt");
}
/* Declare the Open MPI publishable events to the FTB */
ret = FTB_Declare_publishable_events(ftb_client_handle, schema_file, NULL, 0);
free(schema_file);
ret = FTB_Declare_publishable_events(ftb_client_handle, 0, ftb_event_info, ftb_event_info_count);
if (FTB_SUCCESS != ret) {
orte_show_help("help-orte-notifier-ftb.txt",
"declare events failed",
true, "FTB_Declare_publishable_events() failed", ret);
orte_show_help("help-orte-notifier-ftb.txt", "declare events failed", true,
"FTB_Declare_publishable_events() failed", ret);
FTB_Disconnect(ftb_client_handle);
return ORTE_ERROR;
@ -125,53 +95,109 @@ static void finalize(void) {
FTB_Disconnect(ftb_client_handle);
}
static void send_to_ftb(int errcode, char *payload)
/*
 * Translate an ORTE notifier severity into the severity name reported
 * alongside an FTB event.
 *
 * @param severity  ORTE notifier severity level
 * @return          pointer to a string literal (never NULL, must not be
 *                  freed); "UNKNOWN" for an unrecognized severity
 */
static const char* get_ftb_event_severity(orte_notifier_base_severity_t severity)
{
    switch (severity) {
    case ORTE_NOTIFIER_EMERG:
    case ORTE_NOTIFIER_ALERT:
        return "ALL";
    case ORTE_NOTIFIER_CRIT:
        return "FATAL";
    case ORTE_NOTIFIER_ERROR:
        return "ERROR";
    case ORTE_NOTIFIER_WARN:
    case ORTE_NOTIFIER_NOTICE:
        return "WARNING";
    case ORTE_NOTIFIER_INFO:
    case ORTE_NOTIFIER_DEBUG:
        return "INFO";
    default:
        return "UNKNOWN";
    }
}
/*
 * Pick the FTB event name that corresponds to an error code.
 *
 * @param errnum  error code reported to the notifier
 * @return        pointer to a string literal naming the FTB event (never
 *                NULL, must not be freed); "MPI_UNKNOWN_ERROR" when there
 *                is no dedicated event for the code
 *
 * NOTE(review): errnum is compared against the ORTE_ERR_* constants as-is;
 * confirm that callers never pass an encoded/decorated error value.
 */
static const char* get_ftb_event_name(int errnum)
{
    /* Only OMPI errors (values below OPAL_SUCCESS) are translated to
     * an equivalent FTB event; everything else is reported as unknown */
    if (errnum >= OPAL_SUCCESS) {
        return FTB_ERROR(MPI_UNKNOWN_ERROR);
    }

    switch (errnum) {
    case ORTE_ERR_OUT_OF_RESOURCE:
    case ORTE_ERR_TEMP_OUT_OF_RESOURCE:
        return FTB_ERROR(MPI_OUT_OF_RESOURCE);

    case ORTE_ERR_CONNECTION_REFUSED:
    case ORTE_ERR_CONNECTION_FAILED:
    case ORTE_ERR_UNREACH:
        return FTB_ERROR(MPI_NODE_DEAD);

    case ORTE_ERR_COMM_FAILURE:
        return FTB_ERROR(MPI_COMM_FAILURE);

    case ORTE_ERR_PROC_DEAD:
        return FTB_ERROR(MPI_RANK_DEAD);

    case ORTE_ERR_FATAL:
    default:
        return FTB_ERROR(MPI_UNKNOWN_ERROR);
    }
}
/*
 * Publish a single event to the Fault Tolerant Backplane.
 *
 * The event name is derived from errcode via get_ftb_event_name().  If
 * publishing fails, a "publish failed" help message is shown; the failure
 * is not reported to the caller.
 *
 * @param severity  notifier severity; only used in the failure message
 * @param errcode   error code that selects the FTB event name
 * @param payload   text to attach to the event; may be NULL, in which
 *                  case an empty payload is sent.  Not modified, not freed.
 */
static void publish_ftb_event(orte_notifier_base_severity_t severity, int errcode, char *payload)
{
    int ret;
    const char *event_name;
    FTB_event_handle_t ehandle;
    FTB_event_properties_t eprop;

    /* Only normal FTB events are supported currently. */
    eprop.event_type = (int) FTB_EVENT_NORMAL;

    /* Copy the event payload.  snprintf() (unlike strncpy()) always
     * NUL-terminates, and a missing payload becomes an empty string so
     * the buffer is never left uninitialized. */
    snprintf(eprop.event_payload, FTB_MAX_PAYLOAD_DATA, "%s",
             (NULL != payload) ? payload : "");

    /* Publish the event to the Fault Tolerant Backplane */
    event_name = get_ftb_event_name(errcode);
    ret = FTB_Publish(ftb_client_handle, event_name, &eprop, &ehandle);
    if (FTB_SUCCESS != ret) {
        /* Report the payload as it was actually sent; this is also
         * safe when the caller passed a NULL payload */
        orte_show_help("help-orte-notifier-ftb.txt", "publish failed", true,
                       "FTB_Publish() failed", ret, get_ftb_event_severity(severity),
                       event_name, eprop.event_payload, errcode);
    }
}
/*
 * Notifier "log" entry point: format a printf-style message and publish
 * it as the payload of an FTB event.
 *
 * @param severity  notifier severity, forwarded to publish_ftb_event()
 * @param errcode   error code, forwarded to publish_ftb_event()
 * @param msg       printf-style format string; nothing is published if NULL
 * @param ap        arguments for msg
 */
static void ftb_log(orte_notifier_base_severity_t severity, int errcode, const char *msg,
                    va_list ap)
{
    char *payload = NULL;

    /* Without a format string there is nothing to output */
    if (NULL == msg) {
        return;
    }

    /* When vasprintf() fails the pointer it was given cannot be relied
     * on, so its return value is what decides whether payload is usable */
    if (0 > vasprintf(&payload, msg, ap) || NULL == payload) {
        return;
    }

    publish_ftb_event(severity, errcode, payload);
    free(payload);
}
/*
 * Notifier "help" entry point: render a help-file topic to a string and
 * publish it as the payload of an FTB event.
 *
 * @param severity  notifier severity, forwarded to publish_ftb_event()
 * @param errcode   error code, forwarded to publish_ftb_event()
 * @param filename  help file containing the topic
 * @param topic     topic to render
 * @param ap        arguments substituted into the topic text
 */
static void ftb_help(orte_notifier_base_severity_t severity, int errcode,
                     const char *filename, const char *topic, va_list ap)
{
    char *payload = opal_show_help_vstring(filename, topic, false, ap);

    /* Nothing to publish if the help text could not be rendered */
    if (NULL == payload) {
        return;
    }

    publish_ftb_event(severity, errcode, payload);
    free(payload);
}
static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap)
orte_process_name_t *peer_proc, const char *msg,
va_list ap)
{
char payload[FTB_MAX_PAYLOAD_DATA + 1];
char *peer_host = NULL;
@ -191,5 +217,5 @@ static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
}
payload[FTB_MAX_PAYLOAD_DATA] = '\0';
send_to_ftb(errcode, payload);
publish_ftb_event(severity, errcode, payload);
}