OMPI FTB component updates
* register FTB events from an event schema file
* define more FTB events
* minor fixes

This commit was SVN r23180.
Этот коммит содержится в:
родитель
b56ab33ff6
Коммит
118ce0e166
@ -19,7 +19,8 @@
|
|||||||
AM_CPPFLAGS = $(notifier_ftb_CPPFLAGS)
|
AM_CPPFLAGS = $(notifier_ftb_CPPFLAGS)
|
||||||
|
|
||||||
dist_pkgdata_DATA = \
|
dist_pkgdata_DATA = \
|
||||||
help-orte-notifier-ftb.txt
|
help-orte-notifier-ftb.txt \
|
||||||
|
help-ftb-event-schema.txt
|
||||||
|
|
||||||
sources = \
|
sources = \
|
||||||
notifier_ftb.h \
|
notifier_ftb.h \
|
||||||
|
37
orte/mca/notifier/ftb/help-ftb-event-schema.txt
Обычный файл
37
orte/mca/notifier/ftb/help-ftb-event-schema.txt
Обычный файл
@ -0,0 +1,37 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2010 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology Corporation.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
# This is the FTB event schema file for Open MPI's FTB notifier
|
||||||
|
#
|
||||||
|
start
|
||||||
|
|
||||||
|
ftb.mpi.openmpi
|
||||||
|
|
||||||
|
MPI_INIT INFO
|
||||||
|
MPI_FINALIZE INFO
|
||||||
|
MPI_NODE_DEAD ERROR
|
||||||
|
MPI_NODE_RESTORED INFO
|
||||||
|
MPI_RANK_DEAD ERROR
|
||||||
|
MPI_RANK_RESTORED INFO
|
||||||
|
MPI_NODE_MIGRATE_DONE INFO
|
||||||
|
MPI_JOB_ABORT_CMD ERROR
|
||||||
|
MPI_JOB_RESUME_CMD INFO
|
||||||
|
MPI_JOB_ABORTED ERROR
|
||||||
|
MPI_JOB_RESUMED INFO
|
||||||
|
MPI_MSG_CORRUPT ERROR
|
||||||
|
MPI_IFACE_DEAD ERROR
|
||||||
|
MPI_IFACE_RESTORED ERROR
|
||||||
|
MPI_UNKNOWN_ERROR ERROR
|
||||||
|
MPI_OUT_OF_RESOURCE ERROR
|
||||||
|
MPI_NODE_UNREACHABLE ERROR
|
||||||
|
MPI_COMM_FAILURE ERROR
|
||||||
|
|
||||||
|
end
|
@ -39,20 +39,23 @@ typedef struct {
|
|||||||
int priority;
|
int priority;
|
||||||
} orte_notifier_ftb_component_t;
|
} orte_notifier_ftb_component_t;
|
||||||
|
|
||||||
/*
|
/* Notifier interfaces */
|
||||||
* Notifier interfaces
|
|
||||||
*/
|
|
||||||
|
|
||||||
ORTE_MODULE_DECLSPEC extern orte_notifier_ftb_component_t mca_notifier_ftb_component;
|
ORTE_MODULE_DECLSPEC extern orte_notifier_ftb_component_t mca_notifier_ftb_component;
|
||||||
extern orte_notifier_base_module_t orte_notifier_ftb_module;
|
extern orte_notifier_base_module_t orte_notifier_ftb_module;
|
||||||
|
|
||||||
/*
|
/* FTB client information */
|
||||||
* FTB client information
|
|
||||||
*/
|
|
||||||
|
|
||||||
extern FTB_client_t ftb_client_info;
|
extern FTB_client_t ftb_client_info;
|
||||||
extern FTB_client_handle_t ftb_client_handle;
|
extern FTB_client_handle_t ftb_client_handle;
|
||||||
|
|
||||||
|
/* FTB event types */
|
||||||
|
typedef enum {
|
||||||
|
FTB_EVENT_NORMAL = 1,
|
||||||
|
FTB_EVENT_RESPONSE = 2
|
||||||
|
} ftb_event_type_t;
|
||||||
|
|
||||||
|
/* Macro that returns FTB event name given the FTB event code */
|
||||||
|
#define FTB_ERROR(errnum) #errnum
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -30,15 +30,15 @@
|
|||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "opal/mca/installdirs/installdirs.h"
|
||||||
#include "opal/util/show_help.h"
|
#include "opal/util/show_help.h"
|
||||||
|
#include "opal/util/os_path.h"
|
||||||
|
|
||||||
#include "orte/util/error_strings.h"
|
|
||||||
#include "orte/mca/ess/ess.h"
|
#include "orte/mca/ess/ess.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/mca/notifier/base/base.h"
|
#include "orte/mca/notifier/base/base.h"
|
||||||
#include "notifier_ftb.h"
|
#include "notifier_ftb.h"
|
||||||
|
|
||||||
|
|
||||||
/* Static API's */
|
/* Static API's */
|
||||||
static int init(void);
|
static int init(void);
|
||||||
static void finalize(void);
|
static void finalize(void);
|
||||||
@ -66,53 +66,23 @@ FTB_client_t ftb_client_info;
|
|||||||
/* FTB client handle */
|
/* FTB client handle */
|
||||||
FTB_client_handle_t ftb_client_handle;
|
FTB_client_handle_t ftb_client_handle;
|
||||||
|
|
||||||
static FTB_event_info_t ftb_event_info[] = {
|
|
||||||
/* 0 */ {"UNKNOWN_ERROR", "error"},
|
|
||||||
/* 1 */ {"OUT_OF_RESOURCES", "error"},
|
|
||||||
/* 2 */ {"UNREACHABLE", "error"},
|
|
||||||
/* 3 */ {"COMM_FAILURE", "error"},
|
|
||||||
/* 4 */ {"FATAL", "fatal"},
|
|
||||||
};
|
|
||||||
static const int ftb_event_info_count = sizeof(ftb_event_info)/sizeof(FTB_event_info_t);
|
|
||||||
|
|
||||||
static int orte_err2ftb(int errnum)
|
|
||||||
{
|
|
||||||
int retval;
|
|
||||||
|
|
||||||
switch (OPAL_SOS_GET_ERROR_CODE(errnum)) {
|
|
||||||
case ORTE_ERR_OUT_OF_RESOURCE:
|
|
||||||
case ORTE_ERR_TEMP_OUT_OF_RESOURCE:
|
|
||||||
retval = 1;
|
|
||||||
break;
|
|
||||||
case ORTE_ERR_CONNECTION_REFUSED:
|
|
||||||
case ORTE_ERR_CONNECTION_FAILED:
|
|
||||||
case ORTE_ERR_UNREACH:
|
|
||||||
retval = 2;
|
|
||||||
break;
|
|
||||||
case ORTE_ERR_COMM_FAILURE:
|
|
||||||
retval = 3;
|
|
||||||
break;
|
|
||||||
case ORTE_ERR_FATAL:
|
|
||||||
retval = 4;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
retval = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((ftb_event_info_count <= retval) || (0 > retval)) {
|
|
||||||
retval = 0;
|
|
||||||
}
|
|
||||||
return retval;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int init(void) {
|
static int init(void) {
|
||||||
int ret;
|
int ret;
|
||||||
|
char *schema_file;
|
||||||
|
|
||||||
|
/* Locate the FTB events schema file */
|
||||||
|
if (NULL == (schema_file = opal_os_path(false, opal_install_dirs.pkgdatadir,
|
||||||
|
"help-ftb-event-schema.txt", NULL))) {
|
||||||
|
schema_file = strdup("help-ftb-event-schema.txt");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Declare the Open MPI publishable events to the FTB */
|
||||||
|
ret = FTB_Declare_publishable_events(ftb_client_handle, schema_file, NULL, 0);
|
||||||
|
free(schema_file);
|
||||||
|
|
||||||
ret = FTB_Declare_publishable_events(ftb_client_handle, 0, ftb_event_info, ftb_event_info_count);
|
|
||||||
if (FTB_SUCCESS != ret) {
|
if (FTB_SUCCESS != ret) {
|
||||||
orte_show_help("help-orte-notifier-ftb.txt",
|
orte_show_help("help-orte-notifier-ftb.txt", "declare events failed", true,
|
||||||
"declare events failed",
|
"FTB_Declare_publishable_events() failed", ret);
|
||||||
true, "FTB_Declare_publishable_events() failed", ret);
|
|
||||||
|
|
||||||
FTB_Disconnect(ftb_client_handle);
|
FTB_Disconnect(ftb_client_handle);
|
||||||
return ORTE_ERROR;
|
return ORTE_ERROR;
|
||||||
@ -125,53 +95,109 @@ static void finalize(void) {
|
|||||||
FTB_Disconnect(ftb_client_handle);
|
FTB_Disconnect(ftb_client_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void send_to_ftb(int errcode, char *payload)
|
static const char* get_ftb_event_severity(orte_notifier_base_severity_t severity)
|
||||||
{
|
{
|
||||||
int ret, event_id;
|
switch (severity) {
|
||||||
|
case ORTE_NOTIFIER_EMERG:
|
||||||
|
case ORTE_NOTIFIER_ALERT:
|
||||||
|
return "ALL";
|
||||||
|
case ORTE_NOTIFIER_CRIT:
|
||||||
|
return "FATAL";
|
||||||
|
case ORTE_NOTIFIER_ERROR:
|
||||||
|
return "ERROR";
|
||||||
|
case ORTE_NOTIFIER_WARN:
|
||||||
|
case ORTE_NOTIFIER_NOTICE:
|
||||||
|
return "WARNING";
|
||||||
|
case ORTE_NOTIFIER_INFO:
|
||||||
|
case ORTE_NOTIFIER_DEBUG:
|
||||||
|
return "INFO";
|
||||||
|
default:
|
||||||
|
return "UNKNOWN";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char* get_ftb_event_name(int errnum)
|
||||||
|
{
|
||||||
|
/* If it is an OMPI error, translate it to an equivalent FTB event */
|
||||||
|
if (OPAL_SUCCESS > errnum) {
|
||||||
|
switch (errnum) {
|
||||||
|
case ORTE_ERR_OUT_OF_RESOURCE:
|
||||||
|
case ORTE_ERR_TEMP_OUT_OF_RESOURCE:
|
||||||
|
return FTB_ERROR(MPI_OUT_OF_RESOURCE);
|
||||||
|
|
||||||
|
case ORTE_ERR_CONNECTION_REFUSED:
|
||||||
|
case ORTE_ERR_CONNECTION_FAILED:
|
||||||
|
case ORTE_ERR_UNREACH:
|
||||||
|
return FTB_ERROR(MPI_NODE_DEAD);
|
||||||
|
|
||||||
|
case ORTE_ERR_COMM_FAILURE:
|
||||||
|
return FTB_ERROR(MPI_COMM_FAILURE);
|
||||||
|
|
||||||
|
case ORTE_ERR_PROC_DEAD:
|
||||||
|
return FTB_ERROR(MPI_RANK_DEAD);
|
||||||
|
|
||||||
|
case ORTE_ERR_FATAL:
|
||||||
|
default:
|
||||||
|
return FTB_ERROR(MPI_UNKNOWN_ERROR);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return FTB_ERROR(MPI_UNKNOWN_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void publish_ftb_event(orte_notifier_base_severity_t severity, int errcode, char *payload)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
const char *event_name;
|
||||||
FTB_event_handle_t ehandle;
|
FTB_event_handle_t ehandle;
|
||||||
FTB_event_properties_t eprop;
|
FTB_event_properties_t eprop;
|
||||||
eprop.event_type = 1;
|
|
||||||
snprintf(eprop.event_payload, FTB_MAX_PAYLOAD_DATA, "%s", (payload != NULL) ? payload : "");
|
|
||||||
|
|
||||||
event_id = orte_err2ftb(errcode);
|
/* Only normal FTB events are supported currently. */
|
||||||
ret = FTB_Publish(ftb_client_handle, ftb_event_info[event_id].event_name, &eprop, &ehandle);
|
eprop.event_type = (int) FTB_EVENT_NORMAL;
|
||||||
|
|
||||||
|
/* Copy the event payload, if we have one */
|
||||||
|
if (NULL != payload) {
|
||||||
|
strncpy(eprop.event_payload, payload, FTB_MAX_PAYLOAD_DATA);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Publish the event to the Fault Tolerant Backplane */
|
||||||
|
event_name = get_ftb_event_name(errcode);
|
||||||
|
ret = FTB_Publish(ftb_client_handle, event_name, &eprop, &ehandle);
|
||||||
if (FTB_SUCCESS != ret) {
|
if (FTB_SUCCESS != ret) {
|
||||||
orte_show_help("help-orte-notifier-ftb.txt",
|
orte_show_help("help-orte-notifier-ftb.txt", "publish failed", true,
|
||||||
"publish failed",
|
"FTB_Publish() failed", ret, get_ftb_event_severity(severity),
|
||||||
true, "FTB_Publish() failed", ret,
|
event_name, payload, errcode);
|
||||||
ftb_event_info[event_id].severity,
|
|
||||||
ftb_event_info[event_id].event_name,
|
|
||||||
eprop.event_payload, errcode);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ftb_log(orte_notifier_base_severity_t severity, int errcode, const char *msg,
|
static void ftb_log(orte_notifier_base_severity_t severity, int errcode, const char *msg,
|
||||||
va_list ap)
|
va_list ap)
|
||||||
{
|
{
|
||||||
char *payload;
|
char *payload;
|
||||||
|
|
||||||
/* If there was a message, output it */
|
/* If there was a message, output it */
|
||||||
vasprintf(&payload, msg, ap);
|
vasprintf(&payload, msg, ap);
|
||||||
if (NULL != payload) {
|
if (NULL != payload) {
|
||||||
send_to_ftb(errcode, payload);
|
publish_ftb_event(severity, errcode, payload);
|
||||||
free(payload);
|
free(payload);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ftb_help(orte_notifier_base_severity_t severity, int errcode,
|
static void ftb_help(orte_notifier_base_severity_t severity, int errcode,
|
||||||
const char *filename, const char *topic, va_list ap)
|
const char *filename, const char *topic, va_list ap)
|
||||||
{
|
{
|
||||||
char *output = opal_show_help_vstring(filename, topic, false, ap);
|
char *payload;
|
||||||
|
|
||||||
if (NULL != output) {
|
payload = opal_show_help_vstring(filename, topic, false, ap);
|
||||||
send_to_ftb(errcode, output);
|
if (NULL != payload) {
|
||||||
free(output);
|
publish_ftb_event(severity, errcode, payload);
|
||||||
|
free(payload);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
|
static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
|
||||||
orte_process_name_t *peer_proc, const char *msg,
|
orte_process_name_t *peer_proc, const char *msg,
|
||||||
va_list ap)
|
va_list ap)
|
||||||
{
|
{
|
||||||
char payload[FTB_MAX_PAYLOAD_DATA + 1];
|
char payload[FTB_MAX_PAYLOAD_DATA + 1];
|
||||||
char *peer_host = NULL;
|
char *peer_host = NULL;
|
||||||
@ -191,5 +217,5 @@ static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
|
|||||||
}
|
}
|
||||||
|
|
||||||
payload[FTB_MAX_PAYLOAD_DATA] = '\0';
|
payload[FTB_MAX_PAYLOAD_DATA] = '\0';
|
||||||
send_to_ftb(errcode, payload);
|
publish_ftb_event(severity, errcode, payload);
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user