1
1
openmpi/orte/mca/notifier/ftb/notifier_ftb_module.c
Abhishek Kulkarni 87d2c9b31d Few fault tolerance updates related to the CIFTS project (http://www.mcs.anl.gov/research/cifts/)
* Improve the FTB notifier to publish (C/R, process/communication failure) events to the FTB with the
   OMPI jobid as the associated payload.
 * Add notifier calls for C/R events and process status events in SnapC and ErrMgr components.
 * Fix a bug where the SnapC states and process states collide before being thrown out over the notifier.

This commit was SVN r24251.
2011-01-13 20:13:49 +00:00

295 lines
8.9 KiB
C

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <string.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif
#include "opal/mca/installdirs/installdirs.h"
#include "opal/util/show_help.h"
#include "opal/util/os_path.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/show_help.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/notifier/base/base.h"
#include "notifier_ftb.h"
/*
 * Static APIs: forward declarations of the file-local functions that are
 * stored in the orte_notifier_ftb_module table defined in this file.
 */
/* Declares the publishable events to the FTB; returns an ORTE status code. */
static int init(void);
/* Disconnects from the FTB when the client handle is marked valid. */
static void finalize(void);
/* Publishes an event whose payload comes from a printf-style message. */
static void ftb_log(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap);
/* Publishes an event whose payload comes from a show_help file/topic pair. */
static void ftb_help(orte_notifier_base_severity_t severity, int errcode,
const char *filename, const char *topic, va_list ap);
/* Like ftb_log, but also receives the peer process the event refers to. */
static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc, const char *msg,
va_list ap);
/*
 * Module def: the FTB notifier module instance.  The initializer is
 * positional and lists, in order, init, finalize, ftb_log, ftb_help and
 * ftb_peer.  The last slot is left NULL.
 * NOTE(review): presumably an optional entry point this module does not
 * implement -- confirm against the orte_notifier_base_module_t definition.
 */
orte_notifier_base_module_t orte_notifier_ftb_module = {
init,
finalize,
ftb_log,
ftb_help,
ftb_peer,
NULL
};
/* FTB client information */
/* NOTE(review): not referenced by the functions in this file; presumably
 * filled in by the code that connects to the FTB -- confirm. */
FTB_client_t ftb_client_info;
/* FTB client handle */
/* Passed by value to every FTB_* call in this file; finalize() tests its
 * `valid` member before disconnecting.
 * NOTE(review): nothing in this file connects the handle, so it is
 * presumably set up elsewhere before init() runs -- confirm. */
FTB_client_handle_t ftb_client_handle;
/*
 * Declare Open MPI's publishable events to the FTB.
 *
 * The event schema file is looked up under the package data directory; if
 * that path cannot be built, the bare file name is used as a fallback.
 *
 * Returns ORTE_SUCCESS once the events have been declared.  On failure
 * (no schema path could be allocated, or the FTB rejected the declaration)
 * the FTB client is disconnected and ORTE_ERROR is returned.
 */
static int init(void) {
    int ret;
    char *schema_file;

    /* Locate the FTB events schema file */
    schema_file = opal_os_path(false, opal_install_dirs.pkgdatadir,
                               "help-ftb-event-schema.txt", NULL);
    if (NULL == schema_file) {
        schema_file = strdup("help-ftb-event-schema.txt");
    }
    if (NULL == schema_file) {
        /* Both allocations failed: do not hand a NULL schema path to the
         * FTB; fail the same way as a rejected declaration does. */
        FTB_Disconnect(ftb_client_handle);
        return ORTE_ERROR;
    }

    /* Declare the Open MPI publishable events to the FTB */
    ret = FTB_Declare_publishable_events(ftb_client_handle, schema_file, NULL, 0);
    free(schema_file);

    if (FTB_SUCCESS != ret) {
        orte_show_help("help-orte-notifier-ftb.txt", "declare events failed", true,
                       "FTB_Declare_publishable_events() failed", ret);
        FTB_Disconnect(ftb_client_handle);
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
/*
 * Tear the module down: disconnect the client from the FTB, but only when
 * the FTB client handle is flagged as valid.
 */
static void finalize(void) {
    if (1 != ftb_client_handle.valid) {
        /* No valid handle, so there is nothing to disconnect. */
        return;
    }

    FTB_Disconnect(ftb_client_handle);
}
/*
 * Translate a notifier severity into the severity label used when reporting
 * FTB events.
 *
 *   EMERG, ALERT   -> "ALL"
 *   CRIT           -> "FATAL"
 *   ERROR          -> "ERROR"
 *   WARN, NOTICE   -> "WARNING"
 *   INFO, DEBUG    -> "INFO"
 *   anything else  -> "UNKNOWN"
 *
 * The returned pointer refers to a string literal and must not be freed.
 */
static const char* get_ftb_event_severity(orte_notifier_base_severity_t severity)
{
    if (ORTE_NOTIFIER_EMERG == severity || ORTE_NOTIFIER_ALERT == severity) {
        return "ALL";
    }
    if (ORTE_NOTIFIER_CRIT == severity) {
        return "FATAL";
    }
    if (ORTE_NOTIFIER_ERROR == severity) {
        return "ERROR";
    }
    if (ORTE_NOTIFIER_WARN == severity || ORTE_NOTIFIER_NOTICE == severity) {
        return "WARNING";
    }
    if (ORTE_NOTIFIER_INFO == severity || ORTE_NOTIFIER_DEBUG == severity) {
        return "INFO";
    }

    return "UNKNOWN";
}
/*
 * Map a notifier error/state code onto the name of the FTB event to publish.
 *
 * Codes accepted by CHECK_ORTE_SNAPC_CKPT_STATE() are first converted with
 * ORTE_SNAPC_CKPT_STATE() and then matched against the checkpoint, restart
 * and migration states.  All other codes are matched against the process and
 * communication failure codes.
 *
 * Returns the event name, or NULL when the code has no associated FTB event.
 */
static const char* get_ftb_event_name(int errnum)
{
    const char *event = NULL;

    if ( CHECK_ORTE_SNAPC_CKPT_STATE(errnum) ) {
        /* Checkpoint/restart and migration events */
        int state = ORTE_SNAPC_CKPT_STATE(errnum);

        switch (state) {
        case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
            event = FTB_EVENT(FTB_MPI_PROCS_CKPTED);
            break;

        case ORTE_SNAPC_CKPT_STATE_NO_CKPT:
        case ORTE_SNAPC_CKPT_STATE_ERROR:
            event = FTB_EVENT(FTB_MPI_PROCS_CKPT_FAIL);
            break;

        /* Restart events */
        case ORTE_SNAPC_CKPT_STATE_RECOVERED:
            event = FTB_EVENT(FTB_MPI_PROCS_RESTARTED);
            break;

        case ORTE_SNAPC_CKPT_STATE_NO_RESTART:
            event = FTB_EVENT(FTB_MPI_PROCS_RESTART_FAIL);
            break;

        /* Process migration events */
        case ORTE_ERRMGR_MIGRATE_STATE_FINISH:
            event = FTB_EVENT(FTB_MPI_PROCS_MIGRATED);
            break;

        case ORTE_ERRMGR_MIGRATE_STATE_ERROR:
        case ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS:
            event = FTB_EVENT(FTB_MPI_PROCS_MIGRATE_FAIL);
            break;

        default:
            break;
        }

        return event;
    }

    /* Process and communication failure events */
    switch (errnum) {
    case ORTE_ERR_CONNECTION_REFUSED:
    case ORTE_ERR_CONNECTION_FAILED:
    case ORTE_ERR_UNREACH:
    case ORTE_PROC_STATE_HEARTBEAT_FAILED:
        event = FTB_EVENT(FTB_MPI_PROCS_UNREACHABLE);
        break;

    case ORTE_ERR_COMM_FAILURE:
    case ORTE_PROC_STATE_COMM_FAILED:
        event = FTB_EVENT(FTB_MPI_PROCS_COMM_ERROR);
        break;

    case ORTE_PROC_STATE_FAILED_TO_START:
    case ORTE_PROC_STATE_CALLED_ABORT:
        event = FTB_EVENT(FTB_MPI_PROCS_ABORTED);
        break;

    case ORTE_PROC_STATE_ABORTED:
    case ORTE_PROC_STATE_ABORTED_BY_SIG:
    case ORTE_PROC_STATE_TERM_WO_SYNC:
    case ORTE_PROC_STATE_TERMINATED:
    case ORTE_PROC_STATE_KILLED_BY_CMD:
        event = FTB_EVENT(FTB_MPI_PROCS_DEAD);
        break;

    default:
        break;
    }

    return event;
}
/*
 * Extracts the FTB payload from a notifier message.
 *
 * The payload is the span running from the first '[' to the last ']' of
 * src, brackets included.  For instance "<FTB message [payload]>" yields
 * "[payload]".  When src has no such span (a bracket is missing, or the
 * last ']' comes before the first '['), the whole message is the payload.
 *
 * At most size - 1 characters are stored and dest is always NUL-terminated,
 * so an over-long payload is truncated rather than left unterminated.
 *
 * dest  destination buffer of at least size bytes
 * src   NUL-terminated notifier message (NULL is treated as empty)
 * size  capacity of dest in bytes, terminator included
 *
 * Returns the number of characters stored in dest, not counting the
 * terminating NUL.  Nothing is written when dest is NULL or size is 0.
 */
static unsigned int extract_payload(char *dest, const char *src, unsigned int size)
{
    const char *start, *lbrace, *rbrace;
    size_t len;

    if (NULL == dest || 0 == size) {
        return 0;
    }
    if (NULL == src) {
        dest[0] = '\0';
        return 0;
    }

    lbrace = strchr(src, '[');
    rbrace = strrchr(src, ']');

    if (NULL != lbrace && NULL != rbrace && rbrace > lbrace) {
        start = lbrace;
        len = (size_t) (rbrace - lbrace) + 1;
    } else {
        start = src;
        len = strlen(src);
    }

    /* Always keep one byte free for the terminator. */
    if (len > (size_t) size - 1) {
        len = (size_t) size - 1;
    }

    memcpy(dest, start, len);
    dest[len] = '\0';

    return (unsigned int) len;
}
/*
 * Publish one event to the Fault Tolerant Backplane.
 *
 * errcode is translated to an FTB event name with get_ftb_event_name();
 * codes without an associated event are dropped silently.  A failed
 * FTB_Publish() is reported through orte_show_help().
 *
 * severity  notifier severity, used only in the failure report
 * errcode   notifier error/state code selecting the event
 * eprop     event properties (type and payload) handed to FTB_Publish()
 */
static void publish_ftb_event(orte_notifier_base_severity_t severity, int errcode,
                              FTB_event_properties_t *eprop)
{
    FTB_event_handle_t published;
    const char *name;
    int rc;

    name = get_ftb_event_name(errcode);
    if (NULL == name) {
        /* This code has no FTB event associated with it. */
        return;
    }

    rc = FTB_Publish(ftb_client_handle, name, eprop, &published);
    if (FTB_SUCCESS == rc) {
        return;
    }

    orte_show_help("help-orte-notifier-ftb.txt", "publish failed", true,
                   "FTB_Publish() failed", rc, get_ftb_event_severity(severity),
                   name, eprop->event_payload, errcode);
}
/*
 * Notifier "log" entry point: format a printf-style message and publish the
 * corresponding FTB event.
 *
 * severity  notifier severity of the event
 * errcode   notifier error/state code selecting the FTB event
 * msg       printf-style format string; a NULL msg publishes nothing
 * ap        arguments for msg
 *
 * Nothing is published when the message cannot be formatted.
 */
static void ftb_log(orte_notifier_base_severity_t severity, int errcode, const char *msg,
                    va_list ap)
{
    char *payload = NULL;
    FTB_event_properties_t ev_prop;

    if (NULL == msg) {
        return;
    }

    /* Start from a fully defined structure so no uninitialized bytes are
     * ever handed to the FTB. */
    memset(&ev_prop, 0, sizeof(ev_prop));

    /* Only normal FTB events are supported currently. */
    ev_prop.event_type = (int) FTB_EVENT_NORMAL;

    /* Copy the event payload, if we have one.  vasprintf() reports failure
     * through its return value; the pointer is not reliable in that case. */
    if (vasprintf(&payload, msg, ap) < 0 || NULL == payload) {
        return;
    }

    extract_payload(ev_prop.event_payload, payload, FTB_MAX_PAYLOAD_DATA);
    free(payload);

    publish_ftb_event(severity, errcode, &ev_prop);
}
/*
 * Notifier "help" entry point: render a show_help message and publish the
 * corresponding FTB event.
 *
 * severity  notifier severity of the event
 * errcode   notifier error/state code selecting the FTB event
 * filename  show_help file passed to opal_show_help_vstring()
 * topic     topic within that file
 * ap        arguments for the help message
 *
 * Nothing is published when opal_show_help_vstring() returns NULL.
 */
static void ftb_help(orte_notifier_base_severity_t severity, int errcode,
                     const char *filename, const char *topic, va_list ap)
{
    char *payload;
    FTB_event_properties_t ev_prop;

    /* Start from a fully defined structure so no uninitialized bytes are
     * ever handed to the FTB. */
    memset(&ev_prop, 0, sizeof(ev_prop));

    /* Only normal FTB events are supported currently. */
    ev_prop.event_type = (int) FTB_EVENT_NORMAL;

    payload = opal_show_help_vstring(filename, topic, false, ap);
    if (NULL == payload) {
        return;
    }

    extract_payload(ev_prop.event_payload, payload, FTB_MAX_PAYLOAD_DATA);
    free(payload);

    publish_ftb_event(severity, errcode, &ev_prop);
}
/*
 * Notifier "peer" entry point: format a printf-style message about a peer
 * process and publish the corresponding FTB event.
 *
 * severity   notifier severity of the event
 * errcode    notifier error/state code selecting the FTB event
 * peer_proc  process the event refers to; may be NULL
 * msg        printf-style format string; a NULL msg publishes nothing
 * ap         arguments for msg
 *
 * Nothing is published when the message cannot be formatted.
 */
static void ftb_peer(orte_notifier_base_severity_t severity, int errcode,
                     orte_process_name_t *peer_proc, const char *msg,
                     va_list ap)
{
    char *payload = NULL;
    FTB_event_properties_t ev_prop;

    if (NULL == msg) {
        return;
    }

    /* Start from a fully defined structure so no uninitialized bytes are
     * ever handed to the FTB. */
    memset(&ev_prop, 0, sizeof(ev_prop));

    /* Only normal FTB events are supported currently. */
    ev_prop.event_type = (int) FTB_EVENT_NORMAL;

    if (peer_proc) {
        /* Ignore the peer_host for now: the lookup is still performed, but
         * its result is not part of the published event. */
        (void) orte_ess.proc_get_hostname(peer_proc);
    }

    /* vasprintf() reports failure through its return value; the pointer is
     * not reliable in that case. */
    if (vasprintf(&payload, msg, ap) < 0 || NULL == payload) {
        return;
    }

    extract_payload(ev_prop.event_payload, payload, FTB_MAX_PAYLOAD_DATA);
    free(payload);

    publish_ftb_event(severity, errcode, &ev_prop);
}