1
1

Add a new "sensor" module that supports fault tolerance tests: it randomly kills local procs and/or the daemon itself.

This commit was SVN r24960.
Этот коммит содержится в:
Ralph Castain 2011-07-29 20:48:22 +00:00
родитель e88a6c93da
Коммит 70bca4691f
5 изменённых файлов: 411 добавлений и 0 удалений

35
orte/mca/sensor/ft_tester/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
#
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Source files that make up the ft_tester sensor component; both library
# flavors below are built from this same list.
sources = \
sensor_ft_tester.c \
sensor_ft_tester.h \
sensor_ft_tester_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the MCA_BUILD_orte_sensor_ft_tester_DSO conditional.
if MCA_BUILD_orte_sensor_ft_tester_DSO
component_noinst =
component_install = mca_sensor_ft_tester.la
else
component_noinst = libmca_sensor_ft_tester.la
component_install =
endif
# DSO flavor: listed under mcacomponent_LTLIBRARIES, so it is installed
# into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_ft_tester_la_SOURCES = $(sources)
mca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
# Static flavor: listed under noinst_LTLIBRARIES, so it is built but not
# installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_ft_tester_la_SOURCES =$(sources)
libmca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version

21
orte/mca/sensor/ft_tester/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,21 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_ft_tester_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers orte/mca/sensor/ft_tester/Makefile as a configure output file,
# then runs action-if-found when the shell variable orte_want_sensors is
# "1" and action-if-not-found otherwise.
AC_DEFUN([MCA_orte_sensor_ft_tester_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/ft_tester/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

189
orte/mca/sensor/ft_tester/sensor_ft_tester.c Обычный файл
Просмотреть файл

@ -0,0 +1,189 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#include "opal_stdint.h"
#include "opal/util/output.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_ft_tester.h"
/* declare the API functions (defined below; exposed only through the
 * module table that follows) */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
/* instantiate the module: the function table handed out by the
 * component's query function */
orte_sensor_base_module_t orte_sensor_ft_tester_module = {
init,
finalize,
start,
stop
};
/* declare the local functions */
/* timer callback that decides whether to kill the daemon and/or children */
static void sample(int fd, short event, void *arg);
/* local globals */
/* timer event driving sample(); allocated in start(), released and reset
 * to NULL in stop()/finalize(), so NULL means "not sampling" */
static opal_event_t *sample_ev = NULL;
/* interval between two sample() invocations; filled from the component's
 * fail_rate (seconds) in start() */
static struct timeval sample_time;
/*
 * Module init hook.
 *
 * Returns ORTE_SUCCESS when a non-zero fail_rate has been configured on
 * the component, and ORTE_ERROR when fail_rate is 0 (nothing to monitor).
 */
static int init(void)
{
    /* a zero interval means we are not monitoring */
    return (0 != mca_sensor_ft_tester_component.fail_rate) ? ORTE_SUCCESS
                                                           : ORTE_ERROR;
}
static void finalize(void)
{
if (NULL != sample_ev) {
opal_event_del(sample_ev);
free(sample_ev);
sample_ev = NULL;
}
return;
}
/*
 * Start killing local processes.
 *
 * Allocates the sampling timer event (if it does not exist yet) and arms
 * it so that sample() fires after fail_rate seconds; sample() re-adds the
 * timer itself at the end of every pass.  Calling start() while the timer
 * already exists is a no-op.  The jobid argument is not used.
 *
 * If the event cannot be allocated the failure is reported through
 * ORTE_ERROR_LOG and the sensor simply stays inactive (sample_ev remains
 * NULL, so stop()/finalize() are still safe to call).
 */
static void start(orte_jobid_t jobid)
{
    if (NULL != sample_ev) {
        /* timer already set up by an earlier call */
        return;
    }

    /* startup a timer to wake us up periodically */
    sample_ev = (opal_event_t *) malloc(sizeof(opal_event_t));
    if (NULL == sample_ev) {
        /* do not hand a NULL event to the event library */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }

    opal_event_evtimer_set(opal_event_base, sample_ev, sample, sample_ev);
    sample_time.tv_sec = mca_sensor_ft_tester_component.fail_rate;
    sample_time.tv_usec = 0;
    opal_event_evtimer_add(sample_ev, &sample_time);
}
/*
 * Stop killing local processes.
 *
 * Cancels and frees the sampling timer, leaving sample_ev NULL.  Does
 * nothing when no timer is active.  The jobid argument is not used.
 */
static void stop(orte_jobid_t jobid)
{
    if (NULL == sample_ev) {
        return;
    }

    opal_event_del(sample_ev);
    free(sample_ev);
    sample_ev = NULL;
}
/*
 * Timer callback: decide whether to inject a failure on this pass.
 *
 * 1. When running as a daemon with a positive daemon_fail_prob, roll the
 *    dice and, on a hit, abort this process via orte_errmgr.abort().
 * 2. Walk orte_local_children under the odls mutex; for every child that
 *    is alive, has a pid and has not passed ORTE_PROC_STATE_UNTERMINATED,
 *    roll the dice against fail_prob and send SIGTERM on a hit.  Unless
 *    multi_fail is set, stop after the first kill.
 * 3. Re-add the timer so the next pass happens sample_time later.
 *
 * The fd, event and arg parameters required by the event callback
 * signature are not used.
 */
static void sample(int fd, short event, void *arg)
{
    float prob;
    opal_list_item_t *item;
    orte_odls_child_t *child;
    pid_t victim;

    /* if we are not sampling any more, then just return */
    if (NULL == sample_ev) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "%s sample:ft_tester considering killing something",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* are we including ourselves? */
    if (ORTE_PROC_IS_DAEMON && 0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s sample:ft_tester considering killing me!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* roll the dice: random() scaled by INT32_MAX */
        prob = (double)random() / (double)INT32_MAX;
        if (prob < mca_sensor_ft_tester_component.daemon_fail_prob) {
            /* commit suicide */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                 "%s sample:ft_tester committing suicide",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            orte_errmgr.abort(1, NULL);
            return;
        }
    }

    /* see if we should kill a child */
    OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        if (!child->alive || 0 == child->pid ||
            ORTE_PROC_STATE_UNTERMINATED < child->state) {
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                 "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(child->name),
                                 child->alive ? "TRUE" : "FALSE",
                                 (unsigned long)child->pid, orte_proc_state_to_str(child->state)));
            continue;
        }
        /* roll the dice */
        prob = (double)random() / (double)INT32_MAX;
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s sample:ft_tester child: %s dice: %f prob %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(child->name),
                             prob, mca_sensor_ft_tester_component.fail_prob));
        if (prob < mca_sensor_ft_tester_component.fail_prob) {
            /* you shall die... */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                 "%s sample:ft_tester killing %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(child->name)));
            /* take a copy of the pid while we still hold the lock, so the
             * child record is not read after the mutex is released */
            victim = child->pid;
            opal_condition_signal(&orte_odls_globals.cond);
            OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
            if (0 != kill(victim, SIGTERM)) {
                /* not fatal for a fault injector (the child may already be
                 * gone), but do not hide it either */
                OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                     "%s sample:ft_tester kill of pid %lu failed: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (unsigned long)victim, strerror(errno)));
            }
            OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
            /* NOTE(review): the list was unprotected while the lock was
             * dropped; continuing the walk from 'item' presumes it was not
             * removed in the meantime - confirm against the odls code */
            /* are we allowing multiple deaths */
            if (!mca_sensor_ft_tester_component.multi_fail) {
                break;
            }
        }
    }
    opal_condition_signal(&orte_odls_globals.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);

    /* restart the timer */
    if (NULL != sample_ev) {
        opal_event_evtimer_add(sample_ev, &sample_time);
    }
}

Просмотреть файл

@ -0,0 +1,39 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Fault-tolerance tester sensor: a sensor component used for fault
 * tolerance tests that randomly kills local procs and/or the daemon itself.
 */
#ifndef ORTE_SENSOR_FT_TESTER_H
#define ORTE_SENSOR_FT_TESTER_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/**
 * Component structure: the generic sensor component extended with the
 * tunables read from MCA parameters when the component is opened.
 */
struct orte_sensor_ft_tester_component_t {
/* generic sensor component; must stay first so the struct can be used
 * wherever the base component type is expected */
orte_sensor_base_component_t super;
/* time between checks, in seconds; 0 disables the component */
int fail_rate;
/* probability of killing a single executable on each check, stored as a
 * fraction (values given above 1.0 are treated as percent and divided
 * by 100 when the parameter is read) */
float fail_prob;
/* probability of killing the daemon itself on each check, stored the
 * same way as fail_prob */
float daemon_fail_prob;
/* whether more than one executable may be killed during one check */
bool multi_fail;
};
typedef struct orte_sensor_ft_tester_component_t orte_sensor_ft_tester_component_t;
/* the single component instance and the module it hands out */
ORTE_MODULE_DECLSPEC extern orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component;
extern orte_sensor_base_module_t orte_sensor_ft_tester_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,127 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdlib.h>
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_ft_tester.h"
/*
 * Local functions: the open/close/query hooks referenced from the
 * component descriptor below.
 */
static int orte_sensor_ft_tester_open(void);
static int orte_sensor_ft_tester_close(void);
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority);
/*
 * The component descriptor.  Only the embedded base component is
 * initialized here; the tunable fields (fail_rate, fail_prob,
 * daemon_fail_prob, multi_fail) are filled in by
 * orte_sensor_ft_tester_open().
 */
orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"ft_tester", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_ft_tester_open, /* component open */
orte_sensor_ft_tester_close, /* component close */
orte_sensor_ft_tester_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
 * Convert a probability given as a string into a fraction and release
 * the string.
 *
 * @param str  Value returned by mca_base_param_reg_string(), or NULL.
 *             Ownership passes to this function: the string is freed
 *             before returning.
 * @return     0.0 when str is NULL; otherwise the parsed value, divided
 *             by 100 when it is greater than 1.0 (i.e. given in percent).
 */
static float ft_tester_parse_prob(char *str)
{
    float prob;

    if (NULL == str) {
        return 0.0;
    }

    prob = strtof(str, NULL);
    /* the parameter system hands us our own copy - do not leak it */
    free(str);

    if (1.0 < prob) {
        /* given in percent */
        prob /= 100.0;
    }
    return prob;
}

/**
 * component open/close/init function
 *
 * Registers the component's MCA parameters and stores their current
 * values in mca_sensor_ft_tester_component:
 *   - fail_rate        (int, default 0)
 *   - fail_prob        (string, default "30.0")
 *   - multi_allowed    (int, default 0) -> multi_fail
 *   - daemon_fail_prob (string, default "0.0")
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_FATAL when fail_rate is negative.
 */
static int orte_sensor_ft_tester_open(void)
{
    mca_base_component_t *c = &mca_sensor_ft_tester_component.super.base_version;
    int tmp;
    char *str;

    /* lookup parameters */
    mca_base_param_reg_int(c, "fail_rate",
                           "Time between checks to decide if one or more procs shall be killed, expressed as sec",
                           false, false, 0, &tmp);
    if (tmp < 0) {
        opal_output(0, "Illegal value %d - must be >= 0", tmp);
        return ORTE_ERR_FATAL;
    }
    mca_sensor_ft_tester_component.fail_rate = tmp;

    str = NULL;
    mca_base_param_reg_string(c, "fail_prob",
                              "Probability of killing a single executable",
                              false, false, "30.0", &str);
    mca_sensor_ft_tester_component.fail_prob = ft_tester_parse_prob(str);

    mca_base_param_reg_int(c, "multi_allowed",
                           "Allow multiple executables to be killed at one time",
                           false, false, 0, &tmp);
    mca_sensor_ft_tester_component.multi_fail = OPAL_INT_TO_BOOL(tmp);

    str = NULL;
    mca_base_param_reg_string(c, "daemon_fail_prob",
                              "Probability of killing a daemon",
                              false, false, "0.0", &str);
    mca_sensor_ft_tester_component.daemon_fail_prob = ft_tester_parse_prob(str);

    return ORTE_SUCCESS;
}
/**
 * Component query: offer the ft_tester module only when fault injection
 * has been enabled through a non-zero fail_rate.
 *
 * @param module    [out] the ft_tester module, or NULL when disabled
 * @param priority  [out] 1 when the module is offered, 0 otherwise
 * @return ORTE_SUCCESS when the module is offered, ORTE_ERROR otherwise
 */
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority)
{
    if (0 != mca_sensor_ft_tester_component.fail_rate) {
        *priority = 1;  /* at the bottom */
        *module = (mca_base_module_t *)&orte_sensor_ft_tester_module;
        return ORTE_SUCCESS;
    }

    /* disabled: nothing to select */
    *priority = 0;
    *module = NULL;
    return ORTE_ERROR;
}
/**
 * Component close hook.
 *
 * The component holds nothing that needs releasing here, so this always
 * reports ORTE_SUCCESS.
 */
static int orte_sensor_ft_tester_close(void)
{
    return ORTE_SUCCESS;
}