Add a new "sensor" module that supports fault tolerance tests - randomly kills local procs and/or the daemon itself
This commit was SVN r24960.
Этот коммит содержится в:
родитель
e88a6c93da
Коммит
70bca4691f
35
orte/mca/sensor/ft_tester/Makefile.am
Обычный файл
35
orte/mca/sensor/ft_tester/Makefile.am
Обычный файл
@ -0,0 +1,35 @@
|
||||
#
|
||||
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = \
|
||||
sensor_ft_tester.c \
|
||||
sensor_ft_tester.h \
|
||||
sensor_ft_tester_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_ft_tester_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_ft_tester.la
|
||||
else
|
||||
component_noinst = libmca_sensor_ft_tester.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_ft_tester_la_SOURCES = $(sources)
|
||||
mca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_ft_tester_la_SOURCES =$(sources)
|
||||
libmca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
|
21
orte/mca/sensor/ft_tester/configure.m4
Обычный файл
21
orte/mca/sensor/ft_tester/configure.m4
Обычный файл
@ -0,0 +1,21 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_ft_tester_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_ft_tester_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/sensor/ft_tester/Makefile])
|
||||
|
||||
# if we don't want sensors, don't compile
|
||||
# this component
|
||||
AS_IF([test "$orte_want_sensors" = "1"],
|
||||
[$1], [$2])
|
||||
])dnl
|
||||
|
189
orte/mca/sensor/ft_tester/sensor_ft_tester.c
Обычный файл
189
orte/mca/sensor/ft_tester/sensor_ft_tester.c
Обычный файл
@ -0,0 +1,189 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_SIGNAL_H
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_ft_tester.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_ft_tester_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop
|
||||
};
|
||||
|
||||
/* declare the local functions */
|
||||
static void sample(int fd, short event, void *arg);
|
||||
|
||||
/* local globals */
|
||||
static opal_event_t *sample_ev = NULL;
|
||||
static struct timeval sample_time;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
if (0 == mca_sensor_ft_tester_component.fail_rate) {
|
||||
/* not monitoring */
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
if (NULL != sample_ev) {
|
||||
opal_event_del(sample_ev);
|
||||
free(sample_ev);
|
||||
sample_ev = NULL;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start killing local processes
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
if (NULL == sample_ev) {
|
||||
/* startup a timer to wake us up periodically */
|
||||
sample_ev = (opal_event_t *) malloc(sizeof(opal_event_t));
|
||||
opal_event_evtimer_set(opal_event_base, sample_ev, sample, sample_ev);
|
||||
sample_time.tv_sec = mca_sensor_ft_tester_component.fail_rate;
|
||||
sample_time.tv_usec = 0;
|
||||
opal_event_evtimer_add(sample_ev, &sample_time);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Stop killing local processes: disarm the sampling timer and free it.
 * The jobid argument is not used.  Safe to call when no timer exists.
 */
static void stop(orte_jobid_t jobid)
{
    opal_event_t *timer = sample_ev;

    if (NULL == timer) {
        return;
    }

    opal_event_del(timer);
    free(timer);
    sample_ev = NULL;
}
|
||||
|
||||
static void sample(int fd, short event, void *arg)
|
||||
{
|
||||
float prob;
|
||||
opal_list_item_t *item;
|
||||
orte_odls_child_t *child;
|
||||
|
||||
/* if we are not sampling any more, then just return */
|
||||
if (NULL == sample_ev) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sample:ft_tester considering killing something",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* are we including ourselves? */
|
||||
if (ORTE_PROC_IS_DAEMON && 0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sample:ft_tester considering killing me!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* roll the dice */
|
||||
prob = (double)random() / (double)INT32_MAX;
|
||||
if (prob < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
||||
/* commit suicide */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sample:ft_tester committing suicide",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_errmgr.abort(1, NULL);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* see if we should kill a child */
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (!child->alive || 0 == child->pid ||
|
||||
ORTE_PROC_STATE_UNTERMINATED < child->state) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name),
|
||||
child->alive ? "TRUE" : "FALSE",
|
||||
(unsigned long)child->pid, orte_proc_state_to_str(child->state)));
|
||||
continue;
|
||||
}
|
||||
/* roll the dice */
|
||||
prob = (double)random() / (double)INT32_MAX;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sample:ft_tester child: %s dice: %f prob %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name),
|
||||
prob, mca_sensor_ft_tester_component.fail_prob));
|
||||
if (prob < mca_sensor_ft_tester_component.fail_prob) {
|
||||
/* you shall die... */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sample:ft_tester killing %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
kill(child->pid, SIGTERM);
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
/* are we allowing multiple deaths */
|
||||
if (!mca_sensor_ft_tester_component.multi_fail) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
|
||||
/* restart the timer */
|
||||
if (NULL != sample_ev) {
|
||||
opal_event_evtimer_add(sample_ev, &sample_time);
|
||||
}
|
||||
}
|
39
orte/mca/sensor/ft_tester/sensor_ft_tester.h
Обычный файл
39
orte/mca/sensor/ft_tester/sensor_ft_tester.h
Обычный файл
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Fault tolerance tester sensor: randomly kills local procs and/or the daemon itself
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_FT_TESTER_H
|
||||
#define ORTE_SENSOR_FT_TESTER_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_ft_tester_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
int fail_rate;
|
||||
float fail_prob;
|
||||
float daemon_fail_prob;
|
||||
bool multi_fail;
|
||||
};
|
||||
typedef struct orte_sensor_ft_tester_component_t orte_sensor_ft_tester_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_ft_tester_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
127
orte/mca/sensor/ft_tester/sensor_ft_tester_component.c
Обычный файл
127
orte/mca/sensor/ft_tester/sensor_ft_tester_component.c
Обычный файл
@ -0,0 +1,127 @@
|
||||
/*
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
#include "orte/constants.h"

#include <stdlib.h>

#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"

#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"

#include "sensor_ft_tester.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_ft_tester_open(void);
|
||||
static int orte_sensor_ft_tester_close(void);
|
||||
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"ft_tester", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_ft_tester_open, /* component open */
|
||||
orte_sensor_ft_tester_close, /* component close */
|
||||
orte_sensor_ft_tester_query /* component query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_ft_tester_open(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_ft_tester_component.super.base_version;
|
||||
int tmp;
|
||||
char *str;
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(c, "fail_rate",
|
||||
"Time between checks to decide if one or more procs shall be killed, expressed as sec",
|
||||
false, false, 0, &tmp);
|
||||
if (tmp < 0) {
|
||||
opal_output(0, "Illegal value %d - must be >= 0", tmp);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
mca_sensor_ft_tester_component.fail_rate = tmp;
|
||||
|
||||
mca_base_param_reg_string(c, "fail_prob",
|
||||
"Probability of killing a single executable",
|
||||
false, false, "30.0", &str);
|
||||
if (NULL != str) {
|
||||
mca_sensor_ft_tester_component.fail_prob = strtof(str, NULL);
|
||||
if (1.0 < mca_sensor_ft_tester_component.fail_prob) {
|
||||
/* given in percent */
|
||||
mca_sensor_ft_tester_component.fail_prob /= 100.0;
|
||||
}
|
||||
} else {
|
||||
mca_sensor_ft_tester_component.fail_prob = 0.0;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int(c, "multi_allowed",
|
||||
"Allow multiple executables to be killed at one time",
|
||||
false, false, 0, &tmp);
|
||||
mca_sensor_ft_tester_component.multi_fail = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_string(c, "daemon_fail_prob",
|
||||
"Probability of killing a daemon",
|
||||
false, false, "0.0", &str);
|
||||
if (NULL != str) {
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob = strtof(str, NULL);
|
||||
if (1.0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
||||
/* given in percent */
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob /= 100.0;
|
||||
}
|
||||
} else {
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob = 0.0;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Component query: offer the module only when a sampling period is set.
 *
 * @param module    receives the module pointer, or NULL when fail_rate is 0
 * @param priority  receives 1 when the module is offered, 0 otherwise
 * @return          ORTE_SUCCESS when the module is offered, else ORTE_ERROR
 */
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority)
{
    if (0 != mca_sensor_ft_tester_component.fail_rate) {
        *priority = 1;  /* at the bottom */
        *module = (mca_base_module_t *)&orte_sensor_ft_tester_module;
        return ORTE_SUCCESS;
    }

    /* no sampling period configured: decline to be selected */
    *priority = 0;
    *module = NULL;
    return ORTE_ERROR;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_ft_tester_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user