/*
 * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/constants.h"
|
|
|
|
#include "orte/types.h"
|
|
|
|
|
|
|
|
#include <errno.h>
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif /* HAVE_UNISTD_H */
|
|
|
|
#ifdef HAVE_STRING_H
|
|
|
|
#include <string.h>
|
|
|
|
#endif /* HAVE_STRING_H */
|
|
|
|
#include <stdio.h>
|
|
|
|
#ifdef HAVE_SIGNAL_H
|
|
|
|
#include <signal.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "opal_stdint.h"
|
|
|
|
#include "opal/util/alfg.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
|
|
|
|
#include "orte/util/error_strings.h"
|
|
|
|
#include "orte/util/name_fns.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
|
|
|
|
#include "orte/mca/sensor/base/base.h"
|
|
|
|
#include "orte/mca/sensor/base/sensor_private.h"
|
|
|
|
#include "sensor_ft_tester.h"
|
|
|
|
|
|
|
|
/* declare the API functions */
|
|
|
|
/* sample() is the only entry point this module supplies; it is defined
 * further down in this file. */
static void sample(void);
|
|
|
|
|
|
|
|
/* instantiate the module */
|
|
|
|
orte_sensor_base_module_t orte_sensor_ft_tester_module = {
|
|
|
|
NULL,
|
|
|
|
NULL,
|
|
|
|
NULL,
|
|
|
|
NULL,
|
|
|
|
sample,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
|
|
|
static void sample(void)
|
|
|
|
{
|
|
|
|
float prob;
|
|
|
|
orte_proc_t *child;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
|
|
|
"%s sample:ft_tester considering killing something",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
|
|
|
|
/* are we including ourselves? */
|
|
|
|
if (ORTE_PROC_IS_DAEMON &&
|
|
|
|
0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
|
|
|
"%s sample:ft_tester considering killing me!",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
/* roll the dice */
|
|
|
|
prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
|
|
|
|
if (prob < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
|
|
|
/* commit suicide */
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
|
|
|
"%s sample:ft_tester committing suicide",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
orte_errmgr.abort(1, NULL);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 < mca_sensor_ft_tester_component.fail_prob) {
|
|
|
|
/* see if we should kill a child */
|
|
|
|
for (i=0; i < orte_local_children->size; i++) {
|
|
|
|
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!child->alive || 0 == child->pid ||
|
|
|
|
ORTE_PROC_STATE_UNTERMINATED < child->state) {
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
|
|
|
"%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(&child->name),
|
|
|
|
child->alive ? "TRUE" : "FALSE",
|
|
|
|
(unsigned long)child->pid, orte_proc_state_to_str(child->state)));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* roll the dice */
|
|
|
|
prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
|
|
|
"%s sample:ft_tester child: %s dice: %f prob %f",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(&child->name),
|
|
|
|
prob, mca_sensor_ft_tester_component.fail_prob));
|
|
|
|
if (prob < mca_sensor_ft_tester_component.fail_prob) {
|
|
|
|
/* you shall die... */
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
|
|
|
"%s sample:ft_tester killing %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(&child->name)));
|
|
|
|
kill(child->pid, SIGTERM);
|
|
|
|
/* are we allowing multiple deaths */
|
|
|
|
if (!mca_sensor_ft_tester_component.multi_fail) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|