1
1

Merge pull request #3142 from rhc54/topic/sensor

Restore sensor framework
Этот коммит содержится в:
Ralph Castain 2017-03-11 19:53:45 -08:00 коммит произвёл GitHub
родитель 74125ecc7a ab50665222
Коммит 3afadbad89
32 изменённых файлов: 2992 добавлений и 0 удалений

31
orte/mca/sensor/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
#
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_sensor.la
libmca_sensor_la_SOURCES =
# local files
headers = sensor.h \
sensor_types.h
libmca_sensor_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ortedir = $(ompiincludedir)/$(subdir)
nobase_orte_HEADERS = $(headers)
endif
include base/Makefile.am
distclean-local:
rm -f base/static-components.h

20
orte/mca/sensor/base/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,20 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h \
base/sensor_private.h
libmca_sensor_la_SOURCES += \
base/sensor_base_frame.c \
base/sensor_base_select.c \
base/sensor_base_fns.c

39
orte/mca/sensor/base/base.h Обычный файл
Просмотреть файл

@ -0,0 +1,39 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_BASE_H
#define MCA_SENSOR_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/base/base.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
* MCA Framework
*
* Framework object for the ORTE sensor framework; it is defined in
* sensor_base_frame.c via MCA_BASE_FRAMEWORK_DECLARE.
*/
ORTE_DECLSPEC extern mca_base_framework_t orte_sensor_base_framework;
/*
* Query all available sensor components, keep the ones that provide a
* module, and initialize them. Returns an ORTE status code.
*/
ORTE_DECLSPEC int orte_sensor_base_select(void);
END_C_DECLS
#endif

158
orte/mca/sensor/base/sensor_base_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,158 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/* set once at least one selected module has been seen by start() */
static bool mods_active = false;

/*
 * Start all selected sensor modules for the given job and, the first
 * time around, arm the periodic sampling timer.
 *
 * Nothing is done unless a positive sample rate (in seconds) has been
 * configured.
 */
void orte_sensor_base_start(orte_jobid_t job)
{
    orte_sensor_active_module_t *active_mod;
    int idx;

    /* a non-positive rate means sampling was not requested */
    if (orte_sensor_base.rate.tv_sec <= 0) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: starting sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* walk the module array in its stored (priority) order */
    for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
        active_mod = (orte_sensor_active_module_t*)
            opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL != active_mod) {
            mods_active = true;
            if (NULL != active_mod->module->start) {
                active_mod->module->start(job);
            }
        }
    }

    /* nothing to sample, or the timer is already running */
    if (!mods_active || orte_sensor_base.active) {
        return;
    }

    /* buffer that collects the samples */
    orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);

    /* arm the timer that wakes us up for each data sample */
    orte_sensor_base.active = true;
    opal_event_evtimer_set(orte_event_base, &orte_sensor_base.sample_ev,
                           orte_sensor_base_sample, NULL);
    opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
}
/*
 * Cancel the sampling timer (if armed) and tell every selected module
 * to stop monitoring the given job. A no-op when no module was ever
 * started.
 */
void orte_sensor_base_stop(orte_jobid_t job)
{
    orte_sensor_active_module_t *active_mod;
    int idx;

    if (!mods_active) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: stopping sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* take the timer out of the event base before stopping the modules */
    if (orte_sensor_base.active) {
        opal_event_del(&orte_sensor_base.sample_ev);
        orte_sensor_base.active = false;
    }

    /* modules are visited in their stored (priority) order */
    for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
        active_mod = (orte_sensor_active_module_t*)
            opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL != active_mod && NULL != active_mod->module->stop) {
            active_mod->module->stop(job);
        }
    }
}
void orte_sensor_base_sample(int fd, short args, void *cbdata)
{
orte_sensor_active_module_t *i_module;
int i;
if (!mods_active) {
return;
}
/* see if we were ordered to stop */
if (!orte_sensor_base.active) {
return;
}
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"%s sensor:base: sampling sensors",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* call the sample function of all modules in priority order from
* highest to lowest - the heartbeat should always be the lowest
* priority, so it will send any collected data
*/
for (i=0; i < orte_sensor_base.modules.size; i++) {
if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) {
continue;
}
if (NULL != i_module->module->sample) {
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"%s sensor:base: sampling component %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
i_module->component->base_version.mca_component_name);
i_module->module->sample();
}
}
/* restart the timer */
opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
return;
}
/*
 * Hand a buffer of sample data to the module whose component name
 * matches `comp`. Does nothing when no name is given or when no
 * selected module carries that name.
 */
void orte_sensor_base_log(char *comp, opal_buffer_t *data)
{
    orte_sensor_active_module_t *candidate;
    int slot;

    /* without a name there is nobody to dispatch to */
    if (NULL == comp) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: logging sensor %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp);

    for (slot = 0; slot < orte_sensor_base.modules.size; slot++) {
        candidate = (orte_sensor_active_module_t*)
            opal_pointer_array_get_item(&orte_sensor_base.modules, slot);
        if (NULL == candidate ||
            0 != strcmp(comp, candidate->component->base_version.mca_component_name)) {
            continue;
        }
        /* the first name match ends the search, whether or not it can log */
        if (NULL != candidate->module->log) {
            candidate->module->log(data);
        }
        return;
    }
}

132
orte/mca/sensor/base/sensor_base_frame.c Обычный файл
Просмотреть файл

@ -0,0 +1,132 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_pointer_array.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/sensor/base/static-components.h"
/*
* Global variables
*/
/* public API entry points of the framework: start and stop, in that order */
orte_sensor_base_API_module_t orte_sensor = {
orte_sensor_base_start,
orte_sensor_base_stop
};
/* framework-wide state shared by the base functions */
orte_sensor_base_t orte_sensor_base;
/*
 * Local variables
 */
/* storage for the "sample_rate" MCA variable, in seconds */
static int orte_sensor_base_sample_rate = 0;

/*
 * Register the framework-level MCA variables "sample_rate" and
 * "log_samples", each with a deprecated synonym that omits the "base"
 * component name. Always returns ORTE_SUCCESS.
 */
static int orte_sensor_base_register(mca_base_register_flag_t flags)
{
    int rate_var;
    int log_var;

    /* sampling period; the default of 0 leaves sampling disabled */
    orte_sensor_base_sample_rate = 0;
    rate_var = mca_base_var_register("orte", "sensor", "base", "sample_rate",
                                     "Sample rate in seconds",
                                     MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                     OPAL_INFO_LVL_9,
                                     MCA_BASE_VAR_SCOPE_READONLY,
                                     &orte_sensor_base_sample_rate);
    mca_base_var_register_synonym(rate_var, "orte", "sensor", NULL, "sample_rate",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    /* whether samples should be logged; off by default */
    orte_sensor_base.log_samples = false;
    log_var = mca_base_var_register("orte", "sensor", "base", "log_samples",
                                    "Log samples to database",
                                    MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                    OPAL_INFO_LVL_9,
                                    MCA_BASE_VAR_SCOPE_READONLY,
                                    &orte_sensor_base.log_samples);
    mca_base_var_register_synonym(log_var, "orte", "sensor", NULL, "log_samples",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    return ORTE_SUCCESS;
}
/*
 * Framework close: stop the sampling timer if it is still armed,
 * finalize and release every selected module, tear down the module
 * array, and close the remaining components.
 *
 * Returns the status of mca_base_framework_components_close().
 */
static int orte_sensor_base_close(void)
{
    orte_sensor_active_module_t *i_module;
    int i;

    /* do not leave a timer pending that would fire after the module
     * array has been destructed
     */
    if (orte_sensor_base.active) {
        opal_event_del(&orte_sensor_base.sample_ev);
        orte_sensor_base.active = false;
    }

    for (i = 0; i < orte_sensor_base.modules.size; i++) {
        i_module = (orte_sensor_active_module_t*)
            opal_pointer_array_get_item(&orte_sensor_base.modules, i);
        if (NULL == i_module) {
            continue;
        }
        if (NULL != i_module->module->finalize) {
            i_module->module->finalize();
        }
        /* the array held the only reference to this tracking object;
         * destructing the array does not release its items
         */
        opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
        OBJ_RELEASE(i_module);
    }
    OBJ_DESTRUCT(&orte_sensor_base.modules);

    /* Close all remaining available components */
    return mca_base_framework_components_close(&orte_sensor_base_framework, NULL);
}
/**
 * Framework open function: reset the framework-global state, build the
 * (still empty) module array, latch the configured sample rate, and
 * open the available MCA components.
 *
 * Returns the status of mca_base_framework_components_open().
 */
static int orte_sensor_base_open(mca_base_open_flag_t flags)
{
    /* module table, filled in later by the select step */
    OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t);
    opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1);

    /* the registered rate is whole seconds, so the usec part is zero */
    orte_sensor_base.rate.tv_usec = 0;
    orte_sensor_base.rate.tv_sec = orte_sensor_base_sample_rate;

    /* the sampling timer is not armed yet */
    orte_sensor_base.active = false;

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_sensor_base_framework, flags);
}
MCA_BASE_FRAMEWORK_DECLARE(orte, sensor, "ORTE Monitoring Sensors",
orte_sensor_base_register,
orte_sensor_base_open, orte_sensor_base_close,
mca_sensor_base_static_components, 0);
/* Constructor for the active-module tracker: a module samples by default. */
static void cons(orte_sensor_active_module_t *tracker)
{
    tracker->sampling = true;
}

/* class instance: constructor only, no destructor */
OBJ_CLASS_INSTANCE(orte_sensor_active_module_t,
                   opal_object_t,
                   cons, NULL);

219
orte/mca/sensor/base/sensor_base_select.c Обычный файл
Просмотреть файл

@ -0,0 +1,219 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/* guards against running the selection more than once */
static bool selected = false;

/**
 * Function for weeding out sensor components that don't want to run.
 *
 * Query every available component for a module and a priority.
 * Components without a query function, without a module, or with a
 * negative priority are skipped. When two components report the same
 * `data_measured` string, only the higher-priority one is kept. The
 * survivors are stored in orte_sensor_base.modules in descending
 * priority order and then initialized in that order.
 *
 * A module whose init fails is dropped, except on the HNP, where it is
 * kept with `sampling` set to false so incoming data can still be
 * logged.
 *
 * @return ORTE_SUCCESS in all cases, including when no component
 *         provides a module.
 */
int orte_sensor_base_select(void)
{
    mca_base_component_list_item_t *cli = NULL;
    orte_sensor_base_component_t *component = NULL;
    mca_base_module_t *module = NULL;
    orte_sensor_active_module_t *i_module;
    int priority = 0, i, j, low_i;
    opal_pointer_array_t tmp_array;
    bool none_found;
    orte_sensor_active_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
    bool duplicate;

    if (selected) {
        return ORTE_SUCCESS;
    }
    selected = true;

    OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);

    opal_output_verbose(10, orte_sensor_base_framework.framework_output,
                        "sensor:base:select: Auto-selecting components");

    /*
     * Traverse the list of available components.
     * For each call their 'query' functions to determine relative priority.
     */
    none_found = true;
    OPAL_LIST_FOREACH(cli, &orte_sensor_base_framework.framework_components, mca_base_component_list_item_t) {
        component = (orte_sensor_base_component_t *) cli->cli_component;

        /*
         * If there is a query function then use it.
         */
        if (NULL == component->base_version.mca_query_component) {
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Skipping component [%s]. It does not implement a query function",
                                component->base_version.mca_component_name );
            continue;
        }

        /*
         * Query this component for the module and priority
         */
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Querying component [%s]",
                            component->base_version.mca_component_name);
        component->base_version.mca_query_component(&module, &priority);

        /*
         * If no module was returned or negative priority, then skip component
         */
        if (NULL == module || priority < 0) {
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Skipping component [%s]. Query failed to return a module",
                                component->base_version.mca_component_name );
            continue;
        }

        /* check to see if we already have someone who senses the
         * same things - if so, take the higher priority one
         */
        duplicate = false;
        for (i=0; i < tmp_array.size; i++) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
            if (NULL == tmp_module) {
                continue;
            }
            if (0 == strcmp(component->data_measured, tmp_module->component->data_measured)) {
                if (tmp_module->priority < priority) {
                    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                        "sensor:base:select Replacing component %s with %s - both measure %s",
                                        tmp_module->component->base_version.mca_component_name,
                                        component->base_version.mca_component_name,
                                        component->data_measured);
                    /* drop the lower-priority entry; the new one is added below */
                    OBJ_RELEASE(tmp_module);
                    opal_pointer_array_set_item(&tmp_array, i, NULL);
                    break;
                } else {
                    duplicate = true;
                }
            }
        }
        if (duplicate) {
            /* ignore this component */
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Ignoring component %s - duplicate with higher priority measures %s",
                                component->base_version.mca_component_name,
                                component->data_measured);
            continue;
        }

        /*
         * Append them to the temporary list, we will sort later
         */
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Query of component [%s] set priority to %d",
                            component->base_version.mca_component_name, priority);
        tmp_module = OBJ_NEW(orte_sensor_active_module_t);
        tmp_module->component = component;
        tmp_module->module = (orte_sensor_base_module_t*)module;
        tmp_module->priority = priority;

        opal_pointer_array_add(&tmp_array, (void*)tmp_module);
        none_found = false;
    }

    if (none_found) {
        /* okay for no modules to be found - but the temporary array
         * was constructed above and must still be torn down
         */
        OBJ_DESTRUCT(&tmp_array);
        return ORTE_SUCCESS;
    }

    /*
     * Sort the list by descending priority: for each slot j, move the
     * highest-priority remaining entry into the final array; if that
     * entry was not j itself, revisit j on the next pass.
     */
    priority = 0;
    for(j = 0; j < tmp_array.size; ++j) {
        tmp_module_sw = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, j);
        if( NULL == tmp_module_sw ) {
            continue;
        }

        low_i = -1;
        priority = tmp_module_sw->priority;

        for(i = 0; i < tmp_array.size; ++i) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
            if( NULL == tmp_module ) {
                continue;
            }
            if( tmp_module->priority > priority ) {
                low_i = i;
                priority = tmp_module->priority;
            }
        }

        if( low_i >= 0 ) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
            opal_pointer_array_set_item(&tmp_array, low_i, NULL);
            j--; /* Try this entry again, if it is not the lowest */
        } else {
            tmp_module = tmp_module_sw;
            opal_pointer_array_set_item(&tmp_array, j, NULL);
        }
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Add module with priority [%s] %d",
                            tmp_module->component->base_version.mca_component_name, tmp_module->priority);
        opal_pointer_array_add(&orte_sensor_base.modules, tmp_module);
    }
    OBJ_DESTRUCT(&tmp_array);

    /*
     * Initialize each of the modules in priority order from
     * highest to lowest
     */
    for(i = 0; i < orte_sensor_base.modules.size; ++i) {
        i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i);
        if( NULL == i_module ) {
            continue;
        }
        if( NULL != i_module->module->init ) {
            if (ORTE_SUCCESS != i_module->module->init()) {
                /* can't sample - however, if we are the HNP,
                 * then we need this module
                 * anyway so we can log incoming data
                 */
                if (ORTE_PROC_IS_HNP) {
                    i_module->sampling = false;
                } else {
                    opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
                    OBJ_RELEASE(i_module);
                }
            }
        }
    }

    return ORTE_SUCCESS;
}

67
orte/mca/sensor/base/sensor_private.h Обычный файл
Просмотреть файл

@ -0,0 +1,67 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_PRIVATE_H
#define MCA_SENSOR_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/event/event.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/sensor.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
/* define a struct to hold framework-global values */
typedef struct {
/* selected modules (orte_sensor_active_module_t*), stored by descending priority */
opal_pointer_array_t modules;
/* value of the "log_samples" MCA variable ("Log samples to database") */
bool log_samples;
/* true while the periodic sampling timer is armed */
bool active;
/* sampling period; tv_sec comes from the "sample_rate" MCA variable */
struct timeval rate;
/* timer event that triggers orte_sensor_base_sample */
opal_event_t sample_ev;
/* buffer allocated when sampling starts, used to collect samples */
opal_buffer_t *samples;
} orte_sensor_base_t;
/* tracking object for one selected sensor module */
typedef struct {
opal_object_t super;
/* component that provided the module */
orte_sensor_base_component_t *component;
/* module function table returned by the component's query */
orte_sensor_base_module_t *module;
/* priority reported by the component's query */
int priority;
/* defaults to true; cleared when init fails but the module is kept for logging */
bool sampling;
} orte_sensor_active_module_t;
OBJ_CLASS_DECLARATION(orte_sensor_active_module_t);
/* framework-global state, defined in sensor_base_frame.c */
ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base;
/* start all selected modules for a job and arm the sampling timer */
ORTE_DECLSPEC void orte_sensor_base_start(orte_jobid_t job);
/* cancel the sampling timer and stop all selected modules for a job */
ORTE_DECLSPEC void orte_sensor_base_stop(orte_jobid_t job);
/* timer callback: sample every module, then re-arm the timer */
ORTE_DECLSPEC void orte_sensor_base_sample(int fd, short args, void *cbdata);
/* pass a data buffer to the module whose component name equals comp */
ORTE_DECLSPEC void orte_sensor_base_log(char *comp, opal_buffer_t *data);
END_C_DECLS
#endif

37
orte/mca/sensor/file/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
#
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_ompidata_DATA = help-orte-sensor-file.txt
sources = \
sensor_file.c \
sensor_file.h \
sensor_file_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_file_DSO
component_noinst =
component_install = mca_sensor_file.la
else
component_noinst = libmca_sensor_file.la
component_install =
endif
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_file_la_SOURCES = $(sources)
mca_sensor_file_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_file_la_SOURCES =$(sources)
libmca_sensor_file_la_LDFLAGS = -module -avoid-version

24
orte/mca/sensor/file/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_file_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_file_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/file/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -0,0 +1,19 @@
# -*- text -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the file sensor
#
[file-stalled]
A specified file is not changing, indicating a possibly stalled application:
File: %s
Last size: %lu
Last access: %sLast modification: %s

354
orte/mca/sensor/file/sensor_file.c Обычный файл
Просмотреть файл

@ -0,0 +1,354 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <stdio.h>
#include <stddef.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include <sys/stat.h>
#include <sys/types.h>
#include "opal_stdint.h"
#include "opal/util/output.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_file.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
static void file_sample(void);
static void file_log(opal_buffer_t *sample);
/* instantiate the module: the function table handed out by the
* component's query function
*/
orte_sensor_base_module_t orte_sensor_file_module = {
init,
finalize,
start,
stop,
file_sample,
file_log
};
/* define a tracking object: one per monitored job, kept on the `jobs` list */
typedef struct {
opal_list_item_t super;
/* job whose file is being watched */
orte_jobid_t jobid;
/* NOTE(review): not read or written by the functions visible in this file */
orte_vpid_t vpid;
/* path of the monitored file; owned by the tracker and freed in its destructor */
char *file;
/* number of consecutive samples in which a watched attribute did not change */
int tick;
/* which attributes of the file are compared between samples */
bool check_size;
bool check_access;
bool check_mod;
/* values recorded at the last sample that saw a change */
/* NOTE(review): st_size is stored here, so sizes beyond the int32_t range would be truncated - confirm */
int32_t file_size;
time_t last_access;
time_t last_mod;
/* tick count at which the file is declared stalled */
int limit;
} file_tracker_t;
/*
 * Constructor for file_tracker_t.
 *
 * Puts every field that is later read without a guaranteed prior write
 * into a defined state. In particular the three check_* flags default to
 * false: start() only assigns them when an override or a true default
 * is present, so they would otherwise be read uninitialized by the
 * sampling code.
 */
static void ft_constructor(file_tracker_t *ft)
{
    ft->file = NULL;
    ft->tick = 0;
    ft->check_size = false;
    ft->check_access = false;
    ft->check_mod = false;
    ft->file_size = 0;
    ft->last_access = 0;
    ft->last_mod = 0;
    ft->limit = 0;
}
/* Destructor for file_tracker_t: gives back the owned file name. */
static void ft_destructor(file_tracker_t *tracker)
{
    /* free() accepts NULL, so no guard is required */
    free(tracker->file);
}

OBJ_CLASS_INSTANCE(file_tracker_t,
                   opal_list_item_t,
                   ft_constructor, ft_destructor);
/* local globals */
/* list of file_tracker_t objects, one per monitored job */
static opal_list_t jobs;

/* Module init: set up the (empty) list of monitored jobs. */
static int init(void)
{
    OBJ_CONSTRUCT(&jobs, opal_list_t);

    return ORTE_SUCCESS;
}
/* Module finalize: release every remaining tracker, then the list itself. */
static void finalize(void)
{
    opal_list_item_t *entry;

    for (entry = opal_list_remove_first(&jobs);
         NULL != entry;
         entry = opal_list_remove_first(&jobs)) {
        OBJ_RELEASE(entry);
    }

    OBJ_DESTRUCT(&jobs);
}
/*
 * Look up an environment variable in an app_context's environment.
 *
 * @param app      app_context whose `env` array (NULL-terminated list of
 *                 "NAME=value" strings) is searched
 * @param pattern  exact variable name to look for, without the '='
 * @param value    if non-NULL, receives a strdup'd copy of the value
 *                 that the caller must free
 * @return true if the variable was found (and, when requested, its
 *         value could be copied), false otherwise
 *
 * The name must be followed directly by '=', so a variable that merely
 * starts with `pattern` does not match, and an entry without '=' is
 * never dereferenced past its end.
 */
static bool find_value(orte_app_context_t *app,
                       char *pattern, char **value)
{
    size_t len;
    int i;

    /* an app without an environment simply has no overrides */
    if (NULL == app || NULL == app->env || NULL == pattern) {
        return false;
    }

    len = strlen(pattern);
    for (i = 0; NULL != app->env[i]; i++) {
        /* strncmp matching `len` characters guarantees that index
         * `len` is within the string (at worst its terminator)
         */
        if (0 != strncmp(app->env[i], pattern, len) ||
            '=' != app->env[i][len]) {
            continue;
        }
        if (NULL != value) {
            *value = strdup(&app->env[i][len + 1]);
            if (NULL == *value) {
                /* out of memory: report "not found" rather than
                 * handing the caller a NULL value
                 */
                return false;
            }
        }
        return true;
    }
    return false;
}
/*
* Start monitoring of local processes
*/
static void start(orte_jobid_t jobid)
{
orte_job_t *jobdat;
orte_app_context_t *app, *aptr;
int i;
char *filename;
file_tracker_t *ft;
char *ptr;
/* cannot monitor my own job */
if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s starting file monitoring for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobid)));
/* get the local jobdat for this job */
if (NULL == (jobdat = orte_get_job_data_object(jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return;
}
/* must be at least one app_context, so use the first one found */
app = NULL;
for (i=0; i < jobdat->apps->size; i++) {
if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, i))) {
app = aptr;
break;
}
}
if (NULL == app) {
/* got a problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return;
}
/* search the environ to get the filename */
if (!find_value(app, "OMPI_MCA_sensor_file_filename", &filename)) {
/* was a default file given */
if (NULL == mca_sensor_file_component.file) {
/* can't do anything without a file */
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:file no file for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobid)));
return;
}
filename = mca_sensor_file_component.file;
}
/* create the tracking object */
ft = OBJ_NEW(file_tracker_t);
ft->jobid = jobid;
ft->file = strdup(filename);
/* search the environ to see what we are checking */
if (!find_value(app, "OMPI_MCA_sensor_file_check_size", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_size) {
ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size);
}
} else {
ft->check_size = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_check_access", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_access) {
ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access);
}
} else {
ft->check_access = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_check_mod", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_mod) {
ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod);
}
} else {
ft->check_mod = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_limit", &ptr)) {
ft->limit = mca_sensor_file_component.limit;
} else {
ft->limit = strtol(ptr, NULL, 10);
free(ptr);
}
opal_list_append(&jobs, &ft->super);
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s file %s monitored for %s%s%s with limit %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ft->file, ft->check_size ? "SIZE:" : " ",
ft->check_access ? "ACCESS TIME:" : " ",
ft->check_mod ? "MOD TIME" : " ", ft->limit));
return;
}
/*
 * Stop monitoring: remove and release the tracker(s) of the given job,
 * or all trackers when the wildcard jobid is passed.
 */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *item, *next;
    file_tracker_t *ft;

    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }

    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = next) {
        /* fetch the successor first: once the item has been removed and
         * released it must not be used to advance the iteration
         */
        next = opal_list_get_next(item);
        ft = (file_tracker_t*)item;
        if (jobid == ft->jobid || ORTE_JOBID_WILDCARD == jobid) {
            opal_list_remove_item(&jobs, item);
            OBJ_RELEASE(item);
        }
    }
    return;
}
/* room for a ctime() string (26 bytes including the terminator) plus slack */
enum { FILE_SENSOR_TIMESTR_LEN = 32 };

/*
 * Copy the ctime() text for `when` into a caller-owned buffer. ctime()
 * returns a pointer to storage that is reused by the next call, so two
 * timestamps can only appear in the same message if each is copied
 * first. The text keeps ctime()'s trailing newline.
 */
static void copy_ctime(const time_t *when, char *dst, size_t len)
{
    const char *txt = ctime(when);

    snprintf(dst, len, "%s", (NULL == txt) ? "(unknown)\n" : txt);
}

/*
 * Sample every tracked file once.
 *
 * Each file is stat'ed and the attributes selected for its tracker
 * (size, access time, modification time) are compared with the values
 * recorded earlier. An unchanged attribute increments the tracker's
 * tick count, a changed one resets it and records the new value. When
 * the tick count reaches the tracker's limit, the "file-stalled" help
 * message is shown and the job is moved to
 * ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED.
 */
static void file_sample(void)
{
    struct stat buf;
    opal_list_item_t *item;
    file_tracker_t *ft;
    orte_job_t *jdata;
    char atime_str[FILE_SENSOR_TIMESTR_LEN];
    char mtime_str[FILE_SENSOR_TIMESTR_LEN];

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s sampling files",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = opal_list_get_next(item)) {
        ft = (file_tracker_t*)item;

        /* stat the file and get its size */
        if (0 > stat(ft->file, &buf)) {
            /* cannot stat file */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s could not stat %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ft->file));
            continue;
        }

        /* format both timestamps into separate buffers so the access
         * and modification times do not overwrite each other
         */
        copy_ctime(&buf.st_atime, atime_str, sizeof(atime_str));
        copy_ctime(&buf.st_mtime, mtime_str, sizeof(mtime_str));
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s size %lu access %s\tmod %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (unsigned long)buf.st_size, atime_str, mtime_str));

        /* the first enabled attribute that has not changed counts one
         * tick and ends the comparison for this sample
         */
        if (ft->check_size) {
            if (buf.st_size == ft->file_size) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->file_size = buf.st_size;
            }
        }
        if (ft->check_access) {
            if (buf.st_atime == ft->last_access) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->last_access = buf.st_atime;
            }
        }
        if (ft->check_mod) {
            if (buf.st_mtime == ft->last_mod) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->last_mod = buf.st_mtime;
            }
        }

    CHECK:
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sampled file %s tick %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ft->file, ft->tick));

        if (ft->tick == ft->limit) {
            copy_ctime(&ft->last_access, atime_str, sizeof(atime_str));
            copy_ctime(&ft->last_mod, mtime_str, sizeof(mtime_str));
            /* the help text formats the size with %lu, so pass exactly
             * that type
             */
            orte_show_help("help-orte-sensor-file.txt", "file-stalled", true,
                           ft->file, (unsigned long)ft->file_size,
                           atime_str, mtime_str);
            jdata = orte_get_job_data_object(ft->jobid);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED);
        }
    }
}
/* Log entry point of the file sensor: intentionally does nothing. */
static void file_log(opal_buffer_t *sample)
{
    return;
}

42
orte/mca/sensor/file/sensor_file.h Обычный файл
Просмотреть файл

@ -0,0 +1,42 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* File movement sensor
*/
#ifndef ORTE_SENSOR_FILE_H
#define ORTE_SENSOR_FILE_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* component structure of the file sensor: base component plus its MCA parameters */
struct orte_sensor_file_component_t {
orte_sensor_base_component_t super;
/* NOTE(review): no MCA variable is registered for this field in the component source - confirm whether it is still used */
int sample_rate;
/* default file to monitor ("filename" MCA variable); NULL when none was given */
char *file;
/* defaults for which file attributes are compared between samples */
bool check_size;
bool check_access;
bool check_mod;
/* default number of unchanged samples before a stall is declared ("limit" MCA variable) */
int limit;
};
typedef struct orte_sensor_file_component_t orte_sensor_file_component_t;
ORTE_MODULE_DECLSPEC extern orte_sensor_file_component_t mca_sensor_file_component;
extern orte_sensor_base_module_t orte_sensor_file_module;
END_C_DECLS
#endif

120
orte/mca/sensor/file/sensor_file_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,120 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_file.h"
/*
* Local functions
*/
static int orte_sensor_file_register (void);
static int orte_sensor_file_open(void);
static int orte_sensor_file_close(void);
static int orte_sensor_file_query(mca_base_module_t **module, int *priority);
/*
* Public component instance of the file sensor. Only the embedded base
* component is initialized here; the parameter fields are filled in by
* the register function.
*/
orte_sensor_file_component_t mca_sensor_file_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"file", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_file_open, /* component open */
orte_sensor_file_close, /* component close */
orte_sensor_file_query, /* component query */
orte_sensor_file_register /* component register */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
"filemods" /* data being sensed; the base uses this string to detect duplicate sensors */
}
};
/**
 * Component register function: declare the MCA variables of the file
 * sensor and set their defaults (no file, all checks off, limit 3).
 * Always returns ORTE_SUCCESS; the individual registration results are
 * deliberately ignored.
 */
static int orte_sensor_file_register (void)
{
    mca_base_component_t *comp = &mca_sensor_file_component.super.base_version;

    /* defaults; each one is in place before its variable is registered */
    mca_sensor_file_component.file = NULL;
    mca_sensor_file_component.check_size = false;
    mca_sensor_file_component.check_access = false;
    mca_sensor_file_component.check_mod = false;
    mca_sensor_file_component.limit = 3;

    (void) mca_base_component_var_register (comp, "filename", "File to be monitored",
                                            MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_ALL_EQ,
                                            &mca_sensor_file_component.file);

    (void) mca_base_component_var_register (comp, "check_size", "Check the file size",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_ALL_EQ,
                                            &mca_sensor_file_component.check_size);

    (void) mca_base_component_var_register (comp, "check_access", "Check access time",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_ALL_EQ,
                                            &mca_sensor_file_component.check_access);

    (void) mca_base_component_var_register (comp, "check_mod", "Check modification time",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_ALL_EQ,
                                            &mca_sensor_file_component.check_mod);

    (void) mca_base_component_var_register (comp, "limit",
                                            "Number of times the sensor can detect no motion before declaring error (default=3)",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_ALL_EQ,
                                            &mca_sensor_file_component.limit);

    return ORTE_SUCCESS;
}
/* Component open: the file sensor has nothing to set up at this point. */
static int orte_sensor_file_open(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component query: always offer the file sensor module, at a fixed
 * priority of 20.
 */
static int orte_sensor_file_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_file_module;
    *priority = 20;  /* higher than heartbeat */

    return ORTE_SUCCESS;
}
/**
 * Component close: the file sensor holds no component-level resources,
 * so this only reports success.
 */
static int orte_sensor_file_close(void)
{
    return ORTE_SUCCESS;
}

36
orte/mca/sensor/ft_tester/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,36 @@
#
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Source files compiled into the component; the same list feeds both the
# DSO and the static variant below.
sources = \
sensor_ft_tester.c \
sensor_ft_tester.h \
sensor_ft_tester_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty.
if MCA_BUILD_orte_sensor_ft_tester_DSO
component_noinst =
component_install = mca_sensor_ft_tester.la
else
component_noinst = libmca_sensor_ft_tester.la
component_install =
endif
# DSO variant: installed into $(ompilibdir).
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_ft_tester_la_SOURCES = $(sources)
mca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
# Static variant: built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_ft_tester_la_SOURCES =$(sources)
libmca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version

24
orte/mca/sensor/ft_tester/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_ft_tester_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_ft_tester_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/ft_tester/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

121
orte/mca/sensor/ft_tester/sensor_ft_tester.c Обычный файл
Просмотреть файл

@ -0,0 +1,121 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#include "opal_stdint.h"
#include "opal/util/alfg.h"
#include "opal/util/output.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_ft_tester.h"
/* declare the API functions */
static void sample(void);
/* instantiate the module: only the sampling entry point is provided,
 * every other slot is left NULL.
 * NOTE(review): slot meanings are inferred from the sibling heartbeat and
 * resusage modules in this commit - confirm against the
 * orte_sensor_base_module_t definition in sensor.h.
 */
orte_sensor_base_module_t orte_sensor_ft_tester_module = {
NULL,
NULL,
NULL,
NULL,
sample,
NULL
};
/**
 * Fault-injection sampling pass.
 *
 * When running as a daemon with a positive daemon_fail_prob, a random
 * number in [0,1] is drawn and the daemon aborts itself if the draw is
 * below that probability.  Afterwards, if fail_prob is positive, one draw
 * is made per eligible local child and each child whose draw is below
 * fail_prob is sent SIGTERM.  Unless multi_fail is set, at most one child
 * is signalled per call.
 */
static void sample(void)
{
    float roll;
    orte_proc_t *victim;
    int idx;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s sample:ft_tester considering killing something",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* is the daemon itself a candidate? */
    if (ORTE_PROC_IS_DAEMON &&
        0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sample:ft_tester considering killing me!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* draw a number in [0,1] */
        roll = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
        if (roll < mca_sensor_ft_tester_component.daemon_fail_prob) {
            /* the draw says we go down */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s sample:ft_tester committing suicide",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            orte_errmgr.abort(1, NULL);
            return;
        }
    }

    /* nothing more to do when child killing is disabled */
    if (!(0 < mca_sensor_ft_tester_component.fail_prob)) {
        return;
    }

    /* walk the local children and decide each one's fate */
    for (idx = 0; idx < orte_local_children->size; idx++) {
        victim = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);
        if (NULL == victim) {
            continue;
        }

        /* skip children that are not running any more */
        if (!victim->alive || 0 == victim->pid ||
            ORTE_PROC_STATE_UNTERMINATED < victim->state) {
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&victim->name),
                                 victim->alive ? "TRUE" : "FALSE",
                                 (unsigned long)victim->pid, orte_proc_state_to_str(victim->state)));
            continue;
        }

        /* draw a number in [0,1] for this child */
        roll = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sample:ft_tester child: %s dice: %f prob %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&victim->name),
                             roll, mca_sensor_ft_tester_component.fail_prob));
        if (!(roll < mca_sensor_ft_tester_component.fail_prob)) {
            /* this child survives the round */
            continue;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sample:ft_tester killing %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&victim->name)));
        kill(victim->pid, SIGTERM);

        /* stop after the first kill unless multiple deaths are allowed */
        if (!mca_sensor_ft_tester_component.multi_fail) {
            break;
        }
    }
}

Просмотреть файл

@ -0,0 +1,41 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Fault tolerance (FT) tester sensor: kills processes at random according to configured probabilities
*/
#ifndef ORTE_SENSOR_FT_TESTER_H
#define ORTE_SENSOR_FT_TESTER_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
#include "opal/util/alfg.h"
BEGIN_C_DECLS
/* State of the ft_tester sensor component: the base sensor component
 * followed by the settings its sampler consults.
 */
struct orte_sensor_ft_tester_component_t {
/* base sensor component */
orte_sensor_base_component_t super;
/* probability, applied per sample, of killing a local child process */
float fail_prob;
/* probability, applied per sample, of the daemon aborting itself */
float daemon_fail_prob;
/* when true, more than one child may be killed in a single sample */
bool multi_fail;
};
typedef struct orte_sensor_ft_tester_component_t orte_sensor_ft_tester_component_t;
ORTE_MODULE_DECLSPEC extern orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component;
extern orte_sensor_base_module_t orte_sensor_ft_tester_module;
extern opal_rng_buff_t orte_sensor_ft_rng_buff;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,141 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_ft_tester.h"
/*
* Local functions
*/
static int orte_sensor_ft_tester_register (void);
static int orte_sensor_ft_tester_open(void);
static int orte_sensor_ft_tester_close(void);
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority);
/* Component descriptor for the ft_tester sensor.  Only the embedded base
 * sensor component is initialized here; the remaining fields (fail_prob,
 * daemon_fail_prob, multi_fail) are filled in by the register/open hooks.
 */
orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"ft_tester", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_ft_tester_open, /* component open */
orte_sensor_ft_tester_close, /* component close */
orte_sensor_ft_tester_query, /* component query */
orte_sensor_ft_tester_register /* component register */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* NOTE(review): the file sensor labels this slot "data being sensed";
 * this component supplies none - confirm against sensor.h */
NULL
}
};
static char *daemon_fail_prob = NULL;
static char *fail_prob = NULL;
opal_rng_buff_t orte_sensor_ft_rng_buff;
/**
 * Register the MCA parameters of the ft_tester component.
 *
 * The two probabilities are registered as strings and stored in the
 * file-scope fail_prob / daemon_fail_prob pointers; the boolean is
 * registered directly on the component's multi_fail field.  Results of
 * the registration calls are intentionally discarded.
 *
 * @return ORTE_SUCCESS in all cases.
 */
static int orte_sensor_ft_tester_register (void)
{
    mca_base_component_t *base = &mca_sensor_ft_tester_component.super.base_version;

    /* per-child kill probability, unset by default */
    fail_prob = NULL;
    (void) mca_base_component_var_register(base, "fail_prob",
                                           "Probability of killing a single executable",
                                           MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                           &fail_prob);

    /* one kill per sample unless told otherwise */
    mca_sensor_ft_tester_component.multi_fail = false;
    (void) mca_base_component_var_register(base, "multi_allowed",
                                           "Allow multiple executables to be killed at one time",
                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                           &mca_sensor_ft_tester_component.multi_fail);

    /* daemon kill probability, unset by default */
    daemon_fail_prob = NULL;
    (void) mca_base_component_var_register(base, "daemon_fail_prob",
                                           "Probability of killing a daemon",
                                           MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                           &daemon_fail_prob);

    return ORTE_SUCCESS;
}
static int orte_sensor_ft_tester_open(void)
{
/* lookup parameters */
if (NULL != fail_prob) {
mca_sensor_ft_tester_component.fail_prob = strtof(fail_prob, NULL);
if (1.0 < mca_sensor_ft_tester_component.fail_prob) {
/* given in percent */
mca_sensor_ft_tester_component.fail_prob /= 100.0;
}
} else {
mca_sensor_ft_tester_component.fail_prob = 0.0;
}
if (NULL != daemon_fail_prob) {
mca_sensor_ft_tester_component.daemon_fail_prob = strtof(daemon_fail_prob, NULL);
if (1.0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
/* given in percent */
mca_sensor_ft_tester_component.daemon_fail_prob /= 100.0;
}
} else {
mca_sensor_ft_tester_component.daemon_fail_prob = 0.0;
}
return ORTE_SUCCESS;
}
/**
 * Component query hook.
 *
 * The module is offered only when at least one of the two failure
 * probabilities is positive; in that case the component RNG is seeded
 * from the process id.
 *
 * @param module   receives the ft_tester module, or NULL when declined
 * @param priority receives 1 when offered, 0 when declined
 *
 * @return ORTE_SUCCESS when the module is offered, ORTE_ERROR otherwise.
 */
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority)
{
    /* neither probability set: decline to be selected */
    if (!(0.0 < mca_sensor_ft_tester_component.fail_prob) &&
        !(0.0 < mca_sensor_ft_tester_component.daemon_fail_prob)) {
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 1;  /* at the bottom */
    *module = (mca_base_module_t *)&orte_sensor_ft_tester_module;
    /* seed the RNG --- Not sure if we should assume all procs use
     * the same seed?
     */
    opal_srand(&orte_sensor_ft_rng_buff, (uint32_t) getpid());
    return ORTE_SUCCESS;
}
/**
 * Component close hook for the ft_tester sensor.
 *
 * Performs no work.
 *
 * @return ORTE_SUCCESS in all cases.
 */
static int orte_sensor_ft_tester_close(void)
{
return ORTE_SUCCESS;
}

38
orte/mca/sensor/heartbeat/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,38 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help-message file shipped with the component.
dist_ompidata_DATA = help-orte-sensor-heartbeat.txt
# Source files compiled into the component; the same list feeds both the
# DSO and the static variant below.
sources = \
sensor_heartbeat.c \
sensor_heartbeat.h \
sensor_heartbeat_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty.
if MCA_BUILD_orte_sensor_heartbeat_DSO
component_noinst =
component_install = mca_sensor_heartbeat.la
else
component_noinst = libmca_sensor_heartbeat.la
component_install =
endif
# DSO variant: installed into $(ompilibdir).
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_heartbeat_la_SOURCES = $(sources)
mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version
# Static variant: built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_heartbeat_la_SOURCES =$(sources)
libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version

24
orte/mca/sensor/heartbeat/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_heartbeat_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/heartbeat/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -0,0 +1,21 @@
# -*- text -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

279
orte/mca/sensor/heartbeat/sensor_heartbeat.c Обычный файл
Просмотреть файл

@ -0,0 +1,279 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/event/event.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_heartbeat.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void sample(void);
/* instantiate the module: init, finalize, start and sample are provided;
 * the fourth and sixth slots are left NULL.
 * NOTE(review): the sixth slot holds the log function in the resusage
 * module; the meaning of the fourth slot is not visible here - confirm
 * against the orte_sensor_base_module_t definition in sensor.h.
 */
orte_sensor_base_module_t orte_sensor_heartbeat_module = {
init,
finalize,
start,
NULL,
sample,
NULL
};
/* declare the local functions */
static void check_heartbeat(int fd, short event, void *arg);
static void recv_beats(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata);
/* local globals */
static orte_job_t *daemons=NULL;
static opal_event_t check_ev;
static bool check_active = false;
static struct timeval check_time;
/**
 * Module init: post the heartbeat receive and locate the daemon job.
 *
 * HNPs and aggregators post a persistent, wildcard receive on the
 * heartbeat tag.  An HNP additionally caches the job object of its own
 * job in the file-scope "daemons" pointer.
 *
 * @return ORTE_SUCCESS in all cases.
 */
static int init(void)
{
    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s initializing heartbeat recvs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* only HNPs and aggregators listen for heartbeats */
    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
        orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT,
                                ORTE_RML_PERSISTENT, recv_beats, NULL);
    }

    /* the HNP tracks the daemons belonging to its own job */
    if (ORTE_PROC_IS_HNP) {
        daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    }

    return ORTE_SUCCESS;
}
/**
 * Module finalize: cancel the heartbeat receive and, if the periodic
 * check is armed, remove its event and mark it inactive.
 */
static void finalize(void)
{
    /* stop listening for heartbeats */
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT);

    /* nothing else to undo when the check timer is not armed */
    if (!check_active) {
        return;
    }

    opal_event_del(&check_ev);
    check_active = false;
}
/**
 * Arm the periodic heartbeat check.
 *
 * Does nothing when the check is already armed or when no daemon job
 * object is known.  Otherwise a timer event is scheduled with a period of
 * three times the seconds part of the base sampling rate.
 *
 * @param job not used by this function
 */
static void start(orte_jobid_t job)
{
    if (check_active || NULL == daemons) {
        return;
    }

    /* check window: three sampling periods, whole seconds only */
    check_time.tv_usec = 0;
    check_time.tv_sec = 3 * orte_sensor_base.rate.tv_sec;

    opal_event_evtimer_set(orte_event_base, &check_ev, check_heartbeat, &check_ev);
    opal_event_evtimer_add(&check_ev, &check_time);
    check_active = true;
}
/**
 * Send one heartbeat message.
 *
 * The message goes to our daemon when ORTE_PROC_IS_CM is true, otherwise
 * to the HNP.  Nothing is sent while the runtime is aborting, finalizing
 * or not yet initialized, or while the target name is still invalid.
 * When sample logging is enabled, the accumulated sample bucket is copied
 * into the message and replaced by a fresh, empty bucket.
 */
static void sample(void)
{
    orte_process_name_t *target;
    opal_buffer_t *msg;
    int rc;

    /* stay quiet while aborting or shutting down */
    if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
        return;
    }

    /* CM procs report to their daemon, everyone else to the HNP */
    target = ORTE_PROC_IS_CM ? ORTE_PROC_MY_DAEMON : ORTE_PROC_MY_HNP;

    /* an undefined target means nobody is listening yet */
    if (ORTE_JOBID_INVALID == target->jobid ||
        ORTE_VPID_INVALID == target->vpid) {
        opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                            "%s sensor:heartbeat: HNP is not defined",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s sending heartbeat",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    msg = OBJ_NEW(opal_buffer_t);

    /* piggy-back the collected samples, then start a new bucket */
    if (orte_sensor_base.log_samples) {
        opal_dss.copy_payload(msg, orte_sensor_base.samples);
        OBJ_RELEASE(orte_sensor_base.samples);
        orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
    }

    /* hand the message to the RML; on failure we still own it */
    rc = orte_rml.send_buffer_nb(target, msg, ORTE_RML_TAG_HEARTBEAT,
                                 orte_rml_send_callback, NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
    }
}
/* this function automatically gets periodically called
* by the event library so we can check on the state
* of the various orteds
*/
static void check_heartbeat(int fd, short dummy, void *arg)
{
int v;
orte_proc_t *proc;
opal_event_t *tmp = (opal_event_t*)arg;
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
"%s sensor:check_heartbeat",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
"%s IGNORING CHECK abnorm_term %s fin %s init %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_abnormal_term_ordered ? "TRUE" : "FALSE",
orte_finalizing ? "TRUE" : "FALSE",
orte_initialized ? "TRUE" : "FALSE"));
check_active = false;
return;
}
for (v=0; v < daemons->procs->size; v++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
continue;
}
/* ignore myself */
if (proc->name.vpid == ORTE_PROC_MY_NAME->vpid) {
continue;
}
if (ORTE_PROC_STATE_RUNNING != proc->state) {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:heartbeat DAEMON %s IS NOT RUNNING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
continue;
}
if (0 == proc->beat) {
/* no heartbeat recvd in last window */
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:check_heartbeat FAILED for daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED);
} else {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s HEARTBEAT DETECTED FOR %s: NUM BEATS %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), proc->beat));
}
/* reset for next period */
proc->beat = 0;
}
/* reset the timer */
opal_event_evtimer_add(tmp, &check_time);
}
/**
 * RML receive callback for heartbeat messages.
 *
 * Counts the beat against the sending daemon (when the daemon job object
 * is known), flips a daemon previously marked HEARTBEAT_FAILED back to
 * RUNNING, and then unpacks every sample buffer carried in the message,
 * handing each one to orte_sensor_base_log() under the component name
 * packed at its front.
 *
 * @param status  not used by this function
 * @param sender  name of the process that sent the heartbeat
 * @param buffer  message payload: zero or more packed sample buffers
 * @param tag     not used by this function
 * @param cbdata  not used by this function
 */
static void recv_beats(int status, orte_process_name_t* sender,
                       opal_buffer_t *buffer,
                       orte_rml_tag_t tag, void *cbdata)
{
    orte_proc_t *proc;
    int rc, n;
    char *component=NULL;
    opal_buffer_t *buf=NULL;

    opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                        "%s received beat from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(sender));

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
        return;
    }

    /* get this daemon's object */
    if (NULL != daemons) {
        if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s marked beat from %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(sender)));
            proc->beat++;
            /* if this daemon has reappeared, reset things */
            if (ORTE_PROC_STATE_HEARTBEAT_FAILED == proc->state) {
                proc->state = ORTE_PROC_STATE_RUNNING;
            }
        }
    }

    /* unload any sampled data */
    n=1;
    while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &buf, &n, OPAL_BUFFER))) {
        if (NULL != buf) {
            n=1;
            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &component, &n, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                /* the sample buffer was handed to us by the outer unpack;
                 * release it here or it is lost when we leave the loop
                 */
                OBJ_RELEASE(buf);
                break;
            }
            orte_sensor_base_log(component, buf);
            OBJ_RELEASE(buf);
            free(component);
            n=1;
        }
    }
    /* running off the end of the message is the normal way out of the loop */
    if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
    }
}

Просмотреть файл

@ -0,0 +1,32 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Heartbeat sensor
*/
#ifndef ORTE_SENSOR_HEARTBEAT_H
#define ORTE_SENSOR_HEARTBEAT_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
ORTE_MODULE_DECLSPEC extern orte_sensor_base_component_t mca_sensor_heartbeat_component;
extern orte_sensor_base_module_t orte_sensor_heartbeat_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,75 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_heartbeat.h"
/*
* Local functions
*/
static int orte_sensor_heartbeat_open(void);
static int orte_sensor_heartbeat_close(void);
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority);
/* Component descriptor for the heartbeat sensor.  No register hook is
 * listed, so the initializer stops after the query entry.
 */
orte_sensor_base_component_t mca_sensor_heartbeat_component = {
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"heartbeat", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_heartbeat_open, /* component open */
orte_sensor_heartbeat_close, /* component close */
orte_sensor_heartbeat_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* NOTE(review): the file sensor labels this slot "data being sensed" -
 * confirm against sensor.h */
"heartbeat"
};
/**
 * Component open hook for the heartbeat sensor.
 *
 * Performs no work.
 *
 * @return ORTE_SUCCESS in all cases.
 */
static int orte_sensor_heartbeat_open(void)
{
return ORTE_SUCCESS;
}
/**
 * Component query hook: always offers the heartbeat module.
 *
 * @param module   receives the address of orte_sensor_heartbeat_module
 * @param priority receives the selection priority (5)
 *
 * @return ORTE_SUCCESS in all cases.
 */
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *) &orte_sensor_heartbeat_module;
    /* lower than all other samplers so that their data gets included in heartbeat */
    *priority = 5;

    return ORTE_SUCCESS;
}
/**
 * Component close hook for the heartbeat sensor.
 *
 * Performs no work.
 *
 * @return ORTE_SUCCESS in all cases.
 */
static int orte_sensor_heartbeat_close(void)
{
return ORTE_SUCCESS;
}

38
orte/mca/sensor/resusage/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,38 @@
#
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help-message file shipped with the component.
dist_ompidata_DATA = help-orte-sensor-resusage.txt
# Source files compiled into the component; the same list feeds both the
# DSO and the static variant below.
sources = \
sensor_resusage.c \
sensor_resusage.h \
sensor_resusage_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty.
if MCA_BUILD_orte_sensor_resusage_DSO
component_noinst =
component_install = mca_sensor_resusage.la
else
component_noinst = libmca_sensor_resusage.la
component_install =
endif
# DSO variant: installed into $(ompilibdir).
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_resusage_la_SOURCES = $(sources)
mca_sensor_resusage_la_LDFLAGS = -module -avoid-version
# Static variant: built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_resusage_la_SOURCES =$(sources)
libmca_sensor_resusage_la_LDFLAGS = -module -avoid-version

24
orte/mca/sensor/resusage/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_resusage_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_resusage_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/resusage/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -0,0 +1,21 @@
# -*- text -*-
#
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

478
orte/mca/sensor/resusage/sensor_resusage.c Обычный файл
Просмотреть файл

@ -0,0 +1,478 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_ring_buffer.h"
#include "opal/dss/dss.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "opal/mca/db/db.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/orted/orted.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_resusage.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void sample(void);
static void res_log(opal_buffer_t *sample);
/* instantiate the module: init, finalize, sample and the log function
 * (res_log) are provided; the third and fourth slots are left NULL.
 * NOTE(review): the heartbeat module puts its start function in the third
 * slot; the meaning of the fourth slot is not visible here - confirm
 * against the orte_sensor_base_module_t definition in sensor.h.
 */
orte_sensor_base_module_t orte_sensor_resusage_module = {
init,
finalize,
NULL,
NULL,
sample,
res_log
};
static bool log_enabled = true;
static orte_node_t *my_node;
static orte_proc_t *my_proc;
/**
 * Module init: bind the file-scope my_proc / my_node pointers.
 *
 * When no job object exists for our own job, fresh proc and node objects
 * are created.  Otherwise our proc entry is looked up in the job's proc
 * array and its node is taken from that entry; both objects are retained
 * so that finalize() can release them.
 *
 * @return ORTE_SUCCESS on success, ORTE_ERR_NOT_FOUND when either our
 *         proc entry or its node is missing.  On failure my_proc is left
 *         NULL so that no un-retained object is released later.
 */
static int init(void)
{
    orte_job_t *jdata;

    jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (NULL == jdata) {
        /* no job data available - use private objects */
        my_proc = OBJ_NEW(orte_proc_t);
        my_node = OBJ_NEW(orte_node_t);
        return ORTE_SUCCESS;
    }

    my_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, ORTE_PROC_MY_NAME->vpid);
    if (NULL == my_proc) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    my_node = my_proc->node;
    if (NULL == my_node) {
        /* we never retained the proc: drop the pointer so a later
         * finalize() cannot release a reference we do not hold
         */
        my_proc = NULL;
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* protect the objects */
    OBJ_RETAIN(my_proc);
    OBJ_RETAIN(my_node);

    return ORTE_SUCCESS;
}
/**
 * Module finalize: release my_node and my_proc when they are set.
 */
static void finalize(void)
{
    /* drop our reference on the proc object, if any */
    if (NULL != my_proc) {
        OBJ_RELEASE(my_proc);
    }

    /* drop our reference on the node object, if any */
    if (NULL != my_node) {
        OBJ_RELEASE(my_node);
    }
}
/**
 * Collect resource-usage statistics and enforce the memory limits.
 *
 * Steps, in order:
 *  1. Query pstat for this process and the node, stamp the process stats
 *     with nodename/rank, push both onto the local ring buffers and pack
 *     them (preceded by the component name "resusage" and the nodename)
 *     into a local buffer.
 *  2. Do the same (process stats only) for every live local child.
 *  3. If anything was packed, append the local buffer to the shared
 *     sample bucket orte_sensor_base.samples.
 *  4. If a node memory limit is set and the in-use fraction reaches it,
 *     report the child with the largest vsize to the state machine, or
 *     abort ourselves when no such child exists; then return.
 *  5. If a per-process memory limit is set, report every child whose
 *     vsize reaches it.
 *
 * Any pack or query failure is logged and ends the sampling pass early.
 */
static void sample(void)
{
    opal_pstats_t *stats, *st;
    opal_node_stats_t *nstats, *nst;
    int rc, i;
    orte_proc_t *child, *hog=NULL;
    float in_use, max_mem;
    opal_buffer_t buf, *bptr;
    char *comp;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "sample:resusage sampling resource usage"));

    /* setup a buffer for our stats */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* pack our name; the temporary copy is freed on success and failure alike */
    comp = strdup("resusage");
    rc = opal_dss.pack(&buf, &comp, 1, OPAL_STRING);
    free(comp);
    if (OPAL_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* update stats on ourself and the node */
    stats = OBJ_NEW(opal_pstats_t);
    nstats = OBJ_NEW(opal_node_stats_t);
    if (ORTE_SUCCESS != (rc = opal_pstat.query(orte_process_info.pid, stats, nstats))) {
        ORTE_ERROR_LOG(rc);
        /* both objects came from OBJ_NEW, so both must be released */
        OBJ_RELEASE(stats);
        OBJ_RELEASE(nstats);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* the stats framework can't know nodename or rank */
    strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN);
    stats->rank = ORTE_PROC_MY_NAME->vpid;

    /* locally save the stats, releasing whatever the ring buffers push out */
    if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&my_proc->stats, stats))) {
        OBJ_RELEASE(st);
    }
    if (NULL != (nst = (opal_node_stats_t*)opal_ring_buffer_push(&my_node->stats, nstats))) {
        /* release the popped value */
        OBJ_RELEASE(nst);
    }

    /* pack them */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &nstats, 1, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* loop through our children and update their stats */
    if (NULL != orte_local_children) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!child->alive) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            stats = OBJ_NEW(opal_pstats_t);
            if (ORTE_SUCCESS != opal_pstat.query(child->pid, stats, NULL)) {
                /* may hit a race condition where the process has
                 * terminated, so just ignore any error
                 */
                OBJ_RELEASE(stats);
                continue;
            }
            /* the stats framework can't know nodename or rank */
            strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN);
            stats->rank = child->name.vpid;
            /* store it */
            if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) {
                OBJ_RELEASE(st);
            }
            /* pack them */
            if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&buf);
                return;
            }
        }
    }

    /* xfer any data for transmission */
    if (0 < buf.bytes_used) {
        bptr = &buf;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
    }
    OBJ_DESTRUCT(&buf);

    /* are there any issues with node-level usage? */
    nst = (opal_node_stats_t*)opal_ring_buffer_poke(&my_node->stats, -1);
    if (NULL != nst && 0.0 < mca_sensor_resusage_component.node_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s CHECKING NODE MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* compute the percentage of node memory in-use */
        in_use = 1.0 - (nst->free_mem / nst->total_mem);
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s PERCENT USED: %f LIMIT: %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             in_use, mca_sensor_resusage_component.node_memory_limit));
        if (mca_sensor_resusage_component.node_memory_limit <= in_use) {
            /* loop through our children and find the biggest hog */
            hog = NULL;
            max_mem = 0.0;
            /* same NULL guard as the stats loop above: without children
             * the search simply finds no hog
             */
            if (NULL != orte_local_children) {
                for (i=0; i < orte_local_children->size; i++) {
                    if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                        continue;
                    }
                    if (!child->alive) {
                        continue;
                    }
                    if (0 == child->pid) {
                        /* race condition */
                        continue;
                    }
                    if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                        continue;
                    }
                    OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
                                         "%s PROC %s AT VSIZE %f",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&child->name), st->vsize));
                    if (max_mem < st->vsize) {
                        hog = child;
                        max_mem = st->vsize;
                    }
                }
            }
            if (NULL == hog) {
                /* if all children dead and we are still too big,
                 * then we must be the culprit - abort
                 */
                OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                                     "%s NO CHILD: COMMITTING SUICIDE",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                orte_errmgr.abort(ORTE_ERR_MEM_LIMIT_EXCEEDED, NULL);
            } else {
                /* report the problem */
                OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                                     "%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&hog->name)));
                ORTE_ACTIVATE_PROC_STATE(&hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
            /* since we have ordered someone to die, we've done enough for this
             * time around - don't check proc limits as well
             */
            return;
        }
    }

    /* check proc limits */
    if (0.0 < mca_sensor_resusage_component.proc_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s CHECKING PROC MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* no children array means there is nobody to check */
        if (NULL == orte_local_children) {
            return;
        }
        /* check my children first */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!child->alive) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
                                 "%s PROC %s AT VSIZE %f",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name), st->vsize));
            if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) {
                /* report the problem */
                ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
        }
    }
}
/*
 * Log one resource-usage sample to the database.
 *
 * The buffer is read in this order: the node name (OPAL_STRING), one
 * node-stats object (OPAL_NODE_STAT), and then process-stats objects
 * (OPAL_PSTAT) until the unpack fails. Node stats are written to the
 * "nodestats" log when log_node_stats is set; each process-stats object
 * is written to the "procstats" log when log_process_stats is set.
 *
 * If the database rejects a record, logging is quietly disabled by
 * clearing log_enabled, and every later call returns immediately.
 *
 * @param sample  buffer holding the packed sample
 */
static void res_log(opal_buffer_t *sample)
{
    /* number of key/value pairs in a node record and a process record */
    enum { NUM_NODE_FIELDS = 13, NUM_PROC_FIELDS = 14 };

    opal_pstats_t *st = NULL;
    opal_node_stats_t *nst = NULL;
    int rc, n, i;
    opal_value_t kv[NUM_PROC_FIELDS];
    char *node = NULL;

    if (!log_enabled) {
        return;
    }

    /* unpack the node name */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &node, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* unpack the node stats */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &nst, &n, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (mca_sensor_resusage_component.log_node_stats) {
        /* convert this into an array of opal_value_t's - no clean way
         * to do this, so have to just manually map each field
         */
        for (i = 0; i < NUM_NODE_FIELDS; i++) {
            OBJ_CONSTRUCT(&kv[i], opal_value_t);
        }

        i = 0;
        kv[i].key = strdup("ctime");
        kv[i].type = OPAL_TIMEVAL;
        kv[i].data.tv.tv_sec = nst->sample_time.tv_sec;
        kv[i++].data.tv.tv_usec = nst->sample_time.tv_usec;

        /* every key is heap-allocated, like the others, because the
         * entries are destructed below; a string literal must not be
         * stored here
         */
        kv[i].key = strdup("hostname");
        kv[i].type = OPAL_STRING;
        kv[i++].data.string = strdup(node);

        kv[i].key = strdup("total_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->total_mem;

        kv[i].key = strdup("free_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->free_mem;

        kv[i].key = strdup("buffers");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->buffers;

        kv[i].key = strdup("cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->cached;

        kv[i].key = strdup("swap_total");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_total;

        kv[i].key = strdup("swap_free");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_free;

        kv[i].key = strdup("mapped");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->mapped;

        kv[i].key = strdup("swap_cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_cached;

        kv[i].key = strdup("la");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la;

        kv[i].key = strdup("la5");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la5;

        kv[i].key = strdup("la15");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la15;

        /* store it - all NUM_NODE_FIELDS entries, including la15 */
        if (ORTE_SUCCESS != (rc = opal_db.add_log("nodestats", kv, NUM_NODE_FIELDS))) {
            /* don't bark about it - just quietly disable the log */
            log_enabled = false;
        }

        /* destruct every entry that was constructed above */
        for (i = 0; i < NUM_NODE_FIELDS; i++) {
            OBJ_DESTRUCT(&kv[i]);
        }
    }
    OBJ_RELEASE(nst);

    if (mca_sensor_resusage_component.log_process_stats) {
        /* unpack all process stats */
        n = 1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(sample, &st, &n, OPAL_PSTAT))) {
            for (i = 0; i < NUM_PROC_FIELDS; i++) {
                OBJ_CONSTRUCT(&kv[i], opal_value_t);
            }

            kv[0].key = strdup("node");
            kv[0].type = OPAL_STRING;
            kv[0].data.string = strdup(st->node);

            kv[1].key = strdup("rank");
            kv[1].type = OPAL_INT32;
            kv[1].data.int32 = st->rank;

            kv[2].key = strdup("pid");
            kv[2].type = OPAL_PID;
            kv[2].data.pid = st->pid;

            kv[3].key = strdup("cmd");
            kv[3].type = OPAL_STRING;
            kv[3].data.string = strdup(st->cmd);

            /* the first two state characters are copied and then
             * NUL-terminated to form a string value
             */
            kv[4].key = strdup("state");
            kv[4].type = OPAL_STRING;
            kv[4].data.string = (char*)malloc(3 * sizeof(char));
            kv[4].data.string[0] = st->state[0];
            kv[4].data.string[1] = st->state[1];
            kv[4].data.string[2] = '\0';

            kv[5].key = strdup("time");
            kv[5].type = OPAL_TIMEVAL;
            kv[5].data.tv.tv_sec = st->time.tv_sec;
            kv[5].data.tv.tv_usec = st->time.tv_usec;

            kv[6].key = strdup("percent_cpu");
            kv[6].type = OPAL_FLOAT;
            kv[6].data.fval = st->percent_cpu;

            kv[7].key = strdup("priority");
            kv[7].type = OPAL_INT32;
            kv[7].data.int32 = st->priority;

            kv[8].key = strdup("num_threads");
            kv[8].type = OPAL_INT16;
            kv[8].data.int16 = st->num_threads;

            kv[9].key = strdup("vsize");
            kv[9].type = OPAL_FLOAT;
            kv[9].data.fval = st->vsize;

            kv[10].key = strdup("rss");
            kv[10].type = OPAL_FLOAT;
            kv[10].data.fval = st->rss;

            kv[11].key = strdup("peak_vsize");
            kv[11].type = OPAL_FLOAT;
            kv[11].data.fval = st->peak_vsize;

            kv[12].key = strdup("processor");
            kv[12].type = OPAL_INT16;
            kv[12].data.int16 = st->processor;

            kv[13].key = strdup("sample_time");
            kv[13].type = OPAL_TIMEVAL;
            kv[13].data.tv.tv_sec = st->sample_time.tv_sec;
            kv[13].data.tv.tv_usec = st->sample_time.tv_usec;

            /* store it */
            if (ORTE_SUCCESS != (rc = opal_db.add_log("procstats", kv, NUM_PROC_FIELDS))) {
                log_enabled = false;
            }
            for (i = 0; i < NUM_PROC_FIELDS; i++) {
                OBJ_DESTRUCT(&kv[i]);
            }
            OBJ_RELEASE(st);
            n = 1;
        }
        /* running off the end of the buffer is the normal way out of
         * the loop; anything else is a real error
         */
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }

cleanup:
    /* the node name was only ever copied with strdup, so this function
     * still owns the unpacked string (free(NULL) is a no-op)
     */
    free(node);
}

41
orte/mca/sensor/resusage/sensor_resusage.h Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Process Resource Utilization sensor
*/
#ifndef ORTE_SENSOR_RESUSAGE_H
#define ORTE_SENSOR_RESUSAGE_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
 * Resusage sensor component: the generic sensor component data followed
 * by this component's own settings.
 */
struct orte_sensor_resusage_component_t {
/* generic sensor component data */
orte_sensor_base_component_t super;
/* registered as the "sample_rate" MCA variable ("Sample rate in seconds") */
int sample_rate;
/* "node_memory_limit" MCA variable (an integer percentage) divided by 100 */
float node_memory_limit;
/* "proc_memory_limit" MCA variable ("Max virtual memory size in MBytes") */
float proc_memory_limit;
/* "log_node_stats" MCA variable: log the node stats */
bool log_node_stats;
/* "log_process_stats" MCA variable: log the process stats */
bool log_process_stats;
};
typedef struct orte_sensor_resusage_component_t orte_sensor_resusage_component_t;
/* the single instance of the resusage component */
ORTE_MODULE_DECLSPEC extern orte_sensor_resusage_component_t mca_sensor_resusage_component;
/* the module that the component's query function hands back */
extern orte_sensor_base_module_t orte_sensor_resusage_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,138 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_resusage.h"
/*
 * Local functions - the component entry points that are wired into the
 * mca_sensor_resusage_component initializer
 */
/* registers the component's MCA variables */
static int orte_sensor_resusage_register (void);
/* re-validates the sample rate and refreshes the float limits */
static int orte_sensor_resusage_open(void);
/* nothing to release - always succeeds */
static int orte_sensor_resusage_close(void);
/* returns the resusage module and its priority */
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority);
/*
 * The resusage component instance. Only the leading "super" member is
 * initialized here, positionally: base_version, then base_data, then
 * data_measured. The remaining members (sample_rate, the memory limits
 * and the log flags) are not listed, so C zero-initializes them; they
 * are assigned in the register function.
 */
orte_sensor_resusage_component_t mca_sensor_resusage_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"resusage", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_resusage_open, /* component open */
orte_sensor_resusage_close, /* component close */
orte_sensor_resusage_query, /* component query */
orte_sensor_resusage_register /* component register (MCA variables) */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
"procresource,noderesource" /* data_measured */
}
};
/*
 * Integer storage for the two memory-limit MCA variables, which are
 * registered as MCA_BASE_VAR_TYPE_INT. The register and open functions
 * convert these into the float fields of the component:
 * node_memory_limit is divided by 100, proc_memory_limit is cast as-is.
 */
static int node_memory_limit;
static int proc_memory_limit;
/**
 * Register the component's MCA variables and seed their defaults.
 *
 * Variables registered (all read-only scope, info level 9):
 *   - sample_rate:       int, default 0, "Sample rate in seconds"
 *   - node_memory_limit: int percentage, default 0; stored in the
 *                        component as a fraction (value / 100)
 *   - proc_memory_limit: int, default 0, "Max virtual memory size in
 *                        MBytes"; stored in the component as a float
 *   - log_node_stats:    bool, default false
 *   - log_process_stats: bool, default false
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM if sample_rate is negative
 */
static int orte_sensor_resusage_register (void)
{
    mca_base_component_t *c = &mca_sensor_resusage_component.super.base_version;

    mca_sensor_resusage_component.sample_rate = 0;
    (void) mca_base_component_var_register (c, "sample_rate", "Sample rate in seconds (default: 0)",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.sample_rate);
    if (mca_sensor_resusage_component.sample_rate < 0) {
        /* zero is the default and is accepted; only negatives are illegal */
        opal_output(0, "Illegal value %d - must be >= 0", mca_sensor_resusage_component.sample_rate);
        return ORTE_ERR_BAD_PARAM;
    }

    /* the limits are registered as ints and converted to float afterwards */
    node_memory_limit = 0;
    (void) mca_base_component_var_register (c, "node_memory_limit",
                                            "Percentage of total memory that can be in-use",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &node_memory_limit);
    mca_sensor_resusage_component.node_memory_limit = (float)node_memory_limit/100.0;

    proc_memory_limit = 0;
    (void) mca_base_component_var_register (c, "proc_memory_limit",
                                            "Max virtual memory size in MBytes",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &proc_memory_limit);
    mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;

    mca_sensor_resusage_component.log_node_stats = false;
    (void) mca_base_component_var_register (c, "log_node_stats", "Log the node stats",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.log_node_stats);

    mca_sensor_resusage_component.log_process_stats = false;
    (void) mca_base_component_var_register (c, "log_process_stats", "Log the process stats",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.log_process_stats);

    return ORTE_SUCCESS;
}
/*
 * Component open: reject a negative sample rate, then refresh the float
 * limits in the component from the integer MCA variable storage.
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_FATAL if sample_rate is negative
 */
static int orte_sensor_resusage_open(void)
{
    if (mca_sensor_resusage_component.sample_rate < 0) {
        /* zero is the default and is accepted; only negatives are illegal */
        opal_output(0, "Illegal value %d - must be >= 0", mca_sensor_resusage_component.sample_rate);
        return ORTE_ERR_FATAL;
    }

    /* percentage -> fraction */
    mca_sensor_resusage_component.node_memory_limit = (float) node_memory_limit/100.0;
    mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;

    return ORTE_SUCCESS;
}
/*
 * Component query: hand back the resusage module together with the
 * priority this component reports.
 *
 * @param module    set to the resusage module
 * @param priority  set to 100
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *) &orte_sensor_resusage_module;

    /* ahead of heartbeat */
    *priority = 100;

    return ORTE_SUCCESS;
}
/**
 * Component close. This function releases nothing and simply reports
 * success.
 *
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_resusage_close(void)
{
    return ORTE_SUCCESS;
}

107
orte/mca/sensor/sensor.h Обычный файл
Просмотреть файл

@ -0,0 +1,107 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file:
*
*/
#ifndef MCA_SENSOR_H
#define MCA_SENSOR_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
BEGIN_C_DECLS
/*
 * Component functions - all MUST be provided!
 *
 * These two signatures are used both by the public API module and by
 * each component module; both take the job id as their only argument.
 */
/* start collecting data */
typedef void (*orte_sensor_API_module_start_fn_t)(orte_jobid_t job);
/* stop collecting data */
typedef void (*orte_sensor_API_module_stop_fn_t)(orte_jobid_t job);
/* API module - the type of the global orte_sensor function table */
/*
 * Ver 1.0
 */
struct orte_sensor_base_API_module_1_0_0_t {
/* start collecting data for a job */
orte_sensor_API_module_start_fn_t start;
/* stop collecting data for a job */
orte_sensor_API_module_stop_fn_t stop;
};
typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t;
/* unversioned alias for the current (1.0.0) API module type */
typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t;
/* initialize the module; returns an int status */
typedef int (*orte_sensor_base_module_init_fn_t)(void);
/* finalize the module */
typedef void (*orte_sensor_base_module_finalize_fn_t)(void);
/* tell the module to sample its sensor */
typedef void (*orte_sensor_base_module_sample_fn_t)(void);
/* pass a buffer to the module for logging */
typedef void (*orte_sensor_base_module_log_fn_t)(opal_buffer_t *sample);
/*
 * Component modules Ver 1.0
 *
 * The function table each sensor component supplies. start and stop
 * reuse the public API signatures.
 */
struct orte_sensor_base_module_1_0_0_t {
/* initialize the module */
orte_sensor_base_module_init_fn_t init;
/* finalize the module */
orte_sensor_base_module_finalize_fn_t finalize;
/* start collecting data for a job */
orte_sensor_API_module_start_fn_t start;
/* stop collecting data for a job */
orte_sensor_API_module_stop_fn_t stop;
/* sample the sensor */
orte_sensor_base_module_sample_fn_t sample;
/* log a buffer of sampled data */
orte_sensor_base_module_log_fn_t log;
};
typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t;
/* unversioned alias for the current (1.0.0) module type */
typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t;
/*
 * the standard component data structure
 */
struct orte_sensor_base_component_1_0_0_t {
/* MCA component version and entry points */
mca_base_component_t base_version;
/* MCA component metadata */
mca_base_component_data_t base_data;
/* string naming what the component measures; the resusage component
 * sets it to "procresource,noderesource"
 */
char *data_measured;
};
typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t;
/* unversioned alias for the current (1.0.0) component type */
typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t;
/*
 * Macro for use in components that are of type sensor v1.0.0
 *
 * Expands to a comma-separated list meant to be the first entries of a
 * component's mca_base_component_t initializer: the MCA version macro,
 * then the framework name "sensor" and the framework version 1, 0, 0.
 */
#define ORTE_SENSOR_BASE_VERSION_1_0_0 \
/* sensor v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* sensor v1.0 */ \
"sensor", 1, 0, 0
/* Global structure for accessing sensor functions (start/stop).
 * Declared here only; the definition is elsewhere.
 */
ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */
END_C_DECLS
#endif /* MCA_SENSOR_H */

51
orte/mca/sensor/sensor_types.h Обычный файл
Просмотреть файл

@ -0,0 +1,51 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef ORTE_MCA_SENSOR_TYPES_H
#define ORTE_MCA_SENSOR_TYPES_H
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/dss/dss_types.h"
/*
* General SENSOR types - instanced in runtime/orte_globals.c
*/
BEGIN_C_DECLS
/* Sensor scale identifiers (anonymous enum, values 0, 1, 2).
 * NOTE(review): nothing in this header uses them - presumably they select
 * how a sensor reading is scaled; confirm against the code that uses them.
 */
enum {
ORTE_SENSOR_SCALE_LINEAR,
ORTE_SENSOR_SCALE_LOG,
ORTE_SENSOR_SCALE_SIGMOID
};
/*
 * Structure for passing data from sensors
 *
 * An OPAL object (opal_object_t is its first member), with its class
 * declared by the OBJ_CLASS_DECLARATION below.
 */
typedef struct {
/* OPAL object base */
opal_object_t super;
/* sensor name string - presumably the sensor that produced the data */
char *sensor;
/* time stamp - presumably when the data was taken; confirm with producers */
struct timeval timestamp;
/* the data itself, carried as an opaque byte object */
opal_byte_object_t data;
} orte_sensor_data_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t);
END_C_DECLS
#endif