1
1

Add two new sensor modules - one to monitor core temperatures, and the other to monitor resource usage using the sigar library

This commit was SVN r30335.
Этот коммит содержится в:
Ralph Castain 2014-01-20 19:35:48 +00:00
родитель d2d4eeb2d6
Коммит 9b2066cfba
12 изменённых файлов: 1839 добавлений и 0 удалений

37
orte/mca/sensor/coretemp/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
#
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shipped with the distribution and installed with the package data
dist_pkgdata_DATA = help-orte-sensor-coretemp.txt
# Source files that make up the coretemp sensor component
sources = \
sensor_coretemp.c \
sensor_coretemp.h \
sensor_coretemp_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_coretemp_DSO
component_noinst =
component_install = mca_sensor_coretemp.la
else
component_noinst = libmca_sensor_coretemp.la
component_install =
endif
# DSO build: the component is installed as a loadable module
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_coretemp_la_SOURCES = $(sources)
mca_sensor_coretemp_la_LDFLAGS = -module -avoid-version
# Static build: the component is built as a library that is not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_coretemp_la_SOURCES =$(sources)
libmca_sensor_coretemp_la_LDFLAGS = -module -avoid-version

44
orte/mca/sensor/coretemp/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,44 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl
# MCA_orte_sensor_coretemp_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# The component is considered only when --with-coretemp is given. On the
# listed Linux hosts the sysfs directory of the first coretemp device must
# be readable on the build machine - if the request cannot be satisfied
# the configure run is aborted rather than silently skipping the component.
AC_DEFUN([MCA_orte_sensor_coretemp_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/coretemp/Makefile])
AC_ARG_WITH([coretemp],
[AC_HELP_STRING([--with-coretemp],
[Build coretemp support (default: no)])],
[], with_coretemp=no)
# do not build if support not requested
AS_IF([test "$with_coretemp" != "no"],
[case "${host}" in
i?86-*linux*|x86_64*linux*|ia64-*linux*|powerpc-*linux*|powerpc64-*linux*|sparc*-*linux*)
AS_IF([test -r "/sys/bus/platform/devices/coretemp.0"],
[sensor_coretemp_happy=yes],
[AC_MSG_WARN([Core temperature sensing was requested but the required directory])
AC_MSG_WARN([was not found. This usually indicates that the \"coretemp\"])
AC_MSG_WARN([kernel module is not installed. Please install the module])
AC_MSG_WARN([and try again, or remove the core temperature sensing request.])
sensor_coretemp_happy=no])
;;
*)
AC_MSG_WARN([Core temperature sensing was requested but is only supported on Linux systems])
sensor_coretemp_happy=no
;;
esac
AS_IF([test "$sensor_coretemp_happy" = "yes"],
[$1],
[AC_MSG_ERROR([Cannot continue])
$2])
],
[$2])
])dnl

Просмотреть файл

@ -0,0 +1,20 @@
# -*- text -*-
#
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

412
orte/mca/sensor/coretemp/sensor_coretemp.c Обычный файл
Просмотреть файл

@ -0,0 +1,412 @@
/*
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#include "opal_stdint.h"
#include "opal/class/opal_list.h"
#include "opal/dss/dss.h"
#include "opal/util/os_path.h"
#include "opal/util/output.h"
#include "opal/util/os_dirpath.h"
#include "opal/mca/db/db.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_coretemp.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
static void coretemp_sample(void);
static void coretemp_log(opal_buffer_t *buf);
/* instantiate the module: the table through which this component's
 * entry points are exported; the initializers are positional, so their
 * order must match the field order of orte_sensor_base_module_t
 */
orte_sensor_base_module_t orte_sensor_coretemp_module = {
init,
finalize,
start,
stop,
coretemp_sample,
coretemp_log
};
/* One entry per temperature input file discovered by init() */
typedef struct {
opal_list_item_t super;
/* full path of the temp*_input file that is re-read on every sample */
char *file;
/* running index of the coretemp device directory the file was found in */
int socket;
/* first line of the matching temp*_label file */
char *label;
/* value of the matching temp*_crit file divided by 100.0 */
float critical_temp;
/* value of the matching temp*_max file divided by 100.0 */
float max_temp;
} core_tracker_t;
/*
 * Constructor for core_tracker_t.
 *
 * Every field is given a defined starting value so that an entry that
 * is only partially filled in never exposes indeterminate data (the
 * original left socket and both thresholds uninitialized).
 */
static void ctr_con(core_tracker_t *trk)
{
    trk->file = NULL;
    trk->socket = 0;
    trk->label = NULL;
    trk->critical_temp = 0.0f;
    trk->max_temp = 0.0f;
}
/*
 * Destructor for core_tracker_t: releases the two strings owned by
 * the entry.
 */
static void ctr_des(core_tracker_t *trk)
{
    /* free() is a no-op on NULL, so unset members need no guard */
    free(trk->label);
    free(trk->file);
}
/* register core_tracker_t as a list-item class with the constructor
 * and destructor defined above
 */
OBJ_CLASS_INSTANCE(core_tracker_t,
opal_list_item_t,
ctr_con, ctr_des);
/* cleared by coretemp_log() once storing a sample fails, which turns
 * all later logging calls into no-ops
 */
static bool log_enabled = true;
/* list of core_tracker_t entries built by init() */
static opal_list_t tracking;
/*
 * Read the next line from fp and return it as a newly allocated string
 * with the line terminator removed.
 *
 * @param fp  stream to read from; NULL is tolerated and treated as
 *            "nothing to read"
 * @return    heap string owned by the caller (release with free()), or
 *            NULL at end of file, on a read error, or if memory cannot
 *            be allocated
 *
 * At most 1023 characters are consumed per call, so a longer line is
 * handed back in several pieces.
 */
static char *orte_getline(FILE *fp)
{
    char input[1024];
    char *buff;
    size_t len;

    if (NULL == fp) {
        return NULL;
    }
    if (NULL == fgets(input, sizeof(input), fp)) {
        return NULL;
    }
    /* cut at the newline only if there is one: the last line of a file
     * may lack it, and an empty read must not index before the buffer
     */
    len = strcspn(input, "\n");
    input[len] = '\0';
    buff = malloc(len + 1);
    if (NULL != buff) {
        memcpy(buff, input, len + 1);
    }
    return buff;
}
/*
 * Return the first line of the file <dir>/<prefix>_<suffix> as a heap
 * string (caller frees), or NULL if the file cannot be opened or read.
 */
static char *read_sensor_attr(const char *dir, const char *prefix, const char *suffix)
{
    char *filename = NULL;
    char *line;
    FILE *fp;

    if (0 > asprintf(&filename, "%s/%s_%s", dir, prefix, suffix)) {
        return NULL;
    }
    fp = fopen(filename, "r");
    free(filename);
    if (NULL == fp) {
        return NULL;
    }
    line = orte_getline(fp);
    fclose(fp);
    return line;
}

/*
 * Discover the temperature inputs exposed by the coretemp driver.
 *
 * Every directory under /sys/bus/platform/devices whose name starts
 * with "coretemp" is scanned for files named temp*_input. For each one
 * the companion _label, _crit and _max files are read and a
 * core_tracker_t is appended to the tracking list. An input whose
 * companion files cannot all be read is not tracked (the original code
 * dereferenced NULL in that situation).
 *
 * @return ORTE_SUCCESS if at least one input is tracked; the status of
 *         opal_os_dirpath_access() if the base directory is not
 *         accessible; ORTE_ERROR if it cannot be opened or nothing
 *         usable was found.
 */
static int init(void)
{
    static const char devices_dir[] = "/sys/bus/platform/devices";
    const size_t tlen = strlen("temp");
    const size_t ilen = strlen("_input");
    int ret;
    DIR *cur_dirp, *tdir;
    struct dirent *dir_entry, *entry;
    char *dirname, *ptr, *tmp;
    char *label, *crit, *max;
    size_t nlen;
    core_tracker_t *trk;
    int socket;

    OBJ_CONSTRUCT(&tracking, opal_list_t);

    if (ORTE_SUCCESS != (ret = opal_os_dirpath_access(devices_dir, 0))) {
        /* if the directory doesn't exist, or we don't have
         * access to it, then disqualify us
         */
        return ret;
    }

    /* open up the base directory so we can get a listing */
    if (NULL == (cur_dirp = opendir(devices_dir))) {
        return ORTE_ERROR;
    }

    socket = 0;
    while (NULL != (dir_entry = readdir(cur_dirp))) {
        /* look for coretemp directories */
        if (0 != strncmp(dir_entry->d_name, "coretemp", strlen("coretemp"))) {
            continue;
        }
        dirname = opal_os_path(false, devices_dir, dir_entry->d_name, NULL);
        if (NULL == dirname) {
            continue;
        }
        if (NULL == (tdir = opendir(dirname))) {
            free(dirname);
            continue;
        }

        while (NULL != (entry = readdir(tdir))) {
            /* a core temp file is named temp<something>_input; the
             * prefix test also rejects "." and ".."
             */
            nlen = strlen(entry->d_name);
            if (nlen < (tlen + ilen)) {
                continue;
            }
            if (0 != strncmp(entry->d_name, "temp", tlen)) {
                continue;
            }
            if (0 != strcmp(entry->d_name + nlen - ilen, "_input")) {
                continue;
            }

            /* take the part up to the first underscore as this is
             * the start of all the related file names
             */
            if (NULL == (tmp = strdup(entry->d_name))) {
                continue;
            }
            if (NULL == (ptr = strchr(tmp, '_'))) {
                /* unrecognized format */
                free(tmp);
                continue;
            }
            *ptr = '\0';

            /* look for label, critical and max info */
            label = read_sensor_attr(dirname, tmp, "label");
            crit = read_sensor_attr(dirname, tmp, "crit");
            max = read_sensor_attr(dirname, tmp, "max");
            free(tmp);
            if (NULL == label || NULL == crit || NULL == max) {
                free(label);
                free(crit);
                free(max);
                continue;
            }

            /* track the info for this core */
            trk = OBJ_NEW(core_tracker_t);
            if (NULL == trk) {
                free(label);
                free(crit);
                free(max);
                continue;
            }
            trk->socket = socket;
            trk->label = label;   /* ownership moves to the tracker */
            /* NOTE(review): the Linux hwmon sysfs documentation gives
             * these values in millidegrees Celsius, which suggests a
             * divisor of 1000.0. It is left at 100.0 because
             * coretemp_sample() scales its readings the same way and
             * the two must stay in agreement - confirm and change both.
             */
            trk->critical_temp = strtol(crit, NULL, 10)/100.0;
            trk->max_temp = strtol(max, NULL, 10)/100.0;
            free(crit);
            free(max);
            trk->file = opal_os_path(false, dirname, entry->d_name, NULL);
            if (NULL == trk->file) {
                OBJ_RELEASE(trk);
                continue;
            }

            /* add to our list */
            opal_list_append(&tracking, &trk->super);
        }
        closedir(tdir);
        free(dirname);
        socket++;
    }
    closedir(cur_dirp);

    if (0 == opal_list_get_size(&tracking)) {
        /* nothing to read */
        return ORTE_ERROR;
    }
    return ORTE_SUCCESS;
}
/* Module shutdown: tear down the tracking list that init() constructed */
static void finalize(void)
{
OPAL_LIST_DESTRUCT(&tracking);
}
/*
 * Start monitoring of local temps. This module keeps no per-job
 * state, so the hook has nothing to do.
 */
static void start(orte_jobid_t jobid)
{
    (void)jobid;
}
/*
 * Stop monitoring for a job. Nothing was set up by start(), so
 * nothing is undone here.
 */
static void stop(orte_jobid_t jobid)
{
    (void)jobid;
}
/*
 * Take one temperature sample of every tracked input and hand it to
 * the sensor base for transmission.
 *
 * The sample buffer holds, in order: the node name, the number of
 * tracked inputs, a timestamp string, and one float per line read from
 * each input file. If any input file cannot be opened the whole sample
 * is dropped, since a buffer with fewer readings than the advertised
 * count could not be unpacked by coretemp_log(). All exits go through
 * one cleanup path so the open file and the buffer are always released.
 */
static void coretemp_sample(void)
{
    int ret;
    core_tracker_t *trk;
    FILE *fp = NULL;
    char *temp;
    float degc;
    opal_buffer_t data, *bptr;
    int32_t ncores;
    time_t now;
    struct tm *tm_now;
    char time_str[40];
    char *timestamp_str;

    /* prep to store the results */
    OBJ_CONSTRUCT(&data, opal_buffer_t);

    /* store our hostname */
    if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    /* store the number of cores */
    ncores = (int32_t)opal_list_get_size(&tracking);
    if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &ncores, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    /* get the sample time and pass it along as a simple string; an
     * empty string is sent if the time cannot be formatted
     */
    now = time(NULL);
    tm_now = localtime(&now);
    if (NULL == tm_now ||
        0 == strftime(time_str, sizeof(time_str), "%F %T%z", tm_now)) {
        time_str[0] = '\0';
    }
    /* the packer wants the address of a char pointer, not of an array */
    timestamp_str = time_str;
    if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &timestamp_str, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    OPAL_LIST_FOREACH(trk, &tracking, core_tracker_t) {
        /* read the temp */
        if (NULL == trk->file || NULL == (fp = fopen(trk->file, "r"))) {
            opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                                "%s sensor:coretemp: unable to read %s - sample dropped",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                (NULL == trk->file) ? "NULL" : trk->file);
            goto cleanup;
        }
        while (NULL != (temp = orte_getline(fp))) {
            /* NOTE(review): the Linux hwmon sysfs documentation gives
             * this value in millidegrees Celsius, which suggests a
             * divisor of 1000.0; kept at 100.0 to stay consistent with
             * the thresholds computed in init() - confirm and change both.
             */
            degc = strtoul(temp, NULL, 10) / 100.0;
            free(temp);
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "%s sensor:coretemp: Socket %d %s temp %f max %f critical %f",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                trk->socket, trk->label, degc, trk->max_temp, trk->critical_temp);
            if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &degc, 1, OPAL_FLOAT))) {
                ORTE_ERROR_LOG(ret);
                goto cleanup;
            }
            /* check for exceed critical temp */
            if (trk->critical_temp < degc) {
                /* alert the errmgr - this is a critical problem */
                opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                    "%s sensor:coretemp: Socket %d %s CRITICAL",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    trk->socket, trk->label);
            } else if (trk->max_temp < degc) {
                /* alert the errmgr */
                opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                    "%s sensor:coretemp: Socket %d %s MAX",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    trk->socket, trk->label);
            }
        }
        fclose(fp);
        fp = NULL;
    }

    /* xfer the data for transmission */
    bptr = &data;
    if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
    }

cleanup:
    if (NULL != fp) {
        fclose(fp);
    }
    OBJ_DESTRUCT(&data);
}
/*
 * Unpack one coretemp sample and store it through opal_db.add_log().
 *
 * @param sample  buffer laid out as: host name string, int32 count of
 *                readings, timestamp string, then that many floats
 *
 * The record handed to the database has the timestamp under "ctime"
 * followed by one "core<i>" float per reading. If the store fails,
 * logging is quietly disabled for all later calls. Every exit runs the
 * common cleanup so the strings and the key/value array are released.
 */
static void coretemp_log(opal_buffer_t *sample)
{
    char *hostname = NULL;
    char *sampletime = NULL;
    int rc;
    int32_t n, ncores;
    opal_value_t *kv = NULL;
    float fval;
    int i, nkv = 0;

    if (!log_enabled) {
        return;
    }

    /* unpack the host this came from */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &hostname, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* and the number of cores on that host */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &ncores, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* sample time */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &sampletime, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s Received log from host %s with %d cores",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == hostname) ? "NULL" : hostname, ncores);

    /* the count comes off the wire: refuse values that would make the
     * array size below negative or overflow
     */
    if (ncores < 0 || INT32_MAX == ncores) {
        opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                            "%s sensor:coretemp: ignoring sample with invalid core count %d",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncores);
        goto cleanup;
    }

    /* xfr to storage - construct every slot up front so the cleanup
     * loop never destructs an object that was not constructed
     */
    kv = calloc((size_t)ncores + 1, sizeof(opal_value_t));
    if (NULL == kv) {
        goto cleanup;
    }
    nkv = ncores + 1;
    for (i = 0; i < nkv; i++) {
        OBJ_CONSTRUCT(&kv[i], opal_value_t);
    }

    /* load the sample time at the start */
    kv[0].key = strdup("ctime");
    kv[0].type = OPAL_STRING;
    kv[0].data.string = strdup((NULL == sampletime) ? "" : sampletime);

    for (i = 0; i < ncores; i++) {
        asprintf(&kv[i+1].key, "core%d", i);
        kv[i+1].type = OPAL_FLOAT;
        n = 1;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        kv[i+1].data.fval = fval;
    }

    /* store it */
    if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, ncores+1))) {
        /* don't bark about it - just quietly disable the log */
        log_enabled = false;
    }

cleanup:
    /* cleanup the xfr storage */
    if (NULL != kv) {
        for (i = 0; i < nkv; i++) {
            OBJ_DESTRUCT(&kv[i]);
        }
        free(kv);
    }
    free(sampletime);
    free(hostname);
}

35
orte/mca/sensor/coretemp/sensor_coretemp.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* CORETEMP resource manager sensor
*/
#ifndef ORTE_SENSOR_CORETEMP_H
#define ORTE_SENSOR_CORETEMP_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* Component descriptor for the coretemp sensor */
typedef struct {
/* generic sensor component part; must stay the first member so the
 * structure can be addressed through the base type
 */
orte_sensor_base_component_t super;
/* storage for the component's "test" MCA variable */
bool test;
} orte_sensor_coretemp_component_t;
ORTE_MODULE_DECLSPEC extern orte_sensor_coretemp_component_t mca_sensor_coretemp_component;
extern orte_sensor_base_module_t orte_sensor_coretemp_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,90 @@
/*
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_var.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_coretemp.h"
/*
 * Local functions
 */
static int orte_sensor_coretemp_open(void);
static int orte_sensor_coretemp_close(void);
static int orte_sensor_coretemp_query(mca_base_module_t **module, int *priority);
static int coretemp_component_register(void);
/*
 * Component descriptor: names the component "coretemp", stamps it with
 * the ORTE version and wires up the open/close/query/register callbacks.
 * The "test" member is not initialized here; it is set when the
 * register callback runs.
 */
orte_sensor_coretemp_component_t mca_sensor_coretemp_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"coretemp", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_coretemp_open, /* component open */
orte_sensor_coretemp_close, /* component close */
orte_sensor_coretemp_query, /* component query */
coretemp_component_register /* component variable registration */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
 * Component open: there is nothing to set up, so this always succeeds.
 */
static int orte_sensor_coretemp_open(void)
{
return ORTE_SUCCESS;
}
/**
 * Component query: always offers the coretemp module.
 *
 * The component has to be present even on a process that does no
 * sampling of its own, because it is also what logs results received
 * from elsewhere. No runtime probe is made here; the build-time
 * configure check for a readable coretemp device is relied upon.
 *
 * @param module    receives the coretemp module
 * @param priority  receives 50, which places this component ahead of
 *                  heartbeat
 * @return ORTE_SUCCESS
 */
static int orte_sensor_coretemp_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_coretemp_module;
    *priority = 50;  /* ahead of heartbeat */

    return ORTE_SUCCESS;
}
/**
 * Component close: nothing was acquired in open, so there is nothing
 * to release and this always succeeds.
 */
static int orte_sensor_coretemp_close(void)
{
return ORTE_SUCCESS;
}
/**
 * Register the component's MCA variables.
 *
 * A single read-only boolean, "test", is registered with
 * mca_sensor_coretemp_component.test as its storage; the storage is
 * set to false first so that is the default. The result of the
 * registration call is deliberately ignored.
 *
 * @return ORTE_SUCCESS
 */
static int coretemp_component_register(void)
{
    mca_sensor_coretemp_component.test = false;

    (void) mca_base_component_var_register(&mca_sensor_coretemp_component.super.base_version,
                                           "test",
                                           "Generate and pass test vector",
                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &mca_sensor_coretemp_component.test);

    return ORTE_SUCCESS;
}

41
orte/mca/sensor/sigar/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
#
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shipped with the distribution and installed with the package data
dist_pkgdata_DATA = help-orte-sensor-sigar.txt
# Source files that make up the sigar sensor component
sources = \
sensor_sigar.c \
sensor_sigar.h \
sensor_sigar_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_sigar_DSO
component_noinst =
component_install = mca_sensor_sigar.la
else
component_noinst = libmca_sensor_sigar.la
component_install =
endif
# DSO build: installed loadable module. The sensor_sigar_* flag variables
# are substituted by this component's configure.m4; -lm is added for the
# math library.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_sigar_la_CPPFLAGS = $(sensor_sigar_CPPFLAGS)
mca_sensor_sigar_la_SOURCES = $(sources)
mca_sensor_sigar_la_LDFLAGS = -module -avoid-version $(sensor_sigar_LDFLAGS)
mca_sensor_sigar_la_LIBADD = $(sensor_sigar_LIBS) -lm
# Static build: library that is not installed, built with the same flags
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_sigar_la_SOURCES =$(sources)
libmca_sensor_sigar_la_CPPFLAGS = $(sensor_sigar_CPPFLAGS)
libmca_sensor_sigar_la_LDFLAGS = -module -avoid-version $(sensor_sigar_LDFLAGS)
libmca_sensor_sigar_la_LIBADD = $(sensor_sigar_LIBS) -lm

59
orte/mca/sensor/sigar/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,59 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl
# MCA_orte_sensor_sigar_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# The component is considered only when --with-sigar is given. The library
# name searched for depends on whether the host matches the Linux patterns
# below and /proc/cpuinfo is readable. A value other than yes given to
# --with-sigar is used as the directory to search in.
AC_DEFUN([MCA_orte_sensor_sigar_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/sigar/Makefile])
AC_ARG_WITH([sigar],
[AC_HELP_STRING([--with-sigar],
[Build sigar support (default: no)])],
[], with_sigar=no)
# do not build if support not requested
AS_IF([test "$with_sigar" != "no"],
[case "${host}" in
i?86-*linux*|x86_64*linux*|ia64-*linux*|powerpc-*linux*|powerpc64-*linux*|sparc*-*linux*)
AS_IF([test -r "/proc/cpuinfo"],
[sensor_linux_happy="yes"],
[sensor_linux_happy="no"])
;;
*)
sensor_linux_happy="no"
;;
esac
AS_IF([test "$sensor_linux_happy" = "yes"],
[libname="sigar"], [libname="sigar-universal-macosx"])
AS_IF([test ! -z "$with_sigar" -a "$with_sigar" != "yes"],
[orte_check_sigar_dir="$with_sigar"])
OMPI_CHECK_PACKAGE([sensor_sigar],
[sigar.h],
[$libname],
[sigar_proc_cpu_get],
[],
[$orte_check_sigar_dir],
[],
[$1],
[AC_MSG_WARN([SIGAR SENSOR SUPPORT REQUESTED])
AC_MSG_WARN([BUT REQUIRED LIBRARY OR HEADER NOT FOUND])
AC_MSG_ERROR([CANNOT CONTINUE])
$2])],
[$2])
dnl NOTE review: the value passed below is the text of a shell command and
dnl not its result so the C symbol ends up defined on every platform and a
dnl preprocessor check for its mere existence cannot tell Linux apart from
dnl other systems. The shell variable is also unset when sigar support was
dnl not requested. Confirm the intent before relying on this symbol.
AC_DEFINE_UNQUOTED(ORTE_SIGAR_LINUX, [test "$sensor_linux_happy" = "yes"],
[Which name to use for the sigar library on this OS])
AC_SUBST(sensor_sigar_CPPFLAGS)
AC_SUBST(sensor_sigar_LDFLAGS)
AC_SUBST(sensor_sigar_LIBS)
])dnl

Просмотреть файл

@ -0,0 +1,20 @@
# -*- text -*-
#
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

959
orte/mca/sensor/sigar/sensor_sigar.c Обычный файл
Просмотреть файл

@ -0,0 +1,959 @@
/*
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include <math.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#ifdef ORTE_SIGAR_LINUX
#include <sigar.h>
#else
#include <libsigar-universal-macosx>
#endif
#include "opal_stdint.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_ring_buffer.h"
#include "opal/dss/dss.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "opal/mca/event/event.h"
#include "opal/mca/db/db.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/orted/orted.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_sigar.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
static void sigar_sample(void);
static void sigar_log(opal_buffer_t *buf);
/* instantiate the module: the table through which this component's
 * entry points are exported; the initializers are positional, so their
 * order must match the field order of orte_sensor_base_module_t
 */
orte_sensor_base_module_t orte_sensor_sigar_module = {
init,
finalize,
start,
stop,
sigar_sample,
sigar_log
};
/* define some local classes */
/* One network interface being monitored; the four counters hold the
 * totals seen at the previous sample so the next one can report the
 * change
 */
typedef struct {
opal_list_item_t super;
/* interface name as reported by sigar */
char *interface;
uint64_t rx_packets;
uint64_t rx_bytes;
uint64_t tx_packets;
uint64_t tx_bytes;
} sensor_sigar_interface_t;
/* Constructor: no interface name yet and all traffic counters at zero */
static void sit_cons(sensor_sigar_interface_t *sit)
{
    sit->interface = NULL;
    sit->rx_packets = sit->rx_bytes = 0;
    sit->tx_packets = sit->tx_bytes = 0;
}
/* Destructor: release the interface name owned by the entry */
static void sit_dest(sensor_sigar_interface_t *sit)
{
    /* free() is a no-op on NULL, so an unset name needs no guard */
    free(sit->interface);
}
/* register sensor_sigar_interface_t as a list-item class with the
 * constructor and destructor defined above
 */
OBJ_CLASS_INSTANCE(sensor_sigar_interface_t,
opal_list_item_t,
sit_cons, sit_dest);
/* One file system being monitored; the four counters hold the totals
 * seen at the previous sample so the next one can report the change
 */
typedef struct {
opal_list_item_t super;
/* mount point (sigar dir_name) used to query usage */
char *mount_pt;
uint64_t reads;
uint64_t writes;
uint64_t read_bytes;
uint64_t write_bytes;
} sensor_sigar_disks_t;
/* Constructor: no mount point yet and all I/O counters at zero */
static void dit_cons(sensor_sigar_disks_t *dit)
{
    dit->mount_pt = NULL;
    dit->reads = dit->writes = 0;
    dit->read_bytes = dit->write_bytes = 0;
}
/* Destructor: release the mount point string owned by the entry */
static void dit_dest(sensor_sigar_disks_t *dit)
{
    /* free() is a no-op on NULL, so an unset mount point needs no guard */
    free(dit->mount_pt);
}
/* register sensor_sigar_disks_t as a list-item class with the
 * constructor and destructor defined above
 */
OBJ_CLASS_INSTANCE(sensor_sigar_disks_t,
opal_list_item_t,
dit_cons, dit_dest);
/* handle returned by sigar_open() in init() */
static sigar_t *sigar;
/* monitored file systems (sensor_sigar_disks_t entries) */
static opal_list_t fslist;
/* monitored network interfaces (sensor_sigar_interface_t entries) */
static opal_list_t netlist;
/* time of the last completed sample; stays 0 until the first one */
static time_t last_sample = 0;
/* CPU counters recorded at the previous sample */
static struct cpu_data_t {
uint64_t user;
uint64_t nice;
uint64_t sys;
uint64_t idle;
uint64_t wait;
uint64_t total;
} pcpu;
/* baseline subtracted from the swap paging counters in sigar_sample() */
static struct swap_data_t {
uint64_t page_in;
uint64_t page_out;
} pswap;
/* cleared to stop further logging attempts */
static bool log_enabled = true;
/* canned sample sent instead of real data when the component's test
 * flag is set
 */
static opal_buffer_t test_vector;
static uint64_t metric_diff_calc(sigar_uint64_t newval, uint64_t oldval,
const char *name_for_log,
const char* value_name_for_log);
static void generate_test_vector(opal_buffer_t *v);
/*
 * Initialize the sigar sensor module.
 *
 * In test mode only the canned test vector is generated. Otherwise the
 * previous-sample counters are zeroed, a sigar handle is opened, and
 * the lists of file systems (local disk and network types only) and
 * network interfaces to monitor are built.
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR if sigar cannot be opened or
 *         either list cannot be obtained. On failure the sigar handle
 *         is closed and reset to NULL so it is neither leaked nor
 *         closed a second time by finalize().
 */
static int init(void)
{
    sigar_file_system_list_t sigar_fslist;
    sigar_net_interface_list_t sigar_netlist;
    sensor_sigar_disks_t *dit;
    sensor_sigar_interface_t *sit;
    unsigned int i;

    if (mca_sensor_sigar_component.test) {
        /* generate test vector */
        OBJ_CONSTRUCT(&test_vector, opal_buffer_t);
        generate_test_vector(&test_vector);
        return ORTE_SUCCESS;
    }

    /* setup the globals */
    OBJ_CONSTRUCT(&fslist, opal_list_t);
    OBJ_CONSTRUCT(&netlist, opal_list_t);
    pcpu.user = 0;
    pcpu.nice = 0;
    pcpu.sys = 0;
    pcpu.idle = 0;
    pcpu.wait = 0;
    pcpu.total = 0;
    pswap.page_in = 0;
    pswap.page_out = 0;

    /* initialize sigar */
    if (0 != sigar_open(&sigar)) {
        opal_output(0, "%s: sigar_open failed on node %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    orte_process_info.nodename);
        /* do not leave a possibly invalid handle behind */
        sigar = NULL;
        return ORTE_ERROR;
    }

    /* load the disk list */
    if (0 != sigar_file_system_list_get(sigar, &sigar_fslist)) {
        goto fail;
    }
    for (i = 0; i < sigar_fslist.number; i++) {
        if (SIGAR_FSTYPE_LOCAL_DISK != sigar_fslist.data[i].type &&
            SIGAR_FSTYPE_NETWORK != sigar_fslist.data[i].type) {
            continue;
        }
        dit = OBJ_NEW(sensor_sigar_disks_t);
        if (NULL == dit) {
            continue;
        }
        dit->mount_pt = strdup(sigar_fslist.data[i].dir_name);
        if (NULL == dit->mount_pt) {
            /* an entry without a mount point cannot be queried later */
            OBJ_RELEASE(dit);
            continue;
        }
        opal_list_append(&fslist, &dit->super);
    }
    sigar_file_system_list_destroy(sigar, &sigar_fslist);

    /* load the list of network interfaces */
    if (0 != sigar_net_interface_list_get(sigar, &sigar_netlist)) {
        goto fail;
    }
    for (i = 0; i < sigar_netlist.number; i++) {
        sit = OBJ_NEW(sensor_sigar_interface_t);
        if (NULL == sit) {
            continue;
        }
        sit->interface = strdup(sigar_netlist.data[i]);
        if (NULL == sit->interface) {
            /* an entry without a name cannot be queried later */
            OBJ_RELEASE(sit);
            continue;
        }
        opal_list_append(&netlist, &sit->super);
    }
    sigar_net_interface_list_destroy(sigar, &sigar_netlist);

    return ORTE_SUCCESS;

fail:
    /* the lists are left constructed; only the handle is released here */
    sigar_close(sigar);
    sigar = NULL;
    return ORTE_ERROR;
}
/*
 * Shut the module down.
 *
 * In test mode only the test vector exists and is destructed.
 * Otherwise the sigar handle is closed (if there is one) and both
 * tracking lists are emptied, releasing every entry, before being
 * destructed.
 */
static void finalize(void)
{
    opal_list_item_t *entry;

    if (mca_sensor_sigar_component.test) {
        /* destruct test vector */
        OBJ_DESTRUCT(&test_vector);
        return;
    }

    if (NULL != sigar) {
        sigar_close(sigar);
    }

    for (entry = opal_list_remove_first(&fslist);
         NULL != entry;
         entry = opal_list_remove_first(&fslist)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&fslist);

    for (entry = opal_list_remove_first(&netlist);
         NULL != entry;
         entry = opal_list_remove_first(&netlist)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&netlist);
}
/*
 * Start monitoring of local processes. This module keeps no per-job
 * state, so the hook has nothing to do.
 */
static void start(orte_jobid_t jobid)
{
    (void)jobid;
}
/*
 * Stop monitoring for a job. Nothing was set up by start(), so
 * nothing is undone here.
 */
static void stop(orte_jobid_t jobid)
{
    (void)jobid;
}
/*
 * Pack one value into the local sample buffer of sigar_sample(). On
 * failure the error is logged at the calling line and control jumps to
 * the function's cleanup label. Relies on the locals "rc" and "data".
 */
#define SIGAR_PACK(src, type)                                                \
    do {                                                                     \
        if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, (src), 1, (type)))) { \
            ORTE_ERROR_LOG(rc);                                              \
            goto cleanup;                                                    \
        }                                                                    \
    } while (0)

/*
 * Collect one sample of node-level resource usage through sigar and
 * hand it to the sensor base for transmission.
 *
 * In test mode the pre-built test vector is sent instead. Otherwise
 * the sample buffer holds, in this order: the string "sigar", the node
 * name, a timestamp string, four memory values, swap total and used,
 * swap page-in and page-out changes, three CPU percentages
 * (user+nice, sys+wait, idle), three load averages, four disk rates
 * and four network rates. Rates are changes since the previous sample
 * divided by the elapsed seconds, so the buffer is only forwarded once
 * a previous sample exists.
 *
 * Changes from the original behavior:
 *  - the swap baseline is now updated after every sample, so page-in
 *    and page-out are reported as changes like every other counter
 *    instead of as running totals;
 *  - a zero CPU tick difference yields 0% instead of dividing by zero,
 *    and the elapsed time is clamped to at least one second so rates
 *    never divide by zero;
 *  - no heap strings are used for the name and timestamp, removing the
 *    leaks on the pack error paths.
 */
static void sigar_sample(void)
{
    sigar_mem_t mem;
    sigar_swap_t swap;
    sigar_cpu_t cpu;
    sigar_loadavg_t loadavg;
    sigar_disk_usage_t tdisk;
    sensor_sigar_disks_t *dit;
    sigar_file_system_usage_t fsusage;
    sensor_sigar_interface_t *sit;
    sigar_net_interface_stat_t tnet, ifc;
    uint64_t reads, writes, read_bytes, write_bytes;
    uint64_t rxpkts, txpkts, rxbytes, txbytes;
    uint64_t ui64;
    opal_buffer_t data, *bptr;
    int rc;
    time_t now;
    struct tm *tm_now;
    double cpu_diff, tdiff;
    float user_pct, sys_pct, idle_pct;
    float tmp;
    char comp_name[] = "sigar";
    char *ctmp;
    char time_str[40];
    char *timestamp_str;

    if (mca_sensor_sigar_component.test) {
        /* just send the test vector */
        bptr = &test_vector;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
            ORTE_ERROR_LOG(rc);
        }
        return;
    }

    /* prep the buffer to collect the data */
    OBJ_CONSTRUCT(&data, opal_buffer_t);

    /* pack our name - the packer wants the address of a char pointer */
    ctmp = comp_name;
    SIGAR_PACK(&ctmp, OPAL_STRING);

    /* include our node name */
    SIGAR_PACK(&orte_process_info.nodename, OPAL_STRING);

    /* get the sample time */
    now = time(NULL);
    tdiff = difftime(now, last_sample);
    if (tdiff < 1.0) {
        /* time_t ticks in whole seconds: two samples within the same
         * second would otherwise divide the counters by zero
         */
        tdiff = 1.0;
    }
    /* pass the time along as a simple string; an empty string is sent
     * if the time cannot be formatted
     */
    tm_now = localtime(&now);
    if (NULL == tm_now ||
        0 == strftime(time_str, sizeof(time_str), "%F %T%z", tm_now)) {
        time_str[0] = '\0';
    }
    timestamp_str = time_str;
    SIGAR_PACK(&timestamp_str, OPAL_STRING);

    /* get the memory usage for this node */
    memset(&mem, 0, sizeof(mem));
    sigar_mem_get(sigar, &mem);
    opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                        "mem total: %" PRIu64 " used: %" PRIu64 " actual used: %" PRIu64 " actual free: %" PRIu64 "",
                        mem.total, mem.used, mem.actual_used, mem.actual_free);
    /* add it to the data */
    SIGAR_PACK(&mem.total, OPAL_UINT64);
    SIGAR_PACK(&mem.used, OPAL_UINT64);
    SIGAR_PACK(&mem.actual_used, OPAL_UINT64);
    SIGAR_PACK(&mem.actual_free, OPAL_UINT64);

    /* get swap data */
    memset(&swap, 0, sizeof(swap));
    sigar_swap_get(sigar, &swap);
    opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                        "swap total: %" PRIu64 " used: %" PRIu64 "page_in: %" PRIu64 " page_out: %" PRIu64 "\n",
                        swap.total, swap.used, swap.page_in, swap.page_out);
    /* compute the values we actually want and add them to the data */
    SIGAR_PACK(&swap.total, OPAL_UINT64);
    SIGAR_PACK(&swap.used, OPAL_UINT64);
    ui64 = swap.page_in - pswap.page_in;
    SIGAR_PACK(&ui64, OPAL_UINT64);
    ui64 = swap.page_out - pswap.page_out;
    SIGAR_PACK(&ui64, OPAL_UINT64);
    /* update the values so the next sample reports a change */
    pswap.page_in = swap.page_in;
    pswap.page_out = swap.page_out;

    /* get the cpu usage */
    memset(&cpu, 0, sizeof(cpu));
    sigar_cpu_get(sigar, &cpu);
    opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                        "cpu user: %" PRIu64 " sys: %" PRIu64 " idle: %" PRIu64 " wait: %" PRIu64 " nice: %" PRIu64 " total: %" PRIu64 "",
                        cpu.user, cpu.sys, cpu.idle, cpu.wait, cpu.nice, cpu.total);
    /* compute the values we actually want and add them to the data */
    cpu_diff = (double)(cpu.total - pcpu.total);
    if (0.0 < cpu_diff) {
        user_pct = (float)((cpu.user - pcpu.user) * 100.0 / cpu_diff) +
                   (float)((cpu.nice - pcpu.nice) * 100.0 / cpu_diff);
        sys_pct = (float)((cpu.sys - pcpu.sys) * 100.0 / cpu_diff) +
                  (float)((cpu.wait - pcpu.wait) * 100.0 / cpu_diff);
        idle_pct = (float)((cpu.idle - pcpu.idle) * 100.0 / cpu_diff);
    } else {
        /* no ticks elapsed since the last sample: nothing to apportion */
        user_pct = 0.0f;
        sys_pct = 0.0f;
        idle_pct = 0.0f;
    }
    SIGAR_PACK(&user_pct, OPAL_FLOAT);
    SIGAR_PACK(&sys_pct, OPAL_FLOAT);
    SIGAR_PACK(&idle_pct, OPAL_FLOAT);
    /* update the values */
    pcpu.user = cpu.user;
    pcpu.nice = cpu.nice;
    pcpu.sys = cpu.sys;
    pcpu.wait = cpu.wait;
    pcpu.idle = cpu.idle;
    pcpu.total = cpu.total;

    /* get load average data */
    memset(&loadavg, 0, sizeof(loadavg));
    sigar_loadavg_get(sigar, &loadavg);
    opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                        "load_avg: %e %e %e",
                        loadavg.loadavg[0], loadavg.loadavg[1], loadavg.loadavg[2]);
    /* add them to the data */
    tmp = (float)loadavg.loadavg[0];
    SIGAR_PACK(&tmp, OPAL_FLOAT);
    tmp = (float)loadavg.loadavg[1];
    SIGAR_PACK(&tmp, OPAL_FLOAT);
    tmp = (float)loadavg.loadavg[2];
    SIGAR_PACK(&tmp, OPAL_FLOAT);

    /* get disk usage data */
    memset(&tdisk, 0, sizeof(tdisk));
    OPAL_LIST_FOREACH(dit, &fslist, sensor_sigar_disks_t) {
        if (0 != sigar_file_system_usage_get(sigar, dit->mount_pt, &fsusage)) {
            opal_output(0, "%s Failed to get usage data for filesystem %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dit->mount_pt);
            continue;
        }
        opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                            "FileSystem: %s Reads: %" PRIu64 " Writes: %" PRIu64 " ReadBytes: %" PRIu64 " WriteBytes: %" PRIu64 "",
                            dit->mount_pt, fsusage.disk.reads, fsusage.disk.writes, fsusage.disk.read_bytes, fsusage.disk.write_bytes);
        /* compute the number of reads since last reading */
        reads = metric_diff_calc(fsusage.disk.reads, dit->reads, dit->mount_pt, "disk reads");
        dit->reads = fsusage.disk.reads;  /* old = new */
        /* compute the number of writes since last reading */
        writes = metric_diff_calc(fsusage.disk.writes, dit->writes, dit->mount_pt, "disk writes");
        dit->writes = fsusage.disk.writes;  /* old = new */
        /* compute the number of read bytes since last reading */
        read_bytes = metric_diff_calc(fsusage.disk.read_bytes, dit->read_bytes, dit->mount_pt, "disk read bytes");
        dit->read_bytes = fsusage.disk.read_bytes;  /* old = new */
        /* compute the number of bytes written since last reading */
        write_bytes = metric_diff_calc(fsusage.disk.write_bytes, dit->write_bytes, dit->mount_pt, "disk write bytes");
        dit->write_bytes = fsusage.disk.write_bytes;  /* old = new */
        opal_output_verbose(4, orte_sensor_base_framework.framework_output,
                            "FileSystem: %s ReadsChange: %" PRIu64 " WritesChange: %" PRIu64 " ReadBytesChange: %" PRIu64 " WriteBytesChange: %" PRIu64 "",
                            dit->mount_pt, reads, writes, read_bytes, write_bytes);
        /* accumulate the values */
        tdisk.reads += reads;
        tdisk.writes += writes;
        tdisk.read_bytes += read_bytes;
        tdisk.write_bytes += write_bytes;
    }
    opal_output_verbose(4, orte_sensor_base_framework.framework_output,
                        "Totals: ReadsChange: %" PRIu64 " WritesChange: %" PRIu64 " ReadBytesChange: %" PRIu64 " WriteBytesChange: %" PRIu64 "",
                        tdisk.reads, tdisk.writes, tdisk.read_bytes, tdisk.write_bytes);
    /* compute the values we actually want and add them to the data */
    reads = (uint64_t)ceil((double)tdisk.reads/tdiff);
    SIGAR_PACK(&reads, OPAL_UINT64);
    writes = (uint64_t)ceil((double)tdisk.writes/tdiff);
    SIGAR_PACK(&writes, OPAL_UINT64);
    read_bytes = (uint64_t)ceil((double)tdisk.read_bytes/tdiff);
    SIGAR_PACK(&read_bytes, OPAL_UINT64);
    write_bytes = (uint64_t)ceil((double)tdisk.write_bytes/tdiff);
    SIGAR_PACK(&write_bytes, OPAL_UINT64);

    /* get network usage data */
    memset(&tnet, 0, sizeof(tnet));
    OPAL_LIST_FOREACH(sit, &netlist, sensor_sigar_interface_t) {
        memset(&ifc, 0, sizeof(ifc));
        if (0 != sigar_net_interface_stat_get(sigar, sit->interface, &ifc)) {
            opal_output(0, "%s Failed to get usage data for interface %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), sit->interface);
            continue;
        }
        opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                            "Interface: %s RecvdPackets: %" PRIu64 " RecvdBytes: %" PRIu64 " TransPackets: %" PRIu64 " TransBytes: %" PRIu64 "",
                            sit->interface, ifc.rx_packets, ifc.rx_bytes, ifc.tx_packets, ifc.tx_bytes);
        /* compute the number of recvd packets since last reading */
        rxpkts = metric_diff_calc(ifc.rx_packets, sit->rx_packets, sit->interface, "rx packets");
        sit->rx_packets = ifc.rx_packets;  /* old = new */
        /* compute the number of transmitted packets since last reading */
        txpkts = metric_diff_calc(ifc.tx_packets, sit->tx_packets, sit->interface, "tx packets");
        sit->tx_packets = ifc.tx_packets;  /* old = new */
        /* compute the number of recvd bytes since last reading */
        rxbytes = metric_diff_calc(ifc.rx_bytes, sit->rx_bytes, sit->interface, "rx bytes");
        sit->rx_bytes = ifc.rx_bytes;  /* old = new */
        /* compute the number of transmitted bytes since last reading */
        txbytes = metric_diff_calc(ifc.tx_bytes, sit->tx_bytes, sit->interface, "tx bytes");
        sit->tx_bytes = ifc.tx_bytes;  /* old = new */
        opal_output_verbose(4, orte_sensor_base_framework.framework_output,
                            "Interface: %s RxPkts: %" PRIu64 " TxPkts: %" PRIu64 " RxBytes: %" PRIu64 " TxBytes: %" PRIu64 "",
                            sit->interface, rxpkts, txpkts, rxbytes, txbytes);
        /* accumulate the values */
        tnet.rx_packets += rxpkts;
        tnet.rx_bytes += rxbytes;
        tnet.tx_packets += txpkts;
        tnet.tx_bytes += txbytes;
    }
    opal_output_verbose(4, orte_sensor_base_framework.framework_output,
                        "Totals: RxPkts: %" PRIu64 " TxPkts: %" PRIu64 " RxBytes: %" PRIu64 " TxBytes: %" PRIu64 "",
                        tnet.rx_packets, tnet.tx_packets, tnet.rx_bytes, tnet.tx_bytes);
    /* compute the values we actually want and add them to the data */
    rxpkts = (uint64_t)ceil((double)tnet.rx_packets/tdiff);
    SIGAR_PACK(&rxpkts, OPAL_UINT64);
    txpkts = (uint64_t)ceil((double)tnet.tx_packets/tdiff);
    SIGAR_PACK(&txpkts, OPAL_UINT64);
    rxbytes = (uint64_t)ceil((double)tnet.rx_bytes/tdiff);
    SIGAR_PACK(&rxbytes, OPAL_UINT64);
    txbytes = (uint64_t)ceil((double)tnet.tx_bytes/tdiff);
    SIGAR_PACK(&txbytes, OPAL_UINT64);

    /* xfer the data for transmission - need at least one prior sample before doing so */
    if (0 < last_sample) {
        bptr = &data;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }
    /* only a sample that completed becomes the new time baseline */
    last_sample = now;

cleanup:
    OBJ_DESTRUCT(&data);
}
#undef SIGAR_PACK
/*
 * Record one sigar sample in the database log.
 *
 * The sample buffer is read in this order: hostname (string), sample time
 * (string), then 22 numeric values whose keys and types are listed in the
 * `metrics` table below.  Each value is copied into an opal_value_t and the
 * whole set of 24 entries is handed to opal_db.add_log() under the "sigar"
 * table name.
 *
 * NOTE(review): the hostname is the first item unpacked here, so any leading
 * component-name string in the sample is presumably consumed by the caller -
 * confirm against the sensor base.
 *
 * Does nothing if logging has been disabled.  If add_log() fails, logging is
 * quietly disabled for all later calls.  Unpack errors are reported through
 * ORTE_ERROR_LOG and abandon the sample.
 *
 * @param sample  buffer holding the packed sample; consumed by unpacking
 */
static void sigar_log(opal_buffer_t *sample)
{
    /* numeric fields, in the exact order they appear in the sample */
    static const struct {
        const char *key;   /* key the value is stored under */
        bool is_float;     /* true: OPAL_FLOAT, false: OPAL_UINT64 */
    } metrics[] = {
        /* memory and swap */
        { "mem_total",       false },
        { "mem_used",        false },
        { "mem_actual_used", false },
        { "mem_actual_free", false },
        { "swap_total",      false },
        { "swap_used",       false },
        { "swap_page_in",    false },
        { "swap_page_out",   false },
        /* cpu usage */
        { "cpu_user",        true  },
        { "cpu_sys",         true  },
        { "cpu_idle",        true  },
        /* three load values, in packed order */
        { "load0",           true  },
        { "load1",           true  },
        { "load2",           true  },
        /* disk rates: read ops, write ops, read bytes, write bytes */
        { "disk_ro_rate",    false },
        { "disk_wo_rate",    false },
        { "disk_rb_rate",    false },
        { "disk_wb_rate",    false },
        /* network rates: rx packets, tx packets, rx bytes, tx bytes */
        { "net_rp_rate",     false },
        { "net_wp_rate",     false },
        { "net_rb_rate",     false },
        { "net_wb_rate",     false }
    };
    enum {
        NUM_METRICS = (int)(sizeof(metrics) / sizeof(metrics[0])),
        /* the two leading string entries (ctime, hostname) plus the metrics */
        NUM_KV = NUM_METRICS + 2
    };

    opal_value_t kv[NUM_KV];
    char *hostname = NULL;
    char *sampletime = NULL;
    uint64_t uint64;
    float fval;
    int32_t n;
    int rc;
    int i;
    int m;

    if (!log_enabled) {
        return;
    }

    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &hostname, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s Received log from host %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == hostname) ? "NULL" : hostname);

    /* a sample without a hostname cannot be duplicated into the record
     * (strdup(NULL) is undefined), so there is nothing to store */
    if (NULL == hostname) {
        return;
    }

    /* prep the xfr storage */
    for (i = 0; i < NUM_KV; i++) {
        OBJ_CONSTRUCT(&kv[i], opal_value_t);
    }
    /* From here on every exit goes through `cleanup`, which destructs all
     * entries exactly as the success path does.
     * NOTE(review): assumes destructing a constructed but unfilled
     * opal_value_t is safe - confirm against the opal_value_t destructor. */

    /* unpack the incoming data and xfer it for storage */
    i = 0;

    /* sample time */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &sampletime, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (NULL == sampletime) {
        /* same reasoning as for the hostname: nothing valid to copy */
        goto cleanup;
    }
    kv[i].key = strdup("ctime");
    kv[i].type = OPAL_STRING;
    kv[i].data.string = strdup(sampletime);
    i++;

    /* hostname */
    kv[i].key = strdup("hostname");
    kv[i].type = OPAL_STRING;
    kv[i].data.string = strdup(hostname);
    i++;

    /* numeric values, in table order */
    for (m = 0; m < NUM_METRICS; m++, i++) {
        n = 1;
        if (metrics[m].is_float) {
            rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT);
        } else {
            rc = opal_dss.unpack(sample, &uint64, &n, OPAL_UINT64);
        }
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        kv[i].key = strdup(metrics[m].key);
        if (metrics[m].is_float) {
            kv[i].type = OPAL_FLOAT;
            kv[i].data.fval = fval;
        } else {
            kv[i].type = OPAL_UINT64;
            kv[i].data.uint64 = uint64;
        }
    }

    /* store it */
    if (ORTE_SUCCESS != (rc = opal_db.add_log("sigar", kv, NUM_KV))) {
        /* don't bark about it - just quietly disable the log */
        log_enabled = false;
    }

cleanup:
    /* cleanup the xfr storage */
    for (i = 0; i < NUM_KV; i++) {
        OBJ_DESTRUCT(&kv[i]);
    }
    /* both strings were allocated by unpack; free(NULL) is a no-op */
    free(sampletime);
    free(hostname);
}
/*
 * Return how much a counter advanced between two readings.
 *
 * When the new reading is at least the old one, the result is their
 * difference.  When the new reading is smaller, the counter is taken to have
 * been reset: a verbose message naming the metric is emitted and the new
 * reading itself is returned as the difference.
 *
 * @param newval              current reading
 * @param oldval              previous reading
 * @param name_for_log        name of the thing being measured (log text only)
 * @param value_name_for_log  name of the counter (log text only)
 * @return the computed difference
 */
static uint64_t metric_diff_calc(sigar_uint64_t newval, uint64_t oldval,
                                 const char *name_for_log,
                                 const char *value_name_for_log)
{
    /* normal case: the counter moved forward (or stood still) */
    if (newval >= oldval) {
        return newval - oldval;
    }

    /* the counter went backwards - treat it as restarted from zero */
    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s metric_diff_calc: new value %" PRIu64 " is less than old value %" PRIu64
                        " for %s metric %s; assume the value was reset and set diff to new value.",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        newval, oldval, name_for_log, value_name_for_log);
    return newval;
}
/*
 * Fill a buffer with a synthetic sigar sample.
 *
 * Packed in order: the string "sigar", this node's name, the current time as
 * a string (trailing newline removed), eight uint64 values 1..8 (memory and
 * swap fields), six floats 1.0..6.0 (cpu and load fields), and eight more
 * uint64 values 9..16 (disk and network fields).  Return codes from the pack
 * calls are not examined.
 *
 * @param v  buffer that receives the packed values
 */
static void generate_test_vector(opal_buffer_t *v)
{
    char *text;
    uint64_t counter;
    float fcounter;
    time_t stamp;
    int k;

    /* leading component name */
    text = strdup("sigar");
    opal_dss.pack(v, &text, 1, OPAL_STRING);
    free(text);

    /* name of this node */
    opal_dss.pack(v, &orte_process_info.nodename, 1, OPAL_STRING);

    /* current time as a simple string, so each vector is unique */
    stamp = time(NULL);
    text = ctime(&stamp);
    /* strip the trailing newline */
    text[strlen(text)-1] = '\0';
    opal_dss.pack(v, &text, 1, OPAL_STRING);

    /* mem_total, mem_used, mem_actual_used, mem_actual_free,
     * swap total, swap used, swap page in, swap page out: 1..8 */
    counter = 0;
    for (k = 0; k < 8; k++) {
        ++counter;
        opal_dss.pack(v, &counter, 1, OPAL_UINT64);
    }

    /* cpu user, cpu sys, cpu idle, la, la5, la15: 1.0..6.0 */
    for (k = 1; k <= 6; k++) {
        fcounter = (float)k;
        opal_dss.pack(v, &fcounter, 1, OPAL_FLOAT);
    }

    /* reads, writes, read bytes, write bytes,
     * rx packets, tx packets, rx bytes, tx bytes: 9..16 */
    for (k = 0; k < 8; k++) {
        ++counter;
        opal_dss.pack(v, &counter, 1, OPAL_UINT64);
    }
}

35
orte/mca/sensor/sigar/sensor_sigar.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* SIGAR resource manager sensor
*/
#ifndef ORTE_SENSOR_SIGAR_H
#define ORTE_SENSOR_SIGAR_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* Component structure for the sigar sensor: the base sensor component
 * followed by the settings specific to this component. */
typedef struct {
/* base sensor component this type extends */
orte_sensor_base_component_t super;
/* backing store for the "test" MCA variable ("Generate and pass test
 * vector"); set to false before the variable is registered */
bool test;
} orte_sensor_sigar_component_t;
/* the single instance of this component */
ORTE_MODULE_DECLSPEC extern orte_sensor_sigar_component_t mca_sensor_sigar_component;
/* the sensor module that the component's query function hands back */
extern orte_sensor_base_module_t orte_sensor_sigar_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,87 @@
/*
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_var.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_sigar.h"
/*
 * Local functions - the component callbacks wired into
 * mca_sensor_sigar_component below
 */
static int orte_sensor_sigar_open(void);
static int orte_sensor_sigar_close(void);
static int orte_sensor_sigar_query(mca_base_module_t **module, int *priority);
static int sigar_component_register(void);
/*
 * The sigar sensor component instance.  Only the base component part is
 * initialized here; the "test" member is not listed, so it starts out
 * zero-initialized and is set explicitly in sigar_component_register().
 */
orte_sensor_sigar_component_t mca_sensor_sigar_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"sigar", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_sigar_open, /* component open */
orte_sensor_sigar_close, /* component close */
orte_sensor_sigar_query, /* component query */
sigar_component_register /* component variable registration */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
 * Component open function.  There is nothing to set up, so this
 * always returns ORTE_SUCCESS.
 */
static int orte_sensor_sigar_open(void)
{
return ORTE_SUCCESS;
}
/**
 * Component query function: always offers the sigar module.
 *
 * @param module    receives the address of orte_sensor_sigar_module
 * @param priority  receives the fixed priority 50
 * @return ORTE_SUCCESS, unconditionally
 */
static int orte_sensor_sigar_query(mca_base_module_t **module, int *priority)
{
    /* Being built is reason enough to be selected: even when this process
     * takes no samples itself, the module must be present so that results
     * received from elsewhere can be logged.
     */
    *module = (mca_base_module_t *)&orte_sensor_sigar_module;
    *priority = 50;  /* ahead of heartbeat */

    return ORTE_SUCCESS;
}
/**
 * Component close function.  There is nothing to tear down, so this
 * always returns ORTE_SUCCESS.
 */
static int orte_sensor_sigar_close(void)
{
return ORTE_SUCCESS;
}
/**
 * Component register function: declares this component's MCA variables.
 *
 * The only variable is the boolean "test", stored in
 * mca_sensor_sigar_component.test and defaulting to false.  The result of
 * the registration call is deliberately ignored.
 *
 * @return ORTE_SUCCESS, unconditionally
 */
static int sigar_component_register(void)
{
    mca_base_component_t *base;

    base = &mca_sensor_sigar_component.super.base_version;

    /* the default must be in place before the variable is registered */
    mca_sensor_sigar_component.test = false;
    (void) mca_base_component_var_register(base,
                                           "test",
                                           "Generate and pass test vector",
                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &mca_sensor_sigar_component.test);

    return ORTE_SUCCESS;
}