Add two new sensor modules - one to monitor core temperatures, and the other to monitor resource usage using the sigar library
This commit was SVN r30335.
Этот коммит содержится в:
родитель
d2d4eeb2d6
Коммит
9b2066cfba
37
orte/mca/sensor/coretemp/Makefile.am
Обычный файл
37
orte/mca/sensor/coretemp/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-sensor-coretemp.txt
|
||||
|
||||
sources = \
|
||||
sensor_coretemp.c \
|
||||
sensor_coretemp.h \
|
||||
sensor_coretemp_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_coretemp_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_coretemp.la
|
||||
else
|
||||
component_noinst = libmca_sensor_coretemp.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_coretemp_la_SOURCES = $(sources)
|
||||
mca_sensor_coretemp_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_coretemp_la_SOURCES =$(sources)
|
||||
libmca_sensor_coretemp_la_LDFLAGS = -module -avoid-version
|
44
orte/mca/sensor/coretemp/configure.m4
Обычный файл
44
orte/mca/sensor/coretemp/configure.m4
Обычный файл
@ -0,0 +1,44 @@
|
||||
dnl -*- shell-script -*-
|
||||
dnl
|
||||
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
dnl $COPYRIGHT$
|
||||
dnl
|
||||
dnl Additional copyrights may follow
|
||||
dnl
|
||||
dnl $HEADER$
|
||||
dnl
|
||||
|
||||
# MCA_orte_sensor_coretemp_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_coretemp_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/sensor/coretemp/Makefile])
|
||||
|
||||
AC_ARG_WITH([coretemp],
|
||||
[AC_HELP_STRING([--with-coretemp],
|
||||
[Build coretemp support (default: no)])],
|
||||
[], with_coretemp=no)
|
||||
|
||||
# do not build if support not requested
|
||||
AS_IF([test "$with_coretemp" != "no"],
|
||||
[case "${host}" in
|
||||
i?86-*linux*|x86_64*linux*|ia64-*linux*|powerpc-*linux*|powerpc64-*linux*|sparc*-*linux*)
|
||||
AS_IF([test -r "/sys/bus/platform/devices/coretemp.0"],
|
||||
[sensor_coretemp_happy=yes],
|
||||
[AC_MSG_WARN([Core temperature sensing was requested but the required directory])
|
||||
AC_MSG_WARN([was not found. This usually indicates that the \"coretemp\"])
|
||||
AC_MSG_WARN([kernel module is not installed. Please install the module])
|
||||
AC_MSG_WARN([and try again, or remove the core temperature sensing request.])
|
||||
sensor_coretemp_happy=no])
|
||||
;;
|
||||
*)
|
||||
AC_MSG_WARN([Core temperature sensing was requested but is only supported on Linux systems])
|
||||
sensor_coretemp_happy=no
|
||||
;;
|
||||
esac
|
||||
AS_IF([test "$sensor_coretemp_happy" = "yes"],
|
||||
[$1],
|
||||
[AC_MSG_ERROR([Cannot continue])
|
||||
$2])
|
||||
],
|
||||
[$2])
|
||||
])dnl
|
20
orte/mca/sensor/coretemp/help-orte-sensor-coretemp.txt
Обычный файл
20
orte/mca/sensor/coretemp/help-orte-sensor-coretemp.txt
Обычный файл
@ -0,0 +1,20 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the core temperature sensor
|
||||
#
|
||||
[mem-limit-exceeded]
|
||||
A process has exceeded the specified limit on memory usage:
|
||||
|
||||
Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
412
orte/mca/sensor/coretemp/sensor_coretemp.c
Обычный файл
412
orte/mca/sensor/coretemp/sensor_coretemp.c
Обычный файл
@ -0,0 +1,412 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#ifdef HAVE_DIRENT_H
|
||||
#include <dirent.h>
|
||||
#endif /* HAVE_DIRENT_H */
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/os_dirpath.h"
|
||||
#include "opal/mca/db/db.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_coretemp.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
static void coretemp_sample(void);
|
||||
static void coretemp_log(opal_buffer_t *buf);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_coretemp_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop,
|
||||
coretemp_sample,
|
||||
coretemp_log
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *file;
|
||||
int socket;
|
||||
char *label;
|
||||
float critical_temp;
|
||||
float max_temp;
|
||||
} core_tracker_t;
|
||||
static void ctr_con(core_tracker_t *trk)
|
||||
{
|
||||
trk->file = NULL;
|
||||
trk->label = NULL;
|
||||
}
|
||||
static void ctr_des(core_tracker_t *trk)
|
||||
{
|
||||
if (NULL != trk->file) {
|
||||
free(trk->file);
|
||||
}
|
||||
if (NULL != trk->label) {
|
||||
free(trk->label);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(core_tracker_t,
|
||||
opal_list_item_t,
|
||||
ctr_con, ctr_des);
|
||||
|
||||
static bool log_enabled = true;
|
||||
static opal_list_t tracking;
|
||||
|
||||
/*
 * Read one line (at most 1023 characters) from fp.
 *
 * Returns a newly allocated copy of the line with the trailing newline,
 * if there is one, removed.  The caller owns the result and must free()
 * it.  Returns NULL when fp is NULL, at end-of-file, on a read error, or
 * when memory cannot be allocated.
 */
static char *orte_getline(FILE *fp)
{
    char input[1024];
    char *buff;
    size_t len;

    if (NULL == fp || NULL == fgets(input, sizeof(input), fp)) {
        return NULL;
    }

    /* keep everything up to (not including) the newline; a final line
     * that lacks a newline is kept whole instead of losing its last char
     */
    len = strcspn(input, "\n");

    buff = malloc(len + 1);
    if (NULL == buff) {
        return NULL;
    }
    memcpy(buff, input, len);
    buff[len] = '\0';

    return buff;
}
|
||||
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
int ret;
|
||||
DIR *cur_dirp = NULL, *tdir;
|
||||
struct dirent *dir_entry, *entry;
|
||||
char *dirname, *filename, *ptr, *tmp;
|
||||
size_t tlen = strlen("temp");
|
||||
size_t ilen = strlen("_input");
|
||||
FILE *fp;
|
||||
core_tracker_t *trk;
|
||||
int socket;
|
||||
|
||||
OBJ_CONSTRUCT(&tracking, opal_list_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_os_dirpath_access("/sys/bus/platform/devices", 0))) {
|
||||
/* if the directory doesn't exist, or we don't have
|
||||
* access to it, then disqualify us
|
||||
*/
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Open up the base directory so we can get a listing
|
||||
*/
|
||||
if (NULL == (cur_dirp = opendir("/sys/bus/platform/devices"))) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
* For each directory
|
||||
*/
|
||||
socket = 0;
|
||||
while (NULL != (dir_entry = readdir(cur_dirp))) {
|
||||
|
||||
/* look for coretemp directories */
|
||||
if (0 != strncmp(dir_entry->d_name, "coretemp", strlen("coretemp"))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* open that directory */
|
||||
dirname = opal_os_path(false, "/sys/bus/platform/devices", dir_entry->d_name, NULL );
|
||||
if (NULL == (tdir = opendir(dirname))) {
|
||||
continue;
|
||||
}
|
||||
while (NULL != (entry = readdir(tdir))) {
|
||||
/*
|
||||
* Skip the obvious
|
||||
*/
|
||||
if (0 == strncmp(entry->d_name, ".", strlen(".")) ||
|
||||
0 == strncmp(entry->d_name, "..", strlen(".."))) {
|
||||
continue;
|
||||
}
|
||||
if (strlen(entry->d_name) < (tlen+ilen)) {
|
||||
/* cannot be a core temp file */
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* See if this is a core temp file
|
||||
*/
|
||||
if (0 != strncmp(entry->d_name, "temp", strlen("temp"))) {
|
||||
continue;
|
||||
}
|
||||
if (0 != strcmp(entry->d_name + strlen(entry->d_name) - ilen, "_input")) {
|
||||
continue;
|
||||
}
|
||||
/* track the info for this core */
|
||||
trk = OBJ_NEW(core_tracker_t);
|
||||
trk->socket = socket;
|
||||
trk->file = opal_os_path(false, dirname, entry->d_name, NULL);
|
||||
/* take the part up to the first underscore as this will
|
||||
* be used as the start of all the related files
|
||||
*/
|
||||
tmp = strdup(entry->d_name);
|
||||
if (NULL == (ptr = strchr(tmp, '_'))) {
|
||||
/* unrecognized format */
|
||||
free(tmp);
|
||||
OBJ_RELEASE(trk);
|
||||
continue;
|
||||
}
|
||||
*ptr = '\0';
|
||||
/* look for critical, max, and label info */
|
||||
asprintf(&filename, "%s/%s_%s", dirname, tmp, "label");
|
||||
fp = fopen(filename, "r");
|
||||
trk->label = orte_getline(fp);
|
||||
fclose(fp);
|
||||
free(filename);
|
||||
|
||||
asprintf(&filename, "%s/%s_%s", dirname, tmp, "crit");
|
||||
fp = fopen(filename, "r");
|
||||
ptr = orte_getline(fp);
|
||||
fclose(fp);
|
||||
trk->critical_temp = strtol(ptr, NULL, 10)/100.0;
|
||||
free(ptr);
|
||||
free(filename);
|
||||
|
||||
asprintf(&filename, "%s/%s_%s", dirname, tmp, "max");
|
||||
fp = fopen(filename, "r");
|
||||
ptr = orte_getline(fp);
|
||||
fclose(fp);
|
||||
trk->max_temp = strtol(ptr, NULL, 10)/100.0;
|
||||
free(ptr);
|
||||
free(filename);
|
||||
|
||||
/* add to our list */
|
||||
opal_list_append(&tracking, &trk->super);
|
||||
/* cleanup */
|
||||
free(tmp);
|
||||
}
|
||||
closedir(tdir);
|
||||
socket++;
|
||||
}
|
||||
closedir(cur_dirp);
|
||||
|
||||
if (0 == opal_list_get_size(&tracking)) {
|
||||
/* nothing to read */
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
OPAL_LIST_DESTRUCT(&tracking);
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local temps
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Stop monitoring for the given job.
 * This module has nothing to do here; the function is a no-op.
 */
static void stop(orte_jobid_t jobid)
{
}
|
||||
|
||||
static void coretemp_sample(void)
|
||||
{
|
||||
int ret;
|
||||
core_tracker_t *trk;
|
||||
FILE *fp;
|
||||
char *temp;
|
||||
float degc;
|
||||
opal_buffer_t data, *bptr;
|
||||
int32_t ncores;
|
||||
time_t now;
|
||||
char time_str[40];
|
||||
char *timestamp_str;
|
||||
|
||||
/* prep to store the results */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
|
||||
/* store our hostname */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* store the number of cores */
|
||||
ncores = (int32_t)opal_list_get_size(&tracking);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &ncores, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the sample time */
|
||||
now = time(NULL);
|
||||
/* pass the time along as a simple string */
|
||||
strftime(time_str, sizeof(time_str), "%F %T%z", localtime(&now));
|
||||
asprintf(×tamp_str, "%s", time_str);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, ×tamp_str, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
free(timestamp_str);
|
||||
return;
|
||||
}
|
||||
free(timestamp_str);
|
||||
|
||||
OPAL_LIST_FOREACH(trk, &tracking, core_tracker_t) {
|
||||
/* read the temp */
|
||||
fp = fopen(trk->file, "r");
|
||||
while (NULL != (temp = orte_getline(fp))) {
|
||||
degc = strtoul(temp, NULL, 10) / 100.0;
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:coretemp: Socket %d %s temp %f max %f critical %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->socket, trk->label, degc, trk->max_temp, trk->critical_temp);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, °c, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
free(temp);
|
||||
return;
|
||||
}
|
||||
free(temp);
|
||||
/* check for exceed critical temp */
|
||||
if (trk->critical_temp < degc) {
|
||||
/* alert the errmgr - this is a critical problem */
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:coretemp: Socket %d %s CRITICAL",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->socket, trk->label);
|
||||
} else if (trk->max_temp < degc) {
|
||||
/* alert the errmgr */
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:coretemp: Socket %d %s MAX",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->socket, trk->label);
|
||||
}
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* xfer the data for transmission */
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
}
|
||||
|
||||
/*
 * Store one received sample in the database.
 *
 * The sample is unpacked in the order coretemp_sample() packs it: host
 * name, number of cores, sample time, then one float per core.  The values
 * are handed to opal_db.add_log() under the name "coretemp" as an array
 * whose first entry is the sample time (key "ctime") followed by one entry
 * per core (keys "core0", "core1", ...).
 *
 * Does nothing once logging has been disabled.  If the database rejects
 * the entry, logging is quietly disabled for all later samples.  Unpack
 * and allocation failures are logged and the sample is dropped.
 */
static void coretemp_log(opal_buffer_t *sample)
{
    char *hostname = NULL;
    char *sampletime = NULL;
    int rc;
    int32_t n, ncores;
    opal_value_t *kv = NULL;
    float fval;
    int i;
    int nkv = 0;   /* number of kv[] entries that have been constructed */

    if (!log_enabled) {
        return;
    }

    /* unpack the host this came from */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &hostname, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* and the number of cores on that host */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &ncores, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* sample time */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &sampletime, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* the count comes off the wire - refuse nonsense before sizing storage */
    if (0 > ncores) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        goto cleanup;
    }

    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s Received log from host %s with %d cores",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == hostname) ? "NULL" : hostname, ncores);

    /* xfr to storage */
    kv = calloc((size_t)ncores + 1, sizeof(*kv));
    if (NULL == kv) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        goto cleanup;
    }

    /* load the sample time at the start - the value takes over the
     * unpacked string instead of duplicating it
     */
    OBJ_CONSTRUCT(&kv[0], opal_value_t);
    nkv = 1;
    kv[0].key = strdup("ctime");
    kv[0].type = OPAL_STRING;
    kv[0].data.string = sampletime;
    sampletime = NULL;

    for (i = 0; i < ncores; i++) {
        OBJ_CONSTRUCT(&kv[i+1], opal_value_t);
        nkv++;
        if (0 > asprintf(&kv[i+1].key, "core%d", i)) {
            /* contents of the key are undefined on failure */
            kv[i+1].key = NULL;
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            goto cleanup;
        }
        kv[i+1].type = OPAL_FLOAT;
        n = 1;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        kv[i+1].data.fval = fval;
    }

    /* store it */
    if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, ncores+1))) {
        /* don't bark about it - just quietly disable the log */
        log_enabled = false;
    }

cleanup:
    /* cleanup the xfr storage - only entries that were constructed */
    for (i = 0; i < nkv; i++) {
        OBJ_DESTRUCT(&kv[i]);
    }
    free(kv);
    free(sampletime);
    free(hostname);
}
|
35
orte/mca/sensor/coretemp/sensor_coretemp.h
Обычный файл
35
orte/mca/sensor/coretemp/sensor_coretemp.h
Обычный файл
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* CORETEMP resource manager sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_CORETEMP_H
|
||||
#define ORTE_SENSOR_CORETEMP_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
orte_sensor_base_component_t super;
|
||||
bool test;
|
||||
} orte_sensor_coretemp_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_coretemp_component_t mca_sensor_coretemp_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_coretemp_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
90
orte/mca/sensor/coretemp/sensor_coretemp_component.c
Обычный файл
90
orte/mca/sensor/coretemp/sensor_coretemp_component.c
Обычный файл
@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_coretemp.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_coretemp_open(void);
|
||||
static int orte_sensor_coretemp_close(void);
|
||||
static int orte_sensor_coretemp_query(mca_base_module_t **module, int *priority);
|
||||
static int coretemp_component_register(void);
|
||||
|
||||
orte_sensor_coretemp_component_t mca_sensor_coretemp_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"coretemp", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_coretemp_open, /* component open */
|
||||
orte_sensor_coretemp_close, /* component close */
|
||||
orte_sensor_coretemp_query, /* component query */
|
||||
coretemp_component_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_coretemp_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Component query function.
 *
 * Always offers the coretemp module, at priority 50.
 *
 * @param module   set to the address of the coretemp module
 * @param priority set to 50
 * @return ORTE_SUCCESS
 */
static int orte_sensor_coretemp_query(mca_base_module_t **module, int *priority)
{
    /* If we were built, we want to be used even when we will not sample
     * ourselves, because we must be present to log any results received
     * from elsewhere.  Existence and read access for at least one socket
     * were already tested at configure time, so no check is repeated here.
     */
    *module = (mca_base_module_t *)&orte_sensor_coretemp_module;
    *priority = 50;  /* ahead of heartbeat */

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_coretemp_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int coretemp_component_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_coretemp_component.super.base_version;
|
||||
|
||||
mca_sensor_coretemp_component.test = false;
|
||||
(void) mca_base_component_var_register (c, "test",
|
||||
"Generate and pass test vector",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
& mca_sensor_coretemp_component.test);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
41
orte/mca/sensor/sigar/Makefile.am
Обычный файл
41
orte/mca/sensor/sigar/Makefile.am
Обычный файл
@ -0,0 +1,41 @@
|
||||
#
|
||||
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-sensor-sigar.txt
|
||||
|
||||
sources = \
|
||||
sensor_sigar.c \
|
||||
sensor_sigar.h \
|
||||
sensor_sigar_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_sigar_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_sigar.la
|
||||
else
|
||||
component_noinst = libmca_sensor_sigar.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_sigar_la_CPPFLAGS = $(sensor_sigar_CPPFLAGS)
|
||||
mca_sensor_sigar_la_SOURCES = $(sources)
|
||||
mca_sensor_sigar_la_LDFLAGS = -module -avoid-version $(sensor_sigar_LDFLAGS)
|
||||
mca_sensor_sigar_la_LIBADD = $(sensor_sigar_LIBS) -lm
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_sigar_la_SOURCES =$(sources)
|
||||
libmca_sensor_sigar_la_CPPFLAGS = $(sensor_sigar_CPPFLAGS)
|
||||
libmca_sensor_sigar_la_LDFLAGS = -module -avoid-version $(sensor_sigar_LDFLAGS)
|
||||
libmca_sensor_sigar_la_LIBADD = $(sensor_sigar_LIBS) -lm
|
59
orte/mca/sensor/sigar/configure.m4
Обычный файл
59
orte/mca/sensor/sigar/configure.m4
Обычный файл
@ -0,0 +1,59 @@
|
||||
dnl -*- shell-script -*-
|
||||
dnl
|
||||
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
dnl $COPYRIGHT$
|
||||
dnl
|
||||
dnl Additional copyrights may follow
|
||||
dnl
|
||||
dnl $HEADER$
|
||||
dnl
|
||||
|
||||
# MCA_orte_sensor_sigar_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_sigar_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/sensor/sigar/Makefile])
|
||||
|
||||
AC_ARG_WITH([sigar],
|
||||
[AC_HELP_STRING([--with-sigar],
|
||||
[Build sigar support (default: no)])],
|
||||
[], with_sigar=no)
|
||||
|
||||
# do not build if support not requested
|
||||
AS_IF([test "$with_sigar" != "no"],
|
||||
[case "${host}" in
|
||||
i?86-*linux*|x86_64*linux*|ia64-*linux*|powerpc-*linux*|powerpc64-*linux*|sparc*-*linux*)
|
||||
AS_IF([test -r "/proc/cpuinfo"],
|
||||
[sensor_linux_happy="yes"],
|
||||
[sensor_linux_happy="no"])
|
||||
;;
|
||||
*)
|
||||
sensor_linux_happy="no"
|
||||
;;
|
||||
esac
|
||||
|
||||
AS_IF([test "$sensor_linux_happy" = "yes"],
|
||||
[libname="sigar"], [libname="sigar-universal-macosx"])
|
||||
|
||||
AS_IF([test ! -z "$with_sigar" -a "$with_sigar" != "yes"],
|
||||
[orte_check_sigar_dir="$with_sigar"])
|
||||
|
||||
OMPI_CHECK_PACKAGE([sensor_sigar],
|
||||
[sigar.h],
|
||||
[$libname],
|
||||
[sigar_proc_cpu_get],
|
||||
[],
|
||||
[$orte_check_sigar_dir],
|
||||
[],
|
||||
[$1],
|
||||
[AC_MSG_WARN([SIGAR SENSOR SUPPORT REQUESTED])
|
||||
AC_MSG_WARN([BUT REQUIRED LIBRARY OR HEADER NOT FOUND])
|
||||
AC_MSG_ERROR([CANNOT CONTINUE])
|
||||
$2])],
|
||||
[$2])
|
||||
|
||||
AC_DEFINE_UNQUOTED(ORTE_SIGAR_LINUX, [test "$sensor_linux_happy" = "yes"],
|
||||
[Which name to use for the sigar library on this OS])
|
||||
AC_SUBST(sensor_sigar_CPPFLAGS)
|
||||
AC_SUBST(sensor_sigar_LDFLAGS)
|
||||
AC_SUBST(sensor_sigar_LIBS)
|
||||
])dnl
|
20
orte/mca/sensor/sigar/help-orte-sensor-sigar.txt
Обычный файл
20
orte/mca/sensor/sigar/help-orte-sensor-sigar.txt
Обычный файл
@ -0,0 +1,20 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the sigar resource usage sensor
|
||||
#
|
||||
[mem-limit-exceeded]
|
||||
A process has exceeded the specified limit on memory usage:
|
||||
|
||||
Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
959
orte/mca/sensor/sigar/sensor_sigar.c
Обычный файл
959
orte/mca/sensor/sigar/sensor_sigar.c
Обычный файл
@ -0,0 +1,959 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifdef ORTE_SIGAR_LINUX
|
||||
#include <sigar.h>
|
||||
#else
|
||||
#include <libsigar-universal-macosx>
|
||||
#endif
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/class/opal_ring_buffer.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/pstat/pstat.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/mca/db/db.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_sigar.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
static void sigar_sample(void);
|
||||
static void sigar_log(opal_buffer_t *buf);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_sigar_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop,
|
||||
sigar_sample,
|
||||
sigar_log
|
||||
};
|
||||
|
||||
/* define some local classes */
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *interface;
|
||||
uint64_t rx_packets;
|
||||
uint64_t rx_bytes;
|
||||
uint64_t tx_packets;
|
||||
uint64_t tx_bytes;
|
||||
} sensor_sigar_interface_t;
|
||||
static void sit_cons(sensor_sigar_interface_t *sit)
|
||||
{
|
||||
sit->interface = NULL;
|
||||
sit->rx_packets = 0;
|
||||
sit->rx_bytes = 0;
|
||||
sit->tx_packets = 0;
|
||||
sit->tx_bytes = 0;
|
||||
}
|
||||
static void sit_dest(sensor_sigar_interface_t *sit)
|
||||
{
|
||||
if (NULL != sit->interface) {
|
||||
free(sit->interface);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(sensor_sigar_interface_t,
|
||||
opal_list_item_t,
|
||||
sit_cons, sit_dest);
|
||||
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *mount_pt;
|
||||
uint64_t reads;
|
||||
uint64_t writes;
|
||||
uint64_t read_bytes;
|
||||
uint64_t write_bytes;
|
||||
} sensor_sigar_disks_t;
|
||||
static void dit_cons(sensor_sigar_disks_t *dit)
|
||||
{
|
||||
dit->mount_pt = NULL;
|
||||
dit->reads = 0;
|
||||
dit->writes = 0;
|
||||
dit->read_bytes = 0;
|
||||
dit->write_bytes = 0;
|
||||
}
|
||||
static void dit_dest(sensor_sigar_disks_t *dit)
|
||||
{
|
||||
if (NULL != dit->mount_pt) {
|
||||
free(dit->mount_pt);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(sensor_sigar_disks_t,
|
||||
opal_list_item_t,
|
||||
dit_cons, dit_dest);
|
||||
|
||||
static sigar_t *sigar;
|
||||
static opal_list_t fslist;
|
||||
static opal_list_t netlist;
|
||||
static time_t last_sample = 0;
|
||||
static struct cpu_data_t {
|
||||
uint64_t user;
|
||||
uint64_t nice;
|
||||
uint64_t sys;
|
||||
uint64_t idle;
|
||||
uint64_t wait;
|
||||
uint64_t total;
|
||||
} pcpu;
|
||||
static struct swap_data_t {
|
||||
uint64_t page_in;
|
||||
uint64_t page_out;
|
||||
} pswap;
|
||||
static bool log_enabled = true;
|
||||
static opal_buffer_t test_vector;
|
||||
|
||||
static uint64_t metric_diff_calc(sigar_uint64_t newval, uint64_t oldval,
|
||||
const char *name_for_log,
|
||||
const char* value_name_for_log);
|
||||
static void generate_test_vector(opal_buffer_t *v);
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
sigar_file_system_list_t sigar_fslist;
|
||||
sigar_net_interface_list_t sigar_netlist;
|
||||
sensor_sigar_disks_t *dit;
|
||||
sensor_sigar_interface_t *sit;
|
||||
unsigned int i;
|
||||
|
||||
if (mca_sensor_sigar_component.test) {
|
||||
/* generate test vector */
|
||||
OBJ_CONSTRUCT(&test_vector, opal_buffer_t);
|
||||
generate_test_vector(&test_vector);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* setup the globals */
|
||||
OBJ_CONSTRUCT(&fslist, opal_list_t);
|
||||
OBJ_CONSTRUCT(&netlist, opal_list_t);
|
||||
pcpu.user = 0;
|
||||
pcpu.nice = 0;
|
||||
pcpu.sys = 0;
|
||||
pcpu.idle = 0;
|
||||
pcpu.wait = 0;
|
||||
pcpu.total = 0;
|
||||
pswap.page_in = 0;
|
||||
pswap.page_out = 0;
|
||||
|
||||
/* initialize sigar */
|
||||
if (0 != sigar_open(&sigar)) {
|
||||
opal_output(0, "%s: sigar_open failed on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_process_info.nodename);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* load the disk list */
|
||||
if (0 != sigar_file_system_list_get(sigar, &sigar_fslist)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
for (i = 0; i < sigar_fslist.number; i++) {
|
||||
if (sigar_fslist.data[i].type == SIGAR_FSTYPE_LOCAL_DISK || sigar_fslist.data[i].type == SIGAR_FSTYPE_NETWORK) {
|
||||
dit = OBJ_NEW(sensor_sigar_disks_t);
|
||||
dit->mount_pt = strdup(sigar_fslist.data[i].dir_name);
|
||||
opal_list_append(&fslist, &dit->super);
|
||||
}
|
||||
}
|
||||
sigar_file_system_list_destroy(sigar, &sigar_fslist);
|
||||
|
||||
/* load the list of network interfaces */
|
||||
if (0 != sigar_net_interface_list_get(sigar, &sigar_netlist)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
for (i=0; i < sigar_netlist.number; i++) {
|
||||
sit = OBJ_NEW(sensor_sigar_interface_t);
|
||||
sit->interface = strdup(sigar_netlist.data[i]);
|
||||
opal_list_append(&netlist, &sit->super);
|
||||
}
|
||||
sigar_net_interface_list_destroy(sigar, &sigar_netlist);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
if (mca_sensor_sigar_component.test) {
|
||||
/* destruct test vector */
|
||||
OBJ_DESTRUCT(&test_vector);
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL != sigar) {
|
||||
sigar_close(sigar);
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&fslist))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&fslist);
|
||||
while (NULL != (item = opal_list_remove_first(&netlist))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&netlist);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local processes
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Stop monitoring for the given job.
 * This module has nothing to do here; the function is a no-op.
 */
static void stop(orte_jobid_t jobid)
{
}
|
||||
|
||||
static void sigar_sample(void)
|
||||
{
|
||||
sigar_mem_t mem;
|
||||
sigar_swap_t swap;
|
||||
sigar_cpu_t cpu;
|
||||
sigar_loadavg_t loadavg;
|
||||
sigar_disk_usage_t tdisk;
|
||||
sensor_sigar_disks_t *dit;
|
||||
sigar_file_system_usage_t fsusage;
|
||||
sensor_sigar_interface_t *sit;
|
||||
sigar_net_interface_stat_t tnet, ifc;
|
||||
uint64_t reads, writes, read_bytes, write_bytes;
|
||||
uint64_t rxpkts, txpkts, rxbytes, txbytes;
|
||||
uint64_t ui64;
|
||||
opal_buffer_t data, *bptr;
|
||||
int rc;
|
||||
time_t now;
|
||||
double cpu_diff, tdiff;
|
||||
float tmp;
|
||||
char *ctmp;
|
||||
char time_str[40];
|
||||
char *timestamp_str;
|
||||
|
||||
if (mca_sensor_sigar_component.test) {
|
||||
/* just send the test vector */
|
||||
bptr = &test_vector;
|
||||
opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER);
|
||||
return;
|
||||
}
|
||||
|
||||
/* prep the buffer to collect the data */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
/* pack our name */
|
||||
ctmp = strdup("sigar");
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &ctmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(ctmp);
|
||||
/* include our node name */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the sample time */
|
||||
now = time(NULL);
|
||||
tdiff = difftime(now, last_sample);
|
||||
/* pass the time along as a simple string */
|
||||
strftime(time_str, sizeof(time_str), "%F %T%z", localtime(&now));
|
||||
asprintf(×tamp_str, "%s", time_str);
|
||||
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, ×tamp_str, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(timestamp_str);
|
||||
|
||||
/* get the memory usage for this node */
|
||||
memset(&mem, 0, sizeof(mem));
|
||||
sigar_mem_get(sigar, &mem);
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"mem total: %" PRIu64 " used: %" PRIu64 " actual used: %" PRIu64 " actual free: %" PRIu64 "",
|
||||
mem.total, mem.used, mem.actual_used, mem.actual_free);
|
||||
/* add it to the data */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &mem.total, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &mem.used, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &mem.actual_used, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &mem.actual_free, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get swap data */
|
||||
memset(&swap, 0, sizeof(swap));
|
||||
sigar_swap_get(sigar, &swap);
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"swap total: %" PRIu64 " used: %" PRIu64 "page_in: %" PRIu64 " page_out: %" PRIu64 "\n",
|
||||
swap.total, swap.used, swap.page_in, swap.page_out);
|
||||
/* compute the values we actually want and add them to the data */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &swap.total, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &swap.used, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
ui64 = swap.page_in - pswap.page_in;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &ui64, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
ui64 = swap.page_out - pswap.page_out;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &ui64, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the cpu usage */
|
||||
memset(&cpu, 0, sizeof(cpu));
|
||||
sigar_cpu_get(sigar, &cpu);
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"cpu user: %" PRIu64 " sys: %" PRIu64 " idle: %" PRIu64 " wait: %" PRIu64 " nice: %" PRIu64 " total: %" PRIu64 "",
|
||||
cpu.user, cpu.sys, cpu.idle, cpu.wait, cpu.nice, cpu.total);
|
||||
/* compute the values we actually want and add them to the data */
|
||||
cpu_diff = (double)(cpu.total - pcpu.total);
|
||||
tmp = (float)((cpu.user - pcpu.user) * 100.0 / cpu_diff) + (float)((cpu.nice - pcpu.nice) * 100.0 / cpu_diff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
tmp = ((float) (cpu.sys - pcpu.sys) * 100.0 / cpu_diff) + ((float)((cpu.wait - pcpu.wait) * 100.0 / cpu_diff));
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
tmp = (float) (cpu.idle - pcpu.idle) * 100.0 / cpu_diff;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
/* update the values */
|
||||
pcpu.user = cpu.user;
|
||||
pcpu.nice = cpu.nice;
|
||||
pcpu.sys = cpu.sys;
|
||||
pcpu.wait = cpu.wait;
|
||||
pcpu.idle = cpu.idle;
|
||||
pcpu.total = cpu.total;
|
||||
|
||||
/* get load average data */
|
||||
memset(&loadavg, 0, sizeof(loadavg));
|
||||
sigar_loadavg_get(sigar, &loadavg);
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"load_avg: %e %e %e",
|
||||
loadavg.loadavg[0], loadavg.loadavg[1], loadavg.loadavg[2]);
|
||||
/* add them to the data */
|
||||
tmp = (float)loadavg.loadavg[0];
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
tmp = (float)loadavg.loadavg[1];
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
tmp = (float)loadavg.loadavg[2];
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get disk usage data */
|
||||
memset(&tdisk, 0, sizeof(tdisk));
|
||||
OPAL_LIST_FOREACH(dit, &fslist, sensor_sigar_disks_t) {
|
||||
if (0 != sigar_file_system_usage_get(sigar, dit->mount_pt, &fsusage)) {
|
||||
opal_output(0, "%s Failed to get usage data for filesystem %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dit->mount_pt);
|
||||
} else {
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"FileSystem: %s Reads: %" PRIu64 " Writes: %" PRIu64 " ReadBytes: %" PRIu64 " WriteBytes: %" PRIu64 "",
|
||||
dit->mount_pt, fsusage.disk.reads, fsusage.disk.writes, fsusage.disk.read_bytes, fsusage.disk.write_bytes);
|
||||
/* compute the number of reads since last reading */
|
||||
reads = metric_diff_calc(fsusage.disk.reads, dit->reads, dit->mount_pt, "disk reads");
|
||||
dit->reads = fsusage.disk.reads; /* old = new */
|
||||
/* compute the number of writes since last reading */
|
||||
writes = metric_diff_calc(fsusage.disk.writes, dit->writes, dit->mount_pt, "disk writes");
|
||||
dit->writes = fsusage.disk.writes; /* old = new */
|
||||
/* compute the number of read bytes since last reading */
|
||||
read_bytes = metric_diff_calc(fsusage.disk.read_bytes, dit->read_bytes, dit->mount_pt, "disk read bytes");
|
||||
dit->read_bytes = fsusage.disk.read_bytes; /* old = new */
|
||||
/* compute the number of bytes written since last reading */
|
||||
write_bytes = metric_diff_calc(fsusage.disk.write_bytes, dit->write_bytes, dit->mount_pt, "disk write bytes");
|
||||
dit->write_bytes = fsusage.disk.write_bytes; /* old = new */
|
||||
opal_output_verbose(4, orte_sensor_base_framework.framework_output,
|
||||
"FileSystem: %s ReadsChange: %" PRIu64 " WritesChange: %" PRIu64 " ReadBytesChange: %" PRIu64 " WriteBytesChange: %" PRIu64 "",
|
||||
dit->mount_pt, reads, writes, read_bytes, write_bytes);
|
||||
/* accumulate the values */
|
||||
tdisk.reads += reads;
|
||||
tdisk.writes += writes;
|
||||
tdisk.read_bytes += read_bytes;
|
||||
tdisk.write_bytes += write_bytes;
|
||||
}
|
||||
}
|
||||
opal_output_verbose(4, orte_sensor_base_framework.framework_output,
|
||||
"Totals: ReadsChange: %" PRIu64 " WritesChange: %" PRIu64 " ReadBytesChange: %" PRIu64 " WriteBytesChange: %" PRIu64 "",
|
||||
tdisk.reads, tdisk.writes, tdisk.read_bytes, tdisk.write_bytes);
|
||||
/* compute the values we actually want and add them to the data */
|
||||
reads = (uint64_t)ceil((double)tdisk.reads/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &reads, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
writes = (uint64_t)ceil((double)tdisk.writes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &writes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
read_bytes = (uint64_t)ceil((double)tdisk.read_bytes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &read_bytes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
write_bytes = (uint64_t)ceil((double)tdisk.write_bytes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &write_bytes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get network usage data */
|
||||
memset(&tnet, 0, sizeof(tnet));
|
||||
OPAL_LIST_FOREACH(sit, &netlist, sensor_sigar_interface_t) {
|
||||
memset(&ifc, 0, sizeof(ifc));
|
||||
if (0 != sigar_net_interface_stat_get(sigar, sit->interface, &ifc)) {
|
||||
opal_output(0, "%s Failed to get usage data for interface %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), sit->interface);
|
||||
} else {
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"Interface: %s RecvdPackets: %" PRIu64 " RecvdBytes: %" PRIu64 " TransPackets: %" PRIu64 " TransBytes: %" PRIu64 "",
|
||||
sit->interface, ifc.rx_packets, ifc.rx_bytes, ifc.tx_packets, ifc.tx_bytes);
|
||||
/* compute the number of recvd packets since last reading */
|
||||
rxpkts = metric_diff_calc(ifc.rx_packets, sit->rx_packets, sit->interface, "rx packets");
|
||||
sit->rx_packets = ifc.rx_packets; /* old = new */
|
||||
/* compute the number of transmitted packets since last reading */
|
||||
txpkts = metric_diff_calc(ifc.tx_packets, sit->tx_packets, sit->interface, "tx packets");
|
||||
sit->tx_packets = ifc.tx_packets; /* old = new */
|
||||
/* compute the number of recvd bytes since last reading */
|
||||
rxbytes = metric_diff_calc(ifc.rx_bytes, sit->rx_bytes, sit->interface, "rx bytes");
|
||||
sit->rx_bytes = ifc.rx_bytes; /* old = new */
|
||||
/* compute the number of transmitted bytes since last reading */
|
||||
txbytes = metric_diff_calc(ifc.tx_bytes, sit->tx_bytes, sit->interface, "tx bytes");
|
||||
sit->tx_bytes = ifc.tx_bytes; /* old = new */
|
||||
opal_output_verbose(4, orte_sensor_base_framework.framework_output,
|
||||
"Interface: %s RxPkts: %" PRIu64 " TxPkts: %" PRIu64 " RxBytes: %" PRIu64 " TxBytes: %" PRIu64 "",
|
||||
sit->interface, rxpkts, txpkts, rxbytes, txbytes);
|
||||
/* accumulate the values */
|
||||
tnet.rx_packets += rxpkts;
|
||||
tnet.rx_bytes += rxbytes;
|
||||
tnet.tx_packets += txpkts;
|
||||
tnet.tx_bytes += txbytes;
|
||||
}
|
||||
}
|
||||
opal_output_verbose(4, orte_sensor_base_framework.framework_output,
|
||||
"Totals: RxPkts: %" PRIu64 " TxPkts: %" PRIu64 " RxBytes: %" PRIu64 " TxBytes: %" PRIu64 "",
|
||||
tnet.rx_packets, tnet.tx_packets, tnet.rx_bytes, tnet.tx_bytes);
|
||||
/* compute the values we actually want and add them to the data */
|
||||
rxpkts = (uint64_t)ceil((double)tnet.rx_packets/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &rxpkts, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
txpkts = (uint64_t)ceil((double)tnet.tx_packets/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &txpkts, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
rxbytes = (uint64_t)ceil((double)tnet.rx_bytes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &rxbytes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
txbytes = (uint64_t)ceil((double)tnet.tx_bytes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &txbytes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* xfer the data for transmission - need at least one prior sample before doing so */
|
||||
if (0 < last_sample) {
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
|
||||
last_sample = now;
|
||||
}
|
||||
|
||||
/*
 * Unpack one received sigar sample and hand it to the database as a
 * 24-entry key/value record under the "sigar" table.
 *
 * The buffer is expected to hold, in order: the sender's hostname, the
 * sample timestamp string, and then the 22 metrics listed in the table
 * below. Does nothing when logging has been disabled; logging disables
 * itself after the first failed opal_db.add_log() call.
 *
 * Every exit path after the hostname has been unpacked releases the
 * hostname, the timestamp and the key/value storage.
 */
static void sigar_log(opal_buffer_t *sample)
{
    /* metrics following the timestamp, in wire order */
    static const struct {
        const char *key;
        opal_data_type_t type;
    } metrics[] = {
        { "mem_total",       OPAL_UINT64 },
        { "mem_used",        OPAL_UINT64 },
        { "mem_actual_used", OPAL_UINT64 },
        { "mem_actual_free", OPAL_UINT64 },
        { "swap_total",      OPAL_UINT64 },
        { "swap_used",       OPAL_UINT64 },
        { "swap_page_in",    OPAL_UINT64 },
        { "swap_page_out",   OPAL_UINT64 },
        { "cpu_user",        OPAL_FLOAT  },
        { "cpu_sys",         OPAL_FLOAT  },
        { "cpu_idle",        OPAL_FLOAT  },
        { "load0",           OPAL_FLOAT  },
        { "load1",           OPAL_FLOAT  },
        { "load2",           OPAL_FLOAT  },
        { "disk_ro_rate",    OPAL_UINT64 },
        { "disk_wo_rate",    OPAL_UINT64 },
        { "disk_rb_rate",    OPAL_UINT64 },
        { "disk_wb_rate",    OPAL_UINT64 },
        { "net_rp_rate",     OPAL_UINT64 },
        { "net_wp_rate",     OPAL_UINT64 },
        { "net_rb_rate",     OPAL_UINT64 },
        { "net_wb_rate",     OPAL_UINT64 }
    };
    /* "ctime" + "hostname" + one entry per metric = 24 */
    opal_value_t kv[2 + sizeof(metrics) / sizeof(metrics[0])];
    const int nmetrics = (int)(sizeof(metrics) / sizeof(metrics[0]));
    const int nkv = (int)(sizeof(kv) / sizeof(kv[0]));
    char *hostname = NULL;
    char *sampletime = NULL;
    uint64_t uval = 0;
    float fval = 0.0f;
    int32_t n;
    int rc;
    int i, m;

    if (!log_enabled) {
        return;
    }

    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &hostname, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s Received log from host %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == hostname) ? "NULL" : hostname);

    if (NULL == hostname) {
        /* the record cannot be attributed to a host, and duplicating a
         * NULL string below would crash - drop the sample */
        return;
    }

    /* prep the xfr storage */
    for (i = 0; i < nkv; i++) {
        OBJ_CONSTRUCT(&kv[i], opal_value_t);
    }

    /* unpack the incoming data and xfer it for storage */
    i = 0;

    /* sample time */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &sampletime, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (NULL == sampletime) {
        /* same reasoning as for a missing hostname */
        goto cleanup;
    }
    kv[i].key = strdup("ctime");
    kv[i].type = OPAL_STRING;
    kv[i++].data.string = strdup(sampletime);

    /* hostname */
    kv[i].key = strdup("hostname");
    kv[i].type = OPAL_STRING;
    kv[i++].data.string = strdup(hostname);

    /* the numeric metrics, each either a uint64 or a float */
    for (m = 0; m < nmetrics; m++, i++) {
        n = 1;
        if (OPAL_UINT64 == metrics[m].type) {
            rc = opal_dss.unpack(sample, &uval, &n, OPAL_UINT64);
        } else {
            rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT);
        }
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        kv[i].key = strdup(metrics[m].key);
        kv[i].type = metrics[m].type;
        if (OPAL_UINT64 == metrics[m].type) {
            kv[i].data.uint64 = uval;
        } else {
            kv[i].data.fval = fval;
        }
    }

    /* store it */
    if (ORTE_SUCCESS != (rc = opal_db.add_log("sigar", kv, nkv))) {
        /* don't bark about it - just quietly disable the log */
        log_enabled = false;
    }

cleanup:
    /* cleanup the xfr storage; entries that were never filled are still
     * in their freshly constructed state */
    for (i = 0; i < nkv; i++) {
        OBJ_DESTRUCT(&kv[i]);
    }
    free(sampletime);
    free(hostname);
}
|
||||
|
||||
/* Helper function to calculate the metric differences */
|
||||
static uint64_t metric_diff_calc(sigar_uint64_t newval, uint64_t oldval,
|
||||
const char *name_for_log,
|
||||
const char *value_name_for_log)
|
||||
{
|
||||
uint64_t diff;
|
||||
|
||||
if (newval < oldval) {
|
||||
/* assume that the value was reset and we are starting over */
|
||||
opal_output_verbose(3, orte_sensor_base_framework.framework_output,
|
||||
"%s metric_diff_calc: new value %" PRIu64 " is less than old value %" PRIu64
|
||||
" for %s metric %s; assume the value was reset and set diff to new value.",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
newval, oldval, name_for_log, value_name_for_log);
|
||||
diff = newval;
|
||||
} else {
|
||||
diff = newval - oldval;
|
||||
}
|
||||
|
||||
return diff;
|
||||
}
|
||||
|
||||
/*
 * Fill `v` with a synthetic sample laid out like a real one:
 * "sigar", the node name, a timestamp string, then
 *   8 uint64 values 1..8    (mem total/used/actual used/actual free,
 *                            swap total/used/page in/page out),
 *   6 float values 1.0..6.0 (cpu user/sys/idle, three load averages),
 *   8 uint64 values 9..16   (disk reads/writes/read bytes/write bytes,
 *                            net rx pkts/tx pkts/rx bytes/tx bytes).
 * Pack return codes are not checked.
 */
static void generate_test_vector(opal_buffer_t *v)
{
    char *str;
    uint64_t counter = 0;
    float level = 0.0;
    time_t stamp;
    int k;

    str = strdup("sigar");
    opal_dss.pack(v, &str, 1, OPAL_STRING);
    free(str);
    opal_dss.pack(v, &orte_process_info.nodename, 1, OPAL_STRING);

    /* get the time so it will be unique each time, and pass it along
     * as a simple string with the trailing newline stripped */
    stamp = time(NULL);
    str = ctime(&stamp);
    str[strlen(str)-1] = '\0';
    opal_dss.pack(v, &str, 1, OPAL_STRING);

    /* memory and swap counters */
    for (k = 0; k < 8; k++) {
        counter++;
        opal_dss.pack(v, &counter, 1, OPAL_UINT64);
    }

    /* cpu percentages and load averages */
    for (k = 0; k < 6; k++) {
        level += 1.0;
        opal_dss.pack(v, &level, 1, OPAL_FLOAT);
    }

    /* disk and network rates continue the integer sequence */
    for (k = 0; k < 8; k++) {
        counter++;
        opal_dss.pack(v, &counter, 1, OPAL_UINT64);
    }
}
|
35
orte/mca/sensor/sigar/sensor_sigar.h
Обычный файл
35
orte/mca/sensor/sigar/sensor_sigar.h
Обычный файл
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* SIGAR resource manager sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_SIGAR_H
|
||||
#define ORTE_SENSOR_SIGAR_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Component structure for the sigar sensor: the base sensor component
 * extended with one component-specific setting. */
typedef struct {
|
||||
/* base sensor component; must stay first because the component is
 * initialized positionally */
orte_sensor_base_component_t super;
|
||||
/* when true, the pre-built test vector is used instead of real
 * samples; exposed as the "test" MCA variable */
bool test;
|
||||
} orte_sensor_sigar_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_sigar_component_t mca_sensor_sigar_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_sigar_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
87
orte/mca/sensor/sigar/sensor_sigar_component.c
Обычный файл
87
orte/mca/sensor/sigar/sensor_sigar_component.c
Обычный файл
@ -0,0 +1,87 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_sigar.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_sigar_open(void);
|
||||
static int orte_sensor_sigar_close(void);
|
||||
static int orte_sensor_sigar_query(mca_base_module_t **module, int *priority);
|
||||
static int sigar_component_register(void);
|
||||
|
||||
/* Public component instance for the sigar sensor. Only the embedded
 * base component is initialized here (positionally); the `test` flag is
 * set in sigar_component_register(). */
orte_sensor_sigar_component_t mca_sensor_sigar_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"sigar", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_sigar_open, /* component open */
|
||||
orte_sensor_sigar_close, /* component close */
|
||||
orte_sensor_sigar_query, /* component query */
|
||||
/* component register: sets up the "test" MCA variable */
sigar_component_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_sigar_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Component query hook: always offers the sigar module, at priority 50.
 *
 * @param module   receives a pointer to orte_sensor_sigar_module
 * @param priority receives the selection priority
 * @return ORTE_SUCCESS
 */
static int orte_sensor_sigar_query(mca_base_module_t **module, int *priority)
{
    /* if we can build, then we definitely want to be used
     * even if we aren't going to sample as we have to be
     * present in order to log any received results
     */
    *module = (mca_base_module_t *)&orte_sensor_sigar_module;
    *priority = 50;  /* ahead of heartbeat */

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_sigar_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int sigar_component_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_sigar_component.super.base_version;
|
||||
|
||||
mca_sensor_sigar_component.test = false;
|
||||
(void) mca_base_component_var_register (c, "test",
|
||||
"Generate and pass test vector",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
& mca_sensor_sigar_component.test);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user