Per the RFC, remove the sensor framework from the ORTE code area, relocating it offsite to the ORCM code area. Also update some ignores to ensure we don't pick up crosstalk in components.
This commit was SVN r31403.
Этот коммит содержится в:
родитель
8897e2f5bb
Коммит
a368e84e70
@ -40,7 +40,6 @@
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
@ -474,24 +473,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:hnp: proc %s exceeded sensor boundary",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!jdata->abort) {
|
||||
jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED;
|
||||
/* point to the lowest rank to cause the problem */
|
||||
jdata->aborted_proc = pptr;
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
}
|
||||
/* abnormal termination - abort */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:hnp: proc %s exited with non-zero status %d",
|
||||
@ -614,9 +595,6 @@ static void default_hnp_abort(orte_job_t *jdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
|
||||
/* the job aborted - turn off any sensors on this job */
|
||||
orte_sensor.stop(jdata->jobid);
|
||||
|
||||
/* set control params to indicate we are terminating */
|
||||
orte_job_term_ordered = true;
|
||||
orte_enable_recovery = false;
|
||||
|
@ -8,6 +8,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -41,7 +42,6 @@
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
@ -96,9 +96,6 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child);
|
||||
static bool all_children_registered(orte_jobid_t job);
|
||||
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf);
|
||||
static void failed_start(orte_job_t *jobdat);
|
||||
static void update_local_children(orte_job_t *jobdat,
|
||||
orte_job_state_t jobstate,
|
||||
orte_proc_state_t state);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
|
||||
|
||||
static void job_errors(int fd, short args, void *cbdata);
|
||||
@ -168,12 +165,6 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
case ORTE_JOB_STATE_FAILED_TO_START:
|
||||
failed_start(jdata);
|
||||
break;
|
||||
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
/* update all procs in job */
|
||||
update_local_children(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||
break;
|
||||
case ORTE_JOB_STATE_COMM_FAILED:
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
@ -341,15 +332,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
orte_proc_state_to_str(state),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
|
||||
if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
|
||||
child->state = state;
|
||||
/* Decrement the number of local procs */
|
||||
jdata->num_local_procs--;
|
||||
/* kill this proc */
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
|
||||
if (!orte_abort_non_zero_exit) {
|
||||
/* leave the child in orte_local_children so we can
|
||||
@ -791,36 +773,12 @@ static void failed_start(orte_job_t *jobdat)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Record a new state for a job and for each of its local children.
 *
 * jobdat   - job whose state field is overwritten with jobstate
 * jobstate - job state to store in jobdat
 * state    - proc state assigned to every entry of orte_local_children
 *            whose name.jobid equals jobdat->jobid
 */
static void update_local_children(orte_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state)
{
    orte_proc_t *kid;
    int idx;

    jobdat->state = jobstate;

    /* empty slots in the array are simply passed over */
    for (idx = 0; idx < orte_local_children->size; ++idx) {
        kid = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);
        if (NULL != kid && kid->name.jobid == jobdat->jobid) {
            kid->state = state;
        }
    }
}
|
||||
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
{
|
||||
opal_pointer_array_t cmd;
|
||||
orte_proc_t proc;
|
||||
int rc;
|
||||
|
||||
/* stop local sensors for this job */
|
||||
if (ORTE_VPID_WILDCARD == vpid) {
|
||||
orte_sensor.stop(job);
|
||||
}
|
||||
|
||||
if (ORTE_JOBID_WILDCARD == job
|
||||
&& ORTE_VPID_WILDCARD == vpid) {
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
|
||||
|
@ -66,8 +66,6 @@
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
@ -596,20 +594,6 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* setup the SENSOR framework */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sensor_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sensor_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sensor_select";
|
||||
goto error;
|
||||
}
|
||||
/* start the local sensors */
|
||||
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* setup the DFS framework */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -634,10 +618,6 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
|
||||
int orte_ess_base_orted_finalize(void)
|
||||
{
|
||||
/* stop the local sensors */
|
||||
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
|
||||
(void) mca_base_framework_close(&orte_sensor_base_framework);
|
||||
|
||||
if (signals_set) {
|
||||
/* Release all local signal handlers */
|
||||
opal_event_del(&epipe_handler);
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -64,8 +64,6 @@
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
@ -711,20 +709,6 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* setup the SENSOR framework */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sensor_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sensor_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sensor_select";
|
||||
goto error;
|
||||
}
|
||||
/* start the local sensors */
|
||||
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* setup the dfs framework */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -799,10 +783,6 @@ static int rte_finalize(void)
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
/* stop the local sensors */
|
||||
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
|
||||
(void) mca_base_framework_close(&orte_sensor_base_framework);
|
||||
|
||||
/* close the dfs */
|
||||
(void) mca_base_framework_close(&orte_dfs_base_framework);
|
||||
(void) mca_base_framework_close(&orte_filem_base_framework);
|
||||
|
@ -64,7 +64,6 @@
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/routed/base/base.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
|
||||
@ -1606,9 +1605,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
"%s odls:launch setting waitpids",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* start the sensors for this job (if any) */
|
||||
orte_sensor.start(jobdat->jobid);
|
||||
|
||||
/* setup the waitpids on the children that started */
|
||||
for (idx=0; idx < orte_local_children->size; idx++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
||||
|
@ -60,7 +60,6 @@
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "orte/mca/grpcomm/base/base.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
|
@ -1,30 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# main library setup
|
||||
noinst_LTLIBRARIES = libmca_sensor.la
|
||||
libmca_sensor_la_SOURCES =
|
||||
|
||||
# local files
|
||||
headers = sensor.h \
|
||||
sensor_types.h
|
||||
|
||||
libmca_sensor_la_SOURCES += $(headers)
|
||||
|
||||
# Conditionally install the header files
|
||||
if WANT_INSTALL_HEADERS
|
||||
ortedir = $(ompiincludedir)/$(subdir)
|
||||
nobase_orte_HEADERS = $(headers)
|
||||
endif
|
||||
|
||||
include base/Makefile.am
|
||||
|
||||
distclean-local:
|
||||
rm -f base/static-components.h
|
@ -1,19 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
headers += \
|
||||
base/base.h \
|
||||
base/sensor_private.h
|
||||
|
||||
libmca_sensor_la_SOURCES += \
|
||||
base/sensor_base_frame.c \
|
||||
base/sensor_base_select.c \
|
||||
base/sensor_base_fns.c
|
@ -1,38 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_BASE_H
|
||||
#define MCA_SENSOR_BASE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* MCA Framework
|
||||
*/
|
||||
ORTE_DECLSPEC extern mca_base_framework_t orte_sensor_base_framework;
|
||||
/* select a component */
|
||||
ORTE_DECLSPEC int orte_sensor_base_select(void);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
@ -1,158 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
static bool mods_active = false;
|
||||
|
||||
/*
 * Start every selected sensor module for the given job and, if it is
 * not already running, arm the periodic sampling timer.
 *
 * Nothing happens unless the configured sample rate (tv_sec) is
 * greater than zero.
 */
void orte_sensor_base_start(orte_jobid_t job)
{
    orte_sensor_active_module_t *entry;
    int slot;

    /* a non-positive rate means sampling is disabled */
    if (orte_sensor_base.rate.tv_sec <= 0) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: starting sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* walk the module array in its stored order */
    for (slot = 0; slot < orte_sensor_base.modules.size; ++slot) {
        entry = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, slot);
        if (NULL == entry) {
            continue;
        }
        /* at least one module exists, so stop/sample have work to do */
        mods_active = true;
        if (NULL != entry->module->start) {
            entry->module->start(job);
        }
    }

    if (!mods_active || orte_sensor_base.active) {
        return;
    }

    /* NOTE(review): a fresh buffer is allocated each time the timer is
     * (re)armed; whether an earlier buffer is released elsewhere is not
     * visible here - confirm */
    orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);

    /* arm a timer that fires after one sample period */
    orte_sensor_base.active = true;
    opal_event_evtimer_set(orte_event_base, &orte_sensor_base.sample_ev,
                           orte_sensor_base_sample, NULL);
    opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
}
|
||||
|
||||
/*
 * Cancel the sampling timer (if armed) and invoke the stop entry point
 * of every selected sensor module for the given job.
 *
 * Returns immediately when no module was ever seen by a start call.
 */
void orte_sensor_base_stop(orte_jobid_t job)
{
    orte_sensor_active_module_t *entry;
    int slot;

    if (!mods_active) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: stopping sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* remove the pending timer event so no further samples are taken */
    if (orte_sensor_base.active) {
        opal_event_del(&orte_sensor_base.sample_ev);
        orte_sensor_base.active = false;
    }

    /* walk the module array in its stored order */
    for (slot = 0; slot < orte_sensor_base.modules.size; ++slot) {
        entry = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, slot);
        if (NULL != entry && NULL != entry->module->stop) {
            entry->module->stop(job);
        }
    }
}
|
||||
|
||||
void orte_sensor_base_sample(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_sensor_active_module_t *i_module;
|
||||
int i;
|
||||
|
||||
if (!mods_active) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* see if we were ordered to stop */
|
||||
if (!orte_sensor_base.active) {
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:base: sampling sensors",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
/* call the sample function of all modules in priority order from
|
||||
* highest to lowest - the heartbeat should always be the lowest
|
||||
* priority, so it will send any collected data
|
||||
*/
|
||||
for (i=0; i < orte_sensor_base.modules.size; i++) {
|
||||
if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL != i_module->module->sample) {
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:base: sampling component %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
i_module->component->base_version.mca_component_name);
|
||||
i_module->module->sample();
|
||||
}
|
||||
}
|
||||
|
||||
/* restart the timer */
|
||||
opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Hand a data buffer to the log function of the named sensor component.
 *
 * comp - component name to look up; a NULL name is ignored
 * data - buffer passed unchanged to the matching module's log function
 *
 * Only the first module whose component name matches is considered; if
 * that module has no log function, nothing is done.
 */
void orte_sensor_base_log(char *comp, opal_buffer_t *data)
{
    orte_sensor_active_module_t *entry;
    int slot;

    if (NULL == comp) {
        /* without a name there is no module to dispatch to */
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: logging sensor %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp);

    for (slot = 0; slot < orte_sensor_base.modules.size; ++slot) {
        entry = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, slot);
        if (NULL == entry ||
            0 != strcmp(comp, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* found the requested component - log if it knows how */
        if (NULL != entry->module->log) {
            entry->module->log(data);
        }
        return;
    }
}
|
@ -1,131 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
* component's public mca_base_component_t struct.
|
||||
*/
|
||||
|
||||
#include "orte/mca/sensor/base/static-components.h"
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
*/
|
||||
orte_sensor_base_API_module_t orte_sensor = {
|
||||
orte_sensor_base_start,
|
||||
orte_sensor_base_stop
|
||||
};
|
||||
orte_sensor_base_t orte_sensor_base;
|
||||
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
static int orte_sensor_base_sample_rate = 0;
|
||||
|
||||
/*
 * Register the framework-level MCA variables:
 *   orte_sensor_base_sample_rate - "Sample rate in seconds" (default 0)
 *   orte_sensor_base_log_samples - "Log samples to database" (default false)
 * Each variable also gets a deprecated synonym without the "base"
 * component part.
 *
 * The flags argument is not used.  Always returns ORTE_SUCCESS.
 */
static int orte_sensor_base_register(mca_base_register_flag_t flags)
{
    int rate_id;
    int log_id;

    /* sampling period; the default is assigned before registration */
    orte_sensor_base_sample_rate = 0;
    rate_id = mca_base_var_register("orte", "sensor", "base", "sample_rate",
                                    "Sample rate in seconds",
                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                    OPAL_INFO_LVL_9,
                                    MCA_BASE_VAR_SCOPE_READONLY,
                                    &orte_sensor_base_sample_rate);
    mca_base_var_register_synonym(rate_id, "orte", "sensor", NULL, "sample_rate",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    /* whether samples should be logged */
    orte_sensor_base.log_samples = false;
    log_id = mca_base_var_register("orte", "sensor", "base", "log_samples",
                                   "Log samples to database",
                                   MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                   OPAL_INFO_LVL_9,
                                   MCA_BASE_VAR_SCOPE_READONLY,
                                   &orte_sensor_base.log_samples);
    mca_base_var_register_synonym(log_id, "orte", "sensor", NULL, "log_samples",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    return ORTE_SUCCESS;
}
|
||||
|
||||
static int orte_sensor_base_close(void)
|
||||
{
|
||||
orte_sensor_active_module_t *i_module;
|
||||
int i;
|
||||
|
||||
for (i=0; i < orte_sensor_base.modules.size; i++) {
|
||||
if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL != i_module->module->finalize) {
|
||||
i_module->module->finalize();
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_sensor_base.modules);
|
||||
|
||||
/* Close all remaining available components */
|
||||
return mca_base_framework_components_close(&orte_sensor_base_framework, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components, or the one
|
||||
* that was specifically requested via a MCA parameter.
|
||||
*/
|
||||
static int orte_sensor_base_open(mca_base_open_flag_t flags)
|
||||
{
|
||||
/* initialize globals */
|
||||
orte_sensor_base.active = false;
|
||||
|
||||
/* construct the array of modules */
|
||||
OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1);
|
||||
|
||||
/* get the sample rate */
|
||||
orte_sensor_base.rate.tv_sec = orte_sensor_base_sample_rate;
|
||||
orte_sensor_base.rate.tv_usec = 0;
|
||||
|
||||
/* Open up all available components */
|
||||
return mca_base_framework_components_open(&orte_sensor_base_framework, flags);
|
||||
}
|
||||
|
||||
MCA_BASE_FRAMEWORK_DECLARE(orte, sensor, "ORTE Monitoring Sensors",
|
||||
orte_sensor_base_register,
|
||||
orte_sensor_base_open, orte_sensor_base_close,
|
||||
mca_sensor_base_static_components, 0);
|
||||
|
||||
/* Constructor for orte_sensor_active_module_t: a newly created entry
 * starts with sampling enabled. */
static void cons(orte_sensor_active_module_t *mod)
{
    mod->sampling = true;
}
|
||||
OBJ_CLASS_INSTANCE(orte_sensor_active_module_t,
|
||||
opal_object_t,
|
||||
cons, NULL);
|
@ -1,218 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
|
||||
static bool selected = false;
|
||||
|
||||
/**
|
||||
* Function for weeding out sensor components that don't want to run.
|
||||
*
|
||||
* Call the init function on all available components to find out if
|
||||
* they want to run. Select all components that don't fail. Failing
|
||||
* components will be closed and unloaded. The selected modules will
|
||||
* be returned to the caller in a opal_list_t.
|
||||
*/
|
||||
int orte_sensor_base_select(void)
|
||||
{
|
||||
mca_base_component_list_item_t *cli = NULL;
|
||||
orte_sensor_base_component_t *component = NULL;
|
||||
mca_base_module_t *module = NULL;
|
||||
orte_sensor_active_module_t *i_module;
|
||||
int priority = 0, i, j, low_i;
|
||||
opal_pointer_array_t tmp_array;
|
||||
bool none_found;
|
||||
orte_sensor_active_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
|
||||
bool duplicate;
|
||||
|
||||
if (selected) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
selected = true;
|
||||
|
||||
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
|
||||
|
||||
opal_output_verbose(10, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select: Auto-selecting components");
|
||||
|
||||
/*
|
||||
* Traverse the list of available components.
|
||||
* For each call their 'query' functions to determine relative priority.
|
||||
*/
|
||||
none_found = true;
|
||||
OPAL_LIST_FOREACH(cli, &orte_sensor_base_framework.framework_components, mca_base_component_list_item_t) {
|
||||
component = (orte_sensor_base_component_t *) cli->cli_component;
|
||||
|
||||
/*
|
||||
* If there is a query function then use it.
|
||||
*/
|
||||
if (NULL == component->base_version.mca_query_component) {
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Skipping component [%s]. It does not implement a query function",
|
||||
component->base_version.mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Query this component for the module and priority
|
||||
*/
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Querying component [%s]",
|
||||
component->base_version.mca_component_name);
|
||||
|
||||
component->base_version.mca_query_component(&module, &priority);
|
||||
|
||||
/*
|
||||
* If no module was returned or negative priority, then skip component
|
||||
*/
|
||||
if (NULL == module || priority < 0) {
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Skipping component [%s]. Query failed to return a module",
|
||||
component->base_version.mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check to see if we already have someone who senses the
|
||||
* same things - if so, take the higher priority one
|
||||
*/
|
||||
duplicate = false;
|
||||
for (i=0; i < tmp_array.size; i++) {
|
||||
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
|
||||
if (NULL == tmp_module) {
|
||||
continue;
|
||||
}
|
||||
if (0 == strcmp(component->data_measured, tmp_module->component->data_measured)) {
|
||||
if (tmp_module->priority < priority) {
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Replacing component %s with %s - both measure %s",
|
||||
tmp_module->component->base_version.mca_component_name,
|
||||
component->base_version.mca_component_name,
|
||||
component->data_measured);
|
||||
OBJ_RELEASE(tmp_module);
|
||||
opal_pointer_array_set_item(&tmp_array, i, NULL);
|
||||
break;
|
||||
} else {
|
||||
duplicate = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (duplicate) {
|
||||
/* ignore this component */
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Ignoring component %s - duplicate with higher priority measures %s",
|
||||
component->base_version.mca_component_name,
|
||||
component->data_measured);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append them to the temporary list, we will sort later
|
||||
*/
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Query of component [%s] set priority to %d",
|
||||
component->base_version.mca_component_name, priority);
|
||||
tmp_module = OBJ_NEW(orte_sensor_active_module_t);
|
||||
tmp_module->component = component;
|
||||
tmp_module->module = (orte_sensor_base_module_t*)module;
|
||||
tmp_module->priority = priority;
|
||||
|
||||
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
|
||||
none_found = false;
|
||||
}
|
||||
|
||||
if (none_found) {
|
||||
/* okay for no modules to be found */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the list by decending priority
|
||||
*/
|
||||
priority = 0;
|
||||
for(j = 0; j < tmp_array.size; ++j) {
|
||||
tmp_module_sw = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, j);
|
||||
if( NULL == tmp_module_sw ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
low_i = -1;
|
||||
priority = tmp_module_sw->priority;
|
||||
|
||||
for(i = 0; i < tmp_array.size; ++i) {
|
||||
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
|
||||
if( NULL == tmp_module ) {
|
||||
continue;
|
||||
}
|
||||
if( tmp_module->priority > priority ) {
|
||||
low_i = i;
|
||||
priority = tmp_module->priority;
|
||||
}
|
||||
}
|
||||
|
||||
if( low_i >= 0 ) {
|
||||
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
|
||||
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
|
||||
j--; /* Try this entry again, if it is not the lowest */
|
||||
} else {
|
||||
tmp_module = tmp_module_sw;
|
||||
opal_pointer_array_set_item(&tmp_array, j, NULL);
|
||||
}
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Add module with priority [%s] %d",
|
||||
tmp_module->component->base_version.mca_component_name, tmp_module->priority);
|
||||
opal_pointer_array_add(&orte_sensor_base.modules, tmp_module);
|
||||
}
|
||||
OBJ_DESTRUCT(&tmp_array);
|
||||
|
||||
/*
|
||||
* Initialize each of the modules in priority order from
|
||||
* highest to lowest
|
||||
*/
|
||||
for(i = 0; i < orte_sensor_base.modules.size; ++i) {
|
||||
i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i);
|
||||
if( NULL == i_module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != i_module->module->init ) {
|
||||
if (ORTE_SUCCESS != i_module->module->init()) {
|
||||
/* can't sample - however, if we are the HNP
|
||||
* or an aggregator, then we need this module
|
||||
* anyway so we can log incoming data
|
||||
*/
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
|
||||
i_module->sampling = false;
|
||||
} else {
|
||||
opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
|
||||
OBJ_RELEASE(i_module);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,67 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_PRIVATE_H
|
||||
#define MCA_SENSOR_PRIVATE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
|
||||
/*
|
||||
* Global functions for MCA overall collective open and close
|
||||
*/
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* define a struct to hold framework-global values */
|
||||
typedef struct {
    /* selected modules, stored in descending priority order */
    opal_pointer_array_t modules;
    /* value of the "log_samples" MCA variable ("Log samples to database") */
    bool log_samples;
    /* true while the periodic sampling timer is armed */
    bool active;
    /* sampling period; tv_sec comes from the "sample_rate" MCA variable */
    struct timeval rate;
    /* timer event that triggers orte_sensor_base_sample */
    opal_event_t sample_ev;
    /* buffer created when the sampling timer is first armed, used to
     * collect samples */
    opal_buffer_t *samples;
} orte_sensor_base_t;
|
||||
|
||||
/* Bookkeeping entry for one selected sensor module */
typedef struct {
    opal_object_t super;
    /* component that supplied the module */
    orte_sensor_base_component_t *component;
    /* module returned by the component's query function */
    orte_sensor_base_module_t *module;
    /* priority reported by the component's query function */
    int priority;
    /* set to true by the constructor; cleared when the module's init
     * fails but the module is kept on an HNP or aggregator */
    bool sampling;
} orte_sensor_active_module_t;
|
||||
OBJ_CLASS_DECLARATION(orte_sensor_active_module_t);
|
||||
|
||||
|
||||
ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base;
|
||||
ORTE_DECLSPEC void orte_sensor_base_start(orte_jobid_t job);
|
||||
ORTE_DECLSPEC void orte_sensor_base_stop(orte_jobid_t job);
|
||||
ORTE_DECLSPEC void orte_sensor_base_sample(int fd, short args, void *cbdata);
|
||||
ORTE_DECLSPEC void orte_sensor_base_log(char *comp, opal_buffer_t *data);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
@ -1,37 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_ompidata_DATA = help-orte-sensor-coretemp.txt
|
||||
|
||||
sources = \
|
||||
sensor_coretemp.c \
|
||||
sensor_coretemp.h \
|
||||
sensor_coretemp_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_coretemp_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_coretemp.la
|
||||
else
|
||||
component_noinst = libmca_sensor_coretemp.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_coretemp_la_SOURCES = $(sources)
|
||||
mca_sensor_coretemp_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_coretemp_la_SOURCES =$(sources)
|
||||
libmca_sensor_coretemp_la_LDFLAGS = -module -avoid-version
|
@ -1,30 +0,0 @@
|
||||
dnl -*- shell-script -*-
|
||||
dnl
|
||||
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
dnl $COPYRIGHT$
|
||||
dnl
|
||||
dnl Additional copyrights may follow
|
||||
dnl
|
||||
dnl $HEADER$
|
||||
dnl
|
||||
|
||||
# MCA_orte_sensor_coretemp_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Runs action-if-found only when --with-coretemp was given and the
# build host is Linux.  Requesting coretemp on any other system aborts
# configure.  Without the option, action-if-not-found is run.
AC_DEFUN([MCA_orte_sensor_coretemp_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/coretemp/Makefile])

    AC_ARG_WITH([coretemp],
                [AS_HELP_STRING([--with-coretemp],
                                [Build coretemp support (default: no)])],
                [], [with_coretemp=no])

    # do not build if support not requested
    AS_IF([test "$with_coretemp" != "no"],
          [AS_IF([test "$opal_found_linux" = "yes"],
                 [$1],
                 [AC_MSG_WARN([Core temperature sensing was requested but is only supported on Linux systems])
                  AC_MSG_ERROR([Cannot continue])])],
          [$2])
])dnl
|
@ -1,33 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file
|
||||
#
|
||||
[req-dir-not-found]
|
||||
Core temperature monitoring was requested, but this node
|
||||
lacks the required directory:
|
||||
|
||||
Node: %s
|
||||
Directory: %s
|
||||
|
||||
This usually indicates that the "coretemp" kernel module
|
||||
has not been loaded. Operation will continue, but core
|
||||
temperatures will not be monitored.
|
||||
#
|
||||
[no-cores-found]
|
||||
Core temperature monitoring was requested, but this node
|
||||
does not appear to have the required core-level files, or
|
||||
you lack authority to access them:
|
||||
|
||||
Node: %s
|
||||
|
||||
This usually indicates that the "coretemp" kernel module
|
||||
has not been loaded. Operation will continue, but core
|
||||
temperatures will not be monitored.
|
@ -1,453 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#ifdef HAVE_DIRENT_H
|
||||
#include <dirent.h>
|
||||
#endif /* HAVE_DIRENT_H */
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/os_dirpath.h"
|
||||
#include "opal/mca/db/db.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_coretemp.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
static void coretemp_sample(void);
|
||||
static void coretemp_log(opal_buffer_t *buf);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_coretemp_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop,
|
||||
coretemp_sample,
|
||||
coretemp_log
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *file;
|
||||
int socket;
|
||||
char *label;
|
||||
float critical_temp;
|
||||
float max_temp;
|
||||
} coretemp_tracker_t;
|
||||
static void ctr_con(coretemp_tracker_t *trk)
|
||||
{
|
||||
trk->file = NULL;
|
||||
trk->label = NULL;
|
||||
}
|
||||
static void ctr_des(coretemp_tracker_t *trk)
|
||||
{
|
||||
if (NULL != trk->file) {
|
||||
free(trk->file);
|
||||
}
|
||||
if (NULL != trk->label) {
|
||||
free(trk->label);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(coretemp_tracker_t,
|
||||
opal_list_item_t,
|
||||
ctr_con, ctr_des);
|
||||
|
||||
static bool log_enabled = true;
|
||||
static opal_list_t tracking;
|
||||
|
||||
/*
 * Read one line from fp and return it as a newly allocated string with
 * the trailing newline (if any) removed.
 *
 * Returns NULL when fp is NULL, on end-of-file or read error, or when
 * memory cannot be allocated.  The caller owns the result and must
 * free() it.  Lines longer than the internal buffer are handed back in
 * pieces by successive calls.
 */
static char *orte_getline(FILE *fp)
{
    char input[1024];
    char *buff;
    size_t len;

    if (NULL == fp || NULL == fgets(input, sizeof(input), fp)) {
        return NULL;
    }

    /* Strip the newline only when one is present: the last line of a
     * file may lack it, and writing at strlen-1 unconditionally would
     * drop a real character or index before the buffer when the
     * string is empty. */
    len = strcspn(input, "\n");
    input[len] = '\0';

    buff = malloc(len + 1);
    if (NULL != buff) {
        memcpy(buff, input, len + 1);
    }
    return buff;
}
|
||||
|
||||
/* FOR FUTURE: extend to read cooling device speeds in
|
||||
* current speed: /sys/class/thermal/cooling_deviceN/cur_state
|
||||
* max speed: /sys/class/thermal/cooling_deviceN/max_state
|
||||
* type: /sys/class/thermal/cooling_deviceN/type
|
||||
*/
|
||||
static int init(void)
|
||||
{
|
||||
DIR *cur_dirp = NULL, *tdir;
|
||||
struct dirent *dir_entry, *entry;
|
||||
char *dirname, *filename, *ptr, *tmp;
|
||||
size_t tlen = strlen("temp");
|
||||
size_t ilen = strlen("_input");
|
||||
FILE *fp;
|
||||
coretemp_tracker_t *trk;
|
||||
int socket;
|
||||
|
||||
/* always construct this so we don't segfault in finalize */
|
||||
OBJ_CONSTRUCT(&tracking, opal_list_t);
|
||||
|
||||
/*
|
||||
* Open up the base directory so we can get a listing
|
||||
*/
|
||||
if (NULL == (cur_dirp = opendir("/sys/bus/platform/devices"))) {
|
||||
OBJ_DESTRUCT(&tracking);
|
||||
orte_show_help("help-orte-sensor-coretemp.txt", "req-dir-not-found",
|
||||
true, orte_process_info.nodename,
|
||||
"/sys/bus/platform/devices");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
* For each directory
|
||||
*/
|
||||
socket = 0;
|
||||
while (NULL != (dir_entry = readdir(cur_dirp))) {
|
||||
|
||||
/* look for coretemp directories */
|
||||
if (0 != strncmp(dir_entry->d_name, "coretemp", strlen("coretemp"))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* open that directory */
|
||||
dirname = opal_os_path(false, "/sys/bus/platform/devices", dir_entry->d_name, NULL );
|
||||
if (NULL == (tdir = opendir(dirname))) {
|
||||
continue;
|
||||
}
|
||||
while (NULL != (entry = readdir(tdir))) {
|
||||
/*
|
||||
* Skip the obvious
|
||||
*/
|
||||
if (0 == strncmp(entry->d_name, ".", strlen(".")) ||
|
||||
0 == strncmp(entry->d_name, "..", strlen(".."))) {
|
||||
continue;
|
||||
}
|
||||
if (strlen(entry->d_name) < (tlen+ilen)) {
|
||||
/* cannot be a core temp file */
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* See if this is a core temp file
|
||||
*/
|
||||
if (0 != strncmp(entry->d_name, "temp", strlen("temp"))) {
|
||||
continue;
|
||||
}
|
||||
if (0 != strcmp(entry->d_name + strlen(entry->d_name) - ilen, "_input")) {
|
||||
continue;
|
||||
}
|
||||
/* track the info for this core */
|
||||
trk = OBJ_NEW(coretemp_tracker_t);
|
||||
trk->socket = socket;
|
||||
trk->file = opal_os_path(false, dirname, entry->d_name, NULL);
|
||||
/* take the part up to the first underscore as this will
|
||||
* be used as the start of all the related files
|
||||
*/
|
||||
tmp = strdup(entry->d_name);
|
||||
if (NULL == (ptr = strchr(tmp, '_'))) {
|
||||
/* unrecognized format */
|
||||
free(tmp);
|
||||
OBJ_RELEASE(trk);
|
||||
continue;
|
||||
}
|
||||
*ptr = '\0';
|
||||
/* look for critical, max, and label info */
|
||||
asprintf(&filename, "%s/%s_%s", dirname, tmp, "label");
|
||||
fp = fopen(filename, "r");
|
||||
trk->label = orte_getline(fp);
|
||||
fclose(fp);
|
||||
free(filename);
|
||||
|
||||
asprintf(&filename, "%s/%s_%s", dirname, tmp, "crit");
|
||||
fp = fopen(filename, "r");
|
||||
ptr = orte_getline(fp);
|
||||
fclose(fp);
|
||||
trk->critical_temp = strtol(ptr, NULL, 10)/100.0;
|
||||
free(ptr);
|
||||
free(filename);
|
||||
|
||||
asprintf(&filename, "%s/%s_%s", dirname, tmp, "max");
|
||||
fp = fopen(filename, "r");
|
||||
ptr = orte_getline(fp);
|
||||
fclose(fp);
|
||||
trk->max_temp = strtol(ptr, NULL, 10)/100.0;
|
||||
free(ptr);
|
||||
free(filename);
|
||||
|
||||
/* add to our list */
|
||||
opal_list_append(&tracking, &trk->super);
|
||||
/* cleanup */
|
||||
free(tmp);
|
||||
}
|
||||
closedir(tdir);
|
||||
socket++;
|
||||
}
|
||||
closedir(cur_dirp);
|
||||
|
||||
if (0 == opal_list_get_size(&tracking)) {
|
||||
/* nothing to read */
|
||||
orte_show_help("help-orte-sensor-coretemp.txt", "no-cores-found",
|
||||
true, orte_process_info.nodename);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
OPAL_LIST_DESTRUCT(&tracking);
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local temps
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
{
    /* nothing to set up for an individual job */
    (void)jobid;
}
|
||||
|
||||
|
||||
static void stop(orte_jobid_t jobid)
{
    /* nothing to tear down for an individual job */
    (void)jobid;
}
|
||||
|
||||
static void coretemp_sample(void)
|
||||
{
|
||||
int ret;
|
||||
coretemp_tracker_t *trk, *nxt;
|
||||
FILE *fp;
|
||||
char *temp;
|
||||
float degc;
|
||||
opal_buffer_t data, *bptr;
|
||||
int32_t ncores;
|
||||
time_t now;
|
||||
char time_str[40];
|
||||
char *timestamp_str;
|
||||
bool packed;
|
||||
|
||||
if (0 == opal_list_get_size(&tracking)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* prep to store the results */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
packed = false;
|
||||
|
||||
/* pack our name */
|
||||
temp = strdup("coretemp");
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(temp);
|
||||
|
||||
/* store our hostname */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* store the number of cores */
|
||||
ncores = (int32_t)opal_list_get_size(&tracking);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &ncores, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the sample time */
|
||||
now = time(NULL);
|
||||
/* pass the time along as a simple string */
|
||||
strftime(time_str, sizeof(time_str), "%F %T%z", localtime(&now));
|
||||
asprintf(×tamp_str, "%s", time_str);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, ×tamp_str, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
free(timestamp_str);
|
||||
return;
|
||||
}
|
||||
free(timestamp_str);
|
||||
|
||||
OPAL_LIST_FOREACH_SAFE(trk, nxt, &tracking, coretemp_tracker_t) {
|
||||
/* read the temp */
|
||||
if (NULL == (fp = fopen(trk->file, "r"))) {
|
||||
/* we can't be read, so remove it from the list */
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s access denied to coretemp file %s - removing it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->file);
|
||||
opal_list_remove_item(&tracking, &trk->super);
|
||||
OBJ_RELEASE(trk);
|
||||
continue;
|
||||
}
|
||||
while (NULL != (temp = orte_getline(fp))) {
|
||||
degc = strtoul(temp, NULL, 10) / 100.0;
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:coretemp: Socket %d %s temp %f max %f critical %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->socket, trk->label, degc, trk->max_temp, trk->critical_temp);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, °c, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
free(temp);
|
||||
return;
|
||||
}
|
||||
free(temp);
|
||||
packed = true;
|
||||
/* check for exceed critical temp */
|
||||
if (trk->critical_temp < degc) {
|
||||
/* alert the errmgr - this is a critical problem */
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:coretemp: Socket %d %s CRITICAL",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->socket, trk->label);
|
||||
} else if (trk->max_temp < degc) {
|
||||
/* alert the errmgr */
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:coretemp: Socket %d %s MAX",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->socket, trk->label);
|
||||
}
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* xfer the data for transmission */
|
||||
if (packed) {
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
}
|
||||
|
||||
/*
 * Unpack one coretemp sample (hostname, core count, timestamp string,
 * then one float per core) and hand it to opal_db.add_log as an array
 * of key/value pairs: "ctime", "hostname", "core0", "core1", ...
 *
 * Does nothing once logging has been disabled; logging is disabled the
 * first time add_log reports a failure.
 */
static void coretemp_log(opal_buffer_t *sample)
{
    char *hostname = NULL;
    char *sampletime = NULL;
    opal_value_t *kv = NULL;
    int32_t n, ncores = 0;
    int nvals = 0;
    int rc, i;
    float fval;

    if (!log_enabled) {
        return;
    }

    /* unpack the host this came from */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &hostname, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* and the number of cores on that host */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &ncores, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* sample time */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &sampletime, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s Received log from host %s with %d cores",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == hostname) ? "NULL" : hostname, ncores);

    /* the count comes off the wire: refuse values that cannot size the
     * array, and strings that cannot be copied */
    if (NULL == hostname || NULL == sampletime ||
        0 > ncores || ncores > INT32_MAX - 2) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        goto cleanup;
    }

    /* xfr to storage */
    kv = calloc((size_t)ncores + 2, sizeof(*kv));
    if (NULL == kv) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        goto cleanup;
    }
    nvals = ncores + 2;

    /* construct everything up front so cleanup can destruct every
     * element no matter where we bail out */
    for (i = 0; i < nvals; i++) {
        OBJ_CONSTRUCT(&kv[i], opal_value_t);
    }

    /* load the sample time at the start */
    kv[0].key = strdup("ctime");
    kv[0].type = OPAL_STRING;
    kv[0].data.string = strdup(sampletime);

    /* load the hostname */
    kv[1].key = strdup("hostname");
    kv[1].type = OPAL_STRING;
    kv[1].data.string = strdup(hostname);

    for (i = 0; i < ncores; i++) {
        asprintf(&kv[i+2].key, "core%d", i);
        kv[i+2].type = OPAL_FLOAT;
        n = 1;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        kv[i+2].data.fval = fval;
    }

    /* store it */
    if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, nvals))) {
        /* don't bark about it - just quietly disable the log */
        log_enabled = false;
    }

cleanup:
    /* cleanup the xfr storage, including the array itself */
    if (NULL != kv) {
        for (i = 0; i < nvals; i++) {
            OBJ_DESTRUCT(&kv[i]);
        }
        free(kv);
    }
    free(sampletime);
    free(hostname);
}
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* CORETEMP resource manager sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_CORETEMP_H
|
||||
#define ORTE_SENSOR_CORETEMP_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
orte_sensor_base_component_t super;
|
||||
bool test;
|
||||
} orte_sensor_coretemp_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_coretemp_component_t mca_sensor_coretemp_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_coretemp_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,91 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_coretemp.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_coretemp_open(void);
|
||||
static int orte_sensor_coretemp_close(void);
|
||||
static int orte_sensor_coretemp_query(mca_base_module_t **module, int *priority);
|
||||
static int coretemp_component_register(void);
|
||||
|
||||
orte_sensor_coretemp_component_t mca_sensor_coretemp_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"coretemp", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_coretemp_open, /* component open */
|
||||
orte_sensor_coretemp_close, /* component close */
|
||||
orte_sensor_coretemp_query, /* component query */
|
||||
coretemp_component_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"coretemp" // data being sensed
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_coretemp_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Component query: always offer the coretemp module.  The module has to
 * be present to log results received from elsewhere even when this
 * process does no sampling of its own, so no availability check is
 * made here.
 */
static int orte_sensor_coretemp_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_coretemp_module;
    *priority = 50;  /* ahead of heartbeat */

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_coretemp_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int coretemp_component_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_coretemp_component.super.base_version;
|
||||
|
||||
mca_sensor_coretemp_component.test = false;
|
||||
(void) mca_base_component_var_register (c, "test",
|
||||
"Generate and pass test vector",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
& mca_sensor_coretemp_component.test);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,36 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_ompidata_DATA = help-orte-sensor-file.txt
|
||||
|
||||
sources = \
|
||||
sensor_file.c \
|
||||
sensor_file.h \
|
||||
sensor_file_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_file_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_file.la
|
||||
else
|
||||
component_noinst = libmca_sensor_file.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_file_la_SOURCES = $(sources)
|
||||
mca_sensor_file_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_file_la_SOURCES =$(sources)
|
||||
libmca_sensor_file_la_LDFLAGS = -module -avoid-version
|
@ -1,23 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_file_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_file_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/file/Makefile])

    # compile this component only when sensors are wanted
    AS_IF([test "$orte_want_sensors" = "1"], [$1], [$2])
])dnl
|
||||
|
@ -1,18 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the file sensor
|
||||
#
|
||||
[file-stalled]
|
||||
A specified file is not changing, indicating a possibly stalled application:
|
||||
|
||||
File: %s
|
||||
Last size: %lu
|
||||
Last access: %sLast modification: %s
|
@ -1,353 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_file.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
static void file_sample(void);
|
||||
static void file_log(opal_buffer_t *sample);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_file_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop,
|
||||
file_sample,
|
||||
file_log
|
||||
};
|
||||
|
||||
/* define a tracking object */
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
char *file;
|
||||
int tick;
|
||||
bool check_size;
|
||||
bool check_access;
|
||||
bool check_mod;
|
||||
int32_t file_size;
|
||||
time_t last_access;
|
||||
time_t last_mod;
|
||||
int limit;
|
||||
} file_tracker_t;
|
||||
static void ft_constructor(file_tracker_t *ft)
|
||||
{
|
||||
ft->file = NULL;
|
||||
ft->tick = 0;
|
||||
ft->file_size = 0;
|
||||
ft->last_access = 0;
|
||||
ft->last_mod = 0;
|
||||
ft->limit = 0;
|
||||
}
|
||||
static void ft_destructor(file_tracker_t *ft)
|
||||
{
|
||||
if (NULL != ft->file) {
|
||||
free(ft->file);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(file_tracker_t,
|
||||
opal_list_item_t,
|
||||
ft_constructor, ft_destructor);
|
||||
|
||||
/* local globals */
|
||||
static opal_list_t jobs;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
OBJ_CONSTRUCT(&jobs, opal_list_t);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&jobs))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&jobs);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Search the environment of app for the first entry that begins with
 * pattern and contains an '='.
 *
 * Returns true when such an entry exists; if value is non-NULL it then
 * receives a strdup'd copy of the text after the first '=' (the caller
 * frees it).  Returns false when nothing matches or when app, its
 * environment, or pattern is NULL.
 */
static bool find_value(orte_app_context_t *app,
                       char *pattern, char **value)
{
    size_t plen;
    char *eq;
    int i;

    if (NULL == app || NULL == app->env || NULL == pattern) {
        return false;
    }

    plen = strlen(pattern);
    for (i = 0; NULL != app->env[i]; i++) {
        if (0 != strncmp(app->env[i], pattern, plen)) {
            continue;
        }
        eq = strchr(app->env[i], '=');
        if (NULL == eq) {
            /* malformed entry with no value - keep looking */
            continue;
        }
        if (NULL != value) {
            *value = strdup(eq + 1);
        }
        return true;
    }
    return false;
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local processes
|
||||
*/
|
||||
/*
 * Begin monitoring a file on behalf of the given job.
 *
 * The file name, the attributes to watch (size / access time /
 * modification time) and the stall limit are taken from the
 * OMPI_MCA_sensor_file_* entries in the environment of the job's first
 * app_context, falling back to the component-level defaults.  Nothing
 * is tracked when no file name is available.
 */
static void start(orte_jobid_t jobid)
{
    orte_job_t *jobdat;
    orte_app_context_t *app, *aptr;
    int i;
    char *filename = NULL;
    bool own_filename = true;
    file_tracker_t *ft;
    char *ptr;

    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s starting file monitoring for job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));

    /* get the local jobdat for this job */
    if (NULL == (jobdat = orte_get_job_data_object(jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }

    /* must be at least one app_context, so use the first one found */
    app = NULL;
    for (i = 0; i < jobdat->apps->size; i++) {
        if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, i))) {
            app = aptr;
            break;
        }
    }
    if (NULL == app) {
        /* got a problem */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }

    /* search the environ to get the filename */
    if (!find_value(app, "OMPI_MCA_sensor_file_filename", &filename)) {
        /* was a default file given */
        if (NULL == mca_sensor_file_component.file) {
            /* can't do anything without a file */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s sensor:file no file for job %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jobid)));
            return;
        }
        filename = mca_sensor_file_component.file;
        own_filename = false;
    }

    /* create the tracking object */
    ft = OBJ_NEW(file_tracker_t);
    ft->jobid = jobid;
    /* find_value already returned a private copy, so hand it to the
     * tracker instead of duplicating it again and leaking the first;
     * the component default must be copied because we do not own it */
    ft->file = own_filename ? filename : strdup(filename);

    /* nothing is checked unless the environment or a default asks for it */
    ft->check_size = false;
    ft->check_access = false;
    ft->check_mod = false;

    /* search the environ to see what we are checking */
    if (!find_value(app, "OMPI_MCA_sensor_file_check_size", &ptr)) {
        /* was a default value given */
        if (0 < mca_sensor_file_component.check_size) {
            ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size);
        }
    } else {
        ft->check_size = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
        free(ptr);
    }

    if (!find_value(app, "OMPI_MCA_sensor_file_check_access", &ptr)) {
        /* was a default value given */
        if (0 < mca_sensor_file_component.check_access) {
            ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access);
        }
    } else {
        ft->check_access = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
        free(ptr);
    }

    if (!find_value(app, "OMPI_MCA_sensor_file_check_mod", &ptr)) {
        /* was a default value given */
        if (0 < mca_sensor_file_component.check_mod) {
            ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod);
        }
    } else {
        ft->check_mod = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
        free(ptr);
    }

    if (!find_value(app, "OMPI_MCA_sensor_file_limit", &ptr)) {
        ft->limit = mca_sensor_file_component.limit;
    } else {
        ft->limit = strtol(ptr, NULL, 10);
        free(ptr);
    }

    opal_list_append(&jobs, &ft->super);
    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s file %s monitored for %s%s%s with limit %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ft->file, ft->check_size ? "SIZE:" : " ",
                         ft->check_access ? "ACCESS TIME:" : " ",
                         ft->check_mod ? "MOD TIME" : " ", ft->limit));
    return;
}
|
||||
|
||||
|
||||
/*
 * Stop monitoring: remove and release every tracker that belongs to the
 * given job, or all trackers when jobid is ORTE_JOBID_WILDCARD.
 */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *item, *next;
    file_tracker_t *ft;

    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }

    item = opal_list_get_first(&jobs);
    while (item != opal_list_get_end(&jobs)) {
        /* fetch the successor first: once the item has been removed
         * and released it can no longer be used to advance */
        next = opal_list_get_next(item);
        ft = (file_tracker_t*)item;
        if (jobid == ft->jobid || ORTE_JOBID_WILDCARD == jobid) {
            opal_list_remove_item(&jobs, item);
            OBJ_RELEASE(item);
        }
        item = next;
    }
    return;
}
|
||||
|
||||
static void file_sample(void)
|
||||
{
|
||||
struct stat buf;
|
||||
opal_list_item_t *item;
|
||||
file_tracker_t *ft;
|
||||
orte_job_t *jdata;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sampling files",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
for (item = opal_list_get_first(&jobs);
|
||||
item != opal_list_get_end(&jobs);
|
||||
item = opal_list_get_next(item)) {
|
||||
ft = (file_tracker_t*)item;
|
||||
|
||||
/* stat the file and get its size */
|
||||
if (0 > stat(ft->file, &buf)) {
|
||||
/* cannot stat file */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s could not stat %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ft->file));
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s size %lu access %s\tmod %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(unsigned long)buf.st_size, ctime(&buf.st_atime), ctime(&buf.st_mtime)));
|
||||
|
||||
if (ft->check_size) {
|
||||
if (buf.st_size == ft->file_size) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->file_size = buf.st_size;
|
||||
}
|
||||
}
|
||||
if (ft->check_access) {
|
||||
if (buf.st_atime == ft->last_access) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->last_access = buf.st_atime;
|
||||
}
|
||||
}
|
||||
if (ft->check_mod) {
|
||||
if (buf.st_mtime == ft->last_mod) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->last_mod = buf.st_mtime;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK:
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sampled file %s tick %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ft->file, ft->tick));
|
||||
|
||||
if (ft->tick == ft->limit) {
|
||||
orte_show_help("help-orte-sensor-file.txt", "file-stalled", true,
|
||||
ft->file, ft->file_size, ctime(&ft->last_access), ctime(&ft->last_mod));
|
||||
jdata = orte_get_job_data_object(ft->jobid);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Logging hook of the file sensor module: intentionally does nothing. */
static void file_log(opal_buffer_t *sample)
{
    (void)sample;  /* nothing is logged by this sensor */
}
|
@ -1,41 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* File movement sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_FILE_H
|
||||
#define ORTE_SENSOR_FILE_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_file_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
int sample_rate;
|
||||
char *file;
|
||||
bool check_size;
|
||||
bool check_access;
|
||||
bool check_mod;
|
||||
int limit;
|
||||
};
|
||||
typedef struct orte_sensor_file_component_t orte_sensor_file_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_file_component_t mca_sensor_file_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_file_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,119 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_file.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_sensor_file_register (void);
|
||||
static int orte_sensor_file_open(void);
|
||||
static int orte_sensor_file_close(void);
|
||||
static int orte_sensor_file_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_file_component_t mca_sensor_file_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"file", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_file_open, /* component open */
|
||||
orte_sensor_file_close, /* component close */
|
||||
orte_sensor_file_query, /* component query */
|
||||
orte_sensor_file_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"filemods" // data being sensed
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component register/open/close/init function
|
||||
*/
|
||||
static int orte_sensor_file_register (void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
|
||||
|
||||
/* lookup parameters */
|
||||
mca_sensor_file_component.file = NULL;
|
||||
(void) mca_base_component_var_register (c, "filename", "File to be monitored",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.file);
|
||||
|
||||
mca_sensor_file_component.check_size = false;
|
||||
(void) mca_base_component_var_register (c, "check_size", "Check the file size",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.check_size);
|
||||
|
||||
mca_sensor_file_component.check_access = false;
|
||||
(void) mca_base_component_var_register (c, "check_access", "Check access time",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.check_access);
|
||||
|
||||
mca_sensor_file_component.check_mod = false;
|
||||
(void) mca_base_component_var_register (c, "check_mod", "Check modification time",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.check_mod);
|
||||
|
||||
mca_sensor_file_component.limit = 3;
|
||||
(void) mca_base_component_var_register (c, "limit",
|
||||
"Number of times the sensor can detect no motion before declaring error (default=3)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.limit);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_sensor_file_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* Component query hook: always offers the file sensor module, with
 * priority 20 (higher than heartbeat). */
static int orte_sensor_file_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_file_module;
    *priority = 20;

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_file_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_ompidata_DATA = help-orte-sensor-freq.txt
|
||||
|
||||
sources = \
|
||||
sensor_freq.c \
|
||||
sensor_freq.h \
|
||||
sensor_freq_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_freq_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_freq.la
|
||||
else
|
||||
component_noinst = libmca_sensor_freq.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_freq_la_SOURCES = $(sources)
|
||||
mca_sensor_freq_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_freq_la_SOURCES =$(sources)
|
||||
libmca_sensor_freq_la_LDFLAGS = -module -avoid-version
|
@ -1,30 +0,0 @@
|
||||
dnl -*- shell-script -*-
|
||||
dnl
|
||||
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
dnl $COPYRIGHT$
|
||||
dnl
|
||||
dnl Additional copyrights may follow
|
||||
dnl
|
||||
dnl $HEADER$
|
||||
dnl
|
||||
|
||||
# MCA_orte_sensor_freq_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Build the freq sensor only when --with-freq was given.  Requesting it on
# a system that was not detected as Linux is a configure-time error.
AC_DEFUN([MCA_orte_sensor_freq_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/freq/Makefile])

    AC_ARG_WITH([freq],
                [AS_HELP_STRING([--with-freq],
                                [Build freq support (default: no)])],
                [], [with_freq=no])

    # do not build if support not requested
    AS_IF([test "$with_freq" != "no"],
          [AS_IF([test "$opal_found_linux" = "yes"],
                 [$1],
                 [AC_MSG_WARN([Core frequency sensing was requested but is only supported on Linux systems])
                  AC_MSG_ERROR([Cannot continue])])
          ],
          [$2])
])dnl
|
@ -1,29 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the frequency sensor
|
||||
#
|
||||
[req-dir-not-found]
|
||||
Frequency monitoring was requested, but this node
|
||||
lacks the required directory:
|
||||
|
||||
Node: %s
|
||||
Directory: %s
|
||||
|
||||
Operation will continue, but frequencies will not be monitored.
|
||||
#
|
||||
[no-cores-found]
|
||||
Frequency monitoring was requested, but this node
|
||||
does not appear to have the required core-level files, or
|
||||
you lack authority to access them:
|
||||
|
||||
Node: %s
|
||||
|
||||
Operation will continue, but frequencies will not be monitored.
|
@ -1,412 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#ifdef HAVE_DIRENT_H
|
||||
#include <dirent.h>
|
||||
#endif /* HAVE_DIRENT_H */
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/os_dirpath.h"
|
||||
#include "opal/mca/db/db.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_freq.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
static void freq_sample(void);
|
||||
static void freq_log(opal_buffer_t *buf);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_freq_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop,
|
||||
freq_sample,
|
||||
freq_log
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *file;
|
||||
int core;
|
||||
float max_freq;
|
||||
float min_freq;
|
||||
} corefreq_tracker_t;
|
||||
static void ctr_con(corefreq_tracker_t *trk)
|
||||
{
|
||||
trk->file = NULL;
|
||||
}
|
||||
static void ctr_des(corefreq_tracker_t *trk)
|
||||
{
|
||||
if (NULL != trk->file) {
|
||||
free(trk->file);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(corefreq_tracker_t,
|
||||
opal_list_item_t,
|
||||
ctr_con, ctr_des);
|
||||
|
||||
static bool log_enabled = true;
|
||||
static opal_list_t tracking;
|
||||
|
||||
/*
 * Read one line (at most 1023 characters) from fp.
 *
 * Returns a newly allocated copy of the line with its trailing newline,
 * if any, removed; the caller must free() it.  Returns NULL when fp is
 * NULL, at end of file, or on a read error.
 */
static char *orte_getline(FILE *fp)
{
    char input[1024];
    size_t len;

    if (NULL == fp || NULL == fgets(input, sizeof(input), fp)) {
        return NULL;
    }

    /* strip the newline only if there is one: the last line of a file, or
     * a line longer than the buffer, does not end in '\n' and must not
     * lose its final character */
    len = strlen(input);
    if (0 < len && '\n' == input[len - 1]) {
        input[len - 1] = '\0';
    }

    return strdup(input);
}
|
||||
|
||||
/* FOR FUTURE: extend to read cooling device speeds in
|
||||
* current speed: /sys/class/thermal/cooling_deviceN/cur_state
|
||||
* max speed: /sys/class/thermal/cooling_deviceN/max_state
|
||||
* type: /sys/class/thermal/cooling_deviceN/type
|
||||
*/
|
||||
static int init(void)
|
||||
{
|
||||
int k;
|
||||
DIR *cur_dirp = NULL;
|
||||
struct dirent *entry;
|
||||
char *filename, *tmp;
|
||||
FILE *fp;
|
||||
corefreq_tracker_t *trk;
|
||||
|
||||
/* always construct this so we don't segfault in finalize */
|
||||
OBJ_CONSTRUCT(&tracking, opal_list_t);
|
||||
|
||||
/*
|
||||
* Open up the base directory so we can get a listing
|
||||
*/
|
||||
if (NULL == (cur_dirp = opendir("/sys/devices/system/cpu"))) {
|
||||
OBJ_DESTRUCT(&tracking);
|
||||
orte_show_help("help-orte-sensor-freq.txt", "req-dir-not-found",
|
||||
true, orte_process_info.nodename,
|
||||
"/sys/devices/system/cpu");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
* For each directory
|
||||
*/
|
||||
while (NULL != (entry = readdir(cur_dirp))) {
|
||||
|
||||
/*
|
||||
* Skip the obvious
|
||||
*/
|
||||
if (0 == strncmp(entry->d_name, ".", strlen(".")) ||
|
||||
0 == strncmp(entry->d_name, "..", strlen(".."))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* look for cpu directories */
|
||||
if (0 != strncmp(entry->d_name, "cpu", strlen("cpu"))) {
|
||||
/* cannot be a cpu directory */
|
||||
continue;
|
||||
}
|
||||
/* if it ends in other than a digit, then it isn't a cpu directory */
|
||||
if (!isdigit(entry->d_name[strlen(entry->d_name)-1])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* track the info for this core */
|
||||
trk = OBJ_NEW(corefreq_tracker_t);
|
||||
/* trailing digits are the core id */
|
||||
for (k=strlen(entry->d_name)-1; 0 <= k; k--) {
|
||||
if (!isdigit(entry->d_name[k])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
trk->core = strtoul(&entry->d_name[k], NULL, 10);
|
||||
trk->file = opal_os_path(false, "/sys/devices/system/cpu", entry->d_name, "cpufreq", "cpuinfo_cur_freq", NULL);
|
||||
|
||||
/* read the static info */
|
||||
filename = opal_os_path(false, "/sys/devices/system/cpu", entry->d_name, "cpufreq", "cpuinfo_max_freq", NULL);
|
||||
fp = fopen(filename, "r");
|
||||
tmp = orte_getline(fp);
|
||||
fclose(fp);
|
||||
trk->max_freq = strtoul(tmp, NULL, 10) / 1000000.0;
|
||||
free(filename);
|
||||
|
||||
filename = opal_os_path(false, "/sys/devices/system/cpu", entry->d_name, "cpufreq", "cpuinfo_min_freq", NULL);
|
||||
fp = fopen(filename, "r");
|
||||
tmp = orte_getline(fp);
|
||||
fclose(fp);
|
||||
trk->min_freq = strtoul(tmp, NULL, 10) / 1000000.0;
|
||||
free(filename);
|
||||
|
||||
/* add to our list */
|
||||
opal_list_append(&tracking, &trk->super);
|
||||
/* cleanup */
|
||||
free(tmp);
|
||||
}
|
||||
closedir(cur_dirp);
|
||||
|
||||
if (0 == opal_list_get_size(&tracking)) {
|
||||
/* nothing to read */
|
||||
orte_show_help("help-orte-sensor-freq.txt", "no-cores-found",
|
||||
true, orte_process_info.nodename);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
OPAL_LIST_DESTRUCT(&tracking);
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local temps
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/* Stop hook of the freq sensor: a no-op, the jobid is ignored. */
static void stop(orte_jobid_t jobid)
{
    (void)jobid;
}
|
||||
|
||||
static void freq_sample(void)
|
||||
{
|
||||
int ret;
|
||||
corefreq_tracker_t *trk, *nxt;
|
||||
FILE *fp;
|
||||
char *freq;
|
||||
float ghz;
|
||||
opal_buffer_t data, *bptr;
|
||||
int32_t ncores;
|
||||
time_t now;
|
||||
char time_str[40];
|
||||
char *timestamp_str;
|
||||
bool packed;
|
||||
|
||||
if (0 == opal_list_get_size(&tracking)) {
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s sampling freq",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
/* prep to store the results */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
packed = false;
|
||||
|
||||
/* pack our name */
|
||||
freq = strdup("freq");
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &freq, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(freq);
|
||||
|
||||
/* store our hostname */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* store the number of cores */
|
||||
ncores = (int32_t)opal_list_get_size(&tracking);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &ncores, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the sample time */
|
||||
now = time(NULL);
|
||||
/* pass the time along as a simple string */
|
||||
strftime(time_str, sizeof(time_str), "%F %T%z", localtime(&now));
|
||||
asprintf(×tamp_str, "%s", time_str);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, ×tamp_str, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
free(timestamp_str);
|
||||
return;
|
||||
}
|
||||
free(timestamp_str);
|
||||
|
||||
OPAL_LIST_FOREACH_SAFE(trk, nxt, &tracking, corefreq_tracker_t) {
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s processing freq file %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->file);
|
||||
/* read the freq */
|
||||
if (NULL == (fp = fopen(trk->file, "r"))) {
|
||||
/* we can't be read, so remove it from the list */
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s access denied to freq file %s - removing it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->file);
|
||||
opal_list_remove_item(&tracking, &trk->super);
|
||||
OBJ_RELEASE(trk);
|
||||
continue;
|
||||
}
|
||||
while (NULL != (freq = orte_getline(fp))) {
|
||||
ghz = strtoul(freq, NULL, 10) / 1000000.0;
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:freq: Core %d freq %f max %f min %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->core, ghz, trk->max_freq, trk->min_freq);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &ghz, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
free(freq);
|
||||
return;
|
||||
}
|
||||
packed = true;
|
||||
free(freq);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* xfer the data for transmission */
|
||||
if (packed) {
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
}
|
||||
|
||||
/*
 * Log one received frequency sample.
 *
 * Unpacks, in order, the host name, the core count, the sample time and
 * one float per core from "sample", then hands them to opal_db.add_log()
 * under the name "freq" as key/value pairs: "ctime", "hostname" and
 * "core<i>" for each core.  If the store fails, logging is silently
 * disabled for all later calls.  Everything allocated here is released
 * before returning, on success and on every error path.
 */
static void freq_log(opal_buffer_t *sample)
{
    char *hostname=NULL;
    char *sampletime=NULL;
    int rc;
    int32_t n, ncores;
    opal_value_t *kv=NULL;
    float fval;
    int i;

    if (!log_enabled) {
        return;
    }

    /* unpack the host this came from */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &hostname, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* and the number of cores on that host */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &ncores, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* sample time */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &sampletime, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s Received freq log from host %s with %d cores",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == hostname) ? "NULL" : hostname, ncores);

    /* the count comes off the wire - refuse a negative value before it is
     * used to size the array */
    if (0 > ncores) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        goto cleanup;
    }

    /* xfr to storage */
    kv = malloc((ncores+2) * sizeof(opal_value_t));
    if (NULL == kv) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        goto cleanup;
    }

    /* construct every entry up front so cleanup can destruct them all,
     * no matter where we bail out */
    for (i=0; i < ncores+2; i++) {
        OBJ_CONSTRUCT(&kv[i], opal_value_t);
    }

    /* load the sample time at the start */
    kv[0].key = strdup("ctime");
    kv[0].type = OPAL_STRING;
    kv[0].data.string = (NULL == sampletime) ? NULL : strdup(sampletime);

    /* load the hostname */
    kv[1].key = strdup("hostname");
    kv[1].type = OPAL_STRING;
    kv[1].data.string = (NULL == hostname) ? NULL : strdup(hostname);

    for (i=0; i < ncores; i++) {
        if (0 > asprintf(&kv[i+2].key, "core%d", i)) {
            /* the key is indeterminate after a failed asprintf */
            kv[i+2].key = NULL;
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            goto cleanup;
        }
        kv[i+2].type = OPAL_FLOAT;
        n=1;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        kv[i+2].data.fval = fval;
    }

    /* store it */
    if (ORTE_SUCCESS != (rc = opal_db.add_log("freq", kv, ncores+2))) {
        /* don't bark about it - just quietly disable the log */
        log_enabled = false;
    }

 cleanup:
    /* cleanup the xfr storage, including the array itself */
    if (NULL != kv) {
        for (i=0; i < ncores+2; i++) {
            OBJ_DESTRUCT(&kv[i]);
        }
        free(kv);
    }
    free(sampletime);
    free(hostname);
}
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* FREQ resource manager sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_FREQ_H
|
||||
#define ORTE_SENSOR_FREQ_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
orte_sensor_base_component_t super;
|
||||
bool test;
|
||||
} orte_sensor_freq_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_freq_component_t mca_sensor_freq_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_freq_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,91 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_freq.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_freq_open(void);
|
||||
static int orte_sensor_freq_close(void);
|
||||
static int orte_sensor_freq_query(mca_base_module_t **module, int *priority);
|
||||
static int freq_component_register(void);
|
||||
|
||||
orte_sensor_freq_component_t mca_sensor_freq_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"freq", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_freq_open, /* component open */
|
||||
orte_sensor_freq_close, /* component close */
|
||||
orte_sensor_freq_query, /* component query */
|
||||
freq_component_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"freq" // data being sensed
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_freq_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Component query hook.
 *
 * If the component was built it always offers its module, even on nodes
 * that will not sample, because it must be present to log any results
 * that are received.  The priority of 50 puts it ahead of heartbeat.
 */
static int orte_sensor_freq_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_freq_module;
    *priority = 50;

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_freq_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int freq_component_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_freq_component.super.base_version;
|
||||
|
||||
mca_sensor_freq_component.test = false;
|
||||
(void) mca_base_component_var_register (c, "test",
|
||||
"Generate and pass test vector",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
& mca_sensor_freq_component.test);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = \
|
||||
sensor_ft_tester.c \
|
||||
sensor_ft_tester.h \
|
||||
sensor_ft_tester_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_ft_tester_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_ft_tester.la
|
||||
else
|
||||
component_noinst = libmca_sensor_ft_tester.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_ft_tester_la_SOURCES = $(sources)
|
||||
mca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_ft_tester_la_SOURCES =$(sources)
|
||||
libmca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
|
@ -1,23 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_ft_tester_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_ft_tester_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/sensor/ft_tester/Makefile])
|
||||
|
||||
# if we don't want sensors, don't compile
|
||||
# this component
|
||||
AS_IF([test "$orte_want_sensors" = "1"],
|
||||
[$1], [$2])
|
||||
])dnl
|
||||
|
@ -1,121 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_SIGNAL_H
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/alfg.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_ft_tester.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static void sample(void);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_ft_tester_module = {
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
sample,
|
||||
NULL
|
||||
};
|
||||
|
||||
static void sample(void)
|
||||
{
|
||||
float prob;
|
||||
orte_proc_t *child;
|
||||
int i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sample:ft_tester considering killing something",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* are we including ourselves? */
|
||||
if (ORTE_PROC_IS_DAEMON &&
|
||||
0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sample:ft_tester considering killing me!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* roll the dice */
|
||||
prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
|
||||
if (prob < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
||||
/* commit suicide */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sample:ft_tester committing suicide",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_errmgr.abort(1, NULL);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 < mca_sensor_ft_tester_component.fail_prob) {
|
||||
/* see if we should kill a child */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!child->alive || 0 == child->pid ||
|
||||
ORTE_PROC_STATE_UNTERMINATED < child->state) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name),
|
||||
child->alive ? "TRUE" : "FALSE",
|
||||
(unsigned long)child->pid, orte_proc_state_to_str(child->state)));
|
||||
continue;
|
||||
}
|
||||
/* roll the dice */
|
||||
prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sample:ft_tester child: %s dice: %f prob %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name),
|
||||
prob, mca_sensor_ft_tester_component.fail_prob));
|
||||
if (prob < mca_sensor_ft_tester_component.fail_prob) {
|
||||
/* you shall die... */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sample:ft_tester killing %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill(child->pid, SIGTERM);
|
||||
/* are we allowing multiple deaths */
|
||||
if (!mca_sensor_ft_tester_component.multi_fail) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,40 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Fault-tolerance tester sensor: randomly kills local child processes
* (or the daemon itself) according to the configured probabilities
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_FT_TESTER_H
|
||||
#define ORTE_SENSOR_FT_TESTER_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "opal/util/alfg.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_ft_tester_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
float fail_prob;
|
||||
float daemon_fail_prob;
|
||||
bool multi_fail;
|
||||
};
|
||||
typedef struct orte_sensor_ft_tester_component_t orte_sensor_ft_tester_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_ft_tester_module;
|
||||
|
||||
extern opal_rng_buff_t orte_sensor_ft_rng_buff;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,140 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_ft_tester.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_sensor_ft_tester_register (void);
|
||||
static int orte_sensor_ft_tester_open(void);
|
||||
static int orte_sensor_ft_tester_close(void);
|
||||
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"ft_tester", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_ft_tester_open, /* component open */
|
||||
orte_sensor_ft_tester_close, /* component close */
|
||||
orte_sensor_ft_tester_query, /* component query */
|
||||
orte_sensor_ft_tester_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
NULL
|
||||
}
|
||||
};
|
||||
|
||||
static char *daemon_fail_prob = NULL;
|
||||
static char *fail_prob = NULL;
|
||||
opal_rng_buff_t orte_sensor_ft_rng_buff;
|
||||
|
||||
/**
|
||||
* component register/open/close/init function
|
||||
*/
|
||||
static int orte_sensor_ft_tester_register (void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_ft_tester_component.super.base_version;
|
||||
|
||||
fail_prob = NULL;
|
||||
(void) mca_base_component_var_register (c, "fail_prob", "Probability of killing a single executable",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&fail_prob);
|
||||
|
||||
mca_sensor_ft_tester_component.multi_fail = false;
|
||||
(void) mca_base_component_var_register (c, "multi_allowed", "Allow multiple executables to be killed at one time",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_sensor_ft_tester_component.multi_fail);
|
||||
|
||||
daemon_fail_prob = NULL;
|
||||
(void) mca_base_component_var_register (c, "daemon_fail_prob", "Probability of killing a daemon",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&daemon_fail_prob);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_sensor_ft_tester_open(void)
|
||||
{
|
||||
/* lookup parameters */
|
||||
if (NULL != fail_prob) {
|
||||
mca_sensor_ft_tester_component.fail_prob = strtof(fail_prob, NULL);
|
||||
if (1.0 < mca_sensor_ft_tester_component.fail_prob) {
|
||||
/* given in percent */
|
||||
mca_sensor_ft_tester_component.fail_prob /= 100.0;
|
||||
}
|
||||
} else {
|
||||
mca_sensor_ft_tester_component.fail_prob = 0.0;
|
||||
}
|
||||
|
||||
if (NULL != daemon_fail_prob) {
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob = strtof(daemon_fail_prob, NULL);
|
||||
if (1.0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
||||
/* given in percent */
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob /= 100.0;
|
||||
}
|
||||
} else {
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob = 0.0;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query: offer the ft_tester module only when at least one of the
 * two failure probabilities is positive.
 *
 * On selection, *priority is 1, *module points at the ft_tester module and
 * the component RNG is seeded with this process's pid; ORTE_SUCCESS is
 * returned.  Otherwise *priority is 0, *module is NULL and ORTE_ERROR is
 * returned.
 */
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority)
{
    /* written as negated "<" tests so a NaN probability behaves as before */
    if (!(0.0 < mca_sensor_ft_tester_component.fail_prob) &&
        !(0.0 < mca_sensor_ft_tester_component.daemon_fail_prob)) {
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* seed the RNG --- Not sure if we should assume all procs use
     * the same seed?
     */
    opal_srand(&orte_sensor_ft_rng_buff, (uint32_t) getpid());

    *module = (mca_base_module_t *)&orte_sensor_ft_tester_module;
    *priority = 1;  /* at the bottom */
    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_ft_tester_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Help file installed alongside the other data files
dist_ompidata_DATA = help-orte-sensor-heartbeat.txt

# Files that make up the heartbeat component
sources = \
        sensor_heartbeat.c \
        sensor_heartbeat.h \
        sensor_heartbeat_component.c

# The component is built in this directory either as an installable DSO,
# mca_<type>_<name>.la, or as a non-installed library for static builds,
# libmca_<type>_<name>.la.

if MCA_BUILD_orte_sensor_heartbeat_DSO
component_install = mca_sensor_heartbeat.la
component_noinst =
else
component_install =
component_noinst = libmca_sensor_heartbeat.la
endif

# DSO build
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_heartbeat_la_SOURCES = $(sources)
mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version

# static build
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_heartbeat_la_SOURCES = $(sources)
libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version
|
@ -1,23 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_heartbeat_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/heartbeat/Makefile])

    # build this component only when sensors were requested
    AS_IF([test "$orte_want_sensors" = "1"], [$1], [$2])
])dnl
|
||||
|
@ -1,20 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the memory usage sensor
|
||||
#
|
||||
[mem-limit-exceeded]
|
||||
A process has exceeded the specified limit on memory usage:
|
||||
|
||||
Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
@ -1,278 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_heartbeat.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void sample(void);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_heartbeat_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
NULL,
|
||||
sample,
|
||||
NULL
|
||||
};
|
||||
|
||||
/* declare the local functions */
|
||||
static void check_heartbeat(int fd, short event, void *arg);
|
||||
static void recv_beats(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata);
|
||||
|
||||
/* local globals */
|
||||
static orte_job_t *daemons=NULL;
|
||||
static opal_event_t check_ev;
|
||||
static bool check_active = false;
|
||||
static struct timeval check_time;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s initializing heartbeat recvs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* setup to receive heartbeats */
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_HEARTBEAT,
|
||||
ORTE_RML_PERSISTENT,
|
||||
recv_beats, NULL);
|
||||
}
|
||||
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT);
|
||||
if (check_active) {
|
||||
opal_event_del(&check_ev);
|
||||
check_active = false;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Arm the periodic heartbeat check.
 *
 * Does nothing when the check is already active or when no daemon job object
 * was found in init().  The check period is three times the base sensor
 * sample rate (whole seconds only).  The job argument is not used.
 */
static void start(orte_jobid_t job)
{
    if (check_active || NULL == daemons) {
        return;
    }

    /* setup the check event */
    check_time.tv_usec = 0;
    check_time.tv_sec = 3 * orte_sensor_base.rate.tv_sec;
    opal_event_evtimer_set(orte_event_base, &check_ev, check_heartbeat, &check_ev);
    opal_event_evtimer_add(&check_ev, &check_time);
    check_active = true;
}
|
||||
|
||||
static void sample(void)
|
||||
{
|
||||
opal_buffer_t *buf;
|
||||
int rc;
|
||||
orte_process_name_t *tgt;
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_IS_CM) {
|
||||
/* we send to our daemon */
|
||||
tgt = ORTE_PROC_MY_DAEMON;
|
||||
} else {
|
||||
tgt = ORTE_PROC_MY_HNP;
|
||||
}
|
||||
/* if my target hasn't been defined yet, ignore - nobody listening yet */
|
||||
if (ORTE_JOBID_INVALID ==tgt->jobid ||
|
||||
ORTE_VPID_INVALID == tgt->vpid) {
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:heartbeat: HNP is not defined",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sending heartbeat",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we want sampled data included, point to the bucket */
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
if (orte_sensor_base.log_samples) {
|
||||
opal_dss.copy_payload(buf, orte_sensor_base.samples);
|
||||
OBJ_RELEASE(orte_sensor_base.samples);
|
||||
/* start a new sample bucket */
|
||||
orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
|
||||
}
|
||||
|
||||
/* send heartbeat */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(tgt, buf,
|
||||
ORTE_RML_TAG_HEARTBEAT,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
}
|
||||
}
|
||||
|
||||
/* this function automatically gets periodically called
|
||||
* by the event library so we can check on the state
|
||||
* of the various orteds
|
||||
*/
|
||||
static void check_heartbeat(int fd, short dummy, void *arg)
|
||||
{
|
||||
int v;
|
||||
orte_proc_t *proc;
|
||||
opal_event_t *tmp = (opal_event_t*)arg;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:check_heartbeat",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
|
||||
"%s IGNORING CHECK abnorm_term %s fin %s init %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_abnormal_term_ordered ? "TRUE" : "FALSE",
|
||||
orte_finalizing ? "TRUE" : "FALSE",
|
||||
orte_initialized ? "TRUE" : "FALSE"));
|
||||
check_active = false;
|
||||
return;
|
||||
}
|
||||
|
||||
for (v=0; v < daemons->procs->size; v++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
|
||||
continue;
|
||||
}
|
||||
/* ignore myself */
|
||||
if (proc->name.vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
continue;
|
||||
}
|
||||
if (ORTE_PROC_STATE_RUNNING != proc->state) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:heartbeat DAEMON %s IS NOT RUNNING",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (0 == proc->beat) {
|
||||
/* no heartbeat recvd in last window */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:check_heartbeat FAILED for daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s HEARTBEAT DETECTED FOR %s: NUM BEATS %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), proc->beat));
|
||||
}
|
||||
/* reset for next period */
|
||||
proc->beat = 0;
|
||||
}
|
||||
|
||||
/* reset the timer */
|
||||
opal_event_evtimer_add(tmp, &check_time);
|
||||
}
|
||||
|
||||
static void recv_beats(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
{
|
||||
orte_proc_t *proc;
|
||||
int rc, n;
|
||||
char *component=NULL;
|
||||
opal_buffer_t *buf;
|
||||
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"%s received beat from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender));
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get this daemon's object */
|
||||
if (NULL != daemons) {
|
||||
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s marked beat from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
proc->beat++;
|
||||
/* if this daemon has reappeared, reset things */
|
||||
if (ORTE_PROC_STATE_HEARTBEAT_FAILED == proc->state) {
|
||||
proc->state = ORTE_PROC_STATE_RUNNING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* unload any sampled data */
|
||||
n=1;
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &buf, &n, OPAL_BUFFER))) {
|
||||
if (NULL != buf) {
|
||||
n=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &component, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
break;
|
||||
}
|
||||
orte_sensor_base_log(component, buf);
|
||||
OBJ_RELEASE(buf);
|
||||
free(component);
|
||||
n=1;
|
||||
}
|
||||
}
|
||||
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Heartbeat sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_HEARTBEAT_H
|
||||
#define ORTE_SENSOR_HEARTBEAT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_base_component_t mca_sensor_heartbeat_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_heartbeat_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,74 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_heartbeat.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_heartbeat_open(void);
|
||||
static int orte_sensor_heartbeat_close(void);
|
||||
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_base_component_t mca_sensor_heartbeat_component = {
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"heartbeat", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_heartbeat_open, /* component open */
|
||||
orte_sensor_heartbeat_close, /* component close */
|
||||
orte_sensor_heartbeat_query /* component query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"heartbeat"
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_heartbeat_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query: always offers the heartbeat module, at priority 5.
 * Returns ORTE_SUCCESS.
 */
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_heartbeat_module;
    /* lower than all other samplers so that their data gets included in heartbeat */
    *priority = 5;
    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_heartbeat_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,39 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_ompidata_DATA = help-orte-sensor-pwr.txt

sources = \
        sensor_pwr.h \
        sensor_pwr.c \
        sensor_pwr_component.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_orte_sensor_pwr_DSO
component_noinst =
component_install = mca_sensor_pwr.la
else
component_noinst = libmca_sensor_pwr.la
component_install =
endif

# The component calls pow(), so it needs libm.  Automake only honors
# <library>_LIBADD for libtool libraries; a <library>_LIBS variable is
# silently ignored and -lm would never reach the link line.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_pwr_la_SOURCES = $(sources)
mca_sensor_pwr_la_LDFLAGS = -module -avoid-version
mca_sensor_pwr_la_LIBADD = -lm

noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_pwr_la_SOURCES = $(sources)
libmca_sensor_pwr_la_LDFLAGS = -module -avoid-version
libmca_sensor_pwr_la_LIBADD = -lm
|
@ -1,29 +0,0 @@
|
||||
dnl -*- shell-script -*-
|
||||
dnl
|
||||
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
dnl $COPYRIGHT$
|
||||
dnl
|
||||
dnl Additional copyrights may follow
|
||||
dnl
|
||||
dnl $HEADER$
|
||||
dnl
|
||||
|
||||
# MCA_orte_sensor_pwr_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_pwr_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/pwr/Makefile])

    # AS_HELP_STRING replaces the obsolete AC_HELP_STRING
    AC_ARG_WITH([pwr],
                [AS_HELP_STRING([--with-pwr],
                                [Build pwr support (default: no)])],
                [], [with_pwr=no])

    # do not build if support not requested
    AS_IF([test "$with_pwr" != "no"],
          [AS_IF([test "$opal_found_linux" = "yes"],
                 [$1],
                 [AC_MSG_WARN([Core power sensing was requested but is only supported on Intel-based Linux systems])
                  AC_MSG_ERROR([Cannot continue])
                  $2])],
          [$2])
])dnl
|
@ -1,48 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
#
|
||||
[no-access]
|
||||
Power sensing was requested, but you lack access authority
|
||||
to the required path on this node:
|
||||
|
||||
Node: %s
|
||||
Path: %s
|
||||
|
||||
We will continue to operate, but will not monitor power.
|
||||
[no-sockets]
|
||||
Power sensing was requested, but your topology doesn't
|
||||
identify sockets and we are therefore unable to verify
|
||||
the processor type as supported.
|
||||
|
||||
We will continue to operate, but will not monitor power.
|
||||
[unsupported-model]
|
||||
Power sensing was requested, but your processor type
|
||||
is not currently supported.
|
||||
|
||||
Detected model: %d
|
||||
|
||||
We will continue to operate, but will not monitor power.
|
||||
[no-topo-info]
|
||||
Power sensing was requested, but the topology info
|
||||
required to verify processor-level support was not
|
||||
available. This usually means that your system lacks
|
||||
the required revision level for hwloc.
|
||||
|
||||
We will continue to operate, but will not monitor power.
|
||||
#
|
||||
[no-cores-found]
|
||||
Power monitoring was requested, but this node
|
||||
does not appear to have the required core-level files,
|
||||
or you lack access authority to them:
|
||||
|
||||
Node: %s
|
||||
|
||||
We will continue to operate, but will not monitor power.
|
@ -1,477 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#ifdef HAVE_DIRENT_H
|
||||
#include <dirent.h>
|
||||
#endif /* HAVE_DIRENT_H */
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
#include <math.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/db/db.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_pwr.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
static void pwr_sample(void);
|
||||
static void pwr_log(opal_buffer_t *buf);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_pwr_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop,
|
||||
pwr_sample,
|
||||
pwr_log
|
||||
};
|
||||
|
||||
#define MSR_RAPL_POWER_UNIT 0x606
|
||||
|
||||
/*
|
||||
* Platform specific RAPL bitmasks.
|
||||
*/
|
||||
#define MSR_PKG_POWER_INFO 0x614
|
||||
#define POWER_UNIT_OFFSET 0
|
||||
#define POWER_UNIT_MASK 0x0F
|
||||
|
||||
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *file;
|
||||
int core;
|
||||
double units;
|
||||
} corepwr_tracker_t;
|
||||
/* Constructor: a new tracker starts out without an msr file path */
static void ctr_con(corepwr_tracker_t *trk)
{
    trk->file = NULL;
}
|
||||
/* Destructor: give back the msr file path, if one was ever assigned */
static void ctr_des(corepwr_tracker_t *trk)
{
    /* free(NULL) is a no-op, so no explicit NULL test is required */
    free(trk->file);
}
|
||||
OBJ_CLASS_INSTANCE(corepwr_tracker_t,
|
||||
opal_list_item_t,
|
||||
ctr_con, ctr_des);
|
||||
|
||||
static bool log_enabled = true;
|
||||
static opal_list_t tracking;
|
||||
|
||||
static int read_msr(int fd, long long *value, int offset)
|
||||
{
|
||||
uint64_t data;
|
||||
|
||||
if (pread(fd, &data, sizeof data, offset) != sizeof(data)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
*value = (long long)data;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
static int check_cpu_type(void);
|
||||
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
int fd;
|
||||
DIR *cur_dirp = NULL;
|
||||
struct dirent *entry;
|
||||
corepwr_tracker_t *trk;
|
||||
long long units;
|
||||
|
||||
/* always construct this so we don't segfault in finalize */
|
||||
OBJ_CONSTRUCT(&tracking, opal_list_t);
|
||||
|
||||
/* we only handle certain cpu types as we have to know the binary
|
||||
* layout of the msr file
|
||||
*/
|
||||
if (ORTE_SUCCESS != check_cpu_type()) {
|
||||
/* we provided a show help down below */
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Open up the base directory so we can get a listing
|
||||
*/
|
||||
if (NULL == (cur_dirp = opendir("/dev/cpu"))) {
|
||||
OBJ_DESTRUCT(&tracking);
|
||||
orte_show_help("help-orte-sensor-pwr.txt", "no-access",
|
||||
true, orte_process_info.nodename,
|
||||
"/dev/cpu");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
* For each directory
|
||||
*/
|
||||
while (NULL != (entry = readdir(cur_dirp))) {
|
||||
|
||||
/*
|
||||
* Skip the obvious
|
||||
*/
|
||||
if (0 == strncmp(entry->d_name, ".", strlen(".")) ||
|
||||
0 == strncmp(entry->d_name, "..", strlen(".."))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* if it contains anything other than a digit, then it isn't a cpu directory */
|
||||
if (!isdigit(entry->d_name[strlen(entry->d_name)-1])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* track the info for this core */
|
||||
trk = OBJ_NEW(corepwr_tracker_t);
|
||||
trk->core = strtoul(entry->d_name, NULL, 10);
|
||||
trk->file = opal_os_path(false, "/dev/cpu", entry->d_name, "msr", NULL);
|
||||
|
||||
/* get the power units for this core */
|
||||
if (0 >= (fd = open(trk->file, O_RDONLY))) {
|
||||
/* can't access file */
|
||||
OBJ_RELEASE(trk);
|
||||
continue;
|
||||
}
|
||||
if (ORTE_SUCCESS != read_msr(fd, &units, MSR_RAPL_POWER_UNIT)) {
|
||||
/* can't read required info */
|
||||
OBJ_RELEASE(trk);
|
||||
continue;
|
||||
}
|
||||
trk->units = pow(0.5,(double)(units & POWER_UNIT_MASK));
|
||||
|
||||
/* add to our list */
|
||||
opal_list_append(&tracking, &trk->super);
|
||||
}
|
||||
closedir(cur_dirp);
|
||||
|
||||
if (0 == opal_list_get_size(&tracking)) {
|
||||
/* nothing to read */
|
||||
orte_show_help("help-orte-sensor-pwr.txt", "no-cores-found",
|
||||
true, orte_process_info.nodename);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
OPAL_LIST_DESTRUCT(&tracking);
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local temps
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/* Stop monitoring for a job: a no-op, the jobid is not used */
static void stop(orte_jobid_t jobid)
{
}
|
||||
|
||||
static void pwr_sample(void)
|
||||
{
|
||||
corepwr_tracker_t *trk, *nxt;
|
||||
opal_buffer_t data, *bptr;
|
||||
int32_t ncores;
|
||||
time_t now;
|
||||
char time_str[40];
|
||||
char *timestamp_str;
|
||||
long long value;
|
||||
int fd, ret;
|
||||
float power;
|
||||
char *temp;
|
||||
bool packed;
|
||||
|
||||
if (0 == opal_list_get_size(&tracking)) {
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s sampling power",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
/* prep to store the results */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
packed = false;
|
||||
|
||||
/* pack our name */
|
||||
temp = strdup("pwr");
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(temp);
|
||||
|
||||
/* store our hostname */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* store the number of cores */
|
||||
ncores = (int32_t)opal_list_get_size(&tracking);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &ncores, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the sample time */
|
||||
now = time(NULL);
|
||||
/* pass the time along as a simple string */
|
||||
strftime(time_str, sizeof(time_str), "%F %T%z", localtime(&now));
|
||||
asprintf(×tamp_str, "%s", time_str);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, ×tamp_str, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
free(timestamp_str);
|
||||
return;
|
||||
}
|
||||
free(timestamp_str);
|
||||
|
||||
OPAL_LIST_FOREACH_SAFE(trk, nxt, &tracking, corepwr_tracker_t) {
|
||||
if (0 >= (fd = open(trk->file, O_RDONLY))) {
|
||||
/* disable this one - cannot read the file */
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s access denied to pwr file %s - removing it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->file);
|
||||
opal_list_remove_item(&tracking, &trk->super);
|
||||
OBJ_RELEASE(trk);
|
||||
continue;
|
||||
}
|
||||
if (ORTE_SUCCESS != read_msr(fd, &value, MSR_PKG_POWER_INFO)) {
|
||||
/* disable this one - cannot read the file */
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s failed to read pwr file %s - removing it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->file);
|
||||
opal_list_remove_item(&tracking, &trk->super);
|
||||
OBJ_RELEASE(trk);
|
||||
close(fd);
|
||||
continue;
|
||||
}
|
||||
power = trk->units * (double)(value & 0x7fff);
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &power, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
close(fd);
|
||||
return;
|
||||
}
|
||||
packed = true;
|
||||
close(fd);
|
||||
}
|
||||
|
||||
/* xfer the data for transmission */
|
||||
if (packed) {
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
}
|
||||
|
||||
/*
 * Record one power sample in the database log.
 *
 * The sample buffer is unpacked in this order: hostname (string),
 * number of cores (int32), sample time (string), then one float
 * per core. The values are handed to opal_db.add_log() under the
 * "pwr" table as "ctime", "hostname" and "core<N>" entries.
 *
 * Nothing is done when logging has been disabled. If the database
 * rejects the entry, logging is silently disabled for later calls.
 *
 * @param sample  buffer holding the packed sample
 */
static void pwr_log(opal_buffer_t *sample)
{
    char *hostname = NULL;
    char *sampletime = NULL;
    int rc;
    int32_t n, ncores;
    opal_value_t *kv = NULL;
    float fval;
    int i, nvals = 0;

    if (!log_enabled) {
        return;
    }

    /* unpack the host this came from */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &hostname, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }
    /* and the number of cores on that host */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &ncores, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }

    /* sample time */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &sampletime, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }

    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s Received log from host %s with %d cores",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == hostname) ? "NULL" : hostname, ncores);

    /* a sample without strings or with a negative core count cannot
     * be stored - strdup(NULL) and a negative array size are both
     * undefined, so reject it here
     */
    if (NULL == hostname || NULL == sampletime || ncores < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        goto done;
    }

    /* xfr to storage - calloc guards the size multiplication */
    nvals = ncores + 2;
    kv = calloc((size_t)ncores + 2, sizeof(opal_value_t));
    if (NULL == kv) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        goto done;
    }

    /* construct every entry up front so that the cleanup loop is
     * safe no matter where we bail out
     */
    for (i = 0; i < nvals; i++) {
        OBJ_CONSTRUCT(&kv[i], opal_value_t);
    }

    /* load the sample time at the start */
    kv[0].key = strdup("ctime");
    kv[0].type = OPAL_STRING;
    kv[0].data.string = strdup(sampletime);

    /* load the hostname */
    kv[1].key = strdup("hostname");
    kv[1].type = OPAL_STRING;
    kv[1].data.string = strdup(hostname);

    /* one value per core */
    for (i = 0; i < ncores; i++) {
        asprintf(&kv[i+2].key, "core%d", i);
        kv[i+2].type = OPAL_FLOAT;
        n = 1;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        kv[i+2].data.fval = fval;
    }

    /* store it */
    if (ORTE_SUCCESS != (rc = opal_db.add_log("pwr", kv, nvals))) {
        /* don't bark about it - just quietly disable the log */
        log_enabled = false;
    }

 cleanup:
    /* cleanup the xfr storage, including the array itself */
    for (i = 0; i < nvals; i++) {
        OBJ_DESTRUCT(&kv[i]);
    }
    free(kv);

 done:
    /* the unpacked strings belong to us on every path */
    free(sampletime);
    free(hostname);
}
|
||||
|
||||
|
||||
/* list of supported chipsets */
|
||||
#define CPU_SANDYBRIDGE 42
|
||||
#define CPU_SANDYBRIDGE_EP 45
|
||||
#define CPU_IVYBRIDGE 58
|
||||
#define CPU_IVYBRIDGE_EP 62
|
||||
#define CPU_HASWELL 60
|
||||
|
||||
|
||||
/* go thru our topology and check the sockets
|
||||
* to see if they contain a match - at this time,
|
||||
* we don't support hetero sockets, so any mismatch
|
||||
* will disqualify us
|
||||
*/
|
||||
static int check_cpu_type(void)
|
||||
{
|
||||
hwloc_obj_t obj;
|
||||
unsigned k;
|
||||
|
||||
if (NULL == (obj = hwloc_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_SOCKET, 0))) {
|
||||
/* there are no sockets identified in this machine */
|
||||
orte_show_help("help-orte-sensor-pwr.txt", "no-sockets", true);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
while (NULL != obj) {
|
||||
for (k=0; k < obj->infos_count; k++) {
|
||||
if (0 == strcmp(obj->infos[k].name, "model") &&
|
||||
NULL != obj->infos[k].value) {
|
||||
mca_sensor_pwr_component.model = strtoul(obj->infos[k].value, NULL, 10);
|
||||
|
||||
switch (mca_sensor_pwr_component.model) {
|
||||
case CPU_SANDYBRIDGE:
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"sensor:pwr Found Sandybridge CPU");
|
||||
return ORTE_SUCCESS;
|
||||
case CPU_SANDYBRIDGE_EP:
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"sensor:pwr Found Sandybridge-EP CPU");
|
||||
return ORTE_SUCCESS;
|
||||
case CPU_IVYBRIDGE:
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"sensor:pwr Found Ivybridge CPU");
|
||||
return ORTE_SUCCESS;
|
||||
case CPU_IVYBRIDGE_EP:
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"sensor:pwr Found Ivybridge-EP CPU");
|
||||
return ORTE_SUCCESS;
|
||||
case CPU_HASWELL:
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"sensor:pwr Found Haswell CPU");
|
||||
return ORTE_SUCCESS;
|
||||
default:
|
||||
orte_show_help("help-orte-sensor-pwr.txt", "unsupported-model",
|
||||
true, mca_sensor_pwr_component.model);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
obj = obj->next_sibling;
|
||||
}
|
||||
orte_show_help("help-orte-sensor-pwr.txt", "no-topo-info",
|
||||
true, mca_sensor_pwr_component.model);
|
||||
return ORTE_ERROR;
|
||||
}
|
@ -1,36 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* PWR resource manager sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_PWR_H
|
||||
#define ORTE_SENSOR_PWR_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Component object for the pwr sensor: the generic sensor
 * component plus the state specific to this component.
 */
typedef struct {
|
||||
/* generic sensor component - must come first */
orte_sensor_base_component_t super;
|
||||
/* CPU model number, parsed from the hwloc socket "model" info */
int model;
|
||||
/* value of the "test" MCA variable ("Generate and pass test vector") */
bool test;
|
||||
} orte_sensor_pwr_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_pwr_component_t mca_sensor_pwr_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_pwr_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,88 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
#include "opal/util/os_dirpath.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "sensor_pwr.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_pwr_open(void);
|
||||
static int orte_sensor_pwr_close(void);
|
||||
static int orte_sensor_pwr_query(mca_base_module_t **module, int *priority);
|
||||
static int pwr_component_register(void);
|
||||
|
||||
/* The pwr sensor component instance: version information, the
 * component entry points defined in this file, the checkpoint
 * metadata and the name of the data this component senses.
 * The model and test members are not initialized here.
 */
orte_sensor_pwr_component_t mca_sensor_pwr_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"pwr", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_pwr_open, /* component open */
|
||||
orte_sensor_pwr_close, /* component close */
|
||||
orte_sensor_pwr_query, /* component query */
|
||||
pwr_component_register /* registers the "test" MCA variable */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"pwr" // data being sensed
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_pwr_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Component query hook: hand back the pwr module together with
 * its selection priority.
 *
 * @param module    receives a pointer to orte_sensor_pwr_module
 * @param priority  receives the priority (50)
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_pwr_query(mca_base_module_t **module, int *priority)
{
    mca_base_module_t *pwr_module = (mca_base_module_t *)&orte_sensor_pwr_module;

    *module = pwr_module;
    *priority = 50;  /* ahead of heartbeat */

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_pwr_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int pwr_component_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_pwr_component.super.base_version;
|
||||
|
||||
mca_sensor_pwr_component.test = false;
|
||||
(void) mca_base_component_var_register (c, "test",
|
||||
"Generate and pass test vector",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
& mca_sensor_pwr_component.test);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_ompidata_DATA = help-orte-sensor-resusage.txt
|
||||
|
||||
sources = \
|
||||
sensor_resusage.c \
|
||||
sensor_resusage.h \
|
||||
sensor_resusage_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_resusage_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_resusage.la
|
||||
else
|
||||
component_noinst = libmca_sensor_resusage.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_resusage_la_SOURCES = $(sources)
|
||||
mca_sensor_resusage_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_resusage_la_SOURCES =$(sources)
|
||||
libmca_sensor_resusage_la_LDFLAGS = -module -avoid-version
|
@ -1,23 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_resusage_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_resusage_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/sensor/resusage/Makefile])
|
||||
|
||||
# if we don't want sensors, don't compile
|
||||
# this component
|
||||
AS_IF([test "$orte_want_sensors" = "1"],
|
||||
[$1], [$2])
|
||||
])dnl
|
||||
|
@ -1,20 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the memory usage sensor
|
||||
#
|
||||
[mem-limit-exceeded]
|
||||
A process has exceeded the specified limit on memory usage:
|
||||
|
||||
Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
@ -1,477 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/class/opal_ring_buffer.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/pstat/pstat.h"
|
||||
#include "opal/mca/db/db.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_resusage.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void sample(void);
|
||||
static void res_log(opal_buffer_t *sample);
|
||||
|
||||
/* instantiate the module */
|
||||
/* Module function table for the resusage sensor. The entries are
 * positional; the two NULL slots are hooks this module does not
 * implement (presumably start/stop - confirm against the
 * definition of orte_sensor_base_module_t).
 */
orte_sensor_base_module_t orte_sensor_resusage_module = {
|
||||
init,
|
||||
finalize,
|
||||
NULL,
|
||||
NULL,
|
||||
sample,
|
||||
res_log
|
||||
};
|
||||
|
||||
static bool log_enabled = true;
|
||||
static orte_node_t *my_node;
|
||||
static orte_proc_t *my_proc;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* ensure my_proc and my_node are available on the global arrays */
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
my_proc = OBJ_NEW(orte_proc_t);
|
||||
my_node = OBJ_NEW(orte_node_t);
|
||||
} else {
|
||||
if (NULL == (my_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, ORTE_PROC_MY_NAME->vpid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (NULL == (my_node = my_proc->node)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
/* protect the objects */
|
||||
OBJ_RETAIN(my_proc);
|
||||
OBJ_RETAIN(my_node);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
if (NULL != my_proc) {
|
||||
OBJ_RELEASE(my_proc);
|
||||
}
|
||||
if (NULL != my_node) {
|
||||
OBJ_RELEASE(my_node);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static void sample(void)
|
||||
{
|
||||
opal_pstats_t *stats, *st;
|
||||
opal_node_stats_t *nstats, *nst;
|
||||
int rc, i;
|
||||
orte_proc_t *child, *hog=NULL;
|
||||
float in_use, max_mem;
|
||||
opal_buffer_t buf, *bptr;
|
||||
char *comp;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"sample:resusage sampling resource usage"));
|
||||
|
||||
/* setup a buffer for our stats */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
/* pack our name */
|
||||
comp = strdup("resusage");
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &comp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
free(comp);
|
||||
|
||||
/* update stats on ourself and the node */
|
||||
stats = OBJ_NEW(opal_pstats_t);
|
||||
nstats = OBJ_NEW(opal_node_stats_t);
|
||||
if (ORTE_SUCCESS != (rc = opal_pstat.query(orte_process_info.pid, stats, nstats))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(stats);
|
||||
OBJ_RELEASE(nstats);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
|
||||
/* the stats framework can't know nodename or rank */
|
||||
strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN);
|
||||
stats->rank = ORTE_PROC_MY_NAME->vpid;
|
||||
/* locally save the stats */
|
||||
if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&my_proc->stats, stats))) {
|
||||
OBJ_RELEASE(st);
|
||||
}
|
||||
if (NULL != (nst = (opal_node_stats_t*)opal_ring_buffer_push(&my_node->stats, nstats))) {
|
||||
/* release the popped value */
|
||||
OBJ_RELEASE(nst);
|
||||
}
|
||||
|
||||
/* pack them */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &nstats, 1, OPAL_NODE_STAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
|
||||
/* loop through our children and update their stats */
|
||||
if (NULL != orte_local_children) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!child->alive) {
|
||||
continue;
|
||||
}
|
||||
if (0 == child->pid) {
|
||||
/* race condition */
|
||||
continue;
|
||||
}
|
||||
stats = OBJ_NEW(opal_pstats_t);
|
||||
if (ORTE_SUCCESS != opal_pstat.query(child->pid, stats, NULL)) {
|
||||
/* may hit a race condition where the process has
|
||||
* terminated, so just ignore any error
|
||||
*/
|
||||
OBJ_RELEASE(stats);
|
||||
continue;
|
||||
}
|
||||
/* the stats framework can't know nodename or rank */
|
||||
strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN);
|
||||
stats->rank = child->name.vpid;
|
||||
/* store it */
|
||||
if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) {
|
||||
OBJ_RELEASE(st);
|
||||
}
|
||||
/* pack them */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* xfer any data for transmission */
|
||||
if (0 < buf.bytes_used) {
|
||||
bptr = &buf;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
|
||||
/* are there any issues with node-level usage? */
|
||||
nst = (opal_node_stats_t*)opal_ring_buffer_poke(&my_node->stats, -1);
|
||||
if (NULL != nst && 0.0 < mca_sensor_resusage_component.node_memory_limit) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s CHECKING NODE MEM",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* compute the percentage of node memory in-use */
|
||||
in_use = 1.0 - (nst->free_mem / nst->total_mem);
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s PERCENT USED: %f LIMIT: %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
in_use, mca_sensor_resusage_component.node_memory_limit));
|
||||
if (mca_sensor_resusage_component.node_memory_limit <= in_use) {
|
||||
/* loop through our children and find the biggest hog */
|
||||
hog = NULL;
|
||||
max_mem = 0.0;
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!child->alive) {
|
||||
continue;
|
||||
}
|
||||
if (0 == child->pid) {
|
||||
/* race condition */
|
||||
continue;
|
||||
}
|
||||
if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
|
||||
"%s PROC %s AT VSIZE %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name), st->vsize));
|
||||
if (max_mem < st->vsize) {
|
||||
hog = child;
|
||||
max_mem = st->vsize;
|
||||
}
|
||||
}
|
||||
if (NULL == hog) {
|
||||
/* if all children dead and we are still too big,
|
||||
* then we must be the culprit - abort
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s NO CHILD: COMMITTING SUICIDE",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_errmgr.abort(ORTE_ERR_MEM_LIMIT_EXCEEDED, NULL);
|
||||
} else {
|
||||
/* report the problem */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&hog->name)));
|
||||
ORTE_ACTIVATE_PROC_STATE(&hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
}
|
||||
/* since we have ordered someone to die, we've done enough for this
|
||||
* time around - don't check proc limits as well
|
||||
*/
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* check proc limits */
|
||||
if (0.0 < mca_sensor_resusage_component.proc_memory_limit) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s CHECKING PROC MEM",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* check my children first */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!child->alive) {
|
||||
continue;
|
||||
}
|
||||
if (0 == child->pid) {
|
||||
/* race condition */
|
||||
continue;
|
||||
}
|
||||
if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
|
||||
"%s PROC %s AT VSIZE %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name), st->vsize));
|
||||
if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) {
|
||||
/* report the problem */
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Record one resusage sample in the database log.
 *
 * The buffer is unpacked in this order: node name (string), node
 * stats, then any number of process stats. When log_node_stats is
 * set, the node stats go to the "nodestats" table; when
 * log_process_stats is set, each process entry goes to the
 * "procstats" table.
 *
 * Nothing is done when logging has been disabled. If the database
 * rejects an entry, logging is silently disabled for later calls.
 *
 * @param sample  buffer holding the packed sample
 */
static void res_log(opal_buffer_t *sample)
{
    opal_pstats_t *st=NULL;
    opal_node_stats_t *nst=NULL;
    int rc, n, i, nkv;
    opal_value_t kv[14];
    char *node = NULL;

    if (!log_enabled) {
        return;
    }

    /* unpack the node name */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &node, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    if (NULL == node) {
        /* a sample without a node name cannot be stored */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return;
    }

    /* unpack the node stats */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &nst, &n, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        free(node);
        return;
    }

    if (mca_sensor_resusage_component.log_node_stats) {
        /* convert this into an array of opal_value_t's - no clean way
         * to do this, so have to just manually map each field
         */
        for (i=0; i < 13; i++) {
            OBJ_CONSTRUCT(&kv[i], opal_value_t);
        }
        i=0;
        kv[i].key = strdup("ctime");
        kv[i].type = OPAL_TIMEVAL;
        kv[i].data.tv.tv_sec = nst->sample_time.tv_sec;
        kv[i++].data.tv.tv_usec = nst->sample_time.tv_usec;

        /* every key is heap-allocated like the others, so it is
         * safe for the value destructor to free it
         */
        kv[i].key = strdup("hostname");
        kv[i].type = OPAL_STRING;
        kv[i++].data.string = strdup(node);

        kv[i].key = strdup("total_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->total_mem;

        kv[i].key = strdup("free_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->free_mem;

        kv[i].key = strdup("buffers");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->buffers;

        kv[i].key = strdup("cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->cached;

        kv[i].key = strdup("swap_total");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_total;

        kv[i].key = strdup("swap_free");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_free;

        kv[i].key = strdup("mapped");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->mapped;

        kv[i].key = strdup("swap_cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_cached;

        kv[i].key = strdup("la");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la;

        kv[i].key = strdup("la5");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la5;

        kv[i].key = strdup("la15");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la15;

        /* the number of entries actually filled in (13) - use it
         * for both the store and the cleanup so nothing is dropped
         * or leaked
         */
        nkv = i;

        /* store it */
        if (ORTE_SUCCESS != (rc = opal_db.add_log("nodestats", kv, nkv))) {
            /* don't bark about it - just quietly disable the log */
            log_enabled = false;
        }
        for (i=0; i < nkv; i++) {
            OBJ_DESTRUCT(&kv[i]);
        }
    }

    OBJ_RELEASE(nst);
    /* the unpacked node name is ours to free */
    free(node);

    if (mca_sensor_resusage_component.log_process_stats) {
        /* unpack all process stats */
        n=1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(sample, &st, &n, OPAL_PSTAT))) {
            for (i=0; i < 14; i++) {
                OBJ_CONSTRUCT(&kv[i], opal_value_t);
            }
            kv[0].key = strdup("node");
            kv[0].type = OPAL_STRING;
            kv[0].data.string = strdup(st->node);
            kv[1].key = strdup("rank");
            kv[1].type = OPAL_INT32;
            kv[1].data.int32 = st->rank;
            kv[2].key = strdup("pid");
            kv[2].type = OPAL_PID;
            kv[2].data.pid = st->pid;
            kv[3].key = strdup("cmd");
            kv[3].type = OPAL_STRING;
            kv[3].data.string = strdup(st->cmd);
            /* the state is copied as two characters plus terminator */
            kv[4].key = strdup("state");
            kv[4].type = OPAL_STRING;
            kv[4].data.string = (char*)malloc(3 * sizeof(char));
            if (NULL != kv[4].data.string) {
                kv[4].data.string[0] = st->state[0];
                kv[4].data.string[1] = st->state[1];
                kv[4].data.string[2] = '\0';
            }
            kv[5].key = strdup("time");
            kv[5].type = OPAL_TIMEVAL;
            kv[5].data.tv.tv_sec = st->time.tv_sec;
            kv[5].data.tv.tv_usec = st->time.tv_usec;
            kv[6].key = strdup("percent_cpu");
            kv[6].type = OPAL_FLOAT;
            kv[6].data.fval = st->percent_cpu;
            kv[7].key = strdup("priority");
            kv[7].type = OPAL_INT32;
            kv[7].data.int32 = st->priority;
            kv[8].key = strdup("num_threads");
            kv[8].type = OPAL_INT16;
            kv[8].data.int16 = st->num_threads;
            kv[9].key = strdup("vsize");
            kv[9].type = OPAL_FLOAT;
            kv[9].data.fval = st->vsize;
            kv[10].key = strdup("rss");
            kv[10].type = OPAL_FLOAT;
            kv[10].data.fval = st->rss;
            kv[11].key = strdup("peak_vsize");
            kv[11].type = OPAL_FLOAT;
            kv[11].data.fval = st->peak_vsize;
            kv[12].key = strdup("processor");
            kv[12].type = OPAL_INT16;
            kv[12].data.int16 = st->processor;
            kv[13].key = strdup("sample_time");
            kv[13].type = OPAL_TIMEVAL;
            kv[13].data.tv.tv_sec = st->sample_time.tv_sec;
            kv[13].data.tv.tv_usec = st->sample_time.tv_usec;
            /* store it */
            if (ORTE_SUCCESS != (rc = opal_db.add_log("procstats", kv, 14))) {
                log_enabled = false;
            }
            for (i=0; i < 14; i++) {
                OBJ_DESTRUCT(&kv[i]);
            }
            OBJ_RELEASE(st);
            n=1;
        }
        /* running off the end of the buffer is the normal way out */
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }
}
|
@ -1,40 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Process Resource Utilization sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_RESUSAGE_H
|
||||
#define ORTE_SENSOR_RESUSAGE_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Component object for the resusage sensor: the generic sensor
 * component plus the settings taken from its MCA variables.
 */
struct orte_sensor_resusage_component_t {
|
||||
/* generic sensor component - must come first */
orte_sensor_base_component_t super;
|
||||
/* "sample_rate" MCA variable: sample rate in seconds */
int sample_rate;
|
||||
/* "node_memory_limit" MCA variable (a percentage) divided by 100:
 * fraction of total node memory that may be in use; 0 disables the check */
float node_memory_limit;
|
||||
/* "proc_memory_limit" MCA variable: max virtual memory size of a
 * process in MBytes; 0 disables the check */
float proc_memory_limit;
|
||||
/* "log_node_stats" MCA variable: log the node stats */
bool log_node_stats;
|
||||
/* "log_process_stats" MCA variable: log the process stats */
bool log_process_stats;
|
||||
};
|
||||
typedef struct orte_sensor_resusage_component_t orte_sensor_resusage_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_resusage_component_t mca_sensor_resusage_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_resusage_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,137 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_resusage.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_sensor_resusage_register (void);
|
||||
static int orte_sensor_resusage_open(void);
|
||||
static int orte_sensor_resusage_close(void);
|
||||
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/* The resusage sensor component instance: version information,
 * the component entry points defined in this file, the checkpoint
 * metadata and the names of the data this component senses. The
 * remaining members are filled in by the register/open functions.
 */
orte_sensor_resusage_component_t mca_sensor_resusage_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"resusage", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_resusage_open, /* component open */
|
||||
orte_sensor_resusage_close, /* component close */
|
||||
orte_sensor_resusage_query, /* component query */
|
||||
orte_sensor_resusage_register /* registers the MCA variables */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"procresource,noderesource" /* data being sensed */
|
||||
}
|
||||
};
|
||||
|
||||
static int node_memory_limit;
|
||||
static int proc_memory_limit;
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_resusage_register (void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_resusage_component.super.base_version;
|
||||
|
||||
mca_sensor_resusage_component.sample_rate = 0;
|
||||
(void) mca_base_component_var_register (c, "sample_rate", "Sample rate in seconds (default: 0)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_sensor_resusage_component.sample_rate);
|
||||
if (mca_sensor_resusage_component.sample_rate < 0) {
|
||||
opal_output(0, "Illegal value %d - must be > 0", mca_sensor_resusage_component.sample_rate);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
node_memory_limit = 0;
|
||||
(void) mca_base_component_var_register (c, "node_memory_limit",
|
||||
"Percentage of total memory that can be in-use",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&node_memory_limit);
|
||||
mca_sensor_resusage_component.node_memory_limit = (float)node_memory_limit/100.0;
|
||||
|
||||
proc_memory_limit = 0;
|
||||
(void) mca_base_component_var_register (c, "proc_memory_limit",
|
||||
"Max virtual memory size in MBytes",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&proc_memory_limit);
|
||||
mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;
|
||||
|
||||
mca_sensor_resusage_component.log_node_stats = false;
|
||||
(void) mca_base_component_var_register (c, "log_node_stats", "Log the node stats",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_sensor_resusage_component.log_node_stats);
|
||||
|
||||
mca_sensor_resusage_component.log_process_stats = false;
|
||||
(void) mca_base_component_var_register (c, "log_process_stats", "Log the process stats",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_sensor_resusage_component.log_process_stats);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_sensor_resusage_open(void)
|
||||
{
|
||||
if (mca_sensor_resusage_component.sample_rate < 0) {
|
||||
opal_output(0, "Illegal value %d - must be > 0", mca_sensor_resusage_component.sample_rate);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
mca_sensor_resusage_component.node_memory_limit = (float) node_memory_limit/100.0;
|
||||
mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query function: hand back the resusage module together with
 * its selection priority.  Always succeeds.
 */
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *) &orte_sensor_resusage_module;
    *priority = 100;  /* ahead of heartbeat */

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_resusage_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,107 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* @file:
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_H
|
||||
#define MCA_SENSOR_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
*/
|
||||
|
||||
/* start collecting data */
|
||||
typedef void (*orte_sensor_API_module_start_fn_t)(orte_jobid_t job);
|
||||
|
||||
/* stop collecting data */
|
||||
typedef void (*orte_sensor_API_module_stop_fn_t)(orte_jobid_t job);
|
||||
|
||||
/* API module */
|
||||
/*
|
||||
* Ver 1.0
|
||||
*/
|
||||
struct orte_sensor_base_API_module_1_0_0_t {
|
||||
orte_sensor_API_module_start_fn_t start;
|
||||
orte_sensor_API_module_stop_fn_t stop;
|
||||
};
|
||||
|
||||
typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t;
|
||||
typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t;
|
||||
|
||||
/* initialize the module */
|
||||
typedef int (*orte_sensor_base_module_init_fn_t)(void);
|
||||
|
||||
/* finalize the module */
|
||||
typedef void (*orte_sensor_base_module_finalize_fn_t)(void);
|
||||
|
||||
/* tell the module to sample its sensor */
|
||||
typedef void (*orte_sensor_base_module_sample_fn_t)(void);
|
||||
|
||||
/* pass a buffer to the module for logging */
|
||||
typedef void (*orte_sensor_base_module_log_fn_t)(opal_buffer_t *sample);
|
||||
|
||||
/*
|
||||
* Component modules Ver 1.0
|
||||
*/
|
||||
struct orte_sensor_base_module_1_0_0_t {
|
||||
orte_sensor_base_module_init_fn_t init;
|
||||
orte_sensor_base_module_finalize_fn_t finalize;
|
||||
orte_sensor_API_module_start_fn_t start;
|
||||
orte_sensor_API_module_stop_fn_t stop;
|
||||
orte_sensor_base_module_sample_fn_t sample;
|
||||
orte_sensor_base_module_log_fn_t log;
|
||||
};
|
||||
|
||||
typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t;
|
||||
typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t;
|
||||
|
||||
/*
|
||||
* the standard component data structure
|
||||
*/
|
||||
struct orte_sensor_base_component_1_0_0_t {
|
||||
mca_base_component_t base_version;
|
||||
mca_base_component_data_t base_data;
|
||||
char *data_measured;
|
||||
};
|
||||
typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t;
|
||||
typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t;
|
||||
|
||||
|
||||
|
||||
/*
 * Version initializer for components of type sensor v1.0.0.
 * Sensor v1.0 is chained to MCA v2.0: the expansion is the MCA base
 * version fields followed by the framework name and its version triple.
 */
#define ORTE_SENSOR_BASE_VERSION_1_0_0 \
    MCA_BASE_VERSION_2_0_0,            \
    "sensor", 1, 0, 0
|
||||
|
||||
/* Global structure for accessing sensor functions
|
||||
*/
|
||||
ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_SENSOR_H */
|
@ -1,50 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef ORTE_MCA_SENSOR_TYPES_H
|
||||
#define ORTE_MCA_SENSOR_TYPES_H
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
|
||||
/*
|
||||
* General SENSOR types - instanced in runtime/orte_globals.c
|
||||
*/
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Scaling modes available for sensor data (values spelled out explicitly). */
enum {
    ORTE_SENSOR_SCALE_LINEAR  = 0,
    ORTE_SENSOR_SCALE_LOG     = 1,
    ORTE_SENSOR_SCALE_SIGMOID = 2
};
|
||||
|
||||
/*
|
||||
* Structure for passing data from sensors
|
||||
*/
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
char *sensor;
|
||||
struct timeval timestamp;
|
||||
opal_byte_object_t data;
|
||||
} orte_sensor_data_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,41 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_ompidata_DATA = help-orte-sensor-sigar.txt

sources = \
        sensor_sigar.c \
        sensor_sigar.h \
        sensor_sigar_component.c

# The component is built in this directory either as an installable DSO
# named mca_<type>_<name>.la or, for static builds, as a non-installed
# convenience library named libmca_<type>_<name>.la.

if MCA_BUILD_orte_sensor_sigar_DSO
component_install = mca_sensor_sigar.la
component_noinst =
else
component_install =
component_noinst = libmca_sensor_sigar.la
endif

# DSO build
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_sigar_la_SOURCES = $(sources)
mca_sensor_sigar_la_CPPFLAGS = $(sensor_sigar_CPPFLAGS)
mca_sensor_sigar_la_LDFLAGS = -module -avoid-version $(sensor_sigar_LDFLAGS)
mca_sensor_sigar_la_LIBADD = $(sensor_sigar_LIBS) -lm

# static build
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_sigar_la_SOURCES = $(sources)
libmca_sensor_sigar_la_CPPFLAGS = $(sensor_sigar_CPPFLAGS)
libmca_sensor_sigar_la_LDFLAGS = -module -avoid-version $(sensor_sigar_LDFLAGS)
libmca_sensor_sigar_la_LIBADD = $(sensor_sigar_LIBS) -lm
|
@ -1,53 +0,0 @@
|
||||
dnl -*- shell-script -*-
|
||||
dnl
|
||||
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
dnl $COPYRIGHT$
|
||||
dnl
|
||||
dnl Additional copyrights may follow
|
||||
dnl
|
||||
dnl $HEADER$
|
||||
dnl
|
||||
|
||||
# MCA_orte_sensor_sigar_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure the sigar sensor component.  The component is built only when
# --with-sigar is given; it is supported on Linux and Mac OS X.  Requesting
# it on another platform or without a usable sigar.h and library aborts
# configure.
AC_DEFUN([MCA_orte_sensor_sigar_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/sigar/Makefile])

    AC_ARG_WITH([sigar],
                [AS_HELP_STRING([--with-sigar],
                                [Build sigar support (default: no)])],
                [], [with_sigar=no])

    # do not build if support not requested
    AS_IF([test "$with_sigar" != "no"],
          [AS_IF([test "$opal_found_linux" = "yes" || test "$opal_found_apple" = "yes"],
                 [AS_IF([test "$opal_found_apple" = "yes"],
                        [libname="sigar-universal-macosx"], [libname="sigar"])

                  # start from an empty search directory so a stale value
                  # cannot leak in and only use the option value when it is
                  # something other than the bare yes
                  orte_check_sigar_dir=
                  AS_IF([test -n "$with_sigar" && test "$with_sigar" != "yes"],
                        [orte_check_sigar_dir="$with_sigar"])

                  OMPI_CHECK_PACKAGE([sensor_sigar],
                                     [sigar.h],
                                     [$libname],
                                     [sigar_proc_cpu_get],
                                     [],
                                     [$orte_check_sigar_dir],
                                     [],
                                     [$1],
                                     [AC_MSG_WARN([SIGAR SENSOR SUPPORT REQUESTED])
                                      AC_MSG_WARN([BUT REQUIRED LIBRARY OR HEADER NOT FOUND])
                                      AC_MSG_ERROR([CANNOT CONTINUE])
                                      $2])],
                 [AC_MSG_WARN([SIGAR SENSOR SUPPORT REQUESTED])
                  AC_MSG_WARN([BUT ONLY SUPPORTED ON LINUX AND MAC])
                  AC_MSG_ERROR([CANNOT CONTINUE])
                  $2])],
          [$2])

    # Evaluate the platform test in the shell and define the symbol to its
    # numeric result.  Passing the test command itself as the value would
    # put the literal shell text into the generated config header.
    AS_IF([test "$opal_found_linux" = "yes"],
          [orte_sigar_linux=1],
          [orte_sigar_linux=0])
    AC_DEFINE_UNQUOTED([ORTE_SIGAR_LINUX], [$orte_sigar_linux],
                       [Whether the sigar sensor is being configured on Linux (1) or not (0)])

    AC_SUBST(sensor_sigar_CPPFLAGS)
    AC_SUBST(sensor_sigar_LDFLAGS)
    AC_SUBST(sensor_sigar_LIBS)
])dnl
|
@ -1,20 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the memory usage sensor
|
||||
#
|
||||
[mem-limit-exceeded]
|
||||
A process has exceeded the specified limit on memory usage:
|
||||
|
||||
Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
@ -1,959 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifdef ORTE_SIGAR_LINUX
|
||||
#include <sigar.h>
|
||||
#else
|
||||
#include <libsigar-universal-macosx>
|
||||
#endif
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/class/opal_ring_buffer.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/pstat/pstat.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/mca/db/db.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_sigar.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
static void sigar_sample(void);
|
||||
static void sigar_log(opal_buffer_t *buf);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_sigar_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop,
|
||||
sigar_sample,
|
||||
sigar_log
|
||||
};
|
||||
|
||||
/* define some local classes */
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *interface;
|
||||
uint64_t rx_packets;
|
||||
uint64_t rx_bytes;
|
||||
uint64_t tx_packets;
|
||||
uint64_t tx_bytes;
|
||||
} sensor_sigar_interface_t;
|
||||
static void sit_cons(sensor_sigar_interface_t *sit)
|
||||
{
|
||||
sit->interface = NULL;
|
||||
sit->rx_packets = 0;
|
||||
sit->rx_bytes = 0;
|
||||
sit->tx_packets = 0;
|
||||
sit->tx_bytes = 0;
|
||||
}
|
||||
static void sit_dest(sensor_sigar_interface_t *sit)
|
||||
{
|
||||
if (NULL != sit->interface) {
|
||||
free(sit->interface);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(sensor_sigar_interface_t,
|
||||
opal_list_item_t,
|
||||
sit_cons, sit_dest);
|
||||
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *mount_pt;
|
||||
uint64_t reads;
|
||||
uint64_t writes;
|
||||
uint64_t read_bytes;
|
||||
uint64_t write_bytes;
|
||||
} sensor_sigar_disks_t;
|
||||
static void dit_cons(sensor_sigar_disks_t *dit)
|
||||
{
|
||||
dit->mount_pt = NULL;
|
||||
dit->reads = 0;
|
||||
dit->writes = 0;
|
||||
dit->read_bytes = 0;
|
||||
dit->write_bytes = 0;
|
||||
}
|
||||
static void dit_dest(sensor_sigar_disks_t *dit)
|
||||
{
|
||||
if (NULL != dit->mount_pt) {
|
||||
free(dit->mount_pt);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(sensor_sigar_disks_t,
|
||||
opal_list_item_t,
|
||||
dit_cons, dit_dest);
|
||||
|
||||
static sigar_t *sigar;
|
||||
static opal_list_t fslist;
|
||||
static opal_list_t netlist;
|
||||
static time_t last_sample = 0;
|
||||
static struct cpu_data_t {
|
||||
uint64_t user;
|
||||
uint64_t nice;
|
||||
uint64_t sys;
|
||||
uint64_t idle;
|
||||
uint64_t wait;
|
||||
uint64_t total;
|
||||
} pcpu;
|
||||
static struct swap_data_t {
|
||||
uint64_t page_in;
|
||||
uint64_t page_out;
|
||||
} pswap;
|
||||
static bool log_enabled = true;
|
||||
static opal_buffer_t test_vector;
|
||||
|
||||
static uint64_t metric_diff_calc(sigar_uint64_t newval, uint64_t oldval,
|
||||
const char *name_for_log,
|
||||
const char* value_name_for_log);
|
||||
static void generate_test_vector(opal_buffer_t *v);
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
sigar_file_system_list_t sigar_fslist;
|
||||
sigar_net_interface_list_t sigar_netlist;
|
||||
sensor_sigar_disks_t *dit;
|
||||
sensor_sigar_interface_t *sit;
|
||||
unsigned int i;
|
||||
|
||||
if (mca_sensor_sigar_component.test) {
|
||||
/* generate test vector */
|
||||
OBJ_CONSTRUCT(&test_vector, opal_buffer_t);
|
||||
generate_test_vector(&test_vector);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* setup the globals */
|
||||
OBJ_CONSTRUCT(&fslist, opal_list_t);
|
||||
OBJ_CONSTRUCT(&netlist, opal_list_t);
|
||||
pcpu.user = 0;
|
||||
pcpu.nice = 0;
|
||||
pcpu.sys = 0;
|
||||
pcpu.idle = 0;
|
||||
pcpu.wait = 0;
|
||||
pcpu.total = 0;
|
||||
pswap.page_in = 0;
|
||||
pswap.page_out = 0;
|
||||
|
||||
/* initialize sigar */
|
||||
if (0 != sigar_open(&sigar)) {
|
||||
opal_output(0, "%s: sigar_open failed on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_process_info.nodename);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* load the disk list */
|
||||
if (0 != sigar_file_system_list_get(sigar, &sigar_fslist)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
for (i = 0; i < sigar_fslist.number; i++) {
|
||||
if (sigar_fslist.data[i].type == SIGAR_FSTYPE_LOCAL_DISK || sigar_fslist.data[i].type == SIGAR_FSTYPE_NETWORK) {
|
||||
dit = OBJ_NEW(sensor_sigar_disks_t);
|
||||
dit->mount_pt = strdup(sigar_fslist.data[i].dir_name);
|
||||
opal_list_append(&fslist, &dit->super);
|
||||
}
|
||||
}
|
||||
sigar_file_system_list_destroy(sigar, &sigar_fslist);
|
||||
|
||||
/* load the list of network interfaces */
|
||||
if (0 != sigar_net_interface_list_get(sigar, &sigar_netlist)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
for (i=0; i < sigar_netlist.number; i++) {
|
||||
sit = OBJ_NEW(sensor_sigar_interface_t);
|
||||
sit->interface = strdup(sigar_netlist.data[i]);
|
||||
opal_list_append(&netlist, &sit->super);
|
||||
}
|
||||
sigar_net_interface_list_destroy(sigar, &sigar_netlist);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
if (mca_sensor_sigar_component.test) {
|
||||
/* destruct test vector */
|
||||
OBJ_DESTRUCT(&test_vector);
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL != sigar) {
|
||||
sigar_close(sigar);
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&fslist))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&fslist);
|
||||
while (NULL != (item = opal_list_remove_first(&netlist))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&netlist);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local processes
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Stop entry point of the module.  This module has nothing to do when a
 * job stops, so the call is a no-op.
 */
static void stop(orte_jobid_t jobid)
{
    (void) jobid;  /* intentionally unused */
}
|
||||
|
||||
static void sigar_sample(void)
|
||||
{
|
||||
sigar_mem_t mem;
|
||||
sigar_swap_t swap;
|
||||
sigar_cpu_t cpu;
|
||||
sigar_loadavg_t loadavg;
|
||||
sigar_disk_usage_t tdisk;
|
||||
sensor_sigar_disks_t *dit;
|
||||
sigar_file_system_usage_t fsusage;
|
||||
sensor_sigar_interface_t *sit;
|
||||
sigar_net_interface_stat_t tnet, ifc;
|
||||
uint64_t reads, writes, read_bytes, write_bytes;
|
||||
uint64_t rxpkts, txpkts, rxbytes, txbytes;
|
||||
uint64_t ui64;
|
||||
opal_buffer_t data, *bptr;
|
||||
int rc;
|
||||
time_t now;
|
||||
double cpu_diff, tdiff;
|
||||
float tmp;
|
||||
char *ctmp;
|
||||
char time_str[40];
|
||||
char *timestamp_str;
|
||||
|
||||
if (mca_sensor_sigar_component.test) {
|
||||
/* just send the test vector */
|
||||
bptr = &test_vector;
|
||||
opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER);
|
||||
return;
|
||||
}
|
||||
|
||||
/* prep the buffer to collect the data */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
/* pack our name */
|
||||
ctmp = strdup("sigar");
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &ctmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(ctmp);
|
||||
/* include our node name */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the sample time */
|
||||
now = time(NULL);
|
||||
tdiff = difftime(now, last_sample);
|
||||
/* pass the time along as a simple string */
|
||||
strftime(time_str, sizeof(time_str), "%F %T%z", localtime(&now));
|
||||
asprintf(×tamp_str, "%s", time_str);
|
||||
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, ×tamp_str, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(timestamp_str);
|
||||
|
||||
/* get the memory usage for this node */
|
||||
memset(&mem, 0, sizeof(mem));
|
||||
sigar_mem_get(sigar, &mem);
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"mem total: %" PRIu64 " used: %" PRIu64 " actual used: %" PRIu64 " actual free: %" PRIu64 "",
|
||||
mem.total, mem.used, mem.actual_used, mem.actual_free);
|
||||
/* add it to the data */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &mem.total, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &mem.used, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &mem.actual_used, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &mem.actual_free, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get swap data */
|
||||
memset(&swap, 0, sizeof(swap));
|
||||
sigar_swap_get(sigar, &swap);
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"swap total: %" PRIu64 " used: %" PRIu64 "page_in: %" PRIu64 " page_out: %" PRIu64 "\n",
|
||||
swap.total, swap.used, swap.page_in, swap.page_out);
|
||||
/* compute the values we actually want and add them to the data */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &swap.total, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &swap.used, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
ui64 = swap.page_in - pswap.page_in;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &ui64, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
ui64 = swap.page_out - pswap.page_out;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &ui64, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the cpu usage */
|
||||
memset(&cpu, 0, sizeof(cpu));
|
||||
sigar_cpu_get(sigar, &cpu);
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"cpu user: %" PRIu64 " sys: %" PRIu64 " idle: %" PRIu64 " wait: %" PRIu64 " nice: %" PRIu64 " total: %" PRIu64 "",
|
||||
cpu.user, cpu.sys, cpu.idle, cpu.wait, cpu.nice, cpu.total);
|
||||
/* compute the values we actually want and add them to the data */
|
||||
cpu_diff = (double)(cpu.total - pcpu.total);
|
||||
tmp = (float)((cpu.user - pcpu.user) * 100.0 / cpu_diff) + (float)((cpu.nice - pcpu.nice) * 100.0 / cpu_diff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
tmp = ((float) (cpu.sys - pcpu.sys) * 100.0 / cpu_diff) + ((float)((cpu.wait - pcpu.wait) * 100.0 / cpu_diff));
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
tmp = (float) (cpu.idle - pcpu.idle) * 100.0 / cpu_diff;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
/* update the values */
|
||||
pcpu.user = cpu.user;
|
||||
pcpu.nice = cpu.nice;
|
||||
pcpu.sys = cpu.sys;
|
||||
pcpu.wait = cpu.wait;
|
||||
pcpu.idle = cpu.idle;
|
||||
pcpu.total = cpu.total;
|
||||
|
||||
/* get load average data */
|
||||
memset(&loadavg, 0, sizeof(loadavg));
|
||||
sigar_loadavg_get(sigar, &loadavg);
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"load_avg: %e %e %e",
|
||||
loadavg.loadavg[0], loadavg.loadavg[1], loadavg.loadavg[2]);
|
||||
/* add them to the data */
|
||||
tmp = (float)loadavg.loadavg[0];
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
tmp = (float)loadavg.loadavg[1];
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
tmp = (float)loadavg.loadavg[2];
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &tmp, 1, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get disk usage data */
|
||||
memset(&tdisk, 0, sizeof(tdisk));
|
||||
OPAL_LIST_FOREACH(dit, &fslist, sensor_sigar_disks_t) {
|
||||
if (0 != sigar_file_system_usage_get(sigar, dit->mount_pt, &fsusage)) {
|
||||
opal_output(0, "%s Failed to get usage data for filesystem %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dit->mount_pt);
|
||||
} else {
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"FileSystem: %s Reads: %" PRIu64 " Writes: %" PRIu64 " ReadBytes: %" PRIu64 " WriteBytes: %" PRIu64 "",
|
||||
dit->mount_pt, fsusage.disk.reads, fsusage.disk.writes, fsusage.disk.read_bytes, fsusage.disk.write_bytes);
|
||||
/* compute the number of reads since last reading */
|
||||
reads = metric_diff_calc(fsusage.disk.reads, dit->reads, dit->mount_pt, "disk reads");
|
||||
dit->reads = fsusage.disk.reads; /* old = new */
|
||||
/* compute the number of writes since last reading */
|
||||
writes = metric_diff_calc(fsusage.disk.writes, dit->writes, dit->mount_pt, "disk writes");
|
||||
dit->writes = fsusage.disk.writes; /* old = new */
|
||||
/* compute the number of read bytes since last reading */
|
||||
read_bytes = metric_diff_calc(fsusage.disk.read_bytes, dit->read_bytes, dit->mount_pt, "disk read bytes");
|
||||
dit->read_bytes = fsusage.disk.read_bytes; /* old = new */
|
||||
/* compute the number of bytes written since last reading */
|
||||
write_bytes = metric_diff_calc(fsusage.disk.write_bytes, dit->write_bytes, dit->mount_pt, "disk write bytes");
|
||||
dit->write_bytes = fsusage.disk.write_bytes; /* old = new */
|
||||
opal_output_verbose(4, orte_sensor_base_framework.framework_output,
|
||||
"FileSystem: %s ReadsChange: %" PRIu64 " WritesChange: %" PRIu64 " ReadBytesChange: %" PRIu64 " WriteBytesChange: %" PRIu64 "",
|
||||
dit->mount_pt, reads, writes, read_bytes, write_bytes);
|
||||
/* accumulate the values */
|
||||
tdisk.reads += reads;
|
||||
tdisk.writes += writes;
|
||||
tdisk.read_bytes += read_bytes;
|
||||
tdisk.write_bytes += write_bytes;
|
||||
}
|
||||
}
|
||||
opal_output_verbose(4, orte_sensor_base_framework.framework_output,
|
||||
"Totals: ReadsChange: %" PRIu64 " WritesChange: %" PRIu64 " ReadBytesChange: %" PRIu64 " WriteBytesChange: %" PRIu64 "",
|
||||
tdisk.reads, tdisk.writes, tdisk.read_bytes, tdisk.write_bytes);
|
||||
/* compute the values we actually want and add them to the data */
|
||||
reads = (uint64_t)ceil((double)tdisk.reads/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &reads, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
writes = (uint64_t)ceil((double)tdisk.writes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &writes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
read_bytes = (uint64_t)ceil((double)tdisk.read_bytes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &read_bytes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
write_bytes = (uint64_t)ceil((double)tdisk.write_bytes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &write_bytes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get network usage data */
|
||||
memset(&tnet, 0, sizeof(tnet));
|
||||
OPAL_LIST_FOREACH(sit, &netlist, sensor_sigar_interface_t) {
|
||||
memset(&ifc, 0, sizeof(ifc));
|
||||
if (0 != sigar_net_interface_stat_get(sigar, sit->interface, &ifc)) {
|
||||
opal_output(0, "%s Failed to get usage data for interface %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), sit->interface);
|
||||
} else {
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"Interface: %s RecvdPackets: %" PRIu64 " RecvdBytes: %" PRIu64 " TransPackets: %" PRIu64 " TransBytes: %" PRIu64 "",
|
||||
sit->interface, ifc.rx_packets, ifc.rx_bytes, ifc.tx_packets, ifc.tx_bytes);
|
||||
/* compute the number of recvd packets since last reading */
|
||||
rxpkts = metric_diff_calc(ifc.rx_packets, sit->rx_packets, sit->interface, "rx packets");
|
||||
sit->rx_packets = ifc.rx_packets; /* old = new */
|
||||
/* compute the number of transmitted packets since last reading */
|
||||
txpkts = metric_diff_calc(ifc.tx_packets, sit->tx_packets, sit->interface, "tx packets");
|
||||
sit->tx_packets = ifc.tx_packets; /* old = new */
|
||||
/* compute the number of recvd bytes since last reading */
|
||||
rxbytes = metric_diff_calc(ifc.rx_bytes, sit->rx_bytes, sit->interface, "rx bytes");
|
||||
sit->rx_bytes = ifc.rx_bytes; /* old = new */
|
||||
/* compute the number of transmitted bytes since last reading */
|
||||
txbytes = metric_diff_calc(ifc.tx_bytes, sit->tx_bytes, sit->interface, "tx bytes");
|
||||
sit->tx_bytes = ifc.tx_bytes; /* old = new */
|
||||
opal_output_verbose(4, orte_sensor_base_framework.framework_output,
|
||||
"Interface: %s RxPkts: %" PRIu64 " TxPkts: %" PRIu64 " RxBytes: %" PRIu64 " TxBytes: %" PRIu64 "",
|
||||
sit->interface, rxpkts, txpkts, rxbytes, txbytes);
|
||||
/* accumulate the values */
|
||||
tnet.rx_packets += rxpkts;
|
||||
tnet.rx_bytes += rxbytes;
|
||||
tnet.tx_packets += txpkts;
|
||||
tnet.tx_bytes += txbytes;
|
||||
}
|
||||
}
|
||||
opal_output_verbose(4, orte_sensor_base_framework.framework_output,
|
||||
"Totals: RxPkts: %" PRIu64 " TxPkts: %" PRIu64 " RxBytes: %" PRIu64 " TxBytes: %" PRIu64 "",
|
||||
tnet.rx_packets, tnet.tx_packets, tnet.rx_bytes, tnet.tx_bytes);
|
||||
/* compute the values we actually want and add them to the data */
|
||||
rxpkts = (uint64_t)ceil((double)tnet.rx_packets/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &rxpkts, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
txpkts = (uint64_t)ceil((double)tnet.tx_packets/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &txpkts, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
rxbytes = (uint64_t)ceil((double)tnet.rx_bytes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &rxbytes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
txbytes = (uint64_t)ceil((double)tnet.tx_bytes/tdiff);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&data, &txbytes, 1, OPAL_UINT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* xfer the data for transmission - need at least one prior sample before doing so */
|
||||
if (0 < last_sample) {
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
|
||||
last_sample = now;
|
||||
}
|
||||
|
||||
/*
 * Unpack one sigar sample and hand it to the database as a "sigar" log record.
 *
 * The buffer is read in this order: the sending host's name (string), the
 * sample time (string), then the metrics listed in the table below.  Each
 * value is stored as one opal_value_t; the record holds the sample time
 * first ("ctime"), then "hostname", then the metrics in buffer order.
 *
 * Failure handling:
 *  - an unpack error is reported through ORTE_ERROR_LOG and the record is
 *    dropped;
 *  - a record whose hostname or sample time unpacks as NULL is dropped as
 *    well, since both are copied with strdup();
 *  - if the database rejects the record, logging is quietly disabled for
 *    all later samples.
 * Every exit path releases the key/value storage and the unpacked strings.
 *
 * @param sample  buffer holding one sigar record
 */
static void sigar_log(opal_buffer_t *sample)
{
    /* metrics in the order they follow the sample time in the buffer */
    const struct {
        const char *key;
        opal_data_type_t type;
    } metrics[] = {
        { "mem_total",       OPAL_UINT64 },  /* total memory */
        { "mem_used",        OPAL_UINT64 },  /* total used memory */
        { "mem_actual_used", OPAL_UINT64 },  /* actual used memory */
        { "mem_actual_free", OPAL_UINT64 },  /* actual free memory */
        { "swap_total",      OPAL_UINT64 },  /* total swap memory */
        { "swap_used",       OPAL_UINT64 },  /* swap used */
        { "swap_page_in",    OPAL_UINT64 },  /* swap pages in */
        { "swap_page_out",   OPAL_UINT64 },  /* swap pages out */
        { "cpu_user",        OPAL_FLOAT  },  /* cpu user */
        { "cpu_sys",         OPAL_FLOAT  },  /* cpu sys */
        { "cpu_idle",        OPAL_FLOAT  },  /* cpu idle */
        { "load0",           OPAL_FLOAT  },  /* la0 */
        { "load1",           OPAL_FLOAT  },  /* la5 */
        { "load2",           OPAL_FLOAT  },  /* la15 */
        { "disk_ro_rate",    OPAL_UINT64 },  /* disk read ops rate */
        { "disk_wo_rate",    OPAL_UINT64 },  /* disk write ops rate */
        { "disk_rb_rate",    OPAL_UINT64 },  /* disk read bytes/sec */
        { "disk_wb_rate",    OPAL_UINT64 },  /* disk write bytes/sec */
        { "net_rp_rate",     OPAL_UINT64 },  /* net recv packet rate */
        { "net_wp_rate",     OPAL_UINT64 },  /* net tx packet rate */
        { "net_rb_rate",     OPAL_UINT64 },  /* net recv bytes rate */
        { "net_wb_rate",     OPAL_UINT64 }   /* net tx bytes rate */
    };
    enum {
        NUM_METRICS = sizeof(metrics) / sizeof(metrics[0]),
        NUM_KV = NUM_METRICS + 2   /* metrics plus "ctime" and "hostname" */
    };
    opal_value_t kv[NUM_KV];
    char *hostname = NULL;
    char *sampletime = NULL;
    uint64_t uint64;
    float fval;
    int32_t n;
    int rc;
    int i, m;

    if (!log_enabled) {
        return;
    }

    /* prep the xfr storage first so that every later exit can release it */
    for (i = 0; i < NUM_KV; i++) {
        OBJ_CONSTRUCT(&kv[i], opal_value_t);
    }

    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &hostname, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    opal_output_verbose(3, orte_sensor_base_framework.framework_output,
                        "%s Received log from host %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == hostname) ? "NULL" : hostname);

    /* sample time */
    n = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &sampletime, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* both strings are duplicated below - strdup(NULL) is undefined, so a
     * record missing either one is rejected instead
     */
    if (NULL == hostname || NULL == sampletime) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        goto cleanup;
    }

    /* unpack the incoming data and xfer it for storage */
    i = 0;

    kv[i].key = strdup("ctime");
    kv[i].type = OPAL_STRING;
    kv[i++].data.string = strdup(sampletime);

    kv[i].key = strdup("hostname");
    kv[i].type = OPAL_STRING;
    kv[i++].data.string = strdup(hostname);

    for (m = 0; m < NUM_METRICS; m++, i++) {
        n = 1;
        if (OPAL_FLOAT == metrics[m].type) {
            rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT);
        } else {
            rc = opal_dss.unpack(sample, &uint64, &n, OPAL_UINT64);
        }
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        kv[i].key = strdup(metrics[m].key);
        kv[i].type = metrics[m].type;
        if (OPAL_FLOAT == metrics[m].type) {
            kv[i].data.fval = fval;
        } else {
            kv[i].data.uint64 = uint64;
        }
    }

    /* store it */
    if (ORTE_SUCCESS != (rc = opal_db.add_log("sigar", kv, NUM_KV))) {
        /* don't bark about it - just quietly disable the log */
        log_enabled = false;
    }

cleanup:
    /* cleanup the xfr storage */
    for (i = 0; i < NUM_KV; i++) {
        OBJ_DESTRUCT(&kv[i]);
    }
    free(sampletime);
    free(hostname);
}
|
||||
|
||||
/* Helper function to calculate the metric differences */
|
||||
static uint64_t metric_diff_calc(sigar_uint64_t newval, uint64_t oldval,
|
||||
const char *name_for_log,
|
||||
const char *value_name_for_log)
|
||||
{
|
||||
uint64_t diff;
|
||||
|
||||
if (newval < oldval) {
|
||||
/* assume that the value was reset and we are starting over */
|
||||
opal_output_verbose(3, orte_sensor_base_framework.framework_output,
|
||||
"%s metric_diff_calc: new value %" PRIu64 " is less than old value %" PRIu64
|
||||
" for %s metric %s; assume the value was reset and set diff to new value.",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
newval, oldval, name_for_log, value_name_for_log);
|
||||
diff = newval;
|
||||
} else {
|
||||
diff = newval - oldval;
|
||||
}
|
||||
|
||||
return diff;
|
||||
}
|
||||
|
||||
/*
 * Fill a buffer with a synthetic sigar sample.
 *
 * Packed in order: the string "sigar", this process's node name, the
 * current time as a string (trailing newline removed), then
 *   - eight uint64 values 1..8   (mem_total, mem_used, mem_actual_used,
 *                                 mem_actual_free, swap total, swap used,
 *                                 swap page in, swap page out)
 *   - six float values 1.0..6.0  (cpu user, cpu sys, cpu idle, la, la5, la15)
 *   - eight uint64 values 9..16  (reads, writes, read bytes, write bytes,
 *                                 rx packets, tx packets, rx bytes, tx bytes)
 *
 * A pack failure is reported through ORTE_ERROR_LOG and packing stops at
 * that point.  If ctime() cannot format the time, a placeholder string is
 * packed instead.
 *
 * @param v  buffer to append the test vector to
 */
static void generate_test_vector(opal_buffer_t *v)
{
    char name[] = "sigar";
    char notime[] = "unknown";
    char *ctmp;
    char *newline;
    uint64_t ui64;
    float ft;
    time_t now;
    int rc;
    int k;

    /* pack needs the address of a char*, so go through ctmp */
    ctmp = name;
    if (OPAL_SUCCESS != (rc = opal_dss.pack(v, &ctmp, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(v, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* get the time so it will be unique each time */
    now = time(NULL);
    /* pass the time along as a simple string */
    ctmp = ctime(&now);
    if (NULL == ctmp) {
        /* ctime could not format the value - keep the vector complete */
        ctmp = notime;
    } else if (NULL != (newline = strchr(ctmp, '\n'))) {
        /* strip the trailing newline */
        *newline = '\0';
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(v, &ctmp, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* mem_total, mem_used, mem_actual_used, mem_actual_free,
     * swap total, swap used, swap page in, swap page out
     */
    for (ui64 = 1; ui64 <= 8; ui64++) {
        if (OPAL_SUCCESS != (rc = opal_dss.pack(v, &ui64, 1, OPAL_UINT64))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
    }

    /* cpu user, cpu sys, cpu idle, la, la5, la15 */
    for (k = 1; k <= 6; k++) {
        ft = (float)k;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(v, &ft, 1, OPAL_FLOAT))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
    }

    /* reads, writes, read bytes, write bytes,
     * rx packets, tx packets, rx bytes, tx bytes
     */
    for (ui64 = 9; ui64 <= 16; ui64++) {
        if (OPAL_SUCCESS != (rc = opal_dss.pack(v, &ui64, 1, OPAL_UINT64))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
    }
}
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* SIGAR resource manager sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_SIGAR_H
|
||||
#define ORTE_SENSOR_SIGAR_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
orte_sensor_base_component_t super;
|
||||
bool test;
|
||||
} orte_sensor_sigar_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_sigar_component_t mca_sensor_sigar_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_sigar_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,88 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_sigar.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_sigar_open(void);
|
||||
static int orte_sensor_sigar_close(void);
|
||||
static int orte_sensor_sigar_query(mca_base_module_t **module, int *priority);
|
||||
static int sigar_component_register(void);
|
||||
|
||||
orte_sensor_sigar_component_t mca_sensor_sigar_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"sigar", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_sigar_open, /* component open */
|
||||
orte_sensor_sigar_close, /* component close */
|
||||
orte_sensor_sigar_query, /* component query */
|
||||
sigar_component_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"procresource,noderesource"
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_sigar_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Component query function: always offers the sigar module.
 *
 * @param module    [out] set to the sigar sensor module
 * @param priority  [out] set to 150
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_sigar_query(mca_base_module_t **module, int *priority)
{
    /* Being buildable is enough reason to be selected: even when this
     * process takes no samples of its own, the module must be present
     * to log results received from elsewhere.
     */
    *module = (mca_base_module_t *)&orte_sensor_sigar_module;
    *priority = 150;  /* ahead of heartbeat and resusage */

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_sigar_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int sigar_component_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_sigar_component.super.base_version;
|
||||
|
||||
mca_sensor_sigar_component.test = false;
|
||||
(void) mca_base_component_var_register (c, "test",
|
||||
"Generate and pass test vector",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
& mca_sensor_sigar_component.test);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -21,7 +22,6 @@
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
|
||||
#include "orte/mca/state/base/base.h"
|
||||
@ -575,9 +575,6 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* turn off any sensor monitors on this job */
|
||||
orte_sensor.stop(jdata->jobid);
|
||||
|
||||
/* tell the IOF that the job is complete */
|
||||
if (NULL != orte_iof.complete) {
|
||||
orte_iof.complete(jdata);
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -26,7 +27,6 @@
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -26,7 +27,6 @@
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,7 +28,6 @@
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
@ -1,4 +1,4 @@
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier opal_interface orte_spin segfault orte_exit orte_db orte_sensor test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 mapper reducer opal_hotel orte_dfs orte_allocate pmi_abort opal_db
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier opal_interface orte_spin segfault orte_exit orte_db test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 mapper reducer opal_hotel orte_dfs orte_allocate pmi_abort opal_db
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
|
@ -44,7 +44,6 @@ EXTRA_DIST += \
|
||||
test/system/sigusr_trap.c \
|
||||
test/system/spawn_child.c \
|
||||
test/system/spin.c \
|
||||
test/system/orte_sensor.c \
|
||||
test/system/mapper.c \
|
||||
test/system/mapr.py \
|
||||
test/system/reducer.c \
|
||||
|
@ -1,50 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int rc;
|
||||
char hostname[512];
|
||||
pid_t pid;
|
||||
|
||||
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
|
||||
fprintf(stderr, "orte_mcast: couldn't init orte - error code %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
gethostname(hostname, 512);
|
||||
pid = getpid();
|
||||
|
||||
printf("orte_sensor: Node %s Name %s Pid %ld\n",
|
||||
hostname, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid);
|
||||
|
||||
/* open and select the sensor modules */
|
||||
orte_sensor_base_open();
|
||||
orte_sensor_base_select();
|
||||
|
||||
/* start the sensors - note that we cannot monitor other
|
||||
* jobs as we are an application. So pass the invalid
|
||||
* jobid so the sensor modules can know
|
||||
*/
|
||||
orte_sensor.start(ORTE_JOBID_INVALID);
|
||||
|
||||
/* just sit here, letting the sensors run */
|
||||
while (orte_event_base_active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
|
||||
orte_finalize();
|
||||
return 0;
|
||||
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user