1
1

Per the RFC, remove the sensor framework from the ORTE code area, relocating it offsite to the ORCM code area. Also update some ignores to ensure we don't pick up crosstalk in components.

This commit was SVN r31403.
This commit is contained in:
Ralph Castain 2014-04-15 21:48:24 +00:00
parent 8897e2f5bb
commit a368e84e70
69 changed files with 7 additions and 6360 deletions

View File

@ -40,7 +40,6 @@
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/ess/ess.h"
@ -474,24 +473,6 @@ static void proc_errors(int fd, short args, void *cbdata)
default_hnp_abort(jdata);
break;
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:hnp: proc %s exceeded sensor boundary",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED;
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = pptr;
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
}
/* abnormal termination - abort */
default_hnp_abort(jdata);
break;
case ORTE_PROC_STATE_TERM_NON_ZERO:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:hnp: proc %s exited with non-zero status %d",
@ -614,9 +595,6 @@ static void default_hnp_abort(orte_job_t *jdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
/* the job aborted - turn off any sensors on this job */
orte_sensor.stop(jdata->jobid);
/* set control params to indicate we are terminating */
orte_job_term_ordered = true;
orte_enable_recovery = false;

View File

@ -8,6 +8,7 @@
* reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -41,7 +42,6 @@
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/state/state.h"
@ -96,9 +96,6 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child);
static bool all_children_registered(orte_jobid_t job);
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf);
static void failed_start(orte_job_t *jobdat);
static void update_local_children(orte_job_t *jobdat,
orte_job_state_t jobstate,
orte_proc_state_t state);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static void job_errors(int fd, short args, void *cbdata);
@ -168,12 +165,6 @@ static void job_errors(int fd, short args, void *cbdata)
case ORTE_JOB_STATE_FAILED_TO_START:
failed_start(jdata);
break;
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
/* update all procs in job */
update_local_children(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
break;
case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
@ -341,15 +332,6 @@ static void proc_errors(int fd, short args, void *cbdata)
orte_proc_state_to_str(state),
ORTE_NAME_PRINT(proc)));
if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
child->state = state;
/* Decrement the number of local procs */
jdata->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid);
goto cleanup;
}
if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
if (!orte_abort_non_zero_exit) {
/* leave the child in orte_local_children so we can
@ -791,36 +773,12 @@ static void failed_start(orte_job_t *jobdat)
return;
}
/*
 * Record a new state on a job and on every local child that belongs to it.
 *
 * jobdat   - job whose state field is overwritten with jobstate
 * jobstate - state stored on the job object
 * state    - state stored on each entry of orte_local_children whose
 *            jobid matches jobdat->jobid
 */
static void update_local_children(orte_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state)
{
    orte_proc_t *kid;
    int idx;

    /* the job itself goes first */
    jobdat->state = jobstate;

    /* then walk the local child array, skipping holes and other jobs */
    for (idx = 0; idx < orte_local_children->size; ++idx) {
        kid = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);
        if (NULL != kid && jobdat->jobid == kid->name.jobid) {
            kid->state = state;
        }
    }
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
int rc;
/* stop local sensors for this job */
if (ORTE_VPID_WILDCARD == vpid) {
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job
&& ORTE_VPID_WILDCARD == vpid) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {

View File

@ -66,8 +66,6 @@
#include "orte/util/regex.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/state/base/base.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_cr.h"
@ -596,20 +594,6 @@ int orte_ess_base_orted_setup(char **hosts)
goto error;
}
/* setup the SENSOR framework */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sensor_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_select";
goto error;
}
/* start the local sensors */
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* setup the DFS framework */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
@ -634,10 +618,6 @@ int orte_ess_base_orted_setup(char **hosts)
int orte_ess_base_orted_finalize(void)
{
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
(void) mca_base_framework_close(&orte_sensor_base_framework);
if (signals_set) {
/* Release all local signal handlers */
opal_event_del(&epipe_handler);

View File

@ -13,7 +13,7 @@
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -64,8 +64,6 @@
#include "orte/mca/plm/base/base.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
@ -711,20 +709,6 @@ static int rte_init(void)
goto error;
}
/* setup the SENSOR framework */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sensor_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_select";
goto error;
}
/* start the local sensors */
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* setup the dfs framework */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
@ -799,10 +783,6 @@ static int rte_finalize(void)
signals_set = false;
}
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
(void) mca_base_framework_close(&orte_sensor_base_framework);
/* close the dfs */
(void) mca_base_framework_close(&orte_dfs_base_framework);
(void) mca_base_framework_close(&orte_filem_base_framework);

View File

@ -64,7 +64,6 @@
#include "orte/mca/plm/base/base.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/state/state.h"
#include "orte/mca/filem/filem.h"
@ -1606,9 +1605,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
"%s odls:launch setting waitpids",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* start the sensors for this job (if any) */
orte_sensor.start(jobdat->jobid);
/* setup the waitpids on the children that started */
for (idx=0; idx < orte_local_children->size; idx++) {
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {

View File

@ -60,7 +60,6 @@
#include "orte/mca/filem/base/base.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_locks.h"

View File

@ -1,30 +0,0 @@
#
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_sensor.la
libmca_sensor_la_SOURCES =
# local files
headers = sensor.h \
sensor_types.h
libmca_sensor_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ortedir = $(ompiincludedir)/$(subdir)
nobase_orte_HEADERS = $(headers)
endif
include base/Makefile.am
distclean-local:
rm -f base/static-components.h

View File

@ -1,19 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h \
base/sensor_private.h
libmca_sensor_la_SOURCES += \
base/sensor_base_frame.c \
base/sensor_base_select.c \
base/sensor_base_fns.c

View File

@ -1,38 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_BASE_H
#define MCA_SENSOR_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/base/base.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
* MCA Framework
*/
ORTE_DECLSPEC extern mca_base_framework_t orte_sensor_base_framework;
/* select a component */
ORTE_DECLSPEC int orte_sensor_base_select(void);
END_C_DECLS
#endif

View File

@ -1,158 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
static bool mods_active = false;
/*
 * Start every selected sensor module for the given job and, the first
 * time any module is present, arm the periodic sample timer.
 *
 * Nothing happens unless the configured sample rate is at least one
 * second (orte_sensor_base.rate.tv_sec > 0).
 */
void orte_sensor_base_start(orte_jobid_t job)
{
    orte_sensor_active_module_t *mod;
    int n;

    /* no positive sample rate - sensors stay idle */
    if (orte_sensor_base.rate.tv_sec <= 0) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: starting sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* walk the module array in its stored (priority) order */
    for (n = 0; n < orte_sensor_base.modules.size; n++) {
        mod = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, n);
        if (NULL == mod) {
            continue;
        }
        /* at least one module exists, remember that for stop/sample */
        mods_active = true;
        if (NULL != mod->module->start) {
            mod->module->start(job);
        }
    }

    /* the timer is armed only once, and only if there is something to sample */
    if (!mods_active || orte_sensor_base.active) {
        return;
    }

    /* buffer that will collect the samples */
    orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
    /* wake up periodically to take a data sample */
    orte_sensor_base.active = true;
    opal_event_evtimer_set(orte_event_base, &orte_sensor_base.sample_ev,
                           orte_sensor_base_sample, NULL);
    opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
}
/*
 * Cancel the periodic sample timer (if armed) and invoke the stop
 * entry point of every selected sensor module for the given job.
 *
 * Does nothing when no module was ever started.
 */
void orte_sensor_base_stop(orte_jobid_t job)
{
    orte_sensor_active_module_t *mod;
    int n;

    if (!mods_active) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: stopping sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* disarm the sample timer before touching the modules */
    if (orte_sensor_base.active) {
        opal_event_del(&orte_sensor_base.sample_ev);
        orte_sensor_base.active = false;
    }

    /* walk the module array in its stored (priority) order */
    for (n = 0; n < orte_sensor_base.modules.size; n++) {
        mod = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, n);
        if (NULL != mod && NULL != mod->module->stop) {
            mod->module->stop(job);
        }
    }
}
void orte_sensor_base_sample(int fd, short args, void *cbdata)
{
orte_sensor_active_module_t *i_module;
int i;
if (!mods_active) {
return;
}
/* see if we were ordered to stop */
if (!orte_sensor_base.active) {
return;
}
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"%s sensor:base: sampling sensors",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* call the sample function of all modules in priority order from
* highest to lowest - the heartbeat should always be the lowest
* priority, so it will send any collected data
*/
for (i=0; i < orte_sensor_base.modules.size; i++) {
if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) {
continue;
}
if (NULL != i_module->module->sample) {
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"%s sensor:base: sampling component %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
i_module->component->base_version.mca_component_name);
i_module->module->sample();
}
}
/* restart the timer */
opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
return;
}
/*
 * Hand a data buffer to the log entry point of the sensor module whose
 * component name equals comp.
 *
 * comp - component name to look up; NULL is silently ignored
 * data - buffer passed through unchanged to the module's log function
 *
 * Only the first module with a matching name is considered; if it has
 * no log function the call is a no-op.
 */
void orte_sensor_base_log(char *comp, opal_buffer_t *data)
{
    orte_sensor_active_module_t *mod;
    int n;

    /* without a component name there is nothing to dispatch on */
    if (NULL == comp) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: logging sensor %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp);

    /* locate the module that belongs to the named component */
    for (n = 0; n < orte_sensor_base.modules.size; n++) {
        mod = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, n);
        if (NULL == mod ||
            0 != strcmp(comp, mod->component->base_version.mca_component_name)) {
            continue;
        }
        /* first match wins, whether or not it can log */
        if (NULL != mod->module->log) {
            mod->module->log(data);
        }
        return;
    }
}

View File

@ -1,131 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_pointer_array.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/sensor/base/static-components.h"
/*
* Global variables
*/
orte_sensor_base_API_module_t orte_sensor = {
orte_sensor_base_start,
orte_sensor_base_stop
};
orte_sensor_base_t orte_sensor_base;
/*
* Local variables
*/
static int orte_sensor_base_sample_rate = 0;
/*
 * Register the framework-level MCA variables:
 *   - "sample_rate" ("Sample rate in seconds"), default 0
 *   - "log_samples" ("Log samples to database"), default false
 * Each variable also gets a deprecated synonym without the "base"
 * component name.
 *
 * flags is not used. Always returns ORTE_SUCCESS.
 */
static int orte_sensor_base_register(mca_base_register_flag_t flags)
{
    int idx;

    /* defaults must be in place before the variables are registered */
    orte_sensor_base_sample_rate = 0;
    orte_sensor_base.log_samples = false;

    /* sampling period */
    idx = mca_base_var_register("orte", "sensor", "base", "sample_rate",
                                "Sample rate in seconds",
                                MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                OPAL_INFO_LVL_9,
                                MCA_BASE_VAR_SCOPE_READONLY,
                                &orte_sensor_base_sample_rate);
    mca_base_var_register_synonym(idx, "orte", "sensor", NULL, "sample_rate",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    /* whether samples should be logged */
    idx = mca_base_var_register("orte", "sensor", "base", "log_samples",
                                "Log samples to database",
                                MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                OPAL_INFO_LVL_9,
                                MCA_BASE_VAR_SCOPE_READONLY,
                                &orte_sensor_base.log_samples);
    mca_base_var_register_synonym(idx, "orte", "sensor", NULL, "log_samples",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    return ORTE_SUCCESS;
}
/*
 * Framework close hook: finalize every selected module, tear down the
 * module array and close the remaining components.
 *
 * Returns whatever mca_base_framework_components_close() returns.
 */
static int orte_sensor_base_close(void)
{
    orte_sensor_active_module_t *mod;
    int n;

    /* give each module that has a finalize entry point a chance to clean up */
    for (n = 0; n < orte_sensor_base.modules.size; n++) {
        mod = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, n);
        if (NULL != mod && NULL != mod->module->finalize) {
            mod->module->finalize();
        }
    }
    OBJ_DESTRUCT(&orte_sensor_base.modules);

    /* Close all remaining available components */
    return mca_base_framework_components_close(&orte_sensor_base_framework, NULL);
}
/**
 * Framework open hook.
 *
 * Resets the framework globals, converts the registered sample rate
 * (whole seconds) into the timer interval, prepares the module array
 * and then opens all available components.
 *
 * Returns whatever mca_base_framework_components_open() returns.
 */
static int orte_sensor_base_open(mca_base_open_flag_t flags)
{
    /* sampling is off until orte_sensor_base_start() arms the timer */
    orte_sensor_base.active = false;

    /* sample interval: whole seconds only */
    orte_sensor_base.rate.tv_usec = 0;
    orte_sensor_base.rate.tv_sec = orte_sensor_base_sample_rate;

    /* array that will hold the selected modules */
    OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t);
    opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1);

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_sensor_base_framework, flags);
}
MCA_BASE_FRAMEWORK_DECLARE(orte, sensor, "ORTE Monitoring Sensors",
orte_sensor_base_register,
orte_sensor_base_open, orte_sensor_base_close,
mca_sensor_base_static_components, 0);
/* Constructor for orte_sensor_active_module_t: a new entry is assumed
 * to be able to sample until something clears the flag. */
static void cons(orte_sensor_active_module_t *mod)
{
    mod->sampling = true;
}
OBJ_CLASS_INSTANCE(orte_sensor_active_module_t,
opal_object_t,
cons, NULL);

View File

@ -1,218 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
static bool selected = false;
/**
* Function for weeding out sensor components that don't want to run.
*
* Query every available component for a module and a priority. A
* component is skipped when it has no query function, returns no module,
* or reports a negative priority. When two components report the same
* data_measured string, only the higher-priority one is kept.
*
* The surviving modules are stored in orte_sensor_base.modules in
* descending priority order and their init functions are then called.
* A module whose init fails is dropped, unless this process is the HNP
* or an aggregator, in which case it is kept with sampling set to false.
*
* Selection runs only once per process; later calls return immediately.
*
* @return ORTE_SUCCESS in all cases, including when no module is found.
*/
int orte_sensor_base_select(void)
{
mca_base_component_list_item_t *cli = NULL;
orte_sensor_base_component_t *component = NULL;
mca_base_module_t *module = NULL;
orte_sensor_active_module_t *i_module;
int priority = 0, i, j, low_i;
opal_pointer_array_t tmp_array;
bool none_found;
orte_sensor_active_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
bool duplicate;
/* guard against repeated selection */
if (selected) {
return ORTE_SUCCESS;
}
selected = true;
/* unsorted staging area for the candidates found below */
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
opal_output_verbose(10, orte_sensor_base_framework.framework_output,
"sensor:base:select: Auto-selecting components");
/*
* Traverse the list of available components.
* For each call their 'query' functions to determine relative priority.
*/
none_found = true;
OPAL_LIST_FOREACH(cli, &orte_sensor_base_framework.framework_components, mca_base_component_list_item_t) {
component = (orte_sensor_base_component_t *) cli->cli_component;
/*
* If there is a query function then use it.
*/
if (NULL == component->base_version.mca_query_component) {
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"sensor:base:select Skipping component [%s]. It does not implement a query function",
component->base_version.mca_component_name );
continue;
}
/*
* Query this component for the module and priority
*/
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"sensor:base:select Querying component [%s]",
component->base_version.mca_component_name);
/* NOTE(review): the return value of the query is ignored; only the
* module pointer and priority it writes are examined */
component->base_version.mca_query_component(&module, &priority);
/*
* If no module was returned or negative priority, then skip component
*/
if (NULL == module || priority < 0) {
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"sensor:base:select Skipping component [%s]. Query failed to return a module",
component->base_version.mca_component_name );
continue;
}
/* check to see if we already have someone who senses the
* same things - if so, take the higher priority one
*/
duplicate = false;
for (i=0; i < tmp_array.size; i++) {
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
if (NULL == tmp_module) {
continue;
}
if (0 == strcmp(component->data_measured, tmp_module->component->data_measured)) {
if (tmp_module->priority < priority) {
/* the new component outranks the stored one: drop the stored
* entry and stop scanning so the new one is appended below */
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"sensor:base:select Replacing component %s with %s - both measure %s",
tmp_module->component->base_version.mca_component_name,
component->base_version.mca_component_name,
component->data_measured);
OBJ_RELEASE(tmp_module);
opal_pointer_array_set_item(&tmp_array, i, NULL);
break;
} else {
/* the stored entry has equal or higher priority: keep it */
duplicate = true;
}
}
}
if (duplicate) {
/* ignore this component */
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"sensor:base:select Ignoring component %s - duplicate with higher priority measures %s",
component->base_version.mca_component_name,
component->data_measured);
continue;
}
/*
* Append them to the temporary list, we will sort later
*/
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"sensor:base:select Query of component [%s] set priority to %d",
component->base_version.mca_component_name, priority);
tmp_module = OBJ_NEW(orte_sensor_active_module_t);
tmp_module->component = component;
tmp_module->module = (orte_sensor_base_module_t*)module;
tmp_module->priority = priority;
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
none_found = false;
}
if (none_found) {
/* okay for no modules to be found */
/* NOTE(review): tmp_array is not destructed on this path */
return ORTE_SUCCESS;
}
/*
* Sort the list by descending priority
*
* This is a selection sort that drains tmp_array: each pass moves the
* highest-priority remaining entry into orte_sensor_base.modules and
* clears its slot in tmp_array.
*/
priority = 0;
for(j = 0; j < tmp_array.size; ++j) {
tmp_module_sw = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, j);
if( NULL == tmp_module_sw ) {
continue;
}
/* look for any remaining entry that outranks slot j; despite its
* name, low_i ends up as the index of the highest-priority entry */
low_i = -1;
priority = tmp_module_sw->priority;
for(i = 0; i < tmp_array.size; ++i) {
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
if( NULL == tmp_module ) {
continue;
}
if( tmp_module->priority > priority ) {
low_i = i;
priority = tmp_module->priority;
}
}
if( low_i >= 0 ) {
/* something outranks slot j: emit that entry and revisit slot j */
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
j--; /* Try this entry again, if it is not the lowest */
} else {
/* slot j is the highest remaining entry: emit it */
tmp_module = tmp_module_sw;
opal_pointer_array_set_item(&tmp_array, j, NULL);
}
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"sensor:base:select Add module with priority [%s] %d",
tmp_module->component->base_version.mca_component_name, tmp_module->priority);
opal_pointer_array_add(&orte_sensor_base.modules, tmp_module);
}
OBJ_DESTRUCT(&tmp_array);
/*
* Initialize each of the modules in priority order from
* highest to lowest
*/
for(i = 0; i < orte_sensor_base.modules.size; ++i) {
i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i);
if( NULL == i_module ) {
continue;
}
if( NULL != i_module->module->init ) {
if (ORTE_SUCCESS != i_module->module->init()) {
/* can't sample - however, if we are the HNP
* or an aggregator, then we need this module
* anyway so we can log incoming data
*/
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
i_module->sampling = false;
} else {
/* remove the failed module from the active array and free it */
opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
OBJ_RELEASE(i_module);
}
}
}
}
return ORTE_SUCCESS;
}

View File

@ -1,67 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_PRIVATE_H
#define MCA_SENSOR_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/event/event.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/sensor.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
/* define a struct to hold framework-global values */
typedef struct {
/* selected modules (orte_sensor_active_module_t*) */
opal_pointer_array_t modules;
/* value of the "log_samples" MCA variable ("Log samples to database") */
bool log_samples;
/* true while the periodic sample timer is armed */
bool active;
/* interval between samples; tv_sec comes from the "sample_rate" MCA variable */
struct timeval rate;
/* timer event whose callback is orte_sensor_base_sample */
opal_event_t sample_ev;
/* buffer allocated when sampling starts, used to collect samples */
opal_buffer_t *samples;
} orte_sensor_base_t;
/* One selected sensor module together with the component that supplied it */
typedef struct {
/* base object - instances are created/freed with OBJ_NEW/OBJ_RELEASE */
opal_object_t super;
/* component that was queried for this module */
orte_sensor_base_component_t *component;
/* module returned by the component's query function */
orte_sensor_base_module_t *module;
/* priority reported by the component's query function */
int priority;
/* true by default; cleared when the module's init fails but the module
* is kept anyway (HNP or aggregator) */
bool sampling;
} orte_sensor_active_module_t;
OBJ_CLASS_DECLARATION(orte_sensor_active_module_t);
ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base;
ORTE_DECLSPEC void orte_sensor_base_start(orte_jobid_t job);
ORTE_DECLSPEC void orte_sensor_base_stop(orte_jobid_t job);
ORTE_DECLSPEC void orte_sensor_base_sample(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_sensor_base_log(char *comp, opal_buffer_t *data);
END_C_DECLS
#endif

View File

@ -1,37 +0,0 @@
#
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_ompidata_DATA = help-orte-sensor-coretemp.txt
sources = \
sensor_coretemp.c \
sensor_coretemp.h \
sensor_coretemp_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_coretemp_DSO
component_noinst =
component_install = mca_sensor_coretemp.la
else
component_noinst = libmca_sensor_coretemp.la
component_install =
endif
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_coretemp_la_SOURCES = $(sources)
mca_sensor_coretemp_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_coretemp_la_SOURCES =$(sources)
libmca_sensor_coretemp_la_LDFLAGS = -module -avoid-version

View File

@ -1,30 +0,0 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2014 Intel, Inc. All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl
# MCA_sensor_coretemp_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_coretemp_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/coretemp/Makefile])
AC_ARG_WITH([coretemp],
[AC_HELP_STRING([--with-coretemp],
[Build coretemp support (default: no)])],
[], with_coretemp=no)
# do not build if support not requested
AS_IF([test "$with_coretemp" != "no"],
[AS_IF([test "$opal_found_linux" = "yes"],
[$1],
[AC_MSG_WARN([Core temperature sensing was requested but is only supported on Linux systems])
AC_MSG_ERROR([Cannot continue])
$2])
],
[$2])
])dnl

View File

@ -1,33 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2014 Intel, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file
#
[req-dir-not-found]
Core temperature monitoring was requested, but this node
lacks the required directory:
Node: %s
Directory: %s
This usually indicates that the "coretemp" kernel module
has not been loaded. Operation will continue, but core
temperatures will not be monitored.
#
[no-cores-found]
Core temperature monitoring was requested, but this node
does not appear to have the required core-level files, or
you lack authority to access them:
Node: %s
This usually indicates that the "coretemp" kernel module
has not been loaded. Operation will continue, but core
temperatures will not be monitored.

View File

@ -1,453 +0,0 @@
/*
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#include "opal_stdint.h"
#include "opal/class/opal_list.h"
#include "opal/dss/dss.h"
#include "opal/util/os_path.h"
#include "opal/util/output.h"
#include "opal/util/os_dirpath.h"
#include "opal/mca/db/db.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_coretemp.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
static void coretemp_sample(void);
static void coretemp_log(opal_buffer_t *buf);
/* instantiate the module */
orte_sensor_base_module_t orte_sensor_coretemp_module = {
init,
finalize,
start,
stop,
coretemp_sample,
coretemp_log
};
/* Bookkeeping for one temperature input file found under a coretemp
* device directory */
typedef struct {
/* list linkage - trackers are kept on the "tracking" list */
opal_list_item_t super;
/* full path of the temp*_input file that is read for each sample */
char *file;
/* index of the coretemp device directory this file was found in
* (counts opened directories starting from 0) */
int socket;
/* first line of the matching temp*_label file */
char *label;
/* value read from the matching temp*_crit file, divided by 100.0 */
float critical_temp;
/* value read from the matching temp*_max file, divided by 100.0 */
float max_temp;
} coretemp_tracker_t;
/* Constructor: start with no owned strings so the destructor can
 * release them unconditionally. */
static void ctr_con(coretemp_tracker_t *trk)
{
    trk->label = NULL;
    trk->file = NULL;
}
/* Destructor: release the strings owned by the tracker.
 * free(NULL) is a no-op, so unset fields need no special handling. */
static void ctr_des(coretemp_tracker_t *trk)
{
    free(trk->file);
    free(trk->label);
}
OBJ_CLASS_INSTANCE(coretemp_tracker_t,
opal_list_item_t,
ctr_con, ctr_des);
static bool log_enabled = true;
static opal_list_t tracking;
/*
 * Read the next line from fp into a freshly allocated string.
 *
 * At most 1023 characters are consumed per call. A trailing newline is
 * stripped if (and only if) one was read, so a final line that lacks a
 * newline is returned intact.
 *
 * Returns NULL when fp is NULL, at end-of-file or on a read error, or
 * when memory cannot be allocated. The caller owns the returned string
 * and must free() it.
 */
static char *orte_getline(FILE *fp)
{
    char input[1024];
    char *buff;
    size_t len;

    /* callers may hand us the result of a failed fopen() */
    if (NULL == fp) {
        return NULL;
    }
    if (NULL == fgets(input, (int)sizeof(input), fp)) {
        return NULL;
    }

    /* length up to, but not including, the newline (if there is one) */
    len = strcspn(input, "\n");

    buff = malloc(len + 1);
    if (NULL != buff) {
        memcpy(buff, input, len);
        buff[len] = '\0';
    }
    return buff;
}
/* FOR FUTURE: extend to read cooling device speeds in
* current speed: /sys/class/thermal/cooling_deviceN/cur_state
* max speed: /sys/class/thermal/cooling_deviceN/max_state
* type: /sys/class/thermal/cooling_deviceN/type
*/
/*
 * Read the first line of the file "<dir>/<prefix>_<suffix>".
 *
 * Returns a heap-allocated string the caller must free(), or NULL if the
 * path cannot be built, the file cannot be opened, or nothing was read.
 */
static char *coretemp_read_attr(const char *dir, const char *prefix, const char *suffix)
{
    char *path = NULL;
    char *line;
    FILE *fp;

    if (0 > asprintf(&path, "%s/%s_%s", dir, prefix, suffix)) {
        return NULL;
    }
    fp = fopen(path, "r");
    free(path);
    if (NULL == fp) {
        return NULL;
    }
    line = orte_getline(fp);
    fclose(fp);
    return line;
}

/*
 * Module init: discover the core temperature files on this node.
 *
 * Scans /sys/bus/platform/devices for directories whose name starts with
 * "coretemp" and, inside each, for files named temp*_input. For every
 * such file a tracker is appended to the "tracking" list holding the
 * file path, the directory index ("socket"), and the contents of the
 * matching _label, _crit and _max files.
 *
 * A sensor whose label file cannot be read is skipped. A missing or
 * unreadable _crit/_max file leaves the corresponding limit at 0.
 *
 * Returns ORTE_SUCCESS if at least one sensor was found. Returns
 * ORTE_ERROR (after showing a help message) if the base directory cannot
 * be opened or no sensor was found. The tracking list is left constructed
 * on every path so that finalize() can always destruct it.
 */
static int init(void)
{
    DIR *cur_dirp = NULL, *tdir;
    struct dirent *dir_entry, *entry;
    char *dirname, *ptr, *tmp;
    size_t tlen = strlen("temp");
    size_t ilen = strlen("_input");
    coretemp_tracker_t *trk;
    int socket;

    /* always construct this so we don't segfault in finalize */
    OBJ_CONSTRUCT(&tracking, opal_list_t);

    /*
     * Open up the base directory so we can get a listing
     */
    if (NULL == (cur_dirp = opendir("/sys/bus/platform/devices"))) {
        /* the list stays constructed (and empty): finalize() destructs it */
        orte_show_help("help-orte-sensor-coretemp.txt", "req-dir-not-found",
                       true, orte_process_info.nodename,
                       "/sys/bus/platform/devices");
        return ORTE_ERROR;
    }

    /*
     * For each directory
     */
    socket = 0;
    while (NULL != (dir_entry = readdir(cur_dirp))) {
        /* look for coretemp directories */
        if (0 != strncmp(dir_entry->d_name, "coretemp", strlen("coretemp"))) {
            continue;
        }

        /* open that directory */
        dirname = opal_os_path(false, "/sys/bus/platform/devices", dir_entry->d_name, NULL );
        if (NULL == dirname) {
            continue;
        }
        if (NULL == (tdir = opendir(dirname))) {
            free(dirname);
            continue;
        }

        while (NULL != (entry = readdir(tdir))) {
            /*
             * Skip the obvious
             */
            if (0 == strncmp(entry->d_name, ".", strlen(".")) ||
                0 == strncmp(entry->d_name, "..", strlen(".."))) {
                continue;
            }
            if (strlen(entry->d_name) < (tlen+ilen)) {
                /* cannot be a core temp file */
                continue;
            }
            /*
             * See if this is a core temp file
             */
            if (0 != strncmp(entry->d_name, "temp", strlen("temp"))) {
                continue;
            }
            if (0 != strcmp(entry->d_name + strlen(entry->d_name) - ilen, "_input")) {
                continue;
            }

            /* take the part up to the first underscore as this will
             * be used as the start of all the related files
             */
            if (NULL == (tmp = strdup(entry->d_name))) {
                continue;
            }
            if (NULL == (ptr = strchr(tmp, '_'))) {
                /* unrecognized format */
                free(tmp);
                continue;
            }
            *ptr = '\0';

            /* track the info for this core */
            trk = OBJ_NEW(coretemp_tracker_t);
            if (NULL == trk) {
                free(tmp);
                continue;
            }
            trk->socket = socket;
            trk->critical_temp = 0.0;
            trk->max_temp = 0.0;
            trk->file = opal_os_path(false, dirname, entry->d_name, NULL);
            trk->label = coretemp_read_attr(dirname, tmp, "label");
            if (NULL == trk->file || NULL == trk->label) {
                /* without a path or a label the sensor is unusable;
                 * the destructor frees whichever string was obtained */
                OBJ_RELEASE(trk);
                free(tmp);
                continue;
            }

            /* look for critical and max info - both are optional.
             * NOTE(review): the Linux hwmon sysfs interface documents
             * these files as millidegrees Celsius, which would call for
             * dividing by 1000; the original divisor is kept here -
             * confirm against the sampling code before changing it.
             */
            if (NULL != (ptr = coretemp_read_attr(dirname, tmp, "crit"))) {
                trk->critical_temp = strtol(ptr, NULL, 10)/100.0;
                free(ptr);
            }
            if (NULL != (ptr = coretemp_read_attr(dirname, tmp, "max"))) {
                trk->max_temp = strtol(ptr, NULL, 10)/100.0;
                free(ptr);
            }

            /* add to our list */
            opal_list_append(&tracking, &trk->super);
            /* cleanup */
            free(tmp);
        }
        closedir(tdir);
        free(dirname);
        socket++;
    }
    closedir(cur_dirp);

    if (0 == opal_list_get_size(&tracking)) {
        /* nothing to read */
        orte_show_help("help-orte-sensor-coretemp.txt", "no-cores-found",
                       true, orte_process_info.nodename);
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
/*
 * Component shutdown hook: hands the file-level `tracking` list to
 * OPAL_LIST_DESTRUCT. Takes no arguments and returns nothing.
 */
static void finalize(void)
{
    /* the tracker list is the only state this function touches */
    OPAL_LIST_DESTRUCT(&tracking);
}
/*
* Start monitoring of local temps
*/
static void start(orte_jobid_t jobid)
{
    /* Intentionally a no-op: the body performs no work and the job id
     * is ignored. */
    (void)jobid;
}
/*
 * Stop hook for the given job. Intentionally a no-op: the body performs
 * no work and the job id is ignored.
 */
static void stop(orte_jobid_t jobid)
{
    (void)jobid;
}
static void coretemp_sample(void)
{
int ret;
coretemp_tracker_t *trk, *nxt;
FILE *fp;
char *temp;
float degc;
opal_buffer_t data, *bptr;
int32_t ncores;
time_t now;
char time_str[40];
char *timestamp_str;
bool packed;
if (0 == opal_list_get_size(&tracking)) {
return;
}
/* prep to store the results */
OBJ_CONSTRUCT(&data, opal_buffer_t);
packed = false;
/* pack our name */
temp = strdup("coretemp");
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
return;
}
free(temp);
/* store our hostname */
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
return;
}
/* store the number of cores */
ncores = (int32_t)opal_list_get_size(&tracking);
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &ncores, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
return;
}
/* get the sample time */
now = time(NULL);
/* pass the time along as a simple string */
strftime(time_str, sizeof(time_str), "%F %T%z", localtime(&now));
asprintf(&timestamp_str, "%s", time_str);
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &timestamp_str, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
free(timestamp_str);
return;
}
free(timestamp_str);
OPAL_LIST_FOREACH_SAFE(trk, nxt, &tracking, coretemp_tracker_t) {
/* read the temp */
if (NULL == (fp = fopen(trk->file, "r"))) {
/* we can't be read, so remove it from the list */
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s access denied to coretemp file %s - removing it",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->file);
opal_list_remove_item(&tracking, &trk->super);
OBJ_RELEASE(trk);
continue;
}
while (NULL != (temp = orte_getline(fp))) {
degc = strtoul(temp, NULL, 10) / 100.0;
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"%s sensor:coretemp: Socket %d %s temp %f max %f critical %f",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->socket, trk->label, degc, trk->max_temp, trk->critical_temp);
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &degc, 1, OPAL_FLOAT))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
free(temp);
return;
}
free(temp);
packed = true;
/* check for exceed critical temp */
if (trk->critical_temp < degc) {
/* alert the errmgr - this is a critical problem */
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"%s sensor:coretemp: Socket %d %s CRITICAL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->socket, trk->label);
} else if (trk->max_temp < degc) {
/* alert the errmgr */
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
"%s sensor:coretemp: Socket %d %s MAX",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->socket, trk->label);
}
}
fclose(fp);
}
/* xfer the data for transmission */
if (packed) {
bptr = &data;
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
return;
}
}
OBJ_DESTRUCT(&data);
}
static void coretemp_log(opal_buffer_t *sample)
{
char *hostname=NULL;
char *sampletime;
int rc;
int32_t n, ncores;
opal_value_t *kv=NULL;
float fval;
int i;
if (!log_enabled) {
return;
}
/* unpack the host this came from */
n=