1
1

Ensure we build the sensor components even if the local system doesn't have the required directories and/or access permissions. Backend nodes that get the binary may have them, and aggregators need to load the component so they can log data even if they aren't locally monitoring. Detect that we can't access the required files when we first try to sample and turn the sampling portion of the plugin off at that time.

Refs trac:4172

This commit was SVN r30426.

The following Trac tickets were found above:
  Ticket 4172 --> https://svn.open-mpi.org/trac/ompi/ticket/4172
Этот коммит содержится в:
Ralph Castain 2014-01-25 04:34:33 +00:00
родитель 967550b3ac
Коммит 11562ab7cb
12 изменённых файлов: 207 добавлений и 120 удалений

Просмотреть файл

@ -124,6 +124,10 @@ MCA_BASE_FRAMEWORK_DECLARE(orte, sensor, "ORTE Monitoring Sensors",
orte_sensor_base_open, orte_sensor_base_close,
mca_sensor_base_static_components, 0);
/*
 * Constructor for orte_sensor_active_module_t (handed to
 * OBJ_CLASS_INSTANCE): a freshly created active-module entry
 * starts out with its sampling flag turned on.
 */
static void cons(orte_sensor_active_module_t *active)
{
    active->sampling = true;
}
OBJ_CLASS_INSTANCE(orte_sensor_active_module_t,
opal_object_t,
NULL, NULL);
cons, NULL);

Просмотреть файл

@ -219,8 +219,16 @@ int orte_sensor_base_select(void)
}
if( NULL != i_module->module->init ) {
if (ORTE_SUCCESS != i_module->module->init()) {
/* can't run after all */
opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
/* can't sample - however, if we are the HNP
* or an aggregator, then we need this module
* anyway so we can log incoming data
*/
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
i_module->sampling = false;
} else {
opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
OBJ_RELEASE(i_module);
}
}
}
}

Просмотреть файл

@ -54,6 +54,7 @@ typedef struct {
orte_sensor_base_component_t *component;
orte_sensor_base_module_t *module;
int priority;
bool sampling;
} orte_sensor_active_module_t;
OBJ_CLASS_DECLARATION(orte_sensor_active_module_t);

Просмотреть файл

@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
#
# $COPYRIGHT$
#
@ -8,13 +8,26 @@
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
# This is the US/English general help file
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
[req-dir-not-found]
Core temperature monitoring was requested, but this node
lacks the required directory:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes
Node: %s
Directory: %s
This usually indicates that the "coretemp" kernel module
has not been loaded. Operation will continue, but core
temperatures will not be monitored.
#
[no-cores-found]
Core temperature monitoring was requested, but this node
does not appear to have the required core-level files, or
you lack authority to access them:
Node: %s
This usually indicates that the "coretemp" kernel module
has not been loaded. Operation will continue, but core
temperatures will not be monitored.

Просмотреть файл

@ -35,6 +35,7 @@
#include "opal/mca/db/db.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
@ -120,6 +121,7 @@ static int init(void)
coretemp_tracker_t *trk;
int socket;
/* always construct this so we don't segfault in finalize */
OBJ_CONSTRUCT(&tracking, opal_list_t);
/*
@ -127,6 +129,9 @@ static int init(void)
*/
if (NULL == (cur_dirp = opendir("/sys/bus/platform/devices"))) {
OBJ_DESTRUCT(&tracking);
orte_show_help("help-orte-sensor-coretemp.txt", "req-dir-not-found",
true, orte_process_info.nodename,
"/sys/bus/platform/devices");
return ORTE_ERROR;
}
@ -217,6 +222,8 @@ static int init(void)
if (0 == opal_list_get_size(&tracking)) {
/* nothing to read */
orte_show_help("help-orte-sensor-coretemp.txt", "no-cores-found",
true, orte_process_info.nodename);
return ORTE_ERROR;
}
@ -245,7 +252,7 @@ static void stop(orte_jobid_t jobid)
static void coretemp_sample(void)
{
int ret;
coretemp_tracker_t *trk;
coretemp_tracker_t *trk, *nxt;
FILE *fp;
char *temp;
float degc;
@ -256,6 +263,10 @@ static void coretemp_sample(void)
char *timestamp_str;
bool packed;
if (0 == opal_list_get_size(&tracking)) {
return;
}
/* prep to store the results */
OBJ_CONSTRUCT(&data, opal_buffer_t);
packed = false;
@ -297,9 +308,18 @@ static void coretemp_sample(void)
}
free(timestamp_str);
OPAL_LIST_FOREACH(trk, &tracking, coretemp_tracker_t) {
OPAL_LIST_FOREACH_SAFE(trk, nxt, &tracking, coretemp_tracker_t) {
/* read the temp */
fp = fopen(trk->file, "r");
if (NULL == (fp = fopen(trk->file, "r"))) {
/* the file can't be read, so remove it from the list */
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s access denied to coretemp file %s - removing it",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->file);
opal_list_remove_item(&tracking, &trk->super);
OBJ_RELEASE(trk);
continue;
}
while (NULL != (temp = orte_getline(fp))) {
degc = strtoul(temp, NULL, 10) / 100.0;
opal_output_verbose(5, orte_sensor_base_framework.framework_output,

Просмотреть файл

@ -21,16 +21,9 @@ AC_DEFUN([MCA_orte_sensor_freq_CONFIG], [
# do not build if support not requested
AS_IF([test "$with_freq" != "no"],
[AS_IF([test "$opal_found_linux" = "yes"],
[AS_IF([test -r "/sys/devices/system/cpu/cpu0/cpufreq/"],
[sensor_freq_happy=yes],
[AC_MSG_WARN([Core frequency sensing was requested but the required directory])
AC_MSG_WARN([was not found])
sensor_freq_happy=no])],
[AC_MSG_WARN([Core frequency sensing was requested but is only supported on Linux systems])
sensor_freq_happy=no])
AS_IF([test "$sensor_freq_happy" = "yes"],
[$1],
[AC_MSG_ERROR([Cannot continue])
[AC_MSG_WARN([Core frequency sensing was requested but is only supported on Linux systems])
AC_MSG_ERROR([Cannot continue])
$2])
],
[$2])

Просмотреть файл

@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
#
# $COPYRIGHT$
#
@ -10,11 +10,20 @@
#
# This is the US/English general help file for the core frequency sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
[req-dir-not-found]
Frequency monitoring was requested, but this node
lacks the required directory:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes
Node: %s
Directory: %s
Operation will continue, but frequencies will not be monitored.
#
[no-cores-found]
Frequency monitoring was requested, but this node
does not appear to have the required core-level files, or
you lack authority to access them:
Node: %s
Operation will continue, but frequencies will not be monitored.

Просмотреть файл

@ -36,6 +36,7 @@
#include "opal/mca/db/db.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
@ -114,6 +115,7 @@ static int init(void)
FILE *fp;
corefreq_tracker_t *trk;
/* always construct this so we don't segfault in finalize */
OBJ_CONSTRUCT(&tracking, opal_list_t);
/*
@ -121,6 +123,9 @@ static int init(void)
*/
if (NULL == (cur_dirp = opendir("/sys/devices/system/cpu"))) {
OBJ_DESTRUCT(&tracking);
orte_show_help("help-orte-sensor-freq.txt", "req-dir-not-found",
true, orte_process_info.nodename,
"/sys/devices/system/cpu");
return ORTE_ERROR;
}
@ -182,6 +187,8 @@ static int init(void)
if (0 == opal_list_get_size(&tracking)) {
/* nothing to read */
orte_show_help("help-orte-sensor-freq.txt", "no-cores-found",
true, orte_process_info.nodename);
return ORTE_ERROR;
}
@ -210,7 +217,7 @@ static void stop(orte_jobid_t jobid)
static void freq_sample(void)
{
int ret;
corefreq_tracker_t *trk;
corefreq_tracker_t *trk, *nxt;
FILE *fp;
char *freq;
float ghz;
@ -221,6 +228,10 @@ static void freq_sample(void)
char *timestamp_str;
bool packed;
if (0 == opal_list_get_size(&tracking)) {
return;
}
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s sampling freq",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
@ -266,13 +277,20 @@ static void freq_sample(void)
}
free(timestamp_str);
OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) {
OPAL_LIST_FOREACH_SAFE(trk, nxt, &tracking, corefreq_tracker_t) {
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s processing freq file %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->file);
/* read the temp */
/* read the freq */
if (NULL == (fp = fopen(trk->file, "r"))) {
/* the file can't be read, so remove it from the list */
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s access denied to freq file %s - removing it",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->file);
opal_list_remove_item(&tracking, &trk->super);
OBJ_RELEASE(trk);
continue;
}
while (NULL != (freq = orte_getline(fp))) {

Просмотреть файл

@ -21,17 +21,9 @@ AC_DEFUN([MCA_orte_sensor_pwr_CONFIG], [
# do not build if support not requested
AS_IF([test "$with_pwr" != "no"],
[AS_IF([test "$opal_found_linux" = "yes"],
[AS_IF([test -e "/dev/cpu/0/msr"],
[sensor_pwr_happy=yes],
[AC_MSG_WARN([Core power sensing was requested but the required directory])
AC_MSG_WARN([was not found])
sensor_pwr_happy=no])],
[AC_MSG_WARN([Core power sensing was requested but is only supported on Intel-based Linux systems])
sensor_pwr_happy=no])
AS_IF([test "$sensor_pwr_happy" = "yes"],
[$1],
[AC_MSG_ERROR([Cannot continue])
$2])
],
[AC_MSG_WARN([Core power sensing was requested but is only supported on Intel-based Linux systems])
AC_MSG_ERROR([Cannot continue])
$2])],
[$2])
])dnl

Просмотреть файл

@ -11,8 +11,9 @@
#
[no-access]
Power sensing was requested, but you lack access authority
to the required path:
to the required path on this node:
Node: %s
Path: %s
We will continue to operate, but will not monitor power.
@ -36,3 +37,12 @@ available. This usually means that your system lacks
the required revision level for hwloc.
We will continue to operate, but will not monitor power.
#
[no-cores-found]
Power monitoring was requested, but this node
does not appear to have the required core-level files,
or you lack access authority to them:
Node: %s
We will continue to operate, but will not monitor power.

Просмотреть файл

@ -42,6 +42,7 @@
#include "opal/mca/db/db.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
@ -110,6 +111,7 @@ static int read_msr(int fd, long long *value, int offset)
*value = (long long)data;
return ORTE_SUCCESS;
}
static int check_cpu_type(void);
static int init(void)
@ -120,13 +122,25 @@ static int init(void)
corepwr_tracker_t *trk;
long long units;
/* always construct this so we don't segfault in finalize */
OBJ_CONSTRUCT(&tracking, opal_list_t);
/* we only handle certain cpu types as we have to know the binary
* layout of the msr file
*/
if (ORTE_SUCCESS != check_cpu_type()) {
/* we provided a show help down below */
return ORTE_ERR_NOT_SUPPORTED;
}
/*
* Open up the base directory so we can get a listing
*/
if (NULL == (cur_dirp = opendir("/dev/cpu"))) {
OBJ_DESTRUCT(&tracking);
orte_show_help("help-orte-sensor-pwr.txt", "no-access",
true, orte_process_info.nodename,
"/dev/cpu");
return ORTE_ERROR;
}
@ -173,6 +187,8 @@ static int init(void)
if (0 == opal_list_get_size(&tracking)) {
/* nothing to read */
orte_show_help("help-orte-sensor-pwr.txt", "no-cores-found",
true, orte_process_info.nodename);
return ORTE_ERROR;
}
@ -212,6 +228,10 @@ static void pwr_sample(void)
char *temp;
bool packed;
if (0 == opal_list_get_size(&tracking)) {
return;
}
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s sampling power",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
@ -260,12 +280,20 @@ static void pwr_sample(void)
OPAL_LIST_FOREACH_SAFE(trk, nxt, &tracking, corepwr_tracker_t) {
if (0 >= (fd = open(trk->file, O_RDONLY))) {
/* disable this one - cannot read the file */
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s access denied to pwr file %s - removing it",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->file);
opal_list_remove_item(&tracking, &trk->super);
OBJ_RELEASE(trk);
continue;
}
if (ORTE_SUCCESS != read_msr(fd, &value, MSR_PKG_POWER_INFO)) {
/* disable this one - cannot read the file */
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s failed to read pwr file %s - removing it",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->file);
opal_list_remove_item(&tracking, &trk->super);
OBJ_RELEASE(trk);
close(fd);
@ -381,3 +409,69 @@ static void pwr_log(opal_buffer_t *sample)
}
}
/* CPU model numbers of the chipsets this sensor supports */
#define CPU_SANDYBRIDGE 42
#define CPU_SANDYBRIDGE_EP 45
#define CPU_IVYBRIDGE 58
#define CPU_IVYBRIDGE_EP 62
#define CPU_HASWELL 60
/*
 * Check whether the local CPU is one of the supported chipsets.
 *
 * Starting at the first socket in the hwloc topology, walk the
 * sibling chain looking for a "model" info entry that carries a
 * value. The first such entry decides the outcome: its value is
 * parsed as a base-10 number, stored in
 * mca_sensor_pwr_component.model, and compared against the list
 * of supported chipsets above. Sockets after that one are not
 * examined (hetero sockets are not supported).
 *
 * Returns ORTE_SUCCESS when the model is supported. Returns
 * ORTE_ERROR, after emitting a show_help message, when the
 * topology has no sockets, when the model is not in the supported
 * list, or when no socket carries a "model" entry at all.
 */
static int check_cpu_type(void)
{
    hwloc_obj_t socket;
    unsigned idx;

    socket = hwloc_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
    if (NULL == socket) {
        /* there are no sockets identified in this machine */
        orte_show_help("help-orte-sensor-pwr.txt", "no-sockets", true);
        return ORTE_ERROR;
    }

    for (; NULL != socket; socket = socket->next_sibling) {
        for (idx = 0; idx < socket->infos_count; idx++) {
            /* skip everything except a "model" entry that has a value */
            if (0 != strcmp(socket->infos[idx].name, "model") ||
                NULL == socket->infos[idx].value) {
                continue;
            }

            mca_sensor_pwr_component.model = strtoul(socket->infos[idx].value, NULL, 10);

            /* supported models fall out of the switch to the common
             * success return; anything else is rejected right here
             */
            switch (mca_sensor_pwr_component.model) {
            case CPU_SANDYBRIDGE:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Sandybridge CPU");
                break;
            case CPU_SANDYBRIDGE_EP:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Sandybridge-EP CPU");
                break;
            case CPU_IVYBRIDGE:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Ivybridge CPU");
                break;
            case CPU_IVYBRIDGE_EP:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Ivybridge-EP CPU");
                break;
            case CPU_HASWELL:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Haswell CPU");
                break;
            default:
                orte_show_help("help-orte-sensor-pwr.txt", "unsupported-model",
                               true, mca_sensor_pwr_component.model);
                return ORTE_ERROR;
            }
            return ORTE_SUCCESS;
        }
    }

    /* no socket in the chain reported a usable "model" entry */
    orte_show_help("help-orte-sensor-pwr.txt", "no-topo-info",
                   true, mca_sensor_pwr_component.model);
    return ORTE_ERROR;
}

Просмотреть файл

@ -26,7 +26,6 @@ static int orte_sensor_pwr_open(void);
static int orte_sensor_pwr_close(void);
static int orte_sensor_pwr_query(mca_base_module_t **module, int *priority);
static int pwr_component_register(void);
static int check_cpu_type(void);
orte_sensor_pwr_component_t mca_sensor_pwr_component = {
{
@ -60,15 +59,6 @@ static int orte_sensor_pwr_open(void)
static int orte_sensor_pwr_query(mca_base_module_t **module, int *priority)
{
/* we only handle certain cpu types as we have to know the binary
* layout of the msr file
*/
if (ORTE_SUCCESS != check_cpu_type()) {
*priority = 0;
*module = NULL;
return ORTE_ERROR;
}
*priority = 50; /* ahead of heartbeat */
*module = (mca_base_module_t *)&orte_sensor_pwr_module;
return ORTE_SUCCESS;
@ -96,68 +86,3 @@ static int pwr_component_register(void)
& mca_sensor_pwr_component.test);
return ORTE_SUCCESS;
}
/* CPU model numbers of the chipsets this sensor supports */
#define CPU_SANDYBRIDGE 42
#define CPU_SANDYBRIDGE_EP 45
#define CPU_IVYBRIDGE 58
#define CPU_IVYBRIDGE_EP 62
#define CPU_HASWELL 60
/*
 * Check whether the local CPU is one of the supported chipsets.
 *
 * Starting at the first socket in the hwloc topology, walk the
 * sibling chain looking for a "model" info entry that carries a
 * value. The first such entry decides the outcome: its value is
 * parsed as a base-10 number, stored in
 * mca_sensor_pwr_component.model, and compared against the list
 * of supported chipsets above. Sockets after that one are not
 * examined (hetero sockets are not supported).
 *
 * Returns ORTE_SUCCESS when the model is supported. Returns
 * ORTE_ERROR, after emitting a show_help message, when the
 * topology has no sockets, when the model is not in the supported
 * list, or when no socket carries a "model" entry at all.
 */
static int check_cpu_type(void)
{
    hwloc_obj_t socket;
    unsigned idx;

    socket = hwloc_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
    if (NULL == socket) {
        /* there are no sockets identified in this machine */
        orte_show_help("help-orte-sensor-pwr.txt", "no-sockets", true);
        return ORTE_ERROR;
    }

    for (; NULL != socket; socket = socket->next_sibling) {
        for (idx = 0; idx < socket->infos_count; idx++) {
            /* skip everything except a "model" entry that has a value */
            if (0 != strcmp(socket->infos[idx].name, "model") ||
                NULL == socket->infos[idx].value) {
                continue;
            }

            mca_sensor_pwr_component.model = strtoul(socket->infos[idx].value, NULL, 10);

            /* supported models fall out of the switch to the common
             * success return; anything else is rejected right here
             */
            switch (mca_sensor_pwr_component.model) {
            case CPU_SANDYBRIDGE:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Sandybridge CPU");
                break;
            case CPU_SANDYBRIDGE_EP:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Sandybridge-EP CPU");
                break;
            case CPU_IVYBRIDGE:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Ivybridge CPU");
                break;
            case CPU_IVYBRIDGE_EP:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Ivybridge-EP CPU");
                break;
            case CPU_HASWELL:
                opal_output_verbose(2, orte_sensor_base_framework.framework_output,
                                    "sensor:pwr Found Haswell CPU");
                break;
            default:
                orte_show_help("help-orte-sensor-pwr.txt", "unsupported-model",
                               true, mca_sensor_pwr_component.model);
                return ORTE_ERROR;
            }
            return ORTE_SUCCESS;
        }
    }

    /* no socket in the chain reported a usable "model" entry */
    orte_show_help("help-orte-sensor-pwr.txt", "no-topo-info",
                   true, mca_sensor_pwr_component.model);
    return ORTE_ERROR;
}