1
1

Some cleanup of the sensor system to ensure things go in the right place, avoid segfaults under abnormal conditions, etc.

cmr=v1.7.5:reviewer=rhc

This commit was SVN r30409.
Этот коммит содержится в:
Ralph Castain 2014-01-24 17:29:24 +00:00
родитель 31acdb15bc
Коммит e496e348a4
8 изменённых файлов: 129 добавлений и 58 удалений

Просмотреть файл

@@ -8,7 +8,7 @@
# $HEADER$ # $HEADER$
# #
dist_pkgdata_DATA = help-orte-sensor-coretemp.txt dist_ompidata_DATA = help-orte-sensor-coretemp.txt
sources = \ sources = \
sensor_coretemp.c \ sensor_coretemp.c \

Просмотреть файл

@@ -67,13 +67,13 @@ typedef struct {
char *label; char *label;
float critical_temp; float critical_temp;
float max_temp; float max_temp;
} core_tracker_t; } coretemp_tracker_t;
static void ctr_con(core_tracker_t *trk) static void ctr_con(coretemp_tracker_t *trk)
{ {
trk->file = NULL; trk->file = NULL;
trk->label = NULL; trk->label = NULL;
} }
static void ctr_des(core_tracker_t *trk) static void ctr_des(coretemp_tracker_t *trk)
{ {
if (NULL != trk->file) { if (NULL != trk->file) {
free(trk->file); free(trk->file);
@@ -82,7 +82,7 @@ static void ctr_des(core_tracker_t *trk)
free(trk->label); free(trk->label);
} }
} }
OBJ_CLASS_INSTANCE(core_tracker_t, OBJ_CLASS_INSTANCE(coretemp_tracker_t,
opal_list_item_t, opal_list_item_t,
ctr_con, ctr_des); ctr_con, ctr_des);
@@ -111,29 +111,22 @@ static char *orte_getline(FILE *fp)
*/ */
static int init(void) static int init(void)
{ {
int ret;
DIR *cur_dirp = NULL, *tdir; DIR *cur_dirp = NULL, *tdir;
struct dirent *dir_entry, *entry; struct dirent *dir_entry, *entry;
char *dirname, *filename, *ptr, *tmp; char *dirname, *filename, *ptr, *tmp;
size_t tlen = strlen("temp"); size_t tlen = strlen("temp");
size_t ilen = strlen("_input"); size_t ilen = strlen("_input");
FILE *fp; FILE *fp;
core_tracker_t *trk; coretemp_tracker_t *trk;
int socket; int socket;
OBJ_CONSTRUCT(&tracking, opal_list_t); OBJ_CONSTRUCT(&tracking, opal_list_t);
if (ORTE_SUCCESS != (ret = opal_os_dirpath_access("/sys/bus/platform/devices", 0))) {
/* if the directory doesn't exist, or we don't have
* access to it, then disqualify us
*/
return ret;
}
/* /*
* Open up the base directory so we can get a listing * Open up the base directory so we can get a listing
*/ */
if (NULL == (cur_dirp = opendir("/sys/bus/platform/devices"))) { if (NULL == (cur_dirp = opendir("/sys/bus/platform/devices"))) {
OBJ_DESTRUCT(&tracking);
return ORTE_ERROR; return ORTE_ERROR;
} }
@@ -175,7 +168,7 @@ static int init(void)
continue; continue;
} }
/* track the info for this core */ /* track the info for this core */
trk = OBJ_NEW(core_tracker_t); trk = OBJ_NEW(coretemp_tracker_t);
trk->socket = socket; trk->socket = socket;
trk->file = opal_os_path(false, dirname, entry->d_name, NULL); trk->file = opal_os_path(false, dirname, entry->d_name, NULL);
/* take the part up to the first underscore as this will /* take the part up to the first underscore as this will
@@ -252,7 +245,7 @@ static void stop(orte_jobid_t jobid)
static void coretemp_sample(void) static void coretemp_sample(void)
{ {
int ret; int ret;
core_tracker_t *trk; coretemp_tracker_t *trk;
FILE *fp; FILE *fp;
char *temp; char *temp;
float degc; float degc;
@@ -261,9 +254,20 @@ static void coretemp_sample(void)
time_t now; time_t now;
char time_str[40]; char time_str[40];
char *timestamp_str; char *timestamp_str;
bool packed;
/* prep to store the results */ /* prep to store the results */
OBJ_CONSTRUCT(&data, opal_buffer_t); OBJ_CONSTRUCT(&data, opal_buffer_t);
packed = false;
/* pack our name */
temp = strdup("coretemp");
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
return;
}
free(temp);
/* store our hostname */ /* store our hostname */
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) { if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
@@ -293,7 +297,7 @@ static void coretemp_sample(void)
} }
free(timestamp_str); free(timestamp_str);
OPAL_LIST_FOREACH(trk, &tracking, core_tracker_t) { OPAL_LIST_FOREACH(trk, &tracking, coretemp_tracker_t) {
/* read the temp */ /* read the temp */
fp = fopen(trk->file, "r"); fp = fopen(trk->file, "r");
while (NULL != (temp = orte_getline(fp))) { while (NULL != (temp = orte_getline(fp))) {
@@ -309,6 +313,7 @@ static void coretemp_sample(void)
return; return;
} }
free(temp); free(temp);
packed = true;
/* check for exceed critical temp */ /* check for exceed critical temp */
if (trk->critical_temp < degc) { if (trk->critical_temp < degc) {
/* alert the errmgr - this is a critical problem */ /* alert the errmgr - this is a critical problem */
@@ -328,11 +333,13 @@ static void coretemp_sample(void)
} }
/* xfer the data for transmission */ /* xfer the data for transmission */
bptr = &data; if (packed) {
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { bptr = &data;
ORTE_ERROR_LOG(ret); if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
OBJ_DESTRUCT(&data); ORTE_ERROR_LOG(ret);
return; OBJ_DESTRUCT(&data);
return;
}
} }
OBJ_DESTRUCT(&data); OBJ_DESTRUCT(&data);
} }
@@ -377,7 +384,7 @@ static void coretemp_log(opal_buffer_t *sample)
(NULL == hostname) ? "NULL" : hostname, ncores); (NULL == hostname) ? "NULL" : hostname, ncores);
/* xfr to storage */ /* xfr to storage */
kv = malloc((ncores+1) * sizeof(opal_value_t)); kv = malloc((ncores+2) * sizeof(opal_value_t));
/* load the sample time at the start */ /* load the sample time at the start */
OBJ_CONSTRUCT(&kv[0], opal_value_t); OBJ_CONSTRUCT(&kv[0], opal_value_t);
@@ -386,27 +393,37 @@ static void coretemp_log(opal_buffer_t *sample)
kv[0].data.string = strdup(sampletime); kv[0].data.string = strdup(sampletime);
free(sampletime); free(sampletime);
/* load the hostname */
OBJ_CONSTRUCT(&kv[1], opal_value_t);
kv[1].key = strdup("hostname");
kv[1].type = OPAL_STRING;
kv[1].data.string = strdup(hostname);
/* protect against segfault if we jump to cleanup */
for (i=0; i < ncores; i++) { for (i=0; i < ncores; i++) {
OBJ_CONSTRUCT(&kv[i+1], opal_value_t); OBJ_CONSTRUCT(&kv[i+2], opal_value_t);
asprintf(&kv[i+1].key, "core%d", i); }
kv[i+1].type = OPAL_FLOAT;
for (i=0; i < ncores; i++) {
asprintf(&kv[i+2].key, "core%d", i);
kv[i+2].type = OPAL_FLOAT;
n=1; n=1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) { if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }
kv[i+1].data.fval = fval; kv[i+2].data.fval = fval;
} }
/* store it */ /* store it */
if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, ncores+1))) { if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, ncores+2))) {
/* don't bark about it - just quietly disable the log */ /* don't bark about it - just quietly disable the log */
log_enabled = false; log_enabled = false;
} }
cleanup: cleanup:
/* cleanup the xfr storage */ /* cleanup the xfr storage */
for (i=0; i < ncores+1; i++) { for (i=0; i < ncores+2; i++) {
OBJ_DESTRUCT(&kv[i]); OBJ_DESTRUCT(&kv[i]);
} }
if (NULL != hostname) { if (NULL != hostname) {

Просмотреть файл

@@ -8,7 +8,7 @@
# $HEADER$ # $HEADER$
# #
dist_pkgdata_DATA = help-orte-sensor-freq.txt dist_ompidata_DATA = help-orte-sensor-freq.txt
sources = \ sources = \
sensor_freq.c \ sensor_freq.c \

Просмотреть файл

@@ -219,6 +219,7 @@ static void freq_sample(void)
time_t now; time_t now;
char time_str[40]; char time_str[40];
char *timestamp_str; char *timestamp_str;
bool packed;
opal_output_verbose(2, orte_sensor_base_framework.framework_output, opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s sampling freq", "%s sampling freq",
@@ -226,6 +227,16 @@ static void freq_sample(void)
/* prep to store the results */ /* prep to store the results */
OBJ_CONSTRUCT(&data, opal_buffer_t); OBJ_CONSTRUCT(&data, opal_buffer_t);
packed = false;
/* pack our name */
freq = strdup("freq");
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &freq, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
return;
}
free(freq);
/* store our hostname */ /* store our hostname */
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) { if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
@@ -256,6 +267,10 @@ static void freq_sample(void)
free(timestamp_str); free(timestamp_str);
OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) { OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) {
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s processing freq file %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trk->file);
/* read the temp */ /* read the temp */
if (NULL == (fp = fopen(trk->file, "r"))) { if (NULL == (fp = fopen(trk->file, "r"))) {
continue; continue;
@@ -272,17 +287,20 @@ static void freq_sample(void)
free(freq); free(freq);
return; return;
} }
packed = true;
free(freq); free(freq);
} }
fclose(fp); fclose(fp);
} }
/* xfer the data for transmission */ /* xfer the data for transmission */
bptr = &data; if (packed) {
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { bptr = &data;
ORTE_ERROR_LOG(ret); if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
OBJ_DESTRUCT(&data); ORTE_ERROR_LOG(ret);
return; OBJ_DESTRUCT(&data);
return;
}
} }
OBJ_DESTRUCT(&data); OBJ_DESTRUCT(&data);
} }
@@ -322,12 +340,12 @@ static void freq_log(opal_buffer_t *sample)
} }
opal_output_verbose(3, orte_sensor_base_framework.framework_output, opal_output_verbose(3, orte_sensor_base_framework.framework_output,
"%s Received log from host %s with %d cores", "%s Received freq log from host %s with %d cores",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == hostname) ? "NULL" : hostname, ncores); (NULL == hostname) ? "NULL" : hostname, ncores);
/* xfr to storage */ /* xfr to storage */
kv = malloc((ncores+1) * sizeof(opal_value_t)); kv = malloc((ncores+2) * sizeof(opal_value_t));
/* load the sample time at the start */ /* load the sample time at the start */
OBJ_CONSTRUCT(&kv[0], opal_value_t); OBJ_CONSTRUCT(&kv[0], opal_value_t);
@@ -336,27 +354,37 @@ static void freq_log(opal_buffer_t *sample)
kv[0].data.string = strdup(sampletime); kv[0].data.string = strdup(sampletime);
free(sampletime); free(sampletime);
/* load the hostname */
OBJ_CONSTRUCT(&kv[1], opal_value_t);
kv[1].key = strdup("hostname");
kv[1].type = OPAL_STRING;
kv[1].data.string = strdup(hostname);
/* protect against segfault if we jump to cleanup */
for (i=0; i < ncores; i++) { for (i=0; i < ncores; i++) {
OBJ_CONSTRUCT(&kv[i+1], opal_value_t); OBJ_CONSTRUCT(&kv[i+2], opal_value_t);
asprintf(&kv[i+1].key, "core%d", i); }
kv[i+1].type = OPAL_FLOAT;
for (i=0; i < ncores; i++) {
asprintf(&kv[i+2].key, "core%d", i);
kv[i+2].type = OPAL_FLOAT;
n=1; n=1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) { if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }
kv[i+1].data.fval = fval; kv[i+2].data.fval = fval;
} }
/* store it */ /* store it */
if (ORTE_SUCCESS != (rc = opal_db.add_log("freq", kv, ncores+1))) { if (ORTE_SUCCESS != (rc = opal_db.add_log("freq", kv, ncores+2))) {
/* don't bark about it - just quietly disable the log */ /* don't bark about it - just quietly disable the log */
log_enabled = false; log_enabled = false;
} }
cleanup: cleanup:
/* cleanup the xfr storage */ /* cleanup the xfr storage */
for (i=0; i < ncores+1; i++) { for (i=0; i < ncores+2; i++) {
OBJ_DESTRUCT(&kv[i]); OBJ_DESTRUCT(&kv[i]);
} }
if (NULL != hostname) { if (NULL != hostname) {

Просмотреть файл

@@ -2,6 +2,7 @@
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@@ -62,7 +63,7 @@ static void sample(void)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* are we including ourselves? */ /* are we including ourselves? */
if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_CMSLAVE) && if (ORTE_PROC_IS_DAEMON &&
0 < mca_sensor_ft_tester_component.daemon_fail_prob) { 0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sample:ft_tester considering killing me!", "%s sample:ft_tester considering killing me!",

Просмотреть файл

@@ -76,7 +76,7 @@ static int init(void)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* setup to receive heartbeats */ /* setup to receive heartbeats */
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_CM) { if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_HEARTBEAT, ORTE_RML_TAG_HEARTBEAT,
ORTE_RML_PERSISTENT, ORTE_RML_PERSISTENT,

Просмотреть файл

@@ -209,6 +209,8 @@ static void pwr_sample(void)
long long value; long long value;
int fd, ret; int fd, ret;
float power; float power;
char *temp;
bool packed;
opal_output_verbose(2, orte_sensor_base_framework.framework_output, opal_output_verbose(2, orte_sensor_base_framework.framework_output,
"%s sampling power", "%s sampling power",
@@ -216,6 +218,16 @@ static void pwr_sample(void)
/* prep to store the results */ /* prep to store the results */
OBJ_CONSTRUCT(&data, opal_buffer_t); OBJ_CONSTRUCT(&data, opal_buffer_t);
packed = false;
/* pack our name */
temp = strdup("pwr");
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&data);
return;
}
free(temp);
/* store our hostname */ /* store our hostname */
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) { if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
@@ -266,15 +278,18 @@ static void pwr_sample(void)
close(fd); close(fd);
return; return;
} }
packed = true;
close(fd); close(fd);
} }
/* xfer the data for transmission */ /* xfer the data for transmission */
bptr = &data; if (packed) {
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { bptr = &data;
ORTE_ERROR_LOG(ret); if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
OBJ_DESTRUCT(&data); ORTE_ERROR_LOG(ret);
return; OBJ_DESTRUCT(&data);
return;
}
} }
OBJ_DESTRUCT(&data); OBJ_DESTRUCT(&data);
} }
@@ -319,7 +334,7 @@ static void pwr_log(opal_buffer_t *sample)
(NULL == hostname) ? "NULL" : hostname, ncores); (NULL == hostname) ? "NULL" : hostname, ncores);
/* xfr to storage */ /* xfr to storage */
kv = malloc((ncores+1) * sizeof(opal_value_t)); kv = malloc((ncores+2) * sizeof(opal_value_t));
/* load the sample time at the start */ /* load the sample time at the start */
OBJ_CONSTRUCT(&kv[0], opal_value_t); OBJ_CONSTRUCT(&kv[0], opal_value_t);
@@ -328,27 +343,37 @@ static void pwr_log(opal_buffer_t *sample)
kv[0].data.string = strdup(sampletime); kv[0].data.string = strdup(sampletime);
free(sampletime); free(sampletime);
/* load the hostname */
OBJ_CONSTRUCT(&kv[1], opal_value_t);
kv[1].key = strdup("hostname");
kv[1].type = OPAL_STRING;
kv[1].data.string = strdup(hostname);
/* protect against segfault if we jump to cleanup */
for (i=0; i < ncores; i++) { for (i=0; i < ncores; i++) {
OBJ_CONSTRUCT(&kv[i+1], opal_value_t); OBJ_CONSTRUCT(&kv[i+2], opal_value_t);
asprintf(&kv[i+1].key, "core%d", i); }
kv[i+1].type = OPAL_FLOAT;
for (i=0; i < ncores; i++) {
asprintf(&kv[i+2].key, "core%d", i);
kv[i+2].type = OPAL_FLOAT;
n=1; n=1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) { if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }
kv[i+1].data.fval = fval; kv[i+2].data.fval = fval;
} }
/* store it */ /* store it */
if (ORTE_SUCCESS != (rc = opal_db.add_log("pwr", kv, ncores+1))) { if (ORTE_SUCCESS != (rc = opal_db.add_log("pwr", kv, ncores+2))) {
/* don't bark about it - just quietly disable the log */ /* don't bark about it - just quietly disable the log */
log_enabled = false; log_enabled = false;
} }
cleanup: cleanup:
/* cleanup the xfr storage */ /* cleanup the xfr storage */
for (i=0; i < ncores+1; i++) { for (i=0; i < ncores+2; i++) {
OBJ_DESTRUCT(&kv[i]); OBJ_DESTRUCT(&kv[i]);
} }
if (NULL != hostname) { if (NULL != hostname) {

Просмотреть файл

@@ -8,7 +8,7 @@
# $HEADER$ # $HEADER$
# #
dist_pkgdata_DATA = help-orte-sensor-sigar.txt dist_ompidata_DATA = help-orte-sensor-sigar.txt
sources = \ sources = \
sensor_sigar.c \ sensor_sigar.c \