Some cleanup of the sensor system to ensure things go in the right place, avoid segfaults under abnormal conditions, etc.
cmr=v1.7.5:reviewer=rhc This commit was SVN r30409.
Этот коммит содержится в:
родитель
31acdb15bc
Коммит
e496e348a4
@ -8,7 +8,7 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-sensor-coretemp.txt
|
||||
dist_ompidata_DATA = help-orte-sensor-coretemp.txt
|
||||
|
||||
sources = \
|
||||
sensor_coretemp.c \
|
||||
|
@ -67,13 +67,13 @@ typedef struct {
|
||||
char *label;
|
||||
float critical_temp;
|
||||
float max_temp;
|
||||
} core_tracker_t;
|
||||
static void ctr_con(core_tracker_t *trk)
|
||||
} coretemp_tracker_t;
|
||||
static void ctr_con(coretemp_tracker_t *trk)
|
||||
{
|
||||
trk->file = NULL;
|
||||
trk->label = NULL;
|
||||
}
|
||||
static void ctr_des(core_tracker_t *trk)
|
||||
static void ctr_des(coretemp_tracker_t *trk)
|
||||
{
|
||||
if (NULL != trk->file) {
|
||||
free(trk->file);
|
||||
@ -82,7 +82,7 @@ static void ctr_des(core_tracker_t *trk)
|
||||
free(trk->label);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(core_tracker_t,
|
||||
OBJ_CLASS_INSTANCE(coretemp_tracker_t,
|
||||
opal_list_item_t,
|
||||
ctr_con, ctr_des);
|
||||
|
||||
@ -111,29 +111,22 @@ static char *orte_getline(FILE *fp)
|
||||
*/
|
||||
static int init(void)
|
||||
{
|
||||
int ret;
|
||||
DIR *cur_dirp = NULL, *tdir;
|
||||
struct dirent *dir_entry, *entry;
|
||||
char *dirname, *filename, *ptr, *tmp;
|
||||
size_t tlen = strlen("temp");
|
||||
size_t ilen = strlen("_input");
|
||||
FILE *fp;
|
||||
core_tracker_t *trk;
|
||||
coretemp_tracker_t *trk;
|
||||
int socket;
|
||||
|
||||
OBJ_CONSTRUCT(&tracking, opal_list_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_os_dirpath_access("/sys/bus/platform/devices", 0))) {
|
||||
/* if the directory doesn't exist, or we don't have
|
||||
* access to it, then disqualify us
|
||||
*/
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Open up the base directory so we can get a listing
|
||||
*/
|
||||
if (NULL == (cur_dirp = opendir("/sys/bus/platform/devices"))) {
|
||||
OBJ_DESTRUCT(&tracking);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -175,7 +168,7 @@ static int init(void)
|
||||
continue;
|
||||
}
|
||||
/* track the info for this core */
|
||||
trk = OBJ_NEW(core_tracker_t);
|
||||
trk = OBJ_NEW(coretemp_tracker_t);
|
||||
trk->socket = socket;
|
||||
trk->file = opal_os_path(false, dirname, entry->d_name, NULL);
|
||||
/* take the part up to the first underscore as this will
|
||||
@ -252,7 +245,7 @@ static void stop(orte_jobid_t jobid)
|
||||
static void coretemp_sample(void)
|
||||
{
|
||||
int ret;
|
||||
core_tracker_t *trk;
|
||||
coretemp_tracker_t *trk;
|
||||
FILE *fp;
|
||||
char *temp;
|
||||
float degc;
|
||||
@ -261,9 +254,20 @@ static void coretemp_sample(void)
|
||||
time_t now;
|
||||
char time_str[40];
|
||||
char *timestamp_str;
|
||||
bool packed;
|
||||
|
||||
/* prep to store the results */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
packed = false;
|
||||
|
||||
/* pack our name */
|
||||
temp = strdup("coretemp");
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(temp);
|
||||
|
||||
/* store our hostname */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
@ -293,7 +297,7 @@ static void coretemp_sample(void)
|
||||
}
|
||||
free(timestamp_str);
|
||||
|
||||
OPAL_LIST_FOREACH(trk, &tracking, core_tracker_t) {
|
||||
OPAL_LIST_FOREACH(trk, &tracking, coretemp_tracker_t) {
|
||||
/* read the temp */
|
||||
fp = fopen(trk->file, "r");
|
||||
while (NULL != (temp = orte_getline(fp))) {
|
||||
@ -309,6 +313,7 @@ static void coretemp_sample(void)
|
||||
return;
|
||||
}
|
||||
free(temp);
|
||||
packed = true;
|
||||
/* check for exceed critical temp */
|
||||
if (trk->critical_temp < degc) {
|
||||
/* alert the errmgr - this is a critical problem */
|
||||
@ -328,11 +333,13 @@ static void coretemp_sample(void)
|
||||
}
|
||||
|
||||
/* xfer the data for transmission */
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
if (packed) {
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
}
|
||||
@ -377,7 +384,7 @@ static void coretemp_log(opal_buffer_t *sample)
|
||||
(NULL == hostname) ? "NULL" : hostname, ncores);
|
||||
|
||||
/* xfr to storage */
|
||||
kv = malloc((ncores+1) * sizeof(opal_value_t));
|
||||
kv = malloc((ncores+2) * sizeof(opal_value_t));
|
||||
|
||||
/* load the sample time at the start */
|
||||
OBJ_CONSTRUCT(&kv[0], opal_value_t);
|
||||
@ -386,27 +393,37 @@ static void coretemp_log(opal_buffer_t *sample)
|
||||
kv[0].data.string = strdup(sampletime);
|
||||
free(sampletime);
|
||||
|
||||
/* load the hostname */
|
||||
OBJ_CONSTRUCT(&kv[1], opal_value_t);
|
||||
kv[1].key = strdup("hostname");
|
||||
kv[1].type = OPAL_STRING;
|
||||
kv[1].data.string = strdup(hostname);
|
||||
|
||||
/* protect against segfault if we jump to cleanup */
|
||||
for (i=0; i < ncores; i++) {
|
||||
OBJ_CONSTRUCT(&kv[i+1], opal_value_t);
|
||||
asprintf(&kv[i+1].key, "core%d", i);
|
||||
kv[i+1].type = OPAL_FLOAT;
|
||||
OBJ_CONSTRUCT(&kv[i+2], opal_value_t);
|
||||
}
|
||||
|
||||
for (i=0; i < ncores; i++) {
|
||||
asprintf(&kv[i+2].key, "core%d", i);
|
||||
kv[i+2].type = OPAL_FLOAT;
|
||||
n=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
kv[i+1].data.fval = fval;
|
||||
kv[i+2].data.fval = fval;
|
||||
}
|
||||
|
||||
/* store it */
|
||||
if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, ncores+1))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, ncores+2))) {
|
||||
/* don't bark about it - just quietly disable the log */
|
||||
log_enabled = false;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/* cleanup the xfr storage */
|
||||
for (i=0; i < ncores+1; i++) {
|
||||
for (i=0; i < ncores+2; i++) {
|
||||
OBJ_DESTRUCT(&kv[i]);
|
||||
}
|
||||
if (NULL != hostname) {
|
||||
|
@ -8,7 +8,7 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-sensor-freq.txt
|
||||
dist_ompidata_DATA = help-orte-sensor-freq.txt
|
||||
|
||||
sources = \
|
||||
sensor_freq.c \
|
||||
|
@ -219,6 +219,7 @@ static void freq_sample(void)
|
||||
time_t now;
|
||||
char time_str[40];
|
||||
char *timestamp_str;
|
||||
bool packed;
|
||||
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s sampling freq",
|
||||
@ -226,6 +227,16 @@ static void freq_sample(void)
|
||||
|
||||
/* prep to store the results */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
packed = false;
|
||||
|
||||
/* pack our name */
|
||||
freq = strdup("freq");
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &freq, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(freq);
|
||||
|
||||
/* store our hostname */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
@ -256,6 +267,10 @@ static void freq_sample(void)
|
||||
free(timestamp_str);
|
||||
|
||||
OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) {
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s processing freq file %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trk->file);
|
||||
/* read the temp */
|
||||
if (NULL == (fp = fopen(trk->file, "r"))) {
|
||||
continue;
|
||||
@ -272,17 +287,20 @@ static void freq_sample(void)
|
||||
free(freq);
|
||||
return;
|
||||
}
|
||||
packed = true;
|
||||
free(freq);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* xfer the data for transmission */
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
if (packed) {
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
}
|
||||
@ -322,12 +340,12 @@ static void freq_log(opal_buffer_t *sample)
|
||||
}
|
||||
|
||||
opal_output_verbose(3, orte_sensor_base_framework.framework_output,
|
||||
"%s Received log from host %s with %d cores",
|
||||
"%s Received freq log from host %s with %d cores",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == hostname) ? "NULL" : hostname, ncores);
|
||||
|
||||
/* xfr to storage */
|
||||
kv = malloc((ncores+1) * sizeof(opal_value_t));
|
||||
kv = malloc((ncores+2) * sizeof(opal_value_t));
|
||||
|
||||
/* load the sample time at the start */
|
||||
OBJ_CONSTRUCT(&kv[0], opal_value_t);
|
||||
@ -336,27 +354,37 @@ static void freq_log(opal_buffer_t *sample)
|
||||
kv[0].data.string = strdup(sampletime);
|
||||
free(sampletime);
|
||||
|
||||
/* load the hostname */
|
||||
OBJ_CONSTRUCT(&kv[1], opal_value_t);
|
||||
kv[1].key = strdup("hostname");
|
||||
kv[1].type = OPAL_STRING;
|
||||
kv[1].data.string = strdup(hostname);
|
||||
|
||||
/* protect against segfault if we jump to cleanup */
|
||||
for (i=0; i < ncores; i++) {
|
||||
OBJ_CONSTRUCT(&kv[i+1], opal_value_t);
|
||||
asprintf(&kv[i+1].key, "core%d", i);
|
||||
kv[i+1].type = OPAL_FLOAT;
|
||||
OBJ_CONSTRUCT(&kv[i+2], opal_value_t);
|
||||
}
|
||||
|
||||
for (i=0; i < ncores; i++) {
|
||||
asprintf(&kv[i+2].key, "core%d", i);
|
||||
kv[i+2].type = OPAL_FLOAT;
|
||||
n=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
kv[i+1].data.fval = fval;
|
||||
kv[i+2].data.fval = fval;
|
||||
}
|
||||
|
||||
/* store it */
|
||||
if (ORTE_SUCCESS != (rc = opal_db.add_log("freq", kv, ncores+1))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_db.add_log("freq", kv, ncores+2))) {
|
||||
/* don't bark about it - just quietly disable the log */
|
||||
log_enabled = false;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/* cleanup the xfr storage */
|
||||
for (i=0; i < ncores+1; i++) {
|
||||
for (i=0; i < ncores+2; i++) {
|
||||
OBJ_DESTRUCT(&kv[i]);
|
||||
}
|
||||
if (NULL != hostname) {
|
||||
|
@ -2,6 +2,7 @@
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -62,7 +63,7 @@ static void sample(void)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* are we including ourselves? */
|
||||
if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_CMSLAVE) &&
|
||||
if (ORTE_PROC_IS_DAEMON &&
|
||||
0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sample:ft_tester considering killing me!",
|
||||
|
@ -76,7 +76,7 @@ static int init(void)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* setup to receive heartbeats */
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_CM) {
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_HEARTBEAT,
|
||||
ORTE_RML_PERSISTENT,
|
||||
|
@ -209,6 +209,8 @@ static void pwr_sample(void)
|
||||
long long value;
|
||||
int fd, ret;
|
||||
float power;
|
||||
char *temp;
|
||||
bool packed;
|
||||
|
||||
opal_output_verbose(2, orte_sensor_base_framework.framework_output,
|
||||
"%s sampling power",
|
||||
@ -216,6 +218,16 @@ static void pwr_sample(void)
|
||||
|
||||
/* prep to store the results */
|
||||
OBJ_CONSTRUCT(&data, opal_buffer_t);
|
||||
packed = false;
|
||||
|
||||
/* pack our name */
|
||||
temp = strdup("pwr");
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
free(temp);
|
||||
|
||||
/* store our hostname */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
@ -266,15 +278,18 @@ static void pwr_sample(void)
|
||||
close(fd);
|
||||
return;
|
||||
}
|
||||
packed = true;
|
||||
close(fd);
|
||||
}
|
||||
|
||||
/* xfer the data for transmission */
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
if (packed) {
|
||||
bptr = &data;
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&data);
|
||||
}
|
||||
@ -319,7 +334,7 @@ static void pwr_log(opal_buffer_t *sample)
|
||||
(NULL == hostname) ? "NULL" : hostname, ncores);
|
||||
|
||||
/* xfr to storage */
|
||||
kv = malloc((ncores+1) * sizeof(opal_value_t));
|
||||
kv = malloc((ncores+2) * sizeof(opal_value_t));
|
||||
|
||||
/* load the sample time at the start */
|
||||
OBJ_CONSTRUCT(&kv[0], opal_value_t);
|
||||
@ -328,27 +343,37 @@ static void pwr_log(opal_buffer_t *sample)
|
||||
kv[0].data.string = strdup(sampletime);
|
||||
free(sampletime);
|
||||
|
||||
/* load the hostname */
|
||||
OBJ_CONSTRUCT(&kv[1], opal_value_t);
|
||||
kv[1].key = strdup("hostname");
|
||||
kv[1].type = OPAL_STRING;
|
||||
kv[1].data.string = strdup(hostname);
|
||||
|
||||
/* protect against segfault if we jump to cleanup */
|
||||
for (i=0; i < ncores; i++) {
|
||||
OBJ_CONSTRUCT(&kv[i+1], opal_value_t);
|
||||
asprintf(&kv[i+1].key, "core%d", i);
|
||||
kv[i+1].type = OPAL_FLOAT;
|
||||
OBJ_CONSTRUCT(&kv[i+2], opal_value_t);
|
||||
}
|
||||
|
||||
for (i=0; i < ncores; i++) {
|
||||
asprintf(&kv[i+2].key, "core%d", i);
|
||||
kv[i+2].type = OPAL_FLOAT;
|
||||
n=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
kv[i+1].data.fval = fval;
|
||||
kv[i+2].data.fval = fval;
|
||||
}
|
||||
|
||||
/* store it */
|
||||
if (ORTE_SUCCESS != (rc = opal_db.add_log("pwr", kv, ncores+1))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_db.add_log("pwr", kv, ncores+2))) {
|
||||
/* don't bark about it - just quietly disable the log */
|
||||
log_enabled = false;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/* cleanup the xfr storage */
|
||||
for (i=0; i < ncores+1; i++) {
|
||||
for (i=0; i < ncores+2; i++) {
|
||||
OBJ_DESTRUCT(&kv[i]);
|
||||
}
|
||||
if (NULL != hostname) {
|
||||
|
@ -8,7 +8,7 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-sensor-sigar.txt
|
||||
dist_ompidata_DATA = help-orte-sensor-sigar.txt
|
||||
|
||||
sources = \
|
||||
sensor_sigar.c \
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user