From e496e348a4520290a529a7810f24f6487ee1172e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 24 Jan 2014 17:29:24 +0000 Subject: [PATCH] Some cleanup of the sensor system to ensure things go in the right place, avoid segfaults under abnormal conditions, etc. cmr=v1.7.5:reviewer=rhc This commit was SVN r30409. --- orte/mca/sensor/coretemp/Makefile.am | 2 +- orte/mca/sensor/coretemp/sensor_coretemp.c | 73 ++++++++++++-------- orte/mca/sensor/freq/Makefile.am | 2 +- orte/mca/sensor/freq/sensor_freq.c | 54 +++++++++++---- orte/mca/sensor/ft_tester/sensor_ft_tester.c | 3 +- orte/mca/sensor/heartbeat/sensor_heartbeat.c | 2 +- orte/mca/sensor/pwr/sensor_pwr.c | 49 +++++++++---- orte/mca/sensor/sigar/Makefile.am | 2 +- 8 files changed, 129 insertions(+), 58 deletions(-) diff --git a/orte/mca/sensor/coretemp/Makefile.am b/orte/mca/sensor/coretemp/Makefile.am index 84ee176c52..d622de089c 100644 --- a/orte/mca/sensor/coretemp/Makefile.am +++ b/orte/mca/sensor/coretemp/Makefile.am @@ -8,7 +8,7 @@ # $HEADER$ # -dist_pkgdata_DATA = help-orte-sensor-coretemp.txt +dist_ompidata_DATA = help-orte-sensor-coretemp.txt sources = \ sensor_coretemp.c \ diff --git a/orte/mca/sensor/coretemp/sensor_coretemp.c b/orte/mca/sensor/coretemp/sensor_coretemp.c index 2646c0223f..1f5952c7f1 100644 --- a/orte/mca/sensor/coretemp/sensor_coretemp.c +++ b/orte/mca/sensor/coretemp/sensor_coretemp.c @@ -67,13 +67,13 @@ typedef struct { char *label; float critical_temp; float max_temp; -} core_tracker_t; -static void ctr_con(core_tracker_t *trk) +} coretemp_tracker_t; +static void ctr_con(coretemp_tracker_t *trk) { trk->file = NULL; trk->label = NULL; } -static void ctr_des(core_tracker_t *trk) +static void ctr_des(coretemp_tracker_t *trk) { if (NULL != trk->file) { free(trk->file); @@ -82,7 +82,7 @@ static void ctr_des(core_tracker_t *trk) free(trk->label); } } -OBJ_CLASS_INSTANCE(core_tracker_t, +OBJ_CLASS_INSTANCE(coretemp_tracker_t, opal_list_item_t, ctr_con, ctr_des); @@ -111,29 +111,22 @@ static char *orte_getline(FILE *fp) */ static int init(void) { - int ret; DIR *cur_dirp = NULL, *tdir; struct dirent *dir_entry, *entry; char *dirname, *filename, *ptr, *tmp; size_t tlen = strlen("temp"); size_t ilen = strlen("_input"); FILE *fp; - core_tracker_t *trk; + coretemp_tracker_t *trk; int socket; OBJ_CONSTRUCT(&tracking, opal_list_t); - if (ORTE_SUCCESS != (ret = opal_os_dirpath_access("/sys/bus/platform/devices", 0))) { - /* if the directory doesn't exist, or we don't have - * access to it, then disqualify us - */ - return ret; - } - /* * Open up the base directory so we can get a listing */ if (NULL == (cur_dirp = opendir("/sys/bus/platform/devices"))) { + OBJ_DESTRUCT(&tracking); return ORTE_ERROR; } @@ -175,7 +168,7 @@ static int init(void) continue; } /* track the info for this core */ - trk = OBJ_NEW(core_tracker_t); + trk = OBJ_NEW(coretemp_tracker_t); trk->socket = socket; trk->file = opal_os_path(false, dirname, entry->d_name, NULL); /* take the part up to the first underscore as this will @@ -252,7 +245,7 @@ static void stop(orte_jobid_t jobid) static void coretemp_sample(void) { int ret; - core_tracker_t *trk; + coretemp_tracker_t *trk; FILE *fp; char *temp; float degc; @@ -261,9 +254,20 @@ static void coretemp_sample(void) time_t now; char time_str[40]; char *timestamp_str; + bool packed; /* prep to store the results */ OBJ_CONSTRUCT(&data, opal_buffer_t); + packed = false; + + /* pack our name */ + temp = strdup("coretemp"); + if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&data); + return; + } + free(temp); /* store our hostname */ if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) { @@ -293,7 +297,7 @@ static void coretemp_sample(void) } free(timestamp_str); - OPAL_LIST_FOREACH(trk, &tracking, core_tracker_t) { + OPAL_LIST_FOREACH(trk, &tracking, coretemp_tracker_t) { /* read the temp */ fp = fopen(trk->file, "r"); while (NULL != (temp = orte_getline(fp))) { @@ -309,6 +313,7 @@ static void coretemp_sample(void) return; } free(temp); + packed = true; /* check for exceed critical temp */ if (trk->critical_temp < degc) { /* alert the errmgr - this is a critical problem */ @@ -328,11 +333,13 @@ static void coretemp_sample(void) } /* xfer the data for transmission */ - bptr = &data; - if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { - ORTE_ERROR_LOG(ret); - OBJ_DESTRUCT(&data); - return; + if (packed) { + bptr = &data; + if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&data); + return; + } } OBJ_DESTRUCT(&data); } @@ -377,7 +384,7 @@ static void coretemp_log(opal_buffer_t *sample) (NULL == hostname) ? "NULL" : hostname, ncores); /* xfr to storage */ - kv = malloc((ncores+1) * sizeof(opal_value_t)); + kv = malloc((ncores+2) * sizeof(opal_value_t)); /* load the sample time at the start */ OBJ_CONSTRUCT(&kv[0], opal_value_t); @@ -386,27 +393,37 @@ static void coretemp_log(opal_buffer_t *sample) kv[0].data.string = strdup(sampletime); free(sampletime); + /* load the hostname */ + OBJ_CONSTRUCT(&kv[1], opal_value_t); + kv[1].key = strdup("hostname"); + kv[1].type = OPAL_STRING; + kv[1].data.string = strdup(hostname); + + /* protect against segfault if we jump to cleanup */ for (i=0; i < ncores; i++) { - OBJ_CONSTRUCT(&kv[i+1], opal_value_t); - asprintf(&kv[i+1].key, "core%d", i); - kv[i+1].type = OPAL_FLOAT; + OBJ_CONSTRUCT(&kv[i+2], opal_value_t); + } + + for (i=0; i < ncores; i++) { + asprintf(&kv[i+2].key, "core%d", i); + kv[i+2].type = OPAL_FLOAT; n=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) { ORTE_ERROR_LOG(rc); goto cleanup; } - kv[i+1].data.fval = fval; + kv[i+2].data.fval = fval; } /* store it */ - if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, ncores+1))) { + if (ORTE_SUCCESS != (rc = opal_db.add_log("coretemp", kv, ncores+2))) { /* don't bark about it - just quietly disable the log */ log_enabled = false; } cleanup: /* cleanup the xfr storage */ - for (i=0; i < ncores+1; i++) { + for (i=0; i < ncores+2; i++) { OBJ_DESTRUCT(&kv[i]); } if (NULL != hostname) { diff --git a/orte/mca/sensor/freq/Makefile.am b/orte/mca/sensor/freq/Makefile.am index 8489bf49d0..36739a69e0 100644 --- a/orte/mca/sensor/freq/Makefile.am +++ b/orte/mca/sensor/freq/Makefile.am @@ -8,7 +8,7 @@ # $HEADER$ # -dist_pkgdata_DATA = help-orte-sensor-freq.txt +dist_ompidata_DATA = help-orte-sensor-freq.txt sources = \ sensor_freq.c \ diff --git a/orte/mca/sensor/freq/sensor_freq.c b/orte/mca/sensor/freq/sensor_freq.c index 43041962bc..ff67f690da 100644 --- a/orte/mca/sensor/freq/sensor_freq.c +++ b/orte/mca/sensor/freq/sensor_freq.c @@ -219,6 +219,7 @@ static void freq_sample(void) time_t now; char time_str[40]; char *timestamp_str; + bool packed; opal_output_verbose(2, orte_sensor_base_framework.framework_output, "%s sampling freq", @@ -226,6 +227,16 @@ static void freq_sample(void) /* prep to store the results */ OBJ_CONSTRUCT(&data, opal_buffer_t); + packed = false; + + /* pack our name */ + freq = strdup("freq"); + if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &freq, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&data); + return; + } + free(freq); /* store our hostname */ if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) { @@ -256,6 +267,10 @@ static void freq_sample(void) free(timestamp_str); OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) { + opal_output_verbose(2, orte_sensor_base_framework.framework_output, + "%s processing freq file %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + trk->file); /* read the temp */ if (NULL == (fp = fopen(trk->file, "r"))) { continue; @@ -272,17 +287,20 @@ static void freq_sample(void) free(freq); return; } + packed = true; free(freq); } fclose(fp); } /* xfer the data for transmission */ - bptr = &data; - if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { - ORTE_ERROR_LOG(ret); - OBJ_DESTRUCT(&data); - return; + if (packed) { + bptr = &data; + if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&data); + return; + } } OBJ_DESTRUCT(&data); } @@ -322,12 +340,12 @@ static void freq_log(opal_buffer_t *sample) } opal_output_verbose(3, orte_sensor_base_framework.framework_output, - "%s Received log from host %s with %d cores", + "%s Received freq log from host %s with %d cores", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == hostname) ? "NULL" : hostname, ncores); /* xfr to storage */ - kv = malloc((ncores+1) * sizeof(opal_value_t)); + kv = malloc((ncores+2) * sizeof(opal_value_t)); /* load the sample time at the start */ OBJ_CONSTRUCT(&kv[0], opal_value_t); @@ -336,27 +354,37 @@ static void freq_log(opal_buffer_t *sample) kv[0].data.string = strdup(sampletime); free(sampletime); + /* load the hostname */ + OBJ_CONSTRUCT(&kv[1], opal_value_t); + kv[1].key = strdup("hostname"); + kv[1].type = OPAL_STRING; + kv[1].data.string = strdup(hostname); + + /* protect against segfault if we jump to cleanup */ for (i=0; i < ncores; i++) { - OBJ_CONSTRUCT(&kv[i+1], opal_value_t); - asprintf(&kv[i+1].key, "core%d", i); - kv[i+1].type = OPAL_FLOAT; + OBJ_CONSTRUCT(&kv[i+2], opal_value_t); + } + + for (i=0; i < ncores; i++) { + asprintf(&kv[i+2].key, "core%d", i); + kv[i+2].type = OPAL_FLOAT; n=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) { ORTE_ERROR_LOG(rc); goto cleanup; } - kv[i+1].data.fval = fval; + kv[i+2].data.fval = fval; } /* store it */ - if (ORTE_SUCCESS != (rc = opal_db.add_log("freq", kv, ncores+1))) { + if (ORTE_SUCCESS != (rc = opal_db.add_log("freq", kv, ncores+2))) { /* don't bark about it - just quietly disable the log */ log_enabled = false; } cleanup: /* cleanup the xfr storage */ - for (i=0; i < ncores+1; i++) { + for (i=0; i < ncores+2; i++) { OBJ_DESTRUCT(&kv[i]); } if (NULL != hostname) { diff --git a/orte/mca/sensor/ft_tester/sensor_ft_tester.c b/orte/mca/sensor/ft_tester/sensor_ft_tester.c index 3efd4ab6dc..efee7a18e1 100644 --- a/orte/mca/sensor/ft_tester/sensor_ft_tester.c +++ b/orte/mca/sensor/ft_tester/sensor_ft_tester.c @@ -2,6 +2,7 @@ * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2014 Intel, Inc. All rights reserved. * * $COPYRIGHT$ * @@ -62,7 +63,7 @@ static void sample(void) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* are we including ourselves? */ - if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_CMSLAVE) && + if (ORTE_PROC_IS_DAEMON && 0 < mca_sensor_ft_tester_component.daemon_fail_prob) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, "%s sample:ft_tester considering killing me!", diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.c b/orte/mca/sensor/heartbeat/sensor_heartbeat.c index 715d01ad96..d4c5137018 100644 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat.c +++ b/orte/mca/sensor/heartbeat/sensor_heartbeat.c @@ -76,7 +76,7 @@ static int init(void) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* setup to receive heartbeats */ - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_CM) { + if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) { orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT, ORTE_RML_PERSISTENT, diff --git a/orte/mca/sensor/pwr/sensor_pwr.c b/orte/mca/sensor/pwr/sensor_pwr.c index 5c7d56c703..85f5c1be93 100644 --- a/orte/mca/sensor/pwr/sensor_pwr.c +++ b/orte/mca/sensor/pwr/sensor_pwr.c @@ -209,6 +209,8 @@ static void pwr_sample(void) long long value; int fd, ret; float power; + char *temp; + bool packed; opal_output_verbose(2, orte_sensor_base_framework.framework_output, "%s sampling power", @@ -216,6 +218,16 @@ static void pwr_sample(void) /* prep to store the results */ OBJ_CONSTRUCT(&data, opal_buffer_t); + packed = false; + + /* pack our name */ + temp = strdup("pwr"); + if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &temp, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&data); + return; + } + free(temp); /* store our hostname */ if (OPAL_SUCCESS != (ret = opal_dss.pack(&data, &orte_process_info.nodename, 1, OPAL_STRING))) { @@ -266,15 +278,18 @@ static void pwr_sample(void) close(fd); return; } + packed = true; close(fd); } /* xfer the data for transmission */ - bptr = &data; - if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { - ORTE_ERROR_LOG(ret); - OBJ_DESTRUCT(&data); - return; + if (packed) { + bptr = &data; + if (OPAL_SUCCESS != (ret = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&data); + return; + } } OBJ_DESTRUCT(&data); } @@ -319,7 +334,7 @@ static void pwr_log(opal_buffer_t *sample) (NULL == hostname) ? "NULL" : hostname, ncores); /* xfr to storage */ - kv = malloc((ncores+1) * sizeof(opal_value_t)); + kv = malloc((ncores+2) * sizeof(opal_value_t)); /* load the sample time at the start */ OBJ_CONSTRUCT(&kv[0], opal_value_t); @@ -328,27 +343,37 @@ static void pwr_log(opal_buffer_t *sample) kv[0].data.string = strdup(sampletime); free(sampletime); + /* load the hostname */ + OBJ_CONSTRUCT(&kv[1], opal_value_t); + kv[1].key = strdup("hostname"); + kv[1].type = OPAL_STRING; + kv[1].data.string = strdup(hostname); + + /* protect against segfault if we jump to cleanup */ for (i=0; i < ncores; i++) { - OBJ_CONSTRUCT(&kv[i+1], opal_value_t); - asprintf(&kv[i+1].key, "core%d", i); - kv[i+1].type = OPAL_FLOAT; + OBJ_CONSTRUCT(&kv[i+2], opal_value_t); + } + + for (i=0; i < ncores; i++) { + asprintf(&kv[i+2].key, "core%d", i); + kv[i+2].type = OPAL_FLOAT; n=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &fval, &n, OPAL_FLOAT))) { ORTE_ERROR_LOG(rc); goto cleanup; } - kv[i+1].data.fval = fval; + kv[i+2].data.fval = fval; } /* store it */ - if (ORTE_SUCCESS != (rc = opal_db.add_log("pwr", kv, ncores+1))) { + if (ORTE_SUCCESS != (rc = opal_db.add_log("pwr", kv, ncores+2))) { /* don't bark about it - just quietly disable the log */ log_enabled = false; } cleanup: /* cleanup the xfr storage */ - for (i=0; i < ncores+1; i++) { + for (i=0; i < ncores+2; i++) { OBJ_DESTRUCT(&kv[i]); } if (NULL != hostname) { diff --git a/orte/mca/sensor/sigar/Makefile.am b/orte/mca/sensor/sigar/Makefile.am index f20b050c80..12e974df11 100644 --- a/orte/mca/sensor/sigar/Makefile.am +++ b/orte/mca/sensor/sigar/Makefile.am @@ -8,7 +8,7 @@ # $HEADER$ # -dist_pkgdata_DATA = help-orte-sensor-sigar.txt +dist_ompidata_DATA = help-orte-sensor-sigar.txt sources = \ sensor_sigar.c \