1
1
openmpi/orte/mca/rtc/freq/rtc_freq.c
Ralph Castain 03234f2a33 Revert r31926 and replace it with a more complete checking of availability and accessibility of the required freq control paths.
This commit was SVN r31927.

The following SVN revision numbers were found above:
  r31926 --> open-mpi/ompi@9779084352
2014-06-02 14:34:00 +00:00

610 lines
22 KiB
C

/*
* Copyright (c) 2014 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#include <ctype.h>
#include "opal/class/opal_list.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/os_path.h"
#include "opal/util/output.h"
#include "opal/util/os_dirpath.h"
#include "orte/util/show_help.h"
#include "orte/util/error_strings.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/rtc/base/base.h"
#include "rtc_freq.h"
/*
 * Forward declarations of the module entry points. They are defined
 * further down in this file and are only reachable from outside through
 * the orte_rtc_freq_module function table.
 */
static int init(void);
static void finalize(void);
static void assign(orte_job_t *jdata);
static void set(orte_job_t *jdata,
orte_proc_t *proc,
char ***environ_copy,
int write_fd);
static void getvals(opal_list_t *vals);
/*
 * Function table exported by this component. The initializer is
 * positional, so the order of the entries must match the field order of
 * orte_rtc_base_module_t (declared in the rtc base headers, not here).
 */
orte_rtc_base_module_t orte_rtc_freq_module = {
init,
finalize,
assign,
set,
getvals
};
/*
 * Per-core cpufreq state. One instance is created by init() for every
 * usable cpuN directory and kept on the file-level "tracking" list.
 * Frequencies are stored as the sysfs value divided by 1,000,000, which
 * is the inverse of the scaling set() applies when writing values back.
 */
typedef struct {
/* list linkage - must stay first so the object can live on an opal_list_t */
opal_list_item_t super;
/* core id parsed from the trailing digits of the cpuN directory name */
int core;
/* path of this core's "cpufreq" directory under /sys/devices/system/cpu */
char *directory;
/* save the system settings so we can restore them when we die
 * NOTE(review): no code in this file currently writes them back */
char *system_governor;
float system_max_freq;
float system_min_freq;
/* save the current settings so we only change them when required
 * NOTE(review): set() compares against these but never updates them */
char *current_governor;
float current_max_freq;
float current_min_freq;
/* keep a list of allowed values: opal_value_t items holding a string
 * per governor and a float per frequency */
opal_list_t governors;
opal_list_t frequencies;
/* mark if setspeed is supported, i.e. init() could open scaling_setspeed */
bool setspeed;
} corefreq_tracker_t;
/*
 * Constructor for corefreq_tracker_t: both value lists start out empty,
 * no strings are owned yet, and setspeed is assumed unsupported until
 * init() proves otherwise. The numeric fields are left for init() to fill.
 */
static void ctr_con(corefreq_tracker_t *trk)
{
    OBJ_CONSTRUCT(&trk->frequencies, opal_list_t);
    OBJ_CONSTRUCT(&trk->governors, opal_list_t);
    trk->setspeed = false;
    trk->current_governor = NULL;
    trk->system_governor = NULL;
    trk->directory = NULL;
}
/*
 * Destructor for corefreq_tracker_t: give back the three owned strings
 * and tear down both value lists.
 */
static void ctr_des(corefreq_tracker_t *trk)
{
    /* free(NULL) is a no-op, so strings that were never set need no guard */
    free(trk->current_governor);
    free(trk->system_governor);
    free(trk->directory);

    OPAL_LIST_DESTRUCT(&trk->frequencies);
    OPAL_LIST_DESTRUCT(&trk->governors);
}
/*
 * Class instance for corefreq_tracker_t: opal_list_item_t is named as the
 * parent class, ctr_con as the constructor and ctr_des as the destructor.
 */
OBJ_CLASS_INSTANCE(corefreq_tracker_t,
opal_list_item_t,
ctr_con, ctr_des);
/*
 * Read the next line from fp and return a heap copy of it with all
 * trailing whitespace (including the newline) removed.
 *
 * At most 1023 characters are consumed per call, so a longer line is
 * returned in pieces by successive calls. A line that contains only
 * whitespace yields an empty string, not NULL.
 *
 * @param fp  open stream to read from
 * @return    newly allocated string the caller must free(), or NULL on
 *            EOF, read error, or allocation failure
 */
static char *orte_getline(FILE *fp)
{
    char input[1024];
    char *buff;
    size_t len;

    if (NULL == fgets(input, sizeof(input), fp)) {
        return NULL;
    }

    /* trim the end of the line - walk back from the terminator so that
     * the very first character is examined too */
    len = strlen(input);
    while (0 < len && isspace((unsigned char)input[len - 1])) {
        len--;
    }
    input[len] = '\0';

    /* the length is already known, so copy it directly */
    buff = malloc(len + 1);
    if (NULL != buff) {
        memcpy(buff, input, len + 1);
    }
    return buff;
}
/* List of corefreq_tracker_t objects, one per core discovered by init();
 * constructed in init() and destructed in finalize(). */
static opal_list_t tracking;
/*
 * Open <dir>/<name> for reading and fetch its first line via orte_getline().
 *
 * @param dir   directory holding the control file
 * @param name  file name inside that directory
 * @param line  out: the line read (caller frees), or NULL if the file
 *              could be opened but produced no line
 * @return      true if the file could be opened, false otherwise
 */
static bool rtc_freq_read_line(const char *dir, const char *name, char **line)
{
    char *filename;
    FILE *fp;

    *line = NULL;
    if (NULL == (filename = opal_os_path(false, dir, name, NULL))) {
        return false;
    }
    /* NOTE(review): the original mode string here was "rw", which is not
     * a valid fopen() mode. Common C libraries ignore the stray 'w' and
     * open read-only, so "r" keeps the existing behavior. If the intent
     * was to also verify write access, this should become "r+" - confirm. */
    fp = fopen(filename, "r");
    free(filename);
    if (NULL == fp) {
        return false;
    }
    *line = orte_getline(fp);
    fclose(fp);
    return true;
}

/*
 * Module init: scan /sys/devices/system/cpu for cpuN directories and
 * record, for every core whose cpufreq control files can be read, its
 * governor, min/max frequency, the lists of available governors and
 * frequencies, and whether scaling_setspeed can be opened. Cores with a
 * missing or unreadable control file are silently skipped.
 *
 * @return ORTE_SUCCESS if at least one core was recorded on the
 *         "tracking" list; ORTE_ERROR if the base directory cannot be
 *         opened or no usable core was found.
 */
static int init(void)
{
    int k;
    DIR *cur_dirp = NULL;
    struct dirent *entry;
    char *filename, *tmp, **vals;
    FILE *fp;
    corefreq_tracker_t *trk;
    opal_value_t *kv;

    /* always construct this so we don't segfault in finalize */
    OBJ_CONSTRUCT(&tracking, opal_list_t);

    /*
     * Open up the base directory so we can get a listing
     */
    if (NULL == (cur_dirp = opendir("/sys/devices/system/cpu"))) {
        OBJ_DESTRUCT(&tracking);
        if (4 < opal_output_get_verbosity(orte_rtc_base_framework.framework_output)) {
            orte_show_help("help-rtc-freq.txt", "req-dir-not-found",
                           true, orte_process_info.nodename,
                           "/sys/devices/system/cpu");
        }
        return ORTE_ERROR;
    }

    /*
     * For each directory
     */
    while (NULL != (entry = readdir(cur_dirp))) {
        /* look for cpu directories - this also rejects "." and ".." */
        if (0 != strncmp(entry->d_name, "cpu", strlen("cpu"))) {
            continue;
        }
        /* if it ends in other than a digit, then it isn't a cpu directory */
        k = (int)strlen(entry->d_name) - 1;
        if (!isdigit((unsigned char)entry->d_name[k])) {
            continue;
        }
        /* trailing digits are the core id: walk back to the last
         * non-digit (at the latest the 'u' of "cpu") */
        while (0 <= k && isdigit((unsigned char)entry->d_name[k])) {
            k--;
        }

        /* track the info for this core */
        trk = OBJ_NEW(corefreq_tracker_t);
        /* the id starts one past the non-digit we stopped on */
        trk->core = strtoul(&entry->d_name[k + 1], NULL, 10);
        trk->directory = opal_os_path(false, "/sys/devices/system/cpu", entry->d_name, "cpufreq", NULL);
        if (NULL == trk->directory) {
            OBJ_RELEASE(trk);
            continue;
        }

        /* read/save the current settings - a file that opens but yields
         * no line is treated like a missing file */
        if (!rtc_freq_read_line(trk->directory, "scaling_governor", &trk->system_governor) ||
            NULL == trk->system_governor) {
            OBJ_RELEASE(trk);
            continue;
        }
        trk->current_governor = strdup(trk->system_governor);

        if (!rtc_freq_read_line(trk->directory, "scaling_max_freq", &tmp) || NULL == tmp) {
            OBJ_RELEASE(trk);
            continue;
        }
        trk->system_max_freq = strtoul(tmp, NULL, 10) / 1000000.0;
        trk->current_max_freq = trk->system_max_freq;
        free(tmp);

        if (!rtc_freq_read_line(trk->directory, "scaling_min_freq", &tmp) || NULL == tmp) {
            OBJ_RELEASE(trk);
            continue;
        }
        trk->system_min_freq = strtoul(tmp, NULL, 10) / 1000000.0;
        trk->current_min_freq = trk->system_min_freq;
        free(tmp);

        /* get the list of available governors - the file must exist,
         * but an empty one just leaves the list empty */
        if (!rtc_freq_read_line(trk->directory, "scaling_available_governors", &tmp)) {
            OBJ_RELEASE(trk);
            continue;
        }
        if (NULL != tmp) {
            vals = opal_argv_split(tmp, ' ');
            free(tmp);
            /* the split may yield no array at all for a blank line */
            if (NULL != vals) {
                for (k = 0; NULL != vals[k]; k++) {
                    kv = OBJ_NEW(opal_value_t);
                    kv->type = OPAL_STRING;
                    kv->data.string = strdup(vals[k]);
                    opal_list_append(&trk->governors, &kv->super);
                }
                opal_argv_free(vals);
            }
        }

        /* get the list of available frequencies - same rules as above */
        if (!rtc_freq_read_line(trk->directory, "scaling_available_frequencies", &tmp)) {
            OBJ_RELEASE(trk);
            continue;
        }
        if (NULL != tmp) {
            vals = opal_argv_split(tmp, ' ');
            free(tmp);
            if (NULL != vals) {
                for (k = 0; NULL != vals[k]; k++) {
                    kv = OBJ_NEW(opal_value_t);
                    kv->type = OPAL_FLOAT;
                    kv->data.fval = strtoul(vals[k], NULL, 10) / 1000000.0;
                    opal_list_append(&trk->frequencies, &kv->super);
                }
                opal_argv_free(vals);
            }
        }

        /* see if setspeed is supported - only close the stream if the
         * open actually succeeded */
        filename = opal_os_path(false, trk->directory, "scaling_setspeed", NULL);
        if (NULL != filename) {
            if (NULL != (fp = fopen(filename, "r"))) {
                trk->setspeed = true;
                fclose(fp);
            }
            free(filename);
        }

        /* add to our list */
        opal_list_append(&tracking, &trk->super);
    }
    closedir(cur_dirp);

    if (0 == opal_list_get_size(&tracking)) {
        /* nothing to read */
        if (0 < opal_output_get_verbosity(orte_rtc_base_framework.framework_output)) {
            /* NOTE(review): every other message in this file comes from
             * "help-rtc-freq.txt"; this file name looks like a leftover
             * from another component - confirm the topic exists there */
            orte_show_help("help-orcm-sensor-freq.txt", "no-cores-found",
                           true, orte_process_info.nodename);
        }
        OPAL_LIST_DESTRUCT(&tracking);
        return ORTE_ERROR;
    }

    /* report out the results, if requested */
    if (9 < opal_output_get_verbosity(orte_rtc_base_framework.framework_output)) {
        OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) {
            opal_output(0, "%s\tCore: %d Governor: %s MaxFreq: %f MinFreq: %f\n",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), trk->core,
                        trk->system_governor, trk->system_max_freq, trk->system_min_freq);
            OPAL_LIST_FOREACH(kv, &trk->governors, opal_value_t) {
                opal_output(0, "%s\t\tGovernor: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), kv->data.string);
            }
            OPAL_LIST_FOREACH(kv, &trk->frequencies, opal_value_t) {
                opal_output(0, "%s\t\tFrequency: %f",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), kv->data.fval);
            }
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Module teardown: hand the per-core tracking list built by init() to
 * OPAL_LIST_DESTRUCT.
 */
static void finalize(void)
{
    OPAL_LIST_DESTRUCT(&tracking);
}
/*
 * Attach frequency-control attributes to a job.
 *
 * For max freq, min freq and governor in turn: an attribute already on
 * the job is left alone; otherwise the component-level default, if one
 * was provided, is stored on the job as a global string attribute. When
 * no governor is available from either source but a frequency was, the
 * "userspace" governor is assigned.
 *
 * @param jdata  job whose attribute list is inspected and updated
 */
static void assign(orte_job_t *jdata)
{
    bool have_freq = false;

    opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                        "%s Assigning freq controls to job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* max freq: fall back to the component default only if the job has none */
    if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_MAX_FREQ, NULL, OPAL_STRING)) {
        if (NULL != mca_rtc_freq_component.max_freq) {
            opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                                "%s Assigning default max freq control to job %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_JOBID_PRINT(jdata->jobid));
            orte_set_attribute(&jdata->attributes, ORTE_JOB_MAX_FREQ, ORTE_ATTR_GLOBAL,
                               mca_rtc_freq_component.max_freq, OPAL_STRING);
            have_freq = true;
        }
    } else {
        opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                            "%s Assigning max freq given for job %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_JOBID_PRINT(jdata->jobid));
        have_freq = true;
    }

    /* min freq: same pattern */
    if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_MIN_FREQ, NULL, OPAL_STRING)) {
        if (NULL != mca_rtc_freq_component.min_freq) {
            opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                                "%s Assigning default minfreq controls to job %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_JOBID_PRINT(jdata->jobid));
            orte_set_attribute(&jdata->attributes, ORTE_JOB_MIN_FREQ, ORTE_ATTR_GLOBAL,
                               mca_rtc_freq_component.min_freq, OPAL_STRING);
            have_freq = true;
        }
    } else {
        opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                            "%s Assigning min freq controls to job %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_JOBID_PRINT(jdata->jobid));
        have_freq = true;
    }

    /* governor: nothing more to do when the job already carries one */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_GOVERNOR, NULL, OPAL_STRING)) {
        return;
    }
    opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                        "%s Assigning freq governor to job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* a component-level default wins over the implicit choice below */
    if (NULL != mca_rtc_freq_component.governor) {
        opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                            "%s Assigning default freq governor to job %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_JOBID_PRINT(jdata->jobid));
        orte_set_attribute(&jdata->attributes, ORTE_JOB_GOVERNOR, ORTE_ATTR_GLOBAL,
                           mca_rtc_freq_component.governor, OPAL_STRING);
        return;
    }

    if (!have_freq) {
        return;
    }
    /* a frequency was requested, so default to the userspace governor
     * to ensure the frequency can actually be set */
    opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                        "%s Assigning default userspace governor to job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid));
    orte_set_attribute(&jdata->attributes, ORTE_JOB_GOVERNOR, ORTE_ATTR_GLOBAL,
                       "userspace", OPAL_STRING);
}
static void set(orte_job_t *jdata,
orte_proc_t *child,
char ***environ_copy,
int write_fd)
{
char *governor, *tmp, **vals;
corefreq_tracker_t *trk;
opal_value_t *kv;
float freq, *fptr, minfreq;
bool setspeed_used = false;
bool allowed;
char *filename;
FILE *fp;
opal_output_verbose(2, orte_rtc_base_framework.framework_output,
"%s Setting freq controls for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid));
/* see if the job has the governor attribute set */
governor = NULL;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_GOVERNOR, (void**)&governor, OPAL_STRING)) {
/* loop thru all the cpus on this node */
OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) {
/* does the requested value match the current setting? */
if (0 == strcmp(trk->current_governor, governor)) {
continue;
}
/* is the specified governor among those allowed? */
allowed = false;
OPAL_LIST_FOREACH(kv, &trk->governors, opal_value_t) {
if (0 == strcmp(kv->data.string, governor)) {
allowed = true;
break;
}
}
if (!allowed) {
vals = NULL;
OPAL_LIST_FOREACH(kv, &trk->governors, opal_value_t) {
opal_argv_append_nosize(&vals, kv->data.string);
}
tmp = opal_argv_join(vals, ',');
opal_argv_free(vals);
orte_show_help("help-rtc-freq.txt", "unsupported-governor", true,
orte_process_info.nodename, governor, tmp);
free(tmp);
/* generate an error so the errmgr can resolve it */
return;
}
/* attempt to set the value */
filename = opal_os_path(false, trk->directory, "scaling_governor", NULL);
if (NULL == (fp = fopen(filename, "w"))) {
/* not allowed - report the error */
orte_show_help("help-rtc-freq.txt", "permission-denied", true,
"governor", orte_process_info.nodename, filename);
free(filename);
return;
}
opal_output_verbose(2, orte_rtc_base_framework.framework_output,
"%s Setting governor %s for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), governor,
ORTE_JOBID_PRINT(jdata->jobid));
fprintf(fp, "%s\n", governor);
fclose(fp);
free(filename);
}
}
/* see if the job has the min freq attribute set */
fptr = &minfreq;
minfreq = -1.0;
orte_get_attribute(&jdata->attributes, ORTE_JOB_MIN_FREQ, (void**)&fptr, OPAL_FLOAT);
/* see if the job has the max freq attribute set */
fptr = &freq;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_MAX_FREQ, (void**)&fptr, OPAL_FLOAT)) {
/* loop thru all the cpus on this node */
OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) {
/* does the requested value match the current setting? */
if (trk->current_max_freq == freq) {
continue;
}
/* is the specified frequency among those allowed? */
allowed = false;
OPAL_LIST_FOREACH(kv, &trk->frequencies, opal_value_t) {
if (kv->data.fval == freq) {
allowed = true;
break;
}
}
if (!allowed) {
vals = NULL;
OPAL_LIST_FOREACH(kv, &trk->frequencies, opal_value_t) {
asprintf(&tmp, "%f", kv->data.fval);
opal_argv_append_nosize(&vals, tmp);
free(tmp);
}
tmp = opal_argv_join(vals, ',');
opal_argv_free(vals);
orte_show_help("help-rtc-freq.txt", "unsupported-freq", true, freq, tmp);
free(tmp);
/* generate an error so the errmgr can resolve it */
return;
}
/* if we got a min freq and the two are the same, then use setspeed if supported */
if (minfreq == freq && trk->setspeed) {
filename = opal_os_path(false, trk->directory, "scaling_setspeed", NULL);
setspeed_used = true;
} else {
filename = opal_os_path(false, trk->directory, "scaling_max_freq", NULL);
}
/* attempt to set the value */
if (NULL == (fp = fopen(filename, "w"))) {
/* not allowed - report the error */
orte_show_help("help-rtc-freq.txt", "permission-denied", true,
"max freq", orte_process_info.nodename, filename);
free(filename);
return;
}
opal_output_verbose(2, orte_rtc_base_framework.framework_output,
"%s Setting %s freq controls to %ld for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
setspeed_used ? "cpu" : "max",
(unsigned long)(freq * 1000000.0),
ORTE_JOBID_PRINT(jdata->jobid));
fprintf(fp, "%ld\n", (unsigned long)(freq * 1000000.0));
fclose(fp);
free(filename);
}
}
if (!setspeed_used && 0.0 < minfreq) {
/* need to process the min freq value - loop thru all the cpus on this node */
OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) {
/* does the requested value match the current setting? */
if (trk->current_min_freq == minfreq) {
continue;
}
/* is the specified frequency among those allowed? */
allowed = false;
OPAL_LIST_FOREACH(kv, &trk->frequencies, opal_value_t) {
if (kv->data.fval == minfreq) {
allowed = true;
break;
}
}
if (!allowed) {
vals = NULL;
OPAL_LIST_FOREACH(kv, &trk->frequencies, opal_value_t) {
asprintf(&tmp, "%f", kv->data.fval);
opal_argv_append_nosize(&vals, tmp);
free(tmp);
}
tmp = opal_argv_join(vals, ',');
opal_argv_free(vals);
orte_show_help("help-rtc-freq.txt", "unsupported-freq", true, minfreq, tmp);
free(tmp);
/* generate an error so the errmgr can resolve it */
return;
}
filename = opal_os_path(false, trk->directory, "scaling_min_freq", NULL);
/* attempt to set the value */
if (NULL == (fp = fopen(filename, "w"))) {
/* not allowed - report the error */
orte_show_help("help-rtc-freq.txt", "permission-denied", true,
"min freq", orte_process_info.nodename, filename);
free(filename);
return;
}
opal_output_verbose(2, orte_rtc_base_framework.framework_output,
"%s Setting min freq controls to %ld for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(unsigned long)(minfreq * 1000000.0),
ORTE_JOBID_PRINT(jdata->jobid));
fprintf(fp, "%ld\n", (unsigned long)(minfreq * 1000000.0));
fclose(fp);
free(filename);
}
}
}
/*
 * Report the controls this component can apply. For every tracked core
 * two orte_rtc_resource_t entries are appended to vals, both with
 * category "core-<id>": one with key "governors" and one with key
 * "frequencies", each carrying the available values as a comma-separated
 * string.
 *
 * @param vals  list that receives the new resource objects
 */
static void getvals(opal_list_t *vals)
{
    corefreq_tracker_t *trk;
    orte_rtc_resource_t *res;
    opal_value_t *kv;
    char *tmp, **args;

    /* every resource object is created inside the loop; allocating one
     * up front would only leak it */
    OPAL_LIST_FOREACH(trk, &tracking, corefreq_tracker_t) {
        /* available governors for this core */
        res = OBJ_NEW(orte_rtc_resource_t);
        res->component = strdup(mca_rtc_freq_component.super.base_version.mca_component_name);
        asprintf(&res->category, "core-%d", trk->core);
        opal_list_append(vals, &res->super);
        args = NULL;
        OPAL_LIST_FOREACH(kv, &trk->governors, opal_value_t) {
            opal_argv_append_nosize(&args, kv->data.string);
        }
        res->control.key = strdup("governors");
        res->control.type = OPAL_STRING;
        res->control.data.string = opal_argv_join(args, ',');
        opal_argv_free(args);

        /* available frequencies for this core, formatted with "%f" */
        res = OBJ_NEW(orte_rtc_resource_t);
        res->component = strdup(mca_rtc_freq_component.super.base_version.mca_component_name);
        asprintf(&res->category, "core-%d", trk->core);
        opal_list_append(vals, &res->super);
        args = NULL;
        OPAL_LIST_FOREACH(kv, &trk->frequencies, opal_value_t) {
            asprintf(&tmp, "%f", kv->data.fval);
            opal_argv_append_nosize(&args, tmp);
            free(tmp);
        }
        res->control.key = strdup("frequencies");
        res->control.type = OPAL_STRING;
        res->control.data.string = opal_argv_join(args, ',');
        opal_argv_free(args);
    }
}