diff --git a/orte/mca/ras/alps/ras_alps.h b/orte/mca/ras/alps/ras_alps.h index add5fc79b4..69dde32d87 100644 --- a/orte/mca/ras/alps/ras_alps.h +++ b/orte/mca/ras/alps/ras_alps.h @@ -5,7 +5,7 @@ * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. @@ -13,9 +13,9 @@ * Copyright (c) 2011 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ /** @@ -34,7 +34,7 @@ BEGIN_C_DECLS ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_alps_component; ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_alps_module; -ORTE_DECLSPEC int orte_ras_alps_get_appinfo_attempts( int *attempts ); +ORTE_DECLSPEC int orte_ras_alps_get_appinfo_attempts(int *attempts); ORTE_DECLSPEC extern unsigned long int orte_ras_alps_res_id; END_C_DECLS diff --git a/orte/mca/ras/alps/ras_alps_component.c b/orte/mca/ras/alps/ras_alps_component.c index 24163c7b1d..09478e1d6a 100644 --- a/orte/mca/ras/alps/ras_alps_component.c +++ b/orte/mca/ras/alps/ras_alps_component.c @@ -5,7 +5,7 @@ * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. @@ -13,9 +13,9 @@ * Copyright (c) 2011 Los Alamos National Security, LLC. * All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -30,34 +30,29 @@ #include -/* - * Local variables - */ +/* Local variables */ static int param_priority; static int param_read_attempts; - -/* - * Local functions - */ +/* Local functions */ static int ras_alps_open(void); -static int orte_ras_alps_component_query(mca_base_module_t **module, int *priority); +static int orte_ras_alps_component_query(mca_base_module_t **module, + int *priority); unsigned long int orte_ras_alps_res_id; - orte_ras_base_component_t mca_ras_alps_component = { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - + /* First, the mca_base_component_t struct containing meta information about + * the component itself + * */ { ORTE_RAS_BASE_VERSION_2_0_0, - + /* Component name and version */ "alps", ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, ORTE_RELEASE_VERSION, - + /* Component open and close functions */ ras_alps_open, NULL, @@ -72,7 +67,7 @@ orte_ras_base_component_t mca_ras_alps_component = { /* simple function used to strip off characters on and after a period. NULL * will be returned upon failure. Otherwise, a "prepped" string will be * returned. The caller is responsible for freeing returned resources. - * for example: if jid is 138295.sdb, then 138295 will be returned. + * for example: if jid is 138295.sdb, then 138295 will be returned. */ static char * prep_job_id(const char *jid) @@ -92,9 +87,11 @@ prep_job_id(const char *jid) /* this function replicates some of the id setting functionality found in * ras-alps-command.sh. 
we wanted the ability to just "mpirun" the application - * without having to set an environment variable */ -static unsigned long int -get_res_id(void) { + * without having to set an environment variable + */ +static unsigned long int +get_res_id(void) +{ const char *apstat_cmd = "/usr/bin/apstat -r"; char *id = NULL; char read_buf[512]; @@ -137,24 +134,27 @@ get_res_id(void) { return jid; } -static int ras_alps_open(void) +static int +ras_alps_open(void) { - param_priority = - mca_base_param_reg_int(&mca_ras_alps_component.base_version, - "priority", - "Priority of the alps ras component", - false, false, 75, NULL); + param_priority = mca_base_param_reg_int( + &mca_ras_alps_component.base_version, + "priority", + "Priority of the alps ras component", + false, false, 75, NULL); - param_read_attempts = + param_read_attempts = mca_base_param_reg_int(&mca_ras_alps_component.base_version, "appinfo_read_attempts", - "Maximum number of attempts to read ALPS appinfo file", - false, false, 10, NULL); + "Maximum number of attempts to read ALPS " + "appinfo file", false, false, 10, NULL); return ORTE_SUCCESS; } -static int orte_ras_alps_component_query(mca_base_module_t **module, int *priority) +static int +orte_ras_alps_component_query(mca_base_module_t **module, + int *priority) { char *jid_str = NULL; /* default to an invalid value */ @@ -165,7 +165,7 @@ static int orte_ras_alps_component_query(mca_base_module_t **module, int *priori *module = NULL; return ORTE_ERROR; } - + /* Are we running under a ALPS job? 
*/ /* BASIL_RESERVATION_ID is the equivalent of OMPI_ALPS_RESID * on some systems @@ -188,15 +188,18 @@ static int orte_ras_alps_component_query(mca_base_module_t **module, int *priori /* Sadly, no */ opal_output(orte_ras_base.ras_output, - "ras:alps: NOT available for selection -- OMPI_ALPS_RESID or BASIL_RESERVATION_ID not set?"); + "ras:alps: NOT available for selection -- " + "OMPI_ALPS_RESID or BASIL_RESERVATION_ID not set?"); *module = NULL; return ORTE_ERROR; } -int orte_ras_alps_get_appinfo_attempts( int *attempts ) { - +int +orte_ras_alps_get_appinfo_attempts(int *attempts) +{ mca_base_param_lookup_int(param_read_attempts, attempts); opal_output_verbose(1, orte_ras_base.ras_output, - "ras:alps:orte_ras_alps_get_appinfo_attempts: %d", *attempts); + "ras:alps:orte_ras_alps_get_appinfo_attempts: %d", + *attempts); return ORTE_SUCCESS; } diff --git a/orte/mca/ras/alps/ras_alps_module.c b/orte/mca/ras/alps/ras_alps_module.c index 5f5739ab24..18eec823f5 100644 --- a/orte/mca/ras/alps/ras_alps_module.c +++ b/orte/mca/ras/alps/ras_alps_module.c @@ -5,7 +5,7 @@ * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. @@ -13,14 +13,22 @@ * Copyright (c) 2011 Los Alamos National Security, LLC. * All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ + #include "orte_config.h" #include "orte/constants.h" +#include "opal/mca/installdirs/installdirs.h" +#include "opal/util/output.h" +#include "orte/util/show_help.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ras/base/ras_private.h" +#include "ras_alps.h" + #include #include #include @@ -28,160 +36,323 @@ #include #include -#include "opal/mca/installdirs/installdirs.h" -#include "opal/util/output.h" -#include "orte/util/show_help.h" -#include "orte/mca/errmgr/errmgr.h" +typedef int (*parser_fn_t)(char **val_if_found, FILE *fp, + const char *var_name); -#include "ras_alps.h" -#include "orte/mca/ras/base/ras_private.h" +typedef struct orte_ras_alps_sysconfig_t { + /* path of file to parse */ + char *path; + /* target variable name */ + char *var_name; + /* parser to use */ + parser_fn_t parse; +} orte_ras_alps_sysconfig_t; - -/* - * Local functions - */ +/* /// Local Functions /// */ static int orte_ras_alps_allocate(opal_list_t *nodes); + static int orte_ras_alps_finalize(void); + static char *ras_alps_getline(FILE *fp); -static int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned int *uMe); +static int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, + char *filename, + unsigned int *uMe); -/* - * Global variable - */ +static char *orte_ras_get_appinfo_path(void); + +static int parser_ini(char **val_if_found, FILE *fp, const char *var_name); + +static int parser_separated_columns(char **val_if_found, FILE *fp, + const char *var_name); + +/* /// Local Variables /// */ +static const orte_ras_alps_sysconfig_t sysconfigs[] = { + {"/etc/sysconfig/alps", "ALPS_SHARED_DIR_PATH", parser_ini}, + {"/etc/alps.conf" , "sharedDir" , parser_separated_columns}, + /* must be last element */ + {NULL , NULL , NULL} +}; + +/* /// Global Variables /// */ orte_ras_base_module_t orte_ras_alps_module = { orte_ras_alps_allocate, orte_ras_alps_finalize }; +/* Parses: 
VAR_NAME=val text files - Pseudo INI */ +static int +parser_ini(char **val_if_found, FILE *fp, const char *var_name) +{ + char *alps_config_str = NULL; + + /* invalid argument */ + if (NULL == val_if_found) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + *val_if_found = NULL; + + while ((alps_config_str = ras_alps_getline(fp))) { + char *cpq; + char *cpr; + + cpq = strchr(alps_config_str, '#'); /* Parse comments, actually ANY # */ + cpr = strchr(alps_config_str, '='); /* Parse for variables */ + if (!cpr || /* Skip if not definition */ + (cpq && cpq < cpr)) { /* Skip if commented */ + free(alps_config_str); + continue; + } + for (; cpr > alps_config_str && /* Kill trailing whitespace */ + (' ' == cpr[-1] || '\t' == cpr[-1]); cpr--); /* never step before the line buffer */ + for (cpq = alps_config_str; /* Kill leading whitespace */ + (*cpq == ' ' || *cpq == '\t'); cpq++); + /* Filter to needed variable */ + if (strncmp(cpq, var_name, strlen(var_name))) { + /* Sorry, not the variable name that we are looking for */ + free(alps_config_str); + continue; + } + if (!(cpq = strchr(cpr, '"'))) { /* Can't find pathname start */ + free(alps_config_str); + ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); + return ORTE_ERR_FILE_OPEN_FAILURE; + } + if (!(cpr = strchr(++cpq, '"'))) { /* Can't find pathname end */ + free(alps_config_str); + ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); + return ORTE_ERR_FILE_OPEN_FAILURE; + } + *cpr = '\0'; + if (strlen(cpq) + 8 > PATH_MAX) { /* Bad configuration */ + free(alps_config_str); + ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); + return ORTE_ERR_FILE_OPEN_FAILURE; + } + /* Success! Caller owns *val_if_found and must free() it. */ + if (0 > asprintf(val_if_found, "%s/appinfo", cpq)) { + *val_if_found = NULL; /* contents are undefined after a failed asprintf() */ + free(alps_config_str); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + free(alps_config_str); + return ORTE_SUCCESS; + } + /* We didn't find what we were looking for, but no unrecoverable errors + * occurred in the process. 
+ * */ + return ORTE_SUCCESS; +} + +/* Parses: VAR_NAME val text files */ +static int +parser_separated_columns(char **val_if_found, FILE *fp, const char *var_name) +{ + char *alps_config_str = NULL; + int var_len = strlen(var_name); + int i; + + /* invalid argument */ + if (NULL == val_if_found) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + *val_if_found = NULL; + + while ((alps_config_str = ras_alps_getline(fp))) { + char *cpq = alps_config_str; + char *cpr; + + /* Eat whitespace */ + while (' ' == *cpq || '\t' == *cpq) { + cpq++; + } + /* Ignore comments and variable names that aren't what we are looking + * for */ + if ('#' == *cpq || strncmp(cpq, var_name, var_len)) { + free(alps_config_str); + continue; + } + /* Move to end of the variable name */ + for (i = 0; i < var_len && '\0' != *cpq; ++i, ++cpq); + /* Eat whitespace until we hit val */ + while (' ' == *cpq || '\t' == *cpq) { + cpq++; + } + /* Now advance cpr until end of value: first blank, tab or NUL */ + cpr = cpq; + while ('\0' != *cpr && (' ' != *cpr && '\t' != *cpr)) { + cpr++; + } + *cpr = '\0'; + /* Bad configuration sanity check */ + if (strlen(cpq) + 8 > PATH_MAX) { + free(alps_config_str); + ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); + return ORTE_ERR_FILE_OPEN_FAILURE; + } + /* Success! Caller owns *val_if_found and must free() it. */ + if (0 > asprintf(val_if_found, "%s/appinfo", cpq)) { + *val_if_found = NULL; /* contents are undefined after a failed asprintf() */ + free(alps_config_str); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + free(alps_config_str); + return ORTE_SUCCESS; + } + /* We didn't find what we were looking for, but no unrecoverable errors + * occurred in the process. + * */ + return ORTE_SUCCESS; +} + +/* Gets ALPS scheduler information file pathname from system configuration. On + * our XK6 testbed, ALPS_SHARED_DIR_PATH isn't set in /etc/sysconfig/alps. The + * shared directory path is set in /etc/alps.conf and its corresponding variable + * is named sharedDir. 
We have to support both because XE6 systems (and + * probably others) still rely on ALPS_SHARED_DIR_PATH and /etc/sysconfig/alps. + */ +static char * +orte_ras_get_appinfo_path(void) +{ + int i, rc = ORTE_ERROR; + FILE *fp = NULL; + char *appinfo_path = NULL; + + /* iterate over all the available ALPS system configurations name pairs + * until we either fail or find what we are looking for. + */ + for (i = 0; NULL != sysconfigs[i].path; ++i) { + opal_output_verbose(1, orte_ras_base.ras_output, + "ras:alps:allocate: Trying ALPS configuration " + "file: \"%s\"", + sysconfigs[i].path); + if (NULL == (fp = fopen(sysconfigs[i].path, "r"))) { + int err = errno; + opal_output_verbose(1, orte_ras_base.ras_output, + "ras:alps:allocate: Skipping ALPS " + "configuration file: \"%s\" (%s).", + sysconfigs[i].path, strerror(err)); + continue; + } + /* Let the search begin */ + rc = sysconfigs[i].parse(&appinfo_path, fp, sysconfigs[i].var_name); + /* no longer needed */ + fclose(fp); + + if (ORTE_SUCCESS == rc) { + /* Success! */ + if (NULL != appinfo_path) { + break; + } + /* else we didn't find what we were looking for - just continue */ + else { + continue; + } + } + /* Failure */ + else { + opal_output_verbose(1, orte_ras_base.ras_output, + "ras:alps:allocate: failure " + "(get_appinfo_dir_path = %d)", rc); + return NULL; + } + } + /* Were we successful? */ + if (NULL != sysconfigs[i].path) { + opal_output_verbose(1, orte_ras_base.ras_output, + "ras:alps:allocate: Located ALPS scheduler file: " + "\"%s\"", appinfo_path); + return appinfo_path; + } + /* Nope */ + else { + opal_output_verbose(1, orte_ras_base.ras_output, + "ras:alps:allocate: Could not locate ALPS " + "scheduler file."); + return NULL; + } + + /* Never reached */ + return NULL; +} /** * Discover available (pre-allocated) nodes. Allocate the * requested number of nodes/process slots to the job. 
- * */ -static int orte_ras_alps_allocate(opal_list_t *nodes) +static int +orte_ras_alps_allocate(opal_list_t *nodes) { - const char alps_sysconfig[] = "/etc/sysconfig/alps"; /** Get ALPS scheduler information - file pathname from system configuration. **/ - int ret; - FILE *fp; - char *str; - char *alps_config_str; - + int ret; + char *appinfo_path = NULL; + if (0 == orte_ras_alps_res_id) { orte_show_help("help-ras-alps.txt", "alps-env-var-not-found", 1); return ORTE_ERR_NOT_FOUND; } - - opal_output_verbose(1, orte_ras_base.ras_output, - "ras:alps:allocate: Using ALPS configuration file: \"%s\"", - alps_sysconfig); - - fp = fopen(alps_sysconfig, "r"); - if (NULL == fp) { - ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); - return ORTE_ERR_FILE_OPEN_FAILURE; + if (NULL == (appinfo_path = orte_ras_get_appinfo_path())) { + return ORTE_ERR_NOT_FOUND; } - - while( (alps_config_str=ras_alps_getline(fp)) ) { - - char *cpq; - char *cpr; - - cpq=strchr( alps_config_str, '#' ); /* Parse comments, actually ANY # */ - cpr=strchr( alps_config_str, '=' ); /* Parse for variables */ - if( !cpr || /* Skip if not definition */ - (cpq && cpq PATH_MAX ) { /* Bad configuration */ - - errno=ENAMETOOLONG; - ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); - return ORTE_ERR_FILE_OPEN_FAILURE; - } - asprintf(&str, "%s/appinfo", cpq); - if (NULL == str) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - free(alps_config_str); - break; - } - fclose(fp); - - opal_output_verbose(1, orte_ras_base.ras_output, - "ras:alps:allocate: Located ALPS scheduler file: \"%s\"", str); - -/* Parse ALPS scheduler information file (appinfo) for node list. */ + /* Parse ALPS scheduler information file (appinfo) for node list. 
*/ if (ORTE_SUCCESS != (ret = orte_ras_alps_read_appinfo_file( - nodes, str, + nodes, + appinfo_path, (unsigned int *)&orte_ras_alps_res_id))) { ORTE_ERROR_LOG(ret); goto cleanup; } - free(str); - /* record the number of allocated nodes */ + /* Record the number of allocated nodes */ orte_num_allocated_nodes = opal_list_get_size(nodes); cleanup: - /* All done */ + if (NULL != appinfo_path) { + free(appinfo_path); + } if (ORTE_SUCCESS == ret) { opal_output_verbose(1, orte_ras_base.ras_output, - "ras:alps:allocate: success"); - } else { + "ras:alps:allocate: success"); + } + else { opal_output_verbose(1, orte_ras_base.ras_output, - "ras:alps:allocate: failure (base_allocate_nodes=%d)", ret); + "ras:alps:allocate: failure " + "(base_allocate_nodes = %d)", ret); } return ret; } +#define RAS_BASE_FILE_MAX_LINE_LENGTH (PATH_MAX * 2) -#define RAS_BASE_FILE_MAX_LINE_LENGTH PATH_MAX*2 - -static char *ras_alps_getline(FILE *fp) +static char * +ras_alps_getline(FILE *fp) { - char *ret, *buff = NULL; - char input[RAS_BASE_FILE_MAX_LINE_LENGTH+1]; - + char *ret = NULL, *input = NULL; + + input = (char *)calloc(RAS_BASE_FILE_MAX_LINE_LENGTH + 1, sizeof(char)); + /* out of resources */ + if (NULL == input) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return NULL; + } ret = fgets(input, RAS_BASE_FILE_MAX_LINE_LENGTH, fp); if (NULL != ret) { - input[strlen(input)-1] = '\0'; /* remove newline */ - buff = strdup(input); + input[strcspn(input, "\n")] = '\0'; /* remove newline, if present; caller frees the line */ + return input; } - - return buff; + free(input); /* EOF or read error: release the unused line buffer */ + return NULL; } - -static int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned int *uMe) +static int +orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, + unsigned int *uMe) { int iq; int ix; @@ -355,10 +526,9 @@ static int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, u return ORTE_SUCCESS; } -/* - * There's really nothing to do here - */ -static int orte_ras_alps_finalize(void) +/* There's really nothing 
to do here */ +static int +orte_ras_alps_finalize(void) { opal_output_verbose(1, orte_ras_base.ras_output, "ras:alps:finalize: success (nothing to do)");