1
1

Modifications to ras alps. This commit allows users to mpirun without having to set id environment variables (BASIL_RESERVATION_ID, OMPI_ALPS_RESID). Note, however, that we preserved the old behavior: if an id environment variable is set, it will be obeyed and our new code path is essentially bypassed. If we missed something, please yell at us. With this commit, the use of ras-alps-command.sh is no longer needed... at least that is our hope.

This commit was SVN r25181.
Этот коммит содержится в:
Samuel Gutierrez 2011-09-26 21:31:08 +00:00
родитель 0f2475c554
Коммит 25cbf79592
3 изменённых файлов: 97 добавлений и 21 удалений

Просмотреть файл

@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -33,6 +35,7 @@ BEGIN_C_DECLS
ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_alps_component;
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_alps_module;
ORTE_DECLSPEC int orte_ras_alps_get_appinfo_attempts( int *attempts );
/* ALPS reservation id. Set by the component's query function and read by the
 * module's allocate function; zero is treated as "no valid reservation id". */
ORTE_DECLSPEC extern unsigned long int orte_ras_alps_res_id;
END_C_DECLS

Просмотреть файл

@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -26,6 +28,7 @@
#include "orte/util/proc_info.h"
#include "ras_alps.h"
#include <ctype.h>
/*
* Local variables
@ -39,6 +42,7 @@ static int param_read_attempts;
*/
static int ras_alps_open(void);
static int orte_ras_alps_component_query(mca_base_module_t **module, int *priority);
unsigned long int orte_ras_alps_res_id;
orte_ras_base_component_t mca_ras_alps_component = {
@ -65,6 +69,73 @@ orte_ras_base_component_t mca_ras_alps_component = {
}
};
/**
 * Return a newly allocated copy of a batch job id with the first period and
 * everything after it removed. For example, if jid is "138295.sdb" then
 * "138295" is returned; a jid without a period is copied unchanged.
 *
 * @param jid  NUL-terminated job id string (may be NULL).
 * @return     heap-allocated, NUL-terminated prefix that the caller must
 *             release with free(), or NULL if jid is NULL or memory could
 *             not be allocated.
 */
static char *
prep_job_id(const char *jid)
{
    size_t prefix_len;
    char *prepped = NULL;

    if (NULL == jid) {
        /* nothing to prepare */
        return NULL;
    }
    /* length of the leading run that contains no '.' */
    prefix_len = strcspn(jid, ".");
    if (NULL == (prepped = malloc(prefix_len + 1))) {
        /* out of resources */
        return NULL;
    }
    memcpy(prepped, jid, prefix_len);
    prepped[prefix_len] = '\0';
    return prepped;
}
/**
 * Determine the ALPS reservation id without requiring the user to export
 * OMPI_ALPS_RESID or BASIL_RESERVATION_ID. This replicates some of the id
 * setting functionality found in ras-alps-command.sh so that an application
 * can simply be launched with "mpirun".
 *
 * Lookup order:
 *   1. BATCH_PARTITION_ID: its value is parsed as a base-10 number.
 *   2. PBS_JOBID: the part before the first '.' is searched for in the
 *      output of "/usr/bin/apstat -r"; the first number on the first
 *      matching line is taken as the reservation id.
 *
 * @return the reservation id, or 0 (considered invalid) when neither
 *         variable is set, the job id is empty, resources run out, apstat
 *         cannot be started, or no matching line is found.
 */
static unsigned long int
get_res_id(void) {
    const char *apstat_cmd = "/usr/bin/apstat -r";
    char *id = NULL;
    char *prepped_jid = NULL;
    char read_buf[512];
    FILE *apstat_fp = NULL;
    /* zero is considered to be an invalid res id */
    unsigned long jid = 0;

    if (NULL != (id = getenv("BATCH_PARTITION_ID"))) {
        return strtoul(id, NULL, 10);
    }
    if (NULL == (id = getenv("PBS_JOBID"))) {
        return 0;
    }
    if (NULL == (prepped_jid = prep_job_id(id))) {
        /* out of resources */
        return 0;
    }
    if ('\0' == prepped_jid[0]) {
        /* an empty id would match every line of the apstat output */
        free(prepped_jid);
        return 0;
    }
    if (NULL == (apstat_fp = popen(apstat_cmd, "r"))) {
        /* popen failure */
        free(prepped_jid);
        return 0;
    }
    while (NULL != fgets(read_buf, sizeof(read_buf), apstat_fp)) {
        const char *t = NULL;

        /* does this line have the id that we care about?
         * NOTE(review): this is a plain substring match, so a job id that
         * is a substring of another field on the line also matches. */
        if (NULL == strstr(read_buf, prepped_jid)) {
            continue;
        }
        /* the line is going to be in the form of something like:
           A 1450 571783 batch:138309 XT 80 - - 2000 conf,claim
           so skip ahead to the first digit and parse the number found
           there, which is 1450 in the example above. The cast keeps
           isdigit() well defined for negative char values. */
        for (t = read_buf; '\0' != *t && !isdigit((unsigned char)*t); ++t) {
            continue;
        }
        if ('\0' != *t) {
            jid = strtoul(t, NULL, 10);
        }
        break;
    }
    /* a stream opened with popen() must be closed with pclose(), which also
     * waits for the child process */
    pclose(apstat_fp);
    free(prepped_jid);
    return jid;
}
static int ras_alps_open(void)
{
@ -85,6 +156,10 @@ static int ras_alps_open(void)
static int orte_ras_alps_component_query(mca_base_module_t **module, int *priority)
{
char *jid_str = NULL;
/* default to an invalid value */
orte_ras_alps_res_id = 0;
/* if we are not an HNP, then we must not be selected */
if (!ORTE_PROC_IS_HNP) {
*module = NULL;
@ -92,8 +167,17 @@ static int orte_ras_alps_component_query(mca_base_module_t **module, int *priori
}
/* Are we running under a ALPS job? */
if ((NULL != getenv("OMPI_ALPS_RESID")) || (NULL != getenv("BASIL_RESERVATION_ID"))) {
/* BASIL_RESERVATION_ID is the equivalent of OMPI_ALPS_RESID
* on some systems
*/
if ((NULL == (jid_str = getenv("OMPI_ALPS_RESID"))) &&
(NULL == (jid_str = getenv("BASIL_RESERVATION_ID")))) {
orte_ras_alps_res_id = get_res_id();
}
else {
orte_ras_alps_res_id = strtoul(jid_str, NULL, 10);
}
if (0 != orte_ras_alps_res_id) {
mca_base_param_lookup_int(param_priority, priority);
opal_output_verbose(1, orte_ras_base.ras_output,
"ras:alps: available for selection");

Просмотреть файл

@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -52,6 +54,7 @@ orte_ras_base_module_t orte_ras_alps_module = {
orte_ras_alps_finalize
};
/**
* Discover available (pre-allocated) nodes. Allocate the
* requested number of nodes/process slots to the job.
@ -61,32 +64,16 @@ static int orte_ras_alps_allocate(opal_list_t *nodes)
{
const char alps_sysconfig[] = "/etc/sysconfig/alps"; /** Get ALPS scheduler information
file pathname from system configuration. **/
unsigned int alps_res_id;
int ret;
FILE *fp;
char *alps_batch_id;
char *endptr;
char *str;
char *alps_config_str;
alps_batch_id = getenv("OMPI_ALPS_RESID");
/* BASIL_RESERVATION_ID is the equivalent of OMPI_ALPS_RESID
* on some systems
*/
if (NULL == alps_batch_id) alps_batch_id = getenv("BASIL_RESERVATION_ID");
if (NULL == alps_batch_id) {
if (0 == orte_ras_alps_res_id) {
orte_show_help("help-ras-alps.txt", "alps-env-var-not-found", 1);
return ORTE_ERR_NOT_FOUND;
}
alps_res_id=(unsigned int)strtol(alps_batch_id, &endptr, 10);
if (alps_batch_id[0] == '\0' || endptr[0] != '\0') {
orte_show_help("help-ras-alps.txt", "alps-env-var-invalid", 1,
alps_batch_id);
return ORTE_ERR_NOT_FOUND;
}
opal_output_verbose(1, orte_ras_base.ras_output,
"ras:alps:allocate: Using ALPS configuration file: \"%s\"",
alps_sysconfig);
@ -152,7 +139,9 @@ static int orte_ras_alps_allocate(opal_list_t *nodes)
"ras:alps:allocate: Located ALPS scheduler file: \"%s\"", str);
/* Parse ALPS scheduler information file (appinfo) for node list. */
if (ORTE_SUCCESS != (ret = orte_ras_alps_read_appinfo_file(nodes, str, &alps_res_id))) {
if (ORTE_SUCCESS != (ret = orte_ras_alps_read_appinfo_file(
nodes, str,
(unsigned int *)&orte_ras_alps_res_id))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}