c041156f60
This commit was SVN r28240.
217 строки
6.8 KiB
C
217 строки
6.8 KiB
C
/*
|
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2008 UT-Battelle, LLC
|
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/util/output.h"
|
|
#include "orte/constants.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "ras_alps.h"
|
|
|
|
#include <ctype.h>
|
|
|
|
/* Local variables */
|
|
static int param_priority;
|
|
static int ras_alps_read_attempts;
|
|
|
|
/* Local functions */
|
|
static int ras_alps_register(void);
|
|
static int ras_alps_open(void);
|
|
static int orte_ras_alps_component_query(mca_base_module_t **module,
|
|
int *priority);
|
|
unsigned long int orte_ras_alps_res_id;
|
|
|
|
orte_ras_base_component_t mca_ras_alps_component = {
|
|
/* First, the mca_base_component_t struct containing meta information about
|
|
* the component itself
|
|
* */
|
|
{
|
|
ORTE_RAS_BASE_VERSION_2_0_0,
|
|
|
|
/* Component name and version */
|
|
"alps",
|
|
ORTE_MAJOR_VERSION,
|
|
ORTE_MINOR_VERSION,
|
|
ORTE_RELEASE_VERSION,
|
|
|
|
/* Component open and close functions */
|
|
ras_alps_open,
|
|
NULL,
|
|
orte_ras_alps_component_query,
|
|
ras_alps_register
|
|
},
|
|
{
|
|
/* The component is checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
}
|
|
};
|
|
|
|
/* simple function used to strip off characters on and after a period. NULL
|
|
* will be returned upon failure. Otherwise, a "prepped" string will be
|
|
* returned. The caller is responsible for freeing returned resources.
|
|
* for example: if jid is 138295.sdb, then 138295 will be returned.
|
|
*/
|
|
static char *
|
|
prep_job_id(const char *jid)
|
|
{
|
|
char *tmp = strdup(jid);
|
|
char *tmp2 = NULL;
|
|
|
|
if (NULL == tmp) {
|
|
/* out of resources */
|
|
return NULL;
|
|
}
|
|
if (NULL != (tmp2 = strchr(tmp, '.'))) {
|
|
*tmp2 = '\0';
|
|
}
|
|
return tmp;
|
|
}
|
|
|
|
/* this function replicates some of the id setting functionality found in
|
|
* ras-alps-command.sh. we wanted the ability to just "mpirun" the application
|
|
* without having to set an environment variable
|
|
*/
|
|
static unsigned long int
|
|
get_res_id(void)
|
|
{
|
|
const char *apstat_cmd = "/usr/bin/apstat -r";
|
|
char *id = NULL;
|
|
char read_buf[512];
|
|
FILE *apstat_fp = NULL;
|
|
/* zero is considered to be an invalid res id */
|
|
unsigned long jid = 0;
|
|
|
|
if (NULL != (id = getenv("BATCH_PARTITION_ID"))) {
|
|
return strtoul(id, NULL, 10);
|
|
}
|
|
if (NULL != (id = getenv("PBS_JOBID"))) {
|
|
char *prepped_jid = prep_job_id(id);
|
|
if (NULL == prepped_jid) {
|
|
/* out of resources */
|
|
return 0;
|
|
}
|
|
if (NULL == (apstat_fp = popen(apstat_cmd, "r"))) {
|
|
/* popen failure */
|
|
free(prepped_jid);
|
|
return 0;
|
|
}
|
|
while (NULL != fgets(read_buf, 512, apstat_fp)) {
|
|
/* does this line have the id that we care about? */
|
|
if (NULL != strstr(read_buf, prepped_jid)) {
|
|
/* the line is going to be in the form of something like:
|
|
A 1450 571783 batch:138309 XT 80 - - 2000 conf,claim
|
|
*/
|
|
char *t = read_buf;
|
|
for (t = read_buf; !isdigit(*t) && *t; ++t) {
|
|
jid = strtoul(t, NULL, 10);
|
|
}
|
|
/* if we are here, then jid should be, given the example above,
|
|
* 1450 */
|
|
break;
|
|
}
|
|
}
|
|
fclose(apstat_fp);
|
|
free(prepped_jid);
|
|
}
|
|
return jid;
|
|
}
|
|
|
|
static int
|
|
ras_alps_register(void)
|
|
{
|
|
param_priority = 75;
|
|
(void) mca_base_component_var_register (&mca_ras_alps_component.base_version,
|
|
"priority", "Priority of the alps ras component",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
|
OPAL_INFO_LVL_9,
|
|
MCA_BASE_VAR_SCOPE_READONLY,
|
|
¶m_priority);
|
|
|
|
ras_alps_read_attempts = 10;
|
|
(void) mca_base_component_var_register (&mca_ras_alps_component.base_version,
|
|
"appinfo_read_attempts",
|
|
"Maximum number of attempts to read ALPS "
|
|
"appinfo file", MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, 0, OPAL_INFO_LVL_9,
|
|
MCA_BASE_VAR_SCOPE_READONLY, &ras_alps_read_attempts);
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int
|
|
ras_alps_open(void)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int
|
|
orte_ras_alps_component_query(mca_base_module_t **module,
|
|
int *priority)
|
|
{
|
|
char *jid_str = NULL;
|
|
/* default to an invalid value */
|
|
orte_ras_alps_res_id = 0;
|
|
|
|
/* if we are not an HNP, then we must not be selected */
|
|
if (!ORTE_PROC_IS_HNP) {
|
|
*module = NULL;
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
/* Are we running under a ALPS job? */
|
|
/* BASIL_RESERVATION_ID is the equivalent of OMPI_ALPS_RESID
|
|
* on some systems
|
|
*/
|
|
if ((NULL == (jid_str = getenv("OMPI_ALPS_RESID"))) &&
|
|
(NULL == (jid_str = getenv("BASIL_RESERVATION_ID")))) {
|
|
orte_ras_alps_res_id = get_res_id();
|
|
}
|
|
else {
|
|
orte_ras_alps_res_id = strtoul(jid_str, NULL, 10);
|
|
}
|
|
if (0 != orte_ras_alps_res_id) {
|
|
*priority = param_priority;
|
|
opal_output_verbose(2, orte_ras_base_framework.framework_output,
|
|
"ras:alps: available for selection");
|
|
*module = (mca_base_module_t *) &orte_ras_alps_module;
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
/* Sadly, no */
|
|
|
|
opal_output(orte_ras_base_framework.framework_output,
|
|
"ras:alps: NOT available for selection -- "
|
|
"OMPI_ALPS_RESID or BASIL_RESERVATION_ID not set?");
|
|
*module = NULL;
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
int
|
|
orte_ras_alps_get_appinfo_attempts(int *attempts)
|
|
{
|
|
*attempts = ras_alps_read_attempts;
|
|
opal_output_verbose(2, orte_ras_base_framework.framework_output,
|
|
"ras:alps:orte_ras_alps_get_appinfo_attempts: %d",
|
|
*attempts);
|
|
return ORTE_SUCCESS;
|
|
}
|