- Based on and updated from Ken's patch: since CLE 2.1 no longer provides
BATCH_PARTITION_ID, use the ras-alps-command.sh script to figure out the job ID to query from ALPS. Gracefully report errors, update the help file, and parse the sysconfig file.

This commit was SVN r21772.
This commit is contained in:
parent 77edc0e5b8
commit 784b9b9f5b
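In outline, the new helper script maps the batch job ID to an ALPS reservation ID by scanning the output of apstat -r. A minimal sketch of that lookup (editor's sketch, not part of the commit), assuming PBS/Torque has set PBS_JOBID and that the reservation ID sits in the first column of apstat -r output; the column layout can vary across CLE releases:

#!/bin/bash
# Editor's sketch: the core lookup performed by ras-alps-command.sh
# (added in full in the diff below), reduced to three lines.
jid=${PBS_JOBID%%.*}                                 # "1234.sdb" -> "1234"
resid=$(apstat -r | grep "$jid" | awk '{print $1}')
echo "${resid:-no reservation found for job $jid}"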
@@ -224,10 +224,12 @@ static int plm_alps_launch_job(orte_job_t *jdata)
     }
 
     /* number of processors needed */
-    asprintf(&tmp, "-n %lu", (unsigned long) map->num_new_daemons);
+    opal_argv_append(&argc, &argv, "-n");
+    asprintf(&tmp, "%lu", (unsigned long) map->num_new_daemons);
     opal_argv_append(&argc, &argv, tmp);
     free(tmp);
-    opal_argv_append(&argc, &argv, "-N 1");
+    opal_argv_append(&argc, &argv, "-N");
+    opal_argv_append(&argc, &argv, "1");
 
     /* create nodelist */
     nodelist_argv = NULL;
@@ -253,7 +255,8 @@ static int plm_alps_launch_job(orte_job_t *jdata)
     }
     nodelist_flat = opal_argv_join(nodelist_argv, ',');
     opal_argv_free(nodelist_argv);
-    asprintf(&tmp, "-L %s", nodelist_flat);
+    opal_argv_append(&argc, &argv, "-L");
+    asprintf(&tmp, "%s", nodelist_flat);
     opal_argv_append(&argc, &argv, tmp);
     free(tmp);
 
@@ -383,7 +386,7 @@ launch_apps:
         goto cleanup;
     }
 
-    /* JMS: short we stash the alps pid in the gpr somewhere for cleanup? */
+    /* JMS: should we stash the alps pid in the gpr somewhere for cleanup? */
 
 cleanup:
     if (NULL != argv) {
@@ -476,9 +479,10 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
     if (failed_launch) {
         /* we have a problem during launch */
         opal_output(0, "ERROR: alps failed to start the required daemons.");
-        opal_output(0, "ERROR: This could be due to an inability to find the orted binary");
-        opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
-        opal_output(0, "ERROR: on one or more specified nodes, or other factors.");
+        opal_output(0, "ERROR: This could be due to an inability to find the orted binary (--prefix)");
+        opal_output(0, "ERROR: on one or more remote nodes, compilation of the orted with dynamic libraries,");
+        opal_output(0, "ERROR: lack of authority to execute on one or more specified nodes,");
+        opal_output(0, "ERROR: or the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).");
 
         /* report that the daemon has failed so we break out of the daemon
          * callback receive and exit
@@ -22,21 +22,18 @@
 While trying to determine what resources are available, the ALPS
 resource allocator expects to find the following environment variables:
 
-    BATCH_PARTITION_ID
+    OMPI_ALPS_RESID
 
 However, it was unable to find the following environment variable:
 
     %s
 
 #This is a fatal error.
-[alps-env-var-bad-value]
+[alps-env-var-invalid]
 While trying to determine what resources are available, the ALPS
-resource allocator uses the following environment variables:
+resource allocator uses the OMPI_ALPS_RESID environment variable.
 
-    ALPS_NODELIST value: %s
-    ALPS_TASKS_PER_NODE value: %s
-
-However, an error was encountered when trying to parse the following variable:
+However, an error was encountered when trying to parse the variable:
 
     %s
 
@@ -1 +1,73 @@
-apstat -a `apstat -r | awk '{if ($1!=A && $1=='$BATCH_PARTITION_ID') { print $2}}'` -r -v | egrep "(nid [0-9]+)" -o | awk '{print $2}'
+#!/bin/bash
+# The purpose of this shell script is to extract the ALPS resId for the
+# current batch job (under which we are running).  Ideally, we would not
+# relegate such a function to a script; rather, we would extract this
+# value directly from the ALPS "reservations" file.  However, we do not
+# know the file structure used for the "reservations" file and this
+# structure is not locally available.  Then, in addition, we have the
+# complication that as of the instantiation date of this script, the
+# user command interface to ALPS is somewhat unreliable.  So, to keep
+# the load module fairly simple, we use this script.
+
+APSTAT=/usr/bin/apstat
+GREP=/usr/bin/grep
+AWK=/usr/bin/awk
+SLEEP=/usr/bin/sleep
+ECHO=/bin/echo
+
+# If the old variable still is being set, use it.
+if [ "${BATCH_PARTITION_ID}" != "" ]
+then
+    ${ECHO} ${BATCH_PARTITION_ID}
+    exit 0
+fi
+
+# Extract the batch job ID directly from the environment, if available.
+jid=${BATCH_JOBID:--1}
+if [ $jid -eq -1 ]
+then
+
+    # Otherwise, parse it from the PBS/Torque environment variable.
+    jid=${PBS_JOBID:--1}
+    if [ $jid == "-1" ]
+    then
+        ${ECHO} -1
+        exit 0
+    fi
+    jid=`${ECHO} $jid | ${AWK} -F\. '{print $1}'`
+fi
+
+# Next, let the ALPS user command interface read the "reservations"
+# file; but let's not be too hasty about reporting failure.
+resId=""
+count=0
+while ( [ "$resId" == "" ] )
+do
+
+    # We're in a while loop, so skip the delay on the first trip.
+    if [ $count -gt 0 ]
+    then
+        ${SLEEP} 1
+    fi
+
+    # Try to get the ALPS resId.
+    resId=`${APSTAT} -r | ${GREP} $jid | ${AWK} '{print $1}'`
+
+    # Give up after 10 tries.
+    count=`expr $count + 1 `
+    if [ $count -ge 10 ]
+    then
+        break
+    fi
+done
+
+# If we still don't have it after 10 attempts, then, I reckon that it
+# just wasn't meant to be.
+if [ "$resId" == "" ]
+then
+    ${ECHO} 2
+    exit 0
+fi
+
+${ECHO} $resId
+exit 0
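For context, one way the allocator's new expectation could be met (a hedged sketch, not something this commit installs): a batch prologue or mpirun wrapper can export OMPI_ALPS_RESID from the script's output before the ALPS RAS component queries it. The install path below is an assumption; how the variable is actually populated may differ per site.

#!/bin/bash
# Hypothetical wrapper; the location of ras-alps-command.sh is assumed.
# Note the script prints -1 or 2 when no reservation can be found.
OMPI_ALPS_RESID=$("${OPENMPI_PREFIX:-/opt/openmpi}/share/openmpi/ras-alps-command.sh")
export OMPI_ALPS_RESID
exec mpirun "$@"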
@@ -28,16 +28,12 @@
 #include "orte/mca/ras/ras.h"
 #include "orte/mca/ras/base/base.h"
 
-#if defined(c_plusplus) || defined(__cplusplus)
-extern "C" {
-#endif
+BEGIN_C_DECLS
 
-ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_alps_component;
-ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_alps_module;
-ORTE_DECLSPEC int orte_ras_alps_get_appinfo_attempts( int *attempts );
+ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_alps_component;
+ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_alps_module;
+ORTE_DECLSPEC int orte_ras_alps_get_appinfo_attempts( int *attempts );
 
-#if defined(c_plusplus) || defined(__cplusplus)
-}
-#endif
+END_C_DECLS
 
 #endif
@@ -93,7 +93,7 @@ static int orte_ras_alps_component_query(mca_base_module_t **module, int *priority)
 
     /* Are we running under a ALPS job? */
 
-    if (NULL != getenv("BATCH_PARTITION_ID")) {
+    if (NULL != getenv("OMPI_ALPS_RESID")) {
         mca_base_param_lookup_int(param_priority, priority);
         opal_output_verbose(1, orte_ras_base.ras_output,
                             "ras:alps: available for selection");
@@ -104,7 +104,7 @@ static int orte_ras_alps_component_query(mca_base_module_t **module, int *priority)
     /* Sadly, no */
 
     opal_output(orte_ras_base.ras_output,
-                "ras:alps: NOT available for selection");
+                "ras:alps: NOT available for selection -- OMPI_ALPS_RESID not set?");
     *module = NULL;
     return ORTE_ERROR;
 }
@@ -40,8 +40,8 @@
  */
 static int orte_ras_alps_allocate(opal_list_t *nodes);
 static int orte_ras_alps_finalize(void);
-int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned *uMe);
 static char *ras_alps_getline(FILE *fp);
+static int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned int *uMe);
 
 
 /*
@@ -59,44 +59,47 @@ orte_ras_base_module_t orte_ras_alps_module = {
  */
 static int orte_ras_alps_allocate(opal_list_t *nodes)
 {
-    unsigned alps_res_id;
+    const char alps_sysconfig[] = "/etc/sysconfig/alps"; /** Get ALPS scheduler information
+                                                              file pathname from system configuration. **/
+    unsigned int alps_res_id;
     int ret;
     FILE *fp;
     char *alps_batch_id;
+    char *endptr;
     char *str;
     char *alps_config_str;
 
-    alps_batch_id = getenv("BATCH_PARTITION_ID");
+    alps_batch_id = getenv("OMPI_ALPS_RESID");
     if (NULL == alps_batch_id) {
         orte_show_help("help-ras-alps.txt", "alps-env-var-not-found", 1,
-                       "BATCH_PARTITION_ID");
+                       "OMPI_ALPS_RESID");
         return ORTE_ERR_NOT_FOUND;
     }
-    alps_res_id=(unsigned)atol(alps_batch_id);
-
-    /* Get ALPS scheduler information file pathname from system configuration. */
-    asprintf(&str, "/etc/sysconfig/alps");
-    if (NULL == str) {
-        return ORTE_ERR_OUT_OF_RESOURCE;
+    alps_res_id=(unsigned int)strtol(alps_batch_id, &endptr, 10);
+
+    if (alps_batch_id[0] == '\0' || endptr[0] != '\0') {
+        orte_show_help("help-ras-alps.txt", "alps-env-var-invalid", 1,
+                       alps_batch_id);
+        return ORTE_ERR_NOT_FOUND;
     }
 
     opal_output_verbose(1, orte_ras_base.ras_output,
-                        "ras:alps:allocate: Using ALPS configuration file: \"%s\"", str);
+                        "ras:alps:allocate: Using ALPS configuration file: \"%s\"",
+                        alps_sysconfig);
 
-    fp = fopen(str, "r");
+    fp = fopen(alps_sysconfig, "r");
     if (NULL == fp) {
 
         ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
         return ORTE_ERR_FILE_OPEN_FAILURE;
     }
-    free(str);
 
     while( (alps_config_str=ras_alps_getline(fp)) ) {
 
         char *cpq;
         char *cpr;
 
-        cpq=strchr( alps_config_str, '#' );     /* Parse for comments */
+        cpq=strchr( alps_config_str, '#' );     /* Parse comments, actually ANY # */
         cpr=strchr( alps_config_str, '=' );     /* Parse for variables */
         if( !cpr ||                             /* Skip if not definition */
             (cpq && cpq<cpr) ) {                /* Skip if commented */
@@ -127,7 +130,7 @@ static int orte_ras_alps_allocate(opal_list_t *nodes)
            return ORTE_ERR_FILE_OPEN_FAILURE;
         }
         *cpr='\0';
-        if( strlen(cpq)+8>PATH_MAX ) {          /* Bad configuration */
+        if( strlen(cpq)+8 > PATH_MAX ) {        /* Bad configuration */
 
            errno=ENAMETOOLONG;
            ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
@@ -143,32 +146,18 @@ static int orte_ras_alps_allocate(opal_list_t *nodes)
     fclose(fp);
 
     opal_output_verbose(1, orte_ras_base.ras_output,
-                        "ras:alps:allocate: Located ALPS scheduler file: \"%s\"", str);
+                        "ras:alps:allocate: Located ALPS scheduler file: \"%s\"", str);
 
     /* Parse ALPS scheduler information file (appinfo) for node list. */
     if (ORTE_SUCCESS != (ret = orte_ras_alps_read_appinfo_file(nodes, str, &alps_res_id))) {
-
         ORTE_ERROR_LOG(ret);
         goto cleanup;
     }
     free(str);
 
-#if 0
-    ret = orte_ras_alps_allocate_nodes(jobid, &nodes);
-
-    ret = orte_ras_alps_node_insert(&nodes);
-#endif
-
 cleanup:
-#if 0
-    while (NULL != (item = opal_list_remove_first(&nodes))) {
-        OBJ_RELEASE(item);
-    }
-    OBJ_DESTRUCT(&nodes);
-#endif
-
     /* All done */
 
     if (ORTE_SUCCESS == ret) {
         opal_output_verbose(1, orte_ras_base.ras_output,
                             "ras:alps:allocate: success");
@@ -196,7 +185,8 @@ static char *ras_alps_getline(FILE *fp)
     return buff;
 }
 
-int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned *uMe)
+
+static int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned int *uMe)
 {
     int iq;
     int ix;
@@ -244,6 +234,10 @@ int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned *uMe)
 
     szLen=ssBuf.st_size;                /* Get buffer size */
     cpBuf=malloc(szLen+1);              /* Allocate buffer */
+    if (NULL == cpBuf) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
 
     /* Repeated attempts to read appinfo, with an increasing delay between *
      * successive attempts to allow scheduler I/O a chance to complete.    */
@@ -306,7 +300,7 @@ int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned *uMe)
 
         oNow+=(oDet+oSlots+oEntry);     /* Target next slot */
 
-        if( apInfo->resId!=*uMe ) continue; /* Filter to our reservation Id */
+        if( apInfo->resId != *uMe ) continue; /* Filter to our reservation Id */
 
         for( ix=0; ix<apInfo->numPlaces; ix++ ) {
 
@@ -314,6 +308,10 @@ int orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, unsigned *uMe)
                                 "ras:alps:read_appinfo: got NID %d", apSlots[ix].nid);
 
             asprintf( &hostname, "%d", apSlots[ix].nid );
+            if (NULL == hostname) {
+                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+                return ORTE_ERR_OUT_OF_RESOURCE;
+            }
 
             /* If this matches the prior nodename, just add to the slot count. */
             if( NULL!=node && !strcmp(node->name, hostname) ) {