1
1
openmpi/orte/mca/ras/slurm/ras_slurm_module.c
Tim Prins ade94b523b Fixed a number of issues related to resource allocation:
- Simplified the logic of the ras modules by moving the attribute handling into the base allocation function. This allows us to decide how to allocate based on the situation, and solves some of the allocation problems we were having with comm_spawn.
- moved the proxy component into the base. This was done because we always want to call the proxy functions if we are not on a HNP regardless of the attributes passed. 
- Got rid of the hostfile component. What little logic was in it was moved into the base to deal with other circumstances. The hostfile information is currently being propagated into the registry by the RDS, so we just use what is already in the registry.
- renamed some slurm functions so that they have the proper prefix. Not strictly necessary as they were static, but it makes debugging much easier.
- fixed a buglet in the round_robin rmaps where we would return an error when really no error occurred.

I tried to make proper corrections to all the ras modules, but I cannot test all of them.

This commit was SVN r12202.
2006-10-19 23:33:51 +00:00

412 строки
11 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "orte/dss/dss.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/base/ras_private.h"
#include "ras_slurm.h"
/*
 * Local functions
 *
 * All of these are file-private (static) and are defined further down in
 * this file.
 */

/* Functions placed directly into the module table (orte_ras_slurm_module) */
static int orte_ras_slurm_allocate(orte_jobid_t jobid, opal_list_t *attributes);
static int orte_ras_slurm_deallocate(orte_jobid_t jobid);
static int orte_ras_slurm_finalize(void);

/* Helpers that turn the SLURM node list string into individual node names */
static int orte_ras_slurm_discover(char *regexp, opal_list_t *nodelist);
static int orte_ras_slurm_parse_ranges(char *base, char *ranges, char ***nodelist);
static int orte_ras_slurm_parse_range(char *base, char *range, char ***nodelist);
/*
 * Global variable
 *
 * Function table for the SLURM RAS module.  The initializer is positional,
 * so the order of the entries must match the member order of
 * orte_ras_base_module_t (declared elsewhere; not visible in this file).
 *
 * Only allocate, deallocate and finalize are implemented in this file; the
 * four orte_ras_base_node_* entries are implementations that come from
 * outside this file (presumably the RAS base -- confirm against
 * ras_private.h).
 */
orte_ras_base_module_t orte_ras_slurm_module = {
orte_ras_slurm_allocate,
orte_ras_base_node_insert,
orte_ras_base_node_query,
orte_ras_base_node_query_alloc,
orte_ras_base_node_lookup,
orte_ras_slurm_deallocate,
orte_ras_slurm_finalize
};
/**
 * Discover available (pre-allocated) nodes and allocate them to the job.
 *
 * The node list is taken from the SLURM_NODELIST environment variable,
 * expanded by orte_ras_slurm_discover() into a temporary list of nodes,
 * and then handed to orte_ras_base_allocate_nodes().
 *
 * @param jobid       Job to allocate the discovered nodes to.
 * @param attributes  Not used by this function.
 *
 * @return ORTE_ERR_NOT_FOUND if SLURM_NODELIST is not set; otherwise the
 *         error returned by orte_ras_slurm_discover() or, if discovery
 *         succeeded, the return value of orte_ras_base_allocate_nodes().
 */
static int orte_ras_slurm_allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
    int ret;
    char *slurm_node_str;
    opal_list_t nodes;
    opal_list_item_t* item;

    /* Check the environment before constructing anything so that this
       early return has nothing to clean up */
    slurm_node_str = getenv("SLURM_NODELIST");
    if (NULL == slurm_node_str) {
        opal_show_help("help-ras-slurm.txt", "env-var-not-found", 1,
                       "SLURM_NODELIST");
        return ORTE_ERR_NOT_FOUND;
    }

    OBJ_CONSTRUCT(&nodes, opal_list_t);

    ret = orte_ras_slurm_discover(slurm_node_str, &nodes);
    if (ORTE_SUCCESS != ret) {
        opal_output(orte_ras_base.ras_output,
                    "ras:slurm:allocate: discover failed!");
    } else {
        ret = orte_ras_base_allocate_nodes(jobid, &nodes);
        if (ORTE_SUCCESS == ret) {
            opal_output(orte_ras_base.ras_output,
                        "ras:slurm:allocate: success");
        } else {
            opal_output(orte_ras_base.ras_output,
                        "ras:slurm:allocate: failure (base_allocate_nodes=%d)", ret);
        }
    }

    /* The list is only a temporary container: release every node and
       destruct the list on all paths, including a failed discover that
       may have appended some nodes before it gave up */
    while (NULL != (item = opal_list_remove_first(&nodes))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes);

    /* All done */
    return ret;
}
/*
 * Deallocate the resources of a job.  This module has no work to do
 * here: it only logs that fact and reports success.
 */
static int orte_ras_slurm_deallocate(orte_jobid_t jobid)
{
    /* jobid is intentionally unused */
    (void) jobid;

    opal_output(orte_ras_base.ras_output,
                "ras:slurm:deallocate: success (nothing to do)");

    return ORTE_SUCCESS;
}
/*
 * Finalize the module.  Nothing was set up by this module that needs to
 * be torn down, so this only logs a message and reports success.
 */
static int orte_ras_slurm_finalize(void)
{
    const int rc = ORTE_SUCCESS;

    opal_output(orte_ras_base.ras_output,
                "ras:slurm:finalize: success (nothing to do)");

    return rc;
}
/**
 * Discover the available resources.  The information is obtained
 * directly from SLURM; nothing from a hostfile or the command line is
 * consulted or validated here.
 *
 * The node list has the form "name" (a single node) or "base[ranges]",
 * where the text between the brackets is expanded by
 * orte_ras_slurm_parse_ranges().  Only the first bracketed group is
 * examined; anything following its closing ']' is ignored.
 *
 * The slot count of each node is read from the SLURM_TASKS_PER_NODE
 * environment variable, a comma-separated list of "count" or
 * "count(xreps)" entries (e.g. "2(x3),1").  Nodes for which no entry
 * exists get 0 slots; entries beyond the number of parsed nodes are
 * ignored rather than written past the end of the slot array.
 *
 * For every node an orte_ras_node_t is appended to nodelist, and the
 * list is then passed to orte_ras_base_node_insert().
 *
 * @param regexp    Node list string (value of SLURM_NODELIST); NULL is
 *                  accepted and treated as "nothing to do".
 * @param nodelist  List that receives the discovered nodes.  Nodes that
 *                  were appended before an error occurred are left on
 *                  the list for the caller to release.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE, ORTE_ERR_NOT_FOUND
 *         (unterminated '[' or missing/malformed SLURM_TASKS_PER_NODE),
 *         or an error from the range parser / orte_ras_base_node_insert().
 */
static int orte_ras_slurm_discover(char *regexp, opal_list_t* nodelist)
{
    int i, j, len, count, reps, num_nodes;
    int ret = ORTE_SUCCESS;
    char *base, **names = NULL;
    char *begptr, *endptr, *tasks_per_node;
    int *slots = NULL;
    size_t slots_size;

    if (NULL == regexp) {
        return ORTE_SUCCESS;
    }

    base = strdup(regexp);
    if (NULL == base) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    opal_output(orte_ras_base.ras_output,
                "ras:slurm:allocate:discover: checking nodelist: %s", regexp);

    /* Find the base */
    len = (int) strlen(regexp);
    for (i = 0; i < len; ++i) {
        if ('[' == base[i]) {
            base[i] = '\0';
            break;
        }
    }

    if (i >= len) {
        /* If we didn't find a range, then this is it */
        opal_output(orte_ras_base.ras_output,
                    "ras:slurm:allocate:discover: found single node");
        opal_argv_append_nosize(&names, base);
    } else {
        /* If we did find a range, find the end of the range */
        for (j = i; j < len; ++j) {
            if (']' == base[j]) {
                base[j] = '\0';
                break;
            }
        }
        if (j >= len) {
            ret = ORTE_ERR_NOT_FOUND;
            goto cleanup;
        }

        ret = orte_ras_slurm_parse_ranges(base, base + i + 1, &names);
        if (ORTE_SUCCESS != ret) {
            goto cleanup;
        }
    }

    /* Find the number of slots per node.  Always allocate at least one
       element so that a NULL return unambiguously means out of memory */
    num_nodes = opal_argv_count(names);
    slots_size = sizeof(*slots) * (size_t) (num_nodes > 0 ? num_nodes : 1);
    slots = malloc(slots_size);
    if (NULL == slots) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    memset(slots, 0, slots_size);

    tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
    if (NULL == tasks_per_node) {
        opal_show_help("help-ras-slurm.txt", "env-var-not-found", 1,
                       "SLURM_TASKS_PER_NODE");
        ret = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    begptr = tasks_per_node;
    j = 0;
    while (NULL != begptr) {
        count = strtol(begptr, &endptr, 10);
        if (('(' == endptr[0]) && ('x' == endptr[1])) {
            reps = strtol((endptr + 2), &endptr, 10);
            if (')' == endptr[0]) {
                endptr++;
            }
        } else {
            reps = 1;
        }

        /* Never write past the end of slots[], even if the variable
           describes more nodes than were found in the node list */
        for (i = 0; i < reps && j < num_nodes; i++) {
            slots[j++] = count;
        }

        if (',' == *endptr) {
            begptr = endptr + 1;
        } else if ('\0' == *endptr) {
            break;
        } else {
            opal_show_help("help-ras-slurm.txt", "env-var-bad-value", 1,
                           "SLURM_TASKS_PER_NODE", tasks_per_node);
            ret = ORTE_ERR_NOT_FOUND;
            goto cleanup;
        }
    }

    /* Convert the argv of node names to a list of ras_base_node_t's */
    for (i = 0; NULL != names && NULL != names[i]; ++i) {
        orte_ras_node_t *node;

        opal_output(orte_ras_base.ras_output,
                    "ras:slurm:allocate:discover: adding node %s (%d slot%s)",
                    names[i], slots[i], (1 == slots[i]) ? "" : "s");
        node = OBJ_NEW(orte_ras_node_t);
        if (NULL == node) {
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        node->node_name = strdup(names[i]);
        if (NULL == node->node_name) {
            OBJ_RELEASE(node);
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        node->node_arch = NULL;
        node->node_state = ORTE_NODE_STATE_UP;
        /* JMS: this should not be hard-wired to 0, but there's no
           other value to put it to [yet]... */
        node->node_cellid = 0;
        node->node_slots_inuse = 0;
        node->node_slots_max = 0;
        node->node_slots = slots[i];
        opal_list_append(nodelist, &node->super);
    }

    /* Now add the nodes to the registry */
    ret = orte_ras_base_node_insert(nodelist);

cleanup:
    /* Single exit point: the temporaries are released on every path */
    free(slots);
    if (NULL != names) {
        opal_argv_free(names);
    }
    free(base);

    /* All done */
    return ret;
}
/*
 * Parse one or more comma-separated ranges in a set.
 *
 * The ranges string is modified in place: every ',' is overwritten with
 * a terminator so that each piece can be handed to
 * orte_ras_slurm_parse_range() as its own string.  The first error
 * returned by that function stops the parsing and is returned as-is.
 */
static int orte_ras_slurm_parse_ranges(char *base, char *ranges, char ***names)
{
    /* One past the last character; computed before any ',' is replaced */
    char *const stop = ranges + strlen(ranges);
    char *segment = ranges;
    char *cursor;
    int rc;

    /* Every comma closes the segment that began at "segment" */
    for (cursor = ranges; cursor < stop; ++cursor) {
        if (',' != *cursor) {
            continue;
        }
        *cursor = '\0';
        rc = orte_ras_slurm_parse_range(base, segment, names);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
        segment = cursor + 1;
    }

    /* Whatever follows the last comma (or the whole string when there
       was no comma) is the final range, unless it is empty */
    if (segment < stop) {
        opal_output(orte_ras_base.ras_output,
                    "ras:slurm:allocate:discover: parse range %s (2)",
                    segment);
        rc = orte_ras_slurm_parse_range(base, segment, names);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Parse a single range in a set.
 *
 * range is either one number ("7") or two numbers separated by
 * non-digit characters ("08-11").  For every value from the first number
 * to the second (inclusive) the string base + value is appended to
 * names.  Values are zero-padded to the number of digits used by the
 * first number, so "08-11" yields base08, base09, base10, base11.
 *
 * Returns ORTE_ERR_NOT_FOUND if range contains no number, or if
 * something follows the first number but no second number is found;
 * ORTE_ERR_OUT_OF_RESOURCE if the scratch buffer cannot be allocated;
 * ORTE_SUCCESS otherwise (also when the second number is smaller than
 * the first, in which case nothing is appended).
 */
static int orte_ras_slurm_parse_range(char *base, char *range, char ***names)
{
    char *str, *cursor, *num_end;
    size_t value, start, end;
    size_t base_len, len;
    size_t num_str_len, max_digits;

    base_len = strlen(base);

    /* Look for the beginning of the first number */
    for (cursor = range; '\0' != *cursor; ++cursor) {
        if (isdigit((unsigned char) *cursor)) {
            break;
        }
    }
    if ('\0' == *cursor) {
        return ORTE_ERR_NOT_FOUND;
    }

    /* Convert the first number; strtol also reports where it ends,
       which gives the width to zero-pad every generated value to */
    start = (size_t) strtol(cursor, &num_end, 10);
    num_str_len = (size_t) (num_end - cursor);

    if ('\0' == *num_end) {
        /* There was no range, just a single number */
        end = start;
    } else {
        /* There was a range.  Look for the beginning of the second
           number */
        for (cursor = num_end; '\0' != *cursor; ++cursor) {
            if (isdigit((unsigned char) *cursor)) {
                break;
            }
        }
        if ('\0' == *cursor) {
            return ORTE_ERR_NOT_FOUND;
        }
        end = (size_t) strtol(cursor, NULL, 10);
    }

    /* One scratch buffer is enough for all values: room for the base,
       the widest number that can be printed (3 decimal digits per byte
       is a safe upper bound) or the requested padding, and the
       terminator */
    max_digits = 3 * sizeof(unsigned long);
    if (num_str_len > max_digits) {
        max_digits = num_str_len;
    }
    len = base_len + max_digits + 1;
    str = malloc(len);
    if (NULL == str) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* Make strings for all values in the range */
    for (value = start; value <= end; ++value) {
        /* "%0*lu" does the zero padding to num_str_len digits */
        snprintf(str, len, "%s%0*lu", base, (int) num_str_len,
                 (unsigned long) value);
        /* The argv keeps its own copy of the string, so the scratch
           buffer can be reused for the next value.
           NOTE(review): the return value is not checked here */
        opal_argv_append_nosize(names, str);
    }
    free(str);

    /* All done */
    return ORTE_SUCCESS;
}