1
1
openmpi/orte/mca/ras/slurm/ras_slurm_module.c
Ralph Castain f4a458532b This doesn't totally resolve the comm_spawn problem, but it helps a little. I'll continue working on it and hope to resolve it completely shortly. The issue primarily centers on where to start mapping the child job's processes, and how to deal with oversubscription that might result. At the moment, I am trying to resolve the first issue first (hey, that even sounds right!).
This change does a couple of things:

1. Since the USE_PARENT_ALLOC attribute is a directive regarding the allocation of resources to a job, it more properly should be an attribute of the RAS. Change the name to reflect that and move the attribute define to the ras_types.h file.

2. Add the attributes list to the RMAPS map_job interface. This provides us with the desired flexibility to dynamically specify directives for mapping. The system will - in the absence of any attribute-based directive - default to the values provided in the MCA parameters (either from environment or command-line interface).

This commit was SVN r12164.
2006-10-18 14:01:44 +00:00

430 строки
12 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "orte/dss/dss.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/base/ras_private.h"
#include "ras_slurm.h"
/*
* Local functions
*/
/* Entry points placed in the orte_ras_slurm_module table defined below */
static int allocate(orte_jobid_t jobid, opal_list_t *attributes);
static int deallocate(orte_jobid_t jobid);
static int finalize(void);
/* Internal helpers: allocate() calls discover(), which calls parse_ranges(),
   which calls parse_range() once per comma-separated range */
static int discover(char *regexp, opal_list_t *nodelist);
static int parse_ranges(char *base, char *ranges, char ***nodelist);
static int parse_range(char *base, char *range, char ***nodelist);
/*
* Global variable
*/
/* Function table for the SLURM RAS module.  The initializer is positional,
   so the entries must stay in the order of the members of
   orte_ras_base_module_t (declared outside this file -- verify there before
   reordering). */
orte_ras_base_module_t orte_ras_slurm_module = {
/* defined in this file */
allocate,
/* orte_ras_base_* functions are not defined in this file; presumably they
   are the shared implementations from the RAS base -- confirm */
orte_ras_base_node_insert,
orte_ras_base_node_query,
orte_ras_base_node_query_alloc,
orte_ras_base_node_lookup,
/* defined in this file; both only log a message and return ORTE_SUCCESS */
deallocate,
finalize
};
/**
 * Allocate nodes to a job.
 *
 * If the ORTE_RAS_USE_PARENT_ALLOCATION attribute is present, the parent
 * job's allocation (the jobid stored in the attribute) is re-assigned to
 * @jobid and nothing else is done.  Otherwise the node list is read from
 * the SLURM_NODELIST environment variable, expanded by discover(), and
 * handed to orte_ras_base_allocate_nodes().
 *
 * @param jobid       Job that receives the allocation.
 * @param attributes  Attribute list searched for
 *                    ORTE_RAS_USE_PARENT_ALLOCATION.
 *
 * @return ORTE_SUCCESS on success; ORTE_ERR_NOT_FOUND if SLURM_NODELIST is
 *         not set; otherwise the error code of the failing call.
 */
static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
    int ret;
    char *slurm_node_str;
    opal_list_t nodes;
    opal_list_item_t *item;
    orte_jobid_t *jptr;
    orte_attribute_t *attr;

    /* check the attributes to see if we are supposed to use the parent
     * jobid's allocation. This can occur if we are doing a dynamic
     * process spawn and don't want to go through the allocator again
     */
    if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
        /* attribute was given - just reallocate to the new jobid */
        if (ORTE_SUCCESS != (ret = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
        if (ORTE_SUCCESS != (ret = orte_ras_base_reallocate(*jptr, jobid))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
        return ORTE_SUCCESS;
    }

    slurm_node_str = getenv("SLURM_NODELIST");
    if (NULL == slurm_node_str) {
        opal_show_help("help-ras-slurm.txt", "env-var-not-found", 1,
                       "SLURM_NODELIST");
        return ORTE_ERR_NOT_FOUND;
    }

    OBJ_CONSTRUCT(&nodes, opal_list_t);
    ret = discover(slurm_node_str, &nodes);
    if (ORTE_SUCCESS != ret) {
        opal_output(orte_ras_base.ras_output,
                    "ras:slurm:allocate: discover failed!");
    } else {
        ret = orte_ras_base_allocate_nodes(jobid, &nodes);
    }

    /* Always drain and destruct the list -- discover() may have appended
     * nodes before it failed, and they are owned by this function. */
    while (NULL != (item = opal_list_remove_first(&nodes))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes);

    if (ORTE_SUCCESS != ret) {
        /* the discover failure was already reported above; this message
         * is only emitted when discover succeeded but allocation failed */
        if (NULL != slurm_node_str) {
            opal_output(orte_ras_base.ras_output,
                        "ras:slurm:allocate: failure (base_allocate_nodes=%d)", ret);
        }
        return ret;
    }

    /* All done */
    opal_output(orte_ras_base.ras_output,
                "ras:slurm:allocate: success");
    return ORTE_SUCCESS;
}
/*
 * Deallocation entry point.  No work is performed for the given job; the
 * call is only reported on the RAS output stream.
 */
static int deallocate(orte_jobid_t jobid)
{
    const int rc = ORTE_SUCCESS;

    (void) jobid;   /* intentionally unused */

    opal_output(orte_ras_base.ras_output,
                "ras:slurm:deallocate: success (nothing to do)");

    return rc;
}
/*
 * Finalize entry point.  No work is performed; the call is only reported
 * on the RAS output stream.
 */
static int finalize(void)
{
    const int rc = ORTE_SUCCESS;

    opal_output(orte_ras_base.ras_output,
                "ras:slurm:finalize: success (nothing to do)");

    return rc;
}
/**
 * Discover the nodes described by a SLURM node-list expression.
 *
 * The expression is either a plain name (taken verbatim as a single node)
 * or "base[ranges]", where ranges is a comma-separated list of numbers or
 * number ranges expanded by parse_ranges().  Only the first bracket group
 * is examined; any text after its closing ']' is ignored.
 *
 * The slot count of each node is read from the SLURM_TASKS_PER_NODE
 * environment variable, a comma-separated list of "count" or
 * "count(xreps)" entries.  Nodes without an entry get 0 slots; entries
 * beyond the number of discovered nodes are ignored.
 *
 * One orte_ras_node_t per node is appended to nodelist and the list is
 * then passed to orte_ras_base_node_insert().
 *
 * @param regexp    Node-list expression (allocate() passes the value of
 *                  SLURM_NODELIST).  NULL is accepted and is a no-op.
 * @param nodelist  List that receives the new nodes.  On failure, nodes
 *                  appended before the error remain on the list and are
 *                  owned by the caller.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE on allocation failure,
 *         ORTE_ERR_NOT_FOUND for an unterminated '[' or a missing or
 *         malformed SLURM_TASKS_PER_NODE, or the error returned by
 *         parse_ranges() / orte_ras_base_node_insert().
 */
static int discover(char *regexp, opal_list_t* nodelist)
{
    int i, j, len, ret, count, reps, num_nodes;
    char *base = NULL, **names = NULL;
    char *begptr, *endptr, *tasks_per_node;
    int *slots = NULL;

    if (NULL == regexp) {
        return ORTE_SUCCESS;
    }

    /* Work on a private copy: the parsing below writes '\0' into it */
    base = strdup(regexp);
    if (NULL == base) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    opal_output(orte_ras_base.ras_output,
                "ras:slurm:allocate:discover: checking nodelist: %s", regexp);

    /* Find the base: everything in front of the first '[' */
    len = (int) strlen(regexp);
    for (i = 0; i < len; ++i) {
        if (base[i] == '[') {
            base[i] = '\0';
            break;
        }
    }

    if (i >= len) {
        /* If we didn't find a range, then this is it.  The argv append
         * copies the string, so base is still released at cleanup. */
        opal_output(orte_ras_base.ras_output,
                    "ras:slurm:allocate:discover: found single node");
        ret = opal_argv_append_nosize(&names, base);
    } else {
        /* If we did find a range, find the end of the range */
        for (j = i + 1; j < len; ++j) {
            if (base[j] == ']') {
                base[j] = '\0';
                break;
            }
        }
        if (j >= len) {
            ret = ORTE_ERR_NOT_FOUND;
            goto cleanup;
        }
        ret = parse_ranges(base, base + i + 1, &names);
    }
    if (ORTE_SUCCESS != ret) {
        goto cleanup;
    }

    /* Find the number of slots per node */
    tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
    if (NULL == tasks_per_node) {
        opal_show_help("help-ras-slurm.txt", "env-var-not-found", 1,
                       "SLURM_TASKS_PER_NODE");
        ret = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    /* Zero-filled so that nodes without a description get 0 slots.  At
     * least one element is requested so that an empty name list is not
     * mistaken for an allocation failure. */
    num_nodes = opal_argv_count(names);
    slots = calloc((size_t) (num_nodes > 0 ? num_nodes : 1), sizeof(*slots));
    if (NULL == slots) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    begptr = tasks_per_node;
    j = 0;
    for (;;) {
        count = (int) strtol(begptr, &endptr, 10);
        if ((endptr[0] == '(') && (endptr[1] == 'x')) {
            reps = (int) strtol((endptr + 2), &endptr, 10);
            if (endptr[0] == ')') {
                endptr++;
            }
        } else {
            reps = 1;
        }

        /* Never write past the end of slots[]: the variable may describe
         * more nodes than the node list names, so extras are dropped. */
        for (i = 0; i < reps && j < num_nodes; i++) {
            slots[j++] = count;
        }

        if (*endptr == ',') {
            begptr = endptr + 1;
        } else if (*endptr == '\0') {
            break;
        } else {
            opal_show_help("help-ras-slurm.txt", "env-var-bad-value", 1,
                           "SLURM_TASKS_PER_NODE", tasks_per_node);
            ret = ORTE_ERR_NOT_FOUND;
            goto cleanup;
        }
    }

    /* Convert the argv of node names to a list of ras_base_node_t's */
    for (i = 0; i < num_nodes; ++i) {
        orte_ras_node_t *node;

        opal_output(orte_ras_base.ras_output,
                    "ras:slurm:allocate:discover: adding node %s (%d slot%s)",
                    names[i], slots[i], (1 == slots[i]) ? "" : "s");
        node = OBJ_NEW(orte_ras_node_t);
        if (NULL == node) {
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        node->node_name = strdup(names[i]);
        if (NULL == node->node_name) {
            OBJ_RELEASE(node);
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        node->node_arch = NULL;
        node->node_state = ORTE_NODE_STATE_UP;
        /* JMS: this should not be hard-wired to 0, but there's no
           other value to put it to [yet]... */
        node->node_cellid = 0;
        node->node_slots_inuse = 0;
        node->node_slots_max = 0;
        node->node_slots = slots[i];
        opal_list_append(nodelist, &node->super);
    }

    /* Now add the nodes to the registry */
    ret = orte_ras_base_node_insert(nodelist);

cleanup:
    /* Single exit point: release every temporary on all paths */
    free(slots);
    if (NULL != names) {
        opal_argv_free(names);
    }
    free(base);
    return ret;
}
/*
 * Split a comma-separated set of ranges and expand each piece.
 *
 * Every ',' in ranges is overwritten with '\0' (the buffer is modified in
 * place) and each resulting piece is handed to parse_range() together
 * with base; the generated names are appended to *names.  An empty piece
 * after a trailing comma is skipped.  The first parse_range() error stops
 * processing and is returned; otherwise ORTE_SUCCESS.
 */
static int parse_ranges(char *base, char *ranges, char ***names)
{
    char *piece = ranges;
    char *const stop = ranges + strlen(ranges);
    char *comma;
    int rc;

    /* Walk from comma to comma, the separator between ranges */
    while (NULL != (comma = strchr(piece, ','))) {
        *comma = '\0';
        rc = parse_range(base, piece, names);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
        piece = comma + 1;
    }

    /* Nothing left after the final separator */
    if (piece >= stop) {
        return ORTE_SUCCESS;
    }

    /* Pick up the last range */
    opal_output(orte_ras_base.ras_output,
                "ras:slurm:allocate:discover: parse range %s (2)",
                piece);
    rc = parse_range(base, piece, names);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* All done */
    return ORTE_SUCCESS;
}
/*
 * Parse a single range in a set and append one node name per value.
 *
 * range holds either one number ("7") or two numbers separated by
 * non-digit characters ("07-12").  For every value from the first number
 * to the second (inclusive) the string base + value is appended to
 * *names.  Values are zero-padded on the left to the number of digits the
 * first number was written with, so "08-10" yields 08, 09, 10.  If the
 * second number is smaller than the first, nothing is appended.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if range contains no number or
 * a separator without a second number, ORTE_ERR_OUT_OF_RESOURCE if a name
 * cannot be built, or the error from opal_argv_append_nosize().
 */
static int parse_range(char *base, char *range, char ***names)
{
    char digits[64];    /* decimal text of one value; ample for unsigned long */
    char *str;
    unsigned long value, start, end;
    size_t i, len, base_len;
    size_t num_str_len, digits_len, pad_len;
    int rc;

    len = strlen(range);
    base_len = strlen(base);

    /* Look for the beginning of the first number.  The cast to unsigned
       char keeps isdigit() defined for bytes with the high bit set. */
    for (i = 0; i < len; ++i) {
        if (isdigit((unsigned char) range[i])) {
            break;
        }
    }
    if (i >= len) {
        return ORTE_ERR_NOT_FOUND;
    }
    start = strtoul(range + i, NULL, 10);

    /* Look for the end of the first number; its width in characters is
       the field width used for zero padding */
    for (num_str_len = 0; i < len; ++i, ++num_str_len) {
        if (!isdigit((unsigned char) range[i])) {
            break;
        }
    }

    if (i >= len) {
        /* There was no range, just a single number */
        end = start;
    } else {
        /* There was a range. Look for the beginning of the second number */
        for (; i < len; ++i) {
            if (isdigit((unsigned char) range[i])) {
                break;
            }
        }
        if (i >= len) {
            return ORTE_ERR_NOT_FOUND;
        }
        end = strtoul(range + i, NULL, 10);
    }

    /* Make strings for all values in the range */
    for (value = start; value <= end; ++value) {
        rc = snprintf(digits, sizeof(digits), "%lu", value);
        if (rc < 0 || (size_t) rc >= sizeof(digits)) {
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        digits_len = (size_t) rc;

        /* Do we need zero padding? */
        pad_len = (digits_len < num_str_len) ? (num_str_len - digits_len) : 0;

        /* Size the buffer exactly so that neither a long base nor a wide
           number field can overflow or be truncated */
        str = malloc(base_len + pad_len + digits_len + 1);
        if (NULL == str) {
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        memcpy(str, base, base_len);
        memset(str + base_len, '0', pad_len);
        memcpy(str + base_len + pad_len, digits, digits_len + 1);

        /* The argv keeps its own copy of the string (discover() likewise
           frees its buffer after appending), so release ours */
        rc = opal_argv_append_nosize(names, str);
        free(str);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    /* All done */
    return ORTE_SUCCESS;
}