1
1

Fix regular expression analyzer for slurmd - use a slurm-specific version

Fix multi-node routing for daemon startup when static ports are not set

This commit was SVN r24898.
Этот коммит содержится в:
Ralph Castain 2011-07-13 22:49:56 +00:00
родитель 8d1b31b887
Коммит 8853e0e80a
9 изменённых файлов: 400 добавлений и 212 удалений

Просмотреть файл

@ -16,6 +16,8 @@
# $HEADER$ # $HEADER$
# #
dist_pkgdata_DATA = help-ess-slurmd.txt
sources = \ sources = \
ess_slurmd.h \ ess_slurmd.h \
ess_slurmd_component.c \ ess_slurmd_component.c \

Просмотреть файл

@ -92,6 +92,11 @@ orte_ess_base_module_t orte_ess_slurmd_module = {
static bool app_init_complete; static bool app_init_complete;
static bool slurm20; static bool slurm20;
/* Local functions */
static int discover_nodes(char *regexp, char*** nodelist);
static int parse_ranges(char *base, char *ranges, char ***names);
static int parse_range(char *base, char *range, char ***names);
/**** MODULE FUNCTIONS ****/ /**** MODULE FUNCTIONS ****/
static int rte_init(void) static int rte_init(void)
@ -280,7 +285,7 @@ static int rte_init(void)
goto error; goto error;
} }
/* break that down into a list of nodes */ /* break that down into a list of nodes */
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(regexp, &nodes))) { if (ORTE_SUCCESS != (ret = discover_nodes(regexp, &nodes))) {
error = "could not parse node list"; error = "could not parse node list";
goto error; goto error;
} }
@ -604,38 +609,278 @@ static int update_nidmap(opal_byte_object_t *bo)
} }
/**
* Discover the available resources.
*
* In order to fully support slurm, we need to be able to handle
* node regexp/task_per_node strings such as:
* foo,bar 5,3
* foo 5
* foo[2-10,12,99-105],bar,foobar[3-11] 2(x10),5,100(x16)
*
* @param *regexp A node regular expression from SLURM (i.e. SLURM_NODELIST)
* @param **nodelist argv array to return the found nodes in
*/
static int discover_nodes(char *regexp, char*** names)
{
int i, j, len, ret;
char *base;
char *orig;
bool found_range = false;
bool more_to_come = false;
#if 0 orig = base = strdup(regexp);
/*** AVAILABLE SLURM ENVARS ***/ if (NULL == base) {
SLURM_JOB_ID=38749 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
SLURM_JOB_NUM_NODES=1 return ORTE_ERR_OUT_OF_RESOURCE;
SLURM_JOB_NODELIST=odin097 }
SLURM_JOB_CPUS_PER_NODE=4
SLURM_JOBID=38749 OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
SLURM_NNODES=1 "%s ess:slurmd:discover: checking nodelist: %s",
SLURM_NODELIST=odin097 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
SLURM_TASKS_PER_NODE=2 regexp));
SLURM_PRIO_PROCESS=0
SLURM_UMASK=0022 do {
SLURM_NPROCS=2 /* Find the base */
SLURM_CPUS_PER_TASK=1 len = strlen(base);
SLURM_STEPID=1 for (i = 0; i <= len; ++i) {
SLURM_SRUN_COMM_PORT=33650 if (base[i] == '[') {
SLURM_STEP_ID=1 /* we found a range. this gets dealt with below */
SLURM_STEP_NODELIST=odin097 base[i] = '\0';
SLURM_STEP_NUM_NODES=1 found_range = true;
SLURM_STEP_NUM_TASKS=2 break;
SLURM_STEP_TASKS_PER_NODE=2 }
SLURM_STEP_LAUNCHER_HOSTNAME=(null) if (base[i] == ',') {
SLURM_STEP_LAUNCHER_PORT=33650 /* we found a singleton node, and there are more to come */
SLURM_SRUN_COMM_HOST=129.79.240.100 base[i] = '\0';
SLURM_TASK_PID=5528 found_range = false;
SLURM_CPUS_ON_NODE=4 more_to_come = true;
SLURM_NODEID=0 break;
SLURM_PROCID=1 }
SLURM_LOCALID=1 if (base[i] == '\0') {
SLURM_LAUNCH_NODE_IPADDR=129.79.240.100 /* we found a singleton node */
SLURM_GTIDS=0,1 found_range = false;
SLURM_CHECKPOINT_PATH=/nfs/rinfs/san/homedirs/rhc more_to_come = false;
SLURMD_NODENAME=odin097 break;
#endif }
}
if(i == 0) {
/* we found a special character at the beginning of the string */
orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp);
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
free(orig);
return ORTE_ERR_BAD_PARAM;
}
if (found_range) {
/* If we found a range, now find the end of the range */
for (j = i; j < len; ++j) {
if (base[j] == ']') {
base[j] = '\0';
break;
}
}
if (j >= len) {
/* we didn't find the end of the range */
orte_show_help("help-ess-slurdm.txt", "slurm-env-var-bad-value", 1, regexp);
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
free(orig);
return ORTE_ERR_BAD_PARAM;
}
ret = parse_ranges(base, base + i + 1, names);
if(ORTE_SUCCESS != ret) {
orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp);
ORTE_ERROR_LOG(ret);
free(orig);
return ret;
}
if(base[j + 1] == ',') {
more_to_come = true;
base = &base[j + 2];
} else {
more_to_come = false;
}
} else {
/* If we didn't find a range, just add the node */
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s ess:slurmd:discover: found node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
base));
if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(names, base))) {
ORTE_ERROR_LOG(ret);
free(orig);
return ret;
}
/* set base equal to the (possible) next base to look at */
base = &base[i + 1];
}
} while(more_to_come);
free(orig);
/* All done */
return ret;
}
/*
 * Split a bracketed range set on commas and expand every piece.
 *
 * The range text is modified in place: each comma is overwritten with a
 * NUL so the pieces can be handed to parse_range() as separate strings.
 *
 * @param base     The base text of the node name
 * @param *ranges  The text of one or more ranges
 *                 (i.e. "1-3,10" or "5" or "9,0100-0130,250")
 * @param ***names An argv array to add the newly discovered nodes to
 *
 * Returns ORTE_SUCCESS, or the first error reported by parse_range().
 */
static int parse_ranges(char *base, char *ranges, char ***names)
{
    /* remember where the text ends before any comma is overwritten */
    char *const tail = ranges + strlen(ranges);
    char *piece = ranges;
    char *comma;
    int rc;

    /* Every comma terminates one piece */
    while (NULL != (comma = strchr(piece, ','))) {
        *comma = '\0';
        rc = parse_range(base, piece, names);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        piece = comma + 1;
    }

    /* Whatever follows the final comma (or the whole text, if there was
       no comma) is the last piece; skip it when it is empty */
    if (piece < tail) {
        OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                             "%s ess:slurmd:discover: parse range %s (2)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             piece));

        rc = parse_range(base, piece, names);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
/*
* Parse a single range in a set and add the full names of the nodes
* found to the names argv
*
* @param base The base text of the node name
* @param *ranges A pointer to a single range. (i.e. "1-3" or "5")
* @param ***names An argv array to add the newly discovered nodes to
*/
static int parse_range(char *base, char *range, char ***names)
{
char *str, temp1[BUFSIZ];
size_t i, j, start, end;
size_t base_len, len, num_len;
size_t str_start, str_end;
size_t num_str_len;
bool found;
int ret;
len = strlen(range);
base_len = strlen(base);
/* Silence compiler warnings; start and end are always assigned
properly, below */
start = end = 0;
/* Look for the beginning of the first number */
for (found = false, i = 0; i < len; ++i) {
if (isdigit((int) range[i])) {
if (!found) {
str_start = i;
start = atoi(range + i);
found = true;
break;
}
}
}
if (!found) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* Look for the end of the first number */
for (found = false, num_str_len = 0; i < len; ++i, ++num_str_len) {
if (!isdigit((int) range[i])) {
break;
}
}
/* Was there no range, just a single number? */
if (i >= len) {
str_end = len;
end = start;
found = true;
}
/* Nope, there was a range. Look for the beginning of the second
number */
else {
str_end = i - 1;
for (; i < len; ++i) {
if (isdigit((int) range[i])) {
end = atoi(range + i);
found = true;
break;
}
}
}
if (!found) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* Make strings for all values in the range */
len = base_len + num_str_len + 32;
str = malloc(len);
if (NULL == str) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
strcpy(str, base);
for (i = start; i <= end; ++i) {
str[base_len] = '\0';
snprintf(temp1, BUFSIZ - 1, "%lu", (long) i);
/* Do we need zero pading? */
if ((num_len = strlen(temp1)) < num_str_len) {
for (j = base_len; j < base_len + (num_str_len - num_len); ++j) {
str[j] = '0';
}
str[j] = '\0';
}
strcat(str, temp1);
ret = opal_argv_append_nosize(names, str);
if(ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
free(str);
return ret;
}
}
free(str);
/* All done */
return ORTE_SUCCESS;
}

41
orte/mca/ess/slurmd/help-ess-slurmd.txt Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
# -*- text -*-
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI MCA error messages.
#
[slurm-env-var-not-found]
While trying to determine what resources are available, ORTE
expects to find the following environment variables:
SLURM_NODELIST
SLURM_TASKS_PER_NODE
However, it was unable to find the following environment variable:
%s
#This is a fatal error.
[slurm-env-var-bad-value]
While trying to determine what nodes are being used, ORTE
uses the following environment variable:
SLURM_NODELIST value: %s
However, an error was encountered when trying to parse it.
This is a fatal error.

Просмотреть файл

@ -386,15 +386,23 @@ static orte_process_name_t get_route(orte_process_name_t *target)
} }
/* THIS CAME FROM OUR OWN JOB FAMILY... */ /* THIS CAME FROM OUR OWN JOB FAMILY... */
if (orte_static_ports && if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) { if (orte_static_ports) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing to the HNP through my parent %s", "%s routing to the HNP through my parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
ret = ORTE_PROC_MY_PARENT; ret = ORTE_PROC_MY_PARENT;
goto found; goto found;
} else {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing direct to the HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ret = ORTE_PROC_MY_HNP;
goto found;
} }
}
daemon.jobid = ORTE_PROC_MY_NAME->jobid; daemon.jobid = ORTE_PROC_MY_NAME->jobid;
/* find out what daemon hosts this proc */ /* find out what daemon hosts this proc */

Просмотреть файл

@ -347,15 +347,21 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* THIS CAME FROM OUR OWN JOB FAMILY... */ /* THIS CAME FROM OUR OWN JOB FAMILY... */
/* if we are using static ports and this is going to the HNP, send through my parent */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
if (orte_static_ports && if (orte_static_ports) {
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing to the HNP through my parent %s", "%s routing to the HNP through my parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
ret = ORTE_PROC_MY_PARENT; ret = ORTE_PROC_MY_PARENT;
goto found; goto found;
} else {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing direct to the HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ret = ORTE_PROC_MY_HNP;
goto found;
}
} }
daemon.jobid = ORTE_PROC_MY_NAME->jobid; daemon.jobid = ORTE_PROC_MY_NAME->jobid;

Просмотреть файл

@ -369,15 +369,21 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* THIS CAME FROM OUR OWN JOB FAMILY... */ /* THIS CAME FROM OUR OWN JOB FAMILY... */
/* if we are using static ports and this is going to the HNP, send through my parent */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
if (orte_static_ports && if (orte_static_ports) {
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing to the HNP through my parent %s", "%s routing to the HNP through my parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
ret = ORTE_PROC_MY_PARENT; ret = ORTE_PROC_MY_PARENT;
goto found; goto found;
} else {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing direct to the HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ret = ORTE_PROC_MY_HNP;
goto found;
}
} }
daemon.jobid = ORTE_PROC_MY_NAME->jobid; daemon.jobid = ORTE_PROC_MY_NAME->jobid;

Просмотреть файл

@ -19,13 +19,30 @@ main(int argc, char **argv)
{ {
int rc; int rc;
char *regex, *save; char *regex, *save;
char **nodes; char **nodes=NULL;
int i;
if (argc < 1 || NULL == argv[1]) { if (argc < 1 || NULL == argv[1]) {
fprintf(stderr, "usage: regex <comma-separated list of nodes>\n"); fprintf(stderr, "usage: regex <comma-separated list of nodes>\n");
return 1; return 1;
} }
orte_init(&argc, &argv, ORTE_PROC_NON_MPI);
if (NULL != strchr(argv[1], '[')) {
/* given a regex to analyze */
fprintf(stderr, "ANALYZING REGEX: %s\n", argv[1]);
if (ORTE_SUCCESS != (rc = orte_regex_extract_node_names(argv[1], &nodes))) {
ORTE_ERROR_LOG(rc);
}
for (i=0; NULL != nodes; i++) {
fprintf(stderr, "%s\n", nodes[i]);
}
opal_argv_free(nodes);
orte_finalize();
return 0;
}
save = strdup(argv[1]); save = strdup(argv[1]);
if (ORTE_SUCCESS != (rc = orte_regex_create(save, &regex))) { if (ORTE_SUCCESS != (rc = orte_regex_create(save, &regex))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -57,4 +57,12 @@ it does not know how to parse:
regexp: %s regexp: %s
Please contact the Open MPI help list for assistance. Please contact the Open MPI help list for assistance.
#
[regex:num-digits-missing]
While trying to parse a regular expression to extract the node
names, the regex parser was unable to determine the number of
digits in the names:
regexp: %s
Please contact the Open MPI help list for assistance.

Просмотреть файл

@ -362,6 +362,11 @@ int orte_regex_extract_node_names(char *regexp, char ***names)
} else { } else {
suffix = NULL; suffix = NULL;
} }
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s regex:extract:nodenames: parsing range %s %s %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
base, base + i, suffix));
ret = regex_parse_node_ranges(base, base + i, num_digits, suffix, names); ret = regex_parse_node_ranges(base, base + i, num_digits, suffix, names);
if (NULL != suffix) { if (NULL != suffix) {
free(suffix); free(suffix);
@ -621,156 +626,6 @@ int orte_regex_extract_ppn(int num_nodes, char *regexp, int **ppn)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
#if 0
/*
 * Parse one node specification of the form
 *     name | name(procs) | name[ranges] | name[ranges](procs)
 * optionally with a suffix after the closing bracket, add the node
 * name(s) to the names argv, and extract the proc description
 * (starting vpid, procs-per-node, step, starting node rank).
 *
 * This function sits inside an "#if 0" region, so it is compiled out.
 *
 * @param orig        Specification to parse; a private copy is made
 * @param names       An argv array to add the discovered nodes to
 * @param vpid_start  Set to ORTE_VPID_INVALID, then to the parsed
 *                    starting vpid if a proc description is present
 * @param ppn         Receives the procs-per-node value, if present
 * @param step        Receives the step between vpids, if present
 * @param nrank       Receives the starting node rank, if present
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM, or ORTE_ERROR.
 */
static int parse_node_range(char *orig, char ***names, orte_vpid_t *vpid_start,
int *ppn, int *step, int *nrank)
{
char *base, *ptr, *ptr2, *next, *suffix;
int i, j, len, rc=ORTE_SUCCESS;
bool found_range;
/* protect input */
/* NOTE(review): the strdup() result is not checked before use */
base = strdup(orig);
/* NOTE(review): this assigns the character constant '\0' to a pointer;
   presumably NULL was meant -- confirm */
suffix = '\0';
/* default to no procs */
*vpid_start = ORTE_VPID_INVALID;
/* start by searching for ranges and proc specifications */
len = strlen(base);
ptr = NULL;
found_range = false;
/* i <= len so that the terminating NUL is examined as well */
for (i = 0; i <= len; ++i) {
if (base[i] == '[') {
/* we found a range. this gets dealt with below */
base[i] = '\0';
found_range = true;
break;
}
if (base[i] == '\0') {
/* we found a singleton node - no procs on it */
base[i] = '\0';
found_range = false;
break;
}
if (base[i] == '(') {
/* we found a singleton node that has procs on it */
base[i] = '\0';
found_range = false;
ptr = &base[i+1];
break;
}
}
if (i == 0) {
/* we found a special character at the beginning of the string */
orte_show_help("help-regex.txt", "regex:special-char", true, orig);
rc = ORTE_ERR_BAD_PARAM;
goto cleanup;
}
if (found_range) {
/* If we found a range, now find the end of the range */
for (j = i; j < len; ++j) {
if (base[j] == ']') {
base[j] = '\0';
if (j < len-2) {
if (base[j+1] == '(') {
/* procs are in this range and there is no suffix */
ptr = &base[j+2];
} else {
/* we must have a suffix */
/* NOTE(review): base[j+1] is a char but suffix is a char *;
   looks like &base[j+1] was intended -- confirm */
suffix = base[j+1];
if (j < len-3 && base[j+2] == '(') {
/* we also have procs in this range */
ptr = &base[j+3];
}
}
}
break;
}
}
if (j >= len) {
/* we didn't find the end of the range */
orte_show_help("help-regex.txt", "regex:end-range-missing", true, orig);
rc = ORTE_ERR_BAD_PARAM;
goto cleanup;
}
/* base is the node-name prefix, base + i + 1 the text between [ and ] */
rc = regex_parse_node_range(base, base + i + 1, suffix, names);
if (ORTE_SUCCESS != rc) {
orte_show_help("help-regex.txt", "regex:bad-value", true, orig);
rc = ORTE_ERR_BAD_PARAM;
goto cleanup;
}
} else {
/* If we didn't find a range, just add the node */
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s regex:extract:nodenames: found node: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), base));
if(ORTE_SUCCESS != (rc = opal_argv_append_nosize(names, base))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto cleanup;
}
}
/* NOTE(review): every "return ORTE_ERROR" below bypasses the cleanup
   label, so the strdup'd copy in base is not freed on those paths */
if (NULL != ptr) { /* we have procs on these nodes */
/* find the end of the description */
ptr2 = strchr(ptr, ')');
if (NULL == ptr2) {
/* malformed */
orte_show_help("help-regex.txt", "regex:bad-value", true, ptr);
return ORTE_ERROR;
}
*ptr2 = '\0';
/* the proc description is in the format:
* starting-vpidxppn:step:starting-node-rank
* where step=step between vpids
*/
/* start by extracting the starting vpid */
if (NULL == (ptr2 = strchr(ptr, 'x'))) {
/* malformed */
orte_show_help("help-regex.txt", "regex:bad-value", true, ptr);
return ORTE_ERROR;
}
*ptr2 = '\0';
orte_util_convert_string_to_vpid(vpid_start, ptr);
/* get ppn */
next = ptr2 + 1;
if (NULL == (ptr2 = strchr(next, ':'))) {
/* malformed */
orte_show_help("help-regex.txt", "regex:bad-value", true, next);
return ORTE_ERROR;
}
*ptr2 = '\0';
*ppn = strtol(next, NULL, 10);
/* get step */
next = ptr2 + 1;
if (NULL == (ptr2 = strchr(next, ':'))) {
/* malformed */
orte_show_help("help-regex.txt", "regex:bad-value", true, next);
return ORTE_ERROR;
}
*ptr2 = '\0';
*step = strtol(next, NULL, 10);
/* get the starting node rank */
next = ptr2 + 1;
*nrank = strtol(next, NULL, 10);
}
cleanup:
/* release the private copy made at the top */
free(base);
return rc;
}
#endif
static void range_construct(orte_regex_range_t *ptr) static void range_construct(orte_regex_range_t *ptr)
{ {
ptr->start = 0; ptr->start = 0;