diff --git a/orte/mca/ess/slurmd/Makefile.am b/orte/mca/ess/slurmd/Makefile.am index 93cbe08b79..9431dbb3bd 100644 --- a/orte/mca/ess/slurmd/Makefile.am +++ b/orte/mca/ess/slurmd/Makefile.am @@ -16,6 +16,8 @@ # $HEADER$ # +dist_pkgdata_DATA = help-ess-slurmd.txt + sources = \ ess_slurmd.h \ ess_slurmd_component.c \ diff --git a/orte/mca/ess/slurmd/ess_slurmd_module.c b/orte/mca/ess/slurmd/ess_slurmd_module.c index fcc8860179..372ae1369c 100644 --- a/orte/mca/ess/slurmd/ess_slurmd_module.c +++ b/orte/mca/ess/slurmd/ess_slurmd_module.c @@ -92,6 +92,11 @@ orte_ess_base_module_t orte_ess_slurmd_module = { static bool app_init_complete; static bool slurm20; +/* Local functions */ +static int discover_nodes(char *regexp, char ***names); +static int parse_ranges(char *base, char *ranges, char ***names); +static int parse_range(char *base, char *range, char ***names); + /**** MODULE FUNCTIONS ****/ static int rte_init(void) @@ -280,7 +285,7 @@ static int rte_init(void) goto error; } /* break that down into a list of nodes */ - if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(regexp, &nodes))) { + if (ORTE_SUCCESS != (ret = discover_nodes(regexp, &nodes))) { error = "could not parse node list"; goto error; } @@ -604,38 +609,278 @@ static int update_nidmap(opal_byte_object_t *bo) } +/** + * Discover the available resources. + * + * In order to fully support slurm, we need to be able to handle + * node regexp/task_per_node strings such as: + * foo,bar 5,3 + * foo 5 + * foo[2-10,12,99-105],bar,foobar[3-11] 2(x10),5,100(x16) + * + * @param *regexp A node regular expression from SLURM (i.e. 
SLURM_NODELIST) + * @param ***names argv array to append the found node names to + */ +static int discover_nodes(char *regexp, char*** names) +{ + int i, j, len, ret; + char *base; + char *orig; + bool found_range = false; + bool more_to_come = false; + + orig = base = strdup(regexp); + if (NULL == base) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, + "%s ess:slurmd:discover: checking nodelist: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + regexp)); + + do { + /* Find the base */ + len = strlen(base); + for (i = 0; i <= len; ++i) { + if (base[i] == '[') { + /* we found a range. this gets dealt with below */ + base[i] = '\0'; + found_range = true; + break; + } + if (base[i] == ',') { + /* we found a singleton node, and there are more to come */ + base[i] = '\0'; + found_range = false; + more_to_come = true; + break; + } + if (base[i] == '\0') { + /* we found a singleton node */ + found_range = false; + more_to_come = false; + break; + } + } + if(i == 0) { + /* we found a special character at the beginning of the string */ + orte_show_help("help-ess-slurmd.txt", "slurm-env-var-bad-value", 1, regexp); + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + free(orig); + return ORTE_ERR_BAD_PARAM; + } + + if (found_range) { + /* If we found a range, now find the end of the range */ + for (j = i; j < len; ++j) { + if (base[j] == ']') { + base[j] = '\0'; + break; + } + } + if (j >= len) { + /* we didn't find the end of the range */ + orte_show_help("help-ess-slurmd.txt", "slurm-env-var-bad-value", 1, regexp); + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + free(orig); + return ORTE_ERR_BAD_PARAM; + } + + ret = parse_ranges(base, base + i + 1, names); + if(ORTE_SUCCESS != ret) { + orte_show_help("help-ess-slurmd.txt", "slurm-env-var-bad-value", 1, regexp); + ORTE_ERROR_LOG(ret); + free(orig); + return ret; + } + if(base[j + 1] == ',') { + more_to_come = true; + base = &base[j + 2]; + } else { + more_to_come = 
false; + } + } else { + /* If we didn't find a range, just add the node */ + + OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, + "%s ess:slurmd:discover: found node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + base)); + + if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(names, base))) { + ORTE_ERROR_LOG(ret); + free(orig); + return ret; + } + /* set base equal to the (possible) next base to look at */ + base = &base[i + 1]; + } + } while(more_to_come); + + free(orig); + + /* All done */ + return ret; +} -#if 0 -/*** AVAILABLE SLURM ENVARS ***/ -SLURM_JOB_ID=38749 -SLURM_JOB_NUM_NODES=1 -SLURM_JOB_NODELIST=odin097 -SLURM_JOB_CPUS_PER_NODE=4 -SLURM_JOBID=38749 -SLURM_NNODES=1 -SLURM_NODELIST=odin097 -SLURM_TASKS_PER_NODE=2 -SLURM_PRIO_PROCESS=0 -SLURM_UMASK=0022 -SLURM_NPROCS=2 -SLURM_CPUS_PER_TASK=1 -SLURM_STEPID=1 -SLURM_SRUN_COMM_PORT=33650 -SLURM_STEP_ID=1 -SLURM_STEP_NODELIST=odin097 -SLURM_STEP_NUM_NODES=1 -SLURM_STEP_NUM_TASKS=2 -SLURM_STEP_TASKS_PER_NODE=2 -SLURM_STEP_LAUNCHER_HOSTNAME=(null) -SLURM_STEP_LAUNCHER_PORT=33650 -SLURM_SRUN_COMM_HOST=129.79.240.100 -SLURM_TASK_PID=5528 -SLURM_CPUS_ON_NODE=4 -SLURM_NODEID=0 -SLURM_PROCID=1 -SLURM_LOCALID=1 -SLURM_LAUNCH_NODE_IPADDR=129.79.240.100 -SLURM_GTIDS=0,1 -SLURM_CHECKPOINT_PATH=/nfs/rinfs/san/homedirs/rhc -SLURMD_NODENAME=odin097 -#endif + +/* + * Parse one or more ranges in a set + * + * @param base The base text of the node name + * @param *ranges A pointer to a range. This can contain multiple ranges + * (i.e. 
"1-3,10" or "5" or "9,0100-0130,250") + * @param ***names An argv array to add the newly discovered nodes to + */ +static int parse_ranges(char *base, char *ranges, char ***names) +{ + int i, len, ret; + char *start, *orig; + + /* Look for commas, the separator between ranges */ + + len = strlen(ranges); + for (orig = start = ranges, i = 0; i < len; ++i) { + if (',' == ranges[i]) { + ranges[i] = '\0'; + ret = parse_range(base, start, names); + if (ORTE_SUCCESS != ret) { + ORTE_ERROR_LOG(ret); + return ret; + } + start = ranges + i + 1; + } + } + + /* Pick up the last range, if it exists */ + + if (start < orig + len) { + + OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, + "%s ess:slurmd:discover: parse range %s (2)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + start)); + + ret = parse_range(base, start, names); + if (ORTE_SUCCESS != ret) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + + /* All done */ + return ORTE_SUCCESS; +} + + +/* + * Parse a single range in a set and add the full names of the nodes + * found to the names argv + * + * @param base The base text of the node name + * @param *range A pointer to a single range. (i.e. 
"1-3" or "5") + * @param ***names An argv array to add the newly discovered nodes to + */ +static int parse_range(char *base, char *range, char ***names) +{ + char *str, temp1[BUFSIZ]; + size_t i, j, start, end; + size_t base_len, len, num_len; + size_t str_start, str_end; + size_t num_str_len; + bool found; + int ret; + + len = strlen(range); + base_len = strlen(base); + /* Silence compiler warnings; start and end are always assigned + properly, below */ + start = end = 0; + + /* Look for the beginning of the first number */ + + for (found = false, i = 0; i < len; ++i) { + if (isdigit((unsigned char) range[i])) { + if (!found) { + str_start = i; + start = atoi(range + i); + found = true; + break; + } + } + } + if (!found) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + + /* Look for the end of the first number */ + + for (found = false, num_str_len = 0; i < len; ++i, ++num_str_len) { + if (!isdigit((unsigned char) range[i])) { + break; + } + } + + /* Was there no range, just a single number? */ + + if (i >= len) { + str_end = len; + end = start; + found = true; + } + + /* Nope, there was a range. Look for the beginning of the second + number */ + + else { + str_end = i - 1; + for (; i < len; ++i) { + if (isdigit((unsigned char) range[i])) { + end = atoi(range + i); + found = true; + break; + } + } + } + if (!found) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + + /* Make strings for all values in the range */ + + len = base_len + num_str_len + 32; + str = malloc(len); + if (NULL == str) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + strcpy(str, base); + for (i = start; i <= end; ++i) { + str[base_len] = '\0'; + snprintf(temp1, sizeof(temp1), "%lu", (unsigned long) i); + + /* Do we need zero padding? 
*/ + + if ((num_len = strlen(temp1)) < num_str_len) { + for (j = base_len; j < base_len + (num_str_len - num_len); ++j) { + str[j] = '0'; + } + str[j] = '\0'; + } + strcat(str, temp1); + ret = opal_argv_append_nosize(names, str); + if(ORTE_SUCCESS != ret) { + ORTE_ERROR_LOG(ret); + free(str); + return ret; + } + } + free(str); + + /* All done */ + return ORTE_SUCCESS; +} diff --git a/orte/mca/ess/slurmd/help-ess-slurmd.txt b/orte/mca/ess/slurmd/help-ess-slurmd.txt new file mode 100644 index 0000000000..8e9bf8400f --- /dev/null +++ b/orte/mca/ess/slurmd/help-ess-slurmd.txt @@ -0,0 +1,41 @@ +# -*- text -*- +# +# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI MCA error messages. +# +[slurm-env-var-not-found] +While trying to determine what resources are available, ORTE +expects to find the following environment variables: + + SLURM_NODELIST + SLURM_TASKS_PER_NODE + +However, it was unable to find the following environment variable: + + %s + +#This is a fatal error. +[slurm-env-var-bad-value] +While trying to determine what nodes are being used, ORTE +uses the following environment variable: + + SLURM_NODELIST value: %s + +However, an error was encountered when trying to parse it + +This is a fatal error. 
diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index 8e1179ee11..40f307f48b 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -386,16 +386,24 @@ static orte_process_name_t get_route(orte_process_name_t *target) } /* THIS CAME FROM OUR OWN JOB FAMILY... */ - if (orte_static_ports && - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) { - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, - "%s routing to the HNP through my parent %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); - ret = ORTE_PROC_MY_PARENT; - goto found; + if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) { + if (orte_static_ports) { + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routing to the HNP through my parent %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); + ret = ORTE_PROC_MY_PARENT; + goto found; + } else { + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routing direct to the HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ret = ORTE_PROC_MY_HNP; + goto found; + } } + daemon.jobid = ORTE_PROC_MY_NAME->jobid; /* find out what daemon hosts this proc */ if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) { diff --git a/orte/mca/routed/linear/routed_linear.c b/orte/mca/routed/linear/routed_linear.c index 4bf57c9502..88943b8bf2 100644 --- a/orte/mca/routed/linear/routed_linear.c +++ b/orte/mca/routed/linear/routed_linear.c @@ -347,15 +347,21 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* THIS CAME FROM OUR OWN JOB FAMILY... 
*/ - /* if we are using static ports and this is going to the HNP, send through my parent */ - if (orte_static_ports && - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) { - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, - "%s routing to the HNP through my parent %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); - ret = ORTE_PROC_MY_PARENT; - goto found; + if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) { + if (orte_static_ports) { + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routing to the HNP through my parent %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); + ret = ORTE_PROC_MY_PARENT; + goto found; + } else { + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routing direct to the HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ret = ORTE_PROC_MY_HNP; + goto found; + } } daemon.jobid = ORTE_PROC_MY_NAME->jobid; diff --git a/orte/mca/routed/radix/routed_radix.c b/orte/mca/routed/radix/routed_radix.c index e507615bec..63a91599b8 100644 --- a/orte/mca/routed/radix/routed_radix.c +++ b/orte/mca/routed/radix/routed_radix.c @@ -369,15 +369,21 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* THIS CAME FROM OUR OWN JOB FAMILY... 
*/ - /* if we are using static ports and this is going to the HNP, send through my parent */ - if (orte_static_ports && - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) { - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, - "%s routing to the HNP through my parent %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); - ret = ORTE_PROC_MY_PARENT; - goto found; + if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) { + if (orte_static_ports) { + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routing to the HNP through my parent %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); + ret = ORTE_PROC_MY_PARENT; + goto found; + } else { + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routing direct to the HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ret = ORTE_PROC_MY_HNP; + goto found; + } } daemon.jobid = ORTE_PROC_MY_NAME->jobid; diff --git a/orte/test/system/regex.c b/orte/test/system/regex.c index 9d3fcd4006..1dad862c20 100644 --- a/orte/test/system/regex.c +++ b/orte/test/system/regex.c @@ -19,13 +19,30 @@ main(int argc, char **argv) { int rc; char *regex, *save; - char **nodes; + char **nodes=NULL; + int i; if (argc < 1 || NULL == argv[1]) { fprintf(stderr, "usage: regex \n"); return 1; } + orte_init(&argc, &argv, ORTE_PROC_NON_MPI); + + if (NULL != strchr(argv[1], '[')) { + /* given a regex to analyze */ + fprintf(stderr, "ANALYZING REGEX: %s\n", argv[1]); + if (ORTE_SUCCESS != (rc = orte_regex_extract_node_names(argv[1], &nodes))) { + ORTE_ERROR_LOG(rc); + } + for (i=0; NULL != nodes && NULL != nodes[i]; i++) { + fprintf(stderr, "%s\n", nodes[i]); + } + opal_argv_free(nodes); + orte_finalize(); + return 0; + } + save = strdup(argv[1]); if (ORTE_SUCCESS != (rc = orte_regex_create(save, &regex))) { ORTE_ERROR_LOG(rc); diff --git a/orte/util/help-regex.txt b/orte/util/help-regex.txt index 6e84269718..ef22db0901 
100644 --- 
a/orte/util/help-regex.txt +++ b/orte/util/help-regex.txt @@ -57,4 +57,12 @@ it does not know how to parse: regexp: %s Please contact the Open MPI help list for assistance. +# +[regex:num-digits-missing] +While trying to parse a regular expression to extract the node +names, the regex parser was unable to determine the number of +digits in the names: + regexp: %s + +Please contact the Open MPI help list for assistance. diff --git a/orte/util/regex.c b/orte/util/regex.c index e8379b8db8..bece5c3fdd 100644 --- a/orte/util/regex.c +++ b/orte/util/regex.c @@ -362,6 +362,11 @@ int orte_regex_extract_node_names(char *regexp, char ***names) } else { suffix = NULL; } + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s regex:extract:nodenames: parsing range %s %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + base, base + i, (NULL == suffix) ? "NULL" : suffix)); + ret = regex_parse_node_ranges(base, base + i, num_digits, suffix, names); if (NULL != suffix) { free(suffix); @@ -621,156 +626,6 @@ int orte_regex_extract_ppn(int num_nodes, char *regexp, int **ppn) return ORTE_SUCCESS; } -#if 0 -static int parse_node_range(char *orig, char ***names, orte_vpid_t *vpid_start, - int *ppn, int *step, int *nrank) -{ - char *base, *ptr, *ptr2, *next, *suffix; - int i, j, len, rc=ORTE_SUCCESS; - bool found_range; - - /* protect input */ - base = strdup(orig); - suffix = '\0'; - - /* default to no procs */ - *vpid_start = ORTE_VPID_INVALID; - - /* start by searching for ranges and proc specifications */ - len = strlen(base); - ptr = NULL; - found_range = false; - for (i = 0; i <= len; ++i) { - if (base[i] == '[') { - /* we found a range. 
this gets dealt with below */ - base[i] = '\0'; - found_range = true; - break; - } - if (base[i] == '\0') { - /* we found a singleton node - no procs on it */ - base[i] = '\0'; - found_range = false; - break; - } - if (base[i] == '(') { - /* we found a singleton node that has procs on it */ - base[i] = '\0'; - found_range = false; - ptr = &base[i+1]; - break; - } - } - if (i == 0) { - /* we found a special character at the beginning of the string */ - orte_show_help("help-regex.txt", "regex:special-char", true, orig); - rc = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - - if (found_range) { - /* If we found a range, now find the end of the range */ - for (j = i; j < len; ++j) { - if (base[j] == ']') { - base[j] = '\0'; - if (j < len-2) { - if (base[j+1] == '(') { - /* procs are in this range and there is no suffix */ - ptr = &base[j+2]; - } else { - /* we must have a suffix */ - suffix = base[j+1]; - if (j < len-3 && base[j+2] == '(') { - /* we also have procs in this range */ - ptr = &base[j+3]; - } - } - } - break; - } - } - if (j >= len) { - /* we didn't find the end of the range */ - orte_show_help("help-regex.txt", "regex:end-range-missing", true, orig); - rc = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - - rc = regex_parse_node_range(base, base + i + 1, suffix, names); - if (ORTE_SUCCESS != rc) { - orte_show_help("help-regex.txt", "regex:bad-value", true, orig); - rc = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - } else { - /* If we didn't find a range, just add the node */ - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s regex:extract:nodenames: found node: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), base)); - - if(ORTE_SUCCESS != (rc = opal_argv_append_nosize(names, base))) { - ORTE_ERROR_LOG(rc); - rc = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - } - - if (NULL != ptr) { /* we have procs on these nodes */ - /* find the end of the description */ - ptr2 = strchr(ptr, ')'); - if (NULL == ptr2) { - /* malformed */ - orte_show_help("help-regex.txt", "regex:bad-value", 
true, ptr); - return ORTE_ERROR; - } - *ptr2 = '\0'; - - /* the proc description is in the format: - * starting-vpidxppn:step:starting-node-rank - * where step=step between vpids - */ - - /* start by extracting the starting vpid */ - if (NULL == (ptr2 = strchr(ptr, 'x'))) { - /* malformed */ - orte_show_help("help-regex.txt", "regex:bad-value", true, ptr); - return ORTE_ERROR; - } - *ptr2 = '\0'; - orte_util_convert_string_to_vpid(vpid_start, ptr); - - /* get ppn */ - next = ptr2 + 1; - if (NULL == (ptr2 = strchr(next, ':'))) { - /* malformed */ - orte_show_help("help-regex.txt", "regex:bad-value", true, next); - return ORTE_ERROR; - } - *ptr2 = '\0'; - *ppn = strtol(next, NULL, 10); - - /* get step */ - next = ptr2 + 1; - if (NULL == (ptr2 = strchr(next, ':'))) { - /* malformed */ - orte_show_help("help-regex.txt", "regex:bad-value", true, next); - return ORTE_ERROR; - } - *ptr2 = '\0'; - *step = strtol(next, NULL, 10); - - /* get the starting node rank */ - next = ptr2 + 1; - *nrank = strtol(next, NULL, 10); - } - -cleanup: - free(base); - return rc; -} -#endif - static void range_construct(orte_regex_range_t *ptr) { ptr->start = 0;