1
1
openmpi/orte/util/regex.c
Abhishek Kulkarni afbe3e99c6 * Wrap all the direct error-code checks of the form (OMPI_ERR_* == ret) with
(OMPI_ERR_* == OPAL_SOS_GET_ERR_CODE(ret)), since the return value could be a
 SOS-encoded error. OPAL_SOS_GET_ERR_CODE() takes an SOS error and returns
 the native error code.

* Since OPAL_SUCCESS is preserved by SOS, also change all calls of the form
  (OPAL_ERROR == ret) to (OPAL_SUCCESS != ret). We thus avoid having to
  decode 'ret' to get the native error code.

This commit was SVN r23162.
2010-05-17 23:08:56 +00:00

1285 строки
44 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/types.h"
#include "orte/constants.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_IFADDRS_H
#include <ifaddrs.h>
#endif
#include "opal/util/argv.h"
#include "opal/util/opal_sos.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/regex.h"
/* Size, in bytes, of the stack buffer that orte_regex_encode_maps() uses
 * to hold the prefix portion of a node name */
#define ORTE_MAX_NODE_PREFIX 50
/* Forward declarations of the file-local range parsers defined below:
 * the first handles a comma-separated set of ranges, the second a single
 * range (optionally followed by a one-character suffix) */
static int regex_parse_node_ranges(char *base, char *ranges, char ***names);
static int regex_parse_node_range(char *base, char *range, char suffix, char ***names);
/*
 * Expand a node-list expression into an argv-style array of node names.
 *
 * The expression is a comma-separated list whose elements are either a
 * plain node name ("nodeA") or a base name followed by a bracketed set of
 * numeric ranges ("node[1-3,10]").
 *
 * @param regexp    The expression to expand. If NULL, *names is set to NULL
 *                  and ORTE_SUCCESS is returned.
 * @param ***names  An argv array that the discovered names are appended to
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if the working copy cannot
 *         be allocated, ORTE_ERR_BAD_PARAM if an element starts with a
 *         delimiter or a '[' has no matching ']', or the error returned by
 *         the range parser / argv append.
 */
int orte_regex_extract_node_names(char *regexp, char ***names)
{
    char *copy, *cursor, *close_bracket;
    size_t split;
    bool more = false;
    int rc = ORTE_SUCCESS;

    if (NULL == regexp) {
        *names = NULL;
        return ORTE_SUCCESS;
    }

    /* work on a private copy since the parse writes NULs into the text */
    copy = strdup(regexp);
    if (NULL == copy) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    cursor = copy;

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s regex:extract:nodenames: checking nodelist: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         regexp));

    do {
        /* the base name ends at the first '[' or ',' (or at end of string) */
        split = strcspn(cursor, "[,");
        if (0 == split) {
            /* the element begins with a delimiter, or is empty */
            orte_show_help("help-regex.txt", "regex:special-char", true, regexp);
            rc = ORTE_ERR_BAD_PARAM;
            break;
        }

        if ('[' == cursor[split]) {
            /* base name followed by a bracketed set of ranges */
            cursor[split] = '\0';
            close_bracket = strchr(cursor + split + 1, ']');
            if (NULL == close_bracket) {
                orte_show_help("help-regex.txt", "regex:end-range-missing", true, regexp);
                rc = ORTE_ERR_BAD_PARAM;
                break;
            }
            *close_bracket = '\0';

            rc = regex_parse_node_ranges(cursor, cursor + split + 1, names);
            if (ORTE_SUCCESS != rc) {
                orte_show_help("help-regex.txt", "regex:bad-value", true, regexp);
                break;
            }

            /* another element follows only if a comma comes right after ']' */
            more = (',' == close_bracket[1]);
            if (more) {
                cursor = close_bracket + 2;
            }
        } else {
            /* plain node name; a comma means more elements follow */
            more = (',' == cursor[split]);
            cursor[split] = '\0';

            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                 "%s regex:extract:nodenames: found node: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cursor));

            rc = opal_argv_append_nosize(names, cursor);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                break;
            }
            /* step past the delimiter to the (possible) next element */
            cursor += split + 1;
        }
    } while (more);

    free(copy);
    return rc;
}
/*
 * Expand a comma-separated set of numeric ranges for one base name.
 *
 * Each comma in the set is overwritten with a NUL so that every individual
 * range can be handed to regex_parse_node_range() as its own string.
 *
 * @param base      The base text of the node name
 * @param *ranges   The set of ranges, modified in place
 *                  (e.g. "1-3,10" or "5" or "9,0100-0130,250")
 * @param ***names  An argv array that the generated node names are appended to
 *
 * @return ORTE_SUCCESS, or the first error reported by
 *         regex_parse_node_range()
 */
static int regex_parse_node_ranges(char *base, char *ranges, char ***names)
{
    char *cursor = ranges;
    char *const limit = ranges + strlen(ranges);
    char *comma;
    int rc;

    /* every comma terminates one range */
    while (NULL != (comma = strchr(cursor, ','))) {
        *comma = '\0';
        rc = regex_parse_node_range(base, cursor, '\0', names);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        cursor = comma + 1;
    }

    /* whatever is left after the final comma is the last range; nothing
     * is left when the set is empty or ends with a comma */
    if (cursor < limit) {
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s regex:parse:ranges: parse range %s (2)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cursor));
        rc = regex_parse_node_range(base, cursor, '\0', names);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
/*
* Parse a single range in a set and add the full names of the nodes
* found to the names argv
*
* @param base The base text of the node name
* @param *ranges A pointer to a single range. (i.e. "1-3" or "5")
* @param ***names An argv array to add the newly discovered nodes to
*/
static int regex_parse_node_range(char *base, char *range, char suffix, char ***names)
{
char *str, temp1[BUFSIZ];
size_t i, j, start, end;
size_t base_len, len, num_len;
size_t str_start, str_end;
size_t num_str_len;
bool found;
int ret;
len = strlen(range);
base_len = strlen(base);
/* Silence compiler warnings; start and end are always assigned
properly, below */
start = end = 0;
/* Look for the beginning of the first number */
for (found = false, i = 0; i < len; ++i) {
if (isdigit((int) range[i])) {
if (!found) {
str_start = i;
start = atoi(range + i);
found = true;
break;
}
}
}
if (!found) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* Look for the end of the first number */
for (found = false, num_str_len = 0; i < len; ++i, ++num_str_len) {
if (!isdigit((int) range[i])) {
break;
}
}
/* Was there no range, just a single number? */
if (i >= len) {
str_end = len;
end = start;
found = true;
}
/* Nope, there was a range. Look for the beginning of the second
number */
else {
str_end = i - 1;
for (; i < len; ++i) {
if (isdigit((int) range[i])) {
end = atoi(range + i);
found = true;
break;
}
}
}
if (!found) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* Make strings for all values in the range */
len = base_len + num_str_len + 32;
str = (char *) malloc(len);
if (NULL == str) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
strcpy(str, base);
for (i = start; i <= end; ++i) {
str[base_len] = '\0';
snprintf(temp1, BUFSIZ - 1, "%lu", (long) i);
/* Do we need zero padding? */
if ((num_len = strlen(temp1)) < num_str_len) {
for (j = base_len; j < base_len + (num_str_len - num_len); ++j) {
str[j] = '0';
}
str[j] = '\0';
}
strcat(str, temp1);
/* if there is a suffix, add it */
if ('\0' != suffix) {
num_len = strlen(str);
str[num_len] = suffix;
str[num_len+1] = '\0';
}
ret = opal_argv_append_nosize(names, str);
if(ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
free(str);
return ret;
}
}
free(str);
/* All done */
return ORTE_SUCCESS;
}
/* Compute the #procs on each node given a regex of the form
 * "#procs(x#nodes),#procs(x#nodes)". In other words, an
 * expression of "4(x30)" will be interpreted to mean four
 * procs on each of the next 30 nodes. An element without a
 * "(x#nodes)" part applies to a single node.
 *
 * @param num_nodes  Number of entries in the returned array
 * @param regexp     The expression to parse (not modified)
 * @param **ppn      Receives a newly allocated array of num_nodes ints,
 *                   zero-filled for any node the expression does not
 *                   cover; set to NULL on failure
 *
 * @return ORTE_SUCCESS; ORTE_ERR_BAD_PARAM if regexp is NULL, num_nodes
 *         is negative, or the expression is malformed;
 *         ORTE_ERR_OUT_OF_RESOURCE if the array cannot be allocated
 */
int orte_regex_extract_ppn(int num_nodes, char *regexp, int **ppn)
{
    int *tmp;
    const char *begptr;
    char *endptr;
    int i, j, count, reps;

    /* init null answer */
    *ppn = NULL;

    if (NULL == regexp || num_nodes < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* calloc zero-fills the array and checks the size multiplication */
    tmp = (int *) calloc(num_nodes, sizeof(int));
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* strtol never writes to its input, so the expression can be parsed
     * in place without making a copy */
    j = 0;
    begptr = regexp;
    for (;;) {
        count = (int) strtol(begptr, &endptr, 10);
        if ('(' == endptr[0] && 'x' == endptr[1]) {
            reps = (int) strtol(endptr + 2, &endptr, 10);
            if (')' == endptr[0]) {
                endptr++;
            }
        } else {
            reps = 1;
        }

        /* never write beyond the num_nodes entries that were allocated */
        for (i = 0; i < reps && j < num_nodes; i++) {
            tmp[j++] = count;
        }

        if (',' == *endptr) {
            begptr = endptr + 1;
        } else if ('\0' == *endptr || j >= num_nodes) {
            break;
        } else {
            orte_show_help("help-regex.txt", "regex:bad-ppn", true, regexp);
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            free(tmp);
            return ORTE_ERR_BAD_PARAM;
        }
    }

    /* return values */
    *ppn = tmp;
    return ORTE_SUCCESS;
}
/*
 * Summarize the procs of one job that are mapped to a node.
 *
 * Walks node->procs and considers only the entries whose jobid matches.
 *
 * @param node         The node whose proc array is scanned
 * @param jobid        The job of interest
 * @param *start_vpid  Receives the vpid of the first matching proc, or
 *                     ORTE_VPID_INVALID if there is none
 * @param *end_vpid    Receives the vpid of the last matching proc, or
 *                     ORTE_VPID_INVALID if fewer than two procs matched
 * @param *ppn         Receives the number of matching procs
 * @param *nrank       Receives the node rank of the first matching proc,
 *                     or ORTE_NODE_RANK_INVALID if there is none
 */
static void compute_vpids(orte_node_t *node, orte_jobid_t jobid,
                          orte_vpid_t *start_vpid, orte_vpid_t *end_vpid,
                          int32_t *ppn, orte_node_rank_t *nrank)
{
    int32_t nppn, k;
    orte_proc_t *proc, *start_proc, *end_proc;

    nppn = 0;
    start_proc = NULL;
    end_proc = NULL;
    for (k=0; k < node->procs->size; k++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k)) ||
            proc->name.jobid != jobid) {
            continue;
        }
        nppn++;
        if (NULL == start_proc) {
            start_proc = proc;
        } else {
            /* keep updating so that this ends up as the LAST matching
             * proc, not merely the second one */
            end_proc = proc;
        }
    }

    *ppn = nppn;
    if (NULL == start_proc) {
        /* nobody was mapped to this node */
        *start_vpid = ORTE_VPID_INVALID;
        *nrank = ORTE_NODE_RANK_INVALID;
    } else {
        *start_vpid = start_proc->name.vpid;
        *nrank = start_proc->node_rank;
    }
    if (NULL == end_proc) {
        /* could have been only one proc mapped, or none */
        *end_vpid = ORTE_VPID_INVALID;
    } else {
        *end_vpid = end_proc->name.vpid;
    }
}
/*
 * Open a new sequence on a regex node record.
 *
 * One entry is appended to each of the record's parallel value arrays
 * (suffix, nodes, cnt, starting_vpid, ppn, nrank). The cnt entry starts at
 * zero, and the vpid/ppn/nrank entries are taken from the procs of the
 * given job on the given node.
 *
 * @param jobid    The job whose procs are examined
 * @param node     The node that starts the sequence
 * @param ndreg    The regex node record to extend
 * @param suffix   Suffix character of the node name ('\0' if none)
 * @param nodenum  Numeric part of the node name
 */
static void start_sequence(orte_jobid_t jobid, orte_node_t *node,
                           orte_regex_node_t *ndreg, char suffix, int32_t nodenum)
{
    int32_t initial_cnt = 0;
    int32_t procs_on_node;
    orte_vpid_t first_vpid, last_vpid;
    orte_node_rank_t first_nrank;

    /* gather the per-node proc info for this job; the last vpid is
     * computed by the helper but not recorded */
    compute_vpids(node, jobid, &first_vpid, &last_vpid, &procs_on_node, &first_nrank);

    opal_value_array_append_item(&ndreg->suffix, &suffix);
    opal_value_array_append_item(&ndreg->nodes, &nodenum);
    opal_value_array_append_item(&ndreg->cnt, &initial_cnt);
    opal_value_array_append_item(&ndreg->starting_vpid, &first_vpid);
    opal_value_array_append_item(&ndreg->ppn, &procs_on_node);
    opal_value_array_append_item(&ndreg->nrank, &first_nrank);
}
/*
 * Encode a job's launch information and process map as one
 * comma-separated string.
 *
 * The string starts with the fields LJID, SLOTS, CTRLS, STDIN, APP, ARGV,
 * ENV and DVPID, followed by one element per node or node range. Nodes in
 * the node pool are grouped by name prefix; consecutive node numbers whose
 * suffix, procs-per-node, starting node rank and starting vpid line up are
 * collapsed into "prefix[first-last]". Elements that host procs of this
 * job carry a "(vpidxppn:step:nrank)" description.
 *
 * @param jdata  The job to encode. Only jobs with a single app_context are
 *               supported.
 *
 * @return The encoded string as produced by opal_argv_join() (presumably
 *         to be freed by the caller), or NULL if the job has more than one
 *         app_context or its app_context cannot be found.
 */
char* orte_regex_encode_maps(orte_job_t *jdata)
{
    orte_node_t *node;
    orte_regex_node_t *ndreg;
    int32_t nodenum, i, n;
    bool found, fullname;
    opal_list_t nodelist;
    int len;
    char prefix[ORTE_MAX_NODE_PREFIX];
    int startnum;
    int c;
    opal_list_item_t *item;
    char **regexargs = NULL, *tmp, *tmp2;
    int32_t num_nodes, start, cnt, ppn, nppn;
    orte_vpid_t vpid_start, start_vpid, end_vpid, base;
    char *regexp = NULL;
    bool byslot;
    orte_node_rank_t node_rank, nrank;
    char suffix, sfx;
    orte_app_context_t *app;

    /* this is only for one app_context */
    if (jdata->num_apps > 1) {
        return NULL;
    }

    /* look the app_context up before any resources are acquired so that a
     * missing entry can simply be reported as "cannot encode"
     */
    app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
    if (NULL == app) {
        return NULL;
    }

    /* determine the mapping policy */
    byslot = true;
    if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
        byslot = false;
    }

    /* setup the list of nodes with same prefixes */
    OBJ_CONSTRUCT(&nodelist, opal_list_t);

    /* cycle through the node pool */
    for (n=0; n < orte_node_pool->size; n++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
            continue;
        }
        /* determine this node's prefix by looking for first non-alpha char */
        fullname = false;
        len = strlen(node->name);
        startnum = -1;
        memset(prefix, 0, ORTE_MAX_NODE_PREFIX);
        suffix = '\0';
        for (i=0; i < len; i++) {
            /* ctype functions require a value representable as unsigned char */
            c = (unsigned char)node->name[i];
            if (!isalpha(c)) {
                /* found a non-alpha char */
                if (!isdigit(c)) {
                    /* if it is anything but a digit, we just use
                     * the entire name, which by definition is unique
                     * by the way we created the node pool
                     */
                    fullname = true;
                    break;
                }
                if ('0' != c) {
                    /* okay, this defines end of the prefix */
                    startnum = i;
                    break;
                }
                /* the digit is 0 - fall through and add it to the prefix */
            }
            if (ORTE_MAX_NODE_PREFIX - 1 <= i) {
                /* the prefix will not fit in the buffer (one byte is
                 * reserved for the terminating NUL), so treat this node
                 * as having no numbering and use its entire name
                 */
                fullname = true;
                break;
            }
            prefix[i] = node->name[i];
        }
        if (fullname || startnum < 0) {
            ndreg = OBJ_NEW(orte_regex_node_t);
            ndreg->prefix = strdup(node->name);
            start_sequence(jdata->jobid, node, ndreg, suffix, -1);
            opal_list_append(&nodelist, &ndreg->super);
            continue;
        }
        /* search for a suffix */
        if (isalpha((unsigned char)node->name[len-1])) {
            suffix = node->name[len-1];
        }
        nodenum = strtol(&node->name[startnum], NULL, 10);
        /* is this prefix already on our list? */
        found = false;
        for (item = opal_list_get_first(&nodelist);
             !found && item != opal_list_get_end(&nodelist);
             item = opal_list_get_next(item)) {
            ndreg = (orte_regex_node_t*)item;
            if (0 == strcmp(prefix, ndreg->prefix)) {
                /* yes - flag it */
                found = true;
                /* see if we have a range or a break in the list - we
                 * break the list if one of the following conditions occurs:
                 *
                 * 1. the node number is out of sequence
                 *
                 * 2. the vpid of the first proc on the node is out
                 *    of sequence - i.e., does not equal the vpid of
                 *    the first proc on the first node + step if bynode,
                 *    or the last proc on the prior node + 1 if byslot
                 *
                 * 3. the starting node rank on the node is out of sequence
                 */
                num_nodes = opal_value_array_get_size(&ndreg->nodes)-1;
                start = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->nodes, int32_t, num_nodes);
                cnt = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->cnt, int32_t, num_nodes);
                sfx = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->suffix, char, num_nodes);
                if (suffix != sfx) {
                    /* break in suffix - start new range */
                    start_sequence(jdata->jobid, node, ndreg, suffix, nodenum);
                } else if (nodenum != cnt+start+1) {
                    /* have a break in the node sequence - start new range */
                    start_sequence(jdata->jobid, node, ndreg, suffix, nodenum);
                } else {
                    /* cycle through the procs on this node and see if the vpids
                     * for this jobid break the sequencing
                     */
                    vpid_start = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->starting_vpid, orte_vpid_t, num_nodes);
                    ppn = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->ppn, int32_t, num_nodes);
                    nrank = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->nrank, orte_node_rank_t, num_nodes);
                    compute_vpids(node, jdata->jobid, &start_vpid, &end_vpid, &nppn, &node_rank);
                    /* if the ppn doesn't match, then that breaks the sequence */
                    if (nppn != ppn) {
                        start_sequence(jdata->jobid, node, ndreg, suffix, nodenum);
                        break;
                    }
                    /* if the starting node rank doesn't match, then that breaks the sequence */
                    if (nrank != node_rank) {
                        start_sequence(jdata->jobid, node, ndreg, suffix, nodenum);
                        break;
                    }
                    /* if the vpids don't align correctly, then that breaks the sequence */
                    if (byslot) {
                        base = vpid_start + (ppn * (cnt+1));
                        if (start_vpid != base) {
                            /* break sequence */
                            start_sequence(jdata->jobid, node, ndreg, suffix, nodenum);
                            break;
                        }
                    } else {
                        if (start_vpid != (vpid_start + 1)) {
                            /* break sequence */
                            start_sequence(jdata->jobid, node, ndreg, suffix, nodenum);
                            break;
                        }
                    }
                    /* otherwise, if everything matches, just increment the cnt */
                    OPAL_VALUE_ARRAY_SET_ITEM(&ndreg->cnt, int32_t, num_nodes, cnt+1);
                }
            }
        }
        if (!found) {
            /* need to add it */
            ndreg = OBJ_NEW(orte_regex_node_t);
            ndreg->prefix = strdup(prefix);
            start_sequence(jdata->jobid, node, ndreg, suffix, nodenum);
            opal_list_append(&nodelist, &ndreg->super);
        }
    }

    /* the regular expression begins with the jobid */
    asprintf(&tmp, "LJID=%s", ORTE_LOCAL_JOBID_PRINT(jdata->jobid));
    opal_argv_append_nosize(&regexargs, tmp);
    free(tmp);
    /* next comes the total slots allocated to us */
    asprintf(&tmp, "SLOTS=%d", (int)jdata->total_slots_alloc);
    opal_argv_append_nosize(&regexargs, tmp);
    free(tmp);
    /* the control flags for this job */
    asprintf(&tmp, "CTRLS=%d", (int)jdata->controls);
    opal_argv_append_nosize(&regexargs, tmp);
    free(tmp);
    /* the stdin target for the job */
    asprintf(&tmp, "STDIN=%d", (int)jdata->stdin_target);
    opal_argv_append_nosize(&regexargs, tmp);
    free(tmp);
    /* the app_context for the job - can only be one! Just include
     * the required portions
     */
    asprintf(&tmp, "APP=\"%s:%s\"", app->app, app->cwd);
    opal_argv_append_nosize(&regexargs, tmp);
    free(tmp);
    tmp2 = opal_argv_join(app->argv, '#');
    asprintf(&tmp, "ARGV=\"%s\"", (NULL == tmp2) ? "NULL" : tmp2);
    free(tmp2);
    opal_argv_append_nosize(&regexargs, tmp);
    free(tmp);
    tmp2 = opal_argv_join(app->env, '#');
    asprintf(&tmp, "ENV=\"%s\"", (NULL == tmp2) ? "NULL" : tmp2);
    free(tmp2);
    opal_argv_append_nosize(&regexargs, tmp);
    free(tmp);
    /* next comes the starting daemon vpid */
    asprintf(&tmp, "DVPID=%s", ORTE_VPID_PRINT(jdata->map->daemon_vpid_start));
    opal_argv_append_nosize(&regexargs, tmp);
    free(tmp);

    /* begin constructing the regular expression for each prefix */
    for (item = opal_list_get_first(&nodelist);
         item != opal_list_get_end(&nodelist);
         item = opal_list_get_next(item)) {
        ndreg = (orte_regex_node_t*)item;
        /* how many values are in the array? */
        num_nodes = opal_value_array_get_size(&ndreg->nodes);
        if (0 == num_nodes) {
            /* solitary node */
            asprintf(&tmp, "%s", ndreg->prefix);
            opal_argv_append_nosize(&regexargs, tmp);
            free(tmp);
            continue;
        }
        /* build the regexargs array */
        for (i=0; i < num_nodes; i++) {
            /* get the index and the cnt */
            sfx = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->suffix, char, i);
            start = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->nodes, int32_t, i);
            cnt = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->cnt, int32_t, i);
            vpid_start = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->starting_vpid, orte_vpid_t, i);
            ppn = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->ppn, int32_t, i);
            nrank = OPAL_VALUE_ARRAY_GET_ITEM(&ndreg->nrank, orte_node_rank_t, i);
            /* if we have a range, construct it that way */
            if (0 < cnt) {
                if (ORTE_VPID_INVALID == vpid_start) {
                    /* no procs from this job on these nodes */
                    if ('\0' == sfx) {
                        asprintf(&tmp, "%s[%d-%d]", ndreg->prefix, start, start+cnt);
                    } else {
                        asprintf(&tmp, "%s[%d-%d]%c", ndreg->prefix, start, start+cnt, sfx);
                    }
                } else {
                    if ('\0' == sfx) {
                        asprintf(&tmp, "%s[%d-%d](%sx%d:%d:%d)", ndreg->prefix, start, start+cnt,
                                 ORTE_VPID_PRINT(vpid_start), ppn,
                                 (byslot) ? 1 : (int)jdata->map->num_nodes, (int)nrank);
                    } else {
                        asprintf(&tmp, "%s[%d-%d]%c(%sx%d:%d:%d)", ndreg->prefix, start, start+cnt,
                                 sfx, ORTE_VPID_PRINT(vpid_start), ppn,
                                 (byslot) ? 1 : (int)jdata->map->num_nodes, (int)nrank);
                    }
                }
            } else {
                /* single node - could be due to a break in the numbering, suffix, and/or vpids,
                 * or because it was a fullname node with no numbering in it
                 */
                if (ORTE_VPID_INVALID == vpid_start) {
                    /* no procs from this job on this node */
                    if (start < 0) {
                        /* fullname node */
                        if ('\0' == sfx) {
                            asprintf(&tmp, "%s", ndreg->prefix);
                        } else {
                            asprintf(&tmp, "%s%c", ndreg->prefix, sfx);
                        }
                    } else {
                        if ('\0' == sfx) {
                            asprintf(&tmp, "%s%d", ndreg->prefix, start);
                        } else {
                            asprintf(&tmp, "%s%d%c", ndreg->prefix, start, sfx);
                        }
                    }
                } else {
                    if (start < 0) {
                        if ('\0' == sfx) {
                            asprintf(&tmp, "%s(%sx%d:%d:%d)", ndreg->prefix,
                                     ORTE_VPID_PRINT(vpid_start), ppn,
                                     (byslot) ? 1 : (int)jdata->map->num_nodes, (int)nrank);
                        } else {
                            asprintf(&tmp, "%s%c(%sx%d:%d:%d)", ndreg->prefix, sfx,
                                     ORTE_VPID_PRINT(vpid_start), ppn,
                                     (byslot) ? 1 : (int)jdata->map->num_nodes, (int)nrank);
                        }
                    } else {
                        if ('\0' == sfx) {
                            asprintf(&tmp, "%s%d(%sx%d:%d:%d)", ndreg->prefix, start,
                                     ORTE_VPID_PRINT(vpid_start), ppn,
                                     (byslot) ? 1 : (int)jdata->map->num_nodes, (int)nrank);
                        } else {
                            asprintf(&tmp, "%s%d%c(%sx%d:%d:%d)", ndreg->prefix, start, sfx,
                                     ORTE_VPID_PRINT(vpid_start), ppn,
                                     (byslot) ? 1 : (int)jdata->map->num_nodes, (int)nrank);
                        }
                    }
                }
            }
            opal_argv_append_nosize(&regexargs, tmp);
            free(tmp);
        }
    }

    /* assemble final result */
    regexp = opal_argv_join(regexargs, ',');

    /* cleanup */
    opal_argv_free(regexargs);
    while (NULL != (item = opal_list_remove_first(&nodelist))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodelist);
    return regexp;
}
/*
 * Parse one node element of an encoded map.
 *
 * An element is a node name or a "base[first-last]" range, optionally
 * followed by a one-character suffix and/or a proc description of the form
 * "(starting-vpidxppn:step:starting-node-rank)".
 *
 * @param orig         The element to parse (not modified; a copy is used)
 * @param ***names     An argv array the node name(s) are appended to
 * @param *vpid_start  Receives the starting vpid from the proc description,
 *                     or ORTE_VPID_INVALID if the element has none
 * @param *ppn         Receives the procs-per-node value; left untouched if
 *                     there is no proc description
 * @param *step        Receives the step between vpids; left untouched if
 *                     there is no proc description
 * @param *nrank       Receives the starting node rank; left untouched if
 *                     there is no proc description
 *
 * @return ORTE_SUCCESS; ORTE_ERR_OUT_OF_RESOURCE if the working copy cannot
 *         be allocated; ORTE_ERR_BAD_PARAM for a bad name or range;
 *         ORTE_ERROR for a malformed proc description; or the error from
 *         the vpid conversion
 */
static int parse_node_range(char *orig, char ***names, orte_vpid_t *vpid_start,
                            int *ppn, int *step, int *nrank)
{
    char *base, *ptr, *ptr2, *next, suffix;
    int i, j, len, rc=ORTE_SUCCESS;
    bool found_range;

    /* default to no procs */
    *vpid_start = ORTE_VPID_INVALID;

    /* protect input */
    base = strdup(orig);
    if (NULL == base) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    suffix = '\0';

    /* start by searching for ranges and proc specifications */
    len = strlen(base);
    ptr = NULL;
    found_range = false;
    for (i = 0; i <= len; ++i) {
        if (base[i] == '[') {
            /* we found a range. this gets dealt with below */
            base[i] = '\0';
            found_range = true;
            break;
        }
        if (base[i] == '\0') {
            /* we found a singleton node - no procs on it */
            found_range = false;
            break;
        }
        if (base[i] == '(') {
            /* we found a singleton node that has procs on it */
            base[i] = '\0';
            found_range = false;
            ptr = &base[i+1];
            break;
        }
    }
    if (i == 0) {
        /* we found a special character at the beginning of the string */
        orte_show_help("help-regex.txt", "regex:special-char", true, orig);
        rc = ORTE_ERR_BAD_PARAM;
        goto cleanup;
    }

    if (found_range) {
        /* If we found a range, now find the end of the range */
        for (j = i; j < len; ++j) {
            if (base[j] == ']') {
                base[j] = '\0';
                if (j + 1 < len) {
                    if (base[j+1] == '(') {
                        /* procs are in this range and there is no suffix */
                        ptr = &base[j+2];
                    } else {
                        /* we must have a suffix - it may be the very last
                         * character when no procs are on these nodes
                         */
                        suffix = base[j+1];
                        if (j + 2 < len && base[j+2] == '(') {
                            /* we also have procs in this range */
                            ptr = &base[j+3];
                        }
                    }
                }
                break;
            }
        }
        if (j >= len) {
            /* we didn't find the end of the range */
            orte_show_help("help-regex.txt", "regex:end-range-missing", true, orig);
            rc = ORTE_ERR_BAD_PARAM;
            goto cleanup;
        }
        rc = regex_parse_node_range(base, base + i + 1, suffix, names);
        if (ORTE_SUCCESS != rc) {
            orte_show_help("help-regex.txt", "regex:bad-value", true, orig);
            rc = ORTE_ERR_BAD_PARAM;
            goto cleanup;
        }
    } else {
        /* If we didn't find a range, just add the node */
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s regex:extract:nodenames: found node: %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), base));
        if (ORTE_SUCCESS != (rc = opal_argv_append_nosize(names, base))) {
            ORTE_ERROR_LOG(rc);
            rc = ORTE_ERR_BAD_PARAM;
            goto cleanup;
        }
    }

    if (NULL != ptr) {  /* we have procs on these nodes */
        /* find the end of the description */
        ptr2 = strchr(ptr, ')');
        if (NULL == ptr2) {
            /* malformed */
            orte_show_help("help-regex.txt", "regex:bad-value", true, ptr);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        *ptr2 = '\0';
        /* the proc description is in the format:
         *     starting-vpidxppn:step:starting-node-rank
         * where step=step between vpids
         */
        /* start by extracting the starting vpid */
        if (NULL == (ptr2 = strchr(ptr, 'x'))) {
            /* malformed */
            orte_show_help("help-regex.txt", "regex:bad-value", true, ptr);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        *ptr2 = '\0';
        if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(vpid_start, ptr))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* get ppn */
        next = ptr2 + 1;
        if (NULL == (ptr2 = strchr(next, ':'))) {
            /* malformed */
            orte_show_help("help-regex.txt", "regex:bad-value", true, next);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        *ptr2 = '\0';
        *ppn = strtol(next, NULL, 10);
        /* get step */
        next = ptr2 + 1;
        if (NULL == (ptr2 = strchr(next, ':'))) {
            /* malformed */
            orte_show_help("help-regex.txt", "regex:bad-value", true, next);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        *ptr2 = '\0';
        *step = strtol(next, NULL, 10);
        /* get the starting node rank */
        next = ptr2 + 1;
        *nrank = strtol(next, NULL, 10);
    }

cleanup:
    /* every exit after the copy was made comes through here so that the
     * copy is always released
     */
    free(base);
    return rc;
}
/* Return the text that follows the first '=' of a "KEY=value" field, or
 * NULL if the field contains no '=' */
static char *regex_field_value(char *field)
{
    char *eq = strchr(field, '=');
    return (NULL == eq) ? NULL : eq + 1;
}

/* Drop one leading and one trailing double quote from a value, if present.
 * The trailing quote is overwritten in place; the returned pointer is the
 * start of the unquoted text. Safe on an empty value. */
static char *regex_strip_quotes(char *value)
{
    size_t n;

    if ('\"' == *value) {
        value++;
    }
    n = strlen(value);
    if (0 < n && '\"' == value[n-1]) {
        value[n-1] = '\0';
    }
    return value;
}

/*
 * Decode a string produced by orte_regex_encode_maps().
 *
 * The leading fields are, in order: LJID, SLOTS, CTRLS, STDIN, APP, ARGV,
 * ENV and DVPID. The remaining entries describe nodes (see
 * parse_node_range()). The job map, the nidmap and - when running as the
 * HNP or a daemon - the local job data are updated from what is decoded.
 *
 * @param regexp    The encoded string. If NULL, nothing is parsed.
 * @param **jobdat  If non-NULL and this process is the HNP or a daemon,
 *                  receives the local job data object for the job
 *
 * @return ORTE_SUCCESS; ORTE_ERR_SILENT if regexp is NULL; ORTE_ERROR if a
 *         leading field is missing or malformed;
 *         ORTE_ERR_OUT_OF_RESOURCE on allocation failure;
 *         ORTE_ERR_NOT_FOUND if a node address cannot be resolved; or the
 *         error returned by a helper
 */
int orte_regex_decode_maps(char *regexp, orte_odls_job_t **jobdat)
{
    /* LJID, SLOTS, CTRLS, STDIN, APP, ARGV, ENV, DVPID */
    const int num_header_fields = 8;
    char **seqs, *ptr, **names, *ptr2;
    int i, j, k, n, entry, rc;
    int ppn, step, start_nrank, nrank;
    int32_t tmp32;
    orte_vpid_t daemon_vpid, vpid;
    orte_jobid_t jobid;
    orte_nid_t *nid;
    orte_jmap_t *jmap;
    orte_pmap_t *pmap;
    bool found;
    orte_odls_job_t *jdat;
    orte_app_context_t *app;
    opal_list_item_t *item;
    int num_procs, num_nodes;
    struct hostent *h;
    opal_buffer_t buf;
    bool buf_constructed = false;
    char *uri, *addr;
    orte_process_name_t proc;
    char *proc_name;
    bool hnp_entry;

    /* if regexp is NULL, then nothing to parse */
    if (NULL == regexp) {
        return ORTE_ERR_SILENT;
    }

    /* ensure the global nidmap/pidmap arrays are initialized */
    if (ORTE_SUCCESS != (rc = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    names = NULL;

    /* break the regexp into its component parts - this is trivial
     * because they are all separated by commas!
     */
    seqs = opal_argv_split(regexp, ',');

    /* every leading field is indexed below, so all of them must be there */
    if (opal_argv_count(seqs) < num_header_fields) {
        rc = ORTE_ERROR;
        goto cleanup;
    }

    /* start parsing with the first entry */
    entry=0;

    /* the first entry is the local jobid, so we extract that and
     * convert it into a global jobid
     */
    if (NULL == (ptr = regex_field_value(seqs[entry++]))) {
        rc = ORTE_ERROR;
        goto cleanup;
    }
    tmp32 = strtol(ptr, NULL, 10);
    jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, tmp32);

    /* do we already have a jmap entry for this job? */
    found = false;
    for (i=0; i < orte_jobmap.size; i++) {
        if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) {
            continue;
        }
        if (jmap->job == jobid) {
            /* got it */
            found = true;
            break;
        }
    }
    if (!found) {
        /* don't already have it - add it */
        jmap = OBJ_NEW(orte_jmap_t);
        jmap->job = jobid;
        opal_pointer_array_add(&orte_jobmap, jmap);
    }

    jdat = NULL;
    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
        /* even though we are unpacking an add_local_procs cmd, we cannot assume
         * that no job record for this jobid exists. A race condition exists that
         * could allow another daemon's procs to call us with a collective prior
         * to our unpacking add_local_procs. So lookup the job record for this jobid
         * and see if it already exists
         */
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            orte_odls_job_t *jdt = (orte_odls_job_t*)item;
            /* is this the specified job? */
            if (jdt->jobid == jobid) {
                jdat = jdt;
                break;
            }
        }
        if (NULL == jdat) {
            /* setup jobdat object for this job */
            jdat = OBJ_NEW(orte_odls_job_t);
            jdat->jobid = jobid;
            opal_list_append(&orte_local_jobdata, &jdat->super);
        }
        if (NULL != jobdat) {
            *jobdat = jdat;
        }
        /* see if this was previously decoded */
        if (NULL != jdat->regexp) {
            /* yep - don't decode it again */
            rc = ORTE_SUCCESS;
            goto cleanup;
        }

        /* next entry is the total slots allocated to this job */
        if (NULL == (ptr = regex_field_value(seqs[entry++]))) {
            rc = ORTE_ERROR;
            goto cleanup;
        }
        jdat->total_slots_alloc = strtol(ptr, NULL, 10);

        /* next entry is the control flags for the job */
        if (NULL == (ptr = regex_field_value(seqs[entry++]))) {
            rc = ORTE_ERROR;
            goto cleanup;
        }
        jdat->controls = strtol(ptr, NULL, 10);

        /* next entry - stdin target */
        if (NULL == (ptr = regex_field_value(seqs[entry++]))) {
            rc = ORTE_ERROR;
            goto cleanup;
        }
        jdat->stdin_target = strtol(ptr, NULL, 10);

        /* next entry - the app_context itself */
        if (NULL == (ptr = regex_field_value(seqs[entry++]))) {
            rc = ORTE_ERROR;
            goto cleanup;
        }
        /* some shells will strip the starting and ending quotes, and some won't -
         * so check for them here
         */
        ptr = regex_strip_quotes(ptr);
        /* the field must be "app:cwd" - verify that before creating anything */
        ptr2 = strchr(ptr, ':');
        if (NULL == ptr2) {
            rc = ORTE_ERROR;
            goto cleanup;
        }
        /* create the app_context object */
        app = OBJ_NEW(orte_app_context_t);
        jdat->apps = (orte_app_context_t**)malloc(sizeof(orte_app_context_t*));
        if (NULL == jdat->apps) {
            OBJ_RELEASE(app);
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        jdat->apps[0] = app;
        jdat->num_apps = 1;
        /* get the app and the cwd by hand */
        *ptr2 = '\0';
        app->app = strdup(ptr);
        ptr = ++ptr2;
        app->cwd = strdup(ptr);

        /* the next entry is the argv for the app_context, separated by '#'. We
         * assume we can use argv_split for this purpose. First check, though, for
         * NULL, indicating there were no argvs
         */
        if (NULL == (ptr = regex_field_value(seqs[entry++]))) {
            rc = ORTE_ERROR;
            goto cleanup;
        }
        ptr = regex_strip_quotes(ptr);
        /* strncmp stops at the end of a short value instead of reading past it */
        if (0 != strncmp(ptr, "NULL", 4)) {
            /* there are argvs */
            app->argv = opal_argv_split(ptr, '#');
        }

        /* the next entry is the env for the app_context, also separated by '#'.
         * Again, start by checking for NULL
         */
        if (NULL == (ptr = regex_field_value(seqs[entry++]))) {
            rc = ORTE_ERROR;
            goto cleanup;
        }
        ptr = regex_strip_quotes(ptr);
        if (0 != strncmp(ptr, "NULL", 4)) {
            /* there are envs */
            app->env = opal_argv_split(ptr, '#');
        }
    } else {
        /* skip SLOTS, CTRLS, STDIN, APP, ARGV and ENV */
        entry += 6;
    }

    /* next entry is the starting daemon vpid for the job being launched */
    if (NULL == (ptr = regex_field_value(seqs[entry++]))) {
        rc = ORTE_ERROR;
        goto cleanup;
    }
    /* use the standard vpid conversion routine to get the value - don't attempt
     * to directly convert it with strtol as the value could be INVALID
     */
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&daemon_vpid, ptr))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* the remaining entries contain the name of the nodes in the system, how
     * many procs (if any) on each of those nodes, the starting vpid of the
     * procs on those nodes, and the starting node rank for the procs on
     * each node
     */
    num_procs = 0;
    num_nodes = 0;
    ppn = 0;
    step = 0;
    start_nrank = 0;
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    buf_constructed = true;
    hnp_entry = true;
    for (n=entry; n < opal_argv_count(seqs); n++) {
        /* parse the node entry to get a list of all node names in it */
        if (ORTE_SUCCESS != (rc = parse_node_range(seqs[n], &names, &vpid, &ppn, &step, &start_nrank))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        for (i=0; i < opal_argv_count(names); i++) {
            /* is this name already in our nidmap? */
            found = false;
            for (j=0; j < orte_nidmap.size; j++) {
                if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, j))) {
                    continue;
                }
                if (0 == strcmp(nid->name, names[i])) {
                    /* yep - we have it */
                    found = true;
                    break;
                }
            }
            if (!found) {
                /* must not already have it - create one */
                nid = OBJ_NEW(orte_nid_t);
                nid->name = strdup(names[i]);
                nid->index = opal_pointer_array_add(&orte_nidmap, nid);
            }
            /* the hnp entry is always first in line. Since the hnp may not
             * have any procs on it, the starting daemon vpid may be > 0. Thus,
             * we ensure that the HNP always gets the correct daemon vpid for
             * its node
             */
            if (hnp_entry) {
                /* this is the name of the HNP's node */
                nid->daemon = 0;
                hnp_entry = false;
                /* do NOT increment the daemon_vpid as that refers to the
                 * starting point for -new- daemons. Since the HNP is
                 * always already present, the daemon vpid will only reflect
                 * the starting vpid for anyone else that had to be launched
                 */
            } else {
                if (ORTE_VPID_INVALID != daemon_vpid &&
                    ORTE_VPID_INVALID == nid->daemon) {
                    nid->daemon = daemon_vpid++;
                }
            }
            /* if we are a daemon and using static ports, create the contact info
             * for the daemon on this node
             */
            if (ORTE_PROC_IS_DAEMON && orte_static_ports) {
                /* lookup the address of this node */
                if (NULL == (h = gethostbyname(nid->name))) {
                    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                    rc = ORTE_ERR_NOT_FOUND;
                    goto cleanup;
                }
                addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]);

                OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                     "%s orte:regex: constructing static path to node %s daemon %d addr %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     nid->name, (int)nid->daemon, addr));

                /* since we are using static ports, all my fellow daemons will be on my
                 * port. Setup the contact info for each daemon in my hash tables. Note
                 * that this will -not- open a port to those daemons, but will only
                 * define the info necessary for opening such a port if/when I communicate
                 * to them
                 */
                /* construct the URI */
                proc.jobid = ORTE_PROC_MY_NAME->jobid;
                proc.vpid = nid->daemon;
                orte_util_convert_process_name_to_string(&proc_name, &proc);
                asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
                opal_dss.pack(&buf, &uri, 1, OPAL_STRING);
                free(proc_name);
                free(uri);
            }

            /* if this node has procs on it, cycle through the ppn, adding a pmap
             * for each new rank
             */
            if (ORTE_VPID_INVALID != vpid) {
                nrank = start_nrank;
                for (k=0; k < ppn; k++) {
                    if (NULL != opal_pointer_array_get_item(&jmap->pmap, vpid)) {
                        /* this proc was already entered via some earlier step */
                        vpid += step;
                        continue;
                    }
                    pmap = OBJ_NEW(orte_pmap_t);
                    pmap->node = nid->index;
                    pmap->local_rank = k;
                    pmap->node_rank = nrank++;
                    jmap->num_procs++;
                    opal_pointer_array_set_item(&jmap->pmap, vpid, pmap);
                    vpid += step;
                    /* increment #procs in the job */
                    num_procs++;
                }
            }
            /* increment #nodes in the job */
            num_nodes++;
        }
        opal_argv_free(names);
        names = NULL;
    }

    /* if we are a daemon and using static ports, load the hash tables */
    if (ORTE_PROC_IS_DAEMON && orte_static_ports) {
        /* a failure here is logged but does not fail the decode */
        if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
            ORTE_ERROR_LOG(rc);
        }
    }

    if (NULL != jdat) {
        /* record the regexp so it can be sent to the local procs */
        jdat->regexp = strdup(regexp);
        /* save the job data */
        jdat->num_procs += num_procs;
        jdat->num_nodes += num_nodes;
    }
    rc = ORTE_SUCCESS;

cleanup:
    /* release everything that is owned locally, whichever way we got here */
    if (buf_constructed) {
        OBJ_DESTRUCT(&buf);
    }
    if (NULL != names) {
        opal_argv_free(names);
    }
    opal_argv_free(seqs);
    return rc;
}