1
1
openmpi/orte/util/hostfile/hostfile.c
Gilles Gouaillardet 2e2366d193 util/hostfile: fix a double free error
As reported at https://stackoverflow.com/questions/52707242/mpirun-segmentation-fault-whenever-i-use-a-hostfile
mpirun crashes when the hostfile contains a "user@host" line.
The root cause is username was not strdup'ed and free'd twice by opal_argv_free() and free()

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>

(cherry picked from commit open-mpi/ompi@5803385d44)
2018-10-09 13:06:55 +09:00

1035 строки
37 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
#include "opal/class/opal_list.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/if.h"
#include "opal/util/net.h"
#include "opal/mca/installdirs/installdirs.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/base/base.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile_lex.h"
#include "orte/util/hostfile/hostfile.h"
static const char *cur_hostfile_name = NULL;
static void hostfile_parse_error(int token)
{
switch (token) {
case ORTE_HOSTFILE_STRING:
orte_show_help("help-hostfile.txt", "parse_error_string",
true,
cur_hostfile_name,
orte_util_hostfile_line,
token,
orte_util_hostfile_value.sval);
break;
case ORTE_HOSTFILE_IPV4:
case ORTE_HOSTFILE_IPV6:
case ORTE_HOSTFILE_INT:
orte_show_help("help-hostfile.txt", "parse_error_int",
true,
cur_hostfile_name,
orte_util_hostfile_line,
token,
orte_util_hostfile_value.ival);
break;
default:
orte_show_help("help-hostfile.txt", "parse_error",
true,
cur_hostfile_name,
orte_util_hostfile_line,
token );
break;
}
}
/**
* Return the integer following an = (actually may only return positive ints)
*/
static int hostfile_parse_int(void)
{
if (ORTE_HOSTFILE_EQUAL != orte_util_hostfile_lex())
return -1;
if (ORTE_HOSTFILE_INT != orte_util_hostfile_lex())
return -1;
return orte_util_hostfile_value.ival;
}
/**
* Return the string following an = (option to a keyword)
*/
static char *hostfile_parse_string(void)
{
int rc;
if (ORTE_HOSTFILE_EQUAL != orte_util_hostfile_lex()){
return NULL;
}
rc = orte_util_hostfile_lex();
if (ORTE_HOSTFILE_STRING != rc){
return NULL;
}
return strdup(orte_util_hostfile_value.sval);
}
static orte_node_t* hostfile_lookup(opal_list_t* nodes, const char* name)
{
opal_list_item_t* item;
for(item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
orte_node_t* node = (orte_node_t*)item;
if (strcmp(node->name, name) == 0) {
return node;
}
}
return NULL;
}
static int hostfile_parse_line(int token, opal_list_t* updates,
opal_list_t* exclude, bool keep_all)
{
int rc;
orte_node_t* node;
bool got_max = false;
char* value;
char **argv;
char* node_name = NULL;
char* username = NULL;
int cnt;
int number_of_slots = 0;
char buff[64];
if (ORTE_HOSTFILE_STRING == token ||
ORTE_HOSTFILE_HOSTNAME == token ||
ORTE_HOSTFILE_INT == token ||
ORTE_HOSTFILE_IPV4 == token ||
ORTE_HOSTFILE_IPV6 == token) {
if(ORTE_HOSTFILE_INT == token) {
snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
value = buff;
} else {
value = orte_util_hostfile_value.sval;
}
argv = opal_argv_split (value, '@');
cnt = opal_argv_count (argv);
if (1 == cnt) {
node_name = strdup(argv[0]);
} else if (2 == cnt) {
username = strdup(argv[0]);
node_name = strdup(argv[1]);
} else {
opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
}
opal_argv_free (argv);
// Strip off the FQDN if present, ignore IP addresses
if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(node_name) ) {
char *ptr;
if (NULL != (ptr = strchr(node_name, '.'))) {
*ptr = '\0';
}
}
/* if the first letter of the name is '^', then this is a node
* to be excluded. Remove the ^ character so the nodename is
* usable, and put it on the exclude list
*/
if ('^' == node_name[0]) {
int i, len;
len = strlen(node_name);
for (i=1; i < len; i++) {
node_name[i-1] = node_name[i];
}
node_name[len-1] = '\0'; /* truncate */
OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
"%s hostfile: node %s is being excluded",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));
/* see if this is another name for us */
if (orte_ifislocal(node_name)) {
/* Nodename has been allocated, that is for sure */
free (node_name);
node_name = strdup(orte_process_info.nodename);
}
/* Do we need to make a new node object? First check to see
if it's already in the exclude list */
if (NULL == (node = hostfile_lookup(exclude, node_name))) {
node = OBJ_NEW(orte_node_t);
node->name = node_name;
if (NULL != username) {
orte_set_attribute(&node->attributes, ORTE_NODE_USERNAME, ORTE_ATTR_LOCAL, username, OPAL_STRING);
}
opal_list_append(exclude, &node->super);
} else {
free(node_name);
}
return ORTE_SUCCESS;
}
/* this is not a node to be excluded, so we need to process it and
* add it to the "include" list. See if this host is actually us.
*/
if (orte_ifislocal(node_name)) {
/* Nodename has been allocated, that is for sure */
free (node_name);
node_name = strdup(orte_process_info.nodename);
}
OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
"%s hostfile: node %s is being included - keep all is %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
keep_all ? "TRUE" : "FALSE"));
/* Do we need to make a new node object? */
if (keep_all || NULL == (node = hostfile_lookup(updates, node_name))) {
node = OBJ_NEW(orte_node_t);
node->name = node_name;
node->slots = 1;
if (NULL != username) {
orte_set_attribute(&node->attributes, ORTE_NODE_USERNAME, ORTE_ATTR_LOCAL, username, OPAL_STRING);
}
opal_list_append(updates, &node->super);
} else {
/* this node was already found once - add a slot and mark slots as "given" */
node->slots++;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
free(node_name);
}
} else if (ORTE_HOSTFILE_RELATIVE == token) {
/* store this for later processing */
node = OBJ_NEW(orte_node_t);
node->name = strdup(orte_util_hostfile_value.sval);
opal_list_append(updates, &node->super);
} else if (ORTE_HOSTFILE_RANK == token) {
/* we can ignore the rank, but we need to extract the node name. we
* first need to shift over to the other side of the equal sign as
* this is where the node name will be
*/
while (!orte_util_hostfile_done &&
ORTE_HOSTFILE_EQUAL != token) {
token = orte_util_hostfile_lex();
}
if (orte_util_hostfile_done) {
/* bad syntax somewhere */
return ORTE_ERROR;
}
/* next position should be the node name */
token = orte_util_hostfile_lex();
if(ORTE_HOSTFILE_INT == token) {
snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
value = buff;
} else {
value = orte_util_hostfile_value.sval;
}
argv = opal_argv_split (value, '@');
cnt = opal_argv_count (argv);
if (1 == cnt) {
node_name = strdup(argv[0]);
} else if (2 == cnt) {
username = strdup(argv[0]);
node_name = strdup(argv[1]);
} else {
opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
}
opal_argv_free (argv);
// Strip off the FQDN if present, ignore IP addresses
if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(node_name) ) {
char *ptr;
if (NULL != (ptr = strchr(node_name, '.'))) {
*ptr = '\0';
}
}
/* Do we need to make a new node object? */
if (NULL == (node = hostfile_lookup(updates, node_name))) {
node = OBJ_NEW(orte_node_t);
node->name = node_name;
node->slots = 1;
if (NULL != username) {
orte_set_attribute(&node->attributes, ORTE_NODE_USERNAME, ORTE_ATTR_LOCAL, username, OPAL_STRING);
}
opal_list_append(updates, &node->super);
} else {
/* add a slot */
node->slots++;
free(node_name);
}
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: node %s slots %d nodes-given %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots,
ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN) ? "TRUE" : "FALSE"));
/* mark the slots as "given" since we take them as being the
* number specified via the rankfile
*/
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
/* skip to end of line */
while (!orte_util_hostfile_done &&
ORTE_HOSTFILE_NEWLINE != token) {
token = orte_util_hostfile_lex();
}
return ORTE_SUCCESS;
} else {
hostfile_parse_error(token);
return ORTE_ERROR;
}
free(username);
while (!orte_util_hostfile_done) {
token = orte_util_hostfile_lex();
switch (token) {
case ORTE_HOSTFILE_DONE:
goto done;
case ORTE_HOSTFILE_NEWLINE:
goto done;
case ORTE_HOSTFILE_USERNAME:
username = hostfile_parse_string();
if (NULL != username) {
orte_set_attribute(&node->attributes, ORTE_NODE_USERNAME, ORTE_ATTR_LOCAL, username, OPAL_STRING);
free(username);
}
break;
case ORTE_HOSTFILE_PORT:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "port",
true,
cur_hostfile_name, rc);
return ORTE_ERROR;
}
orte_set_attribute(&node->attributes, ORTE_NODE_PORT, ORTE_ATTR_LOCAL, &rc, OPAL_INT);
break;
case ORTE_HOSTFILE_COUNT:
case ORTE_HOSTFILE_CPU:
case ORTE_HOSTFILE_SLOTS:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "slots",
true,
cur_hostfile_name, rc);
opal_list_remove_item(updates, &node->super);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
/* multiple definitions were given for the
* slot count - this is not allowed
*/
orte_show_help("help-hostfile.txt", "slots-given",
true,
cur_hostfile_name, node->name);
opal_list_remove_item(updates, &node->super);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->slots = rc;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
/* Ensure that slots_max >= slots */
if (node->slots_max != 0 && node->slots_max < node->slots) {
node->slots_max = node->slots;
}
break;
case ORTE_HOSTFILE_SLOTS_MAX:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "max_slots",
true,
cur_hostfile_name, ((size_t) rc));
opal_list_remove_item(updates, &node->super);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
/* Only take this update if it puts us >= node_slots */
if (rc >= node->slots) {
if (node->slots_max != rc) {
node->slots_max = rc;
got_max = true;
}
} else {
orte_show_help("help-hostfile.txt", "max_slots_lt",
true,
cur_hostfile_name, node->slots, rc);
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
opal_list_remove_item(updates, &node->super);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
break;
case ORTE_HOSTFILE_STRING:
case ORTE_HOSTFILE_INT:
/* just ignore it */
break;
default:
hostfile_parse_error(token);
opal_list_remove_item(updates, &node->super);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
if (number_of_slots > node->slots) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
opal_list_remove_item(updates, &node->super);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
}
done:
if (got_max && !ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
node->slots = node->slots_max;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
}
return ORTE_SUCCESS;
}
/**
* Parse the specified file into a node list.
*/
static int hostfile_parse(const char *hostfile, opal_list_t* updates,
opal_list_t* exclude, bool keep_all)
{
int token;
int rc = ORTE_SUCCESS;
cur_hostfile_name = hostfile;
orte_util_hostfile_done = false;
orte_util_hostfile_in = fopen(hostfile, "r");
if (NULL == orte_util_hostfile_in) {
if (NULL == orte_default_hostfile ||
0 != strcmp(orte_default_hostfile, hostfile)) {
/* not the default hostfile, so not finding it
* is an error
*/
orte_show_help("help-hostfile.txt", "no-hostfile", true, hostfile);
rc = ORTE_ERR_SILENT;
goto unlock;
}
/* if this is the default hostfile and it was given,
* then it's an error
*/
if (orte_default_hostfile_given) {
orte_show_help("help-hostfile.txt", "no-hostfile", true, hostfile);
rc = ORTE_ERR_NOT_FOUND;
goto unlock;
}
/* otherwise, not finding it is okay */
rc = ORTE_SUCCESS;
goto unlock;
}
while (!orte_util_hostfile_done) {
token = orte_util_hostfile_lex();
switch (token) {
case ORTE_HOSTFILE_DONE:
orte_util_hostfile_done = true;
break;
case ORTE_HOSTFILE_NEWLINE:
break;
/*
* This looks odd, since we have several forms of host-definitions:
* hostname just plain as it is, being a ORTE_HOSTFILE_STRING
* IP4s and user@IPv4s
* hostname.domain and user@hostname.domain
*/
case ORTE_HOSTFILE_STRING:
case ORTE_HOSTFILE_INT:
case ORTE_HOSTFILE_HOSTNAME:
case ORTE_HOSTFILE_IPV4:
case ORTE_HOSTFILE_IPV6:
case ORTE_HOSTFILE_RELATIVE:
case ORTE_HOSTFILE_RANK:
rc = hostfile_parse_line(token, updates, exclude, keep_all);
if (ORTE_SUCCESS != rc) {
goto unlock;
}
break;
default:
hostfile_parse_error(token);
goto unlock;
}
}
fclose(orte_util_hostfile_in);
orte_util_hostfile_in = NULL;
orte_util_hostfile_lex_destroy();
unlock:
cur_hostfile_name = NULL;
return rc;
}
/**
* Parse the provided hostfile and add the nodes to the list.
*/
int orte_util_add_hostfile_nodes(opal_list_t *nodes,
char *hostfile)
{
opal_list_t exclude, adds;
opal_list_item_t *item, *itm;
int rc;
orte_node_t *nd, *node;
bool found;
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: checking hostfile %s for nodes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
OBJ_CONSTRUCT(&exclude, opal_list_t);
OBJ_CONSTRUCT(&adds, opal_list_t);
/* parse the hostfile and add any new contents to the list */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &adds, &exclude, false))) {
goto cleanup;
}
/* check for any relative node directives */
for (item = opal_list_get_first(&adds);
item != opal_list_get_end(&adds);
item = opal_list_get_next(item)) {
node=(orte_node_t*)item;
if ('+' == node->name[0]) {
orte_show_help("help-hostfile.txt", "hostfile:relative-syntax",
true, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
}
/* remove from the list of nodes those that are in the exclude list */
while (NULL != (item = opal_list_remove_first(&exclude))) {
nd = (orte_node_t*)item;
/* check for matches on nodes */
for (itm = opal_list_get_first(&adds);
itm != opal_list_get_end(&adds);
itm = opal_list_get_next(itm)) {
node = (orte_node_t*)itm;
if (0 == strcmp(nd->name, node->name)) {
/* match - remove it */
opal_list_remove_item(&adds, itm);
OBJ_RELEASE(itm);
break;
}
}
OBJ_RELEASE(item);
}
/* transfer across all unique nodes */
while (NULL != (item = opal_list_remove_first(&adds))) {
nd = (orte_node_t*)item;
found = false;
for (itm = opal_list_get_first(nodes);
itm != opal_list_get_end(nodes);
itm = opal_list_get_next(itm)) {
node = (orte_node_t*)itm;
if (0 == strcmp(nd->name, node->name)) {
found = true;
break;
}
}
if (!found) {
opal_list_append(nodes, &nd->super);
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: adding node %s slots %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nd->name, nd->slots));
} else {
OBJ_RELEASE(item);
}
}
cleanup:
OPAL_LIST_DESTRUCT(&exclude);
OPAL_LIST_DESTRUCT(&adds);
return rc;
}
/* Parse the provided hostfile and filter the nodes that are
* on the input list, removing those that
* are not found in the hostfile
*/
int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
char *hostfile,
bool remove)
{
opal_list_t newnodes, exclude;
opal_list_item_t *item1, *item2, *next, *item3;
orte_node_t *node_from_list, *node_from_file, *node_from_pool, *node3;
int rc = ORTE_SUCCESS;
char *cptr;
int num_empty, nodeidx;
bool want_all_empty = false;
opal_list_t keep;
bool found;
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: filtering nodes through hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
/* parse the hostfile and create local list of findings */
OBJ_CONSTRUCT(&newnodes, opal_list_t);
OBJ_CONSTRUCT(&exclude, opal_list_t);
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &newnodes, &exclude, false))) {
OBJ_DESTRUCT(&newnodes);
OBJ_DESTRUCT(&exclude);
return rc;
}
/* if the hostfile was empty, then treat it as a no-op filter */
if (0 == opal_list_get_size(&newnodes)) {
OBJ_DESTRUCT(&newnodes);
OBJ_DESTRUCT(&exclude);
/* indicate that the hostfile was empty */
return ORTE_ERR_TAKE_NEXT_OPTION;
}
/* remove from the list of newnodes those that are in the exclude list
* since we could have added duplicate names above due to the */
while (NULL != (item1 = opal_list_remove_first(&exclude))) {
node_from_file = (orte_node_t*)item1;
/* check for matches on nodes */
for (item2 = opal_list_get_first(&newnodes);
item2 != opal_list_get_end(&newnodes);
item2 = opal_list_get_next(item2)) {
orte_node_t *node = (orte_node_t*)item2;
if (0 == strcmp(node_from_file->name, node->name)) {
/* match - remove it */
opal_list_remove_item(&newnodes, item2);
OBJ_RELEASE(item2);
break;
}
}
OBJ_RELEASE(item1);
}
/* now check our nodes and keep or mark those that match. We can
* destruct our hostfile list as we go since this won't be needed
*/
OBJ_CONSTRUCT(&keep, opal_list_t);
while (NULL != (item2 = opal_list_remove_first(&newnodes))) {
node_from_file = (orte_node_t*)item2;
next = opal_list_get_next(item2);
/* see if this is a relative node syntax */
if ('+' == node_from_file->name[0]) {
/* see if we specified empty nodes */
if ('e' == node_from_file->name[1] ||
'E' == node_from_file->name[1]) {
/* request for empty nodes - do they want
* all of them?
*/
if (NULL != (cptr = strchr(node_from_file->name, ':'))) {
/* the colon indicates a specific # are requested */
cptr++; /* step past : */
num_empty = strtol(cptr, NULL, 10);
} else {
/* want them all - set num_empty to max */
num_empty = INT_MAX;
want_all_empty = true;
}
/* search the list of nodes provided to us and find those
* that are empty
*/
item1 = opal_list_get_first(nodes);
while (0 < num_empty && item1 != opal_list_get_end(nodes)) {
node_from_list = (orte_node_t*)item1;
next = opal_list_get_next(item1); /* keep our place */
if (0 == node_from_list->slots_inuse) {
/* check to see if this node is explicitly called
* out later - if so, don't use it here
*/
for (item3 = opal_list_get_first(&newnodes);
item3 != opal_list_get_end(&newnodes);
item3 = opal_list_get_next(item3)) {
node3 = (orte_node_t*)item3;
if (0 == strcmp(node3->name, node_from_list->name)) {
/* match - don't use it */
goto skipnode;
}
}
if (remove) {
/* remove item from list */
opal_list_remove_item(nodes, item1);
/* xfer to keep list */
opal_list_append(&keep, item1);
} else {
/* mark as included */
ORTE_FLAG_SET(node_from_list, ORTE_NODE_FLAG_MAPPED);
}
--num_empty;
}
skipnode:
item1 = next;
}
/* did they get everything they wanted? */
if (!want_all_empty && 0 < num_empty) {
orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
true, num_empty);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
} else if ('n' == node_from_file->name[1] ||
'N' == node_from_file->name[1]) {
/* they want a specific relative node #, so
* look it up on global pool
*/
nodeidx = strtol(&node_from_file->name[2], NULL, 10);
if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeidx))) {
/* this is an error */
orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
true, nodeidx, node_from_file->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* search the list of nodes provided to us and find it */
for (item1 = opal_list_get_first(nodes);
item1 != opal_list_get_end(nodes);
item1 = opal_list_get_next(nodes)) {
node_from_list = (orte_node_t*)item1;
if (0 == strcmp(node_from_list->name, node_from_pool->name)) {
if (remove) {
/* match - remove item from list */
opal_list_remove_item(nodes, item1);
/* xfer to keep list */
opal_list_append(&keep, item1);
} else {
/* mark as included */
ORTE_FLAG_SET(node_from_list, ORTE_NODE_FLAG_MAPPED);
}
break;
}
}
} else {
/* invalid relative node syntax */
orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
true, node_from_file->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
} else {
/* we are looking for a specific node on the list
* search the provided list of nodes to see if this
* one is found
*/
found = false;
for (item1 = opal_list_get_first(nodes);
item1 != opal_list_get_end(nodes);
item1 = opal_list_get_next(item1)) {
node_from_list = (orte_node_t*)item1;
/* we have converted all aliases for ourself
* to our own detected nodename, so no need
* to check for interfaces again - a simple
* strcmp will suffice */
if (0 == strcmp(node_from_file->name, node_from_list->name)) {
/* if the slot count here is less than the
* total slots avail on this node, set it
* to the specified count - this allows people
* to subdivide an allocation
*/
if (ORTE_FLAG_TEST(node_from_file, ORTE_NODE_FLAG_SLOTS_GIVEN) &&
node_from_file->slots < node_from_list->slots) {
node_from_list->slots = node_from_file->slots;
}
if (remove) {
/* remove the node from the list */
opal_list_remove_item(nodes, item1);
/* xfer it to keep list */
opal_list_append(&keep, item1);
} else {
/* mark as included */
ORTE_FLAG_SET(node_from_list, ORTE_NODE_FLAG_MAPPED);
}
found = true;
break;
}
}
/* if the host in the newnode list wasn't found,
* then that is an error we need to report to the
* user and abort
*/
if (!found) {
orte_show_help("help-hostfile.txt", "hostfile:extra-node-not-found",
true, hostfile, node_from_file->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
}
/* cleanup the newnode list */
OBJ_RELEASE(item2);
}
/* if we still have entries on our hostfile list, then
* there were requested hosts that were not in our allocation.
* This is an error - report it to the user and return an error
*/
if (0 != opal_list_get_size(&newnodes)) {
orte_show_help("help-hostfile.txt", "not-all-mapped-alloc",
true, hostfile);
while (NULL != (item1 = opal_list_remove_first(&newnodes))) {
OBJ_RELEASE(item1);
}
OBJ_DESTRUCT(&newnodes);
return ORTE_ERR_SILENT;
}
if (!remove) {
/* all done */
OBJ_DESTRUCT(&newnodes);
return ORTE_SUCCESS;
}
/* clear the rest of the nodes list */
while (NULL != (item1 = opal_list_remove_first(nodes))) {
OBJ_RELEASE(item1);
}
/* the nodes list has been cleared - rebuild it in order */
while (NULL != (item1 = opal_list_remove_first(&keep))) {
opal_list_append(nodes, item1);
}
cleanup:
OBJ_DESTRUCT(&newnodes);
return rc;
}
int orte_util_get_ordered_host_list(opal_list_t *nodes,
char *hostfile)
{
opal_list_t exclude;
opal_list_item_t *item, *itm, *item2, *item1;
char *cptr;
int num_empty, i, nodeidx, startempty=0;
bool want_all_empty=false;
orte_node_t *node_from_pool, *newnode;
int rc;
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: creating ordered list of hosts from hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
OBJ_CONSTRUCT(&exclude, opal_list_t);
/* parse the hostfile and add the contents to the list, keeping duplicates */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude, true))) {
goto cleanup;
}
/* parse the nodes to process any relative node directives */
item2 = opal_list_get_first(nodes);
while (item2 != opal_list_get_end(nodes)) {
orte_node_t *node=(orte_node_t*)item2;
/* save the next location in case this one gets removed */
item1 = opal_list_get_next(item2);
if ('+' != node->name[0]) {
item2 = item1;
continue;
}
/* see if we specified empty nodes */
if ('e' == node->name[1] ||
'E' == node->name[1]) {
/* request for empty nodes - do they want
* all of them?
*/
if (NULL != (cptr = strchr(node->name, ':'))) {
/* the colon indicates a specific # are requested */
cptr++; /* step past : */
num_empty = strtol(cptr, NULL, 10);
} else {
/* want them all - set num_empty to max */
num_empty = INT_MAX;
want_all_empty = true;
}
/* insert empty nodes into newnodes list in place of the current item.
* since item1 is the next item, we insert in front of it
*/
if (!orte_hnp_is_allocated && 0 == startempty) {
startempty = 1;
}
for (i=startempty; 0 < num_empty && i < orte_node_pool->size; i++) {
if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (0 == node_from_pool->slots_inuse) {
newnode = OBJ_NEW(orte_node_t);
newnode->name = strdup(node_from_pool->name);
/* if the slot count here is less than the
* total slots avail on this node, set it
* to the specified count - this allows people
* to subdivide an allocation
*/
if (node->slots < node_from_pool->slots) {
newnode->slots = node->slots;
} else {
newnode->slots = node_from_pool->slots;
}
opal_list_insert_pos(nodes, item1, &newnode->super);
/* track number added */
--num_empty;
}
}
/* bookmark where we stopped in case they ask for more */
startempty = i;
/* did they get everything they wanted? */
if (!want_all_empty && 0 < num_empty) {
orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
true, num_empty);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* since we have expanded the provided node, remove
* it from list
*/
opal_list_remove_item(nodes, item2);
OBJ_RELEASE(item2);
} else if ('n' == node->name[1] ||
'N' == node->name[1]) {
/* they want a specific relative node #, so
* look it up on global pool
*/
nodeidx = strtol(&node->name[2], NULL, 10);
/* if the HNP is not allocated, then we need to
* adjust the index as the node pool is offset
* by one
*/
if (!orte_hnp_is_allocated) {
nodeidx++;
}
/* see if that location is filled */
if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeidx))) {
/* this is an error */
orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
true, nodeidx, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* create the node object */
newnode = OBJ_NEW(orte_node_t);
newnode->name = strdup(node_from_pool->name);
/* if the slot count here is less than the
* total slots avail on this node, set it
* to the specified count - this allows people
* to subdivide an allocation
*/
if (node->slots < node_from_pool->slots) {
newnode->slots = node->slots;
} else {
newnode->slots = node_from_pool->slots;
}
/* insert it before item1 */
opal_list_insert_pos(nodes, item1, &newnode->super);
/* since we have expanded the provided node, remove
* it from list
*/
opal_list_remove_item(nodes, item2);
OBJ_RELEASE(item2);
} else {
/* invalid relative node syntax */
orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
true, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* move to next */
item2 = item1;
}
/* remove from the list of nodes those that are in the exclude list */
while(NULL != (item = opal_list_remove_first(&exclude))) {
orte_node_t *exnode = (orte_node_t*)item;
/* check for matches on nodes */
for (itm = opal_list_get_first(nodes);
itm != opal_list_get_end(nodes);
itm = opal_list_get_next(itm)) {
orte_node_t *node=(orte_node_t*)itm;
if (0 == strcmp(exnode->name, node->name)) {
/* match - remove it */
opal_list_remove_item(nodes, itm);
OBJ_RELEASE(itm);
/* have to cycle through the entire list as we could
* have duplicates
*/
}
}
OBJ_RELEASE(item);
}
cleanup:
OBJ_DESTRUCT(&exclude);
return rc;
}