Add relative indexing capabilities for hostfile and -host - we can now reference hosts using a relative syntax.
See the orte_hosts manpage for an explanation. This commit was SVN r19364.
Этот коммит содержится в:
родитель
ef2bb46e45
Коммит
6d82efba21
@ -32,7 +32,10 @@
|
||||
|
||||
#include "dash_host.h"
|
||||
|
||||
|
||||
/* we can only enter this routine if no other allocation
|
||||
* was found, so we only need to know that finding any
|
||||
* relative node syntax should generate an immediate error
|
||||
*/
|
||||
int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
bool *override_oversubscribed,
|
||||
char ** host_argv)
|
||||
@ -71,6 +74,17 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
bump the slots count for each duplicate */
|
||||
|
||||
for (i = 0; NULL != mapped_nodes[i]; ++i) {
|
||||
/* if the specified node contains a relative node syntax,
|
||||
* this is an error
|
||||
*/
|
||||
if ('+' == mapped_nodes[i][0]) {
|
||||
orte_show_help("help-dash-host.txt", "dash-host:relative-syntax",
|
||||
true, mapped_nodes[i]);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* see if the node is already on the list */
|
||||
for (item = opal_list_get_first(nodes);
|
||||
item != opal_list_get_end(nodes);
|
||||
item = opal_list_get_next(item)) {
|
||||
@ -127,16 +141,23 @@ cleanup:
|
||||
}
|
||||
|
||||
|
||||
/* the -host option can always be used in both absolute
|
||||
* and relative mode, so we have to check for pre-existing
|
||||
* allocations if we are to use relative node syntax
|
||||
*/
|
||||
int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
|
||||
char** host_argv)
|
||||
{
|
||||
opal_list_item_t* item;
|
||||
bool found;
|
||||
opal_list_item_t *next;
|
||||
orte_std_cntr_t i, j, k;
|
||||
orte_std_cntr_t i, j, k, len_mapped_node=0;
|
||||
int rc;
|
||||
char **mapped_nodes = NULL, **mini_map;
|
||||
orte_node_t *node;
|
||||
char **mapped_nodes = NULL, **mini_map, *cptr;
|
||||
orte_node_t *node, **nodepool;
|
||||
int nodeidx;
|
||||
int num_empty=0;
|
||||
bool want_all_empty = false;
|
||||
|
||||
/* if the incoming node list is empty, then there
|
||||
* is nothing to filter!
|
||||
@ -145,36 +166,89 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* setup for relative node syntax */
|
||||
nodepool = (orte_node_t**)orte_node_pool->addr;
|
||||
|
||||
/* Accumulate all of the host name mappings */
|
||||
for (j = 0; j < opal_argv_count(host_argv); ++j) {
|
||||
mini_map = opal_argv_split(host_argv[j], ',');
|
||||
|
||||
if (mapped_nodes == NULL) {
|
||||
mapped_nodes = mini_map;
|
||||
} else {
|
||||
for (k = 0; NULL != mini_map[k]; ++k) {
|
||||
rc = opal_argv_append_nosize(&mapped_nodes,
|
||||
mini_map[k]);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
for (k = 0; NULL != mini_map[k]; ++k) {
|
||||
if ('+' == mini_map[k][0]) {
|
||||
/* see if we specified empty nodes */
|
||||
if ('e' == mini_map[k][1] ||
|
||||
'E' == mini_map[k][1]) {
|
||||
/* request for empty nodes - do they want
|
||||
* all of them?
|
||||
*/
|
||||
if (NULL != (cptr = strchr(mini_map[k], ':'))) {
|
||||
/* the colon indicates a specific # are requested */
|
||||
cptr++; /* step past : */
|
||||
num_empty += strtol(cptr, NULL, 10);
|
||||
} else {
|
||||
/* want them all - set num_empty to max */
|
||||
num_empty = INT_MAX;
|
||||
want_all_empty = true;
|
||||
}
|
||||
} else if ('n' == mini_map[k][1] ||
|
||||
'N' == mini_map[k][1]) {
|
||||
/* they want a specific relative node #, so
|
||||
* look it up on global pool
|
||||
*/
|
||||
nodeidx = strtol(&mini_map[k][2], NULL, 10);
|
||||
if (nodeidx < 0 ||
|
||||
nodeidx > (int)orte_node_pool->size) {
|
||||
/* this is an error */
|
||||
orte_show_help("help-dash-host.txt", "dash-host:relative-node-out-of-bounds",
|
||||
true, nodeidx, mini_map[k]);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
/* if the HNP is not allocated, then we need to
|
||||
* adjust the index as the node pool is offset
|
||||
* by one
|
||||
*/
|
||||
if (!orte_hnp_is_allocated) {
|
||||
nodeidx++;
|
||||
}
|
||||
/* see if that location is filled */
|
||||
|
||||
if (NULL == nodepool[nodeidx]) {
|
||||
/* this is an error */
|
||||
orte_show_help("help-dash-host.txt", "dash-host:relative-node-not-found",
|
||||
true, nodeidx, mini_map[k]);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
/* add this node to the list */
|
||||
opal_argv_append_nosize(&mapped_nodes, nodepool[nodeidx]->name);
|
||||
} else {
|
||||
/* invalid relative node syntax */
|
||||
orte_show_help("help-dash-host.txt", "dash-host:invalid-relative-node-syntax",
|
||||
true, mini_map[k]);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
} else { /* non-relative syntax - add to list */
|
||||
if (OPAL_SUCCESS != (rc = opal_argv_append_nosize(&mapped_nodes,
|
||||
mini_map[k]))) {
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
opal_argv_free(mini_map);
|
||||
}
|
||||
opal_argv_free(mini_map);
|
||||
}
|
||||
|
||||
/* Did we find anything? If not, then do nothing */
|
||||
if (NULL == mapped_nodes) {
|
||||
if (NULL == mapped_nodes && 0 == num_empty) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* we found some info - filter what is on the list...
|
||||
* i.e., go through the list and remove any nodes that
|
||||
* were -not- included on the -host list
|
||||
*/
|
||||
j=0;
|
||||
k = opal_argv_count(mapped_nodes);
|
||||
len_mapped_node = opal_argv_count(mapped_nodes);
|
||||
item = opal_list_get_first(nodes);
|
||||
while (item != opal_list_get_end(nodes)) {
|
||||
/* hang on to next item in case this one gets removed */
|
||||
@ -182,25 +256,33 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
|
||||
node = (orte_node_t*)item;
|
||||
/* search -host list to see if this one is found */
|
||||
found = false;
|
||||
for (i = 0; NULL != mapped_nodes[i]; ++i) {
|
||||
for (i = 0; i < len_mapped_node; ++i) {
|
||||
/* we have a match if one of two conditions is met:
|
||||
* 1. the node_name and mapped_nodes directly match
|
||||
* 2. the node_name is the local system name AND
|
||||
* either the mapped_node is "localhost" OR it
|
||||
* is a local interface as found by opal_ifislocal
|
||||
*/
|
||||
if (0 == strcmp(node->name, mapped_nodes[i]) ||
|
||||
(0 == strcmp(node->name, orte_process_info.nodename) &&
|
||||
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
|
||||
* 1. the node_name and mapped_nodes directly match
|
||||
* 2. the node_name is the local system name AND
|
||||
* either the mapped_node is "localhost" OR it
|
||||
* is a local interface as found by opal_ifislocal
|
||||
*/
|
||||
if (NULL != mapped_nodes[i] &&
|
||||
(0 == strcmp(node->name, mapped_nodes[i]) ||
|
||||
(0 == strcmp(node->name, orte_process_info.nodename) &&
|
||||
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i]))))) {
|
||||
found = true; /* found it - leave it alone */
|
||||
j++;
|
||||
/* keep cycling here in case there are multiple instances
|
||||
* of the node on the mapped_node array - this will
|
||||
* allow us to properly account for them all so we don't
|
||||
* think something was specified but wasn't found
|
||||
*/
|
||||
free(mapped_nodes[i]);
|
||||
mapped_nodes[i] = NULL;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
/* if this node wasn't found on the list of explicitly called-out
|
||||
* nodes, see if we wanted empty nodes and if this one is empty
|
||||
*/
|
||||
if (0 < num_empty && 0 == node->slots_inuse) {
|
||||
/* both true - keep this one */
|
||||
found = true;
|
||||
--num_empty;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
opal_list_remove_item(nodes, item);
|
||||
OBJ_RELEASE(item);
|
||||
@ -209,22 +291,34 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
|
||||
}
|
||||
|
||||
/* was something specified that was -not- found? */
|
||||
if (j < k) {
|
||||
char *tmp;
|
||||
tmp = opal_argv_join(mapped_nodes, ',');
|
||||
orte_show_help("help-dash-host.txt", "not-all-mapped-alloc",
|
||||
true, tmp);
|
||||
free(tmp);
|
||||
for (i=0; i < len_mapped_node; i++) {
|
||||
if (NULL != mapped_nodes[i]) {
|
||||
orte_show_help("help-dash-host.txt", "not-all-mapped-alloc",
|
||||
true, mapped_nodes[i]);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
/* did they ask for more than we could provide */
|
||||
if (!want_all_empty && 0 < num_empty) {
|
||||
orte_show_help("help-dash-host.txt", "dash-host:not-enough-empty",
|
||||
true, num_empty);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rc = ORTE_SUCCESS;
|
||||
/* done filtering existing list */
|
||||
|
||||
|
||||
cleanup:
|
||||
for (i=0; i < len_mapped_node; i++) {
|
||||
if (NULL != mapped_nodes[i]) {
|
||||
free(mapped_nodes[i]);
|
||||
mapped_nodes[i] = NULL;
|
||||
}
|
||||
}
|
||||
if (NULL != mapped_nodes) {
|
||||
opal_argv_free(mapped_nodes);
|
||||
free(mapped_nodes);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -20,9 +20,49 @@
|
||||
#
|
||||
|
||||
[not-all-mapped-alloc]
|
||||
Some of the requested hosts are not included in the current allocation.
|
||||
At least one of the requested hosts is not included in the current allocation.
|
||||
|
||||
The requested hosts were specified with --host as:
|
||||
%s
|
||||
Missing requested host: %s
|
||||
|
||||
Please check your allocation or your request.
|
||||
#
|
||||
[dash-host:relative-syntax]
|
||||
A relative host was specified, but no prior allocation has been made.
|
||||
Thus, there is no way to determine the proper host to be used.
|
||||
|
||||
-host: %s
|
||||
|
||||
Please see the orte_hosts man page for further information.
|
||||
#
|
||||
[dash-host:relative-node-not-found]
|
||||
A relative host was specified, but was not found. The requested host was
|
||||
specified with --host as:
|
||||
|
||||
Index: %d
|
||||
Syntax given: %s
|
||||
|
||||
Please see the orte_hosts man page for further information.
|
||||
#
|
||||
[dash-host:relative-node-out-of-bounds]
|
||||
A relative host was specified, but the index given is beyond the number
|
||||
of hosts in the current allocation:
|
||||
|
||||
Index: %d
|
||||
#hosts: %d
|
||||
|
||||
You could obtain a larger allocation or reduce the relative host index.
|
||||
Please see the orte_hosts man page for further information.
|
||||
#
|
||||
[dash-host:invalid-relative-node-syntax]
|
||||
A relative host was improperly specified - the value provided was:
|
||||
|
||||
-host: %s
|
||||
|
||||
You may have forgotten to preface a node with 'N' or 'n', or to use 'e' or 'E' to indicate
|
||||
empty nodes. Please see the orte_hosts man page for further information.
|
||||
#
|
||||
[dash-host:not-enough-empty]
|
||||
The requested number of empty hosts was not available - the system was short by %d hosts.
|
||||
|
||||
Please recheck your allocation - further information is available on the
|
||||
orte_hosts man page.
|
||||
|
@ -60,3 +60,44 @@ The requested hosts were in this hostfile:
|
||||
|
||||
Please verify that you have specified the allocated resources properly in
|
||||
the provided hostfile.
|
||||
#
|
||||
[hostfile:relative-syntax]
|
||||
A relative host was specified, but no prior allocation has been made.
|
||||
Thus, there is no way to determine the proper host to be used.
|
||||
|
||||
hostfile entry: %s
|
||||
|
||||
Please see the orte_hosts man page for further information.
|
||||
#
|
||||
[hostfile:relative-node-not-found]
|
||||
A relative host was specified, but was not found. The requested host was
|
||||
specified in the hostfile as:
|
||||
|
||||
Index: %d
|
||||
Syntax given: %s
|
||||
|
||||
Please see the orte_hosts man page for further information.
|
||||
#
|
||||
[hostfile:relative-node-out-of-bounds]
|
||||
A relative host was specified, but the index given is beyond the number
|
||||
of hosts in the current allocation:
|
||||
|
||||
Index: %d
|
||||
#hosts: %d
|
||||
|
||||
You could obtain a larger allocation or reduce the relative host index.
|
||||
Please see the orte_hosts man page for further information.
|
||||
#
|
||||
[hostfile:invalid-relative-node-syntax]
|
||||
A relative host was improperly specified - the value provided was:
|
||||
|
||||
hostfile entry: %s
|
||||
|
||||
You may have forgotten to preface a node with 'N' or 'n', or to use 'e' or 'E' to indicate
|
||||
empty nodes. Please see the orte_hosts man page for further information.
|
||||
#
|
||||
[hostfile:not-enough-empty]
|
||||
The requested number of empty hosts was not available - the system was short by %d hosts.
|
||||
|
||||
Please recheck your allocation - further information is available on the
|
||||
orte_hosts man page.
|
||||
|
@ -223,6 +223,10 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = node_name;
|
||||
}
|
||||
} else if (ORTE_HOSTFILE_RELATIVE == token) {
|
||||
/* store this for later processing */
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = strdup(orte_util_hostfile_value.sval);
|
||||
} else {
|
||||
hostfile_parse_error(token);
|
||||
return ORTE_ERROR;
|
||||
@ -363,6 +367,13 @@ static int hostfile_parse(const char *hostfile, opal_list_t* updates, opal_list_
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_HOSTFILE_RELATIVE:
|
||||
rc = hostfile_parse_line(token, updates, exclude, keep_all);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
goto unlock;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
hostfile_parse_error(token);
|
||||
goto unlock;
|
||||
@ -403,6 +414,20 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* parse the nodes to check for any relative node directives */
|
||||
for (item = opal_list_get_first(nodes);
|
||||
item != opal_list_get_end(nodes);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_node_t *node=(orte_node_t*)item;
|
||||
|
||||
if ('+' == node->name[0]) {
|
||||
orte_show_help("help-hostfile.txt", "hostfile:relative-syntax",
|
||||
true, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* remove from the list of nodes those that are in the exclude list */
|
||||
while(NULL != (item = opal_list_remove_first(&exclude))) {
|
||||
orte_node_t *exnode = (orte_node_t*)item;
|
||||
@ -447,8 +472,12 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
|
||||
opal_list_item_t *item1, *item2;
|
||||
orte_node_t *node_from_list, *node_from_file;
|
||||
bool node_found;
|
||||
int rc;
|
||||
|
||||
int rc = ORTE_SUCCESS;
|
||||
char *cptr;
|
||||
int num_empty, nodeidx, i, startempty=0;
|
||||
orte_node_t **nodepool, *newnode;
|
||||
bool want_all_empty = false;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
"%s hostfile: filtering nodes through hostfile %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
|
||||
@ -461,14 +490,140 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* remove from the list of newnodes those that are in the exclude list */
|
||||
while(NULL != (item1 = opal_list_remove_first(&exclude))) {
|
||||
/* setup for relative node syntax */
|
||||
nodepool = (orte_node_t**)orte_node_pool->addr;
|
||||
|
||||
/* parse the newnodes to process any relative node directives */
|
||||
item2 = opal_list_get_first(&newnodes);
|
||||
while (item2 != opal_list_get_end(&newnodes)) {
|
||||
orte_node_t *node=(orte_node_t*)item2;
|
||||
|
||||
/* save the next location in case this one gets removed */
|
||||
item1 = opal_list_get_next(item2);
|
||||
|
||||
if ('+' != node->name[0]) {
|
||||
item2 = item1;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* see if we specified empty nodes */
|
||||
if ('e' == node->name[1] ||
|
||||
'E' == node->name[1]) {
|
||||
/* request for empty nodes - do they want
|
||||
* all of them?
|
||||
*/
|
||||
if (NULL != (cptr = strchr(node->name, ':'))) {
|
||||
/* the colon indicates a specific # are requested */
|
||||
cptr++; /* step past : */
|
||||
num_empty = strtol(cptr, NULL, 10);
|
||||
} else {
|
||||
/* want them all - set num_empty to max */
|
||||
num_empty = INT_MAX;
|
||||
want_all_empty = true;
|
||||
}
|
||||
/* add empty nodes to newnodes list */
|
||||
if (!orte_hnp_is_allocated && 0 == startempty) {
|
||||
startempty = 1;
|
||||
}
|
||||
for (i=startempty; 0 < num_empty && i < orte_node_pool->size && NULL != nodepool[i]; i++) {
|
||||
if (0 == nodepool[i]->slots_inuse) {
|
||||
if (NULL == (newnode = hostfile_lookup(&newnodes, nodepool[i]->name))) {
|
||||
newnode = OBJ_NEW(orte_node_t);
|
||||
newnode->name = strdup(nodepool[i]->name);
|
||||
/* keep track of slots allocated in hostfile, if any - we
|
||||
* will use this down below to determine how many slots
|
||||
* to actually allocate
|
||||
*/
|
||||
newnode->slots = node->slots;
|
||||
}
|
||||
/* even if we didn't create it, we have to put it back on list
|
||||
* in a manner that preserves order
|
||||
*/
|
||||
opal_list_insert_pos(&newnodes, item1, &newnode->super);
|
||||
/* track number added */
|
||||
--num_empty;
|
||||
}
|
||||
}
|
||||
/* bookmark where we stopped in case they ask for more */
|
||||
startempty = i;
|
||||
/* did they get everything they wanted? */
|
||||
if (!want_all_empty && 0 < num_empty) {
|
||||
orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
|
||||
true, num_empty);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
/* since we have expanded the provided node, remove
|
||||
* it from list
|
||||
*/
|
||||
opal_list_remove_item(&newnodes, item2);
|
||||
OBJ_RELEASE(item2);
|
||||
} else if ('n' == node->name[1] ||
|
||||
'N' == node->name[1]) {
|
||||
/* they want a specific relative node #, so
|
||||
* look it up on global pool
|
||||
*/
|
||||
nodeidx = strtol(&node->name[2], NULL, 10);
|
||||
if (nodeidx < 0 ||
|
||||
nodeidx > (int)orte_node_pool->size) {
|
||||
/* this is an error */
|
||||
orte_show_help("help-hostfile.txt", "hostfile:relative-node-out-of-bounds",
|
||||
true, nodeidx, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
/* if the HNP is not allocated, then we need to
|
||||
* adjust the index as the node pool is offset
|
||||
* by one
|
||||
*/
|
||||
if (!orte_hnp_is_allocated) {
|
||||
nodeidx++;
|
||||
}
|
||||
/* see if that location is filled */
|
||||
if (NULL == nodepool[nodeidx]) {
|
||||
/* this is an error */
|
||||
orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
|
||||
true, nodeidx, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
if (NULL == (newnode = hostfile_lookup(&newnodes, nodepool[nodeidx]->name))) {
|
||||
newnode = OBJ_NEW(orte_node_t);
|
||||
newnode->name = strdup(nodepool[nodeidx]->name);
|
||||
/* keep track of slots allocated in hostfile, if any - we
|
||||
* will use this down below to determine how many slots
|
||||
* to actually allocate
|
||||
*/
|
||||
newnode->slots = node->slots;
|
||||
}
|
||||
/* even if we didn't create it, we have to put it back on list */
|
||||
opal_list_append(&newnodes, &newnode->super);
|
||||
/* since we have expanded the provided node, remove
|
||||
* it from list
|
||||
*/
|
||||
opal_list_remove_item(&newnodes, item2);
|
||||
OBJ_RELEASE(item2);
|
||||
} else {
|
||||
/* invalid relative node syntax */
|
||||
orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
|
||||
true, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* move to next */
|
||||
item2 = item1;
|
||||
}
|
||||
|
||||
/* remove from the list of newnodes those that are in the exclude list
|
||||
* since we could have added duplicate names above due to the relative node syntax */
|
||||
while (NULL != (item1 = opal_list_remove_first(&exclude))) {
|
||||
node_from_file = (orte_node_t*)item1;
|
||||
/* check for matches on nodes */
|
||||
for (item2 = opal_list_get_first(&newnodes);
|
||||
item2 != opal_list_get_end(&newnodes);
|
||||
item2 = opal_list_get_next(item2)) {
|
||||
orte_node_t *node=(orte_node_t*)item2;
|
||||
orte_node_t *node = (orte_node_t*)item2;
|
||||
if (0 == strcmp(node_from_file->name, node->name)) {
|
||||
/* match - remove it */
|
||||
opal_list_remove_item(&newnodes, item2);
|
||||
@ -533,16 +688,21 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&newnodes);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_util_get_ordered_host_list(opal_list_t *nodes,
|
||||
char *hostfile)
|
||||
{
|
||||
opal_list_t exclude;
|
||||
opal_list_item_t *item, *itm;
|
||||
opal_list_item_t *item, *itm, *item2, *item1;
|
||||
char *cptr;
|
||||
int num_empty, i, nodeidx, startempty=0;
|
||||
bool want_all_empty;
|
||||
orte_node_t **nodepool, *newnode;
|
||||
int rc;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
@ -556,6 +716,138 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* setup to parse relative syntax */
|
||||
nodepool = (orte_node_t**)orte_node_pool->addr;
|
||||
|
||||
/* parse the nodes to process any relative node directives */
|
||||
item2 = opal_list_get_first(nodes);
|
||||
while (item2 != opal_list_get_end(nodes)) {
|
||||
orte_node_t *node=(orte_node_t*)item2;
|
||||
|
||||
/* save the next location in case this one gets removed */
|
||||
item1 = opal_list_get_next(item2);
|
||||
|
||||
if ('+' != node->name[0]) {
|
||||
item2 = item1;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* see if we specified empty nodes */
|
||||
if ('e' == node->name[1] ||
|
||||
'E' == node->name[1]) {
|
||||
/* request for empty nodes - do they want
|
||||
* all of them?
|
||||
*/
|
||||
if (NULL != (cptr = strchr(node->name, ':'))) {
|
||||
/* the colon indicates a specific # are requested */
|
||||
cptr++; /* step past : */
|
||||
num_empty = strtol(cptr, NULL, 10);
|
||||
} else {
|
||||
/* want them all - set num_empty to max */
|
||||
num_empty = INT_MAX;
|
||||
want_all_empty = true;
|
||||
}
|
||||
/* insert empty nodes into newnodes list in place of the current item.
|
||||
* since item1 is the next item, we insert in front of it
|
||||
*/
|
||||
if (!orte_hnp_is_allocated && 0 == startempty) {
|
||||
startempty = 1;
|
||||
}
|
||||
for (i=startempty; 0 < num_empty && i < orte_node_pool->size && NULL != nodepool[i]; i++) {
|
||||
if (0 == nodepool[i]->slots_inuse) {
|
||||
newnode = OBJ_NEW(orte_node_t);
|
||||
newnode->name = strdup(nodepool[i]->name);
|
||||
/* if the slot count here is less than the
|
||||
* total slots avail on this node, set it
|
||||
* to the specified count - this allows people
|
||||
* to subdivide an allocation
|
||||
*/
|
||||
if (node->slots < nodepool[i]->slots) {
|
||||
newnode->slots_alloc = node->slots;
|
||||
} else {
|
||||
newnode->slots_alloc = nodepool[i]->slots;
|
||||
}
|
||||
opal_list_insert_pos(nodes, item1, &newnode->super);
|
||||
/* track number added */
|
||||
--num_empty;
|
||||
}
|
||||
}
|
||||
/* bookmark where we stopped in case they ask for more */
|
||||
startempty = i;
|
||||
/* did they get everything they wanted? */
|
||||
if (!want_all_empty && 0 < num_empty) {
|
||||
orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
|
||||
true, num_empty);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
/* since we have expanded the provided node, remove
|
||||
* it from list
|
||||
*/
|
||||
opal_list_remove_item(nodes, item2);
|
||||
OBJ_RELEASE(item2);
|
||||
} else if ('n' == node->name[1] ||
|
||||
'N' == node->name[1]) {
|
||||
/* they want a specific relative node #, so
|
||||
* look it up on global pool
|
||||
*/
|
||||
nodeidx = strtol(&node->name[2], NULL, 10);
|
||||
if (nodeidx < 0 ||
|
||||
nodeidx > (int)orte_node_pool->size) {
|
||||
/* this is an error */
|
||||
orte_show_help("help-hostfile.txt", "hostfile:relative-node-out-of-bounds",
|
||||
true, nodeidx, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
/* if the HNP is not allocated, then we need to
|
||||
* adjust the index as the node pool is offset
|
||||
* by one
|
||||
*/
|
||||
if (!orte_hnp_is_allocated) {
|
||||
nodeidx++;
|
||||
}
|
||||
/* see if that location is filled */
|
||||
|
||||
if (NULL == nodepool[nodeidx]) {
|
||||
/* this is an error */
|
||||
orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
|
||||
true, nodeidx, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
/* create the node object */
|
||||
newnode = OBJ_NEW(orte_node_t);
|
||||
newnode->name = strdup(nodepool[nodeidx]->name);
|
||||
/* if the slot count here is less than the
|
||||
* total slots avail on this node, set it
|
||||
* to the specified count - this allows people
|
||||
* to subdivide an allocation
|
||||
*/
|
||||
if (node->slots < nodepool[nodeidx]->slots) {
|
||||
newnode->slots_alloc = node->slots;
|
||||
} else {
|
||||
newnode->slots_alloc = nodepool[nodeidx]->slots;
|
||||
}
|
||||
/* insert it before item1 */
|
||||
opal_list_insert_pos(nodes, item1, &newnode->super);
|
||||
/* since we have expanded the provided node, remove
|
||||
* it from list
|
||||
*/
|
||||
opal_list_remove_item(nodes, item2);
|
||||
OBJ_RELEASE(item2);
|
||||
} else {
|
||||
/* invalid relative node syntax */
|
||||
orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
|
||||
true, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* move to next */
|
||||
item2 = item1;
|
||||
}
|
||||
|
||||
/* remove from the list of nodes those that are in the exclude list */
|
||||
while(NULL != (item = opal_list_remove_first(&exclude))) {
|
||||
orte_node_t *exnode = (orte_node_t*)item;
|
||||
|
@ -71,5 +71,6 @@ extern orte_hostfile_value_t orte_util_hostfile_value;
|
||||
#define ORTE_HOSTFILE_NEWLINE 13
|
||||
#define ORTE_HOSTFILE_IPV6 14
|
||||
#define ORTE_HOSTFILE_SLOT 15
|
||||
#define ORTE_HOSTFILE_RELATIVE 16
|
||||
|
||||
#endif
|
||||
|
@ -120,6 +120,12 @@ username { orte_util_hostfile_value.sval = yytext;
|
||||
"user_name" { orte_util_hostfile_value.sval = yytext;
|
||||
return ORTE_HOSTFILE_USERNAME; }
|
||||
|
||||
\+n[0-9]+ { orte_util_hostfile_value.sval = yytext;
|
||||
return ORTE_HOSTFILE_RELATIVE; }
|
||||
\+[eE][\:][0-9]+ { orte_util_hostfile_value.sval = yytext;
|
||||
return ORTE_HOSTFILE_RELATIVE; }
|
||||
|
||||
|
||||
[0-9]+ { orte_util_hostfile_value.ival = atol(yytext);
|
||||
return ORTE_HOSTFILE_INT; }
|
||||
%{ /* First detect hosts as standard Strings (but without ".")
|
||||
|
@ -62,6 +62,132 @@ order as in the prior table, with the RM providing the initial pool of nodes.
|
||||
.sp
|
||||
.
|
||||
.\" **************************
|
||||
.\" Relative Indexing
|
||||
.\" **************************
|
||||
.SH RELATIVE INDEXING
|
||||
.
|
||||
.PP
|
||||
Once an initial allocation has been specified (whether by an RM, default hostfile, or hostfile),
|
||||
subsequent hostfile and -host specifications can be made using relative indexing. This allows a
|
||||
user to stipulate which hosts are to be used for a given app_context without specifying the
|
||||
particular host name, but rather its relative position in the allocation.
|
||||
.sp
|
||||
This can probably best be understood through consideration of a few examples. Consider the case
|
||||
where an RM has allocated a set of nodes to the user named "foo1, foo2, foo3, foo4". The user
|
||||
wants the first app_context to have exclusive use of the first two nodes, and a second app_context
|
||||
to use the last two nodes. Of course, the user could print out the allocation to find the names
|
||||
of the nodes allocated to them and then use -host to specify this layout, but this is cumbersome
|
||||
and would require hand-manipulation for every invocation.
|
||||
.sp
|
||||
A simpler method is to utilize OpenRTE's relative indexing capability to specify the desired
|
||||
layout. In this case, a command line of:
|
||||
.sp
|
||||
mpirun -pernode -host +n1,+n2 ./app1 : -host +n3,+n4 ./app2
|
||||
.sp
|
||||
.PP
|
||||
would provide the desired pattern. The "+" syntax indicates that the information is being
|
||||
provided as a relative index to the existing allocation. Two methods of relative indexing
|
||||
are supported:
|
||||
.sp
|
||||
.TP
|
||||
.B +n<#>
|
||||
A relative index into the allocation referencing the <#> node. OpenRTE will substitute
|
||||
the <#> node in the allocation for this reference.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B +e[:<#>]
|
||||
A request for <#> empty nodes - i.e., OpenRTE is to substitute this reference with
|
||||
<#> nodes that have not yet been used by any other app_context. If the ":<#>" is not
|
||||
provided, OpenRTE will substitute the reference with all empty nodes. Note that OpenRTE
|
||||
does track the empty nodes that have been assigned in this manner, so multiple
|
||||
uses of this option will result in assignment of unique nodes up to the limit of the
|
||||
available empty nodes. Requests for more empty nodes than are available will generate
|
||||
an error.
|
||||
.sp
|
||||
.PP
|
||||
Relative indexing can be combined with absolute naming of hosts in any arbitrary manner,
|
||||
and can be used in hostfiles as well as with the -host command line option. In addition,
|
||||
any slot specification provided in hostfiles will be respected - thus, a user can specify
|
||||
that only a certain number of slots from a relative indexed host are to be used for a
|
||||
given app_context.
|
||||
.sp
|
||||
Another example may help illustrate this point. Consider the case where a user has a default
|
||||
hostfile containing:
|
||||
.sp
|
||||
.nf
|
||||
dummy1 slots=4
|
||||
dummy2 slots=4
|
||||
dummy3 slots=4
|
||||
dummy4 slots=4
|
||||
dummy5 slots=4
|
||||
.fi
|
||||
.sp
|
||||
.PP
|
||||
This may, for example, be a hostfile that describes a set of commonly-used resources that
|
||||
the user wishes to execute applications against. For this particular application, the user
|
||||
plans to map byslot, and wants the first two ranks to be on the second node of any allocation,
|
||||
the next ranks to land on an empty node, have one rank specifically on dummy4, the next rank
|
||||
to be on the second node of the allocation again, and finally any remaining ranks to be on
|
||||
whatever empty nodes are left. To accomplish this, the user provides a hostfile of:
|
||||
.sp
|
||||
.nf
|
||||
+n2 slots=2
|
||||
+e:1
|
||||
dummy4 slots=1
|
||||
+n2
|
||||
+e
|
||||
.fi
|
||||
.sp
|
||||
.PP
|
||||
The user can now use this information in combination with OpenRTE's sequential mapper to
|
||||
obtain their specific layout:
|
||||
.sp
|
||||
.nf
|
||||
mpirun --default-hostfile dummyhosts -hostfile mylayout -mca rmaps seq ./my_app
|
||||
.fi
|
||||
.sp
|
||||
.PP
|
||||
which will result in:
|
||||
.nf
|
||||
.sp
|
||||
rank0 being mapped to dummy3
|
||||
.br
|
||||
rank1 to dummy1 as the first empty node
|
||||
.br
|
||||
rank2 to dummy4
|
||||
.br
|
||||
rank3 to dummy3
|
||||
.br
|
||||
rank4 to dummy2 and rank5 to dummy5 as the last remaining unused nodes
|
||||
.sp
|
||||
.fi
|
||||
Note that the sequential mapper ignores the number of slots arguments as it only
|
||||
maps one rank at a time to each node in the list.
|
||||
.sp
|
||||
If the default round-robin mapper had been used, then the mapping would have resulted in:
|
||||
.sp
|
||||
.nf
|
||||
ranks 0 and 1 being mapped to dummy3 since two slots were specified
|
||||
.br
|
||||
ranks 2-5 on dummy1 as the first empty node, which has four slots
|
||||
.br
|
||||
rank6 on dummy4 since the hostfile specifies only a single slot from that node is to be used
|
||||
.br
|
||||
ranks 7 and 8 on dummy3 since only two slots remain available
|
||||
.br
|
||||
ranks 9-12 on dummy2 since it is the next available empty node and has four slots
|
||||
.br
|
||||
ranks 13-16 on dummy5 since it is the last remaining unused node and has four slots
|
||||
.fi
|
||||
.sp
|
||||
.PP
|
||||
Thus, the use of relative indexing can allow for complex mappings to be ported across
|
||||
allocations, including those obtained from automated resource managers, without the need
|
||||
for manual manipulation of scripts and/or command lines.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" See Also Section
|
||||
.\" **************************
|
||||
.
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user