1
1

Add relative indexing capabilities for hostfile and -host - we can now reference hosts using a relative syntax.

See the orte_hosts manpage for an explanation

This commit was SVN r19364.
Этот коммит содержится в:
Ralph Castain 2008-08-19 15:16:27 +00:00
родитель ef2bb46e45
Коммит 6d82efba21
7 изменённых файлов: 649 добавлений и 49 удалений

Просмотреть файл

@ -32,7 +32,10 @@
#include "dash_host.h"
/* we can only enter this routine if no other allocation
* was found, so we only need to know that finding any
* relative node syntax should generate an immediate error
*/
int orte_util_add_dash_host_nodes(opal_list_t *nodes,
bool *override_oversubscribed,
char ** host_argv)
@ -71,6 +74,17 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
bump the slots count for each duplicate */
for (i = 0; NULL != mapped_nodes[i]; ++i) {
/* if the specified node contains a relative node syntax,
* this is an error
*/
if ('+' == mapped_nodes[i][0]) {
orte_show_help("help-dash-host.txt", "dash-host:relative-syntax",
true, mapped_nodes[i]);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* see if the node is already on the list */
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
@ -127,16 +141,23 @@ cleanup:
}
/* the -host option can always be used in both absolute
* and relative mode, so we have to check for pre-existing
* allocations if we are to use relative node syntax
*/
int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
char** host_argv)
{
opal_list_item_t* item;
bool found;
opal_list_item_t *next;
orte_std_cntr_t i, j, k;
orte_std_cntr_t i, j, k, len_mapped_node=0;
int rc;
char **mapped_nodes = NULL, **mini_map;
orte_node_t *node;
char **mapped_nodes = NULL, **mini_map, *cptr;
orte_node_t *node, **nodepool;
int nodeidx;
int num_empty=0;
bool want_all_empty = false;
/* if the incoming node list is empty, then there
* is nothing to filter!
@ -145,36 +166,89 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
return ORTE_SUCCESS;
}
/* setup for relative node syntax */
nodepool = (orte_node_t**)orte_node_pool->addr;
/* Accumulate all of the host name mappings */
for (j = 0; j < opal_argv_count(host_argv); ++j) {
mini_map = opal_argv_split(host_argv[j], ',');
if (mapped_nodes == NULL) {
mapped_nodes = mini_map;
} else {
for (k = 0; NULL != mini_map[k]; ++k) {
rc = opal_argv_append_nosize(&mapped_nodes,
mini_map[k]);
if (OPAL_SUCCESS != rc) {
for (k = 0; NULL != mini_map[k]; ++k) {
if ('+' == mini_map[k][0]) {
/* see if we specified empty nodes */
if ('e' == mini_map[k][1] ||
'E' == mini_map[k][1]) {
/* request for empty nodes - do they want
* all of them?
*/
if (NULL != (cptr = strchr(mini_map[k], ':'))) {
/* the colon indicates a specific # are requested */
cptr++; /* step past : */
num_empty += strtol(cptr, NULL, 10);
} else {
/* want them all - set num_empty to max */
num_empty = INT_MAX;
want_all_empty = true;
}
} else if ('n' == mini_map[k][1] ||
'N' == mini_map[k][1]) {
/* they want a specific relative node #, so
* look it up on global pool
*/
nodeidx = strtol(&mini_map[k][2], NULL, 10);
if (nodeidx < 0 ||
nodeidx > (int)orte_node_pool->size) {
/* this is an error */
orte_show_help("help-dash-host.txt", "dash-host:relative-node-out-of-bounds",
true, nodeidx, mini_map[k]);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* if the HNP is not allocated, then we need to
* adjust the index as the node pool is offset
* by one
*/
if (!orte_hnp_is_allocated) {
nodeidx++;
}
/* see if that location is filled */
if (NULL == nodepool[nodeidx]) {
/* this is an error */
orte_show_help("help-dash-host.txt", "dash-host:relative-node-not-found",
true, nodeidx, mini_map[k]);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* add this node to the list */
opal_argv_append_nosize(&mapped_nodes, nodepool[nodeidx]->name);
} else {
/* invalid relative node syntax */
orte_show_help("help-dash-host.txt", "dash-host:invalid-relative-node-syntax",
true, mini_map[k]);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
} else { /* non-relative syntax - add to list */
if (OPAL_SUCCESS != (rc = opal_argv_append_nosize(&mapped_nodes,
mini_map[k]))) {
goto cleanup;
}
}
opal_argv_free(mini_map);
}
opal_argv_free(mini_map);
}
/* Did we find anything? If not, then do nothing */
if (NULL == mapped_nodes) {
if (NULL == mapped_nodes && 0 == num_empty) {
return ORTE_SUCCESS;
}
/* we found some info - filter what is on the list...
* i.e., go through the list and remove any nodes that
* were -not- included on the -host list
*/
j=0;
k = opal_argv_count(mapped_nodes);
len_mapped_node = opal_argv_count(mapped_nodes);
item = opal_list_get_first(nodes);
while (item != opal_list_get_end(nodes)) {
/* hang on to next item in case this one gets removed */
@ -182,25 +256,33 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
node = (orte_node_t*)item;
/* search -host list to see if this one is found */
found = false;
for (i = 0; NULL != mapped_nodes[i]; ++i) {
for (i = 0; i < len_mapped_node; ++i) {
/* we have a match if one of two conditions is met:
* 1. the node_name and mapped_nodes directly match
* 2. the node_name is the local system name AND
* either the mapped_node is "localhost" OR it
* is a local interface as found by opal_ifislocal
*/
if (0 == strcmp(node->name, mapped_nodes[i]) ||
(0 == strcmp(node->name, orte_process_info.nodename) &&
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
* 1. the node_name and mapped_nodes directly match
* 2. the node_name is the local system name AND
* either the mapped_node is "localhost" OR it
* is a local interface as found by opal_ifislocal
*/
if (NULL != mapped_nodes[i] &&
(0 == strcmp(node->name, mapped_nodes[i]) ||
(0 == strcmp(node->name, orte_process_info.nodename) &&
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i]))))) {
found = true; /* found it - leave it alone */
j++;
/* keep cycling here in case there are multiple instances
* of the node on the mapped_node array - this will
* allow us to properly account for them all so we don't
* think something was specified but wasn't found
*/
free(mapped_nodes[i]);
mapped_nodes[i] = NULL;
}
}
if (!found) {
/* if this node wasn't found on the list of explicitly called-out
* nodes, see if we wanted empty nodes and if this one is empty
*/
if (0 < num_empty && 0 == node->slots_inuse) {
/* both true - keep this one */
found = true;
--num_empty;
}
}
if (!found) {
opal_list_remove_item(nodes, item);
OBJ_RELEASE(item);
@ -209,22 +291,34 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
}
/* was something specified that was -not- found? */
if (j < k) {
char *tmp;
tmp = opal_argv_join(mapped_nodes, ',');
orte_show_help("help-dash-host.txt", "not-all-mapped-alloc",
true, tmp);
free(tmp);
for (i=0; i < len_mapped_node; i++) {
if (NULL != mapped_nodes[i]) {
orte_show_help("help-dash-host.txt", "not-all-mapped-alloc",
true, mapped_nodes[i]);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
}
/* did they ask for more than we could provide */
if (!want_all_empty && 0 < num_empty) {
orte_show_help("help-dash-host.txt", "dash-host:not-enough-empty",
true, num_empty);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
rc = ORTE_SUCCESS;
/* done filtering existing list */
cleanup:
for (i=0; i < len_mapped_node; i++) {
if (NULL != mapped_nodes[i]) {
free(mapped_nodes[i]);
mapped_nodes[i] = NULL;
}
}
if (NULL != mapped_nodes) {
opal_argv_free(mapped_nodes);
free(mapped_nodes);
}
return rc;

Просмотреть файл

@ -20,9 +20,49 @@
#
[not-all-mapped-alloc]
Some of the requested hosts are not included in the current allocation.
At least one of the requested hosts is not included in the current allocation.
The requested hosts were specified with --host as:
%s
Missing requested host: %s
Please check your allocation or your request.
#
[dash-host:relative-syntax]
A relative host was specified, but no prior allocation has been made.
Thus, there is no way to determine the proper host to be used.
-host: %s
Please see the orte_hosts man page for further information.
#
[dash-host:relative-node-not-found]
A relative host was specified, but was not found. The requested host was
specified with --host as:
Index: %d
Syntax given: %s
Please see the orte_hosts man page for further information.
#
[dash-host:relative-node-out-of-bounds]
A relative host was specified, but the index given is beyond the number
of hosts in the current allocation:
Index: %d
#hosts: %d
You could obtain a larger allocation or reduce the relative host index.
Please see the orte_hosts man page for further information.
#
[dash-host:invalid-relative-node-syntax]
A relative host was improperly specified - the value provided was:
-host: %s
You may have forgotten to preface a node index with 'N' or 'n', or to use 'e' or 'E' to indicate
empty nodes. Please see the orte_hosts man page for further information.
#
[dash-host:not-enough-empty]
The requested number of empty hosts was not available - the system was short by %d hosts.
Please recheck your allocation - further information is available on the
orte_hosts man page.

Просмотреть файл

@ -60,3 +60,44 @@ The requested hosts were in this hostfile:
Please verify that you have specified the allocated resources properly in
the provided hostfile.
#
[hostfile:relative-syntax]
A relative host was specified, but no prior allocation has been made.
Thus, there is no way to determine the proper host to be used.
hostfile entry: %s
Please see the orte_hosts man page for further information.
#
[hostfile:relative-node-not-found]
A relative host was specified, but was not found. The requested host was
specified in the hostfile as:
Index: %d
Syntax given: %s
Please see the orte_hosts man page for further information.
#
[hostfile:relative-node-out-of-bounds]
A relative host was specified, but the index given is beyond the number
of hosts in the current allocation:
Index: %d
#hosts: %d
You could obtain a larger allocation or reduce the relative host index.
Please see the orte_hosts man page for further information.
#
[hostfile:invalid-relative-node-syntax]
A relative host was improperly specified - the value provided was:
hostfile entry: %s
You may have forgotten to preface a node index with 'N' or 'n', or to use 'e' or 'E' to indicate
empty nodes. Please see the orte_hosts man page for further information.
#
[hostfile:not-enough-empty]
The requested number of empty hosts was not available - the system was short by %d hosts.
Please recheck your allocation - further information is available on the
orte_hosts man page.

Просмотреть файл

@ -223,6 +223,10 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
node = OBJ_NEW(orte_node_t);
node->name = node_name;
}
} else if (ORTE_HOSTFILE_RELATIVE == token) {
/* store this for later processing */
node = OBJ_NEW(orte_node_t);
node->name = strdup(orte_util_hostfile_value.sval);
} else {
hostfile_parse_error(token);
return ORTE_ERROR;
@ -363,6 +367,13 @@ static int hostfile_parse(const char *hostfile, opal_list_t* updates, opal_list_
}
break;
case ORTE_HOSTFILE_RELATIVE:
rc = hostfile_parse_line(token, updates, exclude, keep_all);
if (ORTE_SUCCESS != rc) {
goto unlock;
}
break;
default:
hostfile_parse_error(token);
goto unlock;
@ -403,6 +414,20 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
goto cleanup;
}
/* parse the nodes to check for any relative node directives */
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
orte_node_t *node=(orte_node_t*)item;
if ('+' == node->name[0]) {
orte_show_help("help-hostfile.txt", "hostfile:relative-syntax",
true, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
}
/* remove from the list of nodes those that are in the exclude list */
while(NULL != (item = opal_list_remove_first(&exclude))) {
orte_node_t *exnode = (orte_node_t*)item;
@ -447,8 +472,12 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
opal_list_item_t *item1, *item2;
orte_node_t *node_from_list, *node_from_file;
bool node_found;
int rc;
int rc = ORTE_SUCCESS;
char *cptr;
int num_empty, nodeidx, i, startempty=0;
orte_node_t **nodepool, *newnode;
bool want_all_empty = false;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s hostfile: filtering nodes through hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
@ -461,14 +490,140 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
return rc;
}
/* remove from the list of newnodes those that are in the exclude list */
while(NULL != (item1 = opal_list_remove_first(&exclude))) {
/* setup for relative node syntax */
nodepool = (orte_node_t**)orte_node_pool->addr;
/* parse the newnodes to process any relative node directives */
item2 = opal_list_get_first(&newnodes);
while (item2 != opal_list_get_end(&newnodes)) {
orte_node_t *node=(orte_node_t*)item2;
/* save the next location in case this one gets removed */
item1 = opal_list_get_next(item2);
if ('+' != node->name[0]) {
item2 = item1;
continue;
}
/* see if we specified empty nodes */
if ('e' == node->name[1] ||
'E' == node->name[1]) {
/* request for empty nodes - do they want
* all of them?
*/
if (NULL != (cptr = strchr(node->name, ':'))) {
/* the colon indicates a specific # are requested */
cptr++; /* step past : */
num_empty = strtol(cptr, NULL, 10);
} else {
/* want them all - set num_empty to max */
num_empty = INT_MAX;
want_all_empty = true;
}
/* add empty nodes to newnodes list */
if (!orte_hnp_is_allocated && 0 == startempty) {
startempty = 1;
}
for (i=startempty; 0 < num_empty && i < orte_node_pool->size && NULL != nodepool[i]; i++) {
if (0 == nodepool[i]->slots_inuse) {
if (NULL == (newnode = hostfile_lookup(&newnodes, nodepool[i]->name))) {
newnode = OBJ_NEW(orte_node_t);
newnode->name = strdup(nodepool[i]->name);
/* keep track of slots allocated in hostfile, if any - we
* will use this down below to determine how many slots
* to actually allocate
*/
newnode->slots = node->slots;
}
/* even if we didn't create it, we have to put it back on list
* in a manner that preserves order
*/
opal_list_insert_pos(&newnodes, item1, &newnode->super);
/* track number added */
--num_empty;
}
}
/* bookmark where we stopped in case they ask for more */
startempty = i;
/* did they get everything they wanted? */
if (!want_all_empty && 0 < num_empty) {
orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
true, num_empty);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* since we have expanded the provided node, remove
* it from list
*/
opal_list_remove_item(&newnodes, item2);
OBJ_RELEASE(item2);
} else if ('n' == node->name[1] ||
'N' == node->name[1]) {
/* they want a specific relative node #, so
* look it up on global pool
*/
nodeidx = strtol(&node->name[2], NULL, 10);
if (nodeidx < 0 ||
nodeidx > (int)orte_node_pool->size) {
/* this is an error */
orte_show_help("help-hostfile.txt", "hostfile:relative-node-out-of-bounds",
true, nodeidx, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* if the HNP is not allocated, then we need to
* adjust the index as the node pool is offset
* by one
*/
if (!orte_hnp_is_allocated) {
nodeidx++;
}
/* see if that location is filled */
if (NULL == nodepool[nodeidx]) {
/* this is an error */
orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
true, nodeidx, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
if (NULL == (newnode = hostfile_lookup(&newnodes, nodepool[nodeidx]->name))) {
newnode = OBJ_NEW(orte_node_t);
newnode->name = strdup(nodepool[nodeidx]->name);
/* keep track of slots allocated in hostfile, if any - we
* will use this down below to determine how many slots
* to actually allocate
*/
newnode->slots = node->slots;
}
/* even if we didn't create it, we have to put it back on list */
opal_list_append(&newnodes, &newnode->super);
/* since we have expanded the provided node, remove
* it from list
*/
opal_list_remove_item(&newnodes, item2);
OBJ_RELEASE(item2);
} else {
/* invalid relative node syntax */
orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
true, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* move to next */
item2 = item1;
}
/* remove from the list of newnodes those that are in the exclude list,
 * since we could have added duplicate names above due to the
 * expansion of relative node syntax */
while (NULL != (item1 = opal_list_remove_first(&exclude))) {
node_from_file = (orte_node_t*)item1;
/* check for matches on nodes */
for (item2 = opal_list_get_first(&newnodes);
item2 != opal_list_get_end(&newnodes);
item2 = opal_list_get_next(item2)) {
orte_node_t *node=(orte_node_t*)item2;
orte_node_t *node = (orte_node_t*)item2;
if (0 == strcmp(node_from_file->name, node->name)) {
/* match - remove it */
opal_list_remove_item(&newnodes, item2);
@ -533,16 +688,21 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
return ORTE_ERR_SILENT;
}
cleanup:
OBJ_DESTRUCT(&newnodes);
return ORTE_SUCCESS;
return rc;
}
int orte_util_get_ordered_host_list(opal_list_t *nodes,
char *hostfile)
{
opal_list_t exclude;
opal_list_item_t *item, *itm;
opal_list_item_t *item, *itm, *item2, *item1;
char *cptr;
int num_empty, i, nodeidx, startempty=0;
bool want_all_empty;
orte_node_t **nodepool, *newnode;
int rc;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
@ -556,6 +716,138 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
goto cleanup;
}
/* setup to parse relative syntax */
nodepool = (orte_node_t**)orte_node_pool->addr;
/* parse the nodes to process any relative node directives */
item2 = opal_list_get_first(nodes);
while (item2 != opal_list_get_end(nodes)) {
orte_node_t *node=(orte_node_t*)item2;
/* save the next location in case this one gets removed */
item1 = opal_list_get_next(item2);
if ('+' != node->name[0]) {
item2 = item1;
continue;
}
/* see if we specified empty nodes */
if ('e' == node->name[1] ||
'E' == node->name[1]) {
/* request for empty nodes - do they want
* all of them?
*/
if (NULL != (cptr = strchr(node->name, ':'))) {
/* the colon indicates a specific # are requested */
cptr++; /* step past : */
num_empty = strtol(cptr, NULL, 10);
} else {
/* want them all - set num_empty to max */
num_empty = INT_MAX;
want_all_empty = true;
}
/* insert empty nodes into newnodes list in place of the current item.
* since item1 is the next item, we insert in front of it
*/
if (!orte_hnp_is_allocated && 0 == startempty) {
startempty = 1;
}
for (i=startempty; 0 < num_empty && i < orte_node_pool->size && NULL != nodepool[i]; i++) {
if (0 == nodepool[i]->slots_inuse) {
newnode = OBJ_NEW(orte_node_t);
newnode->name = strdup(nodepool[i]->name);
/* if the slot count here is less than the
* total slots avail on this node, set it
* to the specified count - this allows people
* to subdivide an allocation
*/
if (node->slots < nodepool[i]->slots) {
newnode->slots_alloc = node->slots;
} else {
newnode->slots_alloc = nodepool[i]->slots;
}
opal_list_insert_pos(nodes, item1, &newnode->super);
/* track number added */
--num_empty;
}
}
/* bookmark where we stopped in case they ask for more */
startempty = i;
/* did they get everything they wanted? */
if (!want_all_empty && 0 < num_empty) {
orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
true, num_empty);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* since we have expanded the provided node, remove
* it from list
*/
opal_list_remove_item(nodes, item2);
OBJ_RELEASE(item2);
} else if ('n' == node->name[1] ||
'N' == node->name[1]) {
/* they want a specific relative node #, so
* look it up on global pool
*/
nodeidx = strtol(&node->name[2], NULL, 10);
if (nodeidx < 0 ||
nodeidx > (int)orte_node_pool->size) {
/* this is an error */
orte_show_help("help-hostfile.txt", "hostfile:relative-node-out-of-bounds",
true, nodeidx, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* if the HNP is not allocated, then we need to
* adjust the index as the node pool is offset
* by one
*/
if (!orte_hnp_is_allocated) {
nodeidx++;
}
/* see if that location is filled */
if (NULL == nodepool[nodeidx]) {
/* this is an error */
orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
true, nodeidx, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* create the node object */
newnode = OBJ_NEW(orte_node_t);
newnode->name = strdup(nodepool[nodeidx]->name);
/* if the slot count here is less than the
* total slots avail on this node, set it
* to the specified count - this allows people
* to subdivide an allocation
*/
if (node->slots < nodepool[nodeidx]->slots) {
newnode->slots_alloc = node->slots;
} else {
newnode->slots_alloc = nodepool[nodeidx]->slots;
}
/* insert it before item1 */
opal_list_insert_pos(nodes, item1, &newnode->super);
/* since we have expanded the provided node, remove
* it from list
*/
opal_list_remove_item(nodes, item2);
OBJ_RELEASE(item2);
} else {
/* invalid relative node syntax */
orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
true, node->name);
rc = ORTE_ERR_SILENT;
goto cleanup;
}
/* move to next */
item2 = item1;
}
/* remove from the list of nodes those that are in the exclude list */
while(NULL != (item = opal_list_remove_first(&exclude))) {
orte_node_t *exnode = (orte_node_t*)item;

Просмотреть файл

@ -71,5 +71,6 @@ extern orte_hostfile_value_t orte_util_hostfile_value;
#define ORTE_HOSTFILE_NEWLINE 13
#define ORTE_HOSTFILE_IPV6 14
#define ORTE_HOSTFILE_SLOT 15
#define ORTE_HOSTFILE_RELATIVE 16
#endif

Просмотреть файл

@ -120,6 +120,12 @@ username { orte_util_hostfile_value.sval = yytext;
"user_name" { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_USERNAME; }
\+[nN][0-9]+ { /* relative node index such as "+n2"; the hostfile
                * parser accepts either 'n' or 'N' after the '+',
                * so the scanner must tokenize both */
               orte_util_hostfile_value.sval = yytext;
               return ORTE_HOSTFILE_RELATIVE; }
\+[eE]([\:][0-9]+)? { /* empty-node request: a bare "+e" asks for all
                * empty nodes, "+e:<#>" asks for a specific count.
                * The ":<#>" suffix is optional so that the bare form
                * reaches the parser instead of failing to tokenize */
               orte_util_hostfile_value.sval = yytext;
               return ORTE_HOSTFILE_RELATIVE; }
[0-9]+ { orte_util_hostfile_value.ival = atol(yytext);
return ORTE_HOSTFILE_INT; }
%{ /* First detect hosts as standard Strings (but without ".")

Просмотреть файл

@ -62,6 +62,132 @@ order as in the prior table, with the RM providing the initial pool of nodes.
.sp
.
.\" **************************
.\" Relative Indexing
.\" **************************
.SH RELATIVE INDEXING
.
.PP
Once an initial allocation has been specified (whether by an RM, default hostfile, or hostfile),
subsequent hostfile and -host specifications can be made using relative indexing. This allows a
user to stipulate which hosts are to be used for a given app_context without specifying the
particular host name, but rather its relative position in the allocation.
.sp
This can probably best be understood through consideration of a few examples. Consider the case
where an RM has allocated a set of nodes to the user named "foo1, foo2, foo3, foo4". The user
wants the first app_context to have exclusive use of the first two nodes, and a second app_context
to use the last two nodes. Of course, the user could print out the allocation to find the names
of the nodes allocated to them and then use -host to specify this layout, but this is cumbersome
and would require hand-manipulation for every invocation.
.sp
A simpler method is to utilize OpenRTE's relative indexing capability to specify the desired
layout. In this case, a command line of:
.sp
mpirun -pernode -host +n1,+n2 ./app1 : -host +n3,+n4 ./app2
.sp
.PP
would provide the desired pattern. The "+" syntax indicates that the information is being
provided as a relative index to the existing allocation. Two methods of relative indexing
are supported:
.sp
.TP
.B +n<#>
A relative index into the allocation referencing the <#>th node. OpenRTE will substitute
the name of the <#>th node in the allocation.
.
.
.TP
.B +e[:<#>]
A request for <#> empty nodes - i.e., OpenRTE is to substitute this reference with
<#> nodes that have not yet been used by any other app_context. If the ":<#>" is not
provided, OpenRTE will substitute the reference with all empty nodes. Note that OpenRTE
does track the empty nodes that have been assigned in this manner, so multiple
uses of this option will result in assignment of unique nodes up to the limit of the
available empty nodes. Requests for more empty nodes than are available will generate
an error.
.sp
.PP
Relative indexing can be combined with absolute naming of hosts in any arbitrary manner,
and can be used in hostfiles as well as with the -host command line option. In addition,
any slot specification provided in hostfiles will be respected - thus, a user can specify
that only a certain number of slots from a relative indexed host are to be used for a
given app_context.
.sp
Another example may help illustrate this point. Consider the case where a user has a default
hostfile containing:
.sp
.nf
dummy1 slots=4
dummy2 slots=4
dummy3 slots=4
dummy4 slots=4
dummy5 slots=4
.fi
.sp
.PP
This may, for example, be a hostfile that describes a set of commonly-used resources that
the user wishes to execute applications against. For this particular application, the user
plans to map byslot, and wants the first two ranks to be on the third node of any allocation
(relative indices count from zero, so this is +n2), the next ranks to land on an empty node,
one rank to be specifically on dummy4, the next rank to be on the third node of the allocation
again, and finally any remaining ranks to be on whatever empty nodes are left. To accomplish
this, the user provides a hostfile of:
.sp
.nf
+n2 slots=2
+e:1
dummy4 slots=1
+n2
+e
.fi
.sp
.PP
The user can now use this information in combination with OpenRTE's sequential mapper to
obtain their specific layout:
.sp
.nf
mpirun --default-hostfile dummyhosts -hostfile mylayout -mca rmaps seq ./my_app
.fi
.sp
.PP
which will result in:
.nf
.sp
rank0 being mapped to dummy3
.br
rank1 to dummy1 as the first empty node
.br
rank2 to dummy4
.br
rank3 to dummy3
.br
rank4 to dummy2 and rank5 to dummy5 as the last remaining unused nodes
.sp
.fi
Note that the sequential mapper ignores the slots arguments, as it only
maps one rank at a time to each node in the list.
.sp
If the default round-robin mapper had been used, then the mapping would have resulted in:
.sp
.nf
ranks 0 and 1 being mapped to dummy3 since two slots were specified
.br
ranks 2-5 on dummy1 as the first empty node, which has four slots
.br
rank6 on dummy4 since the hostfile specifies only a single slot from that node is to be used
.br
ranks 7 and 8 on dummy3 since only two slots remain available
.br
ranks 9-12 on dummy2 since it is the next available empty node and has four slots
.br
ranks 13-16 on dummy5 since it is the last remaining unused node and has four slots
.fi
.sp
.PP
Thus, the use of relative indexing can allow for complex mappings to be ported across
allocations, including those obtained from automated resource managers, without the need
for manual manipulation of scripts and/or command lines.
.
.
.\" **************************
.\" See Also Section
.\" **************************
.