1
1

Resolve an unexpected behavior in hostfile allocations. Now that we filter allocations to determine what will be used for mapping, let the initial global pool be the union of nodes from all sources (default hostfile, hostfiles, and dash-hosts). Each app will filter down to only those specified for it using its own hostfile and dash-host options.

cmr=v1.7.4:reviewer=jsquyres:subject=Resolve an unexpected behavior in hostfile allocations

This commit was SVN r30040.
Этот коммит содержится в:
Ralph Castain 2013-12-21 01:38:27 +00:00
родитель 6f6c3cc21c
Коммит 9c768df8b8
4 изменённых файлов: 113 добавлений и 65 удалений

Просмотреть файл

@ -52,13 +52,12 @@
/* function to display allocation */
void orte_ras_base_display_alloc(void)
{
char *tmp=NULL, *tmp2, *tmp3, *pfx=NULL;
char *tmp=NULL, *tmp2, *tmp3;
int i, istart;
orte_node_t *alloc;
if (orte_xml_output) {
asprintf(&tmp, "<allocation>\n");
pfx = "\t";
} else {
asprintf(&tmp, "\n====================== ALLOCATED NODES ======================\n");
}
@ -71,7 +70,16 @@ void orte_ras_base_display_alloc(void)
if (NULL == (alloc = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
opal_dss.print(&tmp2, pfx, alloc, ORTE_NODE);
if (orte_xml_output) {
/* need to create the output in XML format */
asprintf(&tmp2, "\t<host name=\"%s\" slots=\"%d\" max_slots=\"%d\" slots_inuse=\"%d\">\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
} else {
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
}
if (NULL == tmp) {
tmp = tmp2;
} else {
@ -85,7 +93,7 @@ void orte_ras_base_display_alloc(void)
fprintf(orte_xml_fp, "%s</allocation>\n", tmp);
fflush(orte_xml_fp);
} else {
opal_output(orte_clean_output, "%s\n\n=================================================================\n", tmp);
opal_output(orte_clean_output, "%s=================================================================\n", tmp);
}
free(tmp);
}
@ -103,7 +111,6 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
orte_std_cntr_t i;
orte_app_context_t *app;
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
bool default_hostfile_used;
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:allocate",
@ -229,17 +236,31 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
* the resulting list contains the UNION of all nodes specified
* in hostfiles from across all app_contexts
*
* Any app that has no hostfile but has a dash-host, will have
* those nodes added to the list
*
* Any app that fails to have a hostfile or a dash-host will be given the
* default hostfile, if we have it
* We then continue to add any hosts provided by dash-host and
* the default hostfile, if we have it. We will then filter out
* all the non-desired hosts (i.e., those not specified by
* -host and/or -hostfile) when we start the mapping process
*
* Note that any relative node syntax found in the hostfiles will
* generate an error in this scenario, so only non-relative syntax
* can be present
*/
default_hostfile_used = false;
if (NULL != orte_default_hostfile) {
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:allocate parsing default hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_default_hostfile));
/* a default hostfile was provided - parse it */
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
orte_default_hostfile))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&nodes);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
return;
}
}
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
@ -279,25 +300,6 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
OBJ_RELEASE(caddy);
return;
}
} else if (!default_hostfile_used) {
if (NULL != orte_default_hostfile) {
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:allocate parsing default hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_default_hostfile));
/* a default hostfile was provided - parse it */
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
orte_default_hostfile))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&nodes);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
return;
}
}
/* only look at it once */
default_hostfile_used = true;
}
}

Просмотреть файл

@ -339,9 +339,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
*output = tmp;
return ORTE_SUCCESS;
}
asprintf(&tmp, "\n%sData for node: %s\tNum procs: %ld",
asprintf(&tmp, "\n%sData for node: %s\tNum slots: %ld\tMax slots: %ld\tNum procs: %ld",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name,
(long)src->num_procs);
(long)src->slots, (long)src->slots_max, (long)src->num_procs);
/* does this node have any aliases? */
if (NULL != src->alias) {
for (i=0; NULL != src->alias[i]; i++) {

Просмотреть файл

@ -44,11 +44,15 @@
int orte_util_add_dash_host_nodes(opal_list_t *nodes,
char ** host_argv)
{
opal_list_item_t* item;
opal_list_item_t *item, *itm;
orte_std_cntr_t i, j, k;
int rc;
char **mapped_nodes = NULL, **mini_map;
orte_node_t *node;
orte_node_t *node, *nd;
opal_list_t adds;
bool found;
OBJ_CONSTRUCT(&adds, opal_list_t);
/* Accumulate all of the host name mappings */
for (j = 0; j < opal_argv_count(host_argv); ++j) {
@ -70,7 +74,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
/* Did we find anything? If not, then do nothing */
if (NULL == mapped_nodes) {
return ORTE_SUCCESS;
rc = ORTE_SUCCESS;
goto cleanup;
}
/* go through the names found and
@ -89,13 +94,12 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
}
/* see if the node is already on the list */
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
node = (orte_node_t*) item;
found = false;
OPAL_LIST_FOREACH(node, &adds, orte_node_t) {
if (0 == strcmp(node->name, mapped_nodes[i]) ||
(0 == strcmp(node->name, orte_process_info.nodename) &&
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
found = true;
++node->slots;
/* the dash-host option presumes definition of num_slots */
node->slots_given = true;
@ -104,8 +108,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
}
/* If we didn't find it, add it to the list */
if (item == opal_list_get_end(nodes)) {
if (!found) {
node = OBJ_NEW(orte_node_t);
if (NULL == node) {
return ORTE_ERR_OUT_OF_RESOURCE;
@ -132,15 +135,36 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
node->slots = 1;
/* the dash-host option presumes definition of num_slots */
node->slots_given = true;
opal_list_append(nodes, &node->super);
opal_list_append(&adds, &node->super);
}
}
/* Transfer the nodes collected in "adds" onto the caller's list,
 * skipping any whose name already appears there.
 */
while (NULL != (item = opal_list_remove_first(&adds))) {
    nd = (orte_node_t*)item;
    /* Reset for every candidate. Without this, "found" keeps whatever
     * value the earlier search loop (or a previous pass of this loop)
     * left behind, and a stale "true" would cause this node to be
     * released instead of appended.
     */
    found = false;
    for (itm = opal_list_get_first(nodes);
         itm != opal_list_get_end(nodes);
         itm = opal_list_get_next(itm)) {
        node = (orte_node_t*)itm;
        if (0 == strcmp(nd->name, node->name)) {
            found = true;
            break;
        }
    }
    if (!found) {
        /* not on the caller's list yet - append it */
        opal_list_append(nodes, &nd->super);
    } else {
        /* a node of this name is already on the caller's list */
        OBJ_RELEASE(item);
    }
}
rc = ORTE_SUCCESS;
cleanup:
if (NULL != mapped_nodes) {
opal_argv_free(mapped_nodes);
}
OPAL_LIST_DESTRUCT(&adds);
return rc;
}

Просмотреть файл

@ -41,7 +41,7 @@
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/ras_types.h"
#include "orte/mca/ras/base/base.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile_lex.h"
@ -176,7 +176,7 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
}
node_name[len-1] = '\0'; /* truncate */
OPAL_OUTPUT_VERBOSE((3, orte_debug_output,
OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
"%s hostfile: node %s is being excluded",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));
@ -221,7 +221,7 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
node_name = strdup(orte_process_info.nodename);
}
OPAL_OUTPUT_VERBOSE((3, orte_debug_output,
OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
"%s hostfile: node %s is being included - keep all is %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
keep_all ? "TRUE" : "FALSE"));
@ -502,27 +502,29 @@ unlock:
int orte_util_add_hostfile_nodes(opal_list_t *nodes,
char *hostfile)
{
opal_list_t exclude;
opal_list_t exclude, adds;
opal_list_item_t *item, *itm;
int rc;
orte_node_t *nd, *node;
bool found;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: checking hostfile %s for nodes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
OBJ_CONSTRUCT(&exclude, opal_list_t);
/* parse the hostfile and add the contents to the list */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude, false))) {
OBJ_CONSTRUCT(&adds, opal_list_t);
/* parse the hostfile and add any new contents to the list */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &adds, &exclude, false))) {
goto cleanup;
}
/* parse the nodes to check for any relative node directives */
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
/* check for any relative node directives */
for (item = opal_list_get_first(&adds);
item != opal_list_get_end(&adds);
item = opal_list_get_next(item)) {
orte_node_t *node=(orte_node_t*)item;
node=(orte_node_t*)item;
if ('+' == node->name[0]) {
orte_show_help("help-hostfile.txt", "hostfile:relative-syntax",
@ -533,16 +535,16 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
}
/* remove from the list of nodes those that are in the exclude list */
while(NULL != (item = opal_list_remove_first(&exclude))) {
orte_node_t *exnode = (orte_node_t*)item;
while (NULL != (item = opal_list_remove_first(&exclude))) {
nd = (orte_node_t*)item;
/* check for matches on nodes */
for (itm = opal_list_get_first(nodes);
itm != opal_list_get_end(nodes);
for (itm = opal_list_get_first(&adds);
itm != opal_list_get_end(&adds);
itm = opal_list_get_next(itm)) {
orte_node_t *node=(orte_node_t*)itm;
if (0 == strcmp(exnode->name, node->name)) {
node = (orte_node_t*)itm;
if (0 == strcmp(nd->name, node->name)) {
/* match - remove it */
opal_list_remove_item(nodes, itm);
opal_list_remove_item(&adds, itm);
OBJ_RELEASE(itm);
break;
}
@ -550,8 +552,28 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
OBJ_RELEASE(item);
}
/* Transfer the nodes remaining in "adds" onto the caller's list,
 * skipping any whose name already appears there.
 */
while (NULL != (item = opal_list_remove_first(&adds))) {
    nd = (orte_node_t*)item;
    /* "found" is declared without an initializer and is only ever set
     * to true below, so it must be cleared for every candidate.
     * Otherwise the first test reads an indeterminate value, and after
     * the first match every later node would be treated as a duplicate.
     */
    found = false;
    for (itm = opal_list_get_first(nodes);
         itm != opal_list_get_end(nodes);
         itm = opal_list_get_next(itm)) {
        node = (orte_node_t*)itm;
        if (0 == strcmp(nd->name, node->name)) {
            found = true;
            break;
        }
    }
    if (!found) {
        /* not on the caller's list yet - append it */
        opal_list_append(nodes, &nd->super);
    } else {
        /* a node of this name is already on the caller's list */
        OBJ_RELEASE(item);
    }
}
cleanup:
OBJ_DESTRUCT(&exclude);
OPAL_LIST_DESTRUCT(&exclude);
OPAL_LIST_DESTRUCT(&adds);
return rc;
}
@ -574,7 +596,7 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
opal_list_t keep;
bool found;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: filtering nodes through hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
@ -822,7 +844,7 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
orte_node_t *node_from_pool, *newnode;
int rc;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: creating ordered list of hosts from hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));