1
1

Resolve unexpected behavior in hostfile allocations. Now that allocations are filtered to determine which nodes will be used for mapping, the initial global pool is built as the union of nodes from all sources (the default hostfile, any hostfiles, and any dash-host options). Each app then filters that pool down to only the nodes specified for it by its own hostfile and dash-host options.

cmr=v1.7.4:reviewer=jsquyres:subject=Resolve an unexpected behavior in hostfile allocations

This commit was SVN r30040.
Этот коммит содержится в:
Ralph Castain 2013-12-21 01:38:27 +00:00
родитель 6f6c3cc21c
Коммит 9c768df8b8
4 изменённых файлов: 113 добавлений и 65 удалений

Просмотреть файл

@ -52,13 +52,12 @@
/* function to display allocation */ /* function to display allocation */
void orte_ras_base_display_alloc(void) void orte_ras_base_display_alloc(void)
{ {
char *tmp=NULL, *tmp2, *tmp3, *pfx=NULL; char *tmp=NULL, *tmp2, *tmp3;
int i, istart; int i, istart;
orte_node_t *alloc; orte_node_t *alloc;
if (orte_xml_output) { if (orte_xml_output) {
asprintf(&tmp, "<allocation>\n"); asprintf(&tmp, "<allocation>\n");
pfx = "\t";
} else { } else {
asprintf(&tmp, "\n====================== ALLOCATED NODES ======================\n"); asprintf(&tmp, "\n====================== ALLOCATED NODES ======================\n");
} }
@ -71,7 +70,16 @@ void orte_ras_base_display_alloc(void)
if (NULL == (alloc = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { if (NULL == (alloc = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue; continue;
} }
opal_dss.print(&tmp2, pfx, alloc, ORTE_NODE); if (orte_xml_output) {
/* need to create the output in XML format */
asprintf(&tmp2, "\t<host name=\"%s\" slots=\"%d\" max_slots=\"%d\" slots_inuse=\"%d\">\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
} else {
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
}
if (NULL == tmp) { if (NULL == tmp) {
tmp = tmp2; tmp = tmp2;
} else { } else {
@ -85,7 +93,7 @@ void orte_ras_base_display_alloc(void)
fprintf(orte_xml_fp, "%s</allocation>\n", tmp); fprintf(orte_xml_fp, "%s</allocation>\n", tmp);
fflush(orte_xml_fp); fflush(orte_xml_fp);
} else { } else {
opal_output(orte_clean_output, "%s\n\n=================================================================\n", tmp); opal_output(orte_clean_output, "%s=================================================================\n", tmp);
} }
free(tmp); free(tmp);
} }
@ -103,7 +111,6 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
orte_std_cntr_t i; orte_std_cntr_t i;
orte_app_context_t *app; orte_app_context_t *app;
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
bool default_hostfile_used;
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:allocate", "%s ras:base:allocate",
@ -229,17 +236,31 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
* the resulting list contains the UNION of all nodes specified * the resulting list contains the UNION of all nodes specified
* in hostfiles from across all app_contexts * in hostfiles from across all app_contexts
* *
* Any app that has no hostfile but has a dash-host, will have * We then continue to add any hosts provided by dash-host and
* those nodes added to the list * the default hostfile, if we have it. We will then filter out
* * all the non-desired hosts (i.e., those not specified by
* Any app that fails to have a hostfile or a dash-host will be given the * -host and/or -hostfile) when we start the mapping process
* default hostfile, if we have it
* *
* Note that any relative node syntax found in the hostfiles will * Note that any relative node syntax found in the hostfiles will
* generate an error in this scenario, so only non-relative syntax * generate an error in this scenario, so only non-relative syntax
* can be present * can be present
*/ */
default_hostfile_used = false; if (NULL != orte_default_hostfile) {
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:allocate parsing default hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_default_hostfile));
/* a default hostfile was provided - parse it */
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
orte_default_hostfile))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&nodes);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
return;
}
}
for (i=0; i < jdata->apps->size; i++) { for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue; continue;
@ -279,25 +300,6 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
OBJ_RELEASE(caddy); OBJ_RELEASE(caddy);
return; return;
} }
} else if (!default_hostfile_used) {
if (NULL != orte_default_hostfile) {
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:allocate parsing default hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_default_hostfile));
/* a default hostfile was provided - parse it */
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
orte_default_hostfile))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&nodes);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
return;
}
}
/* only look at it once */
default_hostfile_used = true;
} }
} }

Просмотреть файл

@ -339,9 +339,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
*output = tmp; *output = tmp;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
asprintf(&tmp, "\n%sData for node: %s\tNum procs: %ld", asprintf(&tmp, "\n%sData for node: %s\tNum slots: %ld\tMax slots: %ld\tNum procs: %ld",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, pfx2, (NULL == src->name) ? "UNKNOWN" : src->name,
(long)src->num_procs); (long)src->slots, (long)src->slots_max, (long)src->num_procs);
/* does this node have any aliases? */ /* does this node have any aliases? */
if (NULL != src->alias) { if (NULL != src->alias) {
for (i=0; NULL != src->alias[i]; i++) { for (i=0; NULL != src->alias[i]; i++) {

Просмотреть файл

@ -44,11 +44,15 @@
int orte_util_add_dash_host_nodes(opal_list_t *nodes, int orte_util_add_dash_host_nodes(opal_list_t *nodes,
char ** host_argv) char ** host_argv)
{ {
opal_list_item_t* item; opal_list_item_t *item, *itm;
orte_std_cntr_t i, j, k; orte_std_cntr_t i, j, k;
int rc; int rc;
char **mapped_nodes = NULL, **mini_map; char **mapped_nodes = NULL, **mini_map;
orte_node_t *node; orte_node_t *node, *nd;
opal_list_t adds;
bool found;
OBJ_CONSTRUCT(&adds, opal_list_t);
/* Accumulate all of the host name mappings */ /* Accumulate all of the host name mappings */
for (j = 0; j < opal_argv_count(host_argv); ++j) { for (j = 0; j < opal_argv_count(host_argv); ++j) {
@ -70,7 +74,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
/* Did we find anything? If not, then do nothing */ /* Did we find anything? If not, then do nothing */
if (NULL == mapped_nodes) { if (NULL == mapped_nodes) {
return ORTE_SUCCESS; rc = ORTE_SUCCESS;
goto cleanup;
} }
/* go through the names found and /* go through the names found and
@ -89,13 +94,12 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
} }
/* see if the node is already on the list */ /* see if the node is already on the list */
for (item = opal_list_get_first(nodes); found = false;
item != opal_list_get_end(nodes); OPAL_LIST_FOREACH(node, &adds, orte_node_t) {
item = opal_list_get_next(item)) {
node = (orte_node_t*) item;
if (0 == strcmp(node->name, mapped_nodes[i]) || if (0 == strcmp(node->name, mapped_nodes[i]) ||
(0 == strcmp(node->name, orte_process_info.nodename) && (0 == strcmp(node->name, orte_process_info.nodename) &&
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) { (0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
found = true;
++node->slots; ++node->slots;
/* the dash-host option presumes definition of num_slots */ /* the dash-host option presumes definition of num_slots */
node->slots_given = true; node->slots_given = true;
@ -104,8 +108,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
} }
/* If we didn't find it, add it to the list */ /* If we didn't find it, add it to the list */
if (!found) {
if (item == opal_list_get_end(nodes)) {
node = OBJ_NEW(orte_node_t); node = OBJ_NEW(orte_node_t);
if (NULL == node) { if (NULL == node) {
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
@ -132,15 +135,36 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
node->slots = 1; node->slots = 1;
/* the dash-host option presumes definition of num_slots */ /* the dash-host option presumes definition of num_slots */
node->slots_given = true; node->slots_given = true;
opal_list_append(nodes, &node->super); opal_list_append(&adds, &node->super);
} }
} }
/* transfer across all unique nodes */
while (NULL != (item = opal_list_remove_first(&adds))) {
nd = (orte_node_t*)item;
for (itm = opal_list_get_first(nodes);
itm != opal_list_get_end(nodes);
itm = opal_list_get_next(itm)) {
node = (orte_node_t*)itm;
if (0 == strcmp(nd->name, node->name)) {
found = true;
break;
}
}
if (!found) {
opal_list_append(nodes, &nd->super);
} else {
OBJ_RELEASE(item);
}
}
rc = ORTE_SUCCESS; rc = ORTE_SUCCESS;
cleanup: cleanup:
if (NULL != mapped_nodes) { if (NULL != mapped_nodes) {
opal_argv_free(mapped_nodes); opal_argv_free(mapped_nodes);
} }
OPAL_LIST_DESTRUCT(&adds);
return rc; return rc;
} }

Просмотреть файл

@ -41,7 +41,7 @@
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/ras_types.h" #include "orte/mca/ras/base/base.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile_lex.h" #include "orte/util/hostfile/hostfile_lex.h"
@ -176,7 +176,7 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
} }
node_name[len-1] = '\0'; /* truncate */ node_name[len-1] = '\0'; /* truncate */
OPAL_OUTPUT_VERBOSE((3, orte_debug_output, OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
"%s hostfile: node %s is being excluded", "%s hostfile: node %s is being excluded",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));
@ -221,7 +221,7 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
node_name = strdup(orte_process_info.nodename); node_name = strdup(orte_process_info.nodename);
} }
OPAL_OUTPUT_VERBOSE((3, orte_debug_output, OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
"%s hostfile: node %s is being included - keep all is %s", "%s hostfile: node %s is being included - keep all is %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
keep_all ? "TRUE" : "FALSE")); keep_all ? "TRUE" : "FALSE"));
@ -502,27 +502,29 @@ unlock:
int orte_util_add_hostfile_nodes(opal_list_t *nodes, int orte_util_add_hostfile_nodes(opal_list_t *nodes,
char *hostfile) char *hostfile)
{ {
opal_list_t exclude; opal_list_t exclude, adds;
opal_list_item_t *item, *itm; opal_list_item_t *item, *itm;
int rc; int rc;
orte_node_t *nd, *node;
bool found;
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s hostfile: checking hostfile %s for nodes", "%s hostfile: checking hostfile %s for nodes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
OBJ_CONSTRUCT(&exclude, opal_list_t); OBJ_CONSTRUCT(&exclude, opal_list_t);
OBJ_CONSTRUCT(&adds, opal_list_t);
/* parse the hostfile and add the contents to the list */ /* parse the hostfile and add any new contents to the list */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude, false))) { if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &adds, &exclude, false))) {
goto cleanup; goto cleanup;
} }
/* parse the nodes to check for any relative node directives */ /* check for any relative node directives */
for (item = opal_list_get_first(nodes); for (item = opal_list_get_first(&adds);
item != opal_list_get_end(nodes); item != opal_list_get_end(&adds);
item = opal_list_get_next(item)) { item = opal_list_get_next(item)) {
orte_node_t *node=(orte_node_t*)item; node=(orte_node_t*)item;
if ('+' == node->name[0]) { if ('+' == node->name[0]) {
orte_show_help("help-hostfile.txt", "hostfile:relative-syntax", orte_show_help("help-hostfile.txt", "hostfile:relative-syntax",
@ -533,16 +535,16 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
} }
/* remove from the list of nodes those that are in the exclude list */ /* remove from the list of nodes those that are in the exclude list */
while(NULL != (item = opal_list_remove_first(&exclude))) { while (NULL != (item = opal_list_remove_first(&exclude))) {
orte_node_t *exnode = (orte_node_t*)item; nd = (orte_node_t*)item;
/* check for matches on nodes */ /* check for matches on nodes */
for (itm = opal_list_get_first(nodes); for (itm = opal_list_get_first(&adds);
itm != opal_list_get_end(nodes); itm != opal_list_get_end(&adds);
itm = opal_list_get_next(itm)) { itm = opal_list_get_next(itm)) {
orte_node_t *node=(orte_node_t*)itm; node = (orte_node_t*)itm;
if (0 == strcmp(exnode->name, node->name)) { if (0 == strcmp(nd->name, node->name)) {
/* match - remove it */ /* match - remove it */
opal_list_remove_item(nodes, itm); opal_list_remove_item(&adds, itm);
OBJ_RELEASE(itm); OBJ_RELEASE(itm);
break; break;
} }
@ -550,8 +552,28 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }
/* transfer across all unique nodes */
while (NULL != (item = opal_list_remove_first(&adds))) {
nd = (orte_node_t*)item;
for (itm = opal_list_get_first(nodes);
itm != opal_list_get_end(nodes);
itm = opal_list_get_next(itm)) {
node = (orte_node_t*)itm;
if (0 == strcmp(nd->name, node->name)) {
found = true;
break;
}
}
if (!found) {
opal_list_append(nodes, &nd->super);
} else {
OBJ_RELEASE(item);
}
}
cleanup: cleanup:
OBJ_DESTRUCT(&exclude); OPAL_LIST_DESTRUCT(&exclude);
OPAL_LIST_DESTRUCT(&adds);
return rc; return rc;
} }
@ -574,7 +596,7 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
opal_list_t keep; opal_list_t keep;
bool found; bool found;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: filtering nodes through hostfile %s", "%s hostfile: filtering nodes through hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
@ -822,7 +844,7 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
orte_node_t *node_from_pool, *newnode; orte_node_t *node_from_pool, *newnode;
int rc; int rc;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s hostfile: creating ordered list of hosts from hostfile %s", "%s hostfile: creating ordered list of hosts from hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));