1
1

Fix a few places where we weren't completely identifying hostfile-based operations against "localhost" entries. Tell the mapper base to be silent when we don't want errors announced because nodes aren't available for mapping (sometimes it is okay if they are fully used). Fix an infinite loop in the file prepositioning code.

This commit was SVN r27210.
Этот коммит содержится в:
Ralph Castain 2012-08-31 21:28:49 +00:00
родитель 888b04ab36
Коммит 95019cc310
13 изменённых файлов: 64 добавлений и 32 удалений

Просмотреть файл

@ -388,6 +388,7 @@ static int raw_preposition_files(orte_job_t *jdata,
outbound = OBJ_NEW(orte_filem_raw_outbound_t);
outbound->cbfunc = cbfunc;
outbound->cbdata = cbdata;
opal_list_append(&outbound_files, &outbound->super);
/* only the HNP should ever call this function - loop thru the
* fileset and initiate xcast transfer of each file to every
@ -405,7 +406,7 @@ static int raw_preposition_files(orte_job_t *jdata,
itm2 != opal_list_get_end(&optr->xfers);
itm2 = opal_list_get_next(itm2)) {
xptr = (orte_filem_raw_xfer_t*)itm2;
if (0 == strcmp(fs->local_target, xfer->src)) {
if (0 == strcmp(fs->local_target, xptr->src)) {
already_sent = true;
break;
}
@ -413,6 +414,9 @@ static int raw_preposition_files(orte_job_t *jdata,
}
if (already_sent) {
/* no need to send it again */
OPAL_OUTPUT_VERBOSE((3, orte_filem_base_output,
"%s filem:raw: file %s is already queued for output - ignoring",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->src));
OBJ_RELEASE(item);
continue;
}
@ -500,7 +504,16 @@ static int raw_preposition_files(orte_job_t *jdata,
}
return ORTE_SUCCESS;
}
opal_list_append(&outbound_files, &outbound->super);
if (0 < opal_output_get_verbosity(orte_filem_base_output)) {
opal_output(0, "%s Files to be positioned:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
for (itm2 = opal_list_get_first(&outbound->xfers);
itm2 != opal_list_get_end(&outbound->xfers);
itm2 = opal_list_get_next(itm2)) {
xptr = (orte_filem_raw_xfer_t*)itm2;
opal_output(0, "%s\t%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xptr->src);
}
}
return ORTE_SUCCESS;
#endif
@ -558,7 +571,7 @@ static int raw_link_local_files(orte_job_t *jdata,
char *my_dir, *path=NULL;
orte_proc_t *proc;
char *prefix;
int i, rc;
int i, j, rc;
orte_filem_raw_incoming_t *inbnd;
opal_list_item_t *item;
@ -645,8 +658,8 @@ static int raw_link_local_files(orte_job_t *jdata,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
inbnd->file));
/* cycle thru the link points and create symlinks to them */
for (i=0; NULL != inbnd->link_pts[i]; i++) {
if (ORTE_SUCCESS != (rc = create_link(my_dir, path, inbnd->link_pts[i]))) {
for (j=0; NULL != inbnd->link_pts[j]; j++) {
if (ORTE_SUCCESS != (rc = create_link(my_dir, path, inbnd->link_pts[j]))) {
ORTE_ERROR_LOG(rc);
free(my_dir);
free(path);

Просмотреть файл

@ -397,6 +397,7 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
*/
node->name = strdup(orte_process_info.nodename);
node->state = ORTE_NODE_STATE_UP;
node->slots_alloc = 1;
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;

Просмотреть файл

@ -68,6 +68,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
int rc, i;
orte_node_t *node, *hnp_node;
char *ptr;
bool hnp_alone = true;
/* get the number of nodes */
num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
@ -100,10 +101,13 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
* if this node is the same as the HNP's node so we don't double-enter it
*/
if (NULL != hnp_node &&
(0 == strcmp(node->name, hnp_node->name) || opal_ifislocal(node->name))) {
(0 == strcmp(node->name, hnp_node->name) ||
0 == strcmp(node->name, "localhost") ||
opal_ifislocal(node->name))) {
OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
"%s ras:base:node_insert updating HNP info to %ld slots",
"%s ras:base:node_insert updating HNP [%s] info to %ld slots",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name,
(long)node->slots));
/* flag that hnp has been allocated */
@ -161,6 +165,8 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
if (NULL != strchr(node->name, '.')) {
orte_have_fqdn_allocation = true;
}
/* indicate the HNP is not alone */
hnp_alone = false;
}
}
@ -168,7 +174,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
* ensure we don't have any domain info in the node record
* for the hnp
*/
if (!orte_have_fqdn_allocation) {
if (!orte_have_fqdn_allocation && !hnp_alone) {
if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
*ptr = '\0';
}

Просмотреть файл

@ -122,7 +122,7 @@ int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
*/
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
orte_app_context_t *app, orte_mapping_policy_t policy,
bool initial_map)
bool initial_map, bool silent)
{
opal_list_item_t *item, *next;
orte_node_t *node, *nd;
@ -242,9 +242,11 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
if (!silent) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
}
return ORTE_ERR_SILENT;
}
@ -271,9 +273,11 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
if (!silent) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
}
return ORTE_ERR_SILENT;
}
}
@ -337,8 +341,10 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
/* Sanity check to make sure we have resources available */
if (0 == num_slots) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:all-available-resources-used", true);
if (!silent) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:all-available-resources-used", true);
}
return ORTE_ERR_SILENT;
}

Просмотреть файл

@ -9,8 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -46,7 +46,7 @@ ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
orte_std_cntr_t *total_num_slots,
orte_app_context_t *app,
orte_mapping_policy_t policy,
bool initial_map);
bool initial_map, bool silent);
ORTE_DECLSPEC orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
orte_node_t *node,

Просмотреть файл

@ -631,7 +631,7 @@ static int orte_rmaps_lama_map_core(orte_job_t *jdata)
&num_slots,
cur_app_context,
jdata->map->mapping,
initial_map);
initial_map, false);
if(ORTE_SUCCESS != ret ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;

Просмотреть файл

@ -232,7 +232,7 @@ static int ppr_mapper(orte_job_t *jdata)
/* get the available nodes */
OBJ_CONSTRUCT(&node_list, opal_list_t);
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->mapping, initial_map))) {
jdata->map->mapping, initial_map, false))) {
ORTE_ERROR_LOG(rc);
goto error;
}

Просмотреть файл

@ -159,7 +159,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->mapping, initial_map))) {
map->mapping, initial_map, false))) {
ORTE_ERROR_LOG(rc);
goto error;
}

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
@ -175,7 +175,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
&num_slots,
app,
jdata->map->mapping,
false))) {
false, false))) {
ORTE_ERROR_LOG(rc);
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
@ -479,7 +479,7 @@ static int get_new_node(orte_proc_t *proc,
&num_slots,
app,
map->mapping,
false))) {
false, false))) {
ORTE_ERROR_LOG(rc);
goto release;
}
@ -722,7 +722,7 @@ static int map_to_ftgrps(orte_job_t *jdata)
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->mapping, initial_map))) {
map->mapping, initial_map, false))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -119,7 +119,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->mapping, initial_map))) {
jdata->map->mapping, initial_map, false))) {
ORTE_ERROR_LOG(rc);
goto error;
}

Просмотреть файл

@ -95,7 +95,8 @@ static int staged_mapper(orte_job_t *jdata)
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->mapping, false))) {
jdata->map->mapping, false, true)) &&
ORTE_ERR_SILENT != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -152,8 +153,6 @@ static int staged_mapper(orte_job_t *jdata)
node->num_procs++;
node->slots_inuse++;
if (node->slots_inuse == node->slots_alloc) {
opal_output(0, "%s slots on node %s are fully used",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name);
opal_list_remove_item(&node_list, &node->super);
OBJ_RELEASE(node);
}

Просмотреть файл

@ -281,6 +281,10 @@ static void cleanup_node(orte_proc_t *proc)
break;
}
}
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
"%s state:staged:track_procs node %s has %d slots alloc, %d slots inuse",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
(int)node->slots_alloc, (int)node->slots_inuse));
}
static void track_procs(int fd, short args, void *cbdata)

Просмотреть файл

@ -596,6 +596,7 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
orte_node_t *node = (orte_node_t*)item2;
if (0 == strcmp(node_from_file->name, node->name)) {
/* match - remove it */
opal_output(0, "HOST %s ON EXCLUDE LIST - REMOVING", node->name);
opal_list_remove_item(&newnodes, item2);
OBJ_RELEASE(item2);
break;
@ -723,6 +724,8 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
* we have to check for local interfaces
*/
if (0 == strcmp(node_from_file->name, node_from_list->name) ||
(0 == strcmp(node_from_file->name, "localhost") &&
0 == strcmp(node_from_list->name, orte_process_info.nodename)) ||
(opal_ifislocal(node_from_list->name) &&
opal_ifislocal(node_from_file->name))) {
/* if the slot count here is less than the