Fix a few places where we weren't completely identifying hostfile-based operations against "localhost" entries. Tell the mapper base to be silent when we don't want errors announced because nodes aren't available for mapping (sometimes it is okay if they are fully used). Fix an infinite loop in the file prepositioning code.
This commit was SVN r27210.
Этот коммит содержится в:
родитель
888b04ab36
Коммит
95019cc310
@ -388,6 +388,7 @@ static int raw_preposition_files(orte_job_t *jdata,
|
||||
outbound = OBJ_NEW(orte_filem_raw_outbound_t);
|
||||
outbound->cbfunc = cbfunc;
|
||||
outbound->cbdata = cbdata;
|
||||
opal_list_append(&outbound_files, &outbound->super);
|
||||
|
||||
/* only the HNP should ever call this function - loop thru the
|
||||
* fileset and initiate xcast transfer of each file to every
|
||||
@ -405,7 +406,7 @@ static int raw_preposition_files(orte_job_t *jdata,
|
||||
itm2 != opal_list_get_end(&optr->xfers);
|
||||
itm2 = opal_list_get_next(itm2)) {
|
||||
xptr = (orte_filem_raw_xfer_t*)itm2;
|
||||
if (0 == strcmp(fs->local_target, xfer->src)) {
|
||||
if (0 == strcmp(fs->local_target, xptr->src)) {
|
||||
already_sent = true;
|
||||
break;
|
||||
}
|
||||
@ -413,6 +414,9 @@ static int raw_preposition_files(orte_job_t *jdata,
|
||||
}
|
||||
if (already_sent) {
|
||||
/* no need to send it again */
|
||||
OPAL_OUTPUT_VERBOSE((3, orte_filem_base_output,
|
||||
"%s filem:raw: file %s is already queued for output - ignoring",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->src));
|
||||
OBJ_RELEASE(item);
|
||||
continue;
|
||||
}
|
||||
@ -500,7 +504,16 @@ static int raw_preposition_files(orte_job_t *jdata,
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
opal_list_append(&outbound_files, &outbound->super);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_filem_base_output)) {
|
||||
opal_output(0, "%s Files to be positioned:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
for (itm2 = opal_list_get_first(&outbound->xfers);
|
||||
itm2 != opal_list_get_end(&outbound->xfers);
|
||||
itm2 = opal_list_get_next(itm2)) {
|
||||
xptr = (orte_filem_raw_xfer_t*)itm2;
|
||||
opal_output(0, "%s\t%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xptr->src);
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
#endif
|
||||
@ -558,7 +571,7 @@ static int raw_link_local_files(orte_job_t *jdata,
|
||||
char *my_dir, *path=NULL;
|
||||
orte_proc_t *proc;
|
||||
char *prefix;
|
||||
int i, rc;
|
||||
int i, j, rc;
|
||||
orte_filem_raw_incoming_t *inbnd;
|
||||
opal_list_item_t *item;
|
||||
|
||||
@ -645,8 +658,8 @@ static int raw_link_local_files(orte_job_t *jdata,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
inbnd->file));
|
||||
/* cycle thru the link points and create symlinks to them */
|
||||
for (i=0; NULL != inbnd->link_pts[i]; i++) {
|
||||
if (ORTE_SUCCESS != (rc = create_link(my_dir, path, inbnd->link_pts[i]))) {
|
||||
for (j=0; NULL != inbnd->link_pts[j]; j++) {
|
||||
if (ORTE_SUCCESS != (rc = create_link(my_dir, path, inbnd->link_pts[j]))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(my_dir);
|
||||
free(path);
|
||||
|
@ -397,6 +397,7 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
|
||||
*/
|
||||
node->name = strdup(orte_process_info.nodename);
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
node->slots_alloc = 1;
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = 0;
|
||||
node->slots = 1;
|
||||
|
@ -68,6 +68,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
int rc, i;
|
||||
orte_node_t *node, *hnp_node;
|
||||
char *ptr;
|
||||
bool hnp_alone = true;
|
||||
|
||||
/* get the number of nodes */
|
||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
|
||||
@ -100,10 +101,13 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
* if this node is the same as the HNP's node so we don't double-enter it
|
||||
*/
|
||||
if (NULL != hnp_node &&
|
||||
(0 == strcmp(node->name, hnp_node->name) || opal_ifislocal(node->name))) {
|
||||
(0 == strcmp(node->name, hnp_node->name) ||
|
||||
0 == strcmp(node->name, "localhost") ||
|
||||
opal_ifislocal(node->name))) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
|
||||
"%s ras:base:node_insert updating HNP info to %ld slots",
|
||||
"%s ras:base:node_insert updating HNP [%s] info to %ld slots",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name,
|
||||
(long)node->slots));
|
||||
|
||||
/* flag that hnp has been allocated */
|
||||
@ -161,6 +165,8 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
if (NULL != strchr(node->name, '.')) {
|
||||
orte_have_fqdn_allocation = true;
|
||||
}
|
||||
/* indicate the HNP is not alone */
|
||||
hnp_alone = false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -168,7 +174,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
* ensure we don't have any domain info in the node record
|
||||
* for the hnp
|
||||
*/
|
||||
if (!orte_have_fqdn_allocation) {
|
||||
if (!orte_have_fqdn_allocation && !hnp_alone) {
|
||||
if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
|
||||
*ptr = '\0';
|
||||
}
|
||||
|
@ -122,7 +122,7 @@ int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
|
||||
*/
|
||||
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
|
||||
orte_app_context_t *app, orte_mapping_policy_t policy,
|
||||
bool initial_map)
|
||||
bool initial_map, bool silent)
|
||||
{
|
||||
opal_list_item_t *item, *next;
|
||||
orte_node_t *node, *nd;
|
||||
@ -242,9 +242,11 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
|
||||
|
||||
/** check that anything is here */
|
||||
if (0 == opal_list_get_size(allocated_nodes)) {
|
||||
orte_show_help("help-orte-rmaps-base.txt",
|
||||
"orte-rmaps-base:no-available-resources",
|
||||
true);
|
||||
if (!silent) {
|
||||
orte_show_help("help-orte-rmaps-base.txt",
|
||||
"orte-rmaps-base:no-available-resources",
|
||||
true);
|
||||
}
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
|
||||
@ -271,9 +273,11 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
|
||||
|
||||
/** check that anything is here */
|
||||
if (0 == opal_list_get_size(allocated_nodes)) {
|
||||
orte_show_help("help-orte-rmaps-base.txt",
|
||||
"orte-rmaps-base:no-available-resources",
|
||||
true);
|
||||
if (!silent) {
|
||||
orte_show_help("help-orte-rmaps-base.txt",
|
||||
"orte-rmaps-base:no-available-resources",
|
||||
true);
|
||||
}
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
}
|
||||
@ -337,8 +341,10 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
|
||||
|
||||
/* Sanity check to make sure we have resources available */
|
||||
if (0 == num_slots) {
|
||||
orte_show_help("help-orte-rmaps-base.txt",
|
||||
"orte-rmaps-base:all-available-resources-used", true);
|
||||
if (!silent) {
|
||||
orte_show_help("help-orte-rmaps-base.txt",
|
||||
"orte-rmaps-base:all-available-resources-used", true);
|
||||
}
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
|
||||
|
@ -9,8 +9,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -46,7 +46,7 @@ ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
|
||||
orte_std_cntr_t *total_num_slots,
|
||||
orte_app_context_t *app,
|
||||
orte_mapping_policy_t policy,
|
||||
bool initial_map);
|
||||
bool initial_map, bool silent);
|
||||
|
||||
ORTE_DECLSPEC orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
|
||||
orte_node_t *node,
|
||||
|
@ -631,7 +631,7 @@ static int orte_rmaps_lama_map_core(orte_job_t *jdata)
|
||||
&num_slots,
|
||||
cur_app_context,
|
||||
jdata->map->mapping,
|
||||
initial_map);
|
||||
initial_map, false);
|
||||
if(ORTE_SUCCESS != ret ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
|
@ -232,7 +232,7 @@ static int ppr_mapper(orte_job_t *jdata)
|
||||
/* get the available nodes */
|
||||
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
|
||||
jdata->map->mapping, initial_map))) {
|
||||
jdata->map->mapping, initial_map, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto error;
|
||||
}
|
||||
|
@ -159,7 +159,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
* option
|
||||
*/
|
||||
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
|
||||
map->mapping, initial_map))) {
|
||||
map->mapping, initial_map, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto error;
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
@ -175,7 +175,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
&num_slots,
|
||||
app,
|
||||
jdata->map->mapping,
|
||||
false))) {
|
||||
false, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
@ -479,7 +479,7 @@ static int get_new_node(orte_proc_t *proc,
|
||||
&num_slots,
|
||||
app,
|
||||
map->mapping,
|
||||
false))) {
|
||||
false, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto release;
|
||||
}
|
||||
@ -722,7 +722,7 @@ static int map_to_ftgrps(orte_job_t *jdata)
|
||||
*/
|
||||
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
|
||||
map->mapping, initial_map))) {
|
||||
map->mapping, initial_map, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -119,7 +119,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
* option
|
||||
*/
|
||||
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
|
||||
jdata->map->mapping, initial_map))) {
|
||||
jdata->map->mapping, initial_map, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto error;
|
||||
}
|
||||
|
@ -95,7 +95,8 @@ static int staged_mapper(orte_job_t *jdata)
|
||||
*/
|
||||
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
|
||||
jdata->map->mapping, false))) {
|
||||
jdata->map->mapping, false, true)) &&
|
||||
ORTE_ERR_SILENT != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -152,8 +153,6 @@ static int staged_mapper(orte_job_t *jdata)
|
||||
node->num_procs++;
|
||||
node->slots_inuse++;
|
||||
if (node->slots_inuse == node->slots_alloc) {
|
||||
opal_output(0, "%s slots on node %s are fully used",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name);
|
||||
opal_list_remove_item(&node_list, &node->super);
|
||||
OBJ_RELEASE(node);
|
||||
}
|
||||
|
@ -281,6 +281,10 @@ static void cleanup_node(orte_proc_t *proc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||
"%s state:staged:track_procs node %s has %d slots alloc, %d slots inuse",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
|
||||
(int)node->slots_alloc, (int)node->slots_inuse));
|
||||
}
|
||||
|
||||
static void track_procs(int fd, short args, void *cbdata)
|
||||
|
@ -596,6 +596,7 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
|
||||
orte_node_t *node = (orte_node_t*)item2;
|
||||
if (0 == strcmp(node_from_file->name, node->name)) {
|
||||
/* match - remove it */
|
||||
opal_output(0, "HOST %s ON EXCLUDE LIST - REMOVING", node->name);
|
||||
opal_list_remove_item(&newnodes, item2);
|
||||
OBJ_RELEASE(item2);
|
||||
break;
|
||||
@ -723,6 +724,8 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
|
||||
* we have to check for local interfaces
|
||||
*/
|
||||
if (0 == strcmp(node_from_file->name, node_from_list->name) ||
|
||||
(0 == strcmp(node_from_file->name, "localhost") &&
|
||||
0 == strcmp(node_from_list->name, orte_process_info.nodename)) ||
|
||||
(opal_ifislocal(node_from_list->name) &&
|
||||
opal_ifislocal(node_from_file->name))) {
|
||||
/* if the slot count here is less than the
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user