1
1

Let a restarted process have access to all known nodes instead of only those already in its prior job map

This commit was SVN r22225.
Этот коммит содержится в:
Ralph Castain 2009-11-19 19:45:11 +00:00
родитель 852e5d9ee0
Коммит 5e031d9ded

Просмотреть файл

@ -192,6 +192,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
} }
/* save the current node */ /* save the current node */
oldnode = proc->node; oldnode = proc->node;
/* point to the app */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx);
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: proc %s from node %s is to be restarted", "%s rmaps:resilient: proc %s from node %s is to be restarted",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -248,22 +250,30 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
} }
} }
/* if no ftgrps are available, then just map it on the lightest loaded /* if no ftgrps are available, then just map it on the lightest loaded
* node in the current map, avoiding the current node if possible * node known to the system, avoiding the current node if possible and
* taking into account any limitations specified by user in hostfile
* and -host options
*/ */
if (NULL == target) { if (NULL == target) {
nd = oldnode; /* put it back where it was if nothing else is found */ nd = oldnode; /* put it back where it was if nothing else is found */
totprocs = 1000000; totprocs = 1000000;
OBJ_CONSTRUCT(&node_list, opal_list_t);
map = jdata->map; map = jdata->map;
for (k=0; k < map->nodes->size; k++) { if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, map->policy))) {
if (NULL == (node = opal_pointer_array_get_item(map->nodes, k)) || ORTE_ERROR_LOG(rc);
node == oldnode) { return rc;
continue; }
} /* find the lightest loaded node while deconstructing the list */
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
if (node->num_procs < totprocs) { if (node->num_procs < totprocs) {
nd = node; nd = node;
totprocs = node->num_procs; totprocs = node->num_procs;
} }
OBJ_RELEASE(item);
} }
OBJ_DESTRUCT(&node_list);
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: no avail fault groups found - placing proc on node %s", "%s rmaps:resilient: no avail fault groups found - placing proc on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),