1
1

Add protection against a bozo case where we could end up in an infinite loop while calculating ranks

This commit was SVN r25491.
Этот коммит содержится в:
Ralph Castain 2011-11-18 15:35:55 +00:00
родитель 88d32312d6
Коммит 1e5e9bde77
2 изменённых файлов: 21 добавлений и 4 удалений

Просмотреть файл

@ -185,3 +185,9 @@ Unknown binding level:
Target: %s
Cache level: %u
#
[orte-rmaps-base:ranking-error]
An infinite loop condition has been detected in the ranking
code when computing ranks %s. This indicates a problem
in the mapper. Please bring this problem to the attention
of the OMPI developers.

Просмотреть файл

@ -381,7 +381,8 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
orte_node_t *node;
orte_proc_t *proc, *ptr;
int rc;
bool added_one=false;
map = jdata->map;
if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) ||
@ -395,6 +396,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
cnt=0;
vpid=0;
while (cnt < jdata->num_procs) {
added_one = false;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
@ -408,8 +410,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
continue;
}
if (ORTE_VPID_INVALID != proc->name.vpid) {
/* vpid was already assigned, probably by the
* round-robin mapper. Some mappers require that
/* vpid was already assigned. Some mappers require that
* we insert the proc into the jdata->procs
* array, while others will have already done it - so check and
* do the operation if required
@ -423,6 +424,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
* it in our loop - otherwise don't as we would be
* double counting
*/
added_one = true;
cnt++;
}
continue;
@ -444,8 +446,17 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
return rc;
}
cnt++;
added_one = true;
break; /* move on to next node */
}
}
}
/* it should be impossible, but if nothing was added during this
 * pass, the outer loop can never finish - so error out
 */
if (!added_one) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:ranking-error",
true, "bynode");
return ORTE_ERR_SILENT;
}
}
return ORTE_SUCCESS;