This is a fix for bug Ticket #27
We were stuck in an infinite loop inside the rmaps round_robin component when the user specified a host and then oversubscribed it. Instead of returning an error, we looped forever. For example:

$ cat hostfile
A slots=2 max-slots=2
B slots=2 max-slots=2
$ mpirun -np 3 --hostfile hostfile --host B
<hang>

The loop would not terminate because both host A and host B are in the 'nodes' structure, as they are both allocated to the job. However, after allocating 2 slots to host B, we remove it from the node list, leaving us with a 'nodes' structure with just A in it. Since we can't use host A, we keep looping here until we find a node that we can use.

This patch detects the situation where rmaps loops over the list a second time without having found a node during the first pass. In that case we know that there are no nodes left to use, so we have a resource allocation error and should return to the user.

This patch should be moved to all of the release branches.

This commit was SVN r10131.
Этот коммит содержится в:
родитель
f4a7e9be78
Коммит
2f20a38c98
@ -104,7 +104,7 @@ static opal_list_item_t* get_next_mapped(opal_list_item_t *node_item,
|
||||
int num_mapped,
|
||||
opal_list_t* nodes)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
opal_list_item_t *item, *initial_item = NULL;
|
||||
|
||||
/* Wrap around to beginning if we are at the end of the list */
|
||||
if (opal_list_get_end(nodes) == opal_list_get_next(node_item)) {
|
||||
@ -129,6 +129,11 @@ static opal_list_item_t* get_next_mapped(opal_list_item_t *node_item,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* save the node we started with */
|
||||
if( NULL == initial_item ) {
|
||||
initial_item = item;
|
||||
}
|
||||
|
||||
/* Access next item in Round Robin Manner */
|
||||
if (opal_list_get_end(nodes) == opal_list_get_next(item)) {
|
||||
item = opal_list_get_first(nodes);
|
||||
@ -137,6 +142,12 @@ static opal_list_item_t* get_next_mapped(opal_list_item_t *node_item,
|
||||
item = opal_list_get_next(item);
|
||||
}
|
||||
|
||||
/* Check to make sure we didn't loop back around without
|
||||
* finding a node in the mapping */
|
||||
if( initial_item == item) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
} while( true );
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user