1
1

Loadbalancing across nodes (i.e., map-by node) wasn't working correctly - the algorithm relied on the nodes being defined in descending order of slots, or the numbe

r of slots remaing to be assigned being only one/node. Regardless, it didn't work for the case where nodes were defined in ascending order of slots.

Tetsuya's proposed patch didn't solve the problem for me, but it did correct the case where cpus/proc > 1. The final patch requires that we loop over the assignment
 algo until all procs are assigned or all nodes are filled - any remaining procs are then handled in the cleanup loop.

cmr=v1.7.5:reviewer=rhc:subject=fix map-by node for different cases

This commit was SVN r30798.
Этот коммит содержится в:
Ralph Castain 2014-02-22 16:39:41 +00:00
родитель f17ec1ab10
Коммит c8112c1086
2 изменённых файлов: 173 добавлений и 149 удалений

Просмотреть файл

@ -596,6 +596,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata,
}
}
if (cnt < app->num_procs) {
ORTE_ERROR_LOG(ORTE_ERR_FATAL);
return ORTE_ERR_FATAL;
}
return ORTE_SUCCESS;

Просмотреть файл

@ -211,7 +211,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs)
{
int j, nprocs_mapped, lag, delta;
int j, nprocs_mapped, lag, delta, nnodes;
orte_node_t *node;
orte_proc_t *proc;
opal_list_item_t *item;
@ -239,118 +239,171 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
oversubscribed = true;
}
/* divide the procs evenly across all nodes - this is the
* average we have to maintain as we go, but we adjust
* the number on each node to reflect its available slots.
* Obviously, if all nodes have the same number of slots,
* then the avg is what we get on each node - this is
* the most common situation.
*/
navg = app->num_procs / opal_list_get_size(node_list);
if (0 == navg) {
/* if there are less procs than nodes, we have to
* place at least one/node
*/
navg = 1;
}
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - (navg * (float)opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rr: mapping by node navg %d extra_procs %d extra_nodes %d",
navg, extra_procs_to_assign, nxtra_nodes);
nnodes = opal_list_get_size(node_list);
nprocs_mapped = 0;
lag = 0;
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
* locale except at the node level
do {
/* divide the procs evenly across all nodes - this is the
* average we have to maintain as we go, but we adjust
* the number on each node to reflect its available slots.
* Obviously, if all nodes have the same number of slots,
* then the avg is what we get on each node - this is
* the most common situation.
*/
if (NULL != node->topology) {
obj = hwloc_get_root_obj(node->topology);
}
#endif
/* add this node to the map, but only do so once */
if (!node->mapped) {
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
node->mapped = true;
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
/* compute the number of procs to go on this node */
if (add_one) {
if (0 == nxtra_nodes) {
--extra_procs_to_assign;
add_one = false;
} else {
--nxtra_nodes;
}
}
if (oversubscribed) {
/* everybody just takes their share */
num_procs_to_assign = navg + extra_procs_to_assign;
} else {
/* if we are not oversubscribed, then there are enough
* slots to handle all the procs. However, not every
* node will have the same number of slots, so we
* have to track how many procs to "shift" elsewhere
* to make up the difference
navg = app->num_procs / nnodes;
if (0 == navg) {
/* if there are less procs than nodes, we have to
* place at least one/node
*/
if (node->slots <= node->slots_inuse) {
/* if there are no extras to take, then we can
* ignore this node
navg = 1;
}
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - (navg * (float)opal_list_get_size(node_list))) / (float)nnodes;
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * nnodes);
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rr: mapping by node navg %d extra_procs %d extra_nodes %d",
navg, extra_procs_to_assign, nxtra_nodes);
lag = 0;
nnodes = 0;
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
* locale except at the node level
*/
if (NULL != node->topology) {
obj = hwloc_get_root_obj(node->topology);
}
#endif
/* add this node to the map, but only do so once */
if (!node->mapped) {
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
node->mapped = true;
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
/* compute the number of procs to go on this node */
if (add_one) {
if (0 == nxtra_nodes) {
--extra_procs_to_assign;
add_one = false;
} else {
--nxtra_nodes;
}
}
if (oversubscribed) {
/* everybody just takes their share */
num_procs_to_assign = navg + extra_procs_to_assign;
} else {
/* if we are not oversubscribed, then there are enough
* slots to handle all the procs. However, not every
* node will have the same number of slots, so we
* have to track how many procs to "shift" elsewhere
* to make up the difference
*/
if (0 == extra_procs_to_assign) {
if (node->slots <= node->slots_inuse) {
/* if there are no extras to take, then we can
* ignore this node
*/
if (0 == extra_procs_to_assign) {
/* update how many we are lagging behind */
lag += navg;
continue;
}
/* everybody has to take at least the extras */
num_procs_to_assign = extra_procs_to_assign;
/* update how many we are lagging behind */
lag += navg;
continue;
}
/* everybody has to take at least the extras */
num_procs_to_assign = extra_procs_to_assign;
/* update how many we are lagging behind */
lag += navg;
} else {
/* if slots < avg (adjusted for cpus/proc), then take all */
if ((node->slots - node->slots_inuse) < (navg * orte_rmaps_base.cpus_per_rank)) {
num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank;
/* update how many we are lagging behind */
lag += navg - num_procs_to_assign;
} else {
/* take the avg plus as much of the "lag" as we can */
delta = 0;
if (0 < lag) {
delta = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) - navg;
if (lag < delta) {
delta = lag;
/* if slots < avg (adjusted for cpus/proc), then take all */
if ((node->slots - node->slots_inuse) < (navg * orte_rmaps_base.cpus_per_rank)) {
num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank;
/* update how many we are lagging behind */
lag += navg - num_procs_to_assign + extra_procs_to_assign;
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
"%s NODE %s LAGGING %d AVG %d ASSIGN %d EXTRA %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
lag, navg, num_procs_to_assign, extra_procs_to_assign));
} else {
/* take the avg plus as much of the "lag" as we can */
delta = 0;
if (0 < lag) {
delta = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) - navg;
if (lag < delta) {
delta = lag;
}
lag -= delta;
}
lag -= delta;
num_procs_to_assign = navg + delta + extra_procs_to_assign;
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
"%s NODE %s DELTA %d LAGGING %d AVG %d ASSIGN %d EXTRA %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, delta,
lag, navg, num_procs_to_assign, extra_procs_to_assign));
}
num_procs_to_assign = navg;
}
/* add in the extras */
num_procs_to_assign += extra_procs_to_assign;
nnodes++; // track how many nodes remain available
}
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
"%s NODE %s ASSIGNING %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
num_procs_to_assign));
for (j=0; j < num_procs_to_assign && nprocs_mapped < app->num_procs; j++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
nprocs_mapped++;
#if OPAL_HAVE_HWLOC
proc->locale = obj;
#endif
}
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
}
}
for (j=0; j < num_procs_to_assign && nprocs_mapped < app->num_procs; j++) {
} while (nprocs_mapped < app->num_procs && 0 < nnodes);
/* now fillin as required until fully mapped */
while (nprocs_mapped < app->num_procs) {
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
* locale except at the node level
*/
if (NULL != node->topology) {
obj = hwloc_get_root_obj(node->topology);
}
#endif
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -358,49 +411,19 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
#if OPAL_HAVE_HWLOC
proc->locale = obj;
#endif
}
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
node->oversubscribed = true;
}
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
}
}
/* if we have some remaining lag, then put one/node until
* all are assigned
*/
for (item = opal_list_get_first(node_list);
0 < lag && item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
nprocs_mapped++;
lag--;
#if OPAL_HAVE_HWLOC
proc->locale = obj;
#endif
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
}
}
}
@ -675,7 +698,7 @@ static int byobj_span(orte_job_t *jdata,
* do more because we don't know how many total objects exist
* across all the nodes
*/
if (num_slots < (int)app->num_procs) {
if (num_slots < (int)app->num_procs * orte_rmaps_base.cpus_per_rank) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -778,15 +801,15 @@ static int byobj_span(orte_job_t *jdata,
lag += navg;
} else {
/* if slots < avg, then take all */
if ((node->slots - node->slots_inuse) < navg) {
num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
if ((node->slots - node->slots_inuse) < navg * orte_rmaps_base.cpus_per_rank) {
num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
/* update how many we are lagging behind */
lag += navg - (node->slots - node->slots_inuse);
lag += navg - (node->slots - node->slots_inuse) + extra_procs_to_assign;
} else {
/* take the avg plus as much of the "lag" as we can */
delta = 0;
if (0 < lag) {
delta = (node->slots - node->slots_inuse) - navg;
delta = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank - navg;
if (lag < delta) {
delta = lag;
}
@ -838,7 +861,7 @@ static int byobj_span(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots < (int)node->num_procs) {
if (node->slots < (int)node->num_procs * orte_rmaps_base.cpus_per_rank) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/