
Update the map-by obj and map-by obj:span mappers to correct errors in computing the carryover across nodes. Simplify the algorithm so that it is easier to follow and debug.

Refs trac:4296

This commit was SVN r30826.

The following Trac tickets were found above:
  Ticket 4296 --> https://svn.open-mpi.org/trac/ompi/ticket/4296
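
The arithmetic at the heart of the fix is the per-node carryover: divide the procs that still remain (not the app total) across the nodes, and let the first few nodes absorb the remainder. The following is a minimal, self-contained sketch of that invariant in plain C; the names (e.g. distribute_remaining) are hypothetical and this is not the ORTE mapper code itself, which additionally adjusts for each node's available slots and cpus-per-rank.

    /*
     * Illustrative sketch only (hypothetical names) of the carryover arithmetic
     * the commit corrects: divide the procs that REMAIN across the nodes, and
     * give the first "remainder" nodes one extra proc, so the total assigned
     * equals the total remaining -- nothing is lost or double-counted.
     */
    #include <stdio.h>

    static void distribute_remaining(int nprocs_remaining, int nnodes)
    {
        int navg = nprocs_remaining / nnodes;                 /* base share per node */
        int nxtra_nodes = nprocs_remaining - navg * nnodes;   /* nodes that take one extra */

        for (int n = 0; n < nnodes; n++) {
            int assign = navg + ((n < nxtra_nodes) ? 1 : 0);
            printf("node %d gets %d procs\n", n, assign);
            nprocs_remaining -= assign;
        }
        /* nprocs_remaining is now exactly 0 -- the invariant the fixed mappers maintain */
    }

    int main(void)
    {
        distribute_remaining(10, 4);   /* prints 3, 3, 2, 2 */
        return 0;
    }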
This commit is contained in:
Ralph Castain 2014-02-25 23:32:43 +00:00
parent a0a850a77b
commit b880aa46bd


@@ -44,7 +44,6 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
     int rc, i, nprocs_mapped;
     orte_node_t *node;
     orte_proc_t *proc;
-    opal_list_item_t *item;
     int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0;
 #if OPAL_HAVE_HWLOC
     hwloc_obj_t obj=NULL;
@@ -69,10 +68,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
      * map all specified procs or use all allocated slots
      */
     nprocs_mapped = 0;
-    for (item = opal_list_get_first(node_list);
-         item != opal_list_get_end(node_list);
-         item = opal_list_get_next(item)) {
-        node = (orte_node_t*)item;
+    OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                             "mca:rmaps:rr:slot working node %s",
                             node->name);
@@ -133,7 +129,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
      * because we are oversubscribed. Figure out how many procs
      * to add
      */
-    balance = (float)(app->num_procs - nprocs_mapped) / (float)opal_list_get_size(node_list);
+    balance = (float)((int)app->num_procs - nprocs_mapped) / (float)opal_list_get_size(node_list);
     extra_procs_to_assign = (int)balance;
     if (0 < (balance - (float)extra_procs_to_assign)) {
         /* compute how many nodes need an extra proc */
@@ -146,10 +142,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
         add_one = true;
     }
-    for (item = opal_list_get_first(node_list);
-         item != opal_list_get_end(node_list);
-         item = opal_list_get_next(item)) {
-        node = (orte_node_t*)item;
+    OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                             "mca:rmaps:rr:slot working node %s",
                             node->name);
@@ -195,7 +188,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
         /* not all nodes are equal, so only set oversubscribed for
          * this node if it is in that state
          */
-        if (node->slots < (int)node->num_procs) {
+        if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) {
             /* flag the node as oversubscribed so that sched-yield gets
              * properly set
              */
@@ -214,7 +207,6 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
     int j, nprocs_mapped, lag, delta, nnodes;
     orte_node_t *node;
     orte_proc_t *proc;
-    opal_list_item_t *item;
     int num_procs_to_assign, navg, idx;
     int extra_procs_to_assign=0, nxtra_nodes=0;
 #if OPAL_HAVE_HWLOC
@@ -250,7 +242,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
      * then the avg is what we get on each node - this is
      * the most common situation.
      */
-    navg = app->num_procs / nnodes;
+    navg = ((int)app->num_procs - nprocs_mapped) / nnodes;
     if (0 == navg) {
         /* if there are less procs than nodes, we have to
          * place at least one/node
@@ -259,11 +251,11 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
     }
     /* compute how many extra procs to put on each node */
-    balance = (float)(app->num_procs - (navg * (float)opal_list_get_size(node_list))) / (float)nnodes;
+    balance = (float)(((int)app->num_procs - nprocs_mapped) - (navg * nnodes)) / (float)nnodes;
     extra_procs_to_assign = (int)balance;
     if (0 < (balance - (float)extra_procs_to_assign)) {
         /* compute how many nodes need an extra proc */
-        nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * nnodes);
+        nxtra_nodes = ((int)app->num_procs - nprocs_mapped) - ((navg + extra_procs_to_assign) * nnodes);
         /* add one so that we add an extra proc to the first nodes
          * until all procs are mapped
          */
@@ -278,10 +270,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
     lag = 0;
     nnodes = 0;
-    for (item = opal_list_get_first(node_list);
-         item != opal_list_get_end(node_list);
-         item = opal_list_get_next(item)) {
-        node = (orte_node_t*)item;
+    OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
 #if OPAL_HAVE_HWLOC
         /* get the root object as we are not assigning
          * locale except at the node level
@@ -323,21 +312,17 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
             /* if there are no extras to take, then we can
              * ignore this node
              */
-            if (0 == extra_procs_to_assign) {
-                /* update how many we are lagging behind */
-                lag += navg;
-                continue;
-            }
-            /* everybody has to take at least the extras */
-            num_procs_to_assign = extra_procs_to_assign;
+            num_procs_to_assign = 0;
             /* update how many we are lagging behind */
-            lag += navg;
+            lag += navg + extra_procs_to_assign;
         } else {
-            /* add in the extras */
-            lag += extra_procs_to_assign;
             /* if slots < avg (adjusted for cpus/proc), then take all */
             if ((node->slots - node->slots_inuse) < (navg * orte_rmaps_base.cpus_per_rank)) {
                 num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank;
                 /* update how many we are lagging behind */
-                lag += navg - num_procs_to_assign + extra_procs_to_assign;
+                lag += navg - num_procs_to_assign;
                 OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
                                      "%s NODE %s LAGGING %d AVG %d ASSIGN %d EXTRA %d",
                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
@@ -352,7 +337,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
                 }
                 lag -= delta;
             }
-            num_procs_to_assign = navg + delta + extra_procs_to_assign;
+            num_procs_to_assign = navg + delta;
             OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
                                  "%s NODE %s DELTA %d LAGGING %d AVG %d ASSIGN %d EXTRA %d",
                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, delta,
@@ -392,10 +377,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
     /* now fillin as required until fully mapped */
     while (nprocs_mapped < app->num_procs) {
-        for (item = opal_list_get_first(node_list);
-             item != opal_list_get_end(node_list);
-             item = opal_list_get_next(item)) {
-            node = (orte_node_t*)item;
+        OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
 #if OPAL_HAVE_HWLOC
             /* get the root object as we are not assigning
              * locale except at the node level
@@ -449,16 +431,15 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
                         orte_vpid_t num_procs,
                         hwloc_obj_type_t target, unsigned cache_level)
 {
-    int i, j, nprocs_mapped;
+    int i, nmapped, nprocs_mapped;
     orte_node_t *node;
     orte_proc_t *proc;
-    opal_list_item_t *item;
-    int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0;
-    int extra_procs_to_assign=0, nxtra_nodes=0, idx;
+    int nprocs;
+    int idx;
     hwloc_obj_t obj=NULL;
     unsigned int nobjs;
-    float balance;
-    bool add_one=false;
+    bool add_one;
+    bool second_pass;
     /* there are two modes for mapping by object: span and not-span. The
      * span mode essentially operates as if there was just a single
@@ -485,185 +466,103 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
                         ORTE_JOBID_PRINT(jdata->jobid),
                         (int)num_slots, (unsigned long)num_procs);
-    /* quick check to see if we can map all the procs - can't
-     * do more because we don't know how many total objects exist
-     * across all the nodes
-     */
+    /* quick check to see if we can map all the procs */
     if (num_slots < (app->num_procs * orte_rmaps_base.cpus_per_rank)) {
         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                            true, app->num_procs, app->app);
             return ORTE_ERR_SILENT;
         }
-        /* compute how many extra procs to put on each node */
-        if (1 == opal_list_get_size(node_list)) {
-            /* if there is only one node, then they all have to go on it */
-            extra_procs_to_assign = app->num_procs;
-        } else {
-            balance = (float)(((jdata->num_procs + app->num_procs)*orte_rmaps_base.cpus_per_rank) - num_slots) / (float)opal_list_get_size(node_list);
-            extra_procs_to_assign = (int)balance;
-            if (0 < (balance - (float)extra_procs_to_assign)) {
-                /* compute how many nodes need an extra proc */
-                nxtra_nodes = ((jdata->num_procs + app->num_procs)*orte_rmaps_base.cpus_per_rank) - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
-                /* add one so that we add an extra proc to the first nodes
-                 * until all procs are mapped
-                 */
-                extra_procs_to_assign++;
-                /* flag that we added one */
-                add_one = true;
-            }
-        }
     }
-    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
-                        "mca:rmaps:rr: mapping no-span by %s extra_procs %d extra_nodes %d",
-                        hwloc_obj_type_string(target),
-                        extra_procs_to_assign, nxtra_nodes);
+    /* we know we have enough slots, or that oversubscrption is allowed, so
+     * start mapping procs onto objects, filling each object as we go until
+     * all procs are mapped. If one pass doesn't catch all the required procs,
+     * then loop thru the list again to handle the oversubscription
+     */
     nprocs_mapped = 0;
-    for (item = opal_list_get_first(node_list);
-         item != opal_list_get_end(node_list);
-         item = opal_list_get_next(item)) {
-        node = (orte_node_t*)item;
-        /* bozo check */
-        if (NULL == node->topology) {
-            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
-                           true, node->name);
-            return ORTE_ERR_SILENT;
-        }
-        /* compute the number of procs to go on this node */
-        if (add_one) {
-            if (0 == nxtra_nodes) {
-                --extra_procs_to_assign;
-                add_one = false;
-            } else {
-                --nxtra_nodes;
-            }
-        }
-        if (node->slots <= node->slots_inuse) {
-            /* everybody takes at least the extras */
-            num_procs_to_assign = extra_procs_to_assign;
-        } else {
-            num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank + extra_procs_to_assign;
-            if (app->num_procs < num_procs_to_assign) {
-                /* might have more slots than procs */
-                num_procs_to_assign = app->num_procs;
-            }
-        }
-        /* if this would oversubscribe the node and the user hasn't permitted
-         * oversubscription, then don't use it - since the total number of
-         * slots is adequate for this app, there should be room somewhere else
-         */
-        if (node->slots < (node->slots_inuse + num_procs_to_assign) &&
-            ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
-            opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps:rr: mapping no-span would oversubscribe node %s - ignoring it",
-                                node->name);
-            continue;
-        }
-        /* add this node to the map, if reqd */
-        if (!node->mapped) {
-            if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
-                ORTE_ERROR_LOG(idx);
-                return idx;
-            }
-            node->mapped = true;
-            OBJ_RETAIN(node);  /* maintain accounting on object */
-            ++(jdata->map->num_nodes);
-        }
-        /* get the number of objects of this type on this node */
-        nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
-        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
-                            "mca:rmaps:rr:byobj: nprocs-to-assign %d for %d objs on node %s", num_procs_to_assign, nobjs, node->name);
-        /* if there are no objects of this type, then report the error
-         * and abort - this can happen, for example, on systems that
-         * don't report "sockets" as an independent object. However, IF
-         * this object is the default one - i.e., not specified by the
-         * user - then we can fall back to mapping by slot
-         */
-        if (0 == nobjs) {
-            if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
-                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
-                               true, hwloc_obj_type_string(target), node->name);
-                return ORTE_ERR_SILENT;
-            } else {
-                /* this was the default mapping policy, so clear the map
-                 * of any prior work and indicate that map-by slot is reqd
-                 */
-                for (i=0; i < jdata->map->nodes->size; i++) {
-                    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
-                        continue;
-                    }
-                    for (idx=0; idx < node->procs->size; idx++) {
-                        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, idx))) {
-                            continue;
-                        }
-                        if (proc->name.jobid != jdata->jobid) {
-                            continue;
-                        }
-                        --node->num_procs;
-                        OBJ_RELEASE(proc);
-                        opal_pointer_array_set_item(node->procs, idx, NULL);
-                    }
-                    if (0 == node->num_procs) {
-                        node->mapped = false;
-                        OBJ_RELEASE(node);
-                        opal_pointer_array_set_item(jdata->map->nodes, i, NULL);
-                    }
-                }
-                return ORTE_ERR_NOT_SUPPORTED;
-            }
-        }
-        /* compute the number of procs to go on each object */
-        nperobj = num_procs_to_assign / nobjs;
-        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
-                            "mca:rmaps:rr:byobj: placing %d procs on each object", nperobj);
-        if ((int)(nperobj * nobjs) < num_procs_to_assign) {
-            /* compute how many objs need an extra proc */
-            nxtra_objs = num_procs_to_assign - nperobj * nobjs;
-            opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs);
-        }
-        /* loop through the number of objects */
-        for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) {
-            /* get the hwloc object */
-            if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) {
-                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
-                return ORTE_ERR_NOT_FOUND;
-            }
-            /* map the reqd number of procs */
-            if (0 < nxtra_objs) {
-                nprocs = nperobj + 1;
-                --nxtra_objs;
-            } else {
-                nprocs = nperobj;
-            }
-            for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
-                if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
-                    return ORTE_ERR_OUT_OF_RESOURCE;
-                }
-                nprocs_mapped++;
-                proc->locale = obj;
-            }
-        }
-        /* not all nodes are equal, so only set oversubscribed for
-         * this node if it is in that state
-         */
-        if (node->slots < (int)node->num_procs) {
-            /* flag the node as oversubscribed so that sched-yield gets
-             * properly set
-             */
-            node->oversubscribed = true;
-        }
-        if (nprocs_mapped == app->num_procs) {
-            /* we are done */
-            break;
-        }
-    }
+    second_pass = false;
+    do {
+        add_one = false;
+        OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
+            if (NULL == node->topology) {
+                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
+                               true, node->name);
+                return ORTE_ERR_SILENT;
+            }
+            /* get the number of objects of this type on this node */
+            nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
+            if (0 == nobjs) {
+                continue;
+            }
+            /* compute the number of procs to go on this node */
+            nprocs = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
+            if (nprocs < 1) {
+                if (second_pass) {
+                    /* already checked for oversubscription permission, so at least put
+                     * one proc on it
+                     */
+                    nprocs = 1;
+                } else {
+                    /* if the user hasn't permitted oversubscription, then don't use it
+                     * on the first pass
+                     */
+                    if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
+                        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
+                                            "mca:rmaps:rr: mapping no-span would oversubscribe node %s - ignoring it",
+                                            node->name);
+                        continue;
+                    }
+                }
+            }
+            /* add this node to the map, if reqd */
+            if (!node->mapped) {
+                if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
+                    ORTE_ERROR_LOG(idx);
+                    return idx;
+                }
+                node->mapped = true;
+                OBJ_RETAIN(node);  /* maintain accounting on object */
+                ++(jdata->map->num_nodes);
+            }
+            nmapped = 0;
+            do {
+                /* loop through the number of objects */
+                for (i=0; i < (int)nobjs && nmapped < nprocs && nprocs_mapped < (int)app->num_procs; i++) {
+                    /* get the hwloc object */
+                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) {
+                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+                        return ORTE_ERR_NOT_FOUND;
+                    }
+                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
+                        return ORTE_ERR_OUT_OF_RESOURCE;
+                    }
+                    nprocs_mapped++;
+                    nmapped++;
+                    proc->locale = obj;
+                }
+            } while (nmapped < nprocs && nprocs_mapped < (int)app->num_procs);
+            add_one = true;
+            /* not all nodes are equal, so only set oversubscribed for
+             * this node if it is in that state
+             */
+            if (node->slots < (int)node->num_procs) {
+                /* flag the node as oversubscribed so that sched-yield gets
+                 * properly set
+                 */
+                node->oversubscribed = true;
+            }
+            if (nprocs_mapped == app->num_procs) {
+                /* we are done */
+                break;
+            }
+        }
+        second_pass = true;
+    } while (add_one && nprocs_mapped < app->num_procs);
+    if (nprocs_mapped < app->num_procs) {
+        /* usually means there were no objects of the requested type */
+        return ORTE_ERR_NOT_FOUND;
+    }
     return ORTE_SUCCESS;
@@ -676,17 +575,13 @@ static int byobj_span(orte_job_t *jdata,
                       orte_vpid_t num_procs,
                       hwloc_obj_type_t target, unsigned cache_level)
 {
-    int i, j, nprocs_mapped, lag, delta, navg;
+    int i, j, nprocs_mapped, navg;
     orte_node_t *node;
     orte_proc_t *proc;
-    opal_list_item_t *item;
-    int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0;
-    int extra_procs_to_assign=0, nxtra_nodes=0, idx;
+    int nprocs, nxtra_objs;
+    int idx;
     hwloc_obj_t obj=NULL;
     unsigned int nobjs;
-    float balance;
-    bool add_one=false;
-    bool oversubscribed=false;
     opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                         "mca:rmaps:rr: mapping span by %s for job %s slots %d num_procs %lu",
@@ -694,66 +589,54 @@ static int byobj_span(orte_job_t *jdata,
                         ORTE_JOBID_PRINT(jdata->jobid),
                         (int)num_slots, (unsigned long)num_procs);
-    /* quick check to see if we can map all the procs - can't
-     * do more because we don't know how many total objects exist
-     * across all the nodes
-     */
+    /* quick check to see if we can map all the procs */
     if (num_slots < (int)app->num_procs * orte_rmaps_base.cpus_per_rank) {
         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                            true, app->num_procs, app->app);
             return ORTE_ERR_SILENT;
         }
-        oversubscribed = true;
     }
-    /* divide the procs evenly across all nodes - this is the
-     * average we have to maintain as we go, but we adjust
-     * the number on each node to reflect its available slots.
-     * Obviously, if all nodes have the same number of slots,
-     * then the avg is what we get on each node - this is
-     * the most common situation.
+    /* we know we have enough slots, or that oversubscrption is allowed, so
+     * next determine how many total objects we have to work with
      */
-    navg = app->num_procs / opal_list_get_size(node_list);
-    if (0 == navg) {
-        /* if there are less procs than nodes, we have to
-         * place at least one/node
-         */
-        navg = 1;
-    }
-    /* compute how many extra procs to put on each node */
-    balance = (float)((jdata->num_procs + app->num_procs) - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
-    extra_procs_to_assign = (int)balance;
-    if (0 < (balance - (float)extra_procs_to_assign)) {
-        /* compute how many nodes need an extra proc */
-        nxtra_nodes = (jdata->num_procs + app->num_procs) - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
-        /* add one so that we add an extra proc to the first nodes
-         * until all procs are mapped
-         */
-        extra_procs_to_assign++;
-        /* flag that we added one */
-        add_one = true;
-    }
-    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
-                        "mca:rmaps:rr: mapping by %s navg %d extra_procs %d extra_nodes %d",
-                        hwloc_obj_type_string(target),
-                        navg, extra_procs_to_assign, nxtra_nodes);
-    nprocs_mapped = 0;
-    lag = 0;
-    for (item = opal_list_get_first(node_list);
-         item != opal_list_get_end(node_list);
-         item = opal_list_get_next(item)) {
-        node = (orte_node_t*)item;
-        /* bozo check */
+    nobjs = 0;
+    OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
         if (NULL == node->topology) {
             orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                            true, node->name);
             return ORTE_ERR_SILENT;
         }
+        /* get the number of objects of this type on this node */
+        nobjs += opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
+    }
+    if (0 == nobjs) {
+        return ORTE_ERR_NOT_FOUND;
+    }
+    /* divide the procs evenly across all objects */
+    navg = app->num_procs / nobjs;
+    if (0 == navg) {
+        /* if there are less procs than objects, we have to
+         * place at least one/obj
+         */
+        navg = 1;
+    }
+    /* compute how many objs need an extra proc */
+    if (0 > (nxtra_objs = app->num_procs - (navg * nobjs))) {
+        nxtra_objs = 0;
+    }
+    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
+                        "mca:rmaps:rr: mapping by %s navg %d extra_objs %d",
+                        hwloc_obj_type_string(target),
+                        navg, nxtra_objs);
+    nprocs_mapped = 0;
+    OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
         /* add this node to the map, if reqd */
         if (!node->mapped) {
             if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
@@ -764,76 +647,10 @@ static int byobj_span(orte_job_t *jdata,
             OBJ_RETAIN(node);  /* maintain accounting on object */
             ++(jdata->map->num_nodes);
         }
-        /* compute the number of procs to go on this node */
-        if (add_one) {
-            if (0 == nxtra_nodes) {
-                --extra_procs_to_assign;
-                add_one = false;
-            } else {
-                --nxtra_nodes;
-            }
-        }
-        if (oversubscribed) {
-            /* everybody just takes their share */
-            num_procs_to_assign = navg + extra_procs_to_assign;
-        } else {
-            /* if we are not oversubscribed, then there are enough
-             * slots to handle all the procs. However, not every
-             * node will have the same number of slots, so we
-             * have to track how many procs to "shift" elsewhere
-             * to make up the difference
-             */
-            if (node->slots <= node->slots_inuse) {
-                /* if there are no extras to take, then we can
-                 * safely remove this node as we don't need it
-                 */
-                if (0 == extra_procs_to_assign) {
-                    opal_pointer_array_set_item(jdata->map->nodes, node->index, NULL);
-                    OBJ_RELEASE(node);
-                    --(jdata->map->num_nodes);
-                    /* update how many we are lagging behind */
-                    lag += navg;
-                    continue;
-                }
-                /* everybody has to take at least the extras */
-                num_procs_to_assign = extra_procs_to_assign;
-                /* update how many we are lagging behind */
-                lag += navg;
-            } else {
-                /* if slots < avg, then take all */
-                if ((node->slots - node->slots_inuse) < navg * orte_rmaps_base.cpus_per_rank) {
-                    num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
-                    /* update how many we are lagging behind */
-                    lag += navg - (node->slots - node->slots_inuse) + extra_procs_to_assign;
-                } else {
-                    /* take the avg plus as much of the "lag" as we can */
-                    delta = 0;
-                    if (0 < lag) {
-                        delta = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank - navg;
-                        if (lag < delta) {
-                            delta = lag;
-                        }
-                        lag -= delta;
-                    }
-                    num_procs_to_assign = navg + delta + extra_procs_to_assign;
-                }
-            }
-        }
         /* get the number of objects of this type on this node */
         nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                             "mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name);
-        /* compute the number of procs to go on each object */
-        nperobj = num_procs_to_assign / nobjs;
-        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
-                            "mca:rmaps:rr:byobj: placing %d procs on each object", nperobj);
-        if ((int)(nperobj * nobjs) < num_procs_to_assign) {
-            /* compute how many objs need an extra proc */
-            nxtra_objs = num_procs_to_assign - nperobj * nobjs;
-            opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs);
-        }
         /* loop through the number of objects */
         for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) {
             /* get the hwloc object */
@@ -841,13 +658,13 @@ static int byobj_span(orte_job_t *jdata,
                 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                 return ORTE_ERR_NOT_FOUND;
             }
-            /* map the reqd number of procs */
+            /* determine how many to map */
+            nprocs = navg;
             if (0 < nxtra_objs) {
-                nprocs = nperobj + 1;
-                --nxtra_objs;
-            } else {
-                nprocs = nperobj;
+                nprocs++;
+                nxtra_objs--;
             }
+            /* map the reqd number of procs */
             for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
                 if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                     return ORTE_ERR_OUT_OF_RESOURCE;
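
For readers of the rewritten orte_rmaps_rr_byobj() above, here is a stand-alone sketch of its two-pass idea: the first pass fills each node's objects only up to the node's free slots, and if procs remain (and oversubscription is permitted), later passes place at least one proc per node with objects until everything is mapped. All names, types, and the toy slot accounting below are invented for illustration; this is not the ORTE code.

    /*
     * Illustrative sketch only (hypothetical types and names) of the two-pass
     * "map by object" flow: honor free slots on the first pass, then keep
     * cycling to absorb leftover procs (oversubscription) until all are mapped.
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_node {
        int free_slots;   /* slots not yet in use on this node */
        int nobjs;        /* objects (e.g. sockets) of the target type */
    };

    static int map_byobj_sketch(struct toy_node *nodes, int nnodes, int nprocs)
    {
        int mapped = 0;
        bool second_pass = false;
        bool have_objs;

        do {
            have_objs = false;
            for (int n = 0; n < nnodes && mapped < nprocs; n++) {
                if (0 == nodes[n].nobjs) {
                    continue;                       /* nothing of the target type here */
                }
                have_objs = true;
                /* first pass honors the free slot count; later passes place at least one */
                int take = (nodes[n].free_slots > 0) ? nodes[n].free_slots
                                                     : (second_pass ? 1 : 0);
                for (int p = 0; p < take && mapped < nprocs; p++) {
                    int obj = p % nodes[n].nobjs;   /* round-robin across this node's objects */
                    printf("proc %d -> node %d, object %d\n", mapped, n, obj);
                    if (nodes[n].free_slots > 0) {
                        nodes[n].free_slots--;      /* toy slot accounting */
                    }
                    mapped++;
                }
            }
            second_pass = true;
        } while (have_objs && mapped < nprocs);

        return (mapped == nprocs) ? 0 : -1;         /* -1: no objects of the target type anywhere */
    }

    int main(void)
    {
        /* two nodes: 2 free slots / 2 objects and 1 free slot / 2 objects */
        struct toy_node nodes[2] = { { 2, 2 }, { 1, 2 } };
        /* 5 procs over 3 free slots: the second pass oversubscribes */
        return (0 == map_byobj_sketch(nodes, 2, 5)) ? 0 : 1;
    }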