1
1

Change the behavior of cpus-per-rank. We previously counted each cpu against the #slots. However, IBM has pointed out that "slot" is equated to the number of processes allowed to run on each node, and not the number of cpus on the node. This has been a continuing source of confusion, so make the distinction a "hard" one.

Each process occupies a "slot". We automatically set #slots = #cpus if nothing else is told to us. If you want to run more procs than slots, you must tell us to allow oversubscription.

A process can utilize multiple processing elements (PEs) if that option is given. If you try to bind more than one proc to a given PE, then we will error out unless you tell us to allow overloading.
Этот коммит содержится в:
Ralph Castain 2016-08-22 15:54:41 -07:00
родитель 6549c878a9
Коммит 7de4d6922b
5 изменённых файлов: 22 добавлений и 25 удалений

Просмотреть файл

@ -583,7 +583,7 @@ orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
proc->node = node;
node->num_procs++;
if (node->slots_inuse < node->slots) {
node->slots_inuse += orte_rmaps_base.cpus_per_rank;
++node->slots_inuse;
}
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -220,7 +220,7 @@ static int mindist_map(orte_job_t *jdata)
total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology, HWLOC_OBJ_CORE, 0, OPAL_HWLOC_AVAILABLE);
}
if (bynode) {
if (total_npus < num_procs_to_assign * orte_rmaps_base.cpus_per_rank) {
if (total_npus < num_procs_to_assign) {
/* check if oversubscribing is allowed */
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
@ -262,9 +262,9 @@ static int mindist_map(orte_job_t *jdata)
}
npus = opal_hwloc_base_get_npus(node->topology, obj);
if (bynode) {
required = ((num_procs_to_assign-j) > npus/orte_rmaps_base.cpus_per_rank) ? (npus/orte_rmaps_base.cpus_per_rank) : (num_procs_to_assign-j);
required = ((num_procs_to_assign-j) > npus) ? (npus) : (num_procs_to_assign-j);
} else {
required = npus/orte_rmaps_base.cpus_per_rank;
required = npus;
}
for (k = 0; (k < required) && (nprocs_mapped < app->num_procs); k++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
@ -367,7 +367,7 @@ static int mindist_map(orte_job_t *jdata)
nprocs_mapped++;
k++;
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
if (k > npus/orte_rmaps_base.cpus_per_rank-1) {
if (k > npus-1) {
numa_item = opal_list_get_next(numa_item);
if (numa_item == opal_list_get_end(&numa_list)) {
numa_item = opal_list_get_first(&numa_list);

Просмотреть файл

@ -130,7 +130,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
/* set the num_procs to equal the number of slots on these
* mapped nodes, taking into account the number of cpus/rank
*/
app->num_procs = num_slots / orte_rmaps_base.cpus_per_rank;
app->num_procs = num_slots;
/* sometimes, we have only one "slot" assigned, but may
* want more than one cpu/rank - so ensure we always wind
* up with at least one proc */

Просмотреть файл

@ -56,7 +56,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);
/* check to see if we can map all the procs */
if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) {
if (num_slots < (int)app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -85,11 +85,8 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
node->name);
continue;
}
/* assign a number of procs equal to the number of available
* slots divided by the number of cpus/rank the user
* requested
*/
num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
/* assign a number of procs equal to the number of available slots */
num_procs_to_assign = node->slots - node->slots_inuse;
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rr:slot assigning %d procs to node %s",
(int)num_procs_to_assign, node->name);
@ -168,7 +165,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
--nxtra_nodes;
}
}
num_procs_to_assign = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) + extra_procs_to_assign;
num_procs_to_assign = node->slots - node->slots_inuse + extra_procs_to_assign;
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rr:slot adding up to %d procs to node %s",
num_procs_to_assign, node->name);
@ -182,7 +179,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
@ -236,7 +233,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
(int)num_slots, (unsigned long)num_procs);
/* quick check to see if we can map all the procs */
if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) {
if (num_slots < (int)app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -336,8 +333,8 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
}
}
/* if slots < avg + extra (adjusted for cpus/proc), then try to take all */
if ((node->slots - node->slots_inuse) < ((navg + extra_procs_to_assign) * orte_rmaps_base.cpus_per_rank)) {
num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank;
if ((node->slots - node->slots_inuse) < (navg + extra_procs_to_assign)) {
num_procs_to_assign = node->slots - node->slots_inuse;
/* if we can't take any proc, skip following steps */
if (num_procs_to_assign == 0) {
continue;
@ -366,7 +363,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
@ -418,7 +415,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
@ -488,7 +485,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
(int)num_slots, (unsigned long)num_procs);
/* quick check to see if we can map all the procs */
if (num_slots < (app->num_procs * orte_rmaps_base.cpus_per_rank)) {
if (num_slots < app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -528,7 +525,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
start = (jdata->bkmark_obj + 1) % nobjs;
}
/* compute the number of procs to go on this node */
nprocs = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
nprocs = node->slots - node->slots_inuse;
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rr: calculated nprocs %d", nprocs);
if (nprocs < 1) {
@ -648,7 +645,7 @@ static int byobj_span(orte_job_t *jdata,
(int)num_slots, (unsigned long)num_procs);
/* quick check to see if we can map all the procs */
if (num_slots < (int)app->num_procs * orte_rmaps_base.cpus_per_rank) {
if (num_slots < (int)app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -743,7 +740,7 @@ static int byobj_span(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots < (int)node->num_procs * orte_rmaps_base.cpus_per_rank) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/

Просмотреть файл

@ -268,9 +268,9 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s dashhost: found existing node %s on input list - adding slots",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
/* transfer across the number of slots */
node->slots += nd->slots;
if (ORTE_FLAG_TEST(nd, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
/* transfer across the number of slots */
node->slots = nd->slots;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
}
break;