Correctly detect and handle oversubscription for comm_spawn
cmr=v1.7.4:reviewer=jsquyres:subject=Correctly detect and handle oversubscription for comm_spawn

This commit was SVN r30186.
Этот коммит содержится в:
родитель
6e5fedeb04
Коммит
9fcb46d85a
@@ -490,22 +490,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
|
|||||||
node->name, node->slots, node->slots_inuse));
|
node->name, node->slots, node->slots_inuse));
|
||||||
opal_list_remove_item(allocated_nodes, item);
|
opal_list_remove_item(allocated_nodes, item);
|
||||||
OBJ_RELEASE(item); /* "un-retain" it */
|
OBJ_RELEASE(item); /* "un-retain" it */
|
||||||
} else {
|
} else if (node->slots > node->slots_inuse) {
|
||||||
if (node->slots > node->slots_inuse) {
|
|
||||||
/* add the available slots */
|
/* add the available slots */
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
|
||||||
"%s node %s has %d slots available",
|
"%s node %s has %d slots available",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
node->name, node->slots));
|
node->name, node->slots - node->slots_inuse));
|
||||||
num_slots += node->slots - node->slots_inuse;
|
num_slots += node->slots - node->slots_inuse;
|
||||||
} else {
|
|
||||||
/* always allocate at least one */
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
|
|
||||||
"%s node %s has %d slots %d used",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
node->name, node->slots, node->slots_inuse));
|
|
||||||
num_slots++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** go on to next item */
|
/** go on to next item */
|
||||||
|
@@ -98,6 +98,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
|
|||||||
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||||
"mca:rmaps:rr:slot assigning %d procs to node %s",
|
"mca:rmaps:rr:slot assigning %d procs to node %s",
|
||||||
(int)num_procs_to_assign, node->name);
|
(int)num_procs_to_assign, node->name);
|
||||||
|
|
||||||
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
|
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
|
||||||
/* add this node to the map - do it only once */
|
/* add this node to the map - do it only once */
|
||||||
if (!node->mapped) {
|
if (!node->mapped) {
|
||||||
@@ -507,16 +508,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
|
|||||||
true, node->name);
|
true, node->name);
|
||||||
return ORTE_ERR_SILENT;
|
return ORTE_ERR_SILENT;
|
||||||
}
|
}
|
||||||
/* add this node to the map, if reqd */
|
|
||||||
if (!node->mapped) {
|
|
||||||
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
|
|
||||||
ORTE_ERROR_LOG(idx);
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
node->mapped = true;
|
|
||||||
OBJ_RETAIN(node); /* maintain accounting on object */
|
|
||||||
++(jdata->map->num_nodes);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* compute the number of procs to go on this node */
|
/* compute the number of procs to go on this node */
|
||||||
if (add_one) {
|
if (add_one) {
|
||||||
@@ -538,6 +529,29 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* if this would oversubscribe the node and the user hasn't permitted
|
||||||
|
* oversubscription, then don't use it - since the total number of
|
||||||
|
* slots is adequate for this app, there should be room somewhere else
|
||||||
|
*/
|
||||||
|
if (node->slots < (node->slots_inuse + num_procs_to_assign) &&
|
||||||
|
ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||||
|
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||||
|
"mca:rmaps:rr: mapping no-span would oversubscribe node %s - ignoring it",
|
||||||
|
node->name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* add this node to the map, if reqd */
|
||||||
|
if (!node->mapped) {
|
||||||
|
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
|
||||||
|
ORTE_ERROR_LOG(idx);
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
node->mapped = true;
|
||||||
|
OBJ_RETAIN(node); /* maintain accounting on object */
|
||||||
|
++(jdata->map->num_nodes);
|
||||||
|
}
|
||||||
|
|
||||||
/* get the number of objects of this type on this node */
|
/* get the number of objects of this type on this node */
|
||||||
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
|
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
|
||||||
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user