Resolve the problem of binding in inverted topologies - check the relative depth of the map and bind objects in the topology, and let that determine whether we bind downward or upwards.
cmr=v1.7.5:reviewer=jsquyres:subject=Resolve the problem of binding in inverted topologies

This commit was SVN r30643.
This commit is contained in:
parent 0ee38353ba
commit ca0c806662
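To illustrate the idea in the commit message, here is a minimal sketch (not the committed code) of how the binding direction can be chosen from the relative depths of the mapping and binding object types in a node's hwloc topology. The helper name choose_bind_direction and the enum are hypothetical; the depth comparison and the bind_downwards/bind_upwards split mirror the logic added to orte_rmaps_base_compute_bindings() in the diff below.

/* Illustrative sketch only, assuming a loaded hwloc topology for the node.
 * A deeper (numerically larger) depth for the binding target means it sits
 * below the mapping target, so we search downward from the proc's locale;
 * otherwise we walk upward through the locale's parents. */
#include <hwloc.h>

typedef enum { BIND_DOWNWARDS, BIND_UPWARDS, BIND_ERROR } bind_dir_t;

static bind_dir_t choose_bind_direction(hwloc_topology_t topo,
                                        hwloc_obj_type_t map_obj,
                                        hwloc_obj_type_t bind_obj)
{
    int map_depth  = hwloc_get_type_depth(topo, map_obj);
    int bind_depth = hwloc_get_type_depth(topo, bind_obj);

    /* either level may be absent on this particular node's topology */
    if (map_depth < 0 || bind_depth < 0) {
        return BIND_ERROR;
    }
    return (bind_depth > map_depth) ? BIND_DOWNWARDS : BIND_UPWARDS;
}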
@@ -177,7 +177,9 @@ typedef uint16_t opal_binding_policy_t;
 #define OPAL_BIND_IF_SUPPORTED   0x1000
 #define OPAL_BIND_ALLOW_OVERLOAD 0x2000
 #define OPAL_BIND_GIVEN          0x4000
-/* binding policies */
+/* binding policies - any changes in these
+ * values must be reflected in orte/mca/rmaps/rmaps.h
+ */
 #define OPAL_BIND_TO_NONE        1
 #define OPAL_BIND_TO_BOARD       2
 #define OPAL_BIND_TO_NUMA        3
@@ -12,7 +12,7 @@
  * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2013      Intel, Inc. All rights reserved
+ * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -102,6 +102,7 @@ static void reset_usage(orte_node_t *node, orte_jobid_t jobid)
 }
 
 static int bind_upwards(orte_job_t *jdata,
+                        orte_node_t *node,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
 {
@@ -109,14 +110,12 @@ static int bind_upwards(orte_job_t *jdata,
      * until we find an object of type target - and then bind
      * the process to that target
      */
-    int i, j;
+    int j;
     orte_job_map_t *map;
-    orte_node_t *node;
    orte_proc_t *proc;
     hwloc_obj_t obj;
     hwloc_cpuset_t cpus;
     unsigned int idx, ncpus;
-    struct hwloc_topology_support *support;
     opal_hwloc_obj_data_t *data;
 
     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
@@ -126,169 +125,104 @@ static int bind_upwards(orte_job_t *jdata,
     /* initialize */
     map = jdata->map;
 
-    for (i=0; i < map->nodes->size; i++) {
-        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
+    /* cycle thru the procs */
+    for (j=0; j < node->procs->size; j++) {
+        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
             continue;
         }
-        if (!orte_do_not_launch) {
-            /* if we don't want to launch, then we are just testing the system,
-             * so ignore questions about support capabilities
-             */
-            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
-            /* check if topology supports cpubind - have to be careful here
-             * as Linux doesn't currently support thread-level binding. This
-             * may change in the future, though, and it isn't clear how hwloc
-             * interprets the current behavior. So check both flags to be sure.
-             */
-            if (!support->cpubind->set_thisproc_cpubind &&
-                !support->cpubind->set_thisthread_cpubind) {
-                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
-                    !(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
-                    /* we are not required to bind, so ignore this */
+        /* ignore procs from other jobs */
+        if (proc->name.jobid != jdata->jobid) {
+            continue;
+        }
+        /* ignore procs that have already been bound - should
+         * never happen, but safer
+         */
+        if (NULL != proc->cpu_bitmap) {
+            continue;
+        }
+        /* bozo check */
+        if (NULL == proc->locale) {
+            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                "BIND UPWARDS: LOCALE FOR PROC %s IS NULL",
+                                ORTE_NAME_PRINT(&proc->name));
+            return ORTE_ERR_SILENT;
+        }
+        /* starting at the locale, move up thru the parents
+         * to find the target object type
+         */
+        for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) {
+            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                "%s bind:upward target %s type %s",
+                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                hwloc_obj_type_string(target),
+                                hwloc_obj_type_string(obj->type));
+            if (target == obj->type) {
+                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                     continue;
                 }
-                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
-                return ORTE_ERR_SILENT;
-            }
-            /* check if topology supports membind - have to be careful here
-             * as hwloc treats this differently than I (at least) would have
-             * expected. Per hwloc, Linux memory binding is at the thread,
-             * and not process, level. Thus, hwloc sets the "thisproc" flag
-             * to "false" on all Linux systems, and uses the "thisthread" flag
-             * to indicate binding capability - don't warn if the user didn't
-             * specifically request binding
-             */
-            if (!support->membind->set_thisproc_membind &&
-                !support->membind->set_thisthread_membind &&
-                (OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
-                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
-                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
-                    membind_warned = true;
-                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
-                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
+                /* get its index */
+                if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
                     return ORTE_ERR_SILENT;
                 }
-            }
-        }
-        /* some systems do not report cores, and so we can get a situation where our
-         * default binding policy will fail for no necessary reason. So if we are
-         * computing a binding due to our default policy, and no cores are found
-         * on this node, just silently skip it - we will not bind
-         */
-        if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) &&
-            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "Unable to bind-to core by default on node %s as no cores detected",
-                                node->name);
-            continue;
-        }
-
-        /* we share topologies in order
-         * to save space, so we need to reset the usage info to reflect
-         * our own current state
-         */
-        reset_usage(node, jdata->jobid);
-
-        /* cycle thru the procs */
-        for (j=0; j < node->procs->size; j++) {
-            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
-                continue;
-            }
-            /* ignore procs from other jobs */
-            if (proc->name.jobid != jdata->jobid) {
-                continue;
-            }
-            /* ignore procs that have already been bound - should
-             * never happen, but safer
-             */
-            if (NULL != proc->cpu_bitmap) {
-                continue;
-            }
-            /* bozo check */
-            if (NULL == proc->locale) {
-                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                    "BIND UPWARDS: LOCALE FOR PROC %s IS NULL",
-                                    ORTE_NAME_PRINT(&proc->name));
-                return ORTE_ERR_SILENT;
-            }
-            /* starting at the locale, move up thru the parents
-             * to find the target object type
-             */
-            for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) {
-                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                    "%s bind:upward target %s type %s",
-                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                    hwloc_obj_type_string(target),
-                                    hwloc_obj_type_string(obj->type));
-                if (target == obj->type) {
-                    if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
-                        continue;
-                    }
-                    /* get its index */
-                    if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
-                        return ORTE_ERR_SILENT;
-                    }
-                    /* track the number bound */
-                    data = (opal_hwloc_obj_data_t*)obj->userdata;
-                    data->num_bound++;
-                    /* get the number of cpus under this location */
-                    if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
-                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
-                        return ORTE_ERR_SILENT;
-                    }
-                    /* error out if adding a proc would cause overload and that wasn't allowed,
-                     * and it wasn't a default binding policy (i.e., the user requested it)
-                     */
-                    if (ncpus < data->num_bound &&
-                        !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
-                        (OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
-                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
-                                       opal_hwloc_base_print_binding(map->binding), node->name,
-                                       data->num_bound, ncpus);
-                        return ORTE_ERR_SILENT;
-                    }
-                    /* bind it here */
-                    cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
-                    hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
-                    /* record the location */
-                    proc->bind_location = obj;
-                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                        "%s BOUND PROC %s TO %s[%s:%u] on node %s",
-                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                        ORTE_NAME_PRINT(&proc->name),
-                                        proc->cpu_bitmap,
-                                        hwloc_obj_type_string(target),
-                                        idx, node->name);
-                    break;
+                /* track the number bound */
+                data = (opal_hwloc_obj_data_t*)obj->userdata;
+                data->num_bound++;
+                /* get the number of cpus under this location */
+                if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
+                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
+                    return ORTE_ERR_SILENT;
                 }
-            }
-            if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
-                /* didn't find anyone to bind to - this is an error
-                 * unless the user specified if-supported
+                /* error out if adding a proc would cause overload and that wasn't allowed,
+                 * and it wasn't a default binding policy (i.e., the user requested it)
                  */
-                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
-                               opal_hwloc_base_print_binding(map->binding), node->name);
-                return ORTE_ERR_SILENT;
+                if (ncpus < data->num_bound &&
+                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
+                    (OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
+                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
+                                   opal_hwloc_base_print_binding(map->binding), node->name,
+                                   data->num_bound, ncpus);
+                    return ORTE_ERR_SILENT;
+                }
+                /* bind it here */
+                cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
+                hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
+                /* record the location */
+                proc->bind_location = obj;
+                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                    "%s BOUND PROC %s TO %s[%s:%u] on node %s",
+                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                    ORTE_NAME_PRINT(&proc->name),
+                                    proc->cpu_bitmap,
+                                    hwloc_obj_type_string(target),
+                                    idx, node->name);
+                break;
             }
         }
+        if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
+            /* didn't find anyone to bind to - this is an error
+             * unless the user specified if-supported
+             */
+            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
+                           opal_hwloc_base_print_binding(map->binding), node->name);
+            return ORTE_ERR_SILENT;
+        }
     }
 
     return ORTE_SUCCESS;
 }
 
 static int bind_downwards(orte_job_t *jdata,
+                          orte_node_t *node,
                           hwloc_obj_type_t target,
                           unsigned cache_level)
 {
-    int i, j;
+    int j;
     orte_job_map_t *map;
-    orte_node_t *node;
     orte_proc_t *proc;
     hwloc_obj_t trg_obj, nxt_obj;
     hwloc_cpuset_t cpus;
     unsigned int ncpus;
-    struct hwloc_topology_support *support;
     opal_hwloc_obj_data_t *data;
     int total_cpus;
     hwloc_cpuset_t totalcpuset;
@@ -301,172 +235,105 @@ static int bind_downwards(orte_job_t *jdata,
     map = jdata->map;
     totalcpuset = hwloc_bitmap_alloc();
 
-    for (i=0; i < map->nodes->size; i++) {
-        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
+    /* cycle thru the procs */
+    for (j=0; j < node->procs->size; j++) {
+        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
             continue;
         }
-        if (!orte_do_not_launch) {
-            /* if we don't want to launch, then we are just testing the system,
-             * so ignore questions about support capabilities
-             */
-            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
-            /* check if topology supports cpubind - have to be careful here
-             * as Linux doesn't currently support thread-level binding. This
-             * may change in the future, though, and it isn't clear how hwloc
-             * interprets the current behavior. So check both flags to be sure.
-             */
-            if (!support->cpubind->set_thisproc_cpubind &&
-                !support->cpubind->set_thisthread_cpubind) {
-                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
-                    !(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
-                    /* we are not required to bind, so ignore this */
-                    continue;
-                }
-                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
-                hwloc_bitmap_free(totalcpuset);
-                return ORTE_ERR_SILENT;
-            }
-            /* check if topology supports membind - have to be careful here
-             * as hwloc treats this differently than I (at least) would have
-             * expected. Per hwloc, Linux memory binding is at the thread,
-             * and not process, level. Thus, hwloc sets the "thisproc" flag
-             * to "false" on all Linux systems, and uses the "thisthread" flag
-             * to indicate binding capability - don't warn if the user didn't
-             * specifically request binding
-             */
-            if (!support->membind->set_thisproc_membind &&
-                !support->membind->set_thisthread_membind &&
-                (OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
-                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
-                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
-                    membind_warned = true;
-                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
-                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
-                    hwloc_bitmap_free(totalcpuset);
-                    return ORTE_ERR_SILENT;
-                }
-            }
-        }
-
-        /* some systems do not report cores, and so we can get a situation where our
-         * default binding policy will fail for no necessary reason. So if we are
-         * computing a binding due to our default policy, and no cores are found
-         * on this node, just silently skip it - we will not bind
-         */
-        if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) &&
-            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "Unable to bind-to core by default on node %s as no cores detected",
-                                node->name);
+        /* ignore procs from other jobs */
+        if (proc->name.jobid != jdata->jobid) {
             continue;
         }
-        /* we share topologies in order
-         * to save space, so we need to reset the usage info to reflect
-         * our own current state
+        /* ignore procs that have already been bound - should
+         * never happen, but safer
          */
-        reset_usage(node, jdata->jobid);
-        /* cycle thru the procs */
-        for (j=0; j < node->procs->size; j++) {
-            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
-                continue;
-            }
-            /* ignore procs from other jobs */
-            if (proc->name.jobid != jdata->jobid) {
-                continue;
-            }
-            /* ignore procs that have already been bound - should
-             * never happen, but safer
-             */
-            if (NULL != proc->cpu_bitmap) {
-                continue;
-            }
-            /* we don't know if the target is a direct child of this locale,
-             * or if it is some depth below it, so we have to conduct a bit
-             * of a search. Let hwloc find the min usage one for us.
-             */
-            trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology,
-                                                                      proc->locale,
-                                                                      target, cache_level);
-            if (NULL == trg_obj) {
-                /* there aren't any such targets under this object */
+        if (NULL != proc->cpu_bitmap) {
+            continue;
+        }
+        /* we don't know if the target is a direct child of this locale,
+         * or if it is some depth below it, so we have to conduct a bit
+         * of a search. Let hwloc find the min usage one for us.
+         */
+        trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology,
+                                                                  proc->locale,
+                                                                  target, cache_level);
+        if (NULL == trg_obj) {
+            /* there aren't any such targets under this object */
+            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
+            hwloc_bitmap_free(totalcpuset);
+            return ORTE_ERR_SILENT;
+        }
+        /* record the location */
+        proc->bind_location = trg_obj;
+        /* start with a clean slate */
+        hwloc_bitmap_zero(totalcpuset);
+        total_cpus = 0;
+        nxt_obj = trg_obj;
+        do {
+            if (NULL == nxt_obj) {
+                /* could not find enough cpus to meet request */
                 orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                 hwloc_bitmap_free(totalcpuset);
                 return ORTE_ERR_SILENT;
             }
-            /* record the location */
-            proc->bind_location = trg_obj;
-            /* start with a clean slate */
-            hwloc_bitmap_zero(totalcpuset);
-            total_cpus = 0;
-            nxt_obj = trg_obj;
-            do {
-                if (NULL == nxt_obj) {
-                    /* could not find enough cpus to meet request */
-                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
+            trg_obj = nxt_obj;
+            /* get the number of cpus under this location */
+            ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
+            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                "%s GOT %d CPUS",
+                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
+            /* track the number bound */
+            if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
+                data = OBJ_NEW(opal_hwloc_obj_data_t);
+                trg_obj->userdata = data;
+            }
+            data->num_bound++;
+            /* error out if adding a proc would cause overload and that wasn't allowed,
+             * and it wasn't a default binding policy (i.e., the user requested it)
+             */
+            if (ncpus < data->num_bound &&
+                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
+                if (OPAL_BIND_GIVEN & opal_hwloc_binding_policy) {
+                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
+                                   opal_hwloc_base_print_binding(map->binding), node->name,
+                                   data->num_bound, ncpus);
                     hwloc_bitmap_free(totalcpuset);
                     return ORTE_ERR_SILENT;
-                }
-                trg_obj = nxt_obj;
-                /* get the number of cpus under this location */
-                ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
-                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                    "%s GOT %d CPUS",
-                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
-                /* track the number bound */
-                if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
-                    data = OBJ_NEW(opal_hwloc_obj_data_t);
-                    trg_obj->userdata = data;
-                }
-                data->num_bound++;
-                /* error out if adding a proc would cause overload and that wasn't allowed,
-                 * and it wasn't a default binding policy (i.e., the user requested it)
-                 */
-                if (ncpus < data->num_bound &&
-                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
-                    if (OPAL_BIND_GIVEN & opal_hwloc_binding_policy) {
-                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
-                                       opal_hwloc_base_print_binding(map->binding), node->name,
-                                       data->num_bound, ncpus);
-                        hwloc_bitmap_free(totalcpuset);
-                        return ORTE_ERR_SILENT;
-                    } else {
-                        /* if this is the default binding policy, then just don't
-                         * bind this proc
-                         */
-                        data->num_bound--; // maintain count
-                        /* show the proc as not bound */
-                        proc->bind_location = NULL;
-                        hwloc_bitmap_zero(totalcpuset);
-                        break;
-                    }
-                }
-                /* bind the proc here */
-                cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
-                hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
-                /* track total #cpus */
-                total_cpus += ncpus;
-                /* move to the next location, in case we need it */
-                nxt_obj = trg_obj->next_cousin;
-            } while (total_cpus < orte_rmaps_base.cpus_per_rank);
-            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, totalcpuset);
-            if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
-                char tmp1[1024], tmp2[1024];
-                if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), totalcpuset)) {
-                    opal_output(orte_rmaps_base_framework.framework_output,
-                                "%s PROC %s ON %s IS NOT BOUND",
-                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                ORTE_NAME_PRINT(&proc->name), node->name);
                 } else {
-                    opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), totalcpuset);
-                    opal_output(orte_rmaps_base_framework.framework_output,
-                                "%s BOUND PROC %s[%s] TO %s: %s",
-                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                ORTE_NAME_PRINT(&proc->name), node->name,
-                                tmp1, tmp2);
+                    /* if this is the default binding policy, then just don't
+                     * bind this proc
+                     */
+                    data->num_bound--; // maintain count
+                    /* show the proc as not bound */
+                    proc->bind_location = NULL;
+                    hwloc_bitmap_zero(totalcpuset);
+                    break;
                 }
             }
+            /* bind the proc here */
+            cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
+            hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
+            /* track total #cpus */
+            total_cpus += ncpus;
+            /* move to the next location, in case we need it */
+            nxt_obj = trg_obj->next_cousin;
+        } while (total_cpus < orte_rmaps_base.cpus_per_rank);
+        hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, totalcpuset);
+        if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
+            char tmp1[1024], tmp2[1024];
+            if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), totalcpuset)) {
+                opal_output(orte_rmaps_base_framework.framework_output,
+                            "%s PROC %s ON %s IS NOT BOUND",
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                            ORTE_NAME_PRINT(&proc->name), node->name);
+            } else {
+                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), totalcpuset);
+                opal_output(orte_rmaps_base_framework.framework_output,
+                            "%s BOUND PROC %s[%s] TO %s: %s",
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                            ORTE_NAME_PRINT(&proc->name), node->name,
+                            tmp1, tmp2);
+            }
         }
     }
     hwloc_bitmap_free(totalcpuset);
@@ -715,17 +582,31 @@ static int bind_to_cpuset(orte_job_t *jdata)
 
 int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
 {
+    hwloc_obj_type_t hwb, hwm;
+    unsigned clvl=0;
+    opal_binding_policy_t bind;
+    orte_mapping_policy_t map;
+    orte_node_t *node;
+    int i, rc;
+    struct hwloc_topology_support *support;
+    bool force_down = false;
+    hwloc_cpuset_t totalcpuset;
+    int bind_depth, map_depth;
+
     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                         "mca:rmaps: compute bindings for job %s with policy %s",
                         ORTE_JOBID_PRINT(jdata->jobid),
                         opal_hwloc_base_print_binding(jdata->map->binding));
 
-    if (ORTE_MAPPING_BYUSER == ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) {
+    map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping);
+    bind = OPAL_GET_BINDING_POLICY(jdata->map->binding);
+
+    if (ORTE_MAPPING_BYUSER == map) {
         /* user specified binding by rankfile - nothing for us to do */
         return ORTE_SUCCESS;
     }
 
-    if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+    if (OPAL_BIND_TO_CPUSET == bind) {
         int rc;
         /* cpuset was given - setup the bindings */
         if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
@@ -734,17 +615,76 @@
            return rc;
         }
 
-    if (OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+    if (OPAL_BIND_TO_NONE == bind) {
         /* no binding requested */
         return ORTE_SUCCESS;
     }
 
-    if (OPAL_BIND_TO_BOARD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+    if (OPAL_BIND_TO_BOARD == bind) {
         /* doesn't do anything at this time */
         return ORTE_SUCCESS;
     }
 
-    /* binding requested */
+    /* binding requested - convert the binding level to the hwloc obj type */
+    switch (bind) {
+    case OPAL_BIND_TO_NUMA:
+        hwb = HWLOC_OBJ_NODE;
+        break;
+    case OPAL_BIND_TO_SOCKET:
+        hwb = HWLOC_OBJ_SOCKET;
+        break;
+    case OPAL_BIND_TO_L3CACHE:
+        hwb = HWLOC_OBJ_CACHE;
+        clvl = 3;
+        break;
+    case OPAL_BIND_TO_L2CACHE:
+        hwb = HWLOC_OBJ_CACHE;
+        clvl = 2;
+        break;
+    case OPAL_BIND_TO_L1CACHE:
+        hwb = HWLOC_OBJ_CACHE;
+        clvl = 1;
+        break;
+    case OPAL_BIND_TO_CORE:
+        hwb = HWLOC_OBJ_CORE;
+        break;
+    case OPAL_BIND_TO_HWTHREAD:
+        hwb = HWLOC_OBJ_PU;
+        break;
+    default:
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /* do the same for the mapping policy */
+    switch (map) {
+    case ORTE_MAPPING_BYNUMA:
+        hwm = HWLOC_OBJ_NODE;
+        break;
+    case ORTE_MAPPING_BYSOCKET:
+        hwm = HWLOC_OBJ_SOCKET;
+        break;
+    case ORTE_MAPPING_BYL3CACHE:
+        hwm = HWLOC_OBJ_CACHE;
+        clvl = 3;
+        break;
+    case ORTE_MAPPING_BYL2CACHE:
+        hwm = HWLOC_OBJ_CACHE;
+        clvl = 2;
+        break;
+    case ORTE_MAPPING_BYL1CACHE:
+        hwm = HWLOC_OBJ_CACHE;
+        clvl = 1;
+        break;
+    case ORTE_MAPPING_BYCORE:
+        hwm = HWLOC_OBJ_CORE;
+        break;
+    case ORTE_MAPPING_BYHWTHREAD:
+        hwm = HWLOC_OBJ_PU;
+        break;
+    default:
+        return ORTE_ERR_BAD_PARAM;
+    }
+
     /* if the job was mapped by the corresponding target, then
      * we bind in place
      *
@@ -758,41 +698,19 @@
      * procs to the resources below.
      */
 
-    if (ORTE_MAPPING_BYDIST == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+    if (ORTE_MAPPING_BYDIST == map) {
         int rc = ORTE_SUCCESS;
-        if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+        if (OPAL_BIND_TO_NUMA == bind) {
             opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                 "mca:rmaps: bindings for job %s - dist to numa",
                                 ORTE_JOBID_PRINT(jdata->jobid));
             if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                 ORTE_ERROR_LOG(rc);
             }
-        } else if (OPAL_BIND_TO_NUMA < OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-            if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
-                    ORTE_ERROR_LOG(rc);
-                }
-            } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
-                    ORTE_ERROR_LOG(rc);
-                }
-            } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
-                    ORTE_ERROR_LOG(rc);
-                }
-            } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
-                    ORTE_ERROR_LOG(rc);
-                }
-            } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
-                    ORTE_ERROR_LOG(rc);
-                }
-            } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
-                    ORTE_ERROR_LOG(rc);
-                }
-            }
+        } else if (OPAL_BIND_TO_NUMA < bind) {
+            /* bind every proc downwards */
+            force_down = true;
+            goto execute;
         }
         /* if the binding policy is less than numa, then we are unbound - so
          * just ignore this and return (should have been caught in prior
@@ -800,176 +718,127 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
          * and "board")
          */
         return rc;
-    } else if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-        int rc;
-        if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps: bindings for job %s - hwthread to hwthread",
-                                ORTE_JOBID_PRINT(jdata->jobid));
-            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_PU, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-            return rc;
-        }
-        /* HW threads are at the bottom, so we must have mapped somewhere
-         * above this level - and we need to bind downwards
-         */
-        if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
+    }
+
+    /* now deal with the remaining binding policies based on hardware */
+    if (bind == map) {
+        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                            "mca:rmaps: bindings for job %s - bind in place",
+                            ORTE_JOBID_PRINT(jdata->jobid));
+        if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) {
             ORTE_ERROR_LOG(rc);
         }
         return rc;
-    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-        int rc;
-        if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps: bindings for job %s - core to core",
-                                ORTE_JOBID_PRINT(jdata->jobid));
-            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CORE, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-            return rc;
-        }
-        /* if the mapping policy used is less than bycore, then it is a
-         * downward binding - i.e., the locale is above the binding location.
-         * for example, if we map-to-socket and bind-to-core, then we compare
-         * the mapping value of ORTE_MAPPING_BYCORE to ORTE_MAPPING_BYSOCKET.
-         * In this case, BYCORE > BYSOCKET, so we know that the locale is
-         * above the desired binding level (sockets are at a higher level than
-         * the desired core binding level), and we will have to bind downwards
-         */
-        if (ORTE_MAPPING_BYCORE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        } else {
-            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CORE, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        }
-        return rc;
-    } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-        int rc;
-        if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps: bindings for job %s - L1cache to L1cache",
-                                ORTE_JOBID_PRINT(jdata->jobid));
-            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 1))) {
-                ORTE_ERROR_LOG(rc);
-            }
-            return rc;
-        }
-        /* if the mapping policy is less than l1cache, then it is a
-         * downward binding
-         */
-        if (ORTE_MAPPING_BYL1CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        } else {
-            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 1))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        }
-        return rc;
-    } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-        int rc;
-        if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps: bindings for job %s - L2cache to L2cache",
-                                ORTE_JOBID_PRINT(jdata->jobid));
-            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 2))) {
-                ORTE_ERROR_LOG(rc);
-            }
-            return rc;
-        }
-        /* if the mapping policy is less than l2cache, then it is a
-         * downward binding
-         */
-        if (ORTE_MAPPING_BYL2CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        } else {
-            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 2))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        }
-        return rc;
-    } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-        int rc;
-        if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps: bindings for job %s - L3cache to L3cache",
-                                ORTE_JOBID_PRINT(jdata->jobid));
-            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 3))) {
-                ORTE_ERROR_LOG(rc);
-            }
-            return rc;
-        }
-        /* if the mapping policy is less than l3cache, then it is a
-         * downward binding
-         */
-        if (ORTE_MAPPING_BYL3CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        } else {
-            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 3))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        }
-        return rc;
-    } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-        int rc;
-        if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps: bindings for job %s - socket to socket",
-                                ORTE_JOBID_PRINT(jdata->jobid));
-            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_SOCKET, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-            return rc;
-        }
-        /* if the mapping policy is less than bysocket, then it is a
-         * downward binding
-         */
-        if (ORTE_MAPPING_BYSOCKET > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        } else {
-            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        }
-        return rc;
-    } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
-        int rc;
-        if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                "mca:rmaps: bindings for job %s - numa to numa",
-                                ORTE_JOBID_PRINT(jdata->jobid));
-            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-            return rc;
-        }
-        /* if the mapping policy is less than numa, then it is a
-         * downward binding
-         */
-        if (ORTE_MAPPING_BYNUMA > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
-            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_NODE, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        } else {
-            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_NODE, 0))) {
-                ORTE_ERROR_LOG(rc);
-            }
-        }
-        return rc;
     }
 
-    /* if we get here, then the binding policy isn't supported */
-    ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
-    return ORTE_ERR_NOT_SUPPORTED;
+    /* we need to handle the remaining binding options on a per-node
+     * basis because different nodes could potentially have different
+     * topologies, with different relative depths for the two levels
+     */
+ execute:
+    /* initialize */
+    totalcpuset = hwloc_bitmap_alloc();
+
+    for (i=0; i < jdata->map->nodes->size; i++) {
+        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
+            continue;
+        }
+        if (!orte_do_not_launch) {
+            /* if we don't want to launch, then we are just testing the system,
+             * so ignore questions about support capabilities
+             */
+            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
+            /* check if topology supports cpubind - have to be careful here
+             * as Linux doesn't currently support thread-level binding. This
+             * may change in the future, though, and it isn't clear how hwloc
+             * interprets the current behavior. So check both flags to be sure.
+             */
+            if (!support->cpubind->set_thisproc_cpubind &&
+                !support->cpubind->set_thisthread_cpubind) {
+                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
+                    !(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
+                    /* we are not required to bind, so ignore this */
+                    continue;
+                }
+                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
+                hwloc_bitmap_free(totalcpuset);
+                return ORTE_ERR_SILENT;
+            }
+            /* check if topology supports membind - have to be careful here
+             * as hwloc treats this differently than I (at least) would have
+             * expected. Per hwloc, Linux memory binding is at the thread,
+             * and not process, level. Thus, hwloc sets the "thisproc" flag
+             * to "false" on all Linux systems, and uses the "thisthread" flag
+             * to indicate binding capability - don't warn if the user didn't
+             * specifically request binding
+             */
+            if (!support->membind->set_thisproc_membind &&
+                !support->membind->set_thisthread_membind &&
+                (OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
+                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
+                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
+                    membind_warned = true;
+                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
+                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
+                    hwloc_bitmap_free(totalcpuset);
+                    return ORTE_ERR_SILENT;
+                }
+            }
+        }
+
+        /* some systems do not report cores, and so we can get a situation where our
+         * default binding policy will fail for no necessary reason. So if we are
+         * computing a binding due to our default policy, and no cores are found
+         * on this node, just silently skip it - we will not bind
+         */
+        if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) &&
+            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
+            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                "Unable to bind-to core by default on node %s as no cores detected",
+                                node->name);
+            continue;
+        }
+
+        /* we share topologies in order
+         * to save space, so we need to reset the usage info to reflect
+         * our own current state
+         */
+        reset_usage(node, jdata->jobid);
+
+        if (force_down) {
+            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+        } else {
+            /* determine the relative depth on this node */
+            bind_depth = hwloc_get_type_depth(node->topology, hwb);
+            if (0 > bind_depth) {
+                /* didn't find such an object */
+                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
+                               true, hwloc_obj_type_string(hwb), node->name);
+                return ORTE_ERR_SILENT;
+            }
+            map_depth = hwloc_get_type_depth(node->topology, hwm);
+            if (0 > map_depth) {
+                /* didn't find such an object */
+                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
+                               true, hwloc_obj_type_string(hwm), node->name);
+                return ORTE_ERR_SILENT;
+            }
+            if (bind_depth > map_depth) {
+                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            } else {
+                if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            }
+        }
+    }
+
+    return ORTE_SUCCESS;
 }
@@ -100,19 +100,25 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_map_t);
                 ((pol) & 0xff00)
 
 /* round-robin policies */
-#define ORTE_MAPPING_BYSLOT      1
-#define ORTE_MAPPING_BYNODE      2
-#define ORTE_MAPPING_BYBOARD     3
-#define ORTE_MAPPING_BYNUMA      4
-#define ORTE_MAPPING_BYSOCKET    5
-#define ORTE_MAPPING_BYL3CACHE   6
-#define ORTE_MAPPING_BYL2CACHE   7
-#define ORTE_MAPPING_BYL1CACHE   8
-#define ORTE_MAPPING_BYCORE      9
-#define ORTE_MAPPING_BYHWTHREAD  10
+/* start with hardware-based options
+ * so the values match the corresponding
+ * levels in opal/mca/hwloc/hwloc.h
+ */
+#define ORTE_MAPPING_BYNODE      1
+#define ORTE_MAPPING_BYBOARD     2
+#define ORTE_MAPPING_BYNUMA      3
+#define ORTE_MAPPING_BYSOCKET    4
+#define ORTE_MAPPING_BYL3CACHE   5
+#define ORTE_MAPPING_BYL2CACHE   6
+#define ORTE_MAPPING_BYL1CACHE   7
+#define ORTE_MAPPING_BYCORE      8
+#define ORTE_MAPPING_BYHWTHREAD  9
+/* now take the other round-robin options */
+#define ORTE_MAPPING_BYSLOT      10
 #define ORTE_MAPPING_BYDIST      11
 /* convenience - declare anything <= 15 to be round-robin*/
 #define ORTE_MAPPING_RR          0x000f
 
 /* sequential policy */
 #define ORTE_MAPPING_SEQ         20
 /* staged execution mapping */