From a3060cdd15347ce22a78863814bf6bd940e079c8 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 14 Sep 2012 22:01:19 +0000 Subject: [PATCH] Fix the bind_downward code - it was incorrectly looking across the entire node instead of only looking below the locale to which the proc had been assigned. In other words, if the proc was mapped to a core, then the only hwthreads that should be considered for binding are those directly below that core. The binding algo was incorrectly looking at ALL hwthreads in that scenario, causing the proc to be bound to an HT outside of the mapped location. This now results in the procs being bound within their assigned location. It also causes us to use only the 0th HT on a core unless --use-hwthread-cpus has been specified (in which case, we use all the HTs in a core). Bind to core binds you to all HTs regardless - the --use-hwthread-cpus only impacts the oversubscribed determination and when binding to HT. cmr:v1.7 This commit was SVN r27342. --- opal/mca/hwloc/base/base.h | 4 + opal/mca/hwloc/base/hwloc_base_open.c | 1 + opal/mca/hwloc/base/hwloc_base_util.c | 125 ++++++++++++++++++++++- opal/mca/hwloc/hwloc.h | 1 + orte/mca/rmaps/base/rmaps_base_binding.c | 90 +++++++--------- 5 files changed, 166 insertions(+), 55 deletions(-) diff --git a/opal/mca/hwloc/base/base.h b/opal/mca/hwloc/base/base.h index 82596c514a..19531a8d5d 100644 --- a/opal/mca/hwloc/base/base.h +++ b/opal/mca/hwloc/base/base.h @@ -145,6 +145,10 @@ OPAL_DECLSPEC unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t t hwloc_obj_type_t target, unsigned cache_level, opal_hwloc_resource_type_t rtype); +OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_find_min_bound_target_under_obj(hwloc_topology_t topo, + hwloc_obj_t obj, + hwloc_obj_type_t target, + unsigned cache_leve); OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_obj_by_type(hwloc_topology_t topo, hwloc_obj_type_t target, unsigned cache_level, diff --git a/opal/mca/hwloc/base/hwloc_base_open.c b/opal/mca/hwloc/base/hwloc_base_open.c index 5d7e3c0828..85c57af2d1 100644 --- a/opal/mca/hwloc/base/hwloc_base_open.c +++ b/opal/mca/hwloc/base/hwloc_base_open.c @@ -310,6 +310,7 @@ static void obj_data_const(opal_hwloc_obj_data_t *ptr) ptr->available = NULL; ptr->npus = 0; ptr->idx = UINT_MAX; + ptr->num_bound = 0; } static void obj_data_dest(opal_hwloc_obj_data_t *ptr) { diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c index 79386a6a5c..352cb50772 100644 --- a/opal/mca/hwloc/base/hwloc_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -436,6 +436,25 @@ unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo, unsigned int cnt; hwloc_cpuset_t cpuset; + /* if the object is a hwthread (i.e., HWLOC_OBJ_PU), + * then the answer is always 1 since there isn't + * anything underneath it + */ + if (HWLOC_OBJ_PU == obj->type) { + return 1; + } + + /* if the object is a core (i.e., HWLOC_OBJ_CORE) and + * we are NOT treating hwthreads as independent cpus, + * then the answer is also 1 since we don't allow + * you to use the underlying hwthreads as separate + * entities + */ + if (HWLOC_OBJ_CORE == obj->type && + !opal_hwloc_use_hwthreads_as_cpus) { + return 1; + } + data = (opal_hwloc_obj_data_t*)obj->userdata; if (NULL == data || 0 == data->npus) { @@ -552,7 +571,7 @@ unsigned int opal_hwloc_base_get_obj_idx(hwloc_topology_t topo, * a multi-step test :-( * * And, of course, we make things even worse because we don't - * always care about what is physically or logicallly present, + * always care about what is physically or logically present, * but rather what is available to us. For example, we don't * want to map or bind to a cpu that is offline, or one that * we aren't allowed by use by the OS. So we have to also filter @@ -735,6 +754,110 @@ unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo, return num_objs; } +static hwloc_obj_t df_search_min_bound(hwloc_topology_t topo, + hwloc_obj_t start, + hwloc_obj_type_t target, + unsigned cache_level, + unsigned int *min_bound) +{ + unsigned k; + hwloc_obj_t obj, save=NULL; + opal_hwloc_obj_data_t *data; + + if (target == start->type) { + if (HWLOC_OBJ_CACHE == start->type && cache_level != start->attr->cache.depth) { + goto notfound; + } + /* see how many procs are bound to us */ + data = (opal_hwloc_obj_data_t*)start->userdata; + if (NULL == data) { + data = OBJ_NEW(opal_hwloc_obj_data_t); + start->userdata = data; + } + OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_output, + "hwloc:base:min_bound_under_obj object %s:%u nbound %u min %u", + hwloc_obj_type_string(target), start->logical_index, + data->num_bound, *min_bound)); + if (data->num_bound < *min_bound) { + *min_bound = data->num_bound; + return start; + } + /* if we have more procs bound to us than the min, return NULL */ + return NULL; + } + + notfound: + for (k=0; k < start->arity; k++) { + obj = df_search_min_bound(topo, start->children[k], target, cache_level, min_bound); + if (NULL != obj) { + save = obj; + } + /* if the target level is HWTHREAD and we are NOT treating + * hwthreads as separate cpus, then we can only consider + * the 0th hwthread on a core + */ + if (HWLOC_OBJ_CORE == start->type && HWLOC_OBJ_PU == target && + !opal_hwloc_use_hwthreads_as_cpus) { + break; + } + } + + return save; +} + +hwloc_obj_t opal_hwloc_base_find_min_bound_target_under_obj(hwloc_topology_t topo, + hwloc_obj_t obj, + hwloc_obj_type_t target, + unsigned cache_level) +{ + unsigned int min_bound; + hwloc_obj_t loc; + + /* bozo check */ + if (NULL == topo || NULL == obj) { + OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_output, + "hwloc:base:find_min_bound_under_obj NULL %s", + (NULL == topo) ? "topology" : "object")); + return NULL; + } + + + /* if the object and target is the same type, then there is + * nothing under it, so just return itself + */ + if (target == obj->type) { + /* again, we have to treat caches differently as + * the levels distinguish them + */ + if (HWLOC_OBJ_CACHE == target && + cache_level < obj->attr->cache.depth) { + goto moveon; + } + return obj; + } + + moveon: + /* the hwloc accessors all report at the topo level, + * so we have to do some work + */ + min_bound = UINT_MAX; + + loc = df_search_min_bound(topo, obj, target, cache_level, &min_bound); + + if (HWLOC_OBJ_CACHE == target) { + OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_output, + "hwloc:base:min_bound_under_obj found min bound of %u on %s:%u:%u", + min_bound, hwloc_obj_type_string(target), + cache_level, loc->logical_index)); + } else { + OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_output, + "hwloc:base:min_bound_under_obj found min bound of %u on %s:%u", + min_bound, hwloc_obj_type_string(target), loc->logical_index)); + } + + return loc; +} + /* as above, only return the Nth instance of the specified object * type from inside the topology */ diff --git a/opal/mca/hwloc/hwloc.h b/opal/mca/hwloc/hwloc.h index 4228225b49..81ec3c4482 100644 --- a/opal/mca/hwloc/hwloc.h +++ b/opal/mca/hwloc/hwloc.h @@ -116,6 +116,7 @@ typedef struct { hwloc_cpuset_t available; unsigned int npus; unsigned int idx; + unsigned int num_bound; } opal_hwloc_obj_data_t; OBJ_CLASS_DECLARATION(opal_hwloc_obj_data_t); diff --git a/orte/mca/rmaps/base/rmaps_base_binding.c b/orte/mca/rmaps/base/rmaps_base_binding.c index 20bd897d3e..19ff432ba5 100644 --- a/orte/mca/rmaps/base/rmaps_base_binding.c +++ b/orte/mca/rmaps/base/rmaps_base_binding.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -235,10 +235,11 @@ static int bind_downwards(orte_job_t *jdata, orte_job_map_t *map; orte_node_t *node; orte_proc_t *proc; - hwloc_obj_t obj; + hwloc_obj_t trg_obj; hwloc_cpuset_t cpus; - unsigned int n, idx, minval, ncpus, nobjs, nsave = 0, *nbound=NULL; + unsigned int ncpus; struct hwloc_topology_support *support; + opal_hwloc_obj_data_t *data; opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps: bind downward for job %s with bindings %s", @@ -268,9 +269,6 @@ static int bind_downwards(orte_job_t *jdata, continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); - if (NULL != nbound) { - free(nbound); - } return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here @@ -287,32 +285,12 @@ static int bind_downwards(orte_job_t *jdata, membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); - if (NULL != nbound) { - free(nbound); - } return ORTE_ERR_SILENT; } } } - /* get the number of objects of this type on this node */ - nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, - cache_level, OPAL_HWLOC_AVAILABLE); - if (0 == nobjs) { - orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-bindable-objects", true, - node->name, hwloc_obj_type_string(target)); - return ORTE_ERR_SILENT; - } - /* setup the array */ - if (NULL == nbound) { - nbound = (unsigned int*)malloc(nobjs * sizeof(int)); - nsave = nobjs; - } else if (nsave < nobjs) { - nbound = (unsigned int*)realloc(nbound, nobjs * sizeof(int)); - } - memset(nbound, 0, nobjs * sizeof(int)); - - /* cycle thru the procs */ + /* cycle thru the procs */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; @@ -327,53 +305,55 @@ static int bind_downwards(orte_job_t *jdata, if (NULL != proc->cpu_bitmap) { continue; } - /* cycle across the target objects and select the one with - * minimum usage + /* we don't know if the target is a direct child of this locale, + * or if it is some depth below it, so we have to conduct a bit + * of a search. Let hwloc find the min usage one for us */ - minval = UINT_MAX; - idx = 0; - for (n=0; n < nobjs; n++) { - if (nbound[n] < minval) { - minval = nbound[n]; - idx = n; - } + trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology, + proc->locale, + target, cache_level); + if (NULL == trg_obj) { + /* there aren't any such targets under this object */ + orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); + return ORTE_ERR_SILENT; } /* track the number bound */ - ++nbound[idx]; + data = (opal_hwloc_obj_data_t*)trg_obj->userdata; + data->num_bound++; + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "%s GETTING NUMBER OF CPUS UNDER OBJECT %s[%d]", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + hwloc_obj_type_string(target), trg_obj->logical_index); /* get the number of cpus under this location */ - obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, - idx, OPAL_HWLOC_AVAILABLE); - if (NULL == obj || 0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) { + ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj); + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "%s GOT %d CPUS", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus); + if (0 == ncpus) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); - free(nbound); return ORTE_ERR_SILENT; } /* error out if adding a proc would cause overload and that wasn't allowed */ - if (ncpus < nbound[idx] && + if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, - nbound[idx], ncpus); - free(nbound); + data->num_bound, ncpus); return ORTE_ERR_SILENT; } /* bind the proc here */ - proc->bind_idx = idx; - cpus = opal_hwloc_base_get_available_cpus(node->topology, obj); + proc->bind_idx = trg_obj->logical_index; + cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj); hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "%s BOUND PROC %s TO %s[%s:%u] on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), - proc->cpu_bitmap, hwloc_obj_type_string(obj->type), - idx, node->name); + proc->cpu_bitmap, hwloc_obj_type_string(trg_obj->type), + proc->bind_idx, node->name); } } - if (NULL != nbound) { - free(nbound); - } - return ORTE_SUCCESS; } @@ -671,8 +651,10 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata) } return rc; } - /* HW threads are at the bottom, so all other bindings are upwards */ - if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_PU, 0))) { + /* HW threads are at the bottom, so we must have mapped somewhere + * above this level - and we need to bind downwards + */ + if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) { ORTE_ERROR_LOG(rc); } return rc;