
Ensure we use the first compute node's topology for mapping

Don't filter the topology by cpuset when running as mpirun until we know that no other compute nodes are involved. This handles the corner case where mpirun executes on a node whose topology differs from that of the compute nodes.

Simplify - don't mandate that all cpus in the given cpuset be present on every node. We can then run everything through the filter as before, which ensures that any procs running on mpirun's node are also contained within the specified cpuset.

Correctly count the number of available PUs under each object when given a cpuset

Fix the default binding settings, and correctly count PUs when no cpuset is given

Ensure the binding policy gets set in all cases
This commit is contained in:
Ralph Castain 2015-03-16 17:32:04 -07:00
parent ccba8ce856
commit 43a3baad5e
12 changed files with 146 additions and 120 deletions
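
Several of the bullets above revolve around one quantity: how many PUs remain under a given object once a user-specified cpuset filter is applied. As orientation before the diffs, here is a minimal standalone sketch of that computation against the public hwloc 1.x API this code base uses; the hardwired "0,2-3" cpuset string and the output formatting are illustrative, not part of the commit.

#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_bitmap_t avail, rem;
    hwloc_obj_t core;
    int i, ncores;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    /* emulate a user-supplied default cpuset (cf. opal_hwloc_base_cpu_set) */
    avail = hwloc_bitmap_alloc();
    hwloc_bitmap_list_sscanf(avail, "0,2-3");

    /* for each core, count the PUs that survive the cpuset filter - the
     * quantity this commit now computes instead of assuming it is 1 */
    ncores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
    rem = hwloc_bitmap_alloc();
    for (i = 0; i < ncores; i++) {
        core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, i);
        hwloc_bitmap_and(rem, core->cpuset, avail);
        printf("core %d: %d PUs after cpuset filter\n",
               i, hwloc_bitmap_weight(rem));
    }

    hwloc_bitmap_free(rem);
    hwloc_bitmap_free(avail);
    hwloc_topology_destroy(topo);
    return 0;
}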

View file

@@ -28,12 +28,6 @@ The specified %s policy is not recognized:
Please check for a typo or ensure that the option is a supported
one.
#
[cpu-not-found]
A specified %s processor does not exist in this topology:
CPU number: %d
Cpu set given: %s
#
[redefining-policy]
Conflicting directives for binding policy are causing the policy
to be redefined:

View file

@@ -104,7 +104,9 @@ int opal_hwloc_unpack(opal_buffer_t *buffer, void *dest,
/* since we are loading this from an external source, we have to
* explicitly set a flag so hwloc sets things up correctly
*/
if (0 != hwloc_topology_set_flags(t, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) {
if (0 != hwloc_topology_set_flags(t, (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
rc = OPAL_ERROR;
hwloc_topology_destroy(t);
goto cleanup;
@@ -133,6 +135,11 @@ int opal_hwloc_unpack(opal_buffer_t *buffer, void *dest,
goto cleanup;
}
/* filter the cpus thru any default cpu set */
if (OPAL_SUCCESS != (rc = opal_hwloc_base_filter_cpus(t))) {
goto cleanup;
}
/* pass it back */
tarray[i] = t;
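
For context, a hedged sketch of what this hunk arranges, assuming the hwloc 1.x API in use here: a topology received from an external source (an XML buffer in this sketch) is loaded with IS_THISSYSTEM plus the newly added WHOLE_SYSTEM flag, so offline and disallowed PUs stay visible for the cpuset filter that now runs right after unpacking. The helper name is illustrative.

#include <hwloc.h>

static int load_external_topology(const char *xml, int len, hwloc_topology_t *out)
{
    hwloc_topology_t t;

    if (0 != hwloc_topology_init(&t)) {
        return -1;
    }
    /* the external source means hwloc can't probe; tell it this is still
     * "this system", and keep the whole machine plus I/O devices visible */
    if (0 != hwloc_topology_set_flags(t, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                                         HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
                                         HWLOC_TOPOLOGY_FLAG_IO_DEVICES) ||
        0 != hwloc_topology_set_xmlbuffer(t, xml, len) ||
        0 != hwloc_topology_load(t)) {
        hwloc_topology_destroy(t);
        return -1;
    }
    *out = t;
    return 0;
}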

View file

@@ -449,7 +449,8 @@ char* opal_hwloc_base_print_locality(opal_hwloc_locality_t locality)
static void obj_data_const(opal_hwloc_obj_data_t *ptr)
{
ptr->available = NULL;
ptr->npus = UINT_MAX;
ptr->npus_calculated = false;
ptr->npus = 0;
ptr->idx = UINT_MAX;
ptr->num_bound = 0;
}

View file

@@ -80,14 +80,12 @@ hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
* numbered within their sockets instead). So we find the
* specified PU, and then return the core object that contains it */
obj = hwloc_get_pu_obj_by_os_index(topo, lid);
if (NULL == obj) {
opal_show_help("help-opal-hwloc-base.txt",
"cpu-not-found", true, "physical",
lid, (NULL == opal_hwloc_base_cpu_set) ? "None" : opal_hwloc_base_cpu_set);
return NULL; // failed to find it
}
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
"physical cpu %d %s found in cpuset %s",
lid, (NULL == obj) ? "not" : "is",
(NULL == opal_hwloc_base_cpu_set) ? "None" : opal_hwloc_base_cpu_set));
/* we now need to shift upward to the core including this PU */
if (HWLOC_OBJ_CORE == obj_type) {
if (NULL != obj && HWLOC_OBJ_CORE == obj_type) {
obj = obj->parent;
}
return obj;
@@ -98,12 +96,10 @@ hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
/* Now do the actual lookup. */
obj = hwloc_get_obj_by_type(topo, obj_type, lid);
if (NULL == obj) {
opal_show_help("help-opal-hwloc-base.txt",
"cpu-not-found", true, "logical",
lid, (NULL == opal_hwloc_base_cpu_set) ? "None" : opal_hwloc_base_cpu_set);
return NULL;
}
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
"logical cpu %d %s found in cpuset %s",
lid, (NULL == obj) ? "not" : "is",
(NULL == opal_hwloc_base_cpu_set) ? "None" : opal_hwloc_base_cpu_set));
/* Found the right core (or PU). Return the object */
return obj;
@@ -117,6 +113,7 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo)
hwloc_obj_t root, pu;
hwloc_cpuset_t avail = NULL, pucpus, res;
opal_hwloc_topo_data_t *sum;
opal_hwloc_obj_data_t *data;
char **ranges=NULL, **range=NULL;
int idx, cpu, start, end;
@@ -129,8 +126,6 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo)
/* should only ever enter here once, but check anyway */
if (NULL != sum->available) {
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
"hwloc:base:filter_cpus specified - already done"));
return OPAL_SUCCESS;
}
@@ -156,43 +151,38 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo)
case 1:
/* only one cpu given - get that object */
cpu = strtoul(range[0], NULL, 10);
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu, OPAL_HWLOC_LOGICAL))) {
opal_argv_free(ranges);
opal_argv_free(range);
hwloc_bitmap_free(avail);
hwloc_bitmap_free(res);
hwloc_bitmap_free(pucpus);
return OPAL_ERR_SILENT;
if (NULL != (pu = opal_hwloc_base_get_pu(topo, cpu, OPAL_HWLOC_LOGICAL))) {
hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
hwloc_bitmap_or(res, avail, pucpus);
hwloc_bitmap_copy(avail, res);
data = (opal_hwloc_obj_data_t*)pu->userdata;
if (NULL == data) {
pu->userdata = (void*)OBJ_NEW(opal_hwloc_obj_data_t);
data = (opal_hwloc_obj_data_t*)pu->userdata;
}
data->npus++;
}
hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
hwloc_bitmap_or(res, avail, pucpus);
hwloc_bitmap_copy(avail, res);
break;
case 2:
/* range given */
start = strtoul(range[0], NULL, 10);
end = strtoul(range[1], NULL, 10);
for (cpu=start; cpu <= end; cpu++) {
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu, OPAL_HWLOC_LOGICAL))) {
opal_argv_free(ranges);
opal_argv_free(range);
hwloc_bitmap_free(avail);
hwloc_bitmap_free(res);
hwloc_bitmap_free(pucpus);
return OPAL_ERR_SILENT;
if (NULL != (pu = opal_hwloc_base_get_pu(topo, cpu, OPAL_HWLOC_LOGICAL))) {
hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
hwloc_bitmap_or(res, avail, pucpus);
hwloc_bitmap_copy(avail, res);
data = (opal_hwloc_obj_data_t*)pu->userdata;
if (NULL == data) {
pu->userdata = (void*)OBJ_NEW(opal_hwloc_obj_data_t);
data = (opal_hwloc_obj_data_t*)pu->userdata;
}
data->npus++;
}
hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
hwloc_bitmap_or(res, avail, pucpus);
hwloc_bitmap_copy(avail, res);
}
break;
default:
hwloc_bitmap_free(avail);
hwloc_bitmap_free(res);
hwloc_bitmap_free(pucpus);
opal_argv_free(ranges);
opal_argv_free(range);
return OPAL_ERR_BAD_PARAM;
break;
}
opal_argv_free(range);
}
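
The range handling above leans on OPAL utilities; as a rough standalone equivalent (plain strtok_r in place of opal_argv_split, names illustrative), the same parse-and-accumulate logic looks like the sketch below. Note the behavioral change this hunk makes: a cpu that does not exist in the topology is now skipped instead of aborting the whole filter.

#include <hwloc.h>
#include <stdlib.h>
#include <string.h>

static hwloc_bitmap_t parse_cpu_list(hwloc_topology_t topo, const char *spec)
{
    hwloc_bitmap_t avail = hwloc_bitmap_alloc();
    char *copy = strdup(spec), *tok, *dash, *save = NULL;
    unsigned cpu, start, end;
    hwloc_obj_t pu;

    /* walk comma-separated entries of the form "N" or "N-M", e.g. "0-3,7" */
    for (tok = strtok_r(copy, ",", &save); tok; tok = strtok_r(NULL, ",", &save)) {
        if (NULL != (dash = strchr(tok, '-'))) {
            start = strtoul(tok, NULL, 10);
            end = strtoul(dash + 1, NULL, 10);
        } else {
            start = end = strtoul(tok, NULL, 10);
        }
        for (cpu = start; cpu <= end; cpu++) {
            /* logical PU lookup; silently skip cpus this topology lacks */
            pu = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, cpu);
            if (NULL != pu) {
                hwloc_bitmap_or(avail, avail, pu->cpuset);
            }
        }
    }
    free(copy);
    return avail;
}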
@@ -249,7 +239,7 @@ static void fill_cache_line_size(void)
int opal_hwloc_base_get_topology(void)
{
int rc;
int rc=OPAL_SUCCESS;
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
"hwloc:base:get_topology"));
@@ -262,15 +252,11 @@ int opal_hwloc_base_get_topology(void)
0 != hwloc_topology_load(opal_hwloc_topology)) {
return OPAL_ERR_NOT_SUPPORTED;
}
/* filter the cpus thru any default cpu set */
rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology);
if (OPAL_SUCCESS != rc) {
if (OPAL_SUCCESS != (rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
return rc;
}
} else {
rc = opal_hwloc_base_set_topology(opal_hwloc_base_topo_file);
if (OPAL_SUCCESS != rc) {
if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(opal_hwloc_base_topo_file))) {
return rc;
}
}
@@ -435,7 +421,7 @@ hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
opal_hwloc_topo_data_t *rdata;
opal_hwloc_obj_data_t *data;
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
OPAL_OUTPUT_VERBOSE((10, opal_hwloc_base_framework.framework_output,
"hwloc:base: get available cpus"));
/* get the node-level information */
@@ -448,8 +434,6 @@ hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
"hwloc:base:get_available_cpus first time - filtering cpus"));
}
/* ensure the topo-level cpuset was prepared */
opal_hwloc_base_filter_cpus(topo);
/* are we asking about the root object? */
if (obj == root) {
@@ -498,9 +482,17 @@ hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
static void df_search_cores(hwloc_obj_t obj, unsigned int *cnt)
{
unsigned k;
opal_hwloc_obj_data_t *data;
if (HWLOC_OBJ_CORE == obj->type) {
*cnt += 1;
data = (opal_hwloc_obj_data_t*)obj->userdata;
if (NULL == data) {
return;
}
if (NULL == opal_hwloc_base_cpu_set) {
data->npus = 1;
}
*cnt += data->npus;
return;
}
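
The hunk shows only the core-type branch of df_search_cores; the recursion over children sits outside it. A minimal sketch of the whole traversal, with the commit's change marked: each core now contributes its available-PU count rather than a constant 1. The core_available_pus() helper is an illustrative stand-in, not the OPAL code.

#include <hwloc.h>

/* illustrative stand-in for the cached, filter-aware count that the real
 * code keeps in opal_hwloc_obj_data_t's npus field */
static unsigned int core_available_pus(hwloc_obj_t core)
{
    return (unsigned int)hwloc_bitmap_weight(core->cpuset);
}

static void count_core_pus(hwloc_obj_t obj, unsigned int *cnt)
{
    unsigned k;

    if (HWLOC_OBJ_CORE == obj->type) {
        /* was: *cnt += 1; each core now adds its available-PU count,
         * which a cpuset filter may have reduced */
        *cnt += core_available_pus(obj);
        return;
    }
    for (k = 0; k < obj->arity; k++) {
        count_core_pus(obj->children[k], cnt);
    }
}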
@@ -547,28 +539,8 @@ unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
unsigned int cnt = 0;
hwloc_cpuset_t cpuset;
/* if the object is a hwthread (i.e., HWLOC_OBJ_PU),
* then the answer is always 1 since there isn't
* anything underneath it
*/
if (HWLOC_OBJ_PU == obj->type) {
return 1;
}
/* if the object is a core (i.e., HWLOC_OBJ_CORE) and
* we are NOT treating hwthreads as independent cpus,
* then the answer is also 1 since we don't allow
* you to use the underlying hwthreads as separate
* entities
*/
if (HWLOC_OBJ_CORE == obj->type &&
!opal_hwloc_use_hwthreads_as_cpus) {
return 1;
}
data = (opal_hwloc_obj_data_t*)obj->userdata;
if (NULL == data || UINT_MAX == data->npus) {
if (NULL == data || !data->npus_calculated) {
if (!opal_hwloc_use_hwthreads_as_cpus) {
/* if we are treating cores as cpus, then we really
* want to know how many cores are in this object.
@@ -618,6 +590,7 @@ unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
obj->userdata = (void*)data;
}
data->npus = cnt;
data->npus_calculated = true;
}
return data->npus;
@@ -876,6 +849,10 @@ static hwloc_obj_t df_search_min_bound(hwloc_topology_t topo,
opal_hwloc_obj_data_t *data;
if (target == start->type) {
/* only consider procs that are allowed */
if (0 == (k = opal_hwloc_base_get_npus(topo, start))) {
goto notfound;
}
if (HWLOC_OBJ_CACHE == start->type && cache_level != start->attr->cache.depth) {
goto notfound;
}
@@ -885,6 +862,7 @@ static hwloc_obj_t df_search_min_bound(hwloc_topology_t topo,
data = OBJ_NEW(opal_hwloc_obj_data_t);
start->userdata = data;
}
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
"hwloc:base:min_bound_under_obj object %s:%u nbound %u min %u",
hwloc_obj_type_string(target), start->logical_index,

View file

@@ -143,6 +143,7 @@ typedef uint8_t opal_hwloc_resource_type_t;
typedef struct {
opal_object_t super;
hwloc_cpuset_t available;
bool npus_calculated;
unsigned int npus;
unsigned int idx;
unsigned int num_bound;
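
Why the struct gains a boolean instead of keeping the UINT_MAX sentinel: once a cpuset filter can legitimately leave an object with zero available PUs, 0 must be a storable result, so "not yet computed" needs its own flag. A minimal sketch of the resulting lazy-cache pattern, with a hypothetical compute_available_pus() standing in for the real counting walk:

#include <hwloc.h>
#include <stdbool.h>

/* placeholder for the real filtered count (illustrative only) */
static unsigned int compute_available_pus(hwloc_topology_t topo, hwloc_obj_t obj)
{
    return (unsigned int)hwloc_get_nbobjs_inside_cpuset_by_type(topo, obj->cpuset,
                                                                HWLOC_OBJ_PU);
}

typedef struct {
    bool         npus_calculated;  /* has npus been computed yet? */
    unsigned int npus;             /* cached count; 0 is now a valid value */
} pu_cache_t;

static unsigned int cached_npus(pu_cache_t *c, hwloc_topology_t topo, hwloc_obj_t obj)
{
    if (!c->npus_calculated) {
        c->npus = compute_available_pus(topo, obj);  /* expensive walk, once */
        c->npus_calculated = true;
    }
    return c->npus;
}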

View file

@@ -145,7 +145,7 @@ int orte_ess_base_orted_setup(char **hosts)
/* get the local topology */
if (NULL == opal_hwloc_topology) {
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
error = "topology discovery";
goto error;
}

View file

@@ -197,7 +197,7 @@ static int rte_init(void)
{
/* get the local topology */
if (NULL == opal_hwloc_topology) {
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
error = "topology discovery";
goto error;
}

View file

@@ -96,16 +96,6 @@ static int rte_init(void)
goto error;
}
#if OPAL_HAVE_HWLOC
/* get the topology */
if (NULL == opal_hwloc_topology) {
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
error = "topology discovery";
goto error;
}
}
#endif
/* we don't have to call pmix.init because the pmix select did it */
/**** THE FOLLOWING ARE REQUIRED VALUES ***/
@@ -202,6 +192,16 @@ static int rte_init(void)
free(string_key);
}
#if OPAL_HAVE_HWLOC
/* if it wasn't passed down to us, get the topology */
if (NULL == opal_hwloc_topology) {
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
error = "topology discovery";
goto error;
}
}
#endif
/* we don't need to force the routed system to pick the
* "direct" component as that should happen automatically
* in those cases where we are direct launched (i.e., no

View file

@@ -13,7 +13,7 @@
* Copyright (c) 2009 Institut National de Recherche en Informatique
* et Automatique. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@@ -85,21 +85,32 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
#if OPAL_HAVE_HWLOC
{
orte_topology_t *t;
hwloc_topology_t t;
orte_node_t *node;
int i;
/* set the nodes to point to the topology
* of mpirun's node for any nodes that didn't send
* back their topology, thus indicating they are different
*/
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
for (i=1; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
/* if we got back topology info from the first node, then we use
* it as the "standard" for all other nodes unless they sent
* back their own topology */
if (1 < orte_process_info.num_procs) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 1)) ||
NULL == (t = node->topology)) {
/* something is wrong */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
OBJ_RELEASE(caddy);
return;
}
if (NULL == node->topology) {
node->topology = t->topo;
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:setting topo to that from node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
for (i=2; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (NULL == node->topology) {
node->topology = t;
}
}
}
/* if this is an unmanaged allocation, then set the default
@@ -874,12 +885,14 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
if (10 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
opal_dss.dump(0, topo, OPAL_HWLOC_TOPO);
}
if (orte_hetero_nodes) {
if (1 == dname.vpid || orte_hetero_nodes) {
/* the user has told us that something is different, so just store it */
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s ADDING TOPOLOGY PER USER REQUEST",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
t = OBJ_NEW(orte_topology_t);
/* filter the topology as we'll need it that way later */
opal_hwloc_base_filter_cpus(topo);
t->topo = topo;
t->sig = sig;
opal_pointer_array_add(orte_node_topologies, t);
@@ -909,6 +922,8 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
"%s NEW TOPOLOGY - ADDING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
t = OBJ_NEW(orte_topology_t);
/* filter the topology as we'll need it that way later */
opal_hwloc_base_filter_cpus(topo);
t->topo = topo;
t->sig = sig;
opal_pointer_array_add(orte_node_topologies, t);

View file

@@ -351,6 +351,10 @@ static int bind_downwards(orte_job_t *jdata,
nxt_obj = trg_obj->next_cousin;
} while (total_cpus < orte_rmaps_base.cpus_per_rank);
hwloc_bitmap_list_asprintf(&cpu_bitmap, totalcpuset);
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"%s PROC %s BITMAP %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), cpu_bitmap);
orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
if (NULL != cpu_bitmap) {
free(cpu_bitmap);
@@ -680,9 +684,9 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
int bind_depth, map_depth;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: compute bindings for job %s with policy %s",
"mca:rmaps: compute bindings for job %s with policy %s[%x]",
ORTE_JOBID_PRINT(jdata->jobid),
opal_hwloc_base_print_binding(jdata->map->binding));
opal_hwloc_base_print_binding(jdata->map->binding), jdata->map->binding);
map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping);
bind = OPAL_GET_BINDING_POLICY(jdata->map->binding);

View file

@@ -153,10 +153,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
/* ranking was already handled, so just use it here */
map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
map->binding = opal_hwloc_binding_policy;
#endif
if (NULL != orte_rmaps_base.ppr) {
map->ppr = strdup(orte_rmaps_base.ppr);
}
@@ -231,14 +227,42 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
jdata->map->ranking = orte_rmaps_base.ranking;
}
#if OPAL_HAVE_HWLOC
if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
jdata->map->binding = opal_hwloc_binding_policy;
}
#endif
}
#if OPAL_HAVE_HWLOC
/* define the binding policy for this job - if the user specified one
* already (e.g., during the call to comm_spawn), then we don't
* override it */
if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
/* if the user specified a default binding policy via
* MCA param, then we use it */
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
jdata->map->binding = opal_hwloc_binding_policy;
} else {
/* if nothing was specified, then we default to a policy
* based on number of procs and cpus_per_rank */
if (2 <= nprocs) {
if (1 < orte_rmaps_base.cpus_per_rank) {
/* assigning multiple cpus to a rank implies threading,
* so we only bind to the NUMA level */
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
} else {
/* for performance, bind to core */
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CORE);
}
} else {
if (1 < orte_rmaps_base.cpus_per_rank) {
/* assigning multiple cpus to a rank implies threading,
* so we only bind to the NUMA level */
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
} else {
/* for performance, bind to socket */
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_SOCKET);
}
}
}
}
/* if we are not going to launch, then we need to set any
* undefined topologies to match our own so the mapper
* can operate
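
For readability, the default-binding decision above distills into a small pure function. This is a sketch with illustrative names, not the ORTE API (the real code sets flags on jdata->map->binding via OPAL_SET_BINDING_POLICY):

typedef enum { BIND_TO_NUMA, BIND_TO_CORE, BIND_TO_SOCKET } bind_policy_t;

static bind_policy_t default_binding(int nprocs, int cpus_per_rank)
{
    if (cpus_per_rank > 1) {
        /* multiple cpus per rank implies threading: bind wide, to NUMA */
        return BIND_TO_NUMA;
    }
    /* one cpu per rank: bind to core when 2+ procs share the node,
     * to socket for a lone proc */
    return (nprocs >= 2) ? BIND_TO_CORE : BIND_TO_SOCKET;
}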

View file

@@ -15,7 +15,7 @@
* Copyright (c) 2009 Institut National de Recherche en Informatique
* et Automatique. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -752,8 +752,10 @@ int orte_daemon(int argc, char *argv[])
char *coprocessors;
uint8_t tflag;
/* add the local topology, if different from the HNP's or user directed us to */
if (orte_hetero_nodes || 0 != strcmp(orte_topo_signature, orted_globals.hnp_topo_sig)) {
/* add the local topology, if different from the HNP's or user directed us to,
* but always if we are the first daemon to ensure we get a compute node */
if (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes ||
0 != strcmp(orte_topo_signature, orted_globals.hnp_topo_sig)) {
tflag = 1;
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &tflag, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(ret);