1
1

Finish the binding cleanup by removing the no-longer-used binding level scheme. This proved to be fallible as there is no guarantee that the hierarchy it used matched physical reality of the machine (e.g., is L3 "above" the socket or not). Still have to complete the ppr update, but get the rest of it correct.

This commit was SVN r28223.
This commit is contained in:
Ralph Castain 2013-03-26 20:09:49 +00:00
parent 44e371a65d
commit 317915225c
13 changed files with 80 additions and 287 deletions

View File

@ -243,8 +243,10 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
/* If we find >0 NUMA nodes, then investigate further */
if (i > 0) {
opal_hwloc_level_t bind_level;
unsigned int bind_index;
int numa, w;
unsigned n_bound=0;
hwloc_cpuset_t avail;
hwloc_obj_t obj;
/* JMS This tells me how many numa nodes are *available*,
but it's not how many are being used *by this job*.
@ -254,27 +256,30 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
used *in this job*. */
mca_btl_sm_component.num_mem_nodes = num_mem_nodes = i;
/* Fill opal_hwloc_my_cpuset and find out to what level
this process is bound (if at all) */
opal_hwloc_base_get_local_cpuset();
opal_hwloc_base_get_level_and_index(opal_hwloc_my_cpuset,
&bind_level, &bind_index);
if (OPAL_HWLOC_NODE_LEVEL != bind_level) {
/* We are bound to *something* (i.e., our binding
level is less than "node", meaning the entire
machine), so discover which NUMA node this process
is bound */
if (OPAL_HWLOC_NUMA_LEVEL == bind_level) {
mca_btl_sm_component.mem_node = my_mem_node = (int) bind_index;
} else {
if (OPAL_SUCCESS ==
opal_hwloc_base_get_local_index(HWLOC_OBJ_NODE, 0, &bind_index)) {
mca_btl_sm_component.mem_node = my_mem_node = (int) bind_index;
} else {
/* Weird. We can't figure out what NUMA node
we're on. :-( */
mca_btl_sm_component.mem_node = my_mem_node = -1;
/* if we are not bound, then there is nothing further to do */
if (NULL != ompi_process_info.cpuset) {
/* count the number of NUMA nodes to which we are bound */
for (w=0; w < i; w++) {
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0, w,
OPAL_HWLOC_AVAILABLE))) {
continue;
}
/* get that NUMA node's available cpus */
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
/* see if we intersect */
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
n_bound++;
numa = w;
}
}
/* if we are located on more than one NUMA, or we didn't find
* a NUMA we are on, then not much we can do
*/
if (1 == n_bound) {
mca_btl_sm_component.mem_node = my_mem_node = numa;
} else {
mca_btl_sm_component.mem_node = my_mem_node = -1;
}
}
}

View File

@ -246,8 +246,10 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
/* If we find >0 NUMA nodes, then investigate further */
if (i > 0) {
opal_hwloc_level_t bind_level;
unsigned int bind_index;
int numa, w;
unsigned n_bound=0;
hwloc_cpuset_t avail;
hwloc_obj_t obj;
/* JMS This tells me how many numa nodes are *available*,
but it's not how many are being used *by this job*.
@ -255,29 +257,32 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
the previous carto-based implementation), but it really
should be improved to be how many NUMA nodes are being
used *in this job*. */
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;
/* Fill opal_hwloc_my_cpuset and find out to what level
this process is bound (if at all) */
opal_hwloc_base_get_local_cpuset();
opal_hwloc_base_get_level_and_index(opal_hwloc_my_cpuset,
&bind_level, &bind_index);
if (OPAL_HWLOC_NODE_LEVEL != bind_level) {
/* We are bound to *something* (i.e., our binding
level is less than "node", meaning the entire
machine), so discover which NUMA node this process
is bound */
if (OPAL_HWLOC_NUMA_LEVEL == bind_level) {
mca_btl_smcuda_component.mem_node = my_mem_node = (int) bind_index;
} else {
if (OPAL_SUCCESS ==
opal_hwloc_base_get_local_index(HWLOC_OBJ_NODE, 0, &bind_index)) {
mca_btl_smcuda_component.mem_node = my_mem_node = (int) bind_index;
} else {
/* Weird. We can't figure out what NUMA node
we're on. :-( */
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
/* if we are not bound, then there is nothing further to do */
if (NULL != ompi_process_info.cpuset) {
/* count the number of NUMA nodes to which we are bound */
for (w=0; w < i; w++) {
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0, w,
OPAL_HWLOC_AVAILABLE))) {
continue;
}
/* get that NUMA node's available cpus */
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
/* see if we intersect */
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
n_bound++;
numa = w;
}
}
/* if we are located on more than one NUMA, or we didn't find
* a NUMA we are on, then not much we can do
*/
if (1 == n_bound) {
mca_btl_smcuda_component.mem_node = my_mem_node = numa;
} else {
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
}
}
}

View File

@ -90,6 +90,7 @@ struct ompi_process_info_t {
char *job_session_dir;
char *proc_session_dir;
char nodename[100]; /* BWB: FIX ME: This really needs to be a rational constant */
char *cpuset;
};
typedef struct ompi_process_info_t ompi_process_info_t;
#define OMPI_LOCAL_RANK_INVALID (-1)

View File

@ -130,6 +130,7 @@ ompi_rte_init(int *argc, char ***argv)
ompi_process_info.job_session_dir = NULL; /* BWB: FIX ME */
ompi_process_info.proc_session_dir = NULL; /* BWB: FIX ME */
gethostname(ompi_process_info.nodename, sizeof(ompi_process_info.nodename));
ompi_process_info.cpuset = NULL;
/* setup hwloc */
if (NULL == opal_hwloc_topology) {
@ -149,6 +150,7 @@ ompi_rte_init(int *argc, char ***argv)
if (0 != hwloc_bitmap_compare(boundset, rootset) ||
opal_hwloc_base_single_cpu(rootset) ||
opal_hwloc_base_single_cpu(boundset)) {
hwloc_bitmap_list_asprintf(&ompi_process_info.cpuset, boundset);
ompi_rte_proc_is_bound = true;
}
}

View File

@ -93,12 +93,6 @@ OPAL_DECLSPEC extern char *opal_hwloc_base_slot_list;
OPAL_DECLSPEC extern char *opal_hwloc_base_cpu_set;
OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_base_given_cpus;
typedef struct {
opal_list_item_t super;
hwloc_obj_t obj;
} opal_hwloc_obj_list_item_t;
OBJ_CLASS_DECLARATION(opal_hwloc_obj_list_item_t);
/* convenience macro for debugging */
#define OPAL_HWLOC_SHOW_BINDING(n, v) \
do { \
@ -193,12 +187,6 @@ OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_obj_by_type(hwloc_topology_t topo,
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_obj_idx(hwloc_topology_t topo,
hwloc_obj_t obj,
opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_level_and_index(hwloc_cpuset_t cpus,
opal_hwloc_level_t *bind_level,
unsigned int *bind_idx);
OPAL_DECLSPEC int opal_hwloc_base_get_local_index(hwloc_obj_type_t type,
unsigned cache_level,
unsigned int *idx);
/**
* Get the number of pu's under a given hwloc object.
@ -206,7 +194,6 @@ OPAL_DECLSPEC int opal_hwloc_base_get_local_index(hwloc_obj_type_t type,
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
hwloc_obj_t target);
OPAL_DECLSPEC char* opal_hwloc_base_print_binding(opal_binding_policy_t binding);
OPAL_DECLSPEC char* opal_hwloc_base_print_level(opal_hwloc_level_t level);
/**
* Determine if there is a single cpu in a bitmap.

View File

@ -479,8 +479,4 @@ OBJ_CLASS_INSTANCE(opal_hwloc_topo_data_t,
opal_object_t,
topo_data_const,
topo_data_dest);
OBJ_CLASS_INSTANCE(opal_hwloc_obj_list_item_t,
opal_list_item_t,
NULL, NULL);
#endif

View File

@ -1249,6 +1249,7 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top
}
/* get the width of the topology at this depth */
width = hwloc_get_nbobjs_by_depth(topo, d);
/* scan all objects at this depth to see if
* our locations overlap with them
*/
@ -1312,162 +1313,6 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top
return locality;
}
static hwloc_obj_t df_search_level(hwloc_obj_t start,
hwloc_cpuset_t cpus,
opal_hwloc_level_t *bind_level)
{
unsigned k;
hwloc_obj_t obj;
hwloc_cpuset_t avail;
/* get the available cpus */
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, start);
if (NULL != avail && 0 == hwloc_bitmap_compare(avail, cpus)) {
/* convert the level */
if (HWLOC_OBJ_MACHINE == start->type) {
*bind_level = OPAL_HWLOC_NODE_LEVEL;
} else if (HWLOC_OBJ_NODE == start->type) {
*bind_level = OPAL_HWLOC_NUMA_LEVEL;
} else if (HWLOC_OBJ_SOCKET == start->type) {
*bind_level = OPAL_HWLOC_SOCKET_LEVEL;
} else if (HWLOC_OBJ_CACHE == start->type) {
if (3 == start->attr->cache.depth) {
*bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
} else if (2 == start->attr->cache.depth) {
*bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
} else {
*bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
}
} else if (HWLOC_OBJ_CORE == start->type) {
*bind_level = OPAL_HWLOC_CORE_LEVEL;
} else if (HWLOC_OBJ_PU == start->type) {
*bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
} else {
/* We don't know what level it is, so just assign it to
"node" */
*bind_level = OPAL_HWLOC_NODE_LEVEL;
}
return start;
}
/* continue the search */
for (k=0; k < start->arity; k++) {
obj = df_search_level(start->children[k], cpus, bind_level);
if (NULL != obj) {
return obj;
}
}
return NULL;
}
hwloc_obj_t opal_hwloc_base_get_level_and_index(hwloc_cpuset_t cpus,
opal_hwloc_level_t *bind_level,
unsigned int *bind_idx)
{
hwloc_obj_t root, obj;
/* if we don't have topology info, nothing we can do */
if (NULL == opal_hwloc_topology) {
*bind_level = OPAL_HWLOC_NODE_LEVEL;
*bind_idx = 0;
return NULL;
}
/* start at the node level and do a down-first
* search until we find an exact match for the cpus
*/
*bind_level = OPAL_HWLOC_NODE_LEVEL;
*bind_idx = 0;
root = hwloc_get_root_obj(opal_hwloc_topology);
obj = df_search_level(root, cpus, bind_level);
if (NULL == obj) {
/* no match found */
return NULL;
}
/* get the index */
*bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology, obj, OPAL_HWLOC_AVAILABLE);
return obj;
}
int opal_hwloc_base_get_local_index(hwloc_obj_type_t type,
unsigned cache_level,
unsigned int *idx)
{
opal_hwloc_level_t bind_level;
unsigned int bind_idx;
hwloc_obj_t obj;
/* if we don't have topology info, nothing we can do */
if (NULL == opal_hwloc_topology) {
return OPAL_ERR_NOT_AVAILABLE;
}
/* ensure we have our local cpuset */
opal_hwloc_base_get_local_cpuset();
/* if we are not bound, then this is meaningless */
obj = opal_hwloc_base_get_level_and_index(opal_hwloc_my_cpuset,
&bind_level, &bind_idx);
if (OPAL_HWLOC_NODE_LEVEL == bind_level) {
return OPAL_ERR_NOT_BOUND;
}
/* if the type/level match, then we are done */
if (type == opal_hwloc_levels[bind_level]) {
if (HWLOC_OBJ_CACHE == type) {
if ((cache_level == 1 && bind_level == OPAL_HWLOC_L1CACHE_LEVEL) ||
(cache_level == 2 && bind_level == OPAL_HWLOC_L2CACHE_LEVEL) ||
(cache_level == 3 && bind_level == OPAL_HWLOC_L3CACHE_LEVEL)) {
*idx = bind_idx;
return OPAL_SUCCESS;
}
} else {
*idx = bind_idx;
return OPAL_SUCCESS;
}
}
/* if the binding level is below the type, then we cannot
* answer the question as we could run on multiple objects
* of that type - e.g., if we are bound to NUMA and we are
* asked for the idx of the socket we are on, then we can
* only answer "unknown"
*/
if (type > opal_hwloc_levels[bind_level]) {
return OPAL_ERR_MULTIPLE_AFFINITIES;
}
if (type == HWLOC_OBJ_CACHE) {
if ((cache_level == 1 && OPAL_HWLOC_L1CACHE_LEVEL < bind_level) ||
(cache_level == 2 && OPAL_HWLOC_L2CACHE_LEVEL < bind_level) ||
(cache_level == 3 && OPAL_HWLOC_L3CACHE_LEVEL < bind_level)) {
return OPAL_ERR_MULTIPLE_AFFINITIES;
}
}
/* move upward until we find the specified type */
while (NULL != obj) {
obj = obj->parent;
if (obj->type == type) {
if (type == HWLOC_OBJ_CACHE) {
if (cache_level == obj->attr->cache.depth) {
break;
}
} else {
break;
}
}
}
if (NULL == obj) {
return OPAL_ERR_NOT_FOUND;
}
/* get the index of this object */
*idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology, obj, OPAL_HWLOC_AVAILABLE);
return OPAL_SUCCESS;
}
char* opal_hwloc_base_print_binding(opal_binding_policy_t binding)
{
char *ret, *bind;
@ -1534,40 +1379,6 @@ char* opal_hwloc_base_print_binding(opal_binding_policy_t binding)
return ret;
}
char* opal_hwloc_base_print_level(opal_hwloc_level_t level)
{
char *ret = "unknown";
switch(level) {
case OPAL_HWLOC_NODE_LEVEL:
ret = "NODE";
break;
case OPAL_HWLOC_NUMA_LEVEL:
ret = "NUMA";
break;
case OPAL_HWLOC_SOCKET_LEVEL:
ret = "SOCKET";
break;
case OPAL_HWLOC_L3CACHE_LEVEL:
ret = "L3CACHE";
break;
case OPAL_HWLOC_L2CACHE_LEVEL:
ret = "L2CACHE";
break;
case OPAL_HWLOC_L1CACHE_LEVEL:
ret = "L1CACHE";
break;
case OPAL_HWLOC_CORE_LEVEL:
ret = "CORE";
break;
case OPAL_HWLOC_HWTHREAD_LEVEL:
ret = "HWTHREAD";
break;
}
return ret;
}
/*
* Turn an int bitmap to a "a-b,c" range kind of string
*/

View File

@ -64,23 +64,6 @@ typedef struct opal_hwloc_base_component_2_0_0_t opal_hwloc_component_t;
"hwloc", 2, 0, 0
/* Define a hierarchical level value that
* helps resolve the hwloc behavior of
* treating caches as a single type of
* entity - must always be available
*/
typedef enum {
OPAL_HWLOC_NODE_LEVEL=0,
OPAL_HWLOC_NUMA_LEVEL,
OPAL_HWLOC_SOCKET_LEVEL,
OPAL_HWLOC_L3CACHE_LEVEL,
OPAL_HWLOC_L2CACHE_LEVEL,
OPAL_HWLOC_L1CACHE_LEVEL,
OPAL_HWLOC_CORE_LEVEL,
OPAL_HWLOC_HWTHREAD_LEVEL
} opal_hwloc_level_t;
#define OPAL_HWLOC_LEVEL_T OPAL_INT
/* ******************************************************************** */
/* Although we cannot bind if --without-hwloc is set,
* we do still need to know some basic locality data

View File

@ -617,8 +617,6 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
*/
if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - hwthread to hwthread",
@ -637,8 +635,6 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
return rc;
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_CORE_LEVEL;
if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - core to core",
@ -668,8 +664,6 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
return rc;
} else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - L1cache to L1cache",
@ -694,8 +688,6 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
return rc;
} else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - L2cache to L2cache",
@ -720,8 +712,6 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
return rc;
} else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - L3cache to L3cache",
@ -746,8 +736,6 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
return rc;
} else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_SOCKET_LEVEL;
if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - socket to socket",
@ -772,8 +760,6 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
return rc;
} else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_NUMA_LEVEL;
if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - numa to numa",

View File

@ -37,6 +37,26 @@ orte_rmaps_base_module_t orte_rmaps_ppr_module = {
ppr_mapper
};
/* RHC: will eventually remove this
* definition as it is no longer reqd
* in the rest of OMPI system.
*
* Define a hierarchical level value that
* helps resolve the hwloc behavior of
* treating caches as a single type of
* entity - must always be available
*/
typedef enum {
OPAL_HWLOC_NODE_LEVEL=0,
OPAL_HWLOC_NUMA_LEVEL,
OPAL_HWLOC_SOCKET_LEVEL,
OPAL_HWLOC_L3CACHE_LEVEL,
OPAL_HWLOC_L2CACHE_LEVEL,
OPAL_HWLOC_L1CACHE_LEVEL,
OPAL_HWLOC_CORE_LEVEL,
OPAL_HWLOC_HWTHREAD_LEVEL
} opal_hwloc_level_t;
#if OPAL_HAVE_HWLOC
static void prune(orte_jobid_t jobid,
orte_app_idx_t app_idx,

View File

@ -55,7 +55,6 @@ struct orte_job_map_t {
orte_ranking_policy_t ranking;
#if OPAL_HAVE_HWLOC
opal_binding_policy_t binding;
opal_hwloc_level_t bind_level;
#endif
/* mapping options */
char *ppr;

View File

@ -660,13 +660,12 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
if (orte_devel_level_output) {
#if OPAL_HAVE_HWLOC
asprintf(&tmp, "\n%sMapper requested: %s Last mapper: %s Mapping policy: %s Ranking policy: %s Binding policy: %s[%s] Cpu set: %s PPR: %s",
asprintf(&tmp, "\n%sMapper requested: %s Last mapper: %s Mapping policy: %s Ranking policy: %s Binding policy: %s Cpu set: %s PPR: %s",
pfx2, (NULL == src->req_mapper) ? "NULL" : src->req_mapper,
(NULL == src->last_mapper) ? "NULL" : src->last_mapper,
orte_rmaps_base_print_mapping(src->mapping),
orte_rmaps_base_print_ranking(src->ranking),
opal_hwloc_base_print_binding(src->binding),
opal_hwloc_base_print_level(src->bind_level),
(NULL == opal_hwloc_base_cpu_set) ? "NULL" : opal_hwloc_base_cpu_set,
(NULL == src->ppr) ? "NULL" : src->ppr);
#else

View File

@ -1002,7 +1002,6 @@ static void orte_job_map_construct(orte_job_map_t* map)
map->ranking = 0;
#if OPAL_HAVE_HWLOC
map->binding = 0;
map->bind_level = OPAL_HWLOC_NODE_LEVEL;
#endif
map->ppr = NULL;
map->cpus_per_rank = 1;