1
1

Support physical processor ids in rankfile

Этот коммит содержится в:
Ralph Castain 2014-11-09 20:50:12 -08:00
родитель 6c8c9cb4a3
Коммит 2a90788724
8 изменённых файлов: 92 добавлений и 33 удалений

Просмотреть файл

@ -191,10 +191,11 @@ OPAL_DECLSPEC bool opal_hwloc_base_single_cpu(hwloc_cpuset_t cpuset);
/**
* Provide a utility to parse a slot list against the local
* logical cpus, and produce a cpuset for the described binding
* cpus of given type, and produce a cpuset for the described binding
*/
OPAL_DECLSPEC int opal_hwloc_base_slot_list_parse(const char *slot_str,
hwloc_topology_t topo,
opal_hwloc_resource_type_t rtype,
hwloc_cpuset_t cpumask);
OPAL_DECLSPEC char* opal_hwloc_base_find_coprocessors(hwloc_topology_t topo);
@ -266,8 +267,10 @@ OPAL_DECLSPEC int opal_hwloc_base_cset2mapstr(char *str, int len,
hwloc_topology_t topo,
hwloc_cpuset_t cpuset);
/* get the hwloc object that corresponds to the given LOGICAL processor id */
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo, int lid);
/* get the hwloc object that corresponds to the given processor id and type */
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
int lid,
opal_hwloc_resource_type_t rtype);
#endif

Просмотреть файл

@ -28,9 +28,10 @@ The specified %s policy is not recognized:
Please check for a typo or ensure that the option is a supported
one.
#
[logical-cpu-not-found]
A specified logical processor does not exist in this topology:
[cpu-not-found]
A specified %s processor does not exist in this topology:
CPU number: %d
Cpu set given: %s
#
[redefining-policy]

Просмотреть файл

@ -43,16 +43,19 @@
/*
* Provide the hwloc object that corresponds to the given
* LOGICAL processor id. Remember: "processor" here [usually] means "core" --
* processor id of the given type. Remember: "processor" here [usually] means "core" --
* except that on some platforms, hwloc won't find any cores; it'll
* only find PUs (!). On such platforms, then do the same calculation
* but with PUs instead of COREs.
*/
hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo, int lid)
hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
int lid,
opal_hwloc_resource_type_t rtype)
{
hwloc_obj_type_t obj_type = HWLOC_OBJ_CORE;
hwloc_obj_t obj;
int cnt;
/* hwloc isn't able to find cores on all platforms. Example:
PPC64 running RHEL 5.4 (linux kernel 2.6.18) only reports NUMA
nodes and PU's. Fine.
@ -70,12 +73,34 @@ hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo, int lid)
obj_type = HWLOC_OBJ_PU;
}
if (OPAL_HWLOC_PHYSICAL == rtype) {
/* find the pu */
obj = hwloc_get_obj_by_type(topo, obj_type, 0);
cnt = 0;
opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
"Searching for %d PHYSICAL PU", lid);
while (lid != cnt && NULL != obj) {
obj = obj->next_cousin;
cnt++;
}
if (lid != cnt) {
opal_show_help("help-opal-hwloc-base.txt",
"cpu-not-found", true, "physical",
lid, opal_hwloc_base_cpu_set);
return NULL; // failed to find it
}
return obj;
}
opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
"Searching for %d LOGICAL PU", lid);
/* Now do the actual lookup. */
obj = hwloc_get_obj_by_type(topo, obj_type, lid);
if (NULL == obj) {
opal_show_help("help-opal-hwloc-base.txt",
"logical-cpu-not-found", true,
opal_hwloc_base_cpu_set);
"cpu-not-found", true, "logical",
lid, opal_hwloc_base_cpu_set);
return NULL;
}
@ -130,7 +155,7 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo)
case 1:
/* only one cpu given - get that object */
cpu = strtoul(range[0], NULL, 10);
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu))) {
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu, OPAL_HWLOC_LOGICAL))) {
opal_argv_free(ranges);
opal_argv_free(range);
return OPAL_ERROR;
@ -144,7 +169,7 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo)
start = strtoul(range[0], NULL, 10);
end = strtoul(range[1], NULL, 10);
for (cpu=start; cpu <= end; cpu++) {
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu))) {
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu, OPAL_HWLOC_LOGICAL))) {
opal_argv_free(ranges);
opal_argv_free(range);
hwloc_bitmap_free(avail);
@ -1030,6 +1055,7 @@ void opal_hwloc_base_clear_usage(hwloc_topology_t topo)
static int socket_to_cpu_set(char *cpus,
hwloc_topology_t topo,
opal_hwloc_resource_type_t rtype,
hwloc_bitmap_t cpumask)
{
char **range;
@ -1042,7 +1068,7 @@ static int socket_to_cpu_set(char *cpus,
if ('*' == cpus[0]) {
/* requesting cpumask for ALL sockets */
obj = hwloc_get_root_obj(topo);
/* set to all available logical processors - essentially,
/* set to all available processors - essentially,
* this specification equates to unbound
*/
res = opal_hwloc_base_get_available_cpus(topo, obj);
@ -1055,8 +1081,8 @@ static int socket_to_cpu_set(char *cpus,
switch (range_cnt) {
case 1: /* no range was present, so just one socket given */
socket_id = atoi(range[0]);
obj = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0, socket_id, OPAL_HWLOC_LOGICAL);
/* get the available logical cpus for this socket */
obj = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0, socket_id, rtype);
/* get the available cpus for this socket */
res = opal_hwloc_base_get_available_cpus(topo, obj);
hwloc_bitmap_or(cpumask, cpumask, res);
break;
@ -1066,8 +1092,8 @@ static int socket_to_cpu_set(char *cpus,
upper_range = atoi(range[1]);
/* cycle across the range of sockets */
for (socket_id=lower_range; socket_id<=upper_range; socket_id++) {
obj = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0, socket_id, OPAL_HWLOC_LOGICAL);
/* get the available logical cpus for this socket */
obj = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0, socket_id, rtype);
/* get the available cpus for this socket */
res = opal_hwloc_base_get_available_cpus(topo, obj);
/* set the corresponding bits in the bitmask */
hwloc_bitmap_or(cpumask, cpumask, res);
@ -1084,6 +1110,7 @@ static int socket_to_cpu_set(char *cpus,
static int socket_core_to_cpu_set(char *socket_core_list,
hwloc_topology_t topo,
opal_hwloc_resource_type_t rtype,
hwloc_bitmap_t cpumask)
{
int rc=OPAL_SUCCESS, i, j;
@ -1102,7 +1129,7 @@ static int socket_core_to_cpu_set(char *socket_core_list,
/* get the object for this socket id */
if (NULL == (socket = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0,
socket_id, OPAL_HWLOC_LOGICAL))) {
socket_id, rtype))) {
opal_argv_free(socket_core);
return OPAL_ERR_NOT_FOUND;
}
@ -1123,7 +1150,7 @@ static int socket_core_to_cpu_set(char *socket_core_list,
corestr = socket_core[i];
}
if ('*' == corestr[0]) {
/* set to all available logical cpus on this socket */
/* set to all available cpus on this socket */
res = opal_hwloc_base_get_available_cpus(topo, socket);
hwloc_bitmap_or(cpumask, cpumask, res);
/* we are done - already assigned all cores! */
@ -1188,6 +1215,7 @@ static int socket_core_to_cpu_set(char *socket_core_list,
int opal_hwloc_base_slot_list_parse(const char *slot_str,
hwloc_topology_t topo,
opal_hwloc_resource_type_t rtype,
hwloc_cpuset_t cpumask)
{
char **item;
@ -1233,7 +1261,7 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
* it could specify multiple sockets
*/
if (OPAL_SUCCESS != (rc = socket_to_cpu_set(&item[i][1], /* skip the 'S' */
topo, cpumask))) {
topo, rtype, cpumask))) {
opal_argv_free(item);
return rc;
}
@ -1242,13 +1270,13 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
if ('S' == item[i][0] ||
's' == item[i][0]) {
if (OPAL_SUCCESS != (rc = socket_core_to_cpu_set(&item[i][1], /* skip the 'S' */
topo, cpumask))) {
topo, rtype, cpumask))) {
opal_argv_free(item);
return rc;
}
} else {
if (OPAL_SUCCESS != (rc = socket_core_to_cpu_set(item[i],
topo, cpumask))) {
topo, rtype, cpumask))) {
opal_argv_free(item);
return rc;
}
@ -1263,9 +1291,10 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
case 1: /* only one core, or a list of cores, specified */
list = opal_argv_split(range[0], ',');
for (j=0; NULL != list[j]; j++) {
opal_output(0, "LIST %d VAL %s", j, list[j]);
core_id = atoi(list[j]);
/* find the specified logical available cpu */
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id))) {
/* find the specified available cpu */
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id, rtype))) {
opal_argv_free(range);
opal_argv_free(item);
return OPAL_ERROR;
@ -1283,7 +1312,7 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
upper_range = atoi(range[1]);
for (core_id=lower_range; core_id <= upper_range; core_id++) {
/* find the specified available cpu */
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id))) {
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id, rtype))) {
opal_argv_free(range);
opal_argv_free(item);
return OPAL_ERROR;

Просмотреть файл

@ -118,7 +118,8 @@ int orte_ess_base_proc_binding(void)
hwloc_bitmap_zero(cpus);
if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
opal_hwloc_topology, cpus))) {
opal_hwloc_topology,
OPAL_HWLOC_LOGICAL, cpus))) {
error = "Setting processor affinity failed";
hwloc_bitmap_free(cpus);
goto error;

Просмотреть файл

@ -81,7 +81,10 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
mca_base_component_t *c = &mca_rmaps_rank_file_component.super.base_version;
char *slots;
bool initial_map=true;
#if OPAL_HAVE_HWLOC
opal_hwloc_resource_type_t rtype;
#endif
/* only handle initial launch of rf job */
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
@ -113,6 +116,19 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
/* convenience def */
map = jdata->map;
#if OPAL_HAVE_HWLOC
/* default to LOGICAL processors */
if (mca_rmaps_rank_file_component.physical) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_file: using PHYSICAL processors");
rtype = OPAL_HWLOC_PHYSICAL;
} else {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_file: using LOGICAL processors");
rtype = OPAL_HWLOC_LOGICAL;
}
#endif
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
@ -276,7 +292,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
}
bitmap = hwloc_bitmap_alloc();
/* parse the slot_list to find the socket and core */
if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(slots, node->topology, bitmap))) {
if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(slots, node->topology, rtype, bitmap))) {
ORTE_ERROR_LOG(rc);
goto error;
}

Просмотреть файл

@ -10,8 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Voltaire. All rights reserved
*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -41,6 +41,7 @@ int orte_rmaps_rank_file_lex_destroy (void);
/*
 * Component structure for the rank_file mapper: the base rmaps
 * component plus the rank_file-specific settings.
 */
struct orte_rmaps_rf_component_t {
/* base rmaps component; the mapper reads super.base_version from it */
orte_rmaps_base_component_t super;
/* NOTE(review): presumably the slot list string configured for this
 * component - its use is not visible here, confirm against the mapper */
char *slot_list;
/* true when the rankfile contains physical cpu designations; set to
 * false before being registered as the "physical" MCA variable, and
 * used by the mapper to choose between OPAL_HWLOC_PHYSICAL and
 * OPAL_HWLOC_LOGICAL when parsing slot lists */
bool physical;
};
typedef struct orte_rmaps_rf_component_t orte_rmaps_rf_component_t;

Просмотреть файл

@ -10,8 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Voltaire. All rights reserved
*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -92,6 +92,14 @@ static int orte_rmaps_rank_file_register(void)
MCA_BASE_VAR_SCOPE_READONLY, &orte_rankfile);
(void) mca_base_var_register_synonym(tmp, "orte", "orte", NULL, "rankfile", 0);
mca_rmaps_rank_file_component.physical = false;
(void) mca_base_component_var_register(c, "physical", "Rankfile contains physical cpu designations",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_rmaps_rank_file_component.physical);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -391,7 +391,7 @@ int orte_daemon(int argc, char *argv[])
res = hwloc_bitmap_alloc();
for (i=0; NULL != cores[i]; i++) {
core = strtoul(cores[i], NULL, 10);
if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core))) {
if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
/* turn off the show help forwarding as we won't
* be able to cycle the event library to send
*/