Support physical processor ids in rankfile
Этот коммит содержится в:
родитель
6c8c9cb4a3
Коммит
2a90788724
@ -191,10 +191,11 @@ OPAL_DECLSPEC bool opal_hwloc_base_single_cpu(hwloc_cpuset_t cpuset);
|
||||
|
||||
/**
|
||||
* Provide a utility to parse a slot list against the local
|
||||
* logical cpus, and produce a cpuset for the described binding
|
||||
* cpus of the given type, and produce a cpuset for the described binding
|
||||
*/
|
||||
OPAL_DECLSPEC int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
hwloc_topology_t topo,
|
||||
opal_hwloc_resource_type_t rtype,
|
||||
hwloc_cpuset_t cpumask);
|
||||
|
||||
OPAL_DECLSPEC char* opal_hwloc_base_find_coprocessors(hwloc_topology_t topo);
|
||||
@ -266,8 +267,10 @@ OPAL_DECLSPEC int opal_hwloc_base_cset2mapstr(char *str, int len,
|
||||
hwloc_topology_t topo,
|
||||
hwloc_cpuset_t cpuset);
|
||||
|
||||
/* get the hwloc object that corresponds to the given LOGICAL processor id */
|
||||
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo, int lid);
|
||||
/* get the hwloc object that corresponds to the given processor id and type */
|
||||
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
|
||||
int lid,
|
||||
opal_hwloc_resource_type_t rtype);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -28,9 +28,10 @@ The specified %s policy is not recognized:
|
||||
Please check for a typo or ensure that the option is a supported
|
||||
one.
|
||||
#
|
||||
[logical-cpu-not-found]
|
||||
A specified logical processor does not exist in this topology:
|
||||
[cpu-not-found]
|
||||
A specified %s processor does not exist in this topology:
|
||||
|
||||
CPU number: %d
|
||||
Cpu set given: %s
|
||||
#
|
||||
[redefining-policy]
|
||||
|
@ -43,16 +43,19 @@
|
||||
|
||||
/*
|
||||
* Provide the hwloc object that corresponds to the given
|
||||
* LOGICAL processor id. Remember: "processor" here [usually] means "core" --
|
||||
* processor id of the given type. Remember: "processor" here [usually] means "core" --
|
||||
* except that on some platforms, hwloc won't find any cores; it'll
|
||||
* only find PUs (!). On such platforms, do the same calculation
|
||||
* but with PUs instead of COREs.
|
||||
*/
|
||||
hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo, int lid)
|
||||
hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
|
||||
int lid,
|
||||
opal_hwloc_resource_type_t rtype)
|
||||
{
|
||||
hwloc_obj_type_t obj_type = HWLOC_OBJ_CORE;
|
||||
hwloc_obj_t obj;
|
||||
|
||||
int cnt;
|
||||
|
||||
/* hwloc isn't able to find cores on all platforms. Example:
|
||||
PPC64 running RHEL 5.4 (linux kernel 2.6.18) only reports NUMA
|
||||
nodes and PU's. Fine.
|
||||
@ -70,12 +73,34 @@ hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo, int lid)
|
||||
obj_type = HWLOC_OBJ_PU;
|
||||
}
|
||||
|
||||
if (OPAL_HWLOC_PHYSICAL == rtype) {
|
||||
/* find the pu */
|
||||
obj = hwloc_get_obj_by_type(topo, obj_type, 0);
|
||||
cnt = 0;
|
||||
opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
|
||||
"Searching for %d PHYSICAL PU", lid);
|
||||
while (lid != cnt && NULL != obj) {
|
||||
obj = obj->next_cousin;
|
||||
cnt++;
|
||||
}
|
||||
if (lid != cnt) {
|
||||
opal_show_help("help-opal-hwloc-base.txt",
|
||||
"cpu-not-found", true, "physical",
|
||||
lid, opal_hwloc_base_cpu_set);
|
||||
return NULL; // failed to find it
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
|
||||
opal_output_verbose(5, opal_hwloc_base_framework.framework_output,
|
||||
"Searching for %d LOGICAL PU", lid);
|
||||
|
||||
/* Now do the actual lookup. */
|
||||
obj = hwloc_get_obj_by_type(topo, obj_type, lid);
|
||||
if (NULL == obj) {
|
||||
opal_show_help("help-opal-hwloc-base.txt",
|
||||
"logical-cpu-not-found", true,
|
||||
opal_hwloc_base_cpu_set);
|
||||
"cpu-not-found", true, "logical",
|
||||
lid, opal_hwloc_base_cpu_set);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -130,7 +155,7 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo)
|
||||
case 1:
|
||||
/* only one cpu given - get that object */
|
||||
cpu = strtoul(range[0], NULL, 10);
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu))) {
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu, OPAL_HWLOC_LOGICAL))) {
|
||||
opal_argv_free(ranges);
|
||||
opal_argv_free(range);
|
||||
return OPAL_ERROR;
|
||||
@ -144,7 +169,7 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo)
|
||||
start = strtoul(range[0], NULL, 10);
|
||||
end = strtoul(range[1], NULL, 10);
|
||||
for (cpu=start; cpu <= end; cpu++) {
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu))) {
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, cpu, OPAL_HWLOC_LOGICAL))) {
|
||||
opal_argv_free(ranges);
|
||||
opal_argv_free(range);
|
||||
hwloc_bitmap_free(avail);
|
||||
@ -1030,6 +1055,7 @@ void opal_hwloc_base_clear_usage(hwloc_topology_t topo)
|
||||
|
||||
static int socket_to_cpu_set(char *cpus,
|
||||
hwloc_topology_t topo,
|
||||
opal_hwloc_resource_type_t rtype,
|
||||
hwloc_bitmap_t cpumask)
|
||||
{
|
||||
char **range;
|
||||
@ -1042,7 +1068,7 @@ static int socket_to_cpu_set(char *cpus,
|
||||
if ('*' == cpus[0]) {
|
||||
/* requesting cpumask for ALL sockets */
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
/* set to all available logical processors - essentially,
|
||||
/* set to all available processors - essentially,
|
||||
* this specification equates to unbound
|
||||
*/
|
||||
res = opal_hwloc_base_get_available_cpus(topo, obj);
|
||||
@ -1055,8 +1081,8 @@ static int socket_to_cpu_set(char *cpus,
|
||||
switch (range_cnt) {
|
||||
case 1: /* no range was present, so just one socket given */
|
||||
socket_id = atoi(range[0]);
|
||||
obj = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0, socket_id, OPAL_HWLOC_LOGICAL);
|
||||
/* get the available logical cpus for this socket */
|
||||
obj = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0, socket_id, rtype);
|
||||
/* get the available cpus for this socket */
|
||||
res = opal_hwloc_base_get_available_cpus(topo, obj);
|
||||
hwloc_bitmap_or(cpumask, cpumask, res);
|
||||
break;
|
||||
@ -1066,8 +1092,8 @@ static int socket_to_cpu_set(char *cpus,
|
||||
upper_range = atoi(range[1]);
|
||||
/* cycle across the range of sockets */
|
||||
for (socket_id=lower_range; socket_id<=upper_range; socket_id++) {
|
||||
obj = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0, socket_id, OPAL_HWLOC_LOGICAL);
|
||||
/* get the available logical cpus for this socket */
|
||||
obj = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0, socket_id, rtype);
|
||||
/* get the available cpus for this socket */
|
||||
res = opal_hwloc_base_get_available_cpus(topo, obj);
|
||||
/* set the corresponding bits in the bitmask */
|
||||
hwloc_bitmap_or(cpumask, cpumask, res);
|
||||
@ -1084,6 +1110,7 @@ static int socket_to_cpu_set(char *cpus,
|
||||
|
||||
static int socket_core_to_cpu_set(char *socket_core_list,
|
||||
hwloc_topology_t topo,
|
||||
opal_hwloc_resource_type_t rtype,
|
||||
hwloc_bitmap_t cpumask)
|
||||
{
|
||||
int rc=OPAL_SUCCESS, i, j;
|
||||
@ -1102,7 +1129,7 @@ static int socket_core_to_cpu_set(char *socket_core_list,
|
||||
|
||||
/* get the object for this socket id */
|
||||
if (NULL == (socket = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0,
|
||||
socket_id, OPAL_HWLOC_LOGICAL))) {
|
||||
socket_id, rtype))) {
|
||||
opal_argv_free(socket_core);
|
||||
return OPAL_ERR_NOT_FOUND;
|
||||
}
|
||||
@ -1123,7 +1150,7 @@ static int socket_core_to_cpu_set(char *socket_core_list,
|
||||
corestr = socket_core[i];
|
||||
}
|
||||
if ('*' == corestr[0]) {
|
||||
/* set to all available logical cpus on this socket */
|
||||
/* set to all available cpus on this socket */
|
||||
res = opal_hwloc_base_get_available_cpus(topo, socket);
|
||||
hwloc_bitmap_or(cpumask, cpumask, res);
|
||||
/* we are done - already assigned all cores! */
|
||||
@ -1188,6 +1215,7 @@ static int socket_core_to_cpu_set(char *socket_core_list,
|
||||
|
||||
int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
hwloc_topology_t topo,
|
||||
opal_hwloc_resource_type_t rtype,
|
||||
hwloc_cpuset_t cpumask)
|
||||
{
|
||||
char **item;
|
||||
@ -1233,7 +1261,7 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
* it could specify multiple sockets
|
||||
*/
|
||||
if (OPAL_SUCCESS != (rc = socket_to_cpu_set(&item[i][1], /* skip the 'S' */
|
||||
topo, cpumask))) {
|
||||
topo, rtype, cpumask))) {
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
}
|
||||
@ -1242,13 +1270,13 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
if ('S' == item[i][0] ||
|
||||
's' == item[i][0]) {
|
||||
if (OPAL_SUCCESS != (rc = socket_core_to_cpu_set(&item[i][1], /* skip the 'S' */
|
||||
topo, cpumask))) {
|
||||
topo, rtype, cpumask))) {
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
}
|
||||
} else {
|
||||
if (OPAL_SUCCESS != (rc = socket_core_to_cpu_set(item[i],
|
||||
topo, cpumask))) {
|
||||
topo, rtype, cpumask))) {
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
}
|
||||
@ -1263,9 +1291,10 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
case 1: /* only one core, or a list of cores, specified */
|
||||
list = opal_argv_split(range[0], ',');
|
||||
for (j=0; NULL != list[j]; j++) {
|
||||
opal_output(0, "LIST %d VAL %s", j, list[j]);
|
||||
core_id = atoi(list[j]);
|
||||
/* find the specified logical available cpu */
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id))) {
|
||||
/* find the specified available cpu */
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id, rtype))) {
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERROR;
|
||||
@ -1283,7 +1312,7 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
upper_range = atoi(range[1]);
|
||||
for (core_id=lower_range; core_id <= upper_range; core_id++) {
|
||||
/* find the specified logical available cpu */
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id))) {
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id, rtype))) {
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERROR;
|
||||
|
@ -118,7 +118,8 @@ int orte_ess_base_proc_binding(void)
|
||||
hwloc_bitmap_zero(cpus);
|
||||
if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
||||
if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
|
||||
opal_hwloc_topology, cpus))) {
|
||||
opal_hwloc_topology,
|
||||
OPAL_HWLOC_LOGICAL, cpus))) {
|
||||
error = "Setting processor affinity failed";
|
||||
hwloc_bitmap_free(cpus);
|
||||
goto error;
|
||||
|
@ -81,7 +81,10 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
mca_base_component_t *c = &mca_rmaps_rank_file_component.super.base_version;
|
||||
char *slots;
|
||||
bool initial_map=true;
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
opal_hwloc_resource_type_t rtype;
|
||||
#endif
|
||||
|
||||
/* only handle initial launch of rf job */
|
||||
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
@ -113,6 +116,19 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
|
||||
/* convenience def */
|
||||
map = jdata->map;
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* use PHYSICAL processors if the component requests them; otherwise default to LOGICAL */
|
||||
if (mca_rmaps_rank_file_component.physical) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_file: using PHYSICAL processors");
|
||||
rtype = OPAL_HWLOC_PHYSICAL;
|
||||
} else {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_file: using LOGICAL processors");
|
||||
rtype = OPAL_HWLOC_LOGICAL;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* setup the node list */
|
||||
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||
@ -276,7 +292,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
}
|
||||
bitmap = hwloc_bitmap_alloc();
|
||||
/* parse the slot_list to find the socket and core */
|
||||
if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(slots, node->topology, bitmap))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(slots, node->topology, rtype, bitmap))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto error;
|
||||
}
|
||||
|
@ -10,8 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Voltaire. All rights reserved
|
||||
*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -41,6 +41,7 @@ int orte_rmaps_rank_file_lex_destroy (void);
|
||||
struct orte_rmaps_rf_component_t {
|
||||
orte_rmaps_base_component_t super;
|
||||
char *slot_list;
|
||||
bool physical;
|
||||
};
|
||||
typedef struct orte_rmaps_rf_component_t orte_rmaps_rf_component_t;
|
||||
|
||||
|
@ -10,8 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Voltaire. All rights reserved
|
||||
*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -92,6 +92,14 @@ static int orte_rmaps_rank_file_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &orte_rankfile);
|
||||
(void) mca_base_var_register_synonym(tmp, "orte", "orte", NULL, "rankfile", 0);
|
||||
|
||||
mca_rmaps_rank_file_component.physical = false;
|
||||
(void) mca_base_component_var_register(c, "physical", "Rankfile contains physical cpu designations",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_rmaps_rank_file_component.physical);
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -391,7 +391,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
res = hwloc_bitmap_alloc();
|
||||
for (i=0; NULL != cores[i]; i++) {
|
||||
core = strtoul(cores[i], NULL, 10);
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core))) {
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
|
||||
/* turn off the show help forwarding as we won't
|
||||
* be able to cycle the event library to send
|
||||
*/
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user