Merge pull request #692 from rhc54/topic/mapper
Fix hetero operations. An error in the hwloc utilities only allocated…
Этот коммит содержится в:
Коммит
c6bb227073
@ -1703,26 +1703,23 @@ static char *bitmap2rangestr(int bitmap)
|
||||
static int build_map(int *num_sockets_arg, int *num_cores_arg,
|
||||
hwloc_cpuset_t cpuset, int ***map, hwloc_topology_t topo)
|
||||
{
|
||||
static int num_sockets = -1, num_cores = -1;
|
||||
int num_sockets, num_cores;
|
||||
int socket_index, core_index, pu_index;
|
||||
hwloc_obj_t socket, core, pu;
|
||||
int **data;
|
||||
|
||||
/* Find out how many sockets we have (cached so that we don't have
|
||||
to look this up every time) */
|
||||
if (num_sockets < 0) {
|
||||
num_sockets = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_SOCKET);
|
||||
/* some systems (like the iMac) only have one
|
||||
* socket and so don't report a socket
|
||||
*/
|
||||
if (0 == num_sockets) {
|
||||
num_sockets = 1;
|
||||
}
|
||||
/* Lazy: take the total number of cores that we have in the
|
||||
topology; that'll be more than the max number of cores
|
||||
under any given socket */
|
||||
num_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
|
||||
/* Find out how many sockets we have */
|
||||
num_sockets = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_SOCKET);
|
||||
/* some systems (like the iMac) only have one
|
||||
* socket and so don't report a socket
|
||||
*/
|
||||
if (0 == num_sockets) {
|
||||
num_sockets = 1;
|
||||
}
|
||||
/* Lazy: take the total number of cores that we have in the
|
||||
topology; that'll be more than the max number of cores
|
||||
under any given socket */
|
||||
num_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
|
||||
*num_sockets_arg = num_sockets;
|
||||
*num_cores_arg = num_cores;
|
||||
|
||||
@ -1791,7 +1788,7 @@ int opal_hwloc_base_cset2str(char *str, int len,
|
||||
int ret, socket_index, core_index;
|
||||
char tmp[BUFSIZ];
|
||||
const int stmp = sizeof(tmp) - 1;
|
||||
int **map;
|
||||
int **map=NULL;
|
||||
hwloc_obj_t root;
|
||||
opal_hwloc_topo_data_t *sum;
|
||||
|
||||
@ -1819,7 +1816,6 @@ int opal_hwloc_base_cset2str(char *str, int len,
|
||||
if (OPAL_SUCCESS != (ret = build_map(&num_sockets, &num_cores, cpuset, &map, topo))) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Iterate over the data matrix and build up the string */
|
||||
first = true;
|
||||
for (socket_index = 0; socket_index < num_sockets; ++socket_index) {
|
||||
@ -1837,8 +1833,12 @@ int opal_hwloc_base_cset2str(char *str, int len,
|
||||
}
|
||||
}
|
||||
}
|
||||
free(map[0]);
|
||||
free(map);
|
||||
if (NULL != map) {
|
||||
if (NULL != map[0]) {
|
||||
free(map[0]);
|
||||
}
|
||||
free(map);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -22,6 +23,7 @@ struct orte_ras_sim_component_t {
|
||||
char * slots;
|
||||
char * slots_max;
|
||||
char *topofiles;
|
||||
char *topologies;
|
||||
bool have_cpubind;
|
||||
bool have_membind;
|
||||
};
|
||||
|
@ -12,6 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -95,6 +96,13 @@ static int ras_sim_register(void)
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ras_simulator_component.topofiles);
|
||||
mca_ras_simulator_component.topologies = NULL;
|
||||
(void) mca_base_component_var_register (component, "topologies",
|
||||
"Comma-separated list of topology descriptions for simulated nodes",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ras_simulator_component.topologies);
|
||||
mca_ras_simulator_component.have_cpubind = true;
|
||||
(void) mca_base_component_var_register (component, "have_cpubind",
|
||||
"Topology supports binding to cpus",
|
||||
|
@ -3,6 +3,7 @@
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -56,6 +57,7 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
unsigned j, k;
|
||||
struct hwloc_topology_support *support;
|
||||
char **files=NULL;
|
||||
char **topos = NULL;
|
||||
bool use_local_topology = false;
|
||||
#endif
|
||||
char **node_cnt=NULL;
|
||||
@ -65,32 +67,39 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
char prefix[6];
|
||||
|
||||
node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
|
||||
slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
|
||||
max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
|
||||
|
||||
/* backfill the slot_cnt as reqd so we don't have to
|
||||
* specify slot_cnt for each set of nodes - we'll set
|
||||
* */
|
||||
tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
|
||||
for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
|
||||
opal_argv_append_nosize(&slot_cnt, tmp);
|
||||
if (NULL != mca_ras_simulator_component.slots) {
|
||||
slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
|
||||
/* backfile the slot_cnt so every topology has a cnt */
|
||||
tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
|
||||
for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
|
||||
opal_argv_append_nosize(&slot_cnt, tmp);
|
||||
}
|
||||
}
|
||||
/* backfill the max_slot_cnt as reqd */
|
||||
tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
|
||||
for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
|
||||
opal_argv_append_nosize(&max_slot_cnt, tmp);
|
||||
if (NULL != mca_ras_simulator_component.slots_max) {
|
||||
max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
|
||||
/* backfill the max_slot_cnt as reqd */
|
||||
tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
|
||||
for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
|
||||
opal_argv_append_nosize(&max_slot_cnt, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
if (NULL == mca_ras_simulator_component.topofiles) {
|
||||
/* use our topology */
|
||||
use_local_topology = true;
|
||||
} else {
|
||||
if (NULL != mca_ras_simulator_component.topofiles) {
|
||||
files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
|
||||
if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
|
||||
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
|
||||
goto error_silent;
|
||||
}
|
||||
} else if (NULL != mca_ras_simulator_component.topologies) {
|
||||
topos = opal_argv_split(mca_ras_simulator_component.topologies, ',');
|
||||
if (opal_argv_count(topos) != opal_argv_count(node_cnt)) {
|
||||
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
|
||||
goto error_silent;
|
||||
}
|
||||
} else {
|
||||
/* use our topology */
|
||||
use_local_topology = true;
|
||||
}
|
||||
#else
|
||||
/* If we don't have hwloc and hwloc files were specified, then
|
||||
@ -123,7 +132,7 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
if (use_local_topology) {
|
||||
/* use our topology */
|
||||
topo = opal_hwloc_topology;
|
||||
} else {
|
||||
} else if (NULL != files) {
|
||||
if (0 != hwloc_topology_init(&topo)) {
|
||||
orte_show_help("help-ras-simulator.txt",
|
||||
"hwloc API fail", true,
|
||||
@ -188,6 +197,69 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
t->topo = topo;
|
||||
t->sig = opal_hwloc_base_get_topo_signature(topo);
|
||||
opal_pointer_array_add(orte_node_topologies, t);
|
||||
} else {
|
||||
if (0 != hwloc_topology_init(&topo)) {
|
||||
orte_show_help("help-ras-simulator.txt",
|
||||
"hwloc API fail", true,
|
||||
__FILE__, __LINE__, "hwloc_topology_init");
|
||||
goto error_silent;
|
||||
}
|
||||
if (0 != hwloc_topology_set_synthetic(topo, topos[n])) {
|
||||
orte_show_help("help-ras-simulator.txt",
|
||||
"hwloc API fail", true,
|
||||
__FILE__, __LINE__, "hwloc_topology_set_synthetic");
|
||||
hwloc_topology_destroy(topo);
|
||||
goto error_silent;
|
||||
}
|
||||
if (0 != hwloc_topology_load(topo)) {
|
||||
orte_show_help("help-ras-simulator.txt",
|
||||
"hwloc API fail", true,
|
||||
__FILE__, __LINE__, "hwloc_topology_load");
|
||||
hwloc_topology_destroy(topo);
|
||||
goto error_silent;
|
||||
}
|
||||
if (OPAL_SUCCESS != opal_hwloc_base_filter_cpus(topo)) {
|
||||
orte_show_help("help-ras-simulator.txt",
|
||||
"hwloc API fail", true,
|
||||
__FILE__, __LINE__, "opal_hwloc_base_filter_cpus");
|
||||
hwloc_topology_destroy(topo);
|
||||
goto error_silent;
|
||||
}
|
||||
/* remove the hostname from the topology. Unfortunately, hwloc
|
||||
* decided to add the source hostname to the "topology", thus
|
||||
* rendering it unusable as a pure topological description. So
|
||||
* we remove that information here.
|
||||
*/
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
for (k=0; k < obj->infos_count; k++) {
|
||||
if (NULL == obj->infos[k].name ||
|
||||
NULL == obj->infos[k].value) {
|
||||
continue;
|
||||
}
|
||||
if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
|
||||
free(obj->infos[k].name);
|
||||
free(obj->infos[k].value);
|
||||
/* left justify the array */
|
||||
for (j=k; j < obj->infos_count-1; j++) {
|
||||
obj->infos[j] = obj->infos[j+1];
|
||||
}
|
||||
obj->infos[obj->infos_count-1].name = NULL;
|
||||
obj->infos[obj->infos_count-1].value = NULL;
|
||||
obj->infos_count--;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* unfortunately, hwloc does not include support info in its
|
||||
* xml output :-(( To aid in debugging, we set it here
|
||||
*/
|
||||
support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
|
||||
support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
|
||||
support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
|
||||
/* add it to our array */
|
||||
t = OBJ_NEW(orte_topology_t);
|
||||
t->topo = topo;
|
||||
t->sig = opal_hwloc_base_get_topo_signature(topo);
|
||||
opal_pointer_array_add(orte_node_topologies, t);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -196,9 +268,19 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
asprintf(&node->name, "%s%0*d", prefix, dig, i);
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = (NULL == max_slot_cnt[n] ? 0 : atoi(max_slot_cnt[n]));
|
||||
node->slots = (NULL == slot_cnt[n] ? 0 : atoi(slot_cnt[n]));
|
||||
#if OPAL_HAVE_HWLOC
|
||||
if (NULL == max_slot_cnt || NULL == max_slot_cnt[n]) {
|
||||
node->slots_max = 0;
|
||||
} else {
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
node->slots_max = opal_hwloc_base_get_npus(topo, obj);
|
||||
}
|
||||
if (NULL == slot_cnt || NULL == slot_cnt[n]) {
|
||||
node->slots = 0;
|
||||
} else {
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
node->slots = opal_hwloc_base_get_npus(topo, obj);
|
||||
}
|
||||
node->topology = topo;
|
||||
#endif
|
||||
opal_output_verbose(1, orte_ras_base_framework.framework_output,
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -277,7 +277,8 @@ static int bind_downwards(orte_job_t *jdata,
|
||||
}
|
||||
/* bozo check */
|
||||
locale = NULL;
|
||||
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
|
||||
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR) ||
|
||||
NULL == locale) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
|
||||
hwloc_bitmap_free(totalcpuset);
|
||||
return ORTE_ERR_SILENT;
|
||||
|
@ -93,7 +93,16 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
nprocs = 0;
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
nprocs += app->num_procs;
|
||||
if (0 == app->num_procs) {
|
||||
opal_list_t nodes;
|
||||
orte_std_cntr_t slots;
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
|
||||
OPAL_LIST_DESTRUCT(&nodes);
|
||||
nprocs += slots;
|
||||
} else {
|
||||
nprocs += app->num_procs;
|
||||
}
|
||||
}
|
||||
}
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
|
@ -529,7 +529,7 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
|
||||
if (orte_get_attribute(&src->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&loc, OPAL_PTR)) {
|
||||
if (NULL != loc) {
|
||||
if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2mapstr(locale, sizeof(locale), src->node->topology, loc->cpuset)) {
|
||||
strcpy(locale, "UNBOUND");
|
||||
strcpy(locale, "NODE");
|
||||
}
|
||||
} else {
|
||||
strcpy(locale, "UNKNOWN");
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user