Fix hetero operations. An error in the hwloc utilities only allocated memory for the first display of a binding map, and then assumed that all nodes had the same number of cores in them. This resulted in memory corruption whenever someone displayed a binding pattern for a hetero cluster, and a smaller node was first in line.
Этот коммит содержится в:
родитель
7455802a36
Коммит
ed93154e43
@ -1703,14 +1703,12 @@ static char *bitmap2rangestr(int bitmap)
|
|||||||
static int build_map(int *num_sockets_arg, int *num_cores_arg,
|
static int build_map(int *num_sockets_arg, int *num_cores_arg,
|
||||||
hwloc_cpuset_t cpuset, int ***map, hwloc_topology_t topo)
|
hwloc_cpuset_t cpuset, int ***map, hwloc_topology_t topo)
|
||||||
{
|
{
|
||||||
static int num_sockets = -1, num_cores = -1;
|
int num_sockets, num_cores;
|
||||||
int socket_index, core_index, pu_index;
|
int socket_index, core_index, pu_index;
|
||||||
hwloc_obj_t socket, core, pu;
|
hwloc_obj_t socket, core, pu;
|
||||||
int **data;
|
int **data;
|
||||||
|
|
||||||
/* Find out how many sockets we have (cached so that we don't have
|
/* Find out how many sockets we have */
|
||||||
to look this up every time) */
|
|
||||||
if (num_sockets < 0) {
|
|
||||||
num_sockets = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_SOCKET);
|
num_sockets = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_SOCKET);
|
||||||
/* some systems (like the iMac) only have one
|
/* some systems (like the iMac) only have one
|
||||||
* socket and so don't report a socket
|
* socket and so don't report a socket
|
||||||
@ -1722,7 +1720,6 @@ static int build_map(int *num_sockets_arg, int *num_cores_arg,
|
|||||||
topology; that'll be more than the max number of cores
|
topology; that'll be more than the max number of cores
|
||||||
under any given socket */
|
under any given socket */
|
||||||
num_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
|
num_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
|
||||||
}
|
|
||||||
*num_sockets_arg = num_sockets;
|
*num_sockets_arg = num_sockets;
|
||||||
*num_cores_arg = num_cores;
|
*num_cores_arg = num_cores;
|
||||||
|
|
||||||
@ -1791,7 +1788,7 @@ int opal_hwloc_base_cset2str(char *str, int len,
|
|||||||
int ret, socket_index, core_index;
|
int ret, socket_index, core_index;
|
||||||
char tmp[BUFSIZ];
|
char tmp[BUFSIZ];
|
||||||
const int stmp = sizeof(tmp) - 1;
|
const int stmp = sizeof(tmp) - 1;
|
||||||
int **map;
|
int **map=NULL;
|
||||||
hwloc_obj_t root;
|
hwloc_obj_t root;
|
||||||
opal_hwloc_topo_data_t *sum;
|
opal_hwloc_topo_data_t *sum;
|
||||||
|
|
||||||
@ -1819,7 +1816,6 @@ int opal_hwloc_base_cset2str(char *str, int len,
|
|||||||
if (OPAL_SUCCESS != (ret = build_map(&num_sockets, &num_cores, cpuset, &map, topo))) {
|
if (OPAL_SUCCESS != (ret = build_map(&num_sockets, &num_cores, cpuset, &map, topo))) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Iterate over the data matrix and build up the string */
|
/* Iterate over the data matrix and build up the string */
|
||||||
first = true;
|
first = true;
|
||||||
for (socket_index = 0; socket_index < num_sockets; ++socket_index) {
|
for (socket_index = 0; socket_index < num_sockets; ++socket_index) {
|
||||||
@ -1837,8 +1833,12 @@ int opal_hwloc_base_cset2str(char *str, int len,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (NULL != map) {
|
||||||
|
if (NULL != map[0]) {
|
||||||
free(map[0]);
|
free(map[0]);
|
||||||
|
}
|
||||||
free(map);
|
free(map);
|
||||||
|
}
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -22,6 +23,7 @@ struct orte_ras_sim_component_t {
|
|||||||
char * slots;
|
char * slots;
|
||||||
char * slots_max;
|
char * slots_max;
|
||||||
char *topofiles;
|
char *topofiles;
|
||||||
|
char *topologies;
|
||||||
bool have_cpubind;
|
bool have_cpubind;
|
||||||
bool have_membind;
|
bool have_membind;
|
||||||
};
|
};
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
|
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -95,6 +96,13 @@ static int ras_sim_register(void)
|
|||||||
OPAL_INFO_LVL_9,
|
OPAL_INFO_LVL_9,
|
||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&mca_ras_simulator_component.topofiles);
|
&mca_ras_simulator_component.topofiles);
|
||||||
|
mca_ras_simulator_component.topologies = NULL;
|
||||||
|
(void) mca_base_component_var_register (component, "topologies",
|
||||||
|
"Comma-separated list of topology descriptions for simulated nodes",
|
||||||
|
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_9,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&mca_ras_simulator_component.topologies);
|
||||||
mca_ras_simulator_component.have_cpubind = true;
|
mca_ras_simulator_component.have_cpubind = true;
|
||||||
(void) mca_base_component_var_register (component, "have_cpubind",
|
(void) mca_base_component_var_register (component, "have_cpubind",
|
||||||
"Topology supports binding to cpus",
|
"Topology supports binding to cpus",
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
|
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -56,6 +57,7 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
|||||||
unsigned j, k;
|
unsigned j, k;
|
||||||
struct hwloc_topology_support *support;
|
struct hwloc_topology_support *support;
|
||||||
char **files=NULL;
|
char **files=NULL;
|
||||||
|
char **topos = NULL;
|
||||||
bool use_local_topology = false;
|
bool use_local_topology = false;
|
||||||
#endif
|
#endif
|
||||||
char **node_cnt=NULL;
|
char **node_cnt=NULL;
|
||||||
@ -65,32 +67,39 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
|||||||
char prefix[6];
|
char prefix[6];
|
||||||
|
|
||||||
node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
|
node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
|
||||||
|
if (NULL != mca_ras_simulator_component.slots) {
|
||||||
slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
|
slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
|
||||||
max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
|
/* backfile the slot_cnt so every topology has a cnt */
|
||||||
|
|
||||||
/* backfill the slot_cnt as reqd so we don't have to
|
|
||||||
* specify slot_cnt for each set of nodes - we'll set
|
|
||||||
* */
|
|
||||||
tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
|
tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
|
||||||
for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
|
for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
|
||||||
opal_argv_append_nosize(&slot_cnt, tmp);
|
opal_argv_append_nosize(&slot_cnt, tmp);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (NULL != mca_ras_simulator_component.slots_max) {
|
||||||
|
max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
|
||||||
/* backfill the max_slot_cnt as reqd */
|
/* backfill the max_slot_cnt as reqd */
|
||||||
tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
|
tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
|
||||||
for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
|
for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
|
||||||
opal_argv_append_nosize(&max_slot_cnt, tmp);
|
opal_argv_append_nosize(&max_slot_cnt, tmp);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
if (NULL == mca_ras_simulator_component.topofiles) {
|
if (NULL != mca_ras_simulator_component.topofiles) {
|
||||||
/* use our topology */
|
|
||||||
use_local_topology = true;
|
|
||||||
} else {
|
|
||||||
files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
|
files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
|
||||||
if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
|
if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
|
||||||
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
|
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
|
||||||
goto error_silent;
|
goto error_silent;
|
||||||
}
|
}
|
||||||
|
} else if (NULL != mca_ras_simulator_component.topologies) {
|
||||||
|
topos = opal_argv_split(mca_ras_simulator_component.topologies, ',');
|
||||||
|
if (opal_argv_count(topos) != opal_argv_count(node_cnt)) {
|
||||||
|
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
|
||||||
|
goto error_silent;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* use our topology */
|
||||||
|
use_local_topology = true;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
/* If we don't have hwloc and hwloc files were specified, then
|
/* If we don't have hwloc and hwloc files were specified, then
|
||||||
@ -123,7 +132,7 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
|||||||
if (use_local_topology) {
|
if (use_local_topology) {
|
||||||
/* use our topology */
|
/* use our topology */
|
||||||
topo = opal_hwloc_topology;
|
topo = opal_hwloc_topology;
|
||||||
} else {
|
} else if (NULL != files) {
|
||||||
if (0 != hwloc_topology_init(&topo)) {
|
if (0 != hwloc_topology_init(&topo)) {
|
||||||
orte_show_help("help-ras-simulator.txt",
|
orte_show_help("help-ras-simulator.txt",
|
||||||
"hwloc API fail", true,
|
"hwloc API fail", true,
|
||||||
@ -188,6 +197,69 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
|||||||
t->topo = topo;
|
t->topo = topo;
|
||||||
t->sig = opal_hwloc_base_get_topo_signature(topo);
|
t->sig = opal_hwloc_base_get_topo_signature(topo);
|
||||||
opal_pointer_array_add(orte_node_topologies, t);
|
opal_pointer_array_add(orte_node_topologies, t);
|
||||||
|
} else {
|
||||||
|
if (0 != hwloc_topology_init(&topo)) {
|
||||||
|
orte_show_help("help-ras-simulator.txt",
|
||||||
|
"hwloc API fail", true,
|
||||||
|
__FILE__, __LINE__, "hwloc_topology_init");
|
||||||
|
goto error_silent;
|
||||||
|
}
|
||||||
|
if (0 != hwloc_topology_set_synthetic(topo, topos[n])) {
|
||||||
|
orte_show_help("help-ras-simulator.txt",
|
||||||
|
"hwloc API fail", true,
|
||||||
|
__FILE__, __LINE__, "hwloc_topology_set_synthetic");
|
||||||
|
hwloc_topology_destroy(topo);
|
||||||
|
goto error_silent;
|
||||||
|
}
|
||||||
|
if (0 != hwloc_topology_load(topo)) {
|
||||||
|
orte_show_help("help-ras-simulator.txt",
|
||||||
|
"hwloc API fail", true,
|
||||||
|
__FILE__, __LINE__, "hwloc_topology_load");
|
||||||
|
hwloc_topology_destroy(topo);
|
||||||
|
goto error_silent;
|
||||||
|
}
|
||||||
|
if (OPAL_SUCCESS != opal_hwloc_base_filter_cpus(topo)) {
|
||||||
|
orte_show_help("help-ras-simulator.txt",
|
||||||
|
"hwloc API fail", true,
|
||||||
|
__FILE__, __LINE__, "opal_hwloc_base_filter_cpus");
|
||||||
|
hwloc_topology_destroy(topo);
|
||||||
|
goto error_silent;
|
||||||
|
}
|
||||||
|
/* remove the hostname from the topology. Unfortunately, hwloc
|
||||||
|
* decided to add the source hostname to the "topology", thus
|
||||||
|
* rendering it unusable as a pure topological description. So
|
||||||
|
* we remove that information here.
|
||||||
|
*/
|
||||||
|
obj = hwloc_get_root_obj(topo);
|
||||||
|
for (k=0; k < obj->infos_count; k++) {
|
||||||
|
if (NULL == obj->infos[k].name ||
|
||||||
|
NULL == obj->infos[k].value) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
|
||||||
|
free(obj->infos[k].name);
|
||||||
|
free(obj->infos[k].value);
|
||||||
|
/* left justify the array */
|
||||||
|
for (j=k; j < obj->infos_count-1; j++) {
|
||||||
|
obj->infos[j] = obj->infos[j+1];
|
||||||
|
}
|
||||||
|
obj->infos[obj->infos_count-1].name = NULL;
|
||||||
|
obj->infos[obj->infos_count-1].value = NULL;
|
||||||
|
obj->infos_count--;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* unfortunately, hwloc does not include support info in its
|
||||||
|
* xml output :-(( To aid in debugging, we set it here
|
||||||
|
*/
|
||||||
|
support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
|
||||||
|
support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
|
||||||
|
support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
|
||||||
|
/* add it to our array */
|
||||||
|
t = OBJ_NEW(orte_topology_t);
|
||||||
|
t->topo = topo;
|
||||||
|
t->sig = opal_hwloc_base_get_topo_signature(topo);
|
||||||
|
opal_pointer_array_add(orte_node_topologies, t);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -196,9 +268,19 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
|||||||
asprintf(&node->name, "%s%0*d", prefix, dig, i);
|
asprintf(&node->name, "%s%0*d", prefix, dig, i);
|
||||||
node->state = ORTE_NODE_STATE_UP;
|
node->state = ORTE_NODE_STATE_UP;
|
||||||
node->slots_inuse = 0;
|
node->slots_inuse = 0;
|
||||||
node->slots_max = (NULL == max_slot_cnt[n] ? 0 : atoi(max_slot_cnt[n]));
|
|
||||||
node->slots = (NULL == slot_cnt[n] ? 0 : atoi(slot_cnt[n]));
|
|
||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
|
if (NULL == max_slot_cnt || NULL == max_slot_cnt[n]) {
|
||||||
|
node->slots_max = 0;
|
||||||
|
} else {
|
||||||
|
obj = hwloc_get_root_obj(topo);
|
||||||
|
node->slots_max = opal_hwloc_base_get_npus(topo, obj);
|
||||||
|
}
|
||||||
|
if (NULL == slot_cnt || NULL == slot_cnt[n]) {
|
||||||
|
node->slots = 0;
|
||||||
|
} else {
|
||||||
|
obj = hwloc_get_root_obj(topo);
|
||||||
|
node->slots = opal_hwloc_base_get_npus(topo, obj);
|
||||||
|
}
|
||||||
node->topology = topo;
|
node->topology = topo;
|
||||||
#endif
|
#endif
|
||||||
opal_output_verbose(1, orte_ras_base_framework.framework_output,
|
opal_output_verbose(1, orte_ras_base_framework.framework_output,
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -277,7 +277,8 @@ static int bind_downwards(orte_job_t *jdata,
|
|||||||
}
|
}
|
||||||
/* bozo check */
|
/* bozo check */
|
||||||
locale = NULL;
|
locale = NULL;
|
||||||
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
|
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR) ||
|
||||||
|
NULL == locale) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
|
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
|
||||||
hwloc_bitmap_free(totalcpuset);
|
hwloc_bitmap_free(totalcpuset);
|
||||||
return ORTE_ERR_SILENT;
|
return ORTE_ERR_SILENT;
|
||||||
|
@ -93,9 +93,18 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
|||||||
nprocs = 0;
|
nprocs = 0;
|
||||||
for (i=0; i < jdata->apps->size; i++) {
|
for (i=0; i < jdata->apps->size; i++) {
|
||||||
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||||
|
if (0 == app->num_procs) {
|
||||||
|
opal_list_t nodes;
|
||||||
|
orte_std_cntr_t slots;
|
||||||
|
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||||
|
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
|
||||||
|
OPAL_LIST_DESTRUCT(&nodes);
|
||||||
|
nprocs += slots;
|
||||||
|
} else {
|
||||||
nprocs += app->num_procs;
|
nprocs += app->num_procs;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||||
"mca:rmaps: nprocs %s",
|
"mca:rmaps: nprocs %s",
|
||||||
ORTE_VPID_PRINT(nprocs));
|
ORTE_VPID_PRINT(nprocs));
|
||||||
|
@ -529,7 +529,7 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
|
|||||||
if (orte_get_attribute(&src->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&loc, OPAL_PTR)) {
|
if (orte_get_attribute(&src->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&loc, OPAL_PTR)) {
|
||||||
if (NULL != loc) {
|
if (NULL != loc) {
|
||||||
if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2mapstr(locale, sizeof(locale), src->node->topology, loc->cpuset)) {
|
if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2mapstr(locale, sizeof(locale), src->node->topology, loc->cpuset)) {
|
||||||
strcpy(locale, "UNBOUND");
|
strcpy(locale, "NODE");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
strcpy(locale, "UNKNOWN");
|
strcpy(locale, "UNKNOWN");
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user