1
1

Merge pull request #692 from rhc54/topic/mapper

Fix hetero operations. An error in the hwloc utilities only allocated…
Этот коммит содержится в:
rhc54 2015-07-07 13:33:42 -07:00
родитель a4aff5e3d9 ed93154e43
Коммит c6bb227073
7 изменённых файлов: 146 добавлений и 44 удалений

Просмотреть файл

@ -1703,26 +1703,23 @@ static char *bitmap2rangestr(int bitmap)
static int build_map(int *num_sockets_arg, int *num_cores_arg,
hwloc_cpuset_t cpuset, int ***map, hwloc_topology_t topo)
{
static int num_sockets = -1, num_cores = -1;
int num_sockets, num_cores;
int socket_index, core_index, pu_index;
hwloc_obj_t socket, core, pu;
int **data;
/* Find out how many sockets we have (cached so that we don't have
to look this up every time) */
if (num_sockets < 0) {
num_sockets = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_SOCKET);
/* some systems (like the iMac) only have one
* socket and so don't report a socket
*/
if (0 == num_sockets) {
num_sockets = 1;
}
/* Lazy: take the total number of cores that we have in the
topology; that'll be more than the max number of cores
under any given socket */
num_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
/* Find out how many sockets we have */
num_sockets = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_SOCKET);
/* some systems (like the iMac) only have one
* socket and so don't report a socket
*/
if (0 == num_sockets) {
num_sockets = 1;
}
/* Lazy: take the total number of cores that we have in the
topology; that'll be more than the max number of cores
under any given socket */
num_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
*num_sockets_arg = num_sockets;
*num_cores_arg = num_cores;
@ -1791,7 +1788,7 @@ int opal_hwloc_base_cset2str(char *str, int len,
int ret, socket_index, core_index;
char tmp[BUFSIZ];
const int stmp = sizeof(tmp) - 1;
int **map;
int **map=NULL;
hwloc_obj_t root;
opal_hwloc_topo_data_t *sum;
@ -1819,7 +1816,6 @@ int opal_hwloc_base_cset2str(char *str, int len,
if (OPAL_SUCCESS != (ret = build_map(&num_sockets, &num_cores, cpuset, &map, topo))) {
return ret;
}
/* Iterate over the data matrix and build up the string */
first = true;
for (socket_index = 0; socket_index < num_sockets; ++socket_index) {
@ -1837,8 +1833,12 @@ int opal_hwloc_base_cset2str(char *str, int len,
}
}
}
free(map[0]);
free(map);
if (NULL != map) {
if (NULL != map[0]) {
free(map[0]);
}
free(map);
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,6 @@
/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -22,6 +23,7 @@ struct orte_ras_sim_component_t {
char * slots;
char * slots_max;
char *topofiles;
char *topologies;
bool have_cpubind;
bool have_membind;
};

Просмотреть файл

@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -95,6 +96,13 @@ static int ras_sim_register(void)
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_ras_simulator_component.topofiles);
mca_ras_simulator_component.topologies = NULL;
(void) mca_base_component_var_register (component, "topologies",
"Comma-separated list of topology descriptions for simulated nodes",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_ras_simulator_component.topologies);
mca_ras_simulator_component.have_cpubind = true;
(void) mca_base_component_var_register (component, "have_cpubind",
"Topology supports binding to cpus",

Просмотреть файл

@ -3,6 +3,7 @@
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
@ -56,6 +57,7 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
unsigned j, k;
struct hwloc_topology_support *support;
char **files=NULL;
char **topos = NULL;
bool use_local_topology = false;
#endif
char **node_cnt=NULL;
@ -65,32 +67,39 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
char prefix[6];
node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
/* backfill the slot_cnt as reqd so we don't have to
* specify slot_cnt for each set of nodes - we'll set
* */
tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
opal_argv_append_nosize(&slot_cnt, tmp);
if (NULL != mca_ras_simulator_component.slots) {
slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
/* backfill the slot_cnt so every topology has a cnt */
tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
opal_argv_append_nosize(&slot_cnt, tmp);
}
}
/* backfill the max_slot_cnt as reqd */
tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
opal_argv_append_nosize(&max_slot_cnt, tmp);
if (NULL != mca_ras_simulator_component.slots_max) {
max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
/* backfill the max_slot_cnt as reqd */
tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
opal_argv_append_nosize(&max_slot_cnt, tmp);
}
}
#if OPAL_HAVE_HWLOC
if (NULL == mca_ras_simulator_component.topofiles) {
/* use our topology */
use_local_topology = true;
} else {
if (NULL != mca_ras_simulator_component.topofiles) {
files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
goto error_silent;
}
} else if (NULL != mca_ras_simulator_component.topologies) {
topos = opal_argv_split(mca_ras_simulator_component.topologies, ',');
if (opal_argv_count(topos) != opal_argv_count(node_cnt)) {
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
goto error_silent;
}
} else {
/* use our topology */
use_local_topology = true;
}
#else
/* If we don't have hwloc and hwloc files were specified, then
@ -123,7 +132,7 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
if (use_local_topology) {
/* use our topology */
topo = opal_hwloc_topology;
} else {
} else if (NULL != files) {
if (0 != hwloc_topology_init(&topo)) {
orte_show_help("help-ras-simulator.txt",
"hwloc API fail", true,
@ -188,6 +197,69 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
t->topo = topo;
t->sig = opal_hwloc_base_get_topo_signature(topo);
opal_pointer_array_add(orte_node_topologies, t);
} else {
if (0 != hwloc_topology_init(&topo)) {
orte_show_help("help-ras-simulator.txt",
"hwloc API fail", true,
__FILE__, __LINE__, "hwloc_topology_init");
goto error_silent;
}
if (0 != hwloc_topology_set_synthetic(topo, topos[n])) {
orte_show_help("help-ras-simulator.txt",
"hwloc API fail", true,
__FILE__, __LINE__, "hwloc_topology_set_synthetic");
hwloc_topology_destroy(topo);
goto error_silent;
}
if (0 != hwloc_topology_load(topo)) {
orte_show_help("help-ras-simulator.txt",
"hwloc API fail", true,
__FILE__, __LINE__, "hwloc_topology_load");
hwloc_topology_destroy(topo);
goto error_silent;
}
if (OPAL_SUCCESS != opal_hwloc_base_filter_cpus(topo)) {
orte_show_help("help-ras-simulator.txt",
"hwloc API fail", true,
__FILE__, __LINE__, "opal_hwloc_base_filter_cpus");
hwloc_topology_destroy(topo);
goto error_silent;
}
/* remove the hostname from the topology. Unfortunately, hwloc
* decided to add the source hostname to the "topology", thus
* rendering it unusable as a pure topological description. So
* we remove that information here.
*/
obj = hwloc_get_root_obj(topo);
for (k=0; k < obj->infos_count; k++) {
if (NULL == obj->infos[k].name ||
NULL == obj->infos[k].value) {
continue;
}
if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
free(obj->infos[k].name);
free(obj->infos[k].value);
/* left justify the array */
for (j=k; j < obj->infos_count-1; j++) {
obj->infos[j] = obj->infos[j+1];
}
obj->infos[obj->infos_count-1].name = NULL;
obj->infos[obj->infos_count-1].value = NULL;
obj->infos_count--;
break;
}
}
/* unfortunately, hwloc does not include support info in its
* xml output :-(( To aid in debugging, we set it here
*/
support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
/* add it to our array */
t = OBJ_NEW(orte_topology_t);
t->topo = topo;
t->sig = opal_hwloc_base_get_topo_signature(topo);
opal_pointer_array_add(orte_node_topologies, t);
}
#endif
@ -196,9 +268,19 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
asprintf(&node->name, "%s%0*d", prefix, dig, i);
node->state = ORTE_NODE_STATE_UP;
node->slots_inuse = 0;
node->slots_max = (NULL == max_slot_cnt[n] ? 0 : atoi(max_slot_cnt[n]));
node->slots = (NULL == slot_cnt[n] ? 0 : atoi(slot_cnt[n]));
#if OPAL_HAVE_HWLOC
if (NULL == max_slot_cnt || NULL == max_slot_cnt[n]) {
node->slots_max = 0;
} else {
obj = hwloc_get_root_obj(topo);
node->slots_max = opal_hwloc_base_get_npus(topo, obj);
}
if (NULL == slot_cnt || NULL == slot_cnt[n]) {
node->slots = 0;
} else {
obj = hwloc_get_root_obj(topo);
node->slots = opal_hwloc_base_get_npus(topo, obj);
}
node->topology = topo;
#endif
opal_output_verbose(1, orte_ras_base_framework.framework_output,

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -277,7 +277,8 @@ static int bind_downwards(orte_job_t *jdata,
}
/* bozo check */
locale = NULL;
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR) ||
NULL == locale) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;

Просмотреть файл

@ -93,7 +93,16 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
nprocs = 0;
for (i=0; i < jdata->apps->size; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
nprocs += app->num_procs;
if (0 == app->num_procs) {
opal_list_t nodes;
orte_std_cntr_t slots;
OBJ_CONSTRUCT(&nodes, opal_list_t);
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
OPAL_LIST_DESTRUCT(&nodes);
nprocs += slots;
} else {
nprocs += app->num_procs;
}
}
}
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,

Просмотреть файл

@ -529,7 +529,7 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
if (orte_get_attribute(&src->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&loc, OPAL_PTR)) {
if (NULL != loc) {
if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2mapstr(locale, sizeof(locale), src->node->topology, loc->cpuset)) {
strcpy(locale, "UNBOUND");
strcpy(locale, "NODE");
}
} else {
strcpy(locale, "UNKNOWN");