1
1

Extend the RAS simulator component to support heterogeneous clusters with multiple topologies

This commit was SVN r25474.
Этот коммит содержится в:
Ralph Castain 2011-11-13 23:23:09 +00:00
родитель 6b5e1b89cf
Коммит 793f4c688f
3 изменённых файла: 129 добавлений и 67 удалений

Просмотреть файл

@ -18,10 +18,12 @@ BEGIN_C_DECLS
struct orte_ras_sim_component_t { struct orte_ras_sim_component_t {
orte_ras_base_component_t super; orte_ras_base_component_t super;
int num_nodes; char *num_nodes;
int slots; int slots;
int slots_max; int slots_max;
char *topofile; char *topofiles;
bool have_cpubind;
bool have_membind;
}; };
typedef struct orte_ras_sim_component_t orte_ras_sim_component_t; typedef struct orte_ras_sim_component_t orte_ras_sim_component_t;

Просмотреть файл

@ -65,10 +65,8 @@ orte_ras_sim_component_t mca_ras_simulator_component = {
static int ras_sim_open(void) static int ras_sim_open(void)
{ {
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version, int tmp;
"num_nodes",
"Number of nodes to simulate",
false, false, 0, &mca_ras_simulator_component.num_nodes);
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version, mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
"slots", "slots",
"Number of slots on each node to simulate", "Number of slots on each node to simulate",
@ -79,9 +77,28 @@ static int ras_sim_open(void)
false, false, 0, &mca_ras_simulator_component.slots_max); false, false, 0, &mca_ras_simulator_component.slots_max);
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version, mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
"topo_file", "num_nodes",
"File containing xml topology description for simulated nodes", "Comma-separated list of number of nodes to simulate for each topology",
false, false, NULL, &mca_ras_simulator_component.topofile); false, false, NULL, &mca_ras_simulator_component.num_nodes);
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
"topo_files",
"Comma-separated list of files containing xml topology descriptions for simulated nodes",
false, false, NULL, &mca_ras_simulator_component.topofiles);
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
"have_cpubind",
"Topology supports binding to cpus",
false, false, (int)true, &tmp);
mca_ras_simulator_component.have_cpubind = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
"have_membind",
"Topology supports binding to memory",
false, false, (int)true, &tmp);
mca_ras_simulator_component.have_membind = OPAL_INT_TO_BOOL(tmp);
#else
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
"num_nodes",
"Number of nodes to simulate",
false, false, NULL, &mca_ras_simulator_component.num_nodes);
#endif #endif
return ORTE_SUCCESS; return ORTE_SUCCESS;
@ -90,7 +107,7 @@ static int ras_sim_open(void)
static int ras_sim_component_query(mca_base_module_t **module, int *priority) static int ras_sim_component_query(mca_base_module_t **module, int *priority)
{ {
if (0 < mca_ras_simulator_component.num_nodes) { if (NULL != mca_ras_simulator_component.num_nodes) {
*module = (mca_base_module_t *) &orte_ras_sim_module; *module = (mca_base_module_t *) &orte_ras_sim_module;
*priority = 1000; *priority = 1000;
/* cannot launch simulated nodes or resolve their names to addresses */ /* cannot launch simulated nodes or resolve their names to addresses */

Просмотреть файл

@ -17,6 +17,7 @@
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "opal/mca/hwloc/hwloc.h" #include "opal/mca/hwloc/hwloc.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "ras_sim.h" #include "ras_sim.h"
@ -39,30 +40,63 @@ orte_ras_base_module_t orte_ras_sim_module = {
static int allocate(opal_list_t *nodes) static int allocate(opal_list_t *nodes)
{ {
int i, val, dig; int i, n, val, dig, num_nodes;
orte_node_t *node; orte_node_t *node;
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
hwloc_topology_t topo; hwloc_topology_t topo;
hwloc_obj_t obj; hwloc_obj_t obj;
unsigned j, k; unsigned j, k;
struct hwloc_topology_support *support;
char **files=NULL;
bool use_local_topology = false;
#endif
char **node_cnt=NULL;
node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
#if OPAL_HAVE_HWLOC
if (NULL == mca_ras_simulator_component.topofiles) {
/* use our topology */
use_local_topology = true;
if (1 != opal_argv_count(node_cnt)) {
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
return ORTE_ERR_SILENT;
}
} else {
files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
return ORTE_ERR_SILENT;
}
}
#endif #endif
/* count the total number of nodes */
val = 0;
for (n=0; NULL != node_cnt[n]; n++) {
val += strtol(node_cnt[n], NULL, 10);
}
/* get number of digits */ /* get number of digits */
val = mca_ras_simulator_component.num_nodes;
for (dig=0; 0 != val; dig++) { for (dig=0; 0 != val; dig++) {
val /= 10; val /= 10;
} }
/* process the request */
val = 0;
for (n=0; NULL != node_cnt[n]; n++) {
num_nodes = strtol(node_cnt[n], NULL, 10);
/* check for topology */ /* check for topology */
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
if (NULL == mca_ras_simulator_component.topofile) { if (use_local_topology) {
/* use our topology */ /* use our topology */
topo = opal_hwloc_topology; topo = opal_hwloc_topology;
} else { } else {
if (0 != hwloc_topology_init(&topo)) { if (0 != hwloc_topology_init(&topo)) {
return ORTE_ERROR; return ORTE_ERROR;
} }
if (0 != hwloc_topology_set_xml(topo, mca_ras_simulator_component.topofile)) { if (0 != hwloc_topology_set_xml(topo, files[n])) {
hwloc_topology_destroy(topo); hwloc_topology_destroy(topo);
return ORTE_ERROR; return ORTE_ERROR;
} }
@ -101,12 +135,20 @@ static int allocate(opal_list_t *nodes)
break; break;
} }
} }
/* unfortunately, hwloc does not include support info in its
* xml output :-(( To aid in debugging, we set it here
*/
support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
/* add it to our array */
opal_pointer_array_add(orte_node_topologies, topo);
} }
#endif #endif
for (i=0; i < mca_ras_simulator_component.num_nodes; i++) { for (i=0; i < num_nodes; i++) {
node = OBJ_NEW(orte_node_t); node = OBJ_NEW(orte_node_t);
asprintf(&node->name, "node%0*d", dig, i); asprintf(&node->name, "node%0*d", dig, val++);
node->state = ORTE_NODE_STATE_UP; node->state = ORTE_NODE_STATE_UP;
node->slots_inuse = 0; node->slots_inuse = 0;
node->slots_max = mca_ras_simulator_component.slots_max; node->slots_max = mca_ras_simulator_component.slots_max;
@ -116,6 +158,7 @@ static int allocate(opal_list_t *nodes)
#endif #endif
opal_list_append(nodes, &node->super); opal_list_append(nodes, &node->super);
} }
}
/* record the number of allocated nodes */ /* record the number of allocated nodes */
orte_num_allocated_nodes = opal_list_get_size(nodes); orte_num_allocated_nodes = opal_list_get_size(nodes);