1
1

Extend capability to support heterogeneous clusters with multiple topologies

This commit was SVN r25474.
Этот коммит содержится в:
Ralph Castain 2011-11-13 23:23:09 +00:00
родитель 6b5e1b89cf
Коммит 793f4c688f
3 изменённых файлов: 129 добавлений и 67 удалений

Просмотреть файл

@ -18,10 +18,12 @@ BEGIN_C_DECLS
/*
 * State for the simulator RAS component.  The fields are populated from
 * MCA parameters registered in ras_sim_open() and consumed by allocate()
 * when it fabricates the simulated node list.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so a
 * pre-change declaration may appear directly above its replacement
 * (`int num_nodes` / `char *num_nodes`, `char *topofile` /
 * `char *topofiles`).  Presumably only the second of each pair exists in
 * the resulting header -- confirm against the repository.
 */
struct orte_ras_sim_component_t {
/* Base RAS component; its base_version is the handle used when registering MCA params. */
orte_ras_base_component_t super;
/* Integer form: "Number of nodes to simulate" (registered with mca_base_param_reg_int). */
int num_nodes;
/* String form: comma-separated list of node counts, one entry per topology;
 * NULL when the parameter was not given, in which case the component is not selected. */
char *num_nodes;
/* MCA param "slots": number of slots on each simulated node. */
int slots;
/* Copied into each simulated node's slots_max by allocate(). */
int slots_max;
/* Single file containing an xml topology description for the simulated nodes. */
char *topofile;
/* Comma-separated list of xml topology files; NULL means use the local topology. */
char *topofiles;
/* Whether simulated topologies report support for binding to cpus (param default: true). */
bool have_cpubind;
/* Whether simulated topologies report support for binding to memory (param default: true). */
bool have_membind;
};
/* Convenience alias for the component struct. */
typedef struct orte_ras_sim_component_t orte_ras_sim_component_t;

Просмотреть файл

@ -65,10 +65,8 @@ orte_ras_sim_component_t mca_ras_simulator_component = {
static int ras_sim_open(void)
{
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
"num_nodes",
"Number of nodes to simulate",
false, false, 0, &mca_ras_simulator_component.num_nodes);
int tmp;
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
"slots",
"Number of slots on each node to simulate",
@ -79,9 +77,28 @@ static int ras_sim_open(void)
false, false, 0, &mca_ras_simulator_component.slots_max);
#if OPAL_HAVE_HWLOC
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
"topo_file",
"File containing xml topology description for simulated nodes",
false, false, NULL, &mca_ras_simulator_component.topofile);
"num_nodes",
"Comma-separated list of number of nodes to simulate for each topology",
false, false, NULL, &mca_ras_simulator_component.num_nodes);
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
"topo_files",
"Comma-separated list of files containing xml topology descriptions for simulated nodes",
false, false, NULL, &mca_ras_simulator_component.topofiles);
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
"have_cpubind",
"Topology supports binding to cpus",
false, false, (int)true, &tmp);
mca_ras_simulator_component.have_cpubind = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
"have_membind",
"Topology supports binding to memory",
false, false, (int)true, &tmp);
mca_ras_simulator_component.have_membind = OPAL_INT_TO_BOOL(tmp);
#else
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
"num_nodes",
"Number of nodes to simulate",
false, false, NULL, &mca_ras_simulator_component.num_nodes);
#endif
return ORTE_SUCCESS;
@ -90,7 +107,7 @@ static int ras_sim_open(void)
static int ras_sim_component_query(mca_base_module_t **module, int *priority)
{
if (0 < mca_ras_simulator_component.num_nodes) {
if (NULL != mca_ras_simulator_component.num_nodes) {
*module = (mca_base_module_t *) &orte_ras_sim_module;
*priority = 1000;
/* cannot launch simulated nodes or resolve their names to addresses */

Просмотреть файл

@ -17,6 +17,7 @@
#include "opal/class/opal_list.h"
#include "opal/mca/hwloc/hwloc.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "ras_sim.h"
@ -39,30 +40,63 @@ orte_ras_base_module_t orte_ras_sim_module = {
static int allocate(opal_list_t *nodes)
{
int i, val, dig;
int i, n, val, dig, num_nodes;
orte_node_t *node;
#if OPAL_HAVE_HWLOC
hwloc_topology_t topo;
hwloc_obj_t obj;
unsigned j, k;
struct hwloc_topology_support *support;
char **files=NULL;
bool use_local_topology = false;
#endif
char **node_cnt=NULL;
node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
#if OPAL_HAVE_HWLOC
if (NULL == mca_ras_simulator_component.topofiles) {
/* use our topology */
use_local_topology = true;
if (1 != opal_argv_count(node_cnt)) {
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
return ORTE_ERR_SILENT;
}
} else {
files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
return ORTE_ERR_SILENT;
}
}
#endif
/* count the total number of nodes */
val = 0;
for (n=0; NULL != node_cnt[n]; n++) {
val += strtol(node_cnt[n], NULL, 10);
}
/* get number of digits */
val = mca_ras_simulator_component.num_nodes;
for (dig=0; 0 != val; dig++) {
val /= 10;
}
/* process the request */
val = 0;
for (n=0; NULL != node_cnt[n]; n++) {
num_nodes = strtol(node_cnt[n], NULL, 10);
/* check for topology */
#if OPAL_HAVE_HWLOC
if (NULL == mca_ras_simulator_component.topofile) {
if (use_local_topology) {
/* use our topology */
topo = opal_hwloc_topology;
} else {
if (0 != hwloc_topology_init(&topo)) {
return ORTE_ERROR;
}
if (0 != hwloc_topology_set_xml(topo, mca_ras_simulator_component.topofile)) {
if (0 != hwloc_topology_set_xml(topo, files[n])) {
hwloc_topology_destroy(topo);
return ORTE_ERROR;
}
@ -101,12 +135,20 @@ static int allocate(opal_list_t *nodes)
break;
}
}
/* unfortunately, hwloc does not include support info in its
* xml output :-(( To aid in debugging, we set it here
*/
support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
/* add it to our array */
opal_pointer_array_add(orte_node_topologies, topo);
}
#endif
for (i=0; i < mca_ras_simulator_component.num_nodes; i++) {
for (i=0; i < num_nodes; i++) {
node = OBJ_NEW(orte_node_t);
asprintf(&node->name, "node%0*d", dig, i);
asprintf(&node->name, "node%0*d", dig, val++);
node->state = ORTE_NODE_STATE_UP;
node->slots_inuse = 0;
node->slots_max = mca_ras_simulator_component.slots_max;
@ -116,6 +158,7 @@ static int allocate(opal_list_t *nodes)
#endif
opal_list_append(nodes, &node->super);
}
}
/* record the number of allocated nodes */
orte_num_allocated_nodes = opal_list_get_size(nodes);