Extend capability to support heterogeneous clusters with multiple topologies
This commit was SVN r25474.
Этот коммит содержится в:
родитель
6b5e1b89cf
Коммит
793f4c688f
@ -18,10 +18,12 @@ BEGIN_C_DECLS
|
||||
|
||||
/*
 * State of the simulator RAS component: the base component (`super`) followed
 * by the fields that the component's MCA parameters are registered into.
 * NOTE(review): this listing is a rendered diff, so removed and added members
 * appear side by side without +/- markers (e.g. the two `num_nodes`
 * declarations); confirm against the real header which ones are current.
 */
struct orte_ras_sim_component_t {
|
||||
orte_ras_base_component_t super; /* base RAS component; its base_version is passed to every parameter registration */
|
||||
int num_nodes; /* integer variant of "num_nodes": number of nodes to simulate */
|
||||
char *num_nodes; /* string variant of "num_nodes": comma-separated node counts, one per topology */
|
||||
int slots; /* "slots" parameter: number of slots on each simulated node */
|
||||
int slots_max; /* copied into slots_max of every simulated node */
|
||||
char *topofile; /* "topo_file" parameter: a single XML topology description file */
|
||||
char *topofiles; /* "topo_files" parameter: comma-separated list of XML topology files */
|
||||
bool have_cpubind; /* "have_cpubind" parameter: topology supports binding to cpus */
|
||||
bool have_membind; /* "have_membind" parameter: topology supports binding to memory */
|
||||
};
|
||||
typedef struct orte_ras_sim_component_t orte_ras_sim_component_t; /* lets the type be named without the struct keyword */
|
||||
|
||||
|
@ -65,10 +65,8 @@ orte_ras_sim_component_t mca_ras_simulator_component = {
|
||||
|
||||
static int ras_sim_open(void)
|
||||
{
|
||||
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
|
||||
"num_nodes",
|
||||
"Number of nodes to simulate",
|
||||
false, false, 0, &mca_ras_simulator_component.num_nodes);
|
||||
int tmp;
|
||||
|
||||
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
|
||||
"slots",
|
||||
"Number of slots on each node to simulate",
|
||||
@ -79,18 +77,37 @@ static int ras_sim_open(void)
|
||||
false, false, 0, &mca_ras_simulator_component.slots_max);
|
||||
#if OPAL_HAVE_HWLOC
|
||||
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
|
||||
"topo_file",
|
||||
"File containing xml topology description for simulated nodes",
|
||||
false, false, NULL, &mca_ras_simulator_component.topofile);
|
||||
"num_nodes",
|
||||
"Comma-separated list of number of nodes to simulate for each topology",
|
||||
false, false, NULL, &mca_ras_simulator_component.num_nodes);
|
||||
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
|
||||
"topo_files",
|
||||
"Comma-separated list of files containing xml topology descriptions for simulated nodes",
|
||||
false, false, NULL, &mca_ras_simulator_component.topofiles);
|
||||
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
|
||||
"have_cpubind",
|
||||
"Topology supports binding to cpus",
|
||||
false, false, (int)true, &tmp);
|
||||
mca_ras_simulator_component.have_cpubind = OPAL_INT_TO_BOOL(tmp);
|
||||
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
|
||||
"have_membind",
|
||||
"Topology supports binding to memory",
|
||||
false, false, (int)true, &tmp);
|
||||
mca_ras_simulator_component.have_membind = OPAL_INT_TO_BOOL(tmp);
|
||||
#else
|
||||
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
|
||||
"num_nodes",
|
||||
"Number of nodes to simulate",
|
||||
false, false, NULL, &mca_ras_simulator_component.num_nodes);
|
||||
#endif
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int ras_sim_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
if (0 < mca_ras_simulator_component.num_nodes) {
|
||||
if (NULL != mca_ras_simulator_component.num_nodes) {
|
||||
*module = (mca_base_module_t *) &orte_ras_sim_module;
|
||||
*priority = 1000;
|
||||
/* cannot launch simulated nodes or resolve their names to addresses */
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "ras_sim.h"
|
||||
@ -39,83 +40,125 @@ orte_ras_base_module_t orte_ras_sim_module = {
|
||||
|
||||
static int allocate(opal_list_t *nodes)
|
||||
{
|
||||
int i, val, dig;
|
||||
int i, n, val, dig, num_nodes;
|
||||
orte_node_t *node;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
hwloc_topology_t topo;
|
||||
hwloc_obj_t obj;
|
||||
unsigned j, k;
|
||||
struct hwloc_topology_support *support;
|
||||
char **files=NULL;
|
||||
bool use_local_topology = false;
|
||||
#endif
|
||||
char **node_cnt=NULL;
|
||||
|
||||
node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
if (NULL == mca_ras_simulator_component.topofiles) {
|
||||
/* use our topology */
|
||||
use_local_topology = true;
|
||||
if (1 != opal_argv_count(node_cnt)) {
|
||||
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
} else {
|
||||
files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
|
||||
if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
|
||||
orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* count the total number of nodes */
|
||||
val = 0;
|
||||
for (n=0; NULL != node_cnt[n]; n++) {
|
||||
val += strtol(node_cnt[n], NULL, 10);
|
||||
}
|
||||
/* get number of digits */
|
||||
val = mca_ras_simulator_component.num_nodes;
|
||||
for (dig=0; 0 != val; dig++) {
|
||||
val /= 10;
|
||||
}
|
||||
|
||||
/* check for topology */
|
||||
/* process the request */
|
||||
val = 0;
|
||||
for (n=0; NULL != node_cnt[n]; n++) {
|
||||
num_nodes = strtol(node_cnt[n], NULL, 10);
|
||||
|
||||
/* check for topology */
|
||||
#if OPAL_HAVE_HWLOC
|
||||
if (NULL == mca_ras_simulator_component.topofile) {
|
||||
/* use our topology */
|
||||
topo = opal_hwloc_topology;
|
||||
} else {
|
||||
if (0 != hwloc_topology_init(&topo)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (0 != hwloc_topology_set_xml(topo, mca_ras_simulator_component.topofile)) {
|
||||
hwloc_topology_destroy(topo);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* since we are loading this from an external source, we have to
|
||||
* explicitly set a flag so hwloc sets things up correctly
|
||||
*/
|
||||
if (0 != hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)) {
|
||||
hwloc_topology_destroy(topo);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (0 != hwloc_topology_load(topo)) {
|
||||
hwloc_topology_destroy(topo);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* remove the hostname from the topology. Unfortunately, hwloc
|
||||
* decided to add the source hostname to the "topology", thus
|
||||
* rendering it unusable as a pure topological description. So
|
||||
* we remove that information here.
|
||||
*/
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
for (k=0; k < obj->infos_count; k++) {
|
||||
if (NULL == obj->infos[k].name ||
|
||||
NULL == obj->infos[k].value) {
|
||||
continue;
|
||||
if (use_local_topology) {
|
||||
/* use our topology */
|
||||
topo = opal_hwloc_topology;
|
||||
} else {
|
||||
if (0 != hwloc_topology_init(&topo)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
|
||||
free(obj->infos[k].name);
|
||||
free(obj->infos[k].value);
|
||||
/* left justify the array */
|
||||
for (j=k; j < obj->infos_count-1; j++) {
|
||||
obj->infos[j] = obj->infos[j+1];
|
||||
if (0 != hwloc_topology_set_xml(topo, files[n])) {
|
||||
hwloc_topology_destroy(topo);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* since we are loading this from an external source, we have to
|
||||
* explicitly set a flag so hwloc sets things up correctly
|
||||
*/
|
||||
if (0 != hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)) {
|
||||
hwloc_topology_destroy(topo);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (0 != hwloc_topology_load(topo)) {
|
||||
hwloc_topology_destroy(topo);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* remove the hostname from the topology. Unfortunately, hwloc
|
||||
* decided to add the source hostname to the "topology", thus
|
||||
* rendering it unusable as a pure topological description. So
|
||||
* we remove that information here.
|
||||
*/
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
for (k=0; k < obj->infos_count; k++) {
|
||||
if (NULL == obj->infos[k].name ||
|
||||
NULL == obj->infos[k].value) {
|
||||
continue;
|
||||
}
|
||||
if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
|
||||
free(obj->infos[k].name);
|
||||
free(obj->infos[k].value);
|
||||
/* left justify the array */
|
||||
for (j=k; j < obj->infos_count-1; j++) {
|
||||
obj->infos[j] = obj->infos[j+1];
|
||||
}
|
||||
obj->infos[obj->infos_count-1].name = NULL;
|
||||
obj->infos[obj->infos_count-1].value = NULL;
|
||||
obj->infos_count--;
|
||||
break;
|
||||
}
|
||||
obj->infos[obj->infos_count-1].name = NULL;
|
||||
obj->infos[obj->infos_count-1].value = NULL;
|
||||
obj->infos_count--;
|
||||
break;
|
||||
}
|
||||
/* unfortunately, hwloc does not include support info in its
|
||||
* xml output :-(( To aid in debugging, we set it here
|
||||
*/
|
||||
support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
|
||||
support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
|
||||
support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
|
||||
/* add it to our array */
|
||||
opal_pointer_array_add(orte_node_topologies, topo);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i=0; i < mca_ras_simulator_component.num_nodes; i++) {
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
asprintf(&node->name, "node%0*d", dig, i);
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = mca_ras_simulator_component.slots_max;
|
||||
node->slots = mca_ras_simulator_component.slots;
|
||||
for (i=0; i < num_nodes; i++) {
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
asprintf(&node->name, "node%0*d", dig, val++);
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = mca_ras_simulator_component.slots_max;
|
||||
node->slots = mca_ras_simulator_component.slots;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
node->topology = topo;
|
||||
node->topology = topo;
|
||||
#endif
|
||||
opal_list_append(nodes, &node->super);
|
||||
}
|
||||
opal_list_append(nodes, &node->super);
|
||||
}
|
||||
}
|
||||
|
||||
/* record the number of allocated nodes */
|
||||
orte_num_allocated_nodes = opal_list_get_size(nodes);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user