1
1

Add a simple pattern mapper as an example of how to use the topology info to create desired mappings. Let the user specify a pattern based on resource types, and map that pattern across all available nodes as resources permit.

Don't automatically display the topology for each node when --display-devel-map is set as it can overwhelm the reader. Use a separate flag --display-topo to get it.

This commit was SVN r25396.
Этот коммит содержится в:
Ralph Castain 2011-10-29 15:12:45 +00:00
родитель 12a589130a
Коммит 648c85b41b
11 изменённых файлов: 834 добавлений и 5 удалений

Просмотреть файл

@ -179,7 +179,12 @@ int orte_rmaps_base_open(void)
orte_rmaps_base.display_map = true;
orte_devel_level_output = true;
}
/* should we display the topology along with the map? */
mca_base_param_reg_int_name("rmaps", "base_display_topo_with_map",
"Whether to display the topology with the map",
false, false, (int)false, &value);
orte_display_topo_with_map = OPAL_INT_TO_BOOL(value);
/* Open up all the components that we can find */
if (ORTE_SUCCESS !=
mca_base_components_open("rmaps", orte_rmaps_base.rmaps_output,

36
orte/mca/rmaps/ppr/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,36 @@
#
# Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Help messages shipped with the component
dist_pkgdata_DATA = help-orte-rmaps-ppr.txt

sources = \
        rmaps_ppr.c \
        rmaps_ppr.h \
        rmaps_ppr_component.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_orte_rmaps_ppr_DSO
component_noinst =
component_install = mca_rmaps_ppr.la
else
component_noinst = libmca_rmaps_ppr.la
component_install =
endif

mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_rmaps_ppr_la_SOURCES = $(sources)
mca_rmaps_ppr_la_LDFLAGS = -module -avoid-version

noinst_LTLIBRARIES = $(component_noinst)
libmca_rmaps_ppr_la_SOURCES = $(sources)
libmca_rmaps_ppr_la_LDFLAGS = -module -avoid-version

26
orte/mca/rmaps/ppr/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,26 @@
# -*- shell-script -*-
#
# Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# MCA_orte_rmaps_ppr_CONFIG([action-if-found], [action-if-not-found])
# -------------------------------------------------------------------------
# The ppr mapper maps processes onto hardware resources, so it can only
# be built when hwloc support is available.
AC_DEFUN([MCA_orte_rmaps_ppr_CONFIG],[
    # ensure the hwloc framework has been configured before we test it
    AC_REQUIRE([MCA_opal_hwloc_CONFIG_REQUIRE])
    AC_CONFIG_FILES([orte/mca/rmaps/ppr/Makefile])

    # All we check for is whether $OPAL_HAVE_HWLOC is 1.
    # See big comment in opal/mca/hwloc/configure.m4.
    AC_MSG_CHECKING([if hwloc is enabled])
    AS_IF([test $OPAL_HAVE_HWLOC -eq 1],
          [AC_MSG_RESULT([yes])
           $1],
          [AC_MSG_RESULT([no])
           $2])
])dnl

71
orte/mca/rmaps/ppr/help-orte-rmaps-ppr.txt Обычный файл
Просмотреть файл

@ -0,0 +1,71 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
#
[invalid-ppr]
An invalid value was given for the number of processes
per resource (ppr) to be mapped on each node:
PPR: %s
The specification must be a comma-separated list containing
combinations of number, followed by a colon, followed
by the resource type. For example, a value of "1:socket" indicates that
one process is to be mapped onto each socket. Values are supported
for hwthread, core, L1-3 caches, socket, numa, and node. Note that
enough characters must be provided to clearly specify the desired
resource (e.g., "nu" for "numa").
#
[unrecognized-ppr-option]
An unrecognized value was given for the number of processes
per resource (ppr) to be mapped on each node:
Value: %s
PPR: %s
The specification must be a number, followed by a colon, followed
by the resource type. For example, a value of "1:socket" indicates that
one process is to be mapped onto each socket.
Only values for "hwthread", "core", "socket",
"l1cache", "l2cache", "l3cache", "numa", and "node" are allowed. Note that
enough characters must be provided to clearly specify the desired
resource (e.g., "nu" for "numa").
#
[ppr-violation]
The provided mapping directives resulted in too many processes
being placed on a node:
Node: %s
Num procs: %d
Limiting resource: %s
Num resources: %d
Specified constraint: %s
Please adjust and try again.
#
[ppr-too-many-procs]
Your job has requested more processes than the ppr for
this topology can support:
App: %s
Number of procs: %d
PPR: %s
Please revise the conflict and try again.

453
orte/mca/rmaps/ppr/rmaps_ppr.c Обычный файл
Просмотреть файл

@ -0,0 +1,453 @@
/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_ppr.h"
/* the mapping entry point - forward declared so the module
 * structure below can reference it */
static int ppr(orte_job_t *jdata);

/* the rmaps module interface exposed by this component */
orte_rmaps_base_module_t orte_rmaps_ppr_module = {
    ppr
};

/* local helpers - see the definitions below for details */
static orte_proc_t* setup_proc(orte_job_t *jdata, orte_node_t *node);
static void prune(orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped);
/*
 * Map the job according to the user-specified processes-per-resource
 * pattern: for each app context, cycle across the allocated nodes and
 * place the specified number of procs on each instance of the lowest
 * resource level named in the pattern, then prune upward if more than
 * one level was specified.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_TAKE_NEXT_OPTION if this mapper does
 * not apply, or an error code on failure.
 */
static int ppr(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, local_limit, j;
    orte_rmaps_ppr_component_t *c = &mca_rmaps_ppr_component;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    hwloc_obj_t obj = NULL;          /* locale of the proc being mapped */
    hwloc_obj_type_t lowest;
    opal_hwloc_level_t level;
    unsigned cache_level = 0;        /* only meaningful when lowest level is a cache */
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    unsigned int nobjs, i;
    orte_std_cntr_t idx;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
     */
    if (ORTE_JOB_STATE_INIT != jdata->state) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: job %s not in initial state - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->super.base_version.mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:ppr: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->super.base_version.mca_component_name);

    /* convenience */
    local_limit = mca_rmaps_ppr_component.ppr[mca_rmaps_ppr_component.start];
    level = mca_rmaps_ppr_component.start;

    /* find the lowest level that was defined in the ppr */
    lowest = opal_hwloc_levels[mca_rmaps_ppr_component.start];
    /* hwloc identifies caches by type+depth, so pick out the depth */
    if (OPAL_HWLOC_L3CACHE_LEVEL == mca_rmaps_ppr_component.start) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == mca_rmaps_ppr_component.start) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == mca_rmaps_ppr_component.start) {
        cache_level = 1;
    }

    /* construct the list once, up front - it is refilled and drained
     * for each app so we never OBJ_CONSTRUCT over a live list, and the
     * cleanup code below is safe even if there are no apps
     */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    for (idx=0; idx < jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern
         */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                   jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* cycle across the nodes */
        nprocs_mapped = 0;
        while (NULL != (node = (orte_node_t*)opal_list_remove_first(&node_list))) {
            /* add the node to the map */
            if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }
            OBJ_RETAIN(node);  /* maintain accounting on object */
            jdata->map->num_nodes++;

            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (HWLOC_OBJ_MACHINE == lowest) {
                for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = setup_proc(jdata, node))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
                    /* no object below the node level - leave the
                     * locale unassigned */
                    proc->locale = NULL;
                }
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);
                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = setup_proc(jdata, node))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        proc->locale = obj;
                    }
                }

                if (mca_rmaps_ppr_component.pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go. prune() walks the level down to zero,
                     * so it must be reset for every node
                     */
                    level = mca_rmaps_ppr_component.start - 1;
                    prune(node, &level, &nprocs_mapped);
                }
            }

            /* set the total slots used to the number of procs placed
             * on this node
             */
            node->slots_inuse = node->num_procs;

            /* if no-oversubscribe was specified, check to see if
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it!
             */
            if ((!(jdata->map->oversubscribe) && node->slots < node->slots_inuse) ||
                (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                               true, node->num_procs, app->app);
                rc = ORTE_ERR_SILENT;
                goto error;
            }

            /* update the number of procs in the job and the app */
            jdata->num_procs += node->num_procs;
            /* NOTE(review): this records only the LAST node's count in the
             * app - looks like it should accumulate across nodes; confirm
             * against how app->num_procs is consumed downstream */
            app->num_procs = node->num_procs;

            /* if we haven't mapped all the procs, continue on to the
             * next node
             */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }

        /* release any allocated nodes we did not need for this app */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }

        if (nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, mca_rmaps_ppr_component.given_ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* compute vpids and add proc objects to the job */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }

    /* define the daemons that we will use for this job */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
        ORTE_ERROR_LOG(rc);
    }

 error:
    /* drain and destruct the node list on every exit path */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
/* descend from the given object to the first level that branches into
 * more than one child; returns NULL if the subtree never branches
 */
static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj)
{
    hwloc_obj_t cur;

    /* each level visited has arity 0 or 1, so the descent is a
     * simple walk down the single-child chain */
    for (cur = obj; NULL != cur; cur = cur->children[0]) {
        if (1 < cur->arity) {
            /* found the split point */
            return cur;
        }
        if (0 == cur->arity) {
            /* leaf reached without ever branching */
            return NULL;
        }
    }
    return NULL;
}
/* recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr at each level; *level is walked down to zero and
 * *nmapped is decremented for every proc removed
 */
static void prune(orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;   /* 0 => current level is not a cache */
    int nprocs;
    hwloc_cpuset_t avail, cpus, childcpus;
    int n, limit, nmax, nunder, idx, idxmax = -1;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:ppr: pruning level %d",
                        *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = mca_rmaps_ppr_component.ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        *level -= 1;
        prune(node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again - hwloc identifies caches
     * by type plus depth */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        /* get the available cpuset */
        avail = opal_hwloc_base_get_available_cpus(node->topology, obj);

        /* look at the intersection of this object's cpuset and that
         * of each proc - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
            if (hwloc_bitmap_intersects(avail, cpus)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this is a starting point
             */

            /* find the first level that has more than
             * one child beneath it - if all levels
             * have only one child, then return this
             * object
             */
            top = find_split(node->topology, obj);
            hwloc_obj_type_snprintf(dang, 64, top, 1);
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                childcpus = opal_hwloc_base_get_available_cpus(node->topology, top->children[k]);
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
                    if (hwloc_bitmap_intersects(childcpus, cpus)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it - procmax being non-NULL guarantees idxmax was set */
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }

    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    *level -= 1;
    prune(node, level, nmapped);
    return;

 error:
    opal_output(0, "INFINITE LOOP");
}
/*
 * Create a new proc object for the given job, attach it to the given
 * node, and add it to the node's proc array.
 *
 * Returns the new proc (caller owns one reference), or NULL if the
 * proc could not be added to the node.
 */
static orte_proc_t* setup_proc(orte_job_t *jdata, orte_node_t *node)
{
    orte_proc_t *proc;
    int rc;

    proc = OBJ_NEW(orte_proc_t);
    /* set the jobid */
    proc->name.jobid = jdata->jobid;
    /* we do not set the vpid here - this will be done
     * during a second phase, but we do set the epoch here
     * since they all start with the same value. */
    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
    /* flag the proc as ready for launch */
    proc->state = ORTE_PROC_STATE_INIT;
    proc->app_idx = 0;

    OBJ_RETAIN(node);  /* maintain accounting on object */
    proc->node = node;
    proc->nodename = node->name;
    if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
        ORTE_ERROR_LOG(rc);
        /* releasing the proc also releases its reference on the node */
        OBJ_RELEASE(proc);
        return NULL;
    }
    /* only count the proc once it is actually on the node, so the
     * count stays correct if the add fails */
    node->num_procs++;

    /* retain the proc struct so that we correctly track its release */
    OBJ_RETAIN(proc);
    return proc;
}

37
orte/mca/rmaps/ppr/rmaps_ppr.h Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
/*
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Processes-Per-Resource (PPR) mapper: maps a user-specified pattern
 * of procs-per-resource-type across all available nodes.
 */
#ifndef ORTE_RMAPS_PPR_H
#define ORTE_RMAPS_PPR_H

#include "orte_config.h"

#include "opal/mca/hwloc/hwloc.h"

#include "orte/mca/rmaps/rmaps.h"

BEGIN_C_DECLS

struct orte_rmaps_ppr_component_t {
    orte_rmaps_base_component_t super;
    /* the user-supplied ppr pattern string, as given (NULL if none) */
    char *given_ppr;
    /* true if a pattern was given - this mapper cannot run without one */
    bool selected;
    /* true if more than one level was specified, so procs must be
     * pruned back up the topology ladder */
    bool pruning_reqd;
    /* procs allowed at each topology level, indexed by
     * opal_hwloc_level_t - the component writes up to and including
     * index OPAL_HWLOC_HWTHREAD_LEVEL, so the array needs that
     * value + 1 entries */
    int ppr[OPAL_HWLOC_HWTHREAD_LEVEL+1];
    /* deepest level specified in the pattern - mapping starts there */
    opal_hwloc_level_t start;
};
typedef struct orte_rmaps_ppr_component_t orte_rmaps_ppr_component_t;

ORTE_MODULE_DECLSPEC extern orte_rmaps_ppr_component_t mca_rmaps_ppr_component;
extern orte_rmaps_base_module_t orte_rmaps_ppr_module;

END_C_DECLS

#endif

181
orte/mca/rmaps/ppr/rmaps_ppr_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,181 @@
/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/show_help.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_ppr.h"
/*
 * Local functions
 */
static int orte_rmaps_ppr_open(void);
static int orte_rmaps_ppr_close(void);
static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority);

/*
 * Component definition. The trailing fields of
 * orte_rmaps_ppr_component_t (given_ppr, selected, pruning_reqd,
 * ppr, start) are initialized in orte_rmaps_ppr_open().
 */
orte_rmaps_ppr_component_t mca_rmaps_ppr_component = {
    {
        {
            ORTE_RMAPS_BASE_VERSION_2_0_0,

            "ppr", /* MCA component name */
            ORTE_MAJOR_VERSION, /* MCA component major version */
            ORTE_MINOR_VERSION, /* MCA component minor version */
            ORTE_RELEASE_VERSION, /* MCA component release version */
            orte_rmaps_ppr_open, /* component open */
            orte_rmaps_ppr_close, /* component close */
            orte_rmaps_ppr_query /* component query */
        },
        {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        }
    }
};
/**
 * component open function - register and parse the ppr pattern.
 *
 * The pattern is a comma-separated list of "<count>:<resource>" pairs;
 * resource names may be abbreviated (the comparison length is taken
 * from the user's token, so e.g. "nu" matches "numa").  On a malformed
 * pattern the given_ppr string is released and NULLed (so close()
 * cannot double-free it) and ORTE_ERR_SILENT is returned.
 */
static int orte_rmaps_ppr_open(void)
{
    char **ppr, *ctmp, **ck;
    int i, n;
    size_t len;
    opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;

    /* initialize */
    mca_rmaps_ppr_component.selected = false;
    mca_rmaps_ppr_component.pruning_reqd = false;
    /* zero the whole per-level limit array */
    memset(mca_rmaps_ppr_component.ppr, 0, sizeof(mca_rmaps_ppr_component.ppr));
    n=0;

    mca_base_param_reg_string(&mca_rmaps_ppr_component.super.base_version,
                              "pattern",
                              "Comma-separate list of number of processes on a given resource type [default: none]",
                              false, false, NULL, &mca_rmaps_ppr_component.given_ppr);
    ctmp = mca_rmaps_ppr_component.given_ppr;
    if (NULL != ctmp) {
        ppr = opal_argv_split(ctmp, ',');

        /* check validity of mppr spec */
        for (i=0; NULL != ppr[i]; i++) {
            /* split on the colon */
            ck = opal_argv_split(ppr[i], ':');
            if (2 != opal_argv_count(ck)) {
                /* must provide a specification */
                orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, ctmp);
                opal_argv_free(ppr);
                opal_argv_free(ck);
                free(ctmp);
                /* ctmp aliases given_ppr - NULL it so close() does not
                 * free it a second time */
                mca_rmaps_ppr_component.given_ppr = NULL;
                return ORTE_ERR_SILENT;
            }
            /* compare only as many chars as the user gave, so that
             * abbreviations such as "nu" for "numa" are accepted */
            len = strlen(ck[1]);
            /* NOTE(review): the counts below come from strtol with no
             * error checking - a non-numeric count silently becomes 0 */
            if (0 == strncasecmp(ck[1], "hwthread", len) ||
                0 == strncasecmp(ck[1], "thread", len)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
                start = OPAL_HWLOC_HWTHREAD_LEVEL;
                n++;
            } else if (0 == strncasecmp(ck[1], "core", len)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_CORE_LEVEL) {
                    start = OPAL_HWLOC_CORE_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "socket", len) ||
                       0 == strncasecmp(ck[1], "skt", len)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                    start = OPAL_HWLOC_SOCKET_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                    start = OPAL_HWLOC_L1CACHE_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                    start = OPAL_HWLOC_L2CACHE_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                    start = OPAL_HWLOC_L3CACHE_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "numa", len)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_NUMA_LEVEL) {
                    start = OPAL_HWLOC_NUMA_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "node", len)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
                n++;
            } else {
                /* unknown spec */
                orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], ctmp);
                opal_argv_free(ppr);
                opal_argv_free(ck);
                free(ctmp);
                /* ctmp aliases given_ppr - NULL it so close() does not
                 * free it a second time */
                mca_rmaps_ppr_component.given_ppr = NULL;
                return ORTE_ERR_SILENT;
            }
            opal_argv_free(ck);
        }
        opal_argv_free(ppr);
        mca_rmaps_ppr_component.selected = true;
        mca_rmaps_ppr_component.start = start;
        /* if more than one level was specified, then pruning will be reqd */
        if (1 < n) {
            mca_rmaps_ppr_component.pruning_reqd = true;
        }
    }

    return ORTE_SUCCESS;
}
/* component query - this mapper can only run when the user supplied
 * a ppr pattern, in which case it takes priority
 */
static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority)
{
    if (!mca_rmaps_ppr_component.selected) {
        /* cannot run without ppr spec */
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 1000;
    *module = (mca_base_module_t *)&orte_rmaps_ppr_module;
    return ORTE_SUCCESS;
}
/**
 * Close all subsystems.
 *
 * free(NULL) is a no-op, so no guard is needed; NULL the pointer
 * afterwards so a repeated close cannot double-free it.
 */
static int orte_rmaps_ppr_close(void)
{
    free(mca_rmaps_ppr_component.given_ppr);
    mca_rmaps_ppr_component.given_ppr = NULL;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -23,6 +23,7 @@
#include <sys/types.h>
#include "opal/util/argv.h"
#include "opal/mca/hwloc/hwloc.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"
@ -408,7 +409,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
tmp = tmp2;
#if OPAL_HAVE_HWLOC
if (NULL != src->topology) {
if (orte_display_topo_with_map && NULL != src->topology) {
char *pfx3;
asprintf(&tmp2, "%s\n%s\tDetected Resources:", tmp, pfx2);
free(tmp);
@ -461,7 +462,8 @@ PRINT_PROCS:
int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type)
{
char *tmp, *tmp2, *pfx2;
char *locale=NULL;
/* set default result */
*output = NULL;
@ -521,9 +523,15 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
free(tmp);
tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
#if OPAL_HAVE_HWLOC
if (NULL != src->locale) {
hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
}
#endif
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tSlot list: %s", tmp, pfx2,
orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
(NULL == src->slot_list) ? "NULL" : src->slot_list);
(NULL == locale) ? "UNKNOWN" : locale, (NULL == src->slot_list) ? "NULL" : src->slot_list);
free(tmp);
/* set the return */

Просмотреть файл

@ -75,6 +75,7 @@ bool orte_homogeneous_nodes = false;
bool orte_hetero_apps = false;
bool orte_never_launched = false;
bool orte_devel_level_output = false;
bool orte_display_topo_with_map = false;
char **orte_launch_environ;
@ -920,6 +921,9 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
proc->state = ORTE_PROC_STATE_UNDEF;
proc->app_idx = 0;
#if OPAL_HAVE_HWLOC
proc->locale = NULL;
#endif
proc->slot_list = NULL;
proc->node = NULL;
proc->prior_node = NULL;

Просмотреть файл

@ -481,6 +481,10 @@ struct orte_proc_t {
orte_exit_code_t exit_code;
/* the app_context that generated this proc */
orte_app_idx_t app_idx;
#if OPAL_HAVE_HWLOC
/* hwloc object to which this process was mapped */
hwloc_obj_t locale;
#endif
/* a cpu list, if specified by the user */
char *slot_list;
/* pointer to the node where this proc is executing */
@ -599,6 +603,7 @@ ORTE_DECLSPEC extern bool orte_homogeneous_nodes;
ORTE_DECLSPEC extern bool orte_hetero_apps;
ORTE_DECLSPEC extern bool orte_never_launched;
ORTE_DECLSPEC extern bool orte_devel_level_output;
ORTE_DECLSPEC extern bool orte_display_topo_with_map;
ORTE_DECLSPEC extern char **orte_launch_environ;

Просмотреть файл

@ -291,6 +291,9 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "rmaps", "base", "display_devel_map", '\0', "display-devel-map", "display-devel-map", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display a detailed process map (mostly intended for developers) just before launch"},
{ "rmaps", "base", "display_topo_with_map", '\0', "display-topo", "display-topo", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display the topology as part of the process map (mostly intended for developers) just before launch"},
{ NULL, NULL, NULL, 'H', "host", "host", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of hosts to invoke processes on" },