1
1
openmpi/orte/mca/rmaps/ppr/rmaps_ppr.c
2011-12-02 13:18:54 +00:00

594 строки
21 KiB
C

/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_ppr.h"
/* Entry point of this mapper; defined below. */
static int ppr_mapper(orte_job_t *jdata);
/* Module structure exported by the ppr component; it is initialized
 * with ppr_mapper as its mapping function.
 */
orte_rmaps_base_module_t orte_rmaps_ppr_module = {
ppr_mapper
};
/* Create a proc object for the given job/app and attach it to a node. */
static orte_proc_t* setup_proc(orte_job_t *jdata,
orte_node_t *node,
orte_app_idx_t idx);
#if OPAL_HAVE_HWLOC
/* Remove procs from a node until the per-level limits stored in ppr[]
 * are honored, climbing the topology levels from *level upward.
 */
static void prune(orte_jobid_t jobid,
orte_app_idx_t app_idx,
orte_node_t *node,
opal_hwloc_level_t *level,
orte_vpid_t *nmapped);
#endif
/* Requested number of procs per resource, indexed by hwloc level
 * (OPAL_HWLOC_NODE_LEVEL ... OPAL_HWLOC_HWTHREAD_LEVEL). A value of
 * zero is treated as "no limit given at this level". This is file-scope
 * state that ppr_mapper refills from the job's ppr string on each call.
 */
static int ppr[OPAL_HWLOC_HWTHREAD_LEVEL+1];
/*
 * Map a job according to a "procs per resource" (ppr) pattern.
 *
 * The pattern is read from jdata->map->ppr as a comma-separated list of
 * "<count>:<resource>" entries (e.g. "2:socket,4:node"). Procs are first
 * placed on every object of the lowest (deepest) resource level named in
 * the pattern; if more than one level was given, prune() then removes
 * procs until the limits of the higher levels are satisfied.
 *
 * @param jdata  job to be mapped
 *
 * @return ORTE_SUCCESS on success;
 *         ORTE_ERR_TAKE_NEXT_OPTION if the job is not in its initial
 *         state, another mapper was requested, or no ppr was given;
 *         ORTE_ERR_SILENT for errors already reported to the user
 *         (bad pattern, missing topology, oversubscription, too many
 *         procs requested);
 *         ORTE_ERR_OUT_OF_RESOURCE if a proc could not be created;
 *         otherwise the error code of the failing helper.
 */
static int ppr_mapper(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, j, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t level, start=OPAL_HWLOC_NODE_LEVEL;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    unsigned cache_level=0;
    unsigned int nobjs, i;
#endif
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool pruning_reqd = false;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
     */
    if (ORTE_JOB_STATE_INIT != jdata->state) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: job %s not in initial state - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        !(ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize - clear the WHOLE table (every level, including the
     * hwthread slot) so nothing from a previously mapped job survives
     */
    memset(ppr, 0, sizeof(ppr));

    /* parse option */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    /* a NULL result (nothing to split) falls through to the
     * "nothing given" check below instead of being dereferenced
     */
    for (j=0; NULL != ppr_req && NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        /* comparing only strlen(ck[1]) characters lets the user
         * abbreviate the resource name
         */
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            /* NOTE(review): unlike the other levels, "node" resets
             * start unconditionally, so the result depends on where
             * it appears in the list - confirm this is intended
             */
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
#endif
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    opal_argv_free(ppr_req);

    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

    /* convenience */
    level = start;
#if OPAL_HAVE_HWLOC
    lowest = opal_hwloc_levels[start];
#endif

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern
         */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes - the list lives for exactly one
         * app: every "goto error" below happens while it is
         * constructed, and the success path tears it down at the
         * bottom of this loop body
         */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* cycle across the nodes */
        nprocs_mapped = 0;
        while (NULL != (node = (orte_node_t*)opal_list_remove_first(&node_list))) {
#if OPAL_HAVE_HWLOC
            /* bozo check */
            if (NULL == node->topology) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
#endif
            /* add the node to the map */
            if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }
            OBJ_RETAIN(node);  /* maintain accounting on object */
            jdata->map->num_nodes++;

            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (OPAL_HWLOC_NODE_LEVEL == start) {
#if OPAL_HAVE_HWLOC
                obj = hwloc_get_root_obj(node->topology);
#endif
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
#if OPAL_HAVE_HWLOC
                    proc->locale = obj;
#endif
                }
#if OPAL_HAVE_HWLOC
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        proc->locale = obj;
                    }
                }

                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go.
                     *
                     * prune() walks "level" all the way up through its
                     * pointer argument, so the starting point must be
                     * recomputed for every node: begin one level above
                     * the lowest one. start is not the node level in
                     * this branch, so start-1 is a valid level.
                     */
                    level = (opal_hwloc_level_t)(start - 1);
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
#endif
            }

            /* set the total slots used to the number of procs placed
             * on this node
             */
            node->slots_inuse = node->num_procs;

            /* if no-oversubscribe was specified, check to see if
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it!
             */
            if ((node->slots < node->slots_inuse) ||
                (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                node->oversubscribed = true;
            }

            /* if we haven't mapped all the procs, continue on to the
             * next node
             */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }

        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, jdata->map->ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* update the number of procs in the job and the app using the
         * number actually mapped for THIS app across all of its nodes
         * (a node's num_procs only covers one node and may include
         * procs that belong to other apps)
         */
        if (0 == app->num_procs) {
            /* no count was requested - the pattern decided it */
            app->num_procs = nprocs_mapped;
        }
        jdata->num_procs += nprocs_mapped;

        /* compute vpids and add proc objects to the job */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* done with this app - release any nodes we did not use and
         * tear the list down before the next app constructs it again
         */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }

    return ORTE_SUCCESS;

 error:
    /* only reached while node_list is constructed */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
#if OPAL_HAVE_HWLOC
/*
 * Starting at obj, follow the chain of single children downward and
 * return the first object that has more than one child.
 *
 * @param topo  topology the object belongs to (not examined here)
 * @param obj   object at which to start the descent
 *
 * @return the first object on the way down whose arity exceeds one
 *         (possibly obj itself), or NULL if the descent reaches an
 *         object without children before finding one
 */
static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj)
{
    hwloc_obj_t cur = obj;

    (void)topo;

    for (;;) {
        if (1 < cur->arity) {
            /* this is where the tree branches */
            return cur;
        }
        if (0 == cur->arity) {
            /* ran out of children without ever seeing a branch */
            return NULL;
        }
        /* exactly one child - keep descending */
        cur = cur->children[0];
    }
}
/*
 * Recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr.
 *
 * For every available object at level *level on the node, count the
 * procs of the given job/app whose locale overlaps the object's
 * available cpus. While the count exceeds ppr[*level], remove one proc
 * at a time, always taking the first proc found under the child (of the
 * first branching point below the object) that currently holds the most
 * procs. Then move up one level and repeat until level 0 has been
 * handled.
 *
 * @param jobid    job whose procs are considered
 * @param app_idx  app whose procs are considered
 * @param node     node being pruned; its procs array and num_procs
 *                 are updated for every removed proc
 * @param level    in/out: level at which to start; decremented as the
 *                 function climbs
 * @param nmapped  in/out: running count of mapped procs, decremented
 *                 for every removed proc
 *
 * If no proc can be found to remove while a limit is still exceeded,
 * a message is printed and the function returns without climbing
 * further.
 */
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;
    int nprocs;
    hwloc_cpuset_t avail, cpus, childcpus;
    int n, limit, nmax, nunder, idx, idxmax = 0;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:ppr: pruning level %d",
                        *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        --(*level);
        prune(jobid, app_idx, node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        if (NULL == obj) {
            /* defensive: nothing to examine for this index */
            continue;
        }
        /* get the available cpuset */
        avail = opal_hwloc_base_get_available_cpus(node->topology, obj);

        /* look at the intersection of this object's cpuset and that
         * of each proc in the job/app - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            if (proc->name.jobid != jobid ||
                proc->app_idx != app_idx) {
                continue;
            }
            cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
            if (hwloc_bitmap_intersects(avail, cpus)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this is a starting point
             */

            /* find the first level that has more than
             * one child beneath it - if all levels
             * have only one child, then use this
             * object itself (find_split reports that
             * case by returning NULL)
             */
            top = find_split(node->topology, obj);
            if (NULL == top) {
                top = obj;
            }
            hwloc_obj_type_snprintf(dang, sizeof(dang), top, 1);
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                childcpus = opal_hwloc_base_get_available_cpus(node->topology, top->children[k]);
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    if (proc->name.jobid != jobid ||
                        proc->app_idx != app_idx) {
                        continue;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
                    if (hwloc_bitmap_intersects(childcpus, cpus)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                /* strict comparison: on a tie the earlier child wins */
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it */
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }

    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    --(*level);
    prune(jobid, app_idx, node, level, nmapped);
    return;

 error:
    /* the limit is still exceeded but nothing could be removed; bail
     * out rather than spin forever in the while loop above
     */
    opal_output(0, "INFINITE LOOP");
}
#endif
/*
 * Create a new proc object for the given job and app, and attach it
 * to the given node.
 *
 * The proc gets the job's jobid, the minimum epoch, the INIT state and
 * the app index; its vpid is left unset here. The node is retained on
 * behalf of the proc, the proc is appended to node->procs, and
 * node->num_procs is incremented once the proc is actually stored.
 *
 * @param jdata  job the proc belongs to
 * @param node   node on which the proc is placed
 * @param idx    index of the app the proc belongs to
 *
 * @return the new proc (retained once more on top of the reference
 *         taken at creation), or NULL if the proc could not be
 *         allocated or could not be added to the node's proc array
 */
static orte_proc_t* setup_proc(orte_job_t *jdata,
                               orte_node_t *node,
                               orte_app_idx_t idx)
{
    orte_proc_t *proc;
    int rc;

    proc = OBJ_NEW(orte_proc_t);
    if (NULL == proc) {
        /* allocation failed - nothing has been modified yet */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return NULL;
    }
    /* set the jobid */
    proc->name.jobid = jdata->jobid;
    /* we do not set the vpid here - this will be done
     * during a second phase, but we do set the epoch here
     * since they all start with the same value.
     */
    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
    /* flag the proc as ready for launch */
    proc->state = ORTE_PROC_STATE_INIT;
    proc->app_idx = idx;

    OBJ_RETAIN(node);  /* maintain accounting on object */
    proc->node = node;
    /* nodename points at the node's own name string */
    proc->nodename = node->name;
    if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return NULL;
    }
    /* count the proc only once it is really stored on the node, so a
     * failed add does not leave num_procs one too high
     */
    node->num_procs++;
    /* retain the proc struct so that we correctly track its release */
    OBJ_RETAIN(proc);

    return proc;
}