cb221b6f6f
Since we now support the dynamic addition of hosts to the orte_node_pool, there is no longer any reason to require advance specification of all possible nodes. Instead, use a precedence method to initially allocate only those hosts that were specified on the cmd line:

* rankfile, if given, as that will specify the nodes
* -host, aggregated across all app_contexts
* -hostfile, aggregated across all app_contexts
* default hostfile
* assign local node

Fix slots_inuse accounting so that the nodes are correctly reset upon error termination - e.g., when oversubscribed without permission.

Ensure we accurately track the user's specified desires for oversubscribe and no-use-local when dynamically spawning jobs.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
(cherry picked from commit c9b3e68ce596a68a2ed2fbf73f211b3334b0a6a8)
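The precedence described above amounts to a first-match-wins fallback chain. A minimal sketch in C of that ordering, assuming hypothetical helpers (nodes_from_rankfile() and friends are illustrative, not actual ORTE API) that each return NULL when their source was not given on the cmd line:

#include <stddef.h>

typedef struct opal_list_t opal_list_t;

/* hypothetical helpers - each returns NULL if its source was not specified */
extern opal_list_t *nodes_from_rankfile(void);
extern opal_list_t *nodes_from_dash_host(void);        /* aggregated across app_contexts */
extern opal_list_t *nodes_from_hostfiles(void);        /* aggregated across app_contexts */
extern opal_list_t *nodes_from_default_hostfile(void);
extern opal_list_t *local_node_only(void);

static opal_list_t *initial_allocation(void)
{
    opal_list_t *nl;

    if (NULL != (nl = nodes_from_rankfile()))         return nl;  /* rankfile wins */
    if (NULL != (nl = nodes_from_dash_host()))        return nl;
    if (NULL != (nl = nodes_from_hostfiles()))        return nl;
    if (NULL != (nl = nodes_from_default_hostfile())) return nl;
    return local_node_only();                                     /* last resort */
}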
740 lines
30 KiB
C
/*
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2011      Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2014-2018 Intel, Inc.  All rights reserved.
 * Copyright (c) 2015-2017 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <string.h>

#include "opal/mca/hwloc/base/base.h"
#include "opal/util/argv.h"

#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_ppr.h"

static int ppr_mapper(orte_job_t *jdata);
static int assign_locations(orte_job_t *jdata);
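
/* entry points exposed to the rmaps framework */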
orte_rmaps_base_module_t orte_rmaps_ppr_module = {
    .map_job = ppr_mapper,
    .assign_locations = assign_locations
};

/* RHC: will eventually remove this
 * definition as it is no longer reqd
 * in the rest of OMPI system.
 *
 * Define a hierarchical level value that
 * helps resolve the hwloc behavior of
 * treating caches as a single type of
 * entity - must always be available
 */
typedef enum {
    OPAL_HWLOC_NODE_LEVEL=0,
    OPAL_HWLOC_NUMA_LEVEL,
    OPAL_HWLOC_SOCKET_LEVEL,
    OPAL_HWLOC_L3CACHE_LEVEL,
    OPAL_HWLOC_L2CACHE_LEVEL,
    OPAL_HWLOC_L1CACHE_LEVEL,
    OPAL_HWLOC_CORE_LEVEL,
    OPAL_HWLOC_HWTHREAD_LEVEL
} opal_hwloc_level_t;

static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped);

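/* ppr[level] holds the #procs-per-resource parsed from the ppr spec
 * for that level - an entry of zero means no limit was given there */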
static int ppr[OPAL_HWLOC_HWTHREAD_LEVEL+1];

static int ppr_mapper(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, j, n;
    mca_base_component_t *c = &mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t start = OPAL_HWLOC_NODE_LEVEL;
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    unsigned cache_level = 0;
    unsigned int nobjs, i;
    bool pruning_reqd = false;
    opal_hwloc_level_t level;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool initial_map = true;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s being restarted - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper PPR %s policy %s",
                            ORTE_JOBID_PRINT(jdata->jobid),
                            (NULL == jdata->map->ppr) ? "NULL" : jdata->map->ppr,
                            (ORTE_MAPPING_PPR == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) ? "PPRSET" : "PPR NOTSET");
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize */
    memset(ppr, 0, sizeof(ppr));

    /* parse option */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
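    /* each comma-separated term has the form <#procs>:<resource>, e.g.
     * "2:socket" or "4:node" - typically passed on the mpirun cmd line
     * as --map-by ppr:2:socket */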
    for (j=0; NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    opal_argv_free(ppr_req);
    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }
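    /* (mapping is first done at the deepest specified level; prune() then
     * climbs the topology, removing procs that exceed the limit given at
     * each coarser level) */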

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

    /* convenience */
    level = start;
    lowest = opal_hwloc_levels[start];

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern
         */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                   jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* cycle across the nodes */
        nprocs_mapped = 0;
        for (item = opal_list_get_first(&node_list);
             item != opal_list_get_end(&node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            /* bozo check */
            if (NULL == node->topology || NULL == node->topology->topo) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* add the node to the map, if needed */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);
                opal_pointer_array_add(jdata->map->nodes, node);
                jdata->map->num_nodes++;
            }
            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (OPAL_HWLOC_NODE_LEVEL == start) {
                obj = hwloc_get_root_obj(node->topology->topo);
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                }
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology->topo,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }

                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go
                     */
                    level--;
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
            }

            if (!(ORTE_MAPPING_DEBUGGER & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
                /* set the total slots used */
                if ((int)node->num_procs <= node->slots) {
                    node->slots_inuse = (int)node->num_procs;
                } else {
                    node->slots_inuse = node->slots;
                }
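                /* (capping slots_inuse at node->slots keeps the accounting
                 * consistent so the node can be correctly reset if the job
                 * later terminates in error) */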

                /* if no-oversubscribe was specified, check to see if
                 * we have violated the total slot specification - regardless,
                 * if slots_max was given, we are not allowed to violate it!
                 */
                if ((node->slots < (int)node->num_procs) ||
                    (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                    if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, node->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        rc = ORTE_ERR_SILENT;
                        goto error;
                    }
                    /* flag the node as oversubscribed so that sched-yield gets
                     * properly set
                     */
                    ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
                    ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
                    /* check for permission */
                    if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
                        /* if we weren't given a directive either way, then we will error out
                         * as the #slots were specifically given, either by the host RM or
                         * via hostfile/dash-host */
                        if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
                            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                           true, app->num_procs, app->app);
                            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                            rc = ORTE_ERR_SILENT;
                            goto error;
                        } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                            /* if we were explicitly told not to oversubscribe, then don't */
                            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                           true, app->num_procs, app->app);
                            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                            rc = ORTE_ERR_SILENT;
                            goto error;
                        }
                    }
                }
            }

            /* if we have mapped all the procs, we are done - otherwise,
             * continue on to the next node
             */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }
        if (0 == app->num_procs) {
            app->num_procs = nprocs_mapped;
        }
        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, jdata->map->ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this AFTER we compute vpids so that computation is done
         * correctly
         */
        jdata->num_procs += app->num_procs;

        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }
    return ORTE_SUCCESS;

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}

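/* starting from the given object, descend the topology and return the
 * first object that has more than one child - i.e., the point where the
 * tree actually splits; returns NULL if the subtree never splits */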
static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj)
{
    unsigned k;
    hwloc_obj_t nxt;

    if (1 < obj->arity) {
        return obj;
    }
    for (k=0; k < obj->arity; k++) {
        nxt = find_split(topo, obj->children[k]);
        if (NULL != nxt) {
            return nxt;
        }
    }
    return NULL;
}

/* recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr
 */
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;
    int nprocs;
    hwloc_cpuset_t avail;
    int n, limit, nmax, nunder, idx, idxmax = 0;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: pruning level %d",
                        *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        --(*level);
        prune(jobid, app_idx, node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology->topo,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        /* get the available cpuset */
        avail = obj->cpuset;

        /* look at the intersection of this object's cpuset and that
         * of each proc in the job/app - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            if (proc->name.jobid != jobid ||
                proc->app_idx != app_idx) {
                continue;
            }
            locale = NULL;
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return;
            }
            if (hwloc_bitmap_intersects(avail, locale->cpuset)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this a starting point
             */

            /* find the first level that has more than
             * one child beneath it - if all levels
             * have only one child, then return this
             * object
             */
            top = find_split(node->topology->topo, obj);
            hwloc_obj_type_snprintf(dang, 64, top, 1);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    if (proc->name.jobid != jobid ||
                        proc->app_idx != app_idx) {
                        continue;
                    }
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        return;
                    }
                    if (hwloc_bitmap_intersects(top->children[k]->cpuset, locale->cpuset)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            node->slots_inuse--;
            if (node->slots_inuse < 0) {
                node->slots_inuse = 0;
            }
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }
    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    --(*level);
    prune(jobid, app_idx, node, level, nmapped);
    return;

 error:
    opal_output(0, "INFINITE LOOP");
}

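/* assign a hwloc locale to each proc that the mapping pass above placed
 * on the nodes, filling each object at the target level in turn */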
static int assign_locations(orte_job_t *jdata)
{
    int i, j, m, n;
    mca_base_component_t *c = &mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    hwloc_obj_type_t level;
    hwloc_obj_t obj;
    unsigned int cache_level = 0;
    int ppr, cnt, nobjs, nprocs_mapped;
    char **ppr_req, **ck;

    if (NULL == jdata->map->last_mapper ||
        0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) {
        /* the mapping was done by somebody else - not for us */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr assign: %s",
                            ORTE_JOBID_PRINT(jdata->jobid),
                            (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper);
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: assigning locations for job %s with ppr %s policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr,
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

    /* pickup the object level */
    if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_MACHINE;
    } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_PU;
    } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_CORE;
    } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_SOCKET;
    } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L1CACHE;
        cache_level = 1;
    } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L2CACHE;
        cache_level = 2;
    } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L3CACHE;
        cache_level = 3;
    } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_NUMANODE;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    /* get the ppr value */
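    /* (only the first <#procs>:<resource> term is consulted here - this
     * path appears to assume a single-level ppr pattern) */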
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    ck = opal_argv_split(ppr_req[0], ':');
    ppr = strtol(ck[0], NULL, 10);
    opal_argv_free(ck);
    opal_argv_free(ppr_req);

    /* start assigning procs to objects, filling each object as we go until
     * all procs are assigned. */
    for (n=0; n < jdata->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
            continue;
        }
        nprocs_mapped = 0;
        for (m=0; m < jdata->map->nodes->size; m++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
                continue;
            }
            if (NULL == node->topology || NULL == node->topology->topo) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                return ORTE_ERR_SILENT;
            }
            if (HWLOC_OBJ_MACHINE == level) {
                obj = hwloc_get_root_obj(node->topology->topo);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    if (proc->name.jobid != jdata->jobid) {
                        continue;
                    }
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                }
            } else {
                /* get the number of resources on this node at this level */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
                                                           level, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    cnt = 0;
                    obj = opal_hwloc_base_get_obj_by_type(node->topology->topo,
                                                          level, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < node->procs->size && cnt < ppr && nprocs_mapped < app->num_procs; j++) {
                        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                            continue;
                        }
                        if (proc->name.jobid != jdata->jobid) {
                            continue;
                        }
                        /* if we already assigned it, then skip */
                        if (orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, NULL, OPAL_PTR)) {
                            continue;
                        }
                        nprocs_mapped++;
                        cnt++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }
            }
        }
    }
    return ORTE_SUCCESS;
}