1
1
Ralph Castain 0ac97761cc Now that we are binding by default, the issue of #slots and what to do when oversubscribed has become a bit more complicated. This isn't a problem in managed environments as we are always provided an accurate assignment for the #slots, or when -host is used to define the allocation since we automatically assume one slot for every time a node is named.
The problem arises when a hostfile is used, and the user provides host names without specifying the slots= parameter. In these cases, we assign slots=1, but automatically allow oversubscription since that number isn't confirmed. We then provide a separate parameter by which the user can direct that we assign the number of slots based on the sensed hardware - e.g., by telling us to set the #slots equal to the #cores on each node. However, this has been set to "off" by default.

In order to make this a little less complex for the user, set the default such that we automatically set #slots equal to #cores (or #hwt's if use_hwthreads_as_cpus has been set) only for those cases where the user provides names in a hostfile but does not provide slot information.

Also clean up a couple of issues in the mapping/binding system:

* ensure we only override the binding directive if we are oversubscribed *and* overload is not allowed

* ensure that the MPI procs don't attempt to bind themselves if they are launched by an orted as any binding directive (no matter what it was) would have been serviced by the orted on launch

* minor cleanup to the warning message when oversubscribed and binding was requested

cmr=v1.7.5:reviewer=rhc:subject=update mapping/binding system

This commit was SVN r30909.
2014-03-03 16:46:37 +00:00

275 строки
12 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/error_strings.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_rr.h"
/*
 * Create a round-robin mapping for the job.
 *
 * Walks every app_context in jdata->apps and, for each one, obtains the
 * list of target nodes, picks the mapping routine that matches the job's
 * mapping policy (bynode, byslot, or - when built with hwloc - by a
 * hardware object with a byslot fallback), computes the vpids, and adds
 * the app's process count to jdata->num_procs.
 *
 * @param jdata  job to be mapped; jdata->map->last_mapper, jdata->bookmark,
 *               jdata->num_procs and (when it was 0) app->num_procs are
 *               updated by this function.
 *
 * @return ORTE_SUCCESS on success;
 *         ORTE_ERR_TAKE_NEXT_OPTION if the job is being restarted, another
 *         mapper was explicitly requested, or the mapping policy is beyond
 *         ORTE_MAPPING_RR;
 *         ORTE_ERR_SILENT after a help message has been shown (multiple
 *         app_contexts with unspecified num_procs, or an unrecognized
 *         mapping policy);
 *         otherwise the error code returned by the failing helper.
 *
 * Every failure inside the per-app loop goes through the "error" label so
 * that the per-app node list is always emptied and destructed.
 */
static int orte_rmaps_rr_map(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    int rc;
    mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version;
    bool initial_map = true;

    /* this mapper only handles the initial launch of a job - if the
     * job is being restarted, defer to the next mapper
     */
    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s is being restarted - rr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (ORTE_MAPPING_RR < ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rr: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping - free(NULL) is a no-op, so the
     * previous value can be released unconditionally
     */
    free(jdata->map->last_mapper);
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* start at the beginning... */
    jdata->num_procs = 0;

    /* cycle through the app_contexts, mapping them sequentially */
    for (i = 0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }

        /* setup the nodelist here in case we jump to error */
        OBJ_CONSTRUCT(&node_list, opal_list_t);

        /* if the number of processes wasn't specified, then we know there can be only
         * one app_context allowed in the launch, and that we are to launch it across
         * all available slots. We'll double-check the single app_context rule first
         */
        if (0 == app->num_procs && 1 < jdata->num_apps) {
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np",
                           true, jdata->num_apps, NULL);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                   jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        if (0 == app->num_procs) {
            /* set the num_procs to equal the number of slots on these
             * mapped nodes, taking into account the number of cpus/rank
             *
             * NOTE(review): this divides by orte_rmaps_base.cpus_per_rank
             * without checking it here - presumably it is always >= 1;
             * confirm where the value is registered.
             */
            app->num_procs = num_slots / orte_rmaps_base.cpus_per_rank;
        }

        /* Make assignments */
        if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_bynode(jdata, app, &node_list, num_slots,
                                      app->num_procs);
        } else if (ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                      app->num_procs);
#if OPAL_HAVE_HWLOC
        /* For each object-based policy below: if the mapper couldn't map
         * by the requested object because it isn't available
         * (ORTE_ERR_NOT_FOUND), fall back to mapping byslot.
         */
        } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_PU, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CORE, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 1);
            if (ORTE_ERR_NOT_FOUND == rc) {
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 2);
            if (ORTE_ERR_NOT_FOUND == rc) {
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 3);
            if (ORTE_ERR_NOT_FOUND == rc) {
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_SOCKET, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_NODE, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
#endif
        } else {
            /* unrecognized mapping directive */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy",
                           true, "mapping",
                           orte_rmaps_base_print_mapping(jdata->map->mapping));
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* compute vpids and add proc objects to the job - do this after
         * each app_context so that the ranks within each context are
         * contiguous
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            /* go through the common cleanup rather than returning directly,
             * otherwise the items still on node_list would be leaked
             */
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this value AFTER we compute vpids so that computation
         * is done correctly
         */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list - it can differ from one app_context
         * to another, so we have to get it every time
         */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }

    return ORTE_SUCCESS;

 error:
    /* release whatever is left on the current app's node list */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
/*
 * Module instance for the round-robin mapper. The initializer is
 * positional: the first member of orte_rmaps_base_module_t is set to
 * orte_rmaps_rr_map, the mapping entry point defined above.
 */
orte_rmaps_base_module_t orte_rmaps_round_robin_module = {
orte_rmaps_rr_map
};