
Now that we bind by default, the question of #slots and what to do when oversubscribed has become a bit more complicated. This isn't a problem in managed environments, where we are always given an accurate assignment of the #slots, or when -host is used to define the allocation, since we automatically assume one slot each time a node is named.

The problem arises when a hostfile is used and the user provides host names without specifying the slots= parameter. In that case we assign slots=1, but automatically allow oversubscription since the number isn't confirmed. We also provide a separate parameter by which the user can direct us to set the number of slots based on the sensed hardware - e.g., by telling us to set the #slots equal to the #cores on each node. However, that parameter has been "off" by default.

To make this a little less complex for the user, change the default so that we automatically set #slots equal to #cores (or #hwthreads if use_hwthreads_as_cpus has been set), but only in those cases where the user provides names in a hostfile without any slot information.
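
A minimal sketch of the new default behavior (the helper below is hypothetical, not the actual ORTE code), using hwloc to count the objects that become the slot count for a hostfile entry that carries no slots= value:

    #include <hwloc.h>

    /* Hypothetical helper: derive a node's default slot count from the sensed
     * hardware when the hostfile did not specify slots=.  Counts hwthreads
     * (PUs) instead of cores when use_hwthreads_as_cpus is in effect. */
    static int default_slots_for_node(hwloc_topology_t topo, int use_hwthreads_as_cpus)
    {
        hwloc_obj_type_t type = use_hwthreads_as_cpus ? HWLOC_OBJ_PU : HWLOC_OBJ_CORE;
        int n = hwloc_get_nbobjs_by_type(topo, type);

        /* fall back to the old assumption of one slot if the topology is unusable */
        return (n > 0) ? n : 1;
    }

As the diff below shows, the controlling MCA param can also be set to "none" to skip this behavior, or to "cores", "hwthreads", "numas", "sockets", or a number.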

Also clean up a couple of issues in the mapping/binding system:

* ensure we only override the binding directive if we are oversubscribed *and* overload is not allowed (see the sketch after this list)

* ensure that the MPI procs don't attempt to bind themselves if they were launched by an orted, since any binding directive (no matter what it was) will already have been serviced by the orted at launch

* minor cleanup to the warning message when oversubscribed and binding was requested
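
A minimal sketch of the corrected control flow from the first bullet (illustrative only; the real code uses the OPAL_BIND_OVERLOAD_ALLOWED check and OPAL_SET_BINDING_POLICY macro shown in the rr_mappers diff below):

    #include <stdio.h>

    /* Illustrative only: the binding directive is dropped to "bind-to none"
     * solely when the node is oversubscribed *and* overload was not allowed. */
    static void maybe_relax_binding(int oversubscribed, int overload_allowed,
                                    int *bind_to_none, int num_slots, int num_procs)
    {
        if (oversubscribed && !overload_allowed) {
            /* warn that the requested processes exceed the allocated slots */
            fprintf(stderr,
                    "oversubscribed: %d slots, %d processes - binding disabled\n",
                    num_slots, num_procs);
            *bind_to_none = 1;   /* stands in for OPAL_BIND_TO_NONE */
        }
        /* otherwise the existing binding policy is left untouched */
    }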

cmr=v1.7.5:reviewer=rhc:subject=update mapping/binding system

This commit was SVN r30909.
This commit is contained in:
Ralph Castain 2014-03-03 16:46:37 +00:00
parent baf21ab446
commit 0ac97761cc
8 changed files with 40 additions and 37 deletions

View file

@@ -14,7 +14,7 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -778,6 +778,14 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
free(param2);
}
/* Set an info MCA param that tells the launched processes that
* any binding policy was applied by us (e.g., so that
* MPI_INIT doesn't try to bind itself)
*/
(void) mca_base_var_env_name ("orte_bound_at_launch", &param);
opal_setenv(param, "1", true, environ_copy);
free(param);
/* push data into environment - don't push any single proc
* info, though. We are setting the environment up on a
* per-context basis, and will add the individual proc

View file

@@ -478,11 +478,9 @@ static int do_child(orte_app_context_t* context,
/* bind this proc to all available processors */
hwloc_set_cpubind(opal_hwloc_topology, sum->available, 0);
}
/* Set an info MCA param that tells
the launched processes that it was bound by us (e.g., so that
MPI_INIT doesn't try to bind itself) */
(void) mca_base_var_env_name ("orte_bound_at_launch", &param);
opal_setenv(param, "1", true, &environ_copy);
/* provide a nice string representation of what we bound to */
(void) mca_base_var_env_name ("orte_base_applied_binding", &param);
opal_setenv(param, child->cpu_bitmap, true, &environ_copy);
free(param);
goto PROCEED;
}
@@ -602,17 +600,6 @@ static int do_child(orte_app_context_t* context,
goto PROCEED;
}
}
/* Set an info MCA param that tells
the launched processes that it was bound by us (e.g., so that
MPI_INIT doesn't try to bind itself) */
(void) mca_base_var_env_name ("orte_bound_at_launch", &param);
opal_setenv(param, "1", true, &environ_copy);
free(param);
/* ...and provide a nice string representation of what we
bound to */
(void) mca_base_var_env_name ("orte_base_applied_binding", &param);
opal_setenv(param, child->cpu_bitmap, true, &environ_copy);
free (param);
}
}
#endif

View file

@@ -113,7 +113,8 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
* slots on each node as directed or using default
*/
if (!orte_managed_allocation) {
if (NULL != orte_set_slots) {
if (NULL != orte_set_slots &&
0 != strncmp(orte_set_slots, "none", strlen(orte_set_slots))) {
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;

View file

@@ -276,11 +276,11 @@ the required syntax of #:object
Please check your request and try again.
#
[orte-rmaps-base:oversubscribed]
The requested number of processors exceeds the allocated
The requested number of processes exceeds the allocated
number of slots:
#slots: %d
#processors: %d
#processes: %d
This creates an oversubscribed condition that may adversely
impact performance when combined with the requested binding

View file

@@ -145,7 +145,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_PU, 0);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
if (ORTE_ERR_NOT_FOUND == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
@@ -156,7 +156,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CORE, 0);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
if (ORTE_ERR_NOT_FOUND == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
@@ -167,7 +167,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 1);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
if (ORTE_ERR_NOT_FOUND == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
@@ -178,7 +178,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 2);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
if (ORTE_ERR_NOT_FOUND == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
@@ -189,7 +189,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 3);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
if (ORTE_ERR_NOT_FOUND == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
@@ -200,7 +200,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_SOCKET, 0);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
if (ORTE_ERR_NOT_FOUND == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
@@ -211,7 +211,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_NODE, 0);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
if (ORTE_ERR_NOT_FOUND == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so

View file

@@ -71,8 +71,8 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
}
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
} else {
/* don't default to bound */
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
@@ -253,8 +253,8 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
}
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
} else {
/* don't default to bound */
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
@@ -508,8 +508,8 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
}
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
} else {
/* don't default to bound */
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
@@ -653,8 +653,8 @@ static int byobj_span(orte_job_t *jdata,
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
}
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
} else {
/* don't default to bound */
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);

View file

@@ -712,9 +712,16 @@ int orte_register_params(void)
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_max_vm_size);
orte_set_slots = NULL;
if (opal_hwloc_use_hwthreads_as_cpus) {
orte_set_slots = strdup("hwthreads");
} else {
orte_set_slots = strdup("cores");
}
(void) mca_base_var_register ("orte", "orte", NULL, "set_default_slots",
"Set the number of slots on nodes that lack such info to the number of specified objects [a number, \"cores\", \"numas\", \"sockets\", or \"hwthreads\"]",
"Set the number of slots on nodes that lack such info to the"
" number of specified objects [a number, \"cores\" (default),"
" \"numas\", \"sockets\", \"hwthreads\" (default if hwthreads_as_cpus is set),"
" or \"none\" to skip this option]",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_set_slots);

View file

@@ -658,7 +658,7 @@ int orterun(int argc, char *argv[])
* depend upon opal_init_util() functionality.
*/
/* Need to initialize OPAL so that install_dirs are filled in */
if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
if (OPAL_SUCCESS != opal_init(&argc, &argv)) {
exit(1);
}
@@ -833,10 +833,10 @@ int orterun(int argc, char *argv[])
*/
return rc;
}
/* finalize the OPAL utils. As they are opened again from orte_init->opal_init
* we continue to have a reference count on them. So we have to finalize them twice...
/* finalize OPAL. As it was opened again from orte_init->opal_init
* we continue to have a reference count on it. So we have to finalize it twice...
*/
opal_finalize_util();
opal_finalize();
/* get the daemon job object */