Now that we are binding by default, the issue of #slots and what to do when oversubscribed has become a bit more complicated. This isn't a problem in managed environments as we are always provided an accurate assignment for the #slots, or when -host is used to define the allocation, since we automatically assume one slot each time a node is named.
The problem arises when a hostfile is used, and the user provides host names without specifying the slots= parameter. In these cases, we assign slots=1, but automatically allow oversubscription since that number isn't confirmed. We then provide a separate parameter by which the user can direct that we assign the number of slots based on the sensed hardware - e.g., by telling us to set the #slots equal to the #cores on each node. However, this has been set to "off" by default. In order to make this a little less complex for the user, set the default such that we automatically set #slots equal to #cores (or #hwt's if use_hwthreads_as_cpus has been set) only for those cases where the user provides names in a hostfile but does not provide slot information. Also clean up a couple of issues in the mapping/binding system: * ensure we only override the binding directive if we are oversubscribed *and* overload is not allowed * ensure that the MPI procs don't attempt to bind themselves if they are launched by an orted, as any binding directive (no matter what it was) would have been serviced by the orted on launch * minor cleanup to the warning message when oversubscribed and binding was requested cmr=v1.7.5:reviewer=rhc:subject=update mapping/binding system This commit was SVN r30909.
Этот коммит содержится в:
родитель
baf21ab446
Коммит
0ac97761cc
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -778,6 +778,14 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
|
||||
free(param2);
|
||||
}
|
||||
|
||||
/* Set an info MCA param that tells the launched processes that
|
||||
* any binding policy was applied by us (e.g., so that
|
||||
* MPI_INIT doesn't try to bind itself)
|
||||
*/
|
||||
(void) mca_base_var_env_name ("orte_bound_at_launch", ¶m);
|
||||
opal_setenv(param, "1", true, environ_copy);
|
||||
free(param);
|
||||
|
||||
/* push data into environment - don't push any single proc
|
||||
* info, though. We are setting the environment up on a
|
||||
* per-context basis, and will add the individual proc
|
||||
|
@ -478,11 +478,9 @@ static int do_child(orte_app_context_t* context,
|
||||
/* bind this proc to all available processors */
|
||||
hwloc_set_cpubind(opal_hwloc_topology, sum->available, 0);
|
||||
}
|
||||
/* Set an info MCA param that tells
|
||||
the launched processes that it was bound by us (e.g., so that
|
||||
MPI_INIT doesn't try to bind itself) */
|
||||
(void) mca_base_var_env_name ("orte_bound_at_launch", ¶m);
|
||||
opal_setenv(param, "1", true, &environ_copy);
|
||||
/* provide a nice string representation of what we bound to */
|
||||
(void) mca_base_var_env_name ("orte_base_applied_binding", ¶m);
|
||||
opal_setenv(param, child->cpu_bitmap, true, &environ_copy);
|
||||
free(param);
|
||||
goto PROCEED;
|
||||
}
|
||||
@ -602,17 +600,6 @@ static int do_child(orte_app_context_t* context,
|
||||
goto PROCEED;
|
||||
}
|
||||
}
|
||||
/* Set an info MCA param that tells
|
||||
the launched processes that it was bound by us (e.g., so that
|
||||
MPI_INIT doesn't try to bind itself) */
|
||||
(void) mca_base_var_env_name ("orte_bound_at_launch", ¶m);
|
||||
opal_setenv(param, "1", true, &environ_copy);
|
||||
free(param);
|
||||
/* ...and provide a nice string representation of what we
|
||||
bound to */
|
||||
(void) mca_base_var_env_name ("orte_base_applied_binding", ¶m);
|
||||
opal_setenv(param, child->cpu_bitmap, true, &environ_copy);
|
||||
free (param);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -113,7 +113,8 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
||||
* slots on each node as directed or using default
|
||||
*/
|
||||
if (!orte_managed_allocation) {
|
||||
if (NULL != orte_set_slots) {
|
||||
if (NULL != orte_set_slots &&
|
||||
0 != strncmp(orte_set_slots, "none", strlen(orte_set_slots))) {
|
||||
for (i=0; i < orte_node_pool->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
continue;
|
||||
|
@ -276,11 +276,11 @@ the required syntax of #:object
|
||||
Please check your request and try again.
|
||||
#
|
||||
[orte-rmaps-base:oversubscribed]
|
||||
The requested number of processors exceeds the allocated
|
||||
The requested number of processes exceeds the allocated
|
||||
number of slots:
|
||||
|
||||
#slots: %d
|
||||
#processors: %d
|
||||
#processes: %d
|
||||
|
||||
This creates an oversubscribed condition that may adversely
|
||||
impact performance when combined with the requested binding
|
||||
|
@ -145,7 +145,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
} else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
|
||||
app->num_procs, HWLOC_OBJ_PU, 0);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == rc) {
|
||||
if (ORTE_ERR_NOT_FOUND == rc) {
|
||||
/* if the mapper couldn't map by this object because
|
||||
* it isn't available, but the error allows us to try
|
||||
* byslot, then do so
|
||||
@ -156,7 +156,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
} else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
|
||||
app->num_procs, HWLOC_OBJ_CORE, 0);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == rc) {
|
||||
if (ORTE_ERR_NOT_FOUND == rc) {
|
||||
/* if the mapper couldn't map by this object because
|
||||
* it isn't available, but the error allows us to try
|
||||
* byslot, then do so
|
||||
@ -167,7 +167,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
} else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
|
||||
app->num_procs, HWLOC_OBJ_CACHE, 1);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == rc) {
|
||||
if (ORTE_ERR_NOT_FOUND == rc) {
|
||||
/* if the mapper couldn't map by this object because
|
||||
* it isn't available, but the error allows us to try
|
||||
* byslot, then do so
|
||||
@ -178,7 +178,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
} else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
|
||||
app->num_procs, HWLOC_OBJ_CACHE, 2);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == rc) {
|
||||
if (ORTE_ERR_NOT_FOUND == rc) {
|
||||
/* if the mapper couldn't map by this object because
|
||||
* it isn't available, but the error allows us to try
|
||||
* byslot, then do so
|
||||
@ -189,7 +189,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
} else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
|
||||
app->num_procs, HWLOC_OBJ_CACHE, 3);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == rc) {
|
||||
if (ORTE_ERR_NOT_FOUND == rc) {
|
||||
/* if the mapper couldn't map by this object because
|
||||
* it isn't available, but the error allows us to try
|
||||
* byslot, then do so
|
||||
@ -200,7 +200,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
} else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
|
||||
app->num_procs, HWLOC_OBJ_SOCKET, 0);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == rc) {
|
||||
if (ORTE_ERR_NOT_FOUND == rc) {
|
||||
/* if the mapper couldn't map by this object because
|
||||
* it isn't available, but the error allows us to try
|
||||
* byslot, then do so
|
||||
@ -211,7 +211,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
} else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
|
||||
app->num_procs, HWLOC_OBJ_NODE, 0);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == rc) {
|
||||
if (ORTE_ERR_NOT_FOUND == rc) {
|
||||
/* if the mapper couldn't map by this object because
|
||||
* it isn't available, but the error allows us to try
|
||||
* byslot, then do so
|
||||
|
@ -71,8 +71,8 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
|
||||
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
|
||||
true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
}
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
} else {
|
||||
/* don't default to bound */
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
@ -253,8 +253,8 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
|
||||
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
|
||||
true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
}
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
} else {
|
||||
/* don't default to bound */
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
@ -508,8 +508,8 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
|
||||
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
|
||||
true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
}
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
} else {
|
||||
/* don't default to bound */
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
@ -653,8 +653,8 @@ static int byobj_span(orte_job_t *jdata,
|
||||
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
|
||||
true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
}
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
} else {
|
||||
/* don't default to bound */
|
||||
OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
|
||||
|
@ -712,9 +712,16 @@ int orte_register_params(void)
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_max_vm_size);
|
||||
|
||||
orte_set_slots = NULL;
|
||||
if (opal_hwloc_use_hwthreads_as_cpus) {
|
||||
orte_set_slots = strdup("hwthreads");
|
||||
} else {
|
||||
orte_set_slots = strdup("cores");
|
||||
}
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "set_default_slots",
|
||||
"Set the number of slots on nodes that lack such info to the number of specified objects [a number, \"cores\", \"numas\", \"sockets\", or \"hwthreads\"]",
|
||||
"Set the number of slots on nodes that lack such info to the"
|
||||
" number of specified objects [a number, \"cores\" (default),"
|
||||
" \"numas\", \"sockets\", \"hwthreads\" (default if hwthreads_as_cpus is set),"
|
||||
" or \"none\" to skip this option]",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_set_slots);
|
||||
|
@ -658,7 +658,7 @@ int orterun(int argc, char *argv[])
|
||||
* depend upon opal_init_util() functionality.
|
||||
*/
|
||||
/* Need to initialize OPAL so that install_dirs are filled in */
|
||||
if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
|
||||
if (OPAL_SUCCESS != opal_init(&argc, &argv)) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -833,10 +833,10 @@ int orterun(int argc, char *argv[])
|
||||
*/
|
||||
return rc;
|
||||
}
|
||||
/* finalize the OPAL utils. As they are opened again from orte_init->opal_init
|
||||
* we continue to have a reference count on them. So we have to finalize them twice...
|
||||
/* finalize OPAL. As it was opened again from orte_init->opal_init
|
||||
* we continue to have a reference count on it. So we have to finalize it twice...
|
||||
*/
|
||||
opal_finalize_util();
|
||||
opal_finalize();
|
||||
|
||||
|
||||
/* get the daemon job object */
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user