From 3f04d50cb062489be55bcc3c5d1e34a2dfad0ef2 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 14 Jun 2014 15:38:32 +0000 Subject: [PATCH] Per the ticket, resolve our handling of overload conditions to provide a more consistent response. If we are overloaded (i.e., attempting to bind more processes to a location than the number of cpus under that location), then we consider the following conditions: (a) default binding policy is in effect. In this case, we will emit a warning and default to not binding unless the user provided the "oversubscribe" or "overload" modifier to the "bind-to" option. (b) user-specified binding policy is in effect. In this case, we will error out unless the user provided the "oversubscribe" or "overload" modifier to the "bind-to" option as we cannot meet the directive. Either "bind-to" modifier (oversubscribe or overload) will be accepted for now - in 1.9, we will deprecate the "overload" term in favor of "oversubscribe". Also added the ability to accept a --bind-to modifier without specifying the binding policy itself so a user can specify overload-allowed with the default policy. Closes trac:4345 cmr=v1.8.2:reviewer=rhc:subject=resolve handling of overload conditions This commit was SVN r32005. 
The following Trac tickets were found above: Ticket 4345 --> https://svn.open-mpi.org/trac/ompi/ticket/4345 --- opal/mca/hwloc/base/hwloc_base_frame.c | 10 +- orte/mca/rmaps/base/help-orte-rmaps-base.txt | 16 +--- orte/mca/rmaps/base/rmaps_base_binding.c | 92 ++++++++++++++----- orte/mca/rmaps/round_robin/rmaps_rr_mappers.c | 92 ------------------- 4 files changed, 80 insertions(+), 130 deletions(-) diff --git a/opal/mca/hwloc/base/hwloc_base_frame.c b/opal/mca/hwloc/base/hwloc_base_frame.c index 7a52edb310..91069ffd17 100644 --- a/opal/mca/hwloc/base/hwloc_base_frame.c +++ b/opal/mca/hwloc/base/hwloc_base_frame.c @@ -517,8 +517,12 @@ int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *policy, char *spec OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NONE); } else { tmpvals = opal_argv_split(spec, ':'); - if (1 < opal_argv_count(tmpvals)) { - quals = opal_argv_split(tmpvals[1], ','); + if (1 < opal_argv_count(tmpvals) || ':' == spec[0]) { + if (':' == spec[0]) { + quals = opal_argv_split(&spec[1], ','); + } else { + quals = opal_argv_split(tmpvals[1], ','); + } for (i=0; NULL != quals[i]; i++) { if (0 == strncasecmp(quals[i], "if-supported", strlen(quals[i]))) { tmp |= OPAL_BIND_IF_SUPPORTED; @@ -533,7 +537,7 @@ int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *policy, char *spec } opal_argv_free(quals); } - if (NULL == tmpvals[0]) { + if (NULL == tmpvals[0] || ':' == spec[0]) { OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE); tmp &= ~OPAL_BIND_GIVEN; } else { diff --git a/orte/mca/rmaps/base/help-orte-rmaps-base.txt b/orte/mca/rmaps/base/help-orte-rmaps-base.txt index 5427c37c24..e88d383c02 100644 --- a/orte/mca/rmaps/base/help-orte-rmaps-base.txt +++ b/orte/mca/rmaps/base/help-orte-rmaps-base.txt @@ -123,10 +123,10 @@ be found on node %s. 
A request was made to bind to that would result in binding more processes than cpus on a resource: - Bind to: %s - Node: %s + Bind to: %s + Node: %s #processes: %d - #cpus: %d + #cpus: %d You can override this protection by adding the "overload-allowed" option to your binding directive. @@ -312,13 +312,3 @@ directive. Please specify a mapping level that has more than one cpu, or else let us define a default mapping that will allow multiple cpus-per-proc. -# -[unrecog-modifier] -A modifier was given to the --map-by directive that is not -recognized: - - Modifier: %s - -Please see "mpirun --help" for a description of supported -modifiers. - diff --git a/orte/mca/rmaps/base/rmaps_base_binding.c b/orte/mca/rmaps/base/rmaps_base_binding.c index c4ee8454b2..e4b79845be 100644 --- a/orte/mca/rmaps/base/rmaps_base_binding.c +++ b/orte/mca/rmaps/base/rmaps_base_binding.c @@ -104,6 +104,20 @@ static void reset_usage(orte_node_t *node, orte_jobid_t jobid) } } +static void unbind_procs(orte_job_t *jdata) +{ + int j; + orte_proc_t *proc; + + for (j=0; j < jdata->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { + continue; + } + orte_remove_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND); + orte_remove_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP); + } +} + static int bind_upwards(orte_job_t *jdata, orte_node_t *node, hwloc_obj_type_t target, @@ -176,12 +190,27 @@ static int bind_upwards(orte_job_t *jdata, * and it wasn't a default binding policy (i.e., the user requested it) */ if (ncpus < data->num_bound && - !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) && - OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { - orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, - opal_hwloc_base_print_binding(map->binding), node->name, - data->num_bound, ncpus); - return ORTE_ERR_SILENT; + !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { + if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { + /* if 
the user specified a binding policy, then we cannot meet + * it since overload isn't allowed, so error out - have the + * message indicate that setting overload allowed will remove + * this restriction */ + orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, + opal_hwloc_base_print_binding(map->binding), node->name, + data->num_bound, ncpus); + return ORTE_ERR_SILENT; + } else { + /* if we have the default binding policy, emit a warning + * that we won't be binding-by-default and include a statement + * that setting overload allowed will silence the warning */ + orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, + opal_hwloc_base_print_binding(map->binding), node->name, + data->num_bound, ncpus); + OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE); + unbind_procs(jdata); + return ORTE_SUCCESS; + } } /* bind it here */ cpus = opal_hwloc_base_get_available_cpus(node->topology, obj); @@ -298,20 +327,26 @@ static int bind_downwards(orte_job_t *jdata, if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { + /* if the user specified a binding policy, then we cannot meet + * it since overload isn't allowed, so error out - have the + * message indicate that setting overload allowed will remove + * this restriction */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); hwloc_bitmap_free(totalcpuset); return ORTE_ERR_SILENT; } else { - /* if this is the default binding policy, then just don't - * bind this proc - */ - data->num_bound--; // maintain count - /* show the proc as not bound */ - orte_remove_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND); + /* if we have the default binding policy, emit a warning + * that we won't be binding-by-default and include a statement + * that setting overload allowed will silence the warning */ + 
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, + opal_hwloc_base_print_binding(map->binding), node->name, + data->num_bound, ncpus); + OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE); + unbind_procs(jdata); - hwloc_bitmap_zero(totalcpuset); - break; + hwloc_bitmap_free(totalcpuset); /* early return: release the cpuset, as the error return does */ + return ORTE_SUCCESS; } } /* bind the proc here */ @@ -501,15 +536,28 @@ static int bind_in_place(orte_job_t *jdata, } } if (!found) { - /* no place to put this - see if overload is allowed and - * error out if adding a proc would cause overload and that wasn't allowed, - * and it wasn't a default binding policy (i.e., the user requested it)*/ - if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) && - OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { - orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, - opal_hwloc_base_print_binding(map->binding), node->name, - data->num_bound, ncpus); - return ORTE_ERR_SILENT; + /* no place to put this - see if overload is allowed */ + if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { + if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { + /* if the user specified a binding policy, then we cannot meet + * it since overload isn't allowed, so error out - have the + * message indicate that setting overload allowed will remove + * this restriction */ + orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, + opal_hwloc_base_print_binding(map->binding), node->name, + data->num_bound, ncpus); + return ORTE_ERR_SILENT; + } else { + /* if we have the default binding policy, emit a warning + * that we won't be binding-by-default and include a statement + * that setting overload allowed will silence the warning */ + orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, + opal_hwloc_base_print_binding(map->binding), node->name, + data->num_bound, ncpus); + OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE); + unbind_procs(jdata); + return ORTE_SUCCESS; + } } } } diff --git 
a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index 61b01f37c5..9104b6fbc5 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -62,30 +62,6 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata, true, app->num_procs, app->app); return ORTE_ERR_SILENT; } -#if OPAL_HAVE_HWLOC - /* if we will and are allowed to oversubscribe, and binding was given, then - * we really should warn the user that we cannot bind - */ - if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { - if ((OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding) || - OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) && - !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){ - /* RHC: don't emit this warning at this time while we try to - * determine the best path forward. See - * https://svn.open-mpi.org/trac/ompi/ticket/4345 - * for an explanation - orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed", - true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank); - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - */ - } - } else { - /* don't default to bound */ - opal_output_verbose(2, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping by slot resetting binding policy to NONE as node is oversubscribed"); - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - } -#endif } /* first pass: map the number of procs to each node until we @@ -253,30 +229,6 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata, return ORTE_ERR_SILENT; } oversubscribed = true; -#if OPAL_HAVE_HWLOC - /* if we will and are allowed to oversubscribe, and binding was given, then - * we really should warn the user that we cannot bind - */ - if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { - if ((OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding) || - OPAL_BIND_TO_HWTHREAD == 
OPAL_GET_BINDING_POLICY(jdata->map->binding)) && - !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){ - /* RHC: don't emit this warning at this time while we try to - * determine the best path forward. See - * https://svn.open-mpi.org/trac/ompi/ticket/4345 - * for an explanation - orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed", - true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank); - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - */ - } - } else { - /* don't default to bound */ - opal_output_verbose(2, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping by node resetting binding policy to NONE as node is oversubscribed"); - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - } -#endif } nnodes = opal_list_get_size(node_list); @@ -517,28 +469,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, true, app->num_procs, app->app); return ORTE_ERR_SILENT; } - /* if we will and are allowed to oversubscribe, and binding was given, then - * we really should warn the user that we cannot bind - */ - if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { - if ((OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding) || - OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) && - !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){ - /* RHC: don't emit this warning at this time while we try to - * determine the best path forward. 
See - * https://svn.open-mpi.org/trac/ompi/ticket/4345 - * for an explanation - orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed", - true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank); - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - */ - } - } else { - /* don't default to bound */ - opal_output_verbose(2, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping no-span resetting binding policy to NONE as node is oversubscribed"); - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - } } /* we know we have enough slots, or that oversubscrption is allowed, so @@ -675,28 +605,6 @@ static int byobj_span(orte_job_t *jdata, true, app->num_procs, app->app); return ORTE_ERR_SILENT; } - /* if we will and are allowed to oversubscribe, and binding was given, then - * we really should warn the user that we cannot bind - */ - if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { - if ((OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding) || - OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) && - !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){ - /* RHC: don't emit this warning at this time while we try to - * determine the best path forward. See - * https://svn.open-mpi.org/trac/ompi/ticket/4345 - * for an explanation - orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed", - true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank); - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - */ - } - } else { - /* don't default to bound */ - opal_output_verbose(2, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping span resetting binding policy to NONE as node is oversubscribed"); - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - } } /* we know we have enough slots, or that oversubscrption is allowed, so