From f17d47087ae329ade94541505ce3c4853b982571 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 20 Jun 2018 21:26:09 -0700 Subject: [PATCH] Define a new binding method and qualifier Allow users to request that procs be bound to a cpu in a given cpu-list based on their corresponding local rank Signed-off-by: Ralph Castain --- opal/mca/hwloc/base/hwloc_base_frame.c | 10 ++++-- opal/mca/hwloc/hwloc-internal.h | 13 ++++++-- orte/mca/rmaps/base/help-orte-rmaps-base.txt | 15 ++++++++- orte/mca/rmaps/base/rmaps_base_binding.c | 33 ++++++++++++++++++- orte/mca/rmaps/base/rmaps_base_frame.c | 1 + orte/mca/rmaps/base/rmaps_base_map_job.c | 3 ++ orte/mca/rmaps/rank_file/rmaps_rank_file.c | 6 +++- .../rank_file/rmaps_rank_file_component.c | 5 +-- 8 files changed, 76 insertions(+), 10 deletions(-) diff --git a/opal/mca/hwloc/base/hwloc_base_frame.c b/opal/mca/hwloc/base/hwloc_base_frame.c index a56c5e065d..6d2273a06f 100644 --- a/opal/mca/hwloc/base/hwloc_base_frame.c +++ b/opal/mca/hwloc/base/hwloc_base_frame.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2018 Intel, Inc. All rights reserved. * Copyright (c) 2016-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -119,9 +119,9 @@ static int opal_hwloc_base_register(mca_base_register_flag_t flags) opal_hwloc_base_binding_policy = NULL; (void) mca_base_var_register("opal", "hwloc", "base", "binding_policy", "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, " - "l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is " + "l3cache, socket, numa, board, cpuset (\"none\" is the default when oversubscribed, \"core\" is " "the default when np<=2, and \"numa\" is the default when np>2). Allowed qualifiers: " - "overload-allowed, if-supported", + "overload-allowed, if-supported, ordered", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &opal_hwloc_base_binding_policy); @@ -504,6 +504,8 @@ int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *policy, char *spec } else if (0 == strncasecmp(quals[i], "overload-allowed", strlen(quals[i])) || 0 == strncasecmp(quals[i], "oversubscribe-allowed", strlen(quals[i]))) { tmp |= OPAL_BIND_ALLOW_OVERLOAD; + } else if (0 == strncasecmp(quals[i], "ordered", strlen(quals[i]))) { + tmp |= OPAL_BIND_ORDERED; } else { /* unknown option */ opal_output(0, "Unknown qualifier to binding policy: %s", spec); @@ -534,6 +536,8 @@ int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *policy, char *spec OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NUMA); } else if (0 == strcasecmp(tmpvals[0], "board")) { OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_BOARD); + } else if (0 == strcasecmp(tmpvals[0], "cpuset")) { + OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CPUSET); } else { opal_show_help("help-opal-hwloc-base.txt", "invalid binding_policy", true, "binding", spec); opal_argv_free(tmpvals); diff --git a/opal/mca/hwloc/hwloc-internal.h b/opal/mca/hwloc/hwloc-internal.h index ca030d0314..b680a471b1 100644 --- a/opal/mca/hwloc/hwloc-internal.h +++ b/opal/mca/hwloc/hwloc-internal.h @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2018 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * @@ -172,8 +172,15 @@ typedef uint16_t opal_binding_policy_t; /* binding directives */ #define OPAL_BIND_IF_SUPPORTED 0x1000 +/* allow assignment of multiple procs to + * same cpu */ #define OPAL_BIND_ALLOW_OVERLOAD 0x2000 +/* the binding policy was specified by the user */ #define OPAL_BIND_GIVEN 0x4000 +/* bind each rank to the cpu in the given + * cpu list based on its node-local-rank */ +#define OPAL_BIND_ORDERED 0x8000 + /* binding policies - any changes in these * values must be reflected in orte/mca/rmaps/rmaps.h */ @@ -190,7 +197,7 @@ typedef uint16_t opal_binding_policy_t; #define OPAL_GET_BINDING_POLICY(pol) \ ((pol) & 0x0fff) #define OPAL_SET_BINDING_POLICY(target, pol) \ - (target) = (pol) | (((target) & 0x2000) | OPAL_BIND_GIVEN) + (target) = (pol) | (((target) & 0xf000) | OPAL_BIND_GIVEN) #define OPAL_SET_DEFAULT_BINDING_POLICY(target, pol) \ do { \ if (!OPAL_BINDING_POLICY_IS_SET((target))) { \ @@ -208,6 +215,8 @@ typedef uint16_t opal_binding_policy_t; /* macro to detect if binding is forced */ #define OPAL_BIND_OVERLOAD_ALLOWED(n) \ (OPAL_BIND_ALLOW_OVERLOAD & (n)) +#define OPAL_BIND_ORDERED_REQUESTED(n) \ + (OPAL_BIND_ORDERED & (n)) /* some global values */ OPAL_DECLSPEC extern hwloc_topology_t opal_hwloc_topology; diff --git a/orte/mca/rmaps/base/help-orte-rmaps-base.txt b/orte/mca/rmaps/base/help-orte-rmaps-base.txt index 2f5f5b5d0c..88dcab07a9 100644 --- a/orte/mca/rmaps/base/help-orte-rmaps-base.txt +++ b/orte/mca/rmaps/base/help-orte-rmaps-base.txt @@ -13,7 +13,7 @@ # Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2014-2017 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2018 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -420,3 +420,16 @@ compute node failed: We cannot continue - please check that the policy is in accordance with the actual available hardware. +# +[rmaps:insufficient-cpus] +The request to bind processes to cpus in a provided list +of logical id's based on their local rank on a node cannot +be met due to there being more processes on a node than +available cpus: + + Node: %s + Local rank: %d + Cpu list: %s + +Please adjust either the number of processes per node or +the list of cpus. diff --git a/orte/mca/rmaps/base/rmaps_base_binding.c b/orte/mca/rmaps/base/rmaps_base_binding.c index 43e0916564..6183030e09 100644 --- a/orte/mca/rmaps/base/rmaps_base_binding.c +++ b/orte/mca/rmaps/base/rmaps_base_binding.c @@ -505,6 +505,9 @@ static int bind_to_cpuset(orte_job_t *jdata) opal_hwloc_topo_data_t *sum; hwloc_obj_t root; char *cpu_bitmap; + unsigned id; + orte_local_rank_t lrank; + hwloc_bitmap_t mycpuset; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bind job %s to cpus %s", @@ -512,6 +515,7 @@ static int bind_to_cpuset(orte_job_t *jdata) opal_hwloc_base_cpu_list); /* initialize */ map = jdata->map; + mycpuset = hwloc_bitmap_alloc(); for (i=0; i < map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { @@ -569,6 +573,8 @@ static int bind_to_cpuset(orte_job_t *jdata) ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } + /* the cpu list in sum->available has already been filtered + * to include _only_ the cpus defined by the user */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; @@ -577,13 +583,38 @@ static int bind_to_cpuset(orte_job_t *jdata) if (proc->name.jobid != jdata->jobid) { continue; } - hwloc_bitmap_list_asprintf(&cpu_bitmap, sum->available); + if (OPAL_BIND_ORDERED_REQUESTED(jdata->map->binding)) { + /* assign each proc, in local rank order, to + * the corresponding cpu in the list */ + id = hwloc_bitmap_first(sum->available); + lrank = 0; + while (lrank != proc->local_rank) { + id = hwloc_bitmap_next(sum->available, id); + if ((unsigned)-1 == id) { + break; + } + ++lrank; + } + if ((unsigned)-1 ==id) { + /* ran out of cpus - that's an error */ + orte_show_help("help-orte-rmaps-base.txt", "rmaps:insufficient-cpus", true, + node->name, (int)proc->local_rank, opal_hwloc_base_cpu_list); + return ORTE_ERR_OUT_OF_RESOURCE; + } + /* set the bit of interest */ + hwloc_bitmap_only(mycpuset, id); + } else { + /* bind the proc to all assigned cpus */ + mycpuset = sum->available; + } + hwloc_bitmap_list_asprintf(&cpu_bitmap, mycpuset); orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING); if (NULL != cpu_bitmap) { free(cpu_bitmap); } } } + hwloc_bitmap_free(mycpuset); return ORTE_SUCCESS; } diff --git a/orte/mca/rmaps/base/rmaps_base_frame.c b/orte/mca/rmaps/base/rmaps_base_frame.c index 1210e05e89..9d22d102fc 100644 --- a/orte/mca/rmaps/base/rmaps_base_frame.c +++ b/orte/mca/rmaps/base/rmaps_base_frame.c @@ -280,6 +280,7 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags) return ORTE_ERR_SILENT; } } + if (0 < orte_rmaps_base.cpus_per_rank) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--cpus-per-proc, -cpus-per-proc, --cpus-per-rank, -cpus-per-rank", diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 6fdc6cc1a1..5dfbcaad1e 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -199,6 +199,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) } } } + /* check for oversubscribe directives */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { @@ -212,12 +213,14 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) } } } + /* check for no-use-local directive */ if (!(ORTE_MAPPING_LOCAL_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { if (ORTE_MAPPING_NO_USE_LOCAL & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_USE_LOCAL); } } + /* ditto for rank policy */ if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { jdata->map->ranking = orte_rmaps_base.ranking; diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.c b/orte/mca/rmaps/rank_file/rmaps_rank_file.c index ee8651d5b2..8a623cceb4 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file.c @@ -14,7 +14,7 @@ * All rights reserved. * Copyright (c) 2008 Voltaire. All rights reserved * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. @@ -110,6 +110,10 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) /* NOT FOR US */ return ORTE_ERR_TAKE_NEXT_OPTION; } + if (OPAL_BIND_ORDERED_REQUESTED(jdata->map->binding)) { + /* NOT FOR US */ + return ORTE_ERR_TAKE_NEXT_OPTION; + } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:rank_file: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file_component.c b/orte/mca/rmaps/rank_file/rmaps_rank_file_component.c index 623d85d16e..5f7f5d6641 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file_component.c +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2008 Voltaire. All rights reserved * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -106,7 +106,8 @@ static int orte_rmaps_rank_file_register(void) static int orte_rmaps_rank_file_open(void) { /* ensure we flag mapping by user */ - if (NULL != opal_hwloc_base_cpu_list || NULL != orte_rankfile) { + if ((NULL != opal_hwloc_base_cpu_list && !OPAL_BIND_ORDERED_REQUESTED(opal_hwloc_binding_policy)) || + NULL != orte_rankfile) { if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { /* if a non-default mapping is already specified, then we * have an error