From 9db4542c2bbb5fb2e20f7522f8cefe3d1ba56403 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 19 Sep 2011 16:10:37 +0000 Subject: [PATCH] Move maffinity_base_alloc_policy and maffinity_base_bind_failure_action MCA params to the hwloc base (hwloc_base_alloc_policy and hwloc_base_bind_failure_action). Since these MCA parameters were never on a release branch, I'm just moving/renaming them outright and not leaving aliases to the old names. Note that some upper layer needs to call opal_hwloc_base_set_process_membind_policy() to set the set-by-MCA-param process-wide memory affinity policy. We can't do this automatically during hwloc_base_open() because, for reasons described elsewhere, opal_hwloc_topology is not automatically filled during hwloc_base_open() (in short: potential scalability issues when launching many MPI processes simultaneously on a single machine, for example). This commit was SVN r25156. --- opal/mca/hwloc/base/Makefile.am | 4 +- opal/mca/hwloc/base/base.h | 62 +++++++++++++++++- .../base/help-opal-hwloc-base.txt} | 8 +-- opal/mca/hwloc/base/hwloc_base_open.c | 56 +++++++++++++++- .../hwloc/base/hwloc_base_proc_mempolicy.c | 64 +++++++++++++++++++ .../base/hwloc_base_util.c} | 11 ++-- opal/mca/maffinity/Makefile.am | 4 +- opal/mca/maffinity/base/Makefile.am | 5 +- opal/mca/maffinity/base/base.h | 31 +-------- opal/mca/maffinity/base/maffinity_base_open.c | 54 ---------------- .../maffinity/hwloc/maffinity_hwloc_module.c | 32 +--------- 11 files changed, 198 insertions(+), 133 deletions(-) rename opal/mca/{maffinity/base/help-opal-maffinity-base.txt => hwloc/base/help-opal-hwloc-base.txt} (73%) create mode 100644 opal/mca/hwloc/base/hwloc_base_proc_mempolicy.c rename opal/mca/{maffinity/base/maffinity_base_util.c => hwloc/base/hwloc_base_util.c} (81%) diff --git a/opal/mca/hwloc/base/Makefile.am b/opal/mca/hwloc/base/Makefile.am index 4b0b679282..9ff614cc18 100644 --- a/opal/mca/hwloc/base/Makefile.am +++ b/opal/mca/hwloc/base/Makefile.am @@ 
-18,6 +18,8 @@ libmca_hwloc_la_SOURCES += \ if OPAL_HAVE_HWLOC libmca_hwloc_la_SOURCES += \ - base/hwloc_base_dt.c + base/hwloc_base_dt.c \ + base/hwloc_base_util.c \ + base/hwloc_base_proc_mempolicy.c endif diff --git a/opal/mca/hwloc/base/base.h b/opal/mca/hwloc/base/base.h index f783dd7daa..ada3c049dc 100644 --- a/opal/mca/hwloc/base/base.h +++ b/opal/mca/hwloc/base/base.h @@ -40,6 +40,17 @@ BEGIN_C_DECLS * variable should \em only be used by other hwloc base * functions -- it is not considered a public interface member -- * and is only mentioned here for completeness. + * + * Note that this function does NOT fill the global variable + * opal_hwloc_topology, nor does it set the process-wide memory + * affinity policy. Filling opal_hwloc_topology via + * hwloc_topology_load() can be expensive (and/or serialized by the + * OS); it may not be desirable to call this function in every MPI + * process on a machine. Hence, it is the responsibility of an upper + * layer to both fill opal_hwloc_topology in some scalable way, as + * well as to invoke opal_hwloc_base_set_process_membind_policy() + * (after opal_hwloc_topology has been loaded) to set the process-wide + * memory affinity policy. */ OPAL_DECLSPEC int opal_hwloc_base_open(void); @@ -85,8 +96,57 @@ OPAL_DECLSPEC int opal_hwloc_size(size_t *size, hwloc_topology_t src, opal_data_type_t type); OPAL_DECLSPEC void opal_hwloc_release(opal_dss_value_t *value); + +/** + * Report a bind failure using the normal mechanisms if a component + * fails to bind memory -- according to the value of the + * hwloc_base_bind_failure_action MCA parameter. + */ +OPAL_DECLSPEC int opal_hwloc_base_report_bind_failure(const char *file, + int line, + const char *msg, + int rc); + #endif +/** + * Enum for what memory allocation policy we want for user allocations. + * MAP = memory allocation policy. 
+ */ +typedef enum { + OPAL_HWLOC_BASE_MAP_NONE, + OPAL_HWLOC_BASE_MAP_LOCAL_ONLY +} opal_hwloc_base_map_t; + +/** + * Global reflecting the MAP (set by MCA param). + */ +OPAL_DECLSPEC extern opal_hwloc_base_map_t opal_hwloc_base_map; + +/** + * Enum for what to do if the hwloc framework tries to bind memory + * and fails. BFA = bind failure action. + */ +typedef enum { + OPAL_HWLOC_BASE_BFA_WARN, + OPAL_HWLOC_BASE_BFA_ERROR +} opal_hwloc_base_bfa_t; + +/** + * Global reflecting the BFA (set by MCA param). + */ +OPAL_DECLSPEC extern opal_hwloc_base_bfa_t opal_hwloc_base_bfa; + +/** + * This function sets the process-wide memory affinity policy + * according to opal_hwloc_base_map. It needs + * to be a separate, standalone function (as opposed to being done + * during opal_hwloc_base_open()) because opal_hwloc_topology is not + * loaded by opal_hwloc_base_open(). Hence, an upper layer needs to + * invoke this function after opal_hwloc_topology has been loaded. + */ +OPAL_DECLSPEC int opal_hwloc_base_set_process_membind_policy(void); + END_C_DECLS -#endif /* OPAL_BASE_HWLOC_H */ +#endif /* OPAL_HWLOC_BASE_H */ diff --git a/opal/mca/maffinity/base/help-opal-maffinity-base.txt b/opal/mca/hwloc/base/help-opal-hwloc-base.txt similarity index 73% rename from opal/mca/maffinity/base/help-opal-maffinity-base.txt rename to opal/mca/hwloc/base/help-opal-hwloc-base.txt index 533ac2070d..0658307c9f 100644 --- a/opal/mca/maffinity/base/help-opal-maffinity-base.txt +++ b/opal/mca/hwloc/base/help-opal-hwloc-base.txt @@ -7,14 +7,14 @@ # # $HEADER$ # -# This is the US/English help file for Open MPI's maffinity base support +# This is the US/English help file for Open MPI's hwloc base support # [invalid policy] WARNING: An invalid value was given for the -maffinity_base_general_alloc_policy MCA parameter. This MCA parameter +hwloc_base_alloc_policy MCA parameter. This MCA parameter determines the policy used for general memory allocations. 
Note that this parameter *only* has effect when MPI processes are bound to -specific processors. +specific processors. The value provided was: @@ -25,8 +25,6 @@ The value provided was: Valid values are: none: no memory binding policy is enforced - prefer_local: try allocating memory on the local NUMA node, but - allow spilling over to remote NUMA nodes if necessary local_only: fail an allocation if it cannot be placed entirely on the local NUMA node diff --git a/opal/mca/hwloc/base/hwloc_base_open.c b/opal/mca/hwloc/base/hwloc_base_open.c index 0329441d95..1260765eb6 100644 --- a/opal/mca/hwloc/base/hwloc_base_open.c +++ b/opal/mca/hwloc/base/hwloc_base_open.c @@ -13,6 +13,7 @@ #include "opal/constants.h" #include "opal/dss/dss.h" #include "opal/util/output.h" +#include "opal/util/show_help.h" #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" @@ -38,6 +39,9 @@ bool opal_hwloc_base_inited = false; #if OPAL_HAVE_HWLOC hwloc_topology_t opal_hwloc_topology=NULL; #endif +opal_hwloc_base_map_t opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_NONE; +opal_hwloc_base_bfa_t opal_hwloc_base_bfa = OPAL_HWLOC_BASE_BFA_ERROR; + int opal_hwloc_base_open(void) { @@ -50,7 +54,8 @@ int opal_hwloc_base_open(void) { int value; opal_data_type_t tmp; - + char *str_value; + /* Debugging / verbose output */ mca_base_param_reg_int_name("hwloc", "base_verbose", "Verbosity level of the hwloc framework", @@ -62,6 +67,55 @@ int opal_hwloc_base_open(void) opal_hwloc_base_output = -1; } + /* hwloc_base_mbind_policy */ + switch (opal_hwloc_base_map) { + case OPAL_HWLOC_BASE_MAP_NONE: + str_value = "none"; + break; + case OPAL_HWLOC_BASE_MAP_LOCAL_ONLY: + str_value = "local_only"; + break; + } + mca_base_param_reg_string_name("hwloc", "base_alloc_policy", + "Policy that determines how general memory allocations are bound after MPI_INIT. A value of \"none\" means that no memory policy is applied. 
A value of \"local_only\" means that all memory allocations will be restricted to the local NUMA node where each process is placed. Note that operating system paging policies are unaffected by this setting. For example, if \"local_only\" is used and local NUMA node memory is exhausted, a new memory allocation may cause paging.", + false, false, str_value, &str_value); + if (strcasecmp(str_value, "none") == 0) { + opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_NONE; + } else if (strcasecmp(str_value, "local_only") == 0 || + strcasecmp(str_value, "local-only") == 0) { + opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_LOCAL_ONLY; + } else { + char hostname[32]; + gethostname(hostname, sizeof(hostname)); + opal_show_help("help-opal-hwloc-base.txt", "invalid policy", + true, hostname, getpid(), str_value); + return OPAL_ERR_BAD_PARAM; + } + + /* hwloc_base_bind_failure_action */ + switch (opal_hwloc_base_bfa) { + case OPAL_HWLOC_BASE_BFA_WARN: + str_value = "warn"; + break; + case OPAL_HWLOC_BASE_BFA_ERROR: + str_value = "error"; + break; + } + mca_base_param_reg_string_name("hwloc", "base_bind_failure_action", + "What Open MPI will do if it explicitly tries to bind memory to a specific NUMA location, and fails. Note that this is a different case than the general allocation policy described by hwloc_base_alloc_policy. A value of \"warn\" means that Open MPI will warn the first time this happens, but allow the job to continue (possibly with degraded performance). 
A value of \"error\" means that Open MPI will abort the job if this happens.", + false, false, str_value, &str_value); + if (strcasecmp(str_value, "warn") == 0) { + opal_hwloc_base_bfa = OPAL_HWLOC_BASE_BFA_WARN; + } else if (strcasecmp(str_value, "error") == 0) { + opal_hwloc_base_bfa = OPAL_HWLOC_BASE_BFA_ERROR; + } else { + char hostname[32]; + gethostname(hostname, sizeof(hostname)); + opal_show_help("help-opal-hwloc-base.txt", "invalid error action", + true, hostname, getpid(), str_value); + return OPAL_ERR_BAD_PARAM; + } + /* to support tools such as ompi_info, add the components * to a list */ diff --git a/opal/mca/hwloc/base/hwloc_base_proc_mempolicy.c b/opal/mca/hwloc/base/hwloc_base_proc_mempolicy.c new file mode 100644 index 0000000000..9591349a0f --- /dev/null +++ b/opal/mca/hwloc/base/hwloc_base_proc_mempolicy.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "opal_config.h" + +#include "opal/constants.h" + +#include "opal/mca/hwloc/hwloc.h" +#include "opal/mca/hwloc/base/base.h" + + +/* + * Don't use show_help() here (or print any error message at all). + * Let the upper layer output a relevant message, because doing so may + * be complicated (e.g., this might be called from the ORTE ODLS, + * which has to do some extra steps to get error messages to be + * displayed). 
+ */ +int opal_hwloc_base_set_process_membind_policy(void) +{ + int rc = 0, flags; + hwloc_membind_policy_t policy; + hwloc_cpuset_t cpuset; + + /* Make sure opal_hwloc_topology has been set by the time we've + been called */ + if (NULL == opal_hwloc_topology) { + return OPAL_ERR_BAD_PARAM; + } + + /* Set the default memory allocation policy according to MCA + param */ + switch (opal_hwloc_base_map) { + case OPAL_HWLOC_BASE_MAP_LOCAL_ONLY: + policy = HWLOC_MEMBIND_BIND; + flags = HWLOC_MEMBIND_STRICT; + break; + + case OPAL_HWLOC_BASE_MAP_NONE: + default: + policy = HWLOC_MEMBIND_DEFAULT; + flags = 0; + break; + } + + cpuset = hwloc_bitmap_alloc(); + if (NULL == cpuset) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + } else { + hwloc_get_cpubind(opal_hwloc_topology, cpuset, 0); + rc = hwloc_set_membind(opal_hwloc_topology, + cpuset, HWLOC_MEMBIND_BIND, flags); + hwloc_bitmap_free(cpuset); + } + + return (0 == rc) ? OPAL_SUCCESS : OPAL_ERROR; +} diff --git a/opal/mca/maffinity/base/maffinity_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c similarity index 81% rename from opal/mca/maffinity/base/maffinity_base_util.c rename to opal/mca/hwloc/base/hwloc_base_util.c index 5b6d612f85..318bd6a5ce 100644 --- a/opal/mca/maffinity/base/maffinity_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,11 +29,11 @@ #include "opal/constants.h" #include "opal/util/show_help.h" -#include "opal/mca/maffinity/maffinity.h" -#include "opal/mca/maffinity/base/base.h" +#include "opal/mca/hwloc/hwloc.h" +#include "opal/mca/hwloc/base/base.h" -int opal_maffinity_base_report_bind_failure(const char *file, +int opal_hwloc_base_report_bind_failure(const char *file, int line, const char *msg, int rc) { @@ -42,9 +43,9 @@ int opal_maffinity_base_report_bind_failure(const char *file, char hostname[64]; gethostname(hostname, sizeof(hostname)); - opal_show_help("help-opal-maffinity-base.txt", "mbind failure", true, + opal_show_help("help-opal-hwloc-base.txt", "mbind failure", true, hostname, getpid(), file, line, msg, - (OPAL_MAFFINITY_BASE_BFA_WARN == opal_maffinity_base_bfa) ? + (OPAL_HWLOC_BASE_BFA_WARN == opal_hwloc_base_bfa) ? "Warning -- your job will continue, but possibly with degraded performance" : "ERROR -- your job may abort or behave erraticly"); already_reported = 1; diff --git a/opal/mca/maffinity/Makefile.am b/opal/mca/maffinity/Makefile.am index f410c769d3..81bedfe104 100644 --- a/opal/mca/maffinity/Makefile.am +++ b/opal/mca/maffinity/Makefile.am @@ -9,7 +9,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -31,8 +31,6 @@ opaldir = $(includedir)/openmpi/$(subdir) nobase_opal_HEADERS = $(headers) endif -dist_pkgdata_DATA = - include base/Makefile.am distclean-local: diff --git a/opal/mca/maffinity/base/Makefile.am b/opal/mca/maffinity/base/Makefile.am index 50f9086486..a97d9fa28b 100644 --- a/opal/mca/maffinity/base/Makefile.am +++ b/opal/mca/maffinity/base/Makefile.am @@ -9,6 +9,7 @@ # University of Stuttgart. All rights reserved. 
# Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -16,9 +17,6 @@ # $HEADER$ # -dist_pkgdata_DATA += \ - base/help-opal-maffinity-base.txt - headers += \ base/base.h @@ -26,5 +24,4 @@ libmca_maffinity_la_SOURCES += \ base/maffinity_base_close.c \ base/maffinity_base_select.c \ base/maffinity_base_open.c \ - base/maffinity_base_util.c \ base/maffinity_base_wrappers.c diff --git a/opal/mca/maffinity/base/base.h b/opal/mca/maffinity/base/base.h index 9e45b7edb3..905ed7e6ee 100644 --- a/opal/mca/maffinity/base/base.h +++ b/opal/mca/maffinity/base/base.h @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -170,34 +171,6 @@ extern int opal_maffinity_base_output; */ OPAL_DECLSPEC extern bool opal_maffinity_setup; -/** - * Enum for what memory allocation policy we want for user allocations. - * MAP = memory allocation policy. - */ -typedef enum { - OPAL_MAFFINITY_BASE_MAP_NONE, - OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY -} opal_maffinity_base_map_t; - -/** - * Global reflecting the MAP (set by MCA param). - */ -OPAL_DECLSPEC extern opal_maffinity_base_map_t opal_maffinity_base_map; - -/** - * Enum for what to do if the maffinity framework tries to bind memory - * and fails. BFA = bind failure action. - */ -typedef enum { - OPAL_MAFFINITY_BASE_BFA_WARN, - OPAL_MAFFINITY_BASE_BFA_ERROR -} opal_maffinity_base_bfa_t; - -/** - * Global reflecting the BFA (set by MCA param). 
- */ -OPAL_DECLSPEC extern opal_maffinity_base_bfa_t opal_maffinity_base_bfa; - END_C_DECLS -#endif /* OPAL_BASE_MAFFINITY_H */ +#endif /* OPAL_MAFFINITY_BASE_H */ diff --git a/opal/mca/maffinity/base/maffinity_base_open.c b/opal/mca/maffinity/base/maffinity_base_open.c index a4fa21fb27..905036c1ff 100644 --- a/opal/mca/maffinity/base/maffinity_base_open.c +++ b/opal/mca/maffinity/base/maffinity_base_open.c @@ -53,10 +53,6 @@ int opal_maffinity_base_output = -1; bool opal_maffinity_base_components_opened_valid = false; opal_list_t opal_maffinity_base_components_opened; bool opal_maffinity_setup = false; -opal_maffinity_base_map_t opal_maffinity_base_map = - OPAL_MAFFINITY_BASE_MAP_NONE; -opal_maffinity_base_bfa_t opal_maffinity_base_bfa = - OPAL_MAFFINITY_BASE_BFA_ERROR; /* * Function for finding and opening either all MCA components, or the one @@ -65,7 +61,6 @@ opal_maffinity_base_bfa_t opal_maffinity_base_bfa = int opal_maffinity_base_open(void) { int int_value; - char *str_value; /* Debugging / verbose output */ @@ -79,55 +74,6 @@ int opal_maffinity_base_open(void) opal_maffinity_base_output = -1; } - /* maffinity_base_mbind_policy */ - switch (opal_maffinity_base_map) { - case OPAL_MAFFINITY_BASE_MAP_NONE: - str_value = "none"; - break; - case OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY: - str_value = "local_only"; - break; - } - mca_base_param_reg_string_name("maffinity", "base_alloc_policy", - "Policy that determines how general memory allocations are bound after MPI_INIT. A value of \"none\" means that no memory policy is applied. A value of \"local_only\" means that all memory allocations will be restricted to the local NUMA node where each process is placed. Note that operating system paging policies are unaffected by this setting. 
For example, if \"local_only\" is used and local NUMA node memory is exhausted, a new memory allocation may cause paging.", - false, false, str_value, &str_value); - if (strcasecmp(str_value, "none") == 0) { - opal_maffinity_base_map = OPAL_MAFFINITY_BASE_MAP_NONE; - } else if (strcasecmp(str_value, "local_only") == 0 || - strcasecmp(str_value, "local-only") == 0) { - opal_maffinity_base_map = OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY; - } else { - char hostname[32]; - gethostname(hostname, sizeof(hostname)); - opal_show_help("help-opal-maffinity-base.txt", "invalid policy", - true, hostname, getpid(), str_value); - return OPAL_ERR_BAD_PARAM; - } - - /* maffinity_base_bind_failure_action */ - switch (opal_maffinity_base_bfa) { - case OPAL_MAFFINITY_BASE_BFA_WARN: - str_value = "warn"; - break; - case OPAL_MAFFINITY_BASE_BFA_ERROR: - str_value = "error"; - break; - } - mca_base_param_reg_string_name("maffinity", "base_bind_failure_action", - "What Open MPI will do if it explicitly tries to bind memory to a specific NUMA location, and fails. Note that this is a different case than the general allocation policy described by maffinity_base_alloc_policy. A value of \"warn\" means that Open MPI will warn the first time this happens, but allow the job to continue (possibly with degraded performance). 
A value of \"error\" means that Open MPI will abort the job if this happens.", - false, false, str_value, &str_value); - if (strcasecmp(str_value, "warn") == 0) { - opal_maffinity_base_bfa = OPAL_MAFFINITY_BASE_BFA_WARN; - } else if (strcasecmp(str_value, "error") == 0) { - opal_maffinity_base_bfa = OPAL_MAFFINITY_BASE_BFA_ERROR; - } else { - char hostname[32]; - gethostname(hostname, sizeof(hostname)); - opal_show_help("help-opal-maffinity-base.txt", "invalid error action", - true, hostname, getpid(), str_value); - return OPAL_ERR_BAD_PARAM; - } - opal_maffinity_base_components_opened_valid = false; /* Open up all available components */ diff --git a/opal/mca/maffinity/hwloc/maffinity_hwloc_module.c b/opal/mca/maffinity/hwloc/maffinity_hwloc_module.c index 2bc604e321..aa11e259a9 100644 --- a/opal/mca/maffinity/hwloc/maffinity_hwloc_module.c +++ b/opal/mca/maffinity/hwloc/maffinity_hwloc_module.c @@ -74,37 +74,9 @@ int opal_maffinity_hwloc_component_query(mca_base_module_t **module, static int hwloc_module_init(void) { - int rc = 0, flags; - hwloc_membind_policy_t policy; - hwloc_cpuset_t cpuset; + /* Nothing to do! */ - /* Set the default memory allocation policy according to MCA param */ - switch (opal_maffinity_base_map) { - case OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY: - policy = HWLOC_MEMBIND_BIND; - flags = HWLOC_MEMBIND_STRICT; - break; - - case OPAL_MAFFINITY_BASE_MAP_NONE: - default: - policy = HWLOC_MEMBIND_DEFAULT; - flags = 0; - break; - - } - - cpuset = hwloc_bitmap_alloc(); - if (NULL == cpuset) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - } else { - hwloc_get_cpubind(opal_hwloc_topology, - cpuset, 0); - rc = hwloc_set_membind(opal_hwloc_topology, - cpuset, HWLOC_MEMBIND_BIND, flags); - hwloc_bitmap_free(cpuset); - } - - return (0 == rc) ? OPAL_SUCCESS : OPAL_ERROR; + return OPAL_SUCCESS; }