1
1
After a long period of development with many starts and stops, we
finally got this where we wanted it.

This commit introduces 2 new MCA params (note that the
"maffinity_libnuma_policy" MCA param introduced by r24290 was removed
when libnuma support was removed).  Remember that maffinity policies
are only in effect when paffinity is enabled -- i.e., when processes
are bound to processors!

 * '''maffinity_base_alloc_policy:''' Policy that determines how
   general memory allocations are bound after MPI_INIT.  A value of
   "none" means that no memory policy is applied.  A value of
   "local_only" means that all memory allocations will be restricted
   to the local NUMA node where each process is placed.  Note that
   operating system paging policies are unaffected by this setting.
   For example, if "local_only" is used and local NUMA node memory is
   exhausted, a new memory allocation may cause paging.
 * '''maffinity_base_bind_failure_action:''' What Open MPI will do if
   it explicitly tries to bind memory to a specific NUMA location, and
   fails.  Note that this is a different case than the general
   allocation policy described by maffinity_base_alloc_policy.  A
   value of "warn" means that Open MPI will warn the first time this
   happens, but allow the job to continue (possibly with degraded
   performance).  A value of "error" means that Open MPI will abort
   the job if this happens.

This needs at least a little soak time on the trunk before going to
v1.5.

This commit was SVN r24639.

The following SVN revision numbers were found above:
  r24290 --> open-mpi/ompi@afa654746c

The following Trac tickets were found above:
  Ticket 2698 --> https://svn.open-mpi.org/trac/ompi/ticket/2698
Этот коммит содержится в:
Jeff Squyres 2011-04-26 13:31:07 +00:00
родитель a1e304b2d6
Коммит d134ff9b4d
10 изменённых файлов: 280 добавлений и 45 удалений

Просмотреть файл

@ -31,6 +31,8 @@ opaldir = $(includedir)/openmpi/$(subdir)
nobase_opal_HEADERS = $(headers)
endif
dist_pkgdata_DATA =
include base/Makefile.am
distclean-local:

Просмотреть файл

@ -16,6 +16,9 @@
# $HEADER$
#
dist_pkgdata_DATA += \
base/help-opal-maffinity-base.txt
headers += \
base/base.h
@ -23,4 +26,5 @@ libmca_maffinity_la_SOURCES += \
base/maffinity_base_close.c \
base/maffinity_base_select.c \
base/maffinity_base_open.c \
base/maffinity_base_util.c \
base/maffinity_base_wrappers.c

Просмотреть файл

@ -108,6 +108,16 @@ OPAL_DECLSPEC int opal_maffinity_base_set(opal_maffinity_base_segment_t *segment
OPAL_DECLSPEC int opal_maffinity_base_node_name_to_id(char *, int *);
OPAL_DECLSPEC int opal_maffinity_base_bind(opal_maffinity_base_segment_t *, size_t, int);
/**
* Report a bind failure using the normal mechanisms if a component
* fails to bind memory -- according to the value of the
* maffinity_base_bind_failure_action MCA parameter.
*
* The "mbind failure" help message is shown only on the first call;
* later calls do not show it again.
*
* @param file Source file name shown in the help message (callers in
*             this change pass __FILE__).
* @param line Source line number shown in the help message (callers in
*             this change pass __LINE__).
* @param msg  Short description of what failed, shown in the help
*             message.
* @param rc   Error code describing the failure.
*
* @return rc on the first call (the one that shows the help message);
*         OPAL_SUCCESS on every later call.
*/
OPAL_DECLSPEC int opal_maffinity_base_report_bind_failure(const char *file,
int line,
const char *msg,
int rc);
/**
* Shut down the maffinity MCA framework.
*
@ -160,6 +170,34 @@ extern int opal_maffinity_base_output;
*/
OPAL_DECLSPEC extern bool opal_maffinity_setup;
/**
* Enum for what memory allocation policy we want for user allocations.
* MAP = memory allocation policy.
*/
typedef enum {
OPAL_MAFFINITY_BASE_MAP_NONE,
OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY
} opal_maffinity_base_map_t;
/**
* Global reflecting the MAP (set by MCA param).
*/
OPAL_DECLSPEC extern opal_maffinity_base_map_t opal_maffinity_base_map;
/**
* Enum for what to do if the maffinity framework tries to bind memory
* and fails. BFA = bind failure action.
*
* The value is selected by the maffinity_base_bind_failure_action MCA
* parameter; the compiled-in default is OPAL_MAFFINITY_BASE_BFA_ERROR.
*/
typedef enum {
/** "warn": warn the first time a bind fails, but let the job
    continue (possibly with degraded performance) */
OPAL_MAFFINITY_BASE_BFA_WARN,
/** "error": per the MCA parameter description, the job is aborted
    if a bind fails */
OPAL_MAFFINITY_BASE_BFA_ERROR
} opal_maffinity_base_bfa_t;
/**
* Global reflecting the BFA (set by MCA param).
*/
OPAL_DECLSPEC extern opal_maffinity_base_bfa_t opal_maffinity_base_bfa;
END_C_DECLS
#endif /* OPAL_BASE_MAFFINITY_H */

Просмотреть файл

@ -0,0 +1,43 @@
# -*- text -*-
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI's maffinity base support
#
[invalid policy]
WARNING: An invalid value was given for the
maffinity_base_alloc_policy MCA parameter. This MCA parameter
determines the policy used for general memory allocations. Note that
this parameter *only* has effect when MPI processes are bound to
specific processors.
The value provided was:
Local host: %s
PID: %d
Value: %s
Valid values are:
none: no memory binding policy is enforced
prefer_local: try allocating memory on the local NUMA node, but
allow spilling over to remote NUMA nodes if necessary
local_only: fail an allocation if it cannot be placed entirely on
the local NUMA node
Your job will now abort.
#
[mbind failure]
Open MPI failed to bind internal memory to a specific NUMA node. This
message will only be reported at most once per process.
Local host: %s
PID: %d
File: %s:%d
Message: %s
Severity: %s

Просмотреть файл

@ -19,8 +19,16 @@
#include "opal_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "opal/constants.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
@ -43,6 +51,10 @@ int opal_maffinity_base_output = -1;
bool opal_maffinity_base_components_opened_valid = false;
opal_list_t opal_maffinity_base_components_opened;
bool opal_maffinity_setup = false;
opal_maffinity_base_map_t opal_maffinity_base_map =
OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY;
opal_maffinity_base_bfa_t opal_maffinity_base_bfa =
OPAL_MAFFINITY_BASE_BFA_ERROR;
/*
* Function for finding and opening either all MCA components, or the one
@ -50,20 +62,70 @@ bool opal_maffinity_setup = false;
*/
int opal_maffinity_base_open(void)
{
int value;
int int_value;
char *str_value;
/* Debugging / verbose output */
mca_base_param_reg_int_name("maffinity", "base_verbose",
"Verbosity level of the maffinity framework",
false, false,
0, &value);
if (0 != value) {
0, &int_value);
if (0 != int_value) {
opal_maffinity_base_output = opal_output_open(NULL);
} else {
opal_maffinity_base_output = -1;
}
/* maffinity_base_alloc_policy */
switch (opal_maffinity_base_map) {
case OPAL_MAFFINITY_BASE_MAP_NONE:
str_value = "none";
break;
case OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY:
str_value = "local_only";
break;
}
mca_base_param_reg_string_name("maffinity", "base_alloc_policy",
"Policy that determines how general memory allocations are bound after MPI_INIT. A value of \"none\" means that no memory policy is applied. A value of \"local_only\" means that all memory allocations will be restricted to the local NUMA node where each process is placed. Note that operating system paging policies are unaffected by this setting. For example, if \"local_only\" is used and local NUMA node memory is exhausted, a new memory allocation may cause paging.",
false, false, str_value, &str_value);
if (strcasecmp(str_value, "none") == 0) {
opal_maffinity_base_map = OPAL_MAFFINITY_BASE_MAP_NONE;
} else if (strcasecmp(str_value, "local_only") == 0 ||
strcasecmp(str_value, "local-only") == 0) {
opal_maffinity_base_map = OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY;
} else {
char hostname[32];
gethostname(hostname, sizeof(hostname));
opal_show_help("help-opal-maffinity-base.txt", "invalid policy",
true, hostname, getpid(), str_value);
return OPAL_ERR_BAD_PARAM;
}
/* maffinity_base_bind_failure_action */
switch (opal_maffinity_base_bfa) {
case OPAL_MAFFINITY_BASE_BFA_WARN:
str_value = "warn";
break;
case OPAL_MAFFINITY_BASE_BFA_ERROR:
str_value = "error";
break;
}
mca_base_param_reg_string_name("maffinity", "base_bind_failure_action",
"What Open MPI will do if it explicitly tries to bind memory to a specific NUMA location, and fails. Note that this is a different case than the general allocation policy described by maffinity_base_alloc_policy. A value of \"warn\" means that Open MPI will warn the first time this happens, but allow the job to continue (possibly with degraded performance). A value of \"error\" means that Open MPI will abort the job if this happens.",
false, false, str_value, &str_value);
if (strcasecmp(str_value, "warn") == 0) {
opal_maffinity_base_bfa = OPAL_MAFFINITY_BASE_BFA_WARN;
} else if (strcasecmp(str_value, "error") == 0) {
opal_maffinity_base_bfa = OPAL_MAFFINITY_BASE_BFA_ERROR;
} else {
char hostname[32];
gethostname(hostname, sizeof(hostname));
opal_show_help("help-opal-maffinity-base.txt", "invalid error action",
true, hostname, getpid(), str_value);
return OPAL_ERR_BAD_PARAM;
}
opal_maffinity_base_components_opened_valid = false;
/* Open up all available components */

Просмотреть файл

@ -0,0 +1,55 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "opal/constants.h"
#include "opal/util/show_help.h"
#include "opal/mca/maffinity/maffinity.h"
#include "opal/mca/maffinity/base/base.h"
/**
 * Report a failure to bind memory to a NUMA location.
 *
 * The "mbind failure" help message is shown on the first call only.  It
 * carries the local host name, the PID, the file/line/message supplied
 * by the caller, and a severity string chosen from the current value of
 * opal_maffinity_base_bfa (warn vs. error).
 *
 * @param file Source file name to show in the message.
 * @param line Source line number to show in the message.
 * @param msg  Short description of what failed.
 * @param rc   Error code describing the failure.
 *
 * @return rc on the first call (the one that shows the message);
 *         OPAL_SUCCESS on every later call.
 */
int opal_maffinity_base_report_bind_failure(const char *file,
                                            int line,
                                            const char *msg, int rc)
{
    static int already_reported = 0;

    if (!already_reported) {
        char hostname[64];

        /* The buffer contents are not usable if gethostname() fails,
           and a truncated name is not guaranteed to be
           NUL-terminated; make sure we always hand a valid C string
           to the "%s" in the help message. */
        if (0 != gethostname(hostname, sizeof(hostname))) {
            hostname[0] = '\0';
        }
        hostname[sizeof(hostname) - 1] = '\0';

        opal_show_help("help-opal-maffinity-base.txt", "mbind failure", true,
                       hostname, getpid(), file, line, msg,
                       (OPAL_MAFFINITY_BASE_BFA_WARN == opal_maffinity_base_bfa) ?
                       "Warning -- your job will continue, but possibly with degraded performance" :
                       "ERROR -- your job may abort or behave erratically");
        already_reported = 1;
        return rc;
    }

    /* NOTE(review): once the message has been shown, later bind
       failures are reported to the caller as OPAL_SUCCESS regardless
       of the bind failure action -- confirm this is intended for the
       "error" action. */
    return OPAL_SUCCESS;
}

Просмотреть файл

@ -36,7 +36,6 @@ typedef struct {
/* This component's data */
int priority;
int bind_policy;
hwloc_topology_t topology;
bool topology_need_destroy;
} opal_maffinity_hwloc_component_2_0_0_t;

Просмотреть файл

@ -73,8 +73,6 @@ opal_maffinity_hwloc_component_2_0_0_t mca_maffinity_hwloc_component = {
/* Priority */
40,
/* Default binding policy */
HWLOC_MEMBIND_STRICT,
/* NULL fill the rest of the component data */
};
@ -83,7 +81,6 @@ opal_maffinity_hwloc_component_2_0_0_t mca_maffinity_hwloc_component = {
static int hwloc_register(void)
{
int i;
char *val, *mca_policy;
/* Call the registration function of common hwloc */
opal_common_hwloc_register();
@ -101,25 +98,6 @@ static int hwloc_register(void)
false, false, 40,
&mca_maffinity_hwloc_component.priority);
/* Default memory binding policy */
val = (HWLOC_MEMBIND_STRICT == mca_maffinity_hwloc_component.bind_policy ?
"strict" : "loose");
mca_base_param_reg_string(&mca_maffinity_hwloc_component.base.base_version,
"policy",
"Binding policy that determines what happens if memory is unavailable on the local NUMA node. A value of \"strict\" means that the memory allocation will fail; a value of \"loose\" means that the memory allocation will spill over to another NUMA node.",
false, false, val, &mca_policy);
if (strcasecmp(mca_policy, "loose") == 0) {
mca_maffinity_hwloc_component.bind_policy = 0;
} else if (strcasecmp(mca_policy, "strict") == 0) {
mca_maffinity_hwloc_component.bind_policy = HWLOC_MEMBIND_STRICT;
} else {
opal_show_help("help-opal-maffinity-hwloc.txt", "invalid policy",
true, mca_policy, getpid());
mca_maffinity_hwloc_component.bind_policy = HWLOC_MEMBIND_STRICT;
return OPAL_ERR_BAD_PARAM;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -43,7 +43,7 @@ static int hwloc_module_init(void);
static int hwloc_module_set(opal_maffinity_base_segment_t *segments,
size_t num_segments);
static int hwloc_module_node_name_to_id(char *, int *);
static int hwloc_modules_bind(opal_maffinity_base_segment_t *, size_t, int);
static int hwloc_module_bind(opal_maffinity_base_segment_t *, size_t, int);
/*
* Hwloc maffinity module
@ -55,7 +55,7 @@ static const opal_maffinity_base_module_1_0_0_t local_module = {
/* Module function pointers */
hwloc_module_set,
hwloc_module_node_name_to_id,
hwloc_modules_bind
hwloc_module_bind
};
int opal_maffinity_hwloc_component_query(mca_base_module_t **module,
@ -74,16 +74,35 @@ int opal_maffinity_hwloc_component_query(mca_base_module_t **module,
static int hwloc_module_init(void)
{
int rc;
int rc = 0, flags;
hwloc_membind_policy_t policy;
hwloc_cpuset_t cpuset;
/* Set the default memory binding policy to allocate locally */
/* Set the default memory allocation policy according to MCA param */
switch (opal_maffinity_base_map) {
case OPAL_MAFFINITY_BASE_MAP_LOCAL_ONLY:
policy = HWLOC_MEMBIND_BIND;
flags = HWLOC_MEMBIND_STRICT;
break;
case OPAL_MAFFINITY_BASE_MAP_NONE:
default:
policy = HWLOC_MEMBIND_DEFAULT;
flags = 0;
break;
}
cpuset = hwloc_bitmap_alloc();
hwloc_get_cpubind(mca_maffinity_hwloc_component.topology, cpuset, 0);
rc = hwloc_set_membind(mca_maffinity_hwloc_component.topology,
cpuset, HWLOC_MEMBIND_BIND,
mca_maffinity_hwloc_component.bind_policy);
hwloc_bitmap_free(cpuset);
if (NULL == cpuset) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
} else {
hwloc_get_cpubind(mca_maffinity_hwloc_component.topology,
cpuset, 0);
rc = hwloc_set_membind(mca_maffinity_hwloc_component.topology,
cpuset, HWLOC_MEMBIND_BIND, flags);
hwloc_bitmap_free(cpuset);
}
return (0 == rc) ? OPAL_SUCCESS : OPAL_ERROR;
}
@ -92,13 +111,20 @@ static int hwloc_module_init(void)
static int hwloc_module_set(opal_maffinity_base_segment_t *segments,
size_t num_segments)
{
int rc = OPAL_SUCCESS;
char *msg = NULL;
size_t i;
hwloc_cpuset_t cpuset;
hwloc_cpuset_t cpuset = NULL;
/* This module won't be used unless the process is already
processor-bound. So find out where we're processor bound, and
bind our memory there, too. */
cpuset = hwloc_bitmap_alloc();
if (NULL == cpuset) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
msg = "hwloc_bitmap_alloc() failure";
goto out;
}
hwloc_get_cpubind(mca_maffinity_hwloc_component.topology, cpuset, 0);
for (i = 0; i < num_segments; ++i) {
if (0 != hwloc_set_area_membind(mca_maffinity_hwloc_component.topology,
@ -106,12 +132,20 @@ static int hwloc_module_set(opal_maffinity_base_segment_t *segments,
segments[i].mbs_len, cpuset,
HWLOC_MEMBIND_BIND,
HWLOC_MEMBIND_STRICT)) {
hwloc_bitmap_free(cpuset);
return OPAL_ERROR;
rc = OPAL_ERROR;
msg = "hwloc_set_area_membind() failure";
goto out;
}
}
hwloc_bitmap_free(cpuset);
out:
if (NULL != cpuset) {
hwloc_bitmap_free(cpuset);
}
if (OPAL_SUCCESS != rc) {
return opal_maffinity_base_report_bind_failure(__FILE__, __LINE__,
msg, rc);
}
return OPAL_SUCCESS;
}
@ -123,13 +157,20 @@ static int hwloc_module_node_name_to_id(char *node_name, int *id)
return OPAL_SUCCESS;
}
static int hwloc_modules_bind(opal_maffinity_base_segment_t *segs,
size_t count, int node_id)
static int hwloc_module_bind(opal_maffinity_base_segment_t *segs,
size_t count, int node_id)
{
size_t i;
hwloc_cpuset_t cpuset;
int rc = OPAL_SUCCESS;
char *msg = NULL;
hwloc_cpuset_t cpuset = NULL;
cpuset = hwloc_bitmap_alloc();
if (NULL == cpuset) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
msg = "hwloc_bitmap_alloc() failure";
goto out;
}
hwloc_bitmap_set(cpuset, node_id);
for(i = 0; i < count; i++) {
if (0 != hwloc_set_area_membind(mca_maffinity_hwloc_component.topology,
@ -137,11 +178,19 @@ static int hwloc_modules_bind(opal_maffinity_base_segment_t *segs,
segs[i].mbs_len, cpuset,
HWLOC_MEMBIND_BIND,
HWLOC_MEMBIND_STRICT)) {
hwloc_bitmap_free(cpuset);
return OPAL_ERROR;
rc = OPAL_ERROR;
msg = "hwloc_set_area_membind() failure";
goto out;
}
}
hwloc_bitmap_free(cpuset);
out:
if (NULL != cpuset) {
hwloc_bitmap_free(cpuset);
}
if (OPAL_SUCCESS != rc) {
return opal_maffinity_base_report_bind_failure(__FILE__, __LINE__,
msg, rc);
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -62,6 +62,11 @@
/**
* Module initialization function. Should return OPAL_SUCCESS.
*
* This function should act on the value of the MCA parameter
* maffinity_base_alloc_policy (stored in the global
* opal_maffinity_base_map, declared in
* opal/mca/maffinity/base/base.h).
*/
typedef int (*opal_maffinity_base_module_init_1_0_0_fn_t)(void);