1
1

btl/openib: limit each process to a ppn fraction of the available registered memory when using mellanox hardware (mlx4 and mthca). fixed

This commit was SVN r26811.
Этот коммит содержится в:
Nathan Hjelm 2012-07-19 17:52:21 +00:00
родитель 66fe57f746
Коммит cd2cbdca09
4 изменённых файлов: 135 добавлений и 2 удалений

Просмотреть файл

@ -56,6 +56,9 @@
#include "ompi/mca/mpool/grdma/mpool_grdma.h"
#include "orte/util/proc_info.h"
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <math.h>
#ifdef HAVE_SYS_TYPES_H
@ -70,6 +73,10 @@
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef OPAL_HAVE_HWLOC
#include "opal/mca/hwloc/hwloc.h"
#endif
#ifndef MIN
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif
@ -579,6 +586,86 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
return OMPI_SUCCESS;
}
/**
 * Read a single unsigned decimal integer from a Linux module parameter file.
 *
 * @param file   path of the file to read
 * @param value  fallback returned when the file cannot be opened or read,
 *               or when its contents do not start with a decimal number
 *
 * @return the parsed number, or @a value on any failure
 */
static uint64_t read_module_param(char *file, uint64_t value)
{
    char buffer[64];
    char *end = NULL;
    ssize_t nread;
    uint64_t ret;
    int fd;

    fd = open(file, O_RDONLY);
    if (0 > fd) {
        return value;
    }

    /* read() does not NUL-terminate, so keep one byte free for the terminator */
    nread = read (fd, buffer, sizeof(buffer) - 1);
    close (fd);
    if (0 >= nread) {
        /* read error or empty file: nothing to parse */
        return value;
    }
    buffer[nread] = '\0';

    errno = 0;
    ret = strtoull(buffer, &end, 10);

    /* end == buffer means strtoull consumed no digits at all */
    return (0 == errno && end != buffer) ? ret : value;
}
/**
 * Report the total memory of the machine as known to hwloc.
 *
 * @return the total_memory value of the first HWLOC_OBJ_MACHINE object in
 *         opal_hwloc_topology, or 0 when no such object is found or when
 *         OPAL_HAVE_HWLOC is not enabled
 */
static uint64_t calculate_total_mem (void)
{
    uint64_t total = 0;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t root;

    root = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
    if (NULL != root) {
        total = root->memory.total_memory;
    }
#endif

    /* stays 0 when hwloc support is compiled out or no machine object exists */
    return total;
}
/**
 * Estimate how much memory can be registered on this node.
 *
 * When the mlx4_core or ib_mthca module parameter directory exists the limit
 * is computed as
 *     (number of MTT entries - reserved entries) * page size * MTTs per segment
 * from the module's parameter files. For any other configuration the total
 * memory reported by calculate_total_mem() is used as the limit.
 *
 * The "reg mem limit low" help message is shown when the limit is below 75%
 * of the total memory.
 *
 * @return 7/8 of the computed limit, in bytes
 */
static uint64_t calculate_max_reg (void)
{
    struct stat statinfo;
    uint64_t mtts_per_seg = 1;
    uint64_t num_mtt = 1ull << 19;
    uint64_t reserved_mtt = 0;
    uint64_t max_reg, mem_total;

    mem_total = calculate_total_mem ();

    if (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) {
        /* shift a 64-bit one: "1 << n" is an int shift and overflows for n >= 31 */
        mtts_per_seg = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
        num_mtt = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);
        if (1 == num_mtt) {
            /* log_num_mtt reads as 0: fall back to 2^20 entries.
             * NOTE(review): the earlier comment here asked whether 19 is the
             * minimum and said to use 19, but the code has always used 20 --
             * confirm which value is intended. */
            num_mtt = 1ull << 20;
        }

        max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
    } else if (0 == stat("/sys/module/ib_mthca/parameters", &statinfo)) {
        mtts_per_seg = 1ull << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1);
        num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20);
        reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0);
        if (reserved_mtt > num_mtt) {
            /* keep the unsigned subtraction below from wrapping around */
            reserved_mtt = num_mtt;
        }

        max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
    } else {
        /* need to update to determine the registration limit for this configuration */
        max_reg = mem_total;
    }

    /* NTH: print a warning if we can't register more than 75% of physical memory */
    if (max_reg < mem_total * 3 / 4) {
        orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
                       orte_process_info.nodename, (unsigned long)(max_reg >> 20),
                       (unsigned long)(mem_total >> 20));
    }

    /* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */
    return (max_reg * 7) >> 3;
}
/*
* add a proc to this btl module
* creates an endpoint that is setup on the
@ -592,7 +679,7 @@ int mca_btl_openib_add_procs(
opal_bitmap_t* reachable)
{
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
int i,j, rc;
int i,j, rc, local_procs;
int rem_subnet_id_port_cnt;
int lcl_subnet_id_port_cnt = 0;
int btl_rank = 0;
@ -621,13 +708,17 @@ int mca_btl_openib_add_procs(
}
#endif
for (i = 0; i < (int) nprocs; i++) {
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
struct ompi_proc_t* ompi_proc = ompi_procs[i];
mca_btl_openib_proc_t* ib_proc;
int remote_matching_port;
opal_output(-1, "add procs: adding proc %d", i);
if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
local_procs ++;
}
/* OOB, XOOB, and RDMACM do not support SELF communication, so
* mark the proc as unreachable by openib btl */
if (OPAL_EQUAL == orte_util_compare_name_fields
@ -794,6 +885,9 @@ int mca_btl_openib_add_procs(
peers[i] = endpoint;
}
openib_btl->local_procs += local_procs;
openib_btl->device->mem_reg_max = calculate_max_reg () / openib_btl->local_procs;
return mca_btl_openib_size_queues(openib_btl, nprocs);
}

Просмотреть файл

@ -390,6 +390,8 @@ typedef struct mca_btl_openib_device_t {
mca_btl_openib_device_qp_t *qps;
/* Maximum value supported by this device for max_inline_data */
uint32_t max_inline_data;
/* Registration limit and current count */
uint64_t mem_reg_max, mem_reg_active;
} mca_btl_openib_device_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_device_t);
@ -467,6 +469,8 @@ struct mca_btl_openib_module_t {
mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
mca_btl_openib_module_qp_t * qps;
int local_procs; /** number of local procs */
};
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;

Просмотреть файл

@ -596,6 +596,13 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
if (device->mem_reg_max &&
device->mem_reg_max < (device->mem_reg_active + size)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
device->mem_reg_active += size;
#if HAVE_DECL_IBV_ACCESS_SO
if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) {
access_flag |= IBV_ACCESS_SO;
@ -620,6 +627,7 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
{
mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data;
mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg;
if(openib_reg->mr != NULL) {
@ -637,6 +645,9 @@ static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
#endif
}
device->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1);
openib_reg->mr = NULL;
return OMPI_SUCCESS;
}
@ -818,6 +829,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
openib_btl->cpcs = NULL;
openib_btl->num_cpcs = 0;
openib_btl->local_procs = 0;
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
@ -1670,6 +1682,10 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
return OMPI_ERR_OUT_OF_RESOURCE;
}
device->mem_reg_active = 0;
/* NTH: set some high default until we know how many local peers we have */
device->mem_reg_max = 1ull << 48;
device->ib_dev = ib_dev;
device->ib_dev_context = ibv_open_device(ib_dev);
device->ib_pd = NULL;

Просмотреть файл

@ -689,3 +689,22 @@ device).
Use "ibv_devinfo -v" on the local host to see the GID table of this
device.
[reg mem limit low]
WARNING: It appears that your OpenFabrics subsystem is configured to only
allow registering part of your physical memory. This can cause MPI jobs to
run with erratic performance, hang, and/or crash.
This may be caused by your OpenFabrics vendor limiting the amount of
physical memory that can be registered. You should investigate the
relevant Linux kernel module parameters that control how much physical
memory can be registered, and increase them to allow registering all
physical memory on your machine.
See this Open MPI FAQ item for more information on these Linux kernel module
parameters:
http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
Local host: %s
Registerable memory: %lu MiB
Total memory: %lu MiB