btl/openib: limit each process to a ppn fraction of the available registered memory when using mellanox hardware (mlx4 and mthca). fixed
This commit was SVN r26811.
Этот коммит содержится в:
родитель
66fe57f746
Коммит
cd2cbdca09
@ -56,6 +56,9 @@
|
||||
#include "ompi/mca/mpool/grdma/mpool_grdma.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
@ -70,6 +73,10 @@
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef OPAL_HAVE_HWLOC
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a,b) ((a)<(b)?(a):(b))
|
||||
#endif
|
||||
@ -579,6 +586,86 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* read a single integer from a linux module parameters file */
|
||||
static uint64_t read_module_param(char *file, uint64_t value)
|
||||
{
|
||||
int fd = open(file, O_RDONLY);
|
||||
char buffer[64];
|
||||
uint64_t ret;
|
||||
|
||||
if (0 > fd) {
|
||||
return value;
|
||||
}
|
||||
|
||||
read (fd, buffer, 64);
|
||||
|
||||
close (fd);
|
||||
|
||||
errno = 0;
|
||||
ret = strtoull(buffer, NULL, 10);
|
||||
|
||||
return (0 == errno) ? ret : value;
|
||||
}
|
||||
|
||||
/* calculate memory registration limits */
|
||||
/*
 * Report the total memory of the machine object in the hwloc topology.
 *
 * Returns 0 when hwloc support is compiled out or when no machine object
 * is found in the topology.
 */
static uint64_t calculate_total_mem (void)
{
    uint64_t total_bytes = 0;

#if OPAL_HAVE_HWLOC
    hwloc_obj_t root_obj = hwloc_get_next_obj_by_type (opal_hwloc_topology,
                                                       HWLOC_OBJ_MACHINE, NULL);

    if (NULL != root_obj) {
        total_bytes = root_obj->memory.total_memory;
    }
#endif

    return total_bytes;
}
|
||||
|
||||
static uint64_t calculate_max_reg (void)
|
||||
{
|
||||
struct stat statinfo;
|
||||
uint64_t mtts_per_seg = 1;
|
||||
uint64_t num_mtt = 1 << 19;
|
||||
uint64_t reserved_mtt = 0;
|
||||
uint64_t max_reg, mem_total;
|
||||
|
||||
mem_total = calculate_total_mem ();
|
||||
|
||||
if (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) {
|
||||
mtts_per_seg = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
|
||||
num_mtt = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);
|
||||
if (1 == num_mtt) {
|
||||
/* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */
|
||||
num_mtt = 1 << 20;
|
||||
}
|
||||
|
||||
max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
|
||||
} else if (0 == stat("/sys/module/ib_mthca/parameters", &statinfo)) {
|
||||
mtts_per_seg = 1 << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1);
|
||||
num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20);
|
||||
reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0);
|
||||
|
||||
max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
|
||||
} else {
|
||||
/* need to update to determine the registration limit for this configuration */
|
||||
max_reg = mem_total;
|
||||
}
|
||||
|
||||
/* NTH: print a warning if we can't register more than 75% of physical memory */
|
||||
if (max_reg < mem_total * 3 / 4) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
|
||||
orte_process_info.nodename, (unsigned long)(max_reg >> 20),
|
||||
(unsigned long)(mem_total >> 20));
|
||||
}
|
||||
|
||||
/* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */
|
||||
return (max_reg * 7) >> 3;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* add a proc to this btl module
|
||||
* creates an endpoint that is setup on the
|
||||
@ -592,7 +679,7 @@ int mca_btl_openib_add_procs(
|
||||
opal_bitmap_t* reachable)
|
||||
{
|
||||
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
|
||||
int i,j, rc;
|
||||
int i,j, rc, local_procs;
|
||||
int rem_subnet_id_port_cnt;
|
||||
int lcl_subnet_id_port_cnt = 0;
|
||||
int btl_rank = 0;
|
||||
@ -621,13 +708,17 @@ int mca_btl_openib_add_procs(
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i = 0; i < (int) nprocs; i++) {
|
||||
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
|
||||
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
||||
mca_btl_openib_proc_t* ib_proc;
|
||||
int remote_matching_port;
|
||||
|
||||
opal_output(-1, "add procs: adding proc %d", i);
|
||||
|
||||
if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
|
||||
local_procs ++;
|
||||
}
|
||||
|
||||
/* OOB, XOOB, and RDMACM do not support SELF communication, so
|
||||
* mark the proc as unreachable by openib btl */
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields
|
||||
@ -794,6 +885,9 @@ int mca_btl_openib_add_procs(
|
||||
peers[i] = endpoint;
|
||||
}
|
||||
|
||||
openib_btl->local_procs += local_procs;
|
||||
openib_btl->device->mem_reg_max = calculate_max_reg () / openib_btl->local_procs;
|
||||
|
||||
return mca_btl_openib_size_queues(openib_btl, nprocs);
|
||||
}
|
||||
|
||||
|
@ -390,6 +390,8 @@ typedef struct mca_btl_openib_device_t {
|
||||
mca_btl_openib_device_qp_t *qps;
|
||||
/* Maximum value supported by this device for max_inline_data */
|
||||
uint32_t max_inline_data;
|
||||
/* Registration limit and current count */
|
||||
uint64_t mem_reg_max, mem_reg_active;
|
||||
} mca_btl_openib_device_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_openib_device_t);
|
||||
|
||||
@ -467,6 +469,8 @@ struct mca_btl_openib_module_t {
|
||||
mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
|
||||
|
||||
mca_btl_openib_module_qp_t * qps;
|
||||
|
||||
int local_procs; /** number of local procs */
|
||||
};
|
||||
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
|
||||
|
||||
|
@ -596,6 +596,13 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
|
||||
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
|
||||
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
|
||||
|
||||
if (device->mem_reg_max &&
|
||||
device->mem_reg_max < (device->mem_reg_active + size)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
device->mem_reg_active += size;
|
||||
|
||||
#if HAVE_DECL_IBV_ACCESS_SO
|
||||
if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) {
|
||||
access_flag |= IBV_ACCESS_SO;
|
||||
@ -620,6 +627,7 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
|
||||
|
||||
static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
{
|
||||
mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data;
|
||||
mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg;
|
||||
|
||||
if(openib_reg->mr != NULL) {
|
||||
@ -637,6 +645,9 @@ static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
device->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1);
|
||||
|
||||
openib_reg->mr = NULL;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -818,6 +829,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
|
||||
openib_btl->cpcs = NULL;
|
||||
openib_btl->num_cpcs = 0;
|
||||
openib_btl->local_procs = 0;
|
||||
|
||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
|
||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
|
||||
@ -1670,6 +1682,10 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
device->mem_reg_active = 0;
|
||||
/* NTH: set some high default until we know how many local peers we have */
|
||||
device->mem_reg_max = 1ull << 48;
|
||||
|
||||
device->ib_dev = ib_dev;
|
||||
device->ib_dev_context = ibv_open_device(ib_dev);
|
||||
device->ib_pd = NULL;
|
||||
|
@ -689,3 +689,22 @@ device).
|
||||
|
||||
Use "ibv_devinfo -v" on the local host to see the GID table of this
|
||||
device.
|
||||
[reg mem limit low]
|
||||
WARNING: It appears that your OpenFabrics subsystem is configured to only
|
||||
allow registering part of your physical memory. This can cause MPI jobs to
|
||||
run with erratic performance, hang, and/or crash.
|
||||
|
||||
This may be caused by your OpenFabrics vendor limiting the amount of
|
||||
physical memory that can be registered. You should investigate the
|
||||
relevant Linux kernel module parameters that control how much physical
|
||||
memory can be registered, and increase them to allow registering all
|
||||
physical memory on your machine.
|
||||
|
||||
See this Open MPI FAQ item for more information on these Linux kernel module
|
||||
parameters:
|
||||
|
||||
http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
|
||||
|
||||
Local host: %s
|
||||
Registerable memory: %lu MiB
|
||||
Total memory: %lu MiB
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user