1
1

btl/openib: limit each process to a ppn fraction of the available registered memory when using mellanox hardware (mlx4 and mthca)

This commit was SVN r26804.
Этот коммит содержится в:
Nathan Hjelm 2012-07-18 17:29:48 +00:00
родитель 4a97ecbdd2
Коммит 610be870f9
4 изменённых файлов: 110 добавлений и 2 удалений

Просмотреть файл

@ -70,6 +70,10 @@
#ifdef HAVE_UNISTD_H #ifdef HAVE_UNISTD_H
#include <unistd.h> #include <unistd.h>
#endif #endif
#ifdef OPAL_HAVE_HWLOC
#include "opal/mca/hwloc/hwloc.h"
#endif
#ifndef MIN #ifndef MIN
#define MIN(a,b) ((a)<(b)?(a):(b)) #define MIN(a,b) ((a)<(b)?(a):(b))
#endif #endif
@ -579,6 +583,65 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* calculate memory registration limits */
/*
 * Report the total memory of the local machine as recorded in the hwloc
 * topology (the MACHINE object's memory.total_memory field).
 *
 * Returns 0 when hwloc support is compiled out or when the topology has
 * no MACHINE object, i.e. when the amount of memory is unknown.
 */
static uint64_t calculate_total_mem (void)
{
    uint64_t total = 0;

#if OPAL_HAVE_HWLOC
    hwloc_obj_t root_obj = hwloc_get_next_obj_by_type (opal_hwloc_topology,
                                                       HWLOC_OBJ_MACHINE, NULL);

    if (NULL != root_obj) {
        total = root_obj->memory.total_memory;
    }
#endif

    return total;
}
/*
 * Estimate how much memory may be registered on this node and return the
 * share this code is willing to use.
 *
 * The estimate is (num_mtt - reserved_mtt) * page size * mtts_per_seg, with
 * the inputs read from the kernel module parameters of whichever driver is
 * present:
 *   - mlx4_core: log_mtts_per_seg and log_num_mtt (both stored as log2)
 *   - ib_mthca:  log_mtts_per_seg (log2), num_mtt and fmr_reserved_mtts
 * When neither parameter directory exists the estimate falls back to the
 * total machine memory reported by calculate_total_mem ().
 *
 * A "reg mem limit low" help message is shown when the estimate is below
 * 75% of total memory.
 *
 * Returns 7/8 (87.5%) of the estimate. The result is 0 when no driver was
 * detected and the total memory is unknown.
 */
static uint64_t calculate_max_reg (void)
{
    struct stat statinfo;
    uint64_t mtts_per_seg = 1;
    uint64_t num_mtt = 1ull << 19;
    uint64_t reserved_mtt = 0;
    uint64_t max_reg, mem_total;

    mem_total = calculate_total_mem ();

    /* The log2 parameters are expanded with a 64-bit shift operand: a plain
     * "1 << n" is evaluated as int and is undefined for n >= 31, which would
     * corrupt the limit on hosts configured with large MTT tables. */
    if (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) {
        mtts_per_seg = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
        num_mtt = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);
        if (1 == num_mtt) {
            /* NTH: log_num_mtt read back as 0; assume 2^20 MTT entries in
             * that case (TODO confirm the driver's actual default/minimum) */
            num_mtt = 1ull << 20;
        }

        max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
    } else if (0 == stat("/sys/module/ib_mthca/parameters", &statinfo)) {
        mtts_per_seg = 1ull << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1);
        /* mthca exposes the MTT count directly rather than as a log2 value */
        num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20);
        reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0);

        max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
    } else {
        /* need to update to determine the registration limit for this configuration */
        max_reg = mem_total;
    }

    /* NTH: print a warning if we can't register more than 75% of physical memory */
    if (max_reg < mem_total * 3 / 4) {
        orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
                       orte_process_info.nodename, (unsigned long)(max_reg >> 20),
                       (unsigned long)(mem_total >> 20));
    }

    /* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */
    return (max_reg * 7) >> 3;
}
/* /*
* add a proc to this btl module * add a proc to this btl module
* creates an endpoint that is setup on the * creates an endpoint that is setup on the
@ -592,7 +655,7 @@ int mca_btl_openib_add_procs(
opal_bitmap_t* reachable) opal_bitmap_t* reachable)
{ {
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl; mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
int i,j, rc; int i,j, rc, local_procs;
int rem_subnet_id_port_cnt; int rem_subnet_id_port_cnt;
int lcl_subnet_id_port_cnt = 0; int lcl_subnet_id_port_cnt = 0;
int btl_rank = 0; int btl_rank = 0;
@ -621,13 +684,17 @@ int mca_btl_openib_add_procs(
} }
#endif #endif
for (i = 0; i < (int) nprocs; i++) { for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
struct ompi_proc_t* ompi_proc = ompi_procs[i]; struct ompi_proc_t* ompi_proc = ompi_procs[i];
mca_btl_openib_proc_t* ib_proc; mca_btl_openib_proc_t* ib_proc;
int remote_matching_port; int remote_matching_port;
opal_output(-1, "add procs: adding proc %d", i); opal_output(-1, "add procs: adding proc %d", i);
if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
local_procs ++;
}
/* OOB, XOOB, and RDMACM do not support SELF comunication, so /* OOB, XOOB, and RDMACM do not support SELF comunication, so
* mark the prco as unreachable by openib btl */ * mark the prco as unreachable by openib btl */
if (OPAL_EQUAL == orte_util_compare_name_fields if (OPAL_EQUAL == orte_util_compare_name_fields
@ -794,6 +861,9 @@ int mca_btl_openib_add_procs(
peers[i] = endpoint; peers[i] = endpoint;
} }
openib_btl->local_procs += local_procs;
openib_btl->device->mem_reg_max = calculate_max_reg () / openib_btl->local_procs;
return mca_btl_openib_size_queues(openib_btl, nprocs); return mca_btl_openib_size_queues(openib_btl, nprocs);
} }

Просмотреть файл

@ -390,6 +390,8 @@ typedef struct mca_btl_openib_device_t {
mca_btl_openib_device_qp_t *qps; mca_btl_openib_device_qp_t *qps;
/* Maximum value supported by this device for max_inline_data */ /* Maximum value supported by this device for max_inline_data */
uint32_t max_inline_data; uint32_t max_inline_data;
/* Registration limit and current count */
uint64_t mem_reg_max, mem_reg_active;
} mca_btl_openib_device_t; } mca_btl_openib_device_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_device_t); OBJ_CLASS_DECLARATION(mca_btl_openib_device_t);
@ -467,6 +469,8 @@ struct mca_btl_openib_module_t {
mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */ mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
mca_btl_openib_module_qp_t * qps; mca_btl_openib_module_qp_t * qps;
int local_procs; /** number of local procs */
}; };
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;

Просмотреть файл

@ -596,6 +596,13 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE | enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
if (device->mem_reg_max &&
device->mem_reg_max < (device->mem_reg_active + size)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
device->mem_reg_active += size;
#if HAVE_DECL_IBV_ACCESS_SO #if HAVE_DECL_IBV_ACCESS_SO
if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) { if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) {
access_flag |= IBV_ACCESS_SO; access_flag |= IBV_ACCESS_SO;
@ -637,6 +644,9 @@ static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
#endif #endif
} }
device->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1);
openib_reg->mr = NULL; openib_reg->mr = NULL;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -818,6 +828,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
openib_btl->cpcs = NULL; openib_btl->cpcs = NULL;
openib_btl->num_cpcs = 0; openib_btl->num_cpcs = 0;
openib_btl->local_procs = 0;
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control; mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL; mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
@ -1670,6 +1681,10 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
device->mem_reg_active = 0;
/* NTH: set some high default until we know how many local peers we have */
device->mem_reg_max = 1ull << 48;
device->ib_dev = ib_dev; device->ib_dev = ib_dev;
device->ib_dev_context = ibv_open_device(ib_dev); device->ib_dev_context = ibv_open_device(ib_dev);
device->ib_pd = NULL; device->ib_pd = NULL;

Просмотреть файл

@ -689,3 +689,22 @@ device).
Use "ibv_devinfo -v" on the local host to see the GID table of this Use "ibv_devinfo -v" on the local host to see the GID table of this
device. device.
[reg mem limit low]
WARNING: It appears that your OpenFabrics subsystem is configured to only
allow registering part of your physical memory. This can cause MPI jobs to
run with erratic performance, hang, and/or crash.
This may be caused by your OpenFabrics vendor limiting the amount of
physical memory that can be registered. You should investigate the
relevant Linux kernel module parameters that control how much physical
memory can be registered, and increase them to allow registering all
physical memory on your machine.
See this Open MPI FAQ item for more information on these Linux kernel module
parameters:
http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
Local host: %s
Registerable memory: %lu MiB
Total memory: %lu MiB