btl/openib: limit each process to a ppn fraction of the available registered memory when using mellanox hardware (mlx4 and mthca)
This commit was SVN r26804.
Этот коммит содержится в:
родитель
4a97ecbdd2
Коммит
610be870f9
@ -70,6 +70,10 @@
|
|||||||
#ifdef HAVE_UNISTD_H
|
#ifdef HAVE_UNISTD_H
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef OPAL_HAVE_HWLOC
|
||||||
|
#include "opal/mca/hwloc/hwloc.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef MIN
|
#ifndef MIN
|
||||||
#define MIN(a,b) ((a)<(b)?(a):(b))
|
#define MIN(a,b) ((a)<(b)?(a):(b))
|
||||||
#endif
|
#endif
|
||||||
@ -579,6 +583,65 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
|||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* calculate memory registration limits */
|
||||||
|
/*
 * Report the total amount of physical memory on the local machine.
 *
 * When built with hwloc support the value is taken from the
 * memory.total_memory field of the first HWLOC_OBJ_MACHINE object found in
 * opal_hwloc_topology.
 *
 * Returns the total memory reported by hwloc, or 0 when the amount is
 * unknown: built without hwloc, no topology available, or no machine
 * object in the topology.
 */
static uint64_t calculate_total_mem (void)
{
#if OPAL_HAVE_HWLOC
    hwloc_obj_t machine;

    /* hwloc cannot be queried without a topology; report "unknown" instead
     * of handing it a NULL handle */
    if (NULL == opal_hwloc_topology) {
        return 0;
    }

    machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
    if (NULL == machine) {
        return 0;
    }

    return machine->memory.total_memory;
#else
    /* no portable way to find the memory size without hwloc */
    return 0;
#endif
}
|
||||||
|
|
||||||
|
static uint64_t calculate_max_reg (void)
|
||||||
|
{
|
||||||
|
struct stat statinfo;
|
||||||
|
uint64_t mtts_per_seg = 1;
|
||||||
|
uint64_t num_mtt = 1 << 19;
|
||||||
|
uint64_t reserved_mtt = 0;
|
||||||
|
uint64_t max_reg, mem_total;
|
||||||
|
|
||||||
|
mem_total = calculate_total_mem ();
|
||||||
|
|
||||||
|
if (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) {
|
||||||
|
mtts_per_seg = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
|
||||||
|
num_mtt = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);
|
||||||
|
if (1 == num_mtt) {
|
||||||
|
/* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */
|
||||||
|
num_mtt = 1 << 20;
|
||||||
|
}
|
||||||
|
|
||||||
|
max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
|
||||||
|
} else if (0 == stat("/sys/module/ib_mthca/parameters", &statinfo)) {
|
||||||
|
mtts_per_seg = 1 << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1);
|
||||||
|
num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20);
|
||||||
|
reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0);
|
||||||
|
|
||||||
|
max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
|
||||||
|
} else {
|
||||||
|
/* need to update to determine the registration limit for this configuration */
|
||||||
|
max_reg = mem_total;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* NTH: print a warning if we can't register more than 75% of physical memory */
|
||||||
|
if (max_reg < mem_total * 3 / 4) {
|
||||||
|
orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
|
||||||
|
orte_process_info.nodename, (unsigned long)(max_reg >> 20),
|
||||||
|
(unsigned long)(mem_total >> 20));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */
|
||||||
|
return (max_reg * 7) >> 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* add a proc to this btl module
|
* add a proc to this btl module
|
||||||
* creates an endpoint that is setup on the
|
* creates an endpoint that is setup on the
|
||||||
@ -592,7 +655,7 @@ int mca_btl_openib_add_procs(
|
|||||||
opal_bitmap_t* reachable)
|
opal_bitmap_t* reachable)
|
||||||
{
|
{
|
||||||
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
|
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
|
||||||
int i,j, rc;
|
int i,j, rc, local_procs;
|
||||||
int rem_subnet_id_port_cnt;
|
int rem_subnet_id_port_cnt;
|
||||||
int lcl_subnet_id_port_cnt = 0;
|
int lcl_subnet_id_port_cnt = 0;
|
||||||
int btl_rank = 0;
|
int btl_rank = 0;
|
||||||
@ -621,13 +684,17 @@ int mca_btl_openib_add_procs(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (i = 0; i < (int) nprocs; i++) {
|
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
|
||||||
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
||||||
mca_btl_openib_proc_t* ib_proc;
|
mca_btl_openib_proc_t* ib_proc;
|
||||||
int remote_matching_port;
|
int remote_matching_port;
|
||||||
|
|
||||||
opal_output(-1, "add procs: adding proc %d", i);
|
opal_output(-1, "add procs: adding proc %d", i);
|
||||||
|
|
||||||
|
if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
|
||||||
|
local_procs ++;
|
||||||
|
}
|
||||||
|
|
||||||
/* OOB, XOOB, and RDMACM do not support SELF comunication, so
|
/* OOB, XOOB, and RDMACM do not support SELF comunication, so
|
||||||
* mark the prco as unreachable by openib btl */
|
* mark the prco as unreachable by openib btl */
|
||||||
if (OPAL_EQUAL == orte_util_compare_name_fields
|
if (OPAL_EQUAL == orte_util_compare_name_fields
|
||||||
@ -794,6 +861,9 @@ int mca_btl_openib_add_procs(
|
|||||||
peers[i] = endpoint;
|
peers[i] = endpoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
openib_btl->local_procs += local_procs;
|
||||||
|
openib_btl->device->mem_reg_max = calculate_max_reg () / openib_btl->local_procs;
|
||||||
|
|
||||||
return mca_btl_openib_size_queues(openib_btl, nprocs);
|
return mca_btl_openib_size_queues(openib_btl, nprocs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -390,6 +390,8 @@ typedef struct mca_btl_openib_device_t {
|
|||||||
mca_btl_openib_device_qp_t *qps;
|
mca_btl_openib_device_qp_t *qps;
|
||||||
/* Maximum value supported by this device for max_inline_data */
|
/* Maximum value supported by this device for max_inline_data */
|
||||||
uint32_t max_inline_data;
|
uint32_t max_inline_data;
|
||||||
|
/* Registration limit and current count */
|
||||||
|
uint64_t mem_reg_max, mem_reg_active;
|
||||||
} mca_btl_openib_device_t;
|
} mca_btl_openib_device_t;
|
||||||
OBJ_CLASS_DECLARATION(mca_btl_openib_device_t);
|
OBJ_CLASS_DECLARATION(mca_btl_openib_device_t);
|
||||||
|
|
||||||
@ -467,6 +469,8 @@ struct mca_btl_openib_module_t {
|
|||||||
mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
|
mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
|
||||||
|
|
||||||
mca_btl_openib_module_qp_t * qps;
|
mca_btl_openib_module_qp_t * qps;
|
||||||
|
|
||||||
|
int local_procs; /** number of local procs */
|
||||||
};
|
};
|
||||||
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
|
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
|
||||||
|
|
||||||
|
@ -596,6 +596,13 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
|
|||||||
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
|
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
|
||||||
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
|
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
|
||||||
|
|
||||||
|
if (device->mem_reg_max &&
|
||||||
|
device->mem_reg_max < (device->mem_reg_active + size)) {
|
||||||
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
device->mem_reg_active += size;
|
||||||
|
|
||||||
#if HAVE_DECL_IBV_ACCESS_SO
|
#if HAVE_DECL_IBV_ACCESS_SO
|
||||||
if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) {
|
if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) {
|
||||||
access_flag |= IBV_ACCESS_SO;
|
access_flag |= IBV_ACCESS_SO;
|
||||||
@ -637,6 +644,9 @@ static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
device->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1);
|
||||||
|
|
||||||
openib_reg->mr = NULL;
|
openib_reg->mr = NULL;
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -818,6 +828,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
|||||||
|
|
||||||
openib_btl->cpcs = NULL;
|
openib_btl->cpcs = NULL;
|
||||||
openib_btl->num_cpcs = 0;
|
openib_btl->num_cpcs = 0;
|
||||||
|
openib_btl->local_procs = 0;
|
||||||
|
|
||||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
|
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
|
||||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
|
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
|
||||||
@ -1670,6 +1681,10 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
|||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
device->mem_reg_active = 0;
|
||||||
|
/* NTH: set some high default until we know how many local peers we have */
|
||||||
|
device->mem_reg_max = 1ull << 48;
|
||||||
|
|
||||||
device->ib_dev = ib_dev;
|
device->ib_dev = ib_dev;
|
||||||
device->ib_dev_context = ibv_open_device(ib_dev);
|
device->ib_dev_context = ibv_open_device(ib_dev);
|
||||||
device->ib_pd = NULL;
|
device->ib_pd = NULL;
|
||||||
|
@ -689,3 +689,22 @@ device).
|
|||||||
|
|
||||||
Use "ibv_devinfo -v" on the local host to see the GID table of this
|
Use "ibv_devinfo -v" on the local host to see the GID table of this
|
||||||
device.
|
device.
|
||||||
|
[reg mem limit low]
|
||||||
|
WARNING: It appears that your OpenFabrics subsystem is configured to only
|
||||||
|
allow registering part of your physical memory. This can cause MPI jobs to
|
||||||
|
run with erratic performance, hang, and/or crash.
|
||||||
|
|
||||||
|
This may be caused by your OpenFabrics vendor limiting the amount of
|
||||||
|
physical memory that can be registered. You should investigate the
|
||||||
|
relevant Linux kernel module parameters that control how much physical
|
||||||
|
memory can be registered, and increase them to allow registering all
|
||||||
|
physical memory on your machine.
|
||||||
|
|
||||||
|
See this Open MPI FAQ item for more information on these Linux kernel module
|
||||||
|
parameters:
|
||||||
|
|
||||||
|
http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Registerable memory: %lu MiB
|
||||||
|
Total memory: %lu MiB
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user