diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c
index 6dc78daad2..81b3f324fe 100644
--- a/ompi/mca/btl/openib/btl_openib.c
+++ b/ompi/mca/btl/openib/btl_openib.c
@@ -56,6 +56,9 @@
 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
 #include "orte/util/proc_info.h"
 #include
+#include
+#include
+#include
 #include
 #include
 #ifdef HAVE_SYS_TYPES_H
@@ -70,6 +73,10 @@
 #ifdef HAVE_UNISTD_H
 #include
 #endif
+#if OPAL_HAVE_HWLOC
+#include "opal/mca/hwloc/hwloc.h"
+#endif
+
 #ifndef MIN
 #define MIN(a,b) ((a)<(b)?(a):(b))
 #endif
@@ -579,6 +586,119 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
     return OMPI_SUCCESS;
 }
 
+/**
+ * Read a single unsigned integer from a Linux module parameter file.
+ *
+ * @param file   path of the parameter file
+ * @param value  fallback returned when the file cannot be opened or read,
+ *               or does not begin with a base-10 number
+ *
+ * @return the parsed number, or value on any failure
+ */
+static uint64_t read_module_param(const char *file, uint64_t value)
+{
+    char buffer[64];
+    char *end;
+    ssize_t nread;
+    uint64_t ret;
+    int fd;
+
+    fd = open(file, O_RDONLY);
+    if (0 > fd) {
+        return value;
+    }
+
+    /* read() does not NUL-terminate: keep one byte free for the terminator */
+    nread = read (fd, buffer, sizeof(buffer) - 1);
+    close (fd);
+    if (0 >= nread) {
+        return value;
+    }
+    buffer[nread] = '\0';
+
+    errno = 0;
+    ret = strtoull(buffer, &end, 10);
+
+    return (0 == errno && end != buffer) ? ret : value;
+}
+
+/**
+ * Report the total memory of the machine object in opal_hwloc_topology.
+ *
+ * @return the machine object's memory.total_memory, or 0 when built without
+ *         hwloc support or when the topology has no machine object
+ */
+static uint64_t calculate_total_mem (void)
+{
+#if OPAL_HAVE_HWLOC
+    hwloc_obj_t machine;
+
+    machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
+    if (NULL == machine) {
+        return 0;
+    }
+
+    return machine->memory.total_memory;
+#else
+    return 0;
+#endif
+}
+
+/**
+ * Calculate the memory registration limit for this node.
+ *
+ * The limit is derived from the MTT parameters that the mlx4_core or ib_mthca
+ * kernel module exposes under /sys/module. When neither directory exists the
+ * value of calculate_total_mem() is used instead. The "reg mem limit low"
+ * help message is shown when the limit is below 75% of that total.
+ *
+ * @return 87.5% of the calculated limit
+ */
+static uint64_t calculate_max_reg (void)
+{
+    struct stat statinfo;
+    uint64_t mtts_per_seg = 1;
+    uint64_t num_mtt = 1ull << 19;
+    uint64_t reserved_mtt = 0;
+    uint64_t max_reg, mem_total;
+
+    mem_total = calculate_total_mem ();
+
+    /* shift 1ull rather than int 1: the results are 64-bit counts and an
+     * int shift overflows once the log value reaches 31 */
+    if (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) {
+        mtts_per_seg = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
+        num_mtt = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);
+        if (1 == num_mtt) {
+            /* NTH: is 19 a minimum? log_num_mtt was read as 0: use 2^20, the
+             * same default that applies when the parameter cannot be read */
+            num_mtt = 1ull << 20;
+        }
+
+        max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
+    } else if (0 == stat("/sys/module/ib_mthca/parameters", &statinfo)) {
+        mtts_per_seg = 1ull << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1);
+        num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20);
+        reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0);
+
+        max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
+    } else {
+        /* need to update to determine the registration limit for this configuration */
+        max_reg = mem_total;
+    }
+
+    /* NTH: print a warning if we can't register more than 75% of physical memory */
+    if (max_reg < mem_total * 3 / 4) {
+        orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
+                       orte_process_info.nodename, (unsigned long)(max_reg >> 20),
+                       (unsigned long)(mem_total >> 20));
+    }
+
+    /* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */
+    return (max_reg * 7) >> 3;
+}
+
+
 /*
  * add a proc to this btl module
  * creates an endpoint that is setup on the
@@ -592,7 +712,7 @@ int mca_btl_openib_add_procs(
     opal_bitmap_t* reachable)
 {
     mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
-    int i,j, rc;
+    int i,j, rc, local_procs;
     int rem_subnet_id_port_cnt;
     int lcl_subnet_id_port_cnt = 0;
     int btl_rank = 0;
@@ -621,13 +741,17 @@ int mca_btl_openib_add_procs(
     }
 #endif
 
-    for (i = 0; i < (int) nprocs; i++) {
+    for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
         struct ompi_proc_t* ompi_proc = ompi_procs[i];
         mca_btl_openib_proc_t* ib_proc;
         int remote_matching_port;
 
         opal_output(-1, "add procs: adding proc %d", i);
 
+        if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
+            local_procs ++;
+        }
+
         /* OOB, XOOB, and RDMACM do not support SELF comunication, so
          * mark the prco as unreachable by openib btl */
         if (OPAL_EQUAL == orte_util_compare_name_fields
@@ -794,6 +918,13 @@ int mca_btl_openib_add_procs(
         peers[i] = endpoint;
     }
 
+    openib_btl->local_procs += local_procs;
+    /* local_procs stays 0 until a process on this node has been added;
+     * keep the current limit rather than dividing by zero */
+    if (0 < openib_btl->local_procs) {
+        openib_btl->device->mem_reg_max = calculate_max_reg () / openib_btl->local_procs;
+    }
+
     return mca_btl_openib_size_queues(openib_btl, nprocs);
 }
 
diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h
index 47f62fdb17..d2ca387d10 100644
--- a/ompi/mca/btl/openib/btl_openib.h
+++ b/ompi/mca/btl/openib/btl_openib.h
@@ -390,6 +390,8 @@ typedef struct mca_btl_openib_device_t {
     mca_btl_openib_device_qp_t *qps;
     /* Maximum value supported by this device for max_inline_data */
     uint32_t max_inline_data;
+    /* Registration limit (0 disables the check) and amount currently registered */
+    uint64_t mem_reg_max, mem_reg_active;
 } mca_btl_openib_device_t;
 OBJ_CLASS_DECLARATION(mca_btl_openib_device_t);
 
@@ -467,6 +469,8 @@ struct mca_btl_openib_module_t {
     mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
 
     mca_btl_openib_module_qp_t * qps;
+
+    int local_procs;                   /**< number of procs on the local node added so far */
 };
 typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
 
diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c
index 605b80d57d..ccb3ca09fe 100644
--- a/ompi/mca/btl/openib/btl_openib_component.c
+++ b/ompi/mca/btl/openib/btl_openib_component.c
@@ -596,6 +596,13 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
     enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
         IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
 
+    if (device->mem_reg_max &&
+        device->mem_reg_max < (device->mem_reg_active + size)) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    device->mem_reg_active += size;
+
 #if HAVE_DECL_IBV_ACCESS_SO
     if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) {
         access_flag |= IBV_ACCESS_SO;
@@ -620,6 +627,7 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
 
 static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
 {
+    mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data;
     mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg;
 
     if(openib_reg->mr != NULL) {
@@ -637,6 +645,9 @@ static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
 #endif
 
     }
+
+    device->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1);
+
     openib_reg->mr = NULL;
     return OMPI_SUCCESS;
 }
@@ -818,6 +829,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
 
             openib_btl->cpcs = NULL;
             openib_btl->num_cpcs = 0;
+            openib_btl->local_procs = 0;
 
             mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
             mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
@@ -1670,6 +1682,10 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
 
+    device->mem_reg_active = 0;
+    /* NTH: set some high default until we know how many local peers we have */
+    device->mem_reg_max = 1ull << 48;
+
     device->ib_dev = ib_dev;
     device->ib_dev_context = ibv_open_device(ib_dev);
     device->ib_pd = NULL;
diff --git a/ompi/mca/btl/openib/help-mpi-btl-openib.txt b/ompi/mca/btl/openib/help-mpi-btl-openib.txt
index efbb2f547c..db137ab084 100644
--- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt
+++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt
@@ -689,3 +689,22 @@ device).
 
 Use "ibv_devinfo -v" on the local host to see the GID table of this
 device.
+[reg mem limit low]
+WARNING: It appears that your OpenFabrics subsystem is configured to only
+allow registering part of your physical memory.  This can cause MPI jobs to
+run with erratic performance, hang, and/or crash.
+
+This may be caused by your OpenFabrics vendor limiting the amount of
+physical memory that can be registered.  You should investigate the
+relevant Linux kernel module parameters that control how much physical
+memory can be registered, and increase them to allow registering all
+physical memory on your machine.
+
+See this Open MPI FAQ item for more information on these Linux kernel module
+parameters:
+
+    http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
+
+  Local host:          %s
+  Registerable memory: %lu MiB
+  Total memory:        %lu MiB