diff --git a/ompi/mca/coll/hcoll/coll_hcoll_dtypes.h b/ompi/mca/coll/hcoll/coll_hcoll_dtypes.h index ce1c1cf54b..96cc2ee032 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_dtypes.h +++ b/ompi/mca/coll/hcoll/coll_hcoll_dtypes.h @@ -73,41 +73,23 @@ static dte_data_representation_t ompi_dtype_2_dte_dtype(ompi_datatype_t *dtype){ return *ompi_datatype_2_dte_data_rep[opal_type_id]; } - -/* -enum ompi_op_type { - OMPI_OP_BASE_FORTRAN_NULL = 0, - OMPI_OP_BASE_FORTRAN_MAX, - OMPI_OP_BASE_FORTRAN_MIN, - OMPI_OP_BASE_FORTRAN_SUM, - OMPI_OP_BASE_FORTRAN_PROD, - OMPI_OP_BASE_FORTRAN_LAND, - OMPI_OP_BASE_FORTRAN_BAND, - OMPI_OP_BASE_FORTRAN_LOR, - OMPI_OP_BASE_FORTRAN_BOR, - OMPI_OP_BASE_FORTRAN_LXOR, - OMPI_OP_BASE_FORTRAN_BXOR, - OMPI_OP_BASE_FORTRAN_MAXLOC, - OMPI_OP_BASE_FORTRAN_MINLOC, - OMPI_OP_BASE_FORTRAN_REPLACE, - - OMPI_OP_BASE_FORTRAN_OP_MAX -};*/ -static hcoll_dte_op_t* ompi_op_2_hcoll_op[OMPI_OP_BASE_FORTRAN_OP_MAX] = { - &hcoll_dte_op_null, - &hcoll_dte_op_max, - &hcoll_dte_op_min, - &hcoll_dte_op_sum, - &hcoll_dte_op_prod, - &hcoll_dte_op_land, - &hcoll_dte_op_band, - &hcoll_dte_op_lor, - &hcoll_dte_op_bor, - &hcoll_dte_op_lxor, - &hcoll_dte_op_bxor, - &hcoll_dte_op_null, - &hcoll_dte_op_null, - &hcoll_dte_op_null +static hcoll_dte_op_t* ompi_op_2_hcoll_op[OMPI_OP_BASE_FORTRAN_OP_MAX + 1] = { + &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_NULL = 0 */ + &hcoll_dte_op_max, /* OMPI_OP_BASE_FORTRAN_MAX */ + &hcoll_dte_op_min, /* OMPI_OP_BASE_FORTRAN_MIN */ + &hcoll_dte_op_sum, /* OMPI_OP_BASE_FORTRAN_SUM */ + &hcoll_dte_op_prod, /* OMPI_OP_BASE_FORTRAN_PROD */ + &hcoll_dte_op_land, /* OMPI_OP_BASE_FORTRAN_LAND */ + &hcoll_dte_op_band, /* OMPI_OP_BASE_FORTRAN_BAND */ + &hcoll_dte_op_lor, /* OMPI_OP_BASE_FORTRAN_LOR */ + &hcoll_dte_op_bor, /* OMPI_OP_BASE_FORTRAN_BOR */ + &hcoll_dte_op_lxor, /* OMPI_OP_BASE_FORTRAN_LXOR */ + &hcoll_dte_op_bxor, /* OMPI_OP_BASE_FORTRAN_BXOR */ + &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_MAXLOC */ + &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_MINLOC */ + &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_REPLACE */ + &hcoll_dte_op_null, /* OMPI_OP_BASE_FORTRAN_NO_OP */ + &hcoll_dte_op_null /* OMPI_OP_BASE_FORTRAN_OP_MAX */ }; static hcoll_dte_op_t* ompi_op_2_hcolrte_op(ompi_op_t *op){ return ompi_op_2_hcoll_op[op->o_f_to_c_index]; diff --git a/ompi/mca/coll/hcoll/coll_hcoll_ops.c b/ompi/mca/coll/hcoll/coll_hcoll_ops.c index 621e83f157..61fa8b339e 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_ops.c +++ b/ompi/mca/coll/hcoll/coll_hcoll_ops.c @@ -36,7 +36,7 @@ int mca_coll_hcoll_bcast(void *buff, int count, HCOL_VERBOSE(20,"RUNNING HCOL BCAST"); mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; dtype = ompi_dtype_2_dte_dtype(datatype); - if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ + if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ /*If we are here then datatype is not simple predefined datatype */ /*In future we need to add more complex mapping to the dte_data_representation_t */ /* Now use fallback */ @@ -68,7 +68,7 @@ int mca_coll_hcoll_allgather(void *sbuf, int scount, mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; stype = ompi_dtype_2_dte_dtype(sdtype); rtype = ompi_dtype_2_dte_dtype(rdtype); - if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(stype) || HCOL_DTE_IS_COMPLEX(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ + if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ /*If we are here then datatype is not simple predefined datatype */ /*In future we need to add more complex mapping to the dte_data_representation_t */ /* Now use fallback */ @@ -143,7 +143,7 @@ int mca_coll_hcoll_allreduce(void *sbuf, void *rbuf, int count, HCOL_VERBOSE(20,"RUNNING HCOL ALLREDUCE"); mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; Dtype = ompi_dtype_2_dte_dtype(dtype); - if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(Dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ + if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ /*If we are here then datatype is not simple predefined datatype */ /*In future we need to add more complex mapping to the dte_data_representation_t */ /* Now use fallback */ @@ -192,7 +192,7 @@ int mca_coll_hcoll_alltoall(void *sbuf, int scount, mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; stype = ompi_dtype_2_dte_dtype(sdtype); rtype = ompi_dtype_2_dte_dtype(rdtype); - if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(stype) || HCOL_DTE_IS_COMPLEX(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ + if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ /*If we are here then datatype is not simple predefined datatype */ /*In future we need to add more complex mapping to the dte_data_representation_t */ /* Now use fallback */ @@ -284,7 +284,7 @@ int mca_coll_hcoll_ibcast(void *buff, int count, mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; rt_handle = (void**) request; dtype = ompi_dtype_2_dte_dtype(datatype); - if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ + if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ /*If we are here then datatype is not simple predefined datatype */ /*In future we need to add more complex mapping to the dte_data_representation_t */ /* Now use fallback */ @@ -319,7 +319,7 @@ int mca_coll_hcoll_iallgather(void *sbuf, int scount, rt_handle = (void**) request; stype = ompi_dtype_2_dte_dtype(sdtype); rtype = ompi_dtype_2_dte_dtype(rdtype); - if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(stype) || HCOL_DTE_IS_COMPLEX(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ + if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ /*If we are here then datatype is not simple predefined datatype */ /*In future we need to add more complex mapping to the dte_data_representation_t */ /* Now use fallback */ @@ -360,7 +360,7 @@ int mca_coll_hcoll_iallreduce(void *sbuf, void *rbuf, int count, mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; rt_handle = (void**) request; Dtype = ompi_dtype_2_dte_dtype(dtype); - if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(Dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ + if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){ /*If we are here then datatype is not simple predefined datatype */ /*In future we need to add more complex mapping to the dte_data_representation_t */ /* Now use fallback */ diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 85c5494b19..3d1c259756 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -580,113 +580,6 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl, return OPAL_SUCCESS; } -/* read a single integer from a linux module parameters file */ -static uint64_t read_module_param(char *file, uint64_t value) -{ - int fd = open(file, O_RDONLY); - char buffer[64]; - uint64_t ret; - - if (0 > fd) { - return value; - } - - read (fd, buffer, 64); - - close (fd); - - errno = 0; - ret = strtoull(buffer, NULL, 10); - - return (0 == errno) ? ret : value; -} - -/* calculate memory registation limits */ -static uint64_t calculate_total_mem (void) -{ -#if OPAL_HAVE_HWLOC - hwloc_obj_t machine; - - machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL); - if (NULL == machine) { - return 0; - } - - return machine->memory.total_memory; -#else - return 0; -#endif -} - - -static uint64_t calculate_max_reg (void) -{ - struct stat statinfo; - uint64_t mtts_per_seg = 1; - uint64_t num_mtt = 1 << 19; - uint64_t reserved_mtt = 0; - uint64_t max_reg, mem_total; - - mem_total = calculate_total_mem (); - - if (0 == stat("/sys/module/mlx4_core/parameters/log_num_mtt", &statinfo)) { - mtts_per_seg = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1); - num_mtt = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20); - if (1 == num_mtt) { - if (0 == stat("/sys/module/mlx5_core", &statinfo)) { - max_reg = 2 * mem_total; - } else { - /* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */ - num_mtt = 1 << 19; - max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; - } - } else { - max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; - } - - } else if (0 == stat("/sys/module/ib_mthca/parameters/num_mtt", &statinfo)) { - mtts_per_seg = 1 << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1); - num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20); - reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0); - - max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; - - } else if ( - (0 == stat("/sys/module/mlx5_core", &statinfo)) || - (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) || - (0 == stat("/sys/module/ib_mthca/parameters", &statinfo)) - ) { - /* mlx5 means that we have ofed 2.0 and it can always register 2xmem_total for any mlx hca */ - max_reg = 2 * mem_total; - - } else { - /* Need to update to determine the registration limit for this - configuration */ - max_reg = mem_total; - } - - /* Print a warning if we can't register more than 75% of physical - memory. Abort if the abort_not_enough_reg_mem MCA param was - set. */ - if (max_reg < mem_total * 3 / 4) { - char *action; - - if (mca_btl_openib_component.abort_not_enough_reg_mem) { - action = "Your MPI job will now abort."; - } else { - action = "Your MPI job will continue, but may be behave poorly and/or hang."; - } - opal_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true, - opal_process_info.nodename, (unsigned long)(max_reg >> 20), - (unsigned long)(mem_total >> 20), action); - return 0; /* signal that we can't have enough memory */ - } - - /* Limit us to 87.5% of the registered memory (some fluff for QPs, - file systems, etc) */ - return (max_reg * 7) >> 3; -} - static int prepare_device_for_use (mca_btl_openib_device_t *device) { mca_btl_openib_frag_init_data_t *init_data; @@ -1128,10 +1021,7 @@ int mca_btl_openib_add_procs( } openib_btl->local_procs += local_procs; - openib_btl->device->mem_reg_max = calculate_max_reg () / openib_btl->local_procs; - if(( 0 == openib_btl->device->mem_reg_max) && mca_btl_openib_component.abort_not_enough_reg_mem) { - return OPAL_ERROR; - } + openib_btl->device->mem_reg_max /= openib_btl->local_procs; return mca_btl_openib_size_queues(openib_btl, nprocs); } diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index cfcf367461..3a2038e804 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -300,6 +300,7 @@ struct mca_btl_openib_component_t { int gid_index; /** Whether we want a dynamically resizing srq, enabled by default */ bool enable_srq_resize; + bool allow_max_memory_registration; int memory_registration_verbose_level; int memory_registration_verbose; int ignore_locality; diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 128a335a66..2760f383b3 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -38,6 +38,7 @@ #endif #include #include +#include #include #include #if BTL_OPENIB_MALLOC_HOOKS_ENABLED @@ -62,6 +63,7 @@ #include "opal/util/argv.h" #include "opal/mca/timer/base/base.h" #include "opal/sys/atomic.h" +#include "opal/util/sys_limits.h" #include "opal/util/argv.h" #include "opal/memoryhooks/memory.h" /* Define this before including hwloc.h so that we also get the hwloc @@ -1423,6 +1425,118 @@ error: return ret; } +/* read a single integer from a linux module parameters file */ +static uint64_t read_module_param(char *file, uint64_t value) +{ + int fd = open(file, O_RDONLY); + char buffer[64]; + uint64_t ret; + + if (0 > fd) { + return value; + } + + read (fd, buffer, 64); + + close (fd); + + errno = 0; + ret = strtoull(buffer, NULL, 10); + + return (0 == errno) ? ret : value; +} + +/* calculate memory registation limits */ +static uint64_t calculate_total_mem (void) +{ +#if OPAL_HAVE_HWLOC + hwloc_obj_t machine; + + machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL); + if (NULL == machine) { + return 0; + } + + return machine->memory.total_memory; +#else + return 0; +#endif +} + + +static uint64_t calculate_max_reg (const char *device_name) +{ + struct stat statinfo; + uint64_t mtts_per_seg = 1; + uint64_t num_mtt = 1 << 19; + uint64_t reserved_mtt = 0; + uint64_t max_reg, mem_total; + + mem_total = calculate_total_mem (); + + /* On older OFED(<2.0), may need to turn off this parameter*/ + if (mca_btl_openib_component.allow_max_memory_registration) { + max_reg = 2 * mem_total; + /* Limit us to 87.5% of the registered memory (some fluff for QPs, + file systems, etc) */ + return (max_reg * 7) >> 3; + } + + if (!strncmp(device_name, "mlx5", 4)) { + max_reg = 2 * mem_total; + + } else if (!strncmp(device_name, "mlx4", 4)) { + if (0 == stat("/sys/module/mlx4_core/parameters/log_num_mtt", &statinfo)) { + mtts_per_seg = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1); + num_mtt = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20); + if (1 == num_mtt) { + /* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */ + num_mtt = 1 << 19; + max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; + } else { + max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; + } + } + + } else if (!strncmp(device_name, "mthca", 5)) { + if (0 == stat("/sys/module/ib_mthca/parameters/num_mtt", &statinfo)) { + mtts_per_seg = 1 << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1); + num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20); + reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0); + + max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; + } else { + max_reg = mem_total; + } + + } else { + /* Need to update to determine the registration limit for this + configuration */ + max_reg = mem_total; + } + + /* Print a warning if we can't register more than 75% of physical + memory. Abort if the abort_not_enough_reg_mem MCA param was + set. */ + if (max_reg < mem_total * 3 / 4) { + char *action; + + if (mca_btl_openib_component.abort_not_enough_reg_mem) { + action = "Your MPI job will now abort."; + } else { + action = "Your MPI job will continue, but may be behave poorly and/or hang."; + } + opal_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true, + opal_process_info.nodename, (unsigned long)(max_reg >> 20), + (unsigned long)(mem_total >> 20), action); + return 0; /* signal that we can't have enough memory */ + } + + /* Limit us to 87.5% of the registered memory (some fluff for QPs, + file systems, etc) */ + return (max_reg * 7) >> 3; +} + static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) { struct mca_mpool_base_resources_t mpool_resources; @@ -1458,8 +1572,10 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) } device->mem_reg_active = 0; - /* NTH: set some high default until we know how many local peers we have */ - device->mem_reg_max = 1ull << 48; + device->mem_reg_max = calculate_max_reg(ibv_get_device_name(ib_dev)); + if(( 0 == device->mem_reg_max) && mca_btl_openib_component.abort_not_enough_reg_mem) { + return OPAL_ERROR; + } device->ib_dev = ib_dev; device->ib_dev_context = dev_context; diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 42ea77f9ff..9176fa7919 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -536,6 +536,10 @@ int btl_openib_register_mca_params(void) "Maximum size (in bytes) of a single fragment of a long message when using the RDMA protocols (must be > 0 and <= hw capabilities).", 0, &mca_btl_openib_component.max_hw_msg_size, 0)); + CHECK(reg_bool("allow_max_memory_registration", NULL, + "Allow maximum possible memory to register with HCA", + 1, &mca_btl_openib_component.allow_max_memory_registration)); + /* Help debug memory registration issues */ CHECK(reg_int("memory_registration_verbose", NULL, "Output some verbose memory registration information " diff --git a/opal/mca/btl/vader/configure.m4 b/opal/mca/btl/vader/configure.m4 index abf6e51135..3c73cbf6db 100644 --- a/opal/mca/btl/vader/configure.m4 +++ b/opal/mca/btl/vader/configure.m4 @@ -23,7 +23,7 @@ AC_DEFUN([OPAL_CHECK_XPMEM], [ AC_ARG_WITH([xpmem], [AC_HELP_STRING([--with-xpmem(=DIR)], [Build with XPMEM kernel module support, searching for headers in DIR])]) - OPAL_CHECK_WITHDIR([xpmem], [$with_xpmem], [include/xpmem.h include/sn/xpmem.h]) + OPAL_CHECK_WITHDIR([xpmem], [$with_xpmem], [include/xpmem.h]) AC_ARG_WITH([xpmem-libdir], [AC_HELP_STRING([--with-xpmem-libdir=DIR], @@ -41,7 +41,7 @@ AC_DEFUN([OPAL_CHECK_XPMEM], [ opal_check_xpmem_libdir="$with_xpmem_libdir" fi - OPAL_CHECK_PACKAGE([$1],[xpmem.h sn/xpmem.h],[xpmem],[xpmem_make],[], + OPAL_CHECK_PACKAGE([$1],[xpmem.h],[xpmem],[xpmem_make],[], [$opal_check_xpmem_dir],[$opal_check_xpmem_libdir], [opal_check_xpmem_happy="yes"], []) if test "$opal_check_xpmem_happy" = "no" -a -n "$with_xpmem" -a "$with_xpmem" != "yes" ; then