1
1
Этот коммит содержится в:
Gilles Gouaillardet 2014-12-24 11:44:27 +09:00
родитель 8976dcf610 ccafc62c07
Коммит 1ab9dd994e
7 изменённых файлов: 150 добавлений и 157 удалений

Просмотреть файл

@ -73,41 +73,23 @@ static dte_data_representation_t ompi_dtype_2_dte_dtype(ompi_datatype_t *dtype){
return *ompi_datatype_2_dte_data_rep[opal_type_id];
}
/*
enum ompi_op_type {
OMPI_OP_BASE_FORTRAN_NULL = 0,
OMPI_OP_BASE_FORTRAN_MAX,
OMPI_OP_BASE_FORTRAN_MIN,
OMPI_OP_BASE_FORTRAN_SUM,
OMPI_OP_BASE_FORTRAN_PROD,
OMPI_OP_BASE_FORTRAN_LAND,
OMPI_OP_BASE_FORTRAN_BAND,
OMPI_OP_BASE_FORTRAN_LOR,
OMPI_OP_BASE_FORTRAN_BOR,
OMPI_OP_BASE_FORTRAN_LXOR,
OMPI_OP_BASE_FORTRAN_BXOR,
OMPI_OP_BASE_FORTRAN_MAXLOC,
OMPI_OP_BASE_FORTRAN_MINLOC,
OMPI_OP_BASE_FORTRAN_REPLACE,
OMPI_OP_BASE_FORTRAN_OP_MAX
};*/
static hcoll_dte_op_t* ompi_op_2_hcoll_op[OMPI_OP_BASE_FORTRAN_OP_MAX] = {
&hcoll_dte_op_null,
&hcoll_dte_op_max,
&hcoll_dte_op_min,
&hcoll_dte_op_sum,
&hcoll_dte_op_prod,
&hcoll_dte_op_land,
&hcoll_dte_op_band,
&hcoll_dte_op_lor,
&hcoll_dte_op_bor,
&hcoll_dte_op_lxor,
&hcoll_dte_op_bxor,
&hcoll_dte_op_null,
&hcoll_dte_op_null,
&hcoll_dte_op_null
/*
 * Translation table from an OMPI reduction operation to the matching hcoll
 * operation.  It is indexed with the op's o_f_to_c_index, so the entries
 * must stay in the same order as the OMPI_OP_BASE_FORTRAN_* constants named
 * next to each slot.  Slots holding &hcoll_dte_op_null are operations for
 * which this table provides no dedicated hcoll operation.
 */
static hcoll_dte_op_t *ompi_op_2_hcoll_op[OMPI_OP_BASE_FORTRAN_OP_MAX + 1] = {
    /* OMPI_OP_BASE_FORTRAN_NULL (= 0) */ &hcoll_dte_op_null,
    /* OMPI_OP_BASE_FORTRAN_MAX        */ &hcoll_dte_op_max,
    /* OMPI_OP_BASE_FORTRAN_MIN        */ &hcoll_dte_op_min,
    /* OMPI_OP_BASE_FORTRAN_SUM        */ &hcoll_dte_op_sum,
    /* OMPI_OP_BASE_FORTRAN_PROD       */ &hcoll_dte_op_prod,
    /* OMPI_OP_BASE_FORTRAN_LAND       */ &hcoll_dte_op_land,
    /* OMPI_OP_BASE_FORTRAN_BAND       */ &hcoll_dte_op_band,
    /* OMPI_OP_BASE_FORTRAN_LOR        */ &hcoll_dte_op_lor,
    /* OMPI_OP_BASE_FORTRAN_BOR        */ &hcoll_dte_op_bor,
    /* OMPI_OP_BASE_FORTRAN_LXOR       */ &hcoll_dte_op_lxor,
    /* OMPI_OP_BASE_FORTRAN_BXOR       */ &hcoll_dte_op_bxor,
    /* OMPI_OP_BASE_FORTRAN_MAXLOC     */ &hcoll_dte_op_null,
    /* OMPI_OP_BASE_FORTRAN_MINLOC     */ &hcoll_dte_op_null,
    /* OMPI_OP_BASE_FORTRAN_REPLACE    */ &hcoll_dte_op_null,
    /* OMPI_OP_BASE_FORTRAN_NO_OP      */ &hcoll_dte_op_null,
    /* OMPI_OP_BASE_FORTRAN_OP_MAX     */ &hcoll_dte_op_null
};
static hcoll_dte_op_t* ompi_op_2_hcolrte_op(ompi_op_t *op){
return ompi_op_2_hcoll_op[op->o_f_to_c_index];

Просмотреть файл

@ -36,7 +36,7 @@ int mca_coll_hcoll_bcast(void *buff, int count,
HCOL_VERBOSE(20,"RUNNING HCOL BCAST");
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
dtype = ompi_dtype_2_dte_dtype(datatype);
if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
/*If we are here then datatype is not simple predefined datatype */
/*In future we need to add more complex mapping to the dte_data_representation_t */
/* Now use fallback */
@ -68,7 +68,7 @@ int mca_coll_hcoll_allgather(void *sbuf, int scount,
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
stype = ompi_dtype_2_dte_dtype(sdtype);
rtype = ompi_dtype_2_dte_dtype(rdtype);
if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(stype) || HCOL_DTE_IS_COMPLEX(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
/*If we are here then datatype is not simple predefined datatype */
/*In future we need to add more complex mapping to the dte_data_representation_t */
/* Now use fallback */
@ -143,7 +143,7 @@ int mca_coll_hcoll_allreduce(void *sbuf, void *rbuf, int count,
HCOL_VERBOSE(20,"RUNNING HCOL ALLREDUCE");
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
Dtype = ompi_dtype_2_dte_dtype(dtype);
if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(Dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
/*If we are here then datatype is not simple predefined datatype */
/*In future we need to add more complex mapping to the dte_data_representation_t */
/* Now use fallback */
@ -192,7 +192,7 @@ int mca_coll_hcoll_alltoall(void *sbuf, int scount,
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
stype = ompi_dtype_2_dte_dtype(sdtype);
rtype = ompi_dtype_2_dte_dtype(rdtype);
if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(stype) || HCOL_DTE_IS_COMPLEX(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
/*If we are here then datatype is not simple predefined datatype */
/*In future we need to add more complex mapping to the dte_data_representation_t */
/* Now use fallback */
@ -284,7 +284,7 @@ int mca_coll_hcoll_ibcast(void *buff, int count,
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
rt_handle = (void**) request;
dtype = ompi_dtype_2_dte_dtype(datatype);
if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
/*If we are here then datatype is not simple predefined datatype */
/*In future we need to add more complex mapping to the dte_data_representation_t */
/* Now use fallback */
@ -319,7 +319,7 @@ int mca_coll_hcoll_iallgather(void *sbuf, int scount,
rt_handle = (void**) request;
stype = ompi_dtype_2_dte_dtype(sdtype);
rtype = ompi_dtype_2_dte_dtype(rdtype);
if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(stype) || HCOL_DTE_IS_COMPLEX(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
/*If we are here then datatype is not simple predefined datatype */
/*In future we need to add more complex mapping to the dte_data_representation_t */
/* Now use fallback */
@ -360,7 +360,7 @@ int mca_coll_hcoll_iallreduce(void *sbuf, void *rbuf, int count,
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
rt_handle = (void**) request;
Dtype = ompi_dtype_2_dte_dtype(dtype);
if (OPAL_UNLIKELY(HCOL_DTE_IS_COMPLEX(Dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(Dtype)) && mca_coll_hcoll_component.hcoll_datatype_fallback){
/*If we are here then datatype is not simple predefined datatype */
/*In future we need to add more complex mapping to the dte_data_representation_t */
/* Now use fallback */

Просмотреть файл

@ -580,113 +580,6 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
return OPAL_SUCCESS;
}
/**
 * Read a single unsigned decimal integer from a Linux module parameters file.
 *
 * @param file   path of the file to read
 * @param value  fallback returned when the file cannot be opened or read,
 *               when it is empty, or when it does not start with a number
 *               that fits the conversion
 *
 * @return the number parsed from the start of the file, or @p value on any
 *         failure
 */
static uint64_t read_module_param(char *file, uint64_t value)
{
    char buffer[64];
    char *end = NULL;
    ssize_t nread;
    uint64_t ret;
    int fd = open(file, O_RDONLY);

    if (0 > fd) {
        return value;
    }

    /* read() does not NUL-terminate: keep one byte free for the terminator */
    nread = read (fd, buffer, sizeof(buffer) - 1);
    close (fd);

    if (0 >= nread) {
        /* read error or empty file: nothing to parse */
        return value;
    }
    buffer[nread] = '\0';

    errno = 0;
    ret = strtoull(buffer, &end, 10);

    /* end == buffer means strtoull() did not consume a single digit */
    return (0 == errno && end != buffer) ? ret : value;
}
/**
 * Report the total memory of the machine as recorded in the hwloc topology.
 *
 * @return the total_memory value of the first HWLOC_OBJ_MACHINE object, or 0
 *         when no such object is found or when the build has no hwloc
 *         support (OPAL_HAVE_HWLOC is false).
 */
static uint64_t calculate_total_mem (void)
{
#if OPAL_HAVE_HWLOC
    hwloc_obj_t root_obj = hwloc_get_next_obj_by_type (opal_hwloc_topology,
                                                       HWLOC_OBJ_MACHINE, NULL);

    return (NULL != root_obj) ? root_obj->memory.total_memory : 0;
#else
    return 0;
#endif
}
/**
 * Estimate how many bytes of memory can be registered with the HCA.
 *
 * The limit is derived from the mlx4_core / ib_mthca module parameters found
 * under /sys/module when they exist; otherwise it falls back to twice the
 * total memory (some Mellanox module directory is present) or to the total
 * memory (nothing recognised).
 *
 * @return 0 when the computed limit is below 75% of the total memory (a
 *         "reg mem limit low" help message is shown first); otherwise 87.5%
 *         of the computed limit.
 */
static uint64_t calculate_max_reg (void)
{
    struct stat statinfo;
    uint64_t mtts_per_seg = 1;
    uint64_t num_mtt = 1ULL << 19;
    uint64_t reserved_mtt = 0;
    uint64_t max_reg, mem_total;

    mem_total = calculate_total_mem ();

    if (0 == stat("/sys/module/mlx4_core/parameters/log_num_mtt", &statinfo)) {
        /* shift a 64-bit one: "1 << n" is an int shift and is undefined
           for n >= 31 */
        mtts_per_seg = 1ULL << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
        num_mtt = 1ULL << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);

        if (1 == num_mtt && 0 == stat("/sys/module/mlx5_core", &statinfo)) {
            max_reg = 2 * mem_total;
        } else {
            if (1 == num_mtt) {
                /* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */
                num_mtt = 1ULL << 19;
            }
            max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
        }
    } else if (0 == stat("/sys/module/ib_mthca/parameters/num_mtt", &statinfo)) {
        mtts_per_seg = 1ULL << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1);
        num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20);
        reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0);

        max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
    } else if (
               (0 == stat("/sys/module/mlx5_core", &statinfo)) ||
               (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) ||
               (0 == stat("/sys/module/ib_mthca/parameters", &statinfo))
              ) {
        /* mlx5 means that we have ofed 2.0 and it can always register 2xmem_total for any mlx hca */
        max_reg = 2 * mem_total;
    } else {
        /* Need to update to determine the registration limit for this
           configuration */
        max_reg = mem_total;
    }

    /* Warn if less than 75% of physical memory can be registered.  The
       message depends on the abort_not_enough_reg_mem MCA param; this
       function only reports the condition by returning 0. */
    if (max_reg < mem_total * 3 / 4) {
        char *action;

        if (mca_btl_openib_component.abort_not_enough_reg_mem) {
            action = "Your MPI job will now abort.";
        } else {
            action = "Your MPI job will continue, but may be behave poorly and/or hang.";
        }

        opal_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
                       opal_process_info.nodename, (unsigned long)(max_reg >> 20),
                       (unsigned long)(mem_total >> 20), action);
        return 0; /* signal that we can't have enough memory */
    }

    /* Limit us to 87.5% of the registered memory (some fluff for QPs,
       file systems, etc) */
    return (max_reg * 7) >> 3;
}
static int prepare_device_for_use (mca_btl_openib_device_t *device)
{
mca_btl_openib_frag_init_data_t *init_data;
@ -1128,10 +1021,7 @@ int mca_btl_openib_add_procs(
}
openib_btl->local_procs += local_procs;
openib_btl->device->mem_reg_max = calculate_max_reg () / openib_btl->local_procs;
if(( 0 == openib_btl->device->mem_reg_max) && mca_btl_openib_component.abort_not_enough_reg_mem) {
return OPAL_ERROR;
}
openib_btl->device->mem_reg_max /= openib_btl->local_procs;
return mca_btl_openib_size_queues(openib_btl, nprocs);
}

Просмотреть файл

@ -300,6 +300,7 @@ struct mca_btl_openib_component_t {
int gid_index;
/** Whether we want a dynamically resizing srq, enabled by default */
bool enable_srq_resize;
bool allow_max_memory_registration;
int memory_registration_verbose_level;
int memory_registration_verbose;
int ignore_locality;

Просмотреть файл

@ -38,6 +38,7 @@
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stddef.h>
#if BTL_OPENIB_MALLOC_HOOKS_ENABLED
@ -62,6 +63,7 @@
#include "opal/util/argv.h"
#include "opal/mca/timer/base/base.h"
#include "opal/sys/atomic.h"
#include "opal/util/sys_limits.h"
#include "opal/util/argv.h"
#include "opal/memoryhooks/memory.h"
/* Define this before including hwloc.h so that we also get the hwloc
@ -1423,6 +1425,118 @@ error:
return ret;
}
/**
 * Read a single unsigned decimal integer from a Linux module parameters file.
 *
 * @param file   path of the file to read
 * @param value  fallback returned when the file cannot be opened or read,
 *               when it is empty, or when it does not start with a number
 *               that fits the conversion
 *
 * @return the number parsed from the start of the file, or @p value on any
 *         failure
 */
static uint64_t read_module_param(char *file, uint64_t value)
{
    char buffer[64];
    char *end = NULL;
    ssize_t nread;
    uint64_t ret;
    int fd = open(file, O_RDONLY);

    if (0 > fd) {
        return value;
    }

    /* read() does not NUL-terminate: keep one byte free for the terminator */
    nread = read (fd, buffer, sizeof(buffer) - 1);
    close (fd);

    if (0 >= nread) {
        /* read error or empty file: nothing to parse */
        return value;
    }
    buffer[nread] = '\0';

    errno = 0;
    ret = strtoull(buffer, &end, 10);

    /* end == buffer means strtoull() did not consume a single digit */
    return (0 == errno && end != buffer) ? ret : value;
}
/**
 * Report the total memory of the machine as recorded in the hwloc topology.
 *
 * @return the total_memory value of the first HWLOC_OBJ_MACHINE object, or 0
 *         when no such object is found or when the build has no hwloc
 *         support (OPAL_HAVE_HWLOC is false).
 */
static uint64_t calculate_total_mem (void)
{
#if OPAL_HAVE_HWLOC
    hwloc_obj_t root_obj = hwloc_get_next_obj_by_type (opal_hwloc_topology,
                                                       HWLOC_OBJ_MACHINE, NULL);

    return (NULL != root_obj) ? root_obj->memory.total_memory : 0;
#else
    return 0;
#endif
}
/**
 * Estimate how many bytes of memory can be registered with one HCA.
 *
 * @param device_name  name of the device; only its prefix ("mlx5", "mlx4",
 *                     "mthca") is examined.  A NULL name is handled like an
 *                     unrecognised device.
 *
 * @return when the allow_max_memory_registration MCA param is set, 87.5% of
 *         twice the total memory.  Otherwise 0 when the computed limit is
 *         below 75% of the total memory (a "reg mem limit low" help message
 *         is shown first), else 87.5% of the computed limit.
 */
static uint64_t calculate_max_reg (const char *device_name)
{
    struct stat statinfo;
    uint64_t mtts_per_seg = 1;
    uint64_t num_mtt = 1ULL << 19;
    uint64_t reserved_mtt = 0;
    uint64_t max_reg, mem_total;

    mem_total = calculate_total_mem ();

    /* On older OFED(<2.0), may need to turn off this parameter */
    if (mca_btl_openib_component.allow_max_memory_registration) {
        max_reg = 2 * mem_total;
        /* Limit us to 87.5% of the registered memory (some fluff for QPs,
           file systems, etc) */
        return (max_reg * 7) >> 3;
    }

    if (NULL == device_name) {
        /* no name to classify the device by: use the generic limit */
        max_reg = mem_total;
    } else if (!strncmp(device_name, "mlx5", 4)) {
        max_reg = 2 * mem_total;
    } else if (!strncmp(device_name, "mlx4", 4)) {
        if (0 == stat("/sys/module/mlx4_core/parameters/log_num_mtt", &statinfo)) {
            /* shift a 64-bit one: "1 << n" is an int shift and is undefined
               for n >= 31 */
            mtts_per_seg = 1ULL << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
            num_mtt = 1ULL << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);
            if (1 == num_mtt) {
                /* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */
                num_mtt = 1ULL << 19;
            }
            max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
        } else {
            /* parameter file missing: max_reg used to be left uninitialised
               here.  Use the same fallback as the mthca branch.
               NOTE(review): confirm mem_total is the intended limit. */
            max_reg = mem_total;
        }
    } else if (!strncmp(device_name, "mthca", 5)) {
        if (0 == stat("/sys/module/ib_mthca/parameters/num_mtt", &statinfo)) {
            mtts_per_seg = 1ULL << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1);
            num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20);
            reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0);

            max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
        } else {
            max_reg = mem_total;
        }
    } else {
        /* Need to update to determine the registration limit for this
           configuration */
        max_reg = mem_total;
    }

    /* Warn if less than 75% of physical memory can be registered.  The
       message depends on the abort_not_enough_reg_mem MCA param; this
       function only reports the condition by returning 0. */
    if (max_reg < mem_total * 3 / 4) {
        char *action;

        if (mca_btl_openib_component.abort_not_enough_reg_mem) {
            action = "Your MPI job will now abort.";
        } else {
            action = "Your MPI job will continue, but may be behave poorly and/or hang.";
        }

        opal_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
                       opal_process_info.nodename, (unsigned long)(max_reg >> 20),
                       (unsigned long)(mem_total >> 20), action);
        return 0; /* signal that we can't have enough memory */
    }

    /* Limit us to 87.5% of the registered memory (some fluff for QPs,
       file systems, etc) */
    return (max_reg * 7) >> 3;
}
static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
{
struct mca_mpool_base_resources_t mpool_resources;
@ -1458,8 +1572,10 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
}
device->mem_reg_active = 0;
/* NTH: set some high default until we know how many local peers we have */
device->mem_reg_max = 1ull << 48;
device->mem_reg_max = calculate_max_reg(ibv_get_device_name(ib_dev));
if(( 0 == device->mem_reg_max) && mca_btl_openib_component.abort_not_enough_reg_mem) {
return OPAL_ERROR;
}
device->ib_dev = ib_dev;
device->ib_dev_context = dev_context;

Просмотреть файл

@ -536,6 +536,10 @@ int btl_openib_register_mca_params(void)
"Maximum size (in bytes) of a single fragment of a long message when using the RDMA protocols (must be > 0 and <= hw capabilities).",
0, &mca_btl_openib_component.max_hw_msg_size, 0));
CHECK(reg_bool("allow_max_memory_registration", NULL,
"Allow maximum possible memory to register with HCA",
1, &mca_btl_openib_component.allow_max_memory_registration));
/* Help debug memory registration issues */
CHECK(reg_int("memory_registration_verbose", NULL,
"Output some verbose memory registration information "

Просмотреть файл

@ -23,7 +23,7 @@ AC_DEFUN([OPAL_CHECK_XPMEM], [
AC_ARG_WITH([xpmem],
[AC_HELP_STRING([--with-xpmem(=DIR)],
[Build with XPMEM kernel module support, searching for headers in DIR])])
OPAL_CHECK_WITHDIR([xpmem], [$with_xpmem], [include/xpmem.h include/sn/xpmem.h])
OPAL_CHECK_WITHDIR([xpmem], [$with_xpmem], [include/xpmem.h])
AC_ARG_WITH([xpmem-libdir],
[AC_HELP_STRING([--with-xpmem-libdir=DIR],
@ -41,7 +41,7 @@ AC_DEFUN([OPAL_CHECK_XPMEM], [
opal_check_xpmem_libdir="$with_xpmem_libdir"
fi
OPAL_CHECK_PACKAGE([$1],[xpmem.h sn/xpmem.h],[xpmem],[xpmem_make],[],
OPAL_CHECK_PACKAGE([$1],[xpmem.h],[xpmem],[xpmem_make],[],
[$opal_check_xpmem_dir],[$opal_check_xpmem_libdir], [opal_check_xpmem_happy="yes"], [])
if test "$opal_check_xpmem_happy" = "no" -a -n "$with_xpmem" -a "$with_xpmem" != "yes" ; then