Add supports for MPI_OP using AVX512, AVX2 and MMX
Add logic to handle different architectural capabilities Detect the compiler flags necessary to build specialized versions of the MPI_OP. Once the different flavors (AVX512, AVX2, AVX) are built, detect at runtime which is the best match with the current processor capabilities. Add validation checks for loadu 256 and 512 bits. Add validation tests for MPI_Op. Signed-off-by: Jeff Squyres <jsquyres@cisco.com> Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp> Signed-off-by: dongzhong <zhongdong0321@hotmail.com> Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Этот коммит содержится в:
родитель
a26e494953
Коммит
14b3c70628
@ -2,7 +2,7 @@ dnl
|
|||||||
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
dnl University Research and Technology
|
dnl University Research and Technology
|
||||||
dnl Corporation. All rights reserved.
|
dnl Corporation. All rights reserved.
|
||||||
dnl Copyright (c) 2004-2018 The University of Tennessee and The University
|
dnl Copyright (c) 2004-2020 The University of Tennessee and The University
|
||||||
dnl of Tennessee Research Foundation. All rights
|
dnl of Tennessee Research Foundation. All rights
|
||||||
dnl reserved.
|
dnl reserved.
|
||||||
dnl Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
|
dnl Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
|
||||||
@ -1255,7 +1255,7 @@ AC_DEFUN([OPAL_CONFIG_ASM],[
|
|||||||
|
|
||||||
# Check for RDTSCP support
|
# Check for RDTSCP support
|
||||||
result=0
|
result=0
|
||||||
AS_IF([test "$opal_cv_asm_arch" = "OPAL_X86_64" || test "$opal_cv_asm_arch" = "OPAL_IA32"],
|
AS_IF([test "$opal_cv_asm_arch" = "X86_64" || test "$opal_cv_asm_arch" = "IA32"],
|
||||||
[AC_MSG_CHECKING([for RDTSCP assembly support])
|
[AC_MSG_CHECKING([for RDTSCP assembly support])
|
||||||
AC_LANG_PUSH([C])
|
AC_LANG_PUSH([C])
|
||||||
AC_TRY_RUN([[
|
AC_TRY_RUN([[
|
||||||
|
101
ompi/mca/op/avx/Makefile.am
Обычный файл
101
ompi/mca/op/avx/Makefile.am
Обычный файл
@ -0,0 +1,101 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2019-2020 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2020 Research Organization for Information Science
|
||||||
|
# and Technology (RIST). All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# This component provide support for the Advanced Vector Extensions (AVX)
|
||||||
|
# available in recent versions of x86 processors.
|
||||||
|
#
|
||||||
|
# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
|
||||||
|
# for more details on how to make Open MPI components.
|
||||||
|
|
||||||
|
# First, list all .h and .c sources. It is necessary to list all .h
|
||||||
|
# files so that they will be picked up in the distribution tarball.
|
||||||
|
|
||||||
|
sources = op_avx_component.c op_avx.h
|
||||||
|
sources_extended = op_avx_functions.c
|
||||||
|
|
||||||
|
# Open MPI components can be compiled two ways:
|
||||||
|
#
|
||||||
|
# 1. As a standalone dynamic shared object (DSO), sometimes called a
|
||||||
|
# dynamically loadable library (DLL).
|
||||||
|
#
|
||||||
|
# 2. As a static library that is slurped up into the upper-level
|
||||||
|
# libmpi library (regardless of whether libmpi is a static or dynamic
|
||||||
|
# library). This is called a "Libtool convenience library".
|
||||||
|
#
|
||||||
|
# The component needs to create an output library in this top-level
|
||||||
|
# component directory, and named either mca_<type>_<name>.la (for DSO
|
||||||
|
# builds) or libmca_<type>_<name>.la (for static builds). The OMPI
|
||||||
|
# build system will have set the
|
||||||
|
# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
|
||||||
|
# which way this component should be built.
|
||||||
|
|
||||||
|
# We need to support all processors from early AVX to full AVX512 support, based on
|
||||||
|
# a decision made at runtime. So, we generate all combinations of capabilities, and
|
||||||
|
# we will select the most suitable (based on the processor flags) during the
|
||||||
|
# component initialization.
|
||||||
|
specialized_op_libs =
|
||||||
|
if MCA_BUILD_ompi_op_has_avx_support
|
||||||
|
specialized_op_libs += liblocal_ops_avx.la
|
||||||
|
liblocal_ops_avx_la_SOURCES = $(sources_extended)
|
||||||
|
liblocal_ops_avx_la_CFLAGS = @MCA_BUILD_OP_AVX_FLAGS@
|
||||||
|
liblocal_ops_avx_la_CPPFLAGS = -DGENERATE_AVX_CODE
|
||||||
|
if MCA_BUILD_ompi_op_has_sse3_support
|
||||||
|
liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE3_CODE
|
||||||
|
endif
|
||||||
|
if MCA_BUILD_ompi_op_has_sse41_support
|
||||||
|
liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE41_CODE
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
if MCA_BUILD_ompi_op_has_avx2_support
|
||||||
|
specialized_op_libs += liblocal_ops_avx2.la
|
||||||
|
liblocal_ops_avx2_la_SOURCES = $(sources_extended)
|
||||||
|
liblocal_ops_avx2_la_CFLAGS = @MCA_BUILD_OP_AVX2_FLAGS@
|
||||||
|
liblocal_ops_avx2_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE
|
||||||
|
endif
|
||||||
|
if MCA_BUILD_ompi_op_has_avx512_support
|
||||||
|
specialized_op_libs += liblocal_ops_avx512.la
|
||||||
|
liblocal_ops_avx512_la_SOURCES = $(sources_extended)
|
||||||
|
liblocal_ops_avx512_la_CFLAGS = @MCA_BUILD_OP_AVX512_FLAGS@
|
||||||
|
liblocal_ops_avx512_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE -DGENERATE_AVX512_CODE
|
||||||
|
endif
|
||||||
|
|
||||||
|
component_noinst = $(specialized_op_libs)
|
||||||
|
if MCA_BUILD_ompi_op_avx_DSO
|
||||||
|
component_install = mca_op_avx.la
|
||||||
|
else
|
||||||
|
component_install =
|
||||||
|
component_noinst += libmca_op_avx.la
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Specific information for DSO builds.
|
||||||
|
#
|
||||||
|
# The DSO should install itself in $(ompilibdir) (by default,
|
||||||
|
# $prefix/lib/openmpi).
|
||||||
|
|
||||||
|
mcacomponentdir = $(ompilibdir)
|
||||||
|
mcacomponent_LTLIBRARIES = $(component_install)
|
||||||
|
mca_op_avx_la_SOURCES = $(sources)
|
||||||
|
mca_op_avx_la_LIBADD = $(specialized_op_libs)
|
||||||
|
mca_op_avx_la_LDFLAGS = -module -avoid-version
|
||||||
|
|
||||||
|
|
||||||
|
# Specific information for static builds.
|
||||||
|
#
|
||||||
|
# Note that we *must* "noinst"; the upper-layer Makefile.am's will
|
||||||
|
# slurp in the resulting .la library into libmpi.
|
||||||
|
|
||||||
|
noinst_LTLIBRARIES = $(component_noinst)
|
||||||
|
libmca_op_avx_la_SOURCES = $(sources)
|
||||||
|
libmca_op_avx_la_LIBADD = $(specialized_op_libs)
|
||||||
|
libmca_op_avx_la_LDFLAGS = -module -avoid-version
|
||||||
|
|
265
ompi/mca/op/avx/configure.m4
Обычный файл
265
ompi/mca/op/avx/configure.m4
Обычный файл
@ -0,0 +1,265 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2019-2020 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved.
|
||||||
|
#
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# MCA_ompi_op_avx_CONFIG([action-if-can-compile],
|
||||||
|
# [action-if-cant-compile])
|
||||||
|
# ------------------------------------------------
|
||||||
|
# We can always build, unless we were explicitly disabled.
|
||||||
|
AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
|
||||||
|
AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])
|
||||||
|
|
||||||
|
MCA_BUILD_OP_AVX_FLAGS=""
|
||||||
|
MCA_BUILD_OP_AVX2_FLAGS=""
|
||||||
|
MCA_BUILD_OP_AVX512_FLAGS=""
|
||||||
|
op_sse3_support=0
|
||||||
|
op_sse41_support=0
|
||||||
|
op_avx_support=0
|
||||||
|
op_avx2_support=0
|
||||||
|
op_avx512_support=0
|
||||||
|
OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
|
||||||
|
|
||||||
|
AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
|
||||||
|
[AC_LANG_PUSH([C])
|
||||||
|
|
||||||
|
#
|
||||||
|
# Check for AVX512 support
|
||||||
|
#
|
||||||
|
AC_MSG_CHECKING([for AVX512 support (no additional flags)])
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
__m512 vA, vB;
|
||||||
|
_mm512_add_ps(vA, vB)
|
||||||
|
]])],
|
||||||
|
[op_avx512_support=1
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
|
||||||
|
AS_IF([test $op_avx512_support -eq 0],
|
||||||
|
[AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
|
||||||
|
op_avx_cflags_save="$CFLAGS"
|
||||||
|
CFLAGS="$CFLAGS -march=skylake-avx512"
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
__m512 vA, vB;
|
||||||
|
_mm512_add_ps(vA, vB)
|
||||||
|
]])],
|
||||||
|
[op_avx512_support=1
|
||||||
|
MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
CFLAGS="$op_avx_cflags_save"
|
||||||
|
])
|
||||||
|
#
|
||||||
|
# Some combination of gcc and older as would not correctly build the code generated by
|
||||||
|
# _mm256_loadu_si256. Screen them out.
|
||||||
|
#
|
||||||
|
AS_IF([test $op_avx512_support -eq 1],
|
||||||
|
[AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
|
||||||
|
op_avx_cflags_save="$CFLAGS"
|
||||||
|
CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
|
||||||
|
__m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
|
||||||
|
]])],
|
||||||
|
[AC_MSG_RESULT([yes])],
|
||||||
|
[op_avx512_support=0
|
||||||
|
MCA_BUILD_OP_AVX512_FLAGS=""
|
||||||
|
AC_MSG_RESULT([no])])
|
||||||
|
CFLAGS="$op_avx_cflags_save"
|
||||||
|
])
|
||||||
|
#
|
||||||
|
# Check support for AVX2
|
||||||
|
#
|
||||||
|
AC_MSG_CHECKING([for AVX2 support (no additional flags)])
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
__m256 vA, vB;
|
||||||
|
_mm256_add_ps(vA, vB)
|
||||||
|
]])],
|
||||||
|
[op_avx2_support=1
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
AS_IF([test $op_avx2_support -eq 0],
|
||||||
|
[AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
|
||||||
|
op_avx_cflags_save="$CFLAGS"
|
||||||
|
CFLAGS="$CFLAGS -mavx2"
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
__m256 vA, vB;
|
||||||
|
_mm256_add_ps(vA, vB)
|
||||||
|
]])],
|
||||||
|
[op_avx2_support=1
|
||||||
|
MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
CFLAGS="$op_avx_cflags_save"
|
||||||
|
])
|
||||||
|
#
|
||||||
|
# Some combination of gcc and older as would not correctly build the code generated by
|
||||||
|
# _mm256_loadu_si256. Screen them out.
|
||||||
|
#
|
||||||
|
AS_IF([test $op_avx2_support -eq 1],
|
||||||
|
[AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
|
||||||
|
op_avx_cflags_save="$CFLAGS"
|
||||||
|
CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
|
||||||
|
__m256i vA = _mm256_loadu_si256((__m256i*)&A)
|
||||||
|
]])],
|
||||||
|
[AC_MSG_RESULT([yes])],
|
||||||
|
[op_avx2_support=0
|
||||||
|
MCA_BUILD_OP_AVX2_FLAGS=""
|
||||||
|
AC_MSG_RESULT([no])])
|
||||||
|
CFLAGS="$op_avx_cflags_save"
|
||||||
|
])
|
||||||
|
#
|
||||||
|
# What about early AVX support. The rest of the logic is slightly different as
|
||||||
|
# we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
|
||||||
|
# if we can compile AVX code without a flag, then we validate that we have support
|
||||||
|
# for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
|
||||||
|
# the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
|
||||||
|
# instructions.
|
||||||
|
#
|
||||||
|
AC_MSG_CHECKING([for AVX support (no additional flags)])
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
__m128 vA, vB;
|
||||||
|
_mm_add_ps(vA, vB)
|
||||||
|
]])],
|
||||||
|
[op_avx_support=1
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
#
|
||||||
|
# Check for SSE4.1 support
|
||||||
|
#
|
||||||
|
AS_IF([test $op_avx_support -eq 1],
|
||||||
|
[AC_MSG_CHECKING([for SSE4.1 support])
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
__m128i vA, vB;
|
||||||
|
(void)_mm_max_epi8(vA, vB)
|
||||||
|
]])],
|
||||||
|
[op_sse41_support=1
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
])
|
||||||
|
#
|
||||||
|
# Check for SSE3 support
|
||||||
|
#
|
||||||
|
AS_IF([test $op_avx_support -eq 1],
|
||||||
|
[AC_MSG_CHECKING([for SSE3 support])
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
int A[4] = {0, 1, 2, 3};
|
||||||
|
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
|
||||||
|
]])],
|
||||||
|
[op_sse3_support=1
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
])
|
||||||
|
# Second pass, do we need to add the AVX flag ?
|
||||||
|
AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
|
||||||
|
[AC_MSG_CHECKING([for AVX support (with -mavx)])
|
||||||
|
op_avx_cflags_save="$CFLAGS"
|
||||||
|
CFLAGS="$CFLAGS -mavx"
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
__m128 vA, vB;
|
||||||
|
_mm_add_ps(vA, vB)
|
||||||
|
]])],
|
||||||
|
[op_avx_support=1
|
||||||
|
MCA_BUILD_OP_AVX_FLAGS="-mavx"
|
||||||
|
op_sse41_support=0
|
||||||
|
op_sse3_support=0
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
|
||||||
|
AS_IF([test $op_sse41_support -eq 0],
|
||||||
|
[AC_MSG_CHECKING([for SSE4.1 support])
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
__m128i vA, vB;
|
||||||
|
(void)_mm_max_epi8(vA, vB)
|
||||||
|
]])],
|
||||||
|
[op_sse41_support=1
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
])
|
||||||
|
AS_IF([test $op_sse3_support -eq 0],
|
||||||
|
[AC_MSG_CHECKING([for SSE3 support])
|
||||||
|
AC_LINK_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
|
||||||
|
[[
|
||||||
|
int A[4] = {0, 1, 2, 3};
|
||||||
|
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
|
||||||
|
]])],
|
||||||
|
[op_sse3_support=1
|
||||||
|
AC_MSG_RESULT([yes])],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
])
|
||||||
|
CFLAGS="$op_avx_cflags_save"
|
||||||
|
])
|
||||||
|
|
||||||
|
AC_LANG_POP([C])
|
||||||
|
])
|
||||||
|
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
|
||||||
|
[$op_avx512_support],
|
||||||
|
[AVX512 supported in the current build])
|
||||||
|
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX2],
|
||||||
|
[$op_avx2_support],
|
||||||
|
[AVX2 supported in the current build])
|
||||||
|
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX],
|
||||||
|
[$op_avx_support],
|
||||||
|
[AVX supported in the current build])
|
||||||
|
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE41],
|
||||||
|
[$op_sse41_support],
|
||||||
|
[SSE4.1 supported in the current build])
|
||||||
|
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE3],
|
||||||
|
[$op_sse3_support],
|
||||||
|
[SSE3 supported in the current build])
|
||||||
|
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx512_support],
|
||||||
|
[test "$op_avx512_support" == "1"])
|
||||||
|
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx2_support],
|
||||||
|
[test "$op_avx2_support" == "1"])
|
||||||
|
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx_support],
|
||||||
|
[test "$op_avx_support" == "1"])
|
||||||
|
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse41_support],
|
||||||
|
[test "$op_sse41_support" == "1"])
|
||||||
|
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse3_support],
|
||||||
|
[test "$op_sse3_support" == "1"])
|
||||||
|
AC_SUBST(MCA_BUILD_OP_AVX512_FLAGS)
|
||||||
|
AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
|
||||||
|
AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)
|
||||||
|
|
||||||
|
OPAL_VAR_SCOPE_POP
|
||||||
|
# Enable this component iff we have at least the most basic form of support
|
||||||
|
# for vectorial ISA
|
||||||
|
AS_IF([test $op_avx_support -eq 1 || test $op_avx2_support -eq 1 || test $op_avx512_support -eq 1],
|
||||||
|
[$1],
|
||||||
|
[$2])
|
||||||
|
|
||||||
|
])dnl
|
65
ompi/mca/op/avx/op_avx.h
Обычный файл
65
ompi/mca/op/avx/op_avx.h
Обычный файл
@ -0,0 +1,65 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2019-2020 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MCA_OP_AVX_EXPORT_H
|
||||||
|
#define MCA_OP_AVX_EXPORT_H
|
||||||
|
|
||||||
|
#include "ompi_config.h"
|
||||||
|
|
||||||
|
#include "ompi/mca/mca.h"
|
||||||
|
#include "opal/class/opal_object.h"
|
||||||
|
|
||||||
|
#include "ompi/mca/op/op.h"
|
||||||
|
|
||||||
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
|
#define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200
|
||||||
|
#define OMPI_OP_AVX_HAS_AVX512F_FLAG 0x00000100
|
||||||
|
#define OMPI_OP_AVX_HAS_AVX2_FLAG 0x00000020
|
||||||
|
#define OMPI_OP_AVX_HAS_AVX_FLAG 0x00000010
|
||||||
|
#define OMPI_OP_AVX_HAS_SSE4_1_FLAG 0x00000008
|
||||||
|
#define OMPI_OP_AVX_HAS_SSE3_FLAG 0x00000004
|
||||||
|
#define OMPI_OP_AVX_HAS_SSE2_FLAG 0x00000002
|
||||||
|
#define OMPI_OP_AVX_HAS_SSE_FLAG 0x00000001
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Derive a struct from the base op component struct, allowing us to
|
||||||
|
* cache some component-specific information on our well-known
|
||||||
|
* component struct.
|
||||||
|
*/
|
||||||
|
typedef struct {
|
||||||
|
/** The base op component struct */
|
||||||
|
ompi_op_base_component_1_0_0_t super;
|
||||||
|
|
||||||
|
/* What follows is avx-component-specific cached information. We
|
||||||
|
tend to use this scheme (caching information on the avx
|
||||||
|
component itself) instead of lots of individual global
|
||||||
|
variables for the component. The following data fields are
|
||||||
|
avxs; replace them with whatever is relevant for your
|
||||||
|
component. */
|
||||||
|
|
||||||
|
uint32_t flags; /* AVX capabilities supported by the processor */
|
||||||
|
} ompi_op_avx_component_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Globally exported variable. Note that it is a *avx* component
|
||||||
|
* (defined above), which has the ompi_op_base_component_t as its
|
||||||
|
* first member. Hence, the MCA/op framework will find the data that
|
||||||
|
* it expects in the first memory locations, but then the component
|
||||||
|
* itself can cache additional information after that that can be used
|
||||||
|
* by both the component and modules.
|
||||||
|
*/
|
||||||
|
OMPI_DECLSPEC extern ompi_op_avx_component_t
|
||||||
|
mca_op_avx_component;
|
||||||
|
|
||||||
|
END_C_DECLS
|
||||||
|
|
||||||
|
#endif /* MCA_OP_AVX_EXPORT_H */
|
295
ompi/mca/op/avx/op_avx_component.c
Обычный файл
295
ompi/mca/op/avx/op_avx_component.c
Обычный файл
@ -0,0 +1,295 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2019-2020 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2020 Research Organization for Information Science
|
||||||
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** @file
|
||||||
|
*
|
||||||
|
* This is the "avx" component source code.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "ompi_config.h"
|
||||||
|
|
||||||
|
#include "opal/util/printf.h"
|
||||||
|
|
||||||
|
#include "ompi/constants.h"
|
||||||
|
#include "ompi/op/op.h"
|
||||||
|
#include "ompi/mca/op/op.h"
|
||||||
|
#include "ompi/mca/op/base/base.h"
|
||||||
|
#include "ompi/mca/op/avx/op_avx.h"
|
||||||
|
|
||||||
|
static int avx_component_open(void);
|
||||||
|
static int avx_component_close(void);
|
||||||
|
static int avx_component_init_query(bool enable_progress_threads,
|
||||||
|
bool enable_mpi_thread_multiple);
|
||||||
|
static struct ompi_op_base_module_1_0_0_t *
|
||||||
|
avx_component_op_query(struct ompi_op_t *op, int *priority);
|
||||||
|
static int avx_component_register(void);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A slightly modified code from
|
||||||
|
* https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
|
||||||
|
*/
|
||||||
|
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
static uint32_t has_intel_AVX_features(void)
|
||||||
|
{
|
||||||
|
uint32_t flags = 0;
|
||||||
|
|
||||||
|
flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0;
|
||||||
|
flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
|
||||||
|
flags |= _may_i_use_cpu_feature(_FEATURE_AVX2) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0;
|
||||||
|
flags |= _may_i_use_cpu_feature(_FEATURE_AVX) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0;
|
||||||
|
flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
|
||||||
|
flags |= _may_i_use_cpu_feature(_FEATURE_SSE3) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0;
|
||||||
|
flags |= _may_i_use_cpu_feature(_FEATURE_SSE2) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0;
|
||||||
|
flags |= _may_i_use_cpu_feature(_FEATURE_SSE) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0;
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
#else /* non-Intel compiler */
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#include <intrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
|
||||||
|
{
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
__cpuidex(abcd, eax, ecx);
|
||||||
|
#else
|
||||||
|
uint32_t ebx = 0, edx = 0;
|
||||||
|
#if defined( __i386__ ) && defined ( __PIC__ )
|
||||||
|
/* in case of PIC under 32-bit EBX cannot be clobbered */
|
||||||
|
__asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
|
||||||
|
#else
|
||||||
|
__asm__ ( "cpuid" : "+b" (ebx),
|
||||||
|
#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
|
||||||
|
"+a" (eax), "+c" (ecx), "=d" (edx) );
|
||||||
|
abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint32_t has_intel_AVX_features(void)
|
||||||
|
{
|
||||||
|
/* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
|
||||||
|
const uint32_t avx512f_mask = (1U << 16); // AVX512F (EAX = 7, ECX = 0) : EBX
|
||||||
|
const uint32_t avx512_bw_mask = (1U << 30); // AVX512BW (EAX = 7, ECX = 0) : EBX
|
||||||
|
const uint32_t avx2_mask = (1U << 5); // AVX2 (EAX = 7, ECX = 0) : EBX
|
||||||
|
const uint32_t avx_mask = (1U << 28); // AVX (EAX = 1, ECX = 0) : ECX
|
||||||
|
const uint32_t sse4_1_mask = (1U << 19); // SSE4.1 (EAX = 1, ECX = 0) : ECX
|
||||||
|
const uint32_t sse3_mask = (1U << 0); // SSE3 (EAX = 1, ECX = 0) : ECX
|
||||||
|
const uint32_t sse2_mask = (1U << 26); // SSE2 (EAX = 1, ECX = 0) : EDX
|
||||||
|
const uint32_t sse_mask = (1U << 15); // SSE (EAX = 1, ECX = 0) : EDX
|
||||||
|
uint32_t flags = 0, abcd[4];
|
||||||
|
|
||||||
|
run_cpuid( 1, 0, abcd );
|
||||||
|
flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0;
|
||||||
|
flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
|
||||||
|
flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0;
|
||||||
|
flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0;
|
||||||
|
flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0;
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */
|
||||||
|
// OS supports extended processor state management ?
|
||||||
|
if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
|
||||||
|
return 0;
|
||||||
|
#endif /* defined(__APPLE__) */
|
||||||
|
|
||||||
|
run_cpuid( 7, 0, abcd );
|
||||||
|
flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0;
|
||||||
|
flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
|
||||||
|
flags |= (abcd[1] & avx2_mask) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0;
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
#endif /* non-Intel compiler */
|
||||||
|
|
||||||
|
ompi_op_avx_component_t mca_op_avx_component = {
|
||||||
|
{
|
||||||
|
.opc_version = {
|
||||||
|
OMPI_OP_BASE_VERSION_1_0_0,
|
||||||
|
|
||||||
|
.mca_component_name = "avx",
|
||||||
|
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
|
||||||
|
OMPI_RELEASE_VERSION),
|
||||||
|
.mca_open_component = avx_component_open,
|
||||||
|
.mca_close_component = avx_component_close,
|
||||||
|
.mca_register_component_params = avx_component_register,
|
||||||
|
},
|
||||||
|
.opc_data = {
|
||||||
|
/* The component is checkpoint ready */
|
||||||
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||||
|
},
|
||||||
|
|
||||||
|
.opc_init_query = avx_component_init_query,
|
||||||
|
.opc_op_query = avx_component_op_query,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Component open
|
||||||
|
*/
|
||||||
|
static int avx_component_open(void)
|
||||||
|
{
|
||||||
|
/* We checked the flags during register, so if they are set to
|
||||||
|
* zero either the architecture is not suitable or the user disabled
|
||||||
|
* AVX support.
|
||||||
|
*
|
||||||
|
* A first level check to see what level of AVX is available on the
|
||||||
|
* hardware.
|
||||||
|
*
|
||||||
|
* Note that if this function returns non-OMPI_SUCCESS, then this
|
||||||
|
* component won't even be shown in ompi_info output (which is
|
||||||
|
* probably not what you want).
|
||||||
|
*/
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Component close
|
||||||
|
*/
|
||||||
|
static int avx_component_close(void)
|
||||||
|
{
|
||||||
|
/* If avx was opened successfully, close it (i.e., release any
|
||||||
|
resources that may have been allocated on this component).
|
||||||
|
Note that _component_close() will always be called at the end
|
||||||
|
of the process, so it may have been after any/all of the other
|
||||||
|
component functions have been invoked (and possibly even after
|
||||||
|
modules have been created and/or destroyed). */
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Register MCA params.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
avx_component_register(void)
|
||||||
|
{
|
||||||
|
int32_t requested_flags = mca_op_avx_component.flags = has_intel_AVX_features();
|
||||||
|
(void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
|
||||||
|
"support",
|
||||||
|
"Level of SSE/MMX/AVX support to be used (combination of processor capabilities as follow SSE 0x01, SSE2 0x02, SSE3 0x04, SSE4.1 0x08, AVX 0x010, AVX2 0x020, AVX512F 0x100, AVX512BW 0x200) capped by the local architecture capabilities",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_6,
|
||||||
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
||||||
|
&mca_op_avx_component.flags);
|
||||||
|
mca_op_avx_component.flags &= requested_flags;
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Query whether this component wants to be used in this process.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
avx_component_init_query(bool enable_progress_threads,
|
||||||
|
bool enable_mpi_thread_multiple)
|
||||||
|
{
|
||||||
|
if( 0 == mca_op_avx_component.flags )
|
||||||
|
return OMPI_ERR_NOT_SUPPORTED;
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if OMPI_MCA_OP_HAVE_AVX512
|
||||||
|
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
|
||||||
|
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
|
||||||
|
#endif
|
||||||
|
#if OMPI_MCA_OP_HAVE_AVX2
|
||||||
|
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
|
||||||
|
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
|
||||||
|
#endif
|
||||||
|
#if OMPI_MCA_OP_HAVE_AVX
|
||||||
|
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
|
||||||
|
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
|
||||||
|
#endif
|
||||||
|
/*
|
||||||
|
* Query whether this component can be used for a specific op
|
||||||
|
*/
|
||||||
|
static struct ompi_op_base_module_1_0_0_t*
|
||||||
|
avx_component_op_query(struct ompi_op_t *op, int *priority)
|
||||||
|
{
|
||||||
|
ompi_op_base_module_t *module = NULL;
|
||||||
|
/* Sanity check -- although the framework should never invoke the
|
||||||
|
_component_op_query() on non-intrinsic MPI_Op's, we'll put a
|
||||||
|
check here just to be sure. */
|
||||||
|
if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (op->o_f_to_c_index) {
|
||||||
|
case OMPI_OP_BASE_FORTRAN_MAX:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_MIN:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_SUM:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_PROD:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_BOR:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_BAND:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_BXOR:
|
||||||
|
module = OBJ_NEW(ompi_op_base_module_t);
|
||||||
|
for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
|
||||||
|
#if OMPI_MCA_OP_HAVE_AVX512
|
||||||
|
if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX512F_FLAG ) {
|
||||||
|
module->opm_fns[i] = ompi_op_avx_functions_avx512[op->o_f_to_c_index][i];
|
||||||
|
module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx512[op->o_f_to_c_index][i];
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if OMPI_MCA_OP_HAVE_AVX2
|
||||||
|
if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX2_FLAG ) {
|
||||||
|
if( NULL == module->opm_fns[i] ) {
|
||||||
|
module->opm_fns[i] = ompi_op_avx_functions_avx2[op->o_f_to_c_index][i];
|
||||||
|
}
|
||||||
|
if( NULL == module->opm_3buff_fns[i] ) {
|
||||||
|
module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx2[op->o_f_to_c_index][i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if OMPI_MCA_OP_HAVE_AVX
|
||||||
|
if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX_FLAG ) {
|
||||||
|
if( NULL == module->opm_fns[i] ) {
|
||||||
|
module->opm_fns[i] = ompi_op_avx_functions_avx[op->o_f_to_c_index][i];
|
||||||
|
}
|
||||||
|
if( NULL == module->opm_3buff_fns[i] ) {
|
||||||
|
module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx[op->o_f_to_c_index][i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if( NULL != module->opm_fns[i] ) {
|
||||||
|
OBJ_RETAIN(module);
|
||||||
|
}
|
||||||
|
if( NULL != module->opm_3buff_fns[i] ) {
|
||||||
|
OBJ_RETAIN(module);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case OMPI_OP_BASE_FORTRAN_LAND:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_LOR:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_LXOR:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_MAXLOC:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_MINLOC:
|
||||||
|
case OMPI_OP_BASE_FORTRAN_REPLACE:
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* If we got a module from above, we'll return it. Otherwise,
|
||||||
|
we'll return NULL, indicating that this component does not want
|
||||||
|
to be considered for selection for this MPI_Op. Note that the
|
||||||
|
functions each returned a *avx* component pointer
|
||||||
|
(vs. a *base* component pointer -- where an *avx* component
|
||||||
|
is a base component plus some other module-specific cached
|
||||||
|
information), so we have to cast it to the right pointer type
|
||||||
|
before returning. */
|
||||||
|
if (NULL != module) {
|
||||||
|
*priority = 50;
|
||||||
|
}
|
||||||
|
return (ompi_op_base_module_1_0_0_t *) module;
|
||||||
|
}
|
1280
ompi/mca/op/avx/op_avx_functions.c
Обычный файл
1280
ompi/mca/op/avx/op_avx_functions.c
Обычный файл
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -38,7 +38,7 @@
|
|||||||
* This macro is for (out op in).
|
* This macro is for (out op in).
|
||||||
*/
|
*/
|
||||||
#define OP_FUNC(name, type_name, type, op) \
|
#define OP_FUNC(name, type_name, type, op) \
|
||||||
static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \
|
static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -58,7 +58,7 @@
|
|||||||
* This macro is for (out = op(out, in))
|
* This macro is for (out = op(out, in))
|
||||||
*/
|
*/
|
||||||
#define FUNC_FUNC(name, type_name, type) \
|
#define FUNC_FUNC(name, type_name, type) \
|
||||||
static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \
|
static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -86,7 +86,7 @@
|
|||||||
} ompi_op_predefined_##type_name##_t;
|
} ompi_op_predefined_##type_name##_t;
|
||||||
|
|
||||||
#define LOC_FUNC(name, type_name, op) \
|
#define LOC_FUNC(name, type_name, op) \
|
||||||
static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \
|
static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -110,7 +110,7 @@
|
|||||||
* not supports the corresponding complex number type.
|
* not supports the corresponding complex number type.
|
||||||
*/
|
*/
|
||||||
#define COMPLEX_SUM_FUNC(type_name, type) \
|
#define COMPLEX_SUM_FUNC(type_name, type) \
|
||||||
static void ompi_op_base_2buff_sum_##type_name(void *in, void *out, int *count, \
|
static void ompi_op_base_2buff_sum_##type_name(const void *in, void *out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -130,7 +130,7 @@
|
|||||||
* not supports the corresponding complex number type.
|
* not supports the corresponding complex number type.
|
||||||
*/
|
*/
|
||||||
#define COMPLEX_PROD_FUNC(type_name, type) \
|
#define COMPLEX_PROD_FUNC(type_name, type) \
|
||||||
static void ompi_op_base_2buff_prod_##type_name(void *in, void *out, int *count, \
|
static void ompi_op_base_2buff_prod_##type_name(const void *in, void *out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -652,8 +652,8 @@ LOC_FUNC(minloc, long_double_int, <)
|
|||||||
* routines, needed for some optimizations.
|
* routines, needed for some optimizations.
|
||||||
*/
|
*/
|
||||||
#define OP_FUNC_3BUF(name, type_name, type, op) \
|
#define OP_FUNC_3BUF(name, type_name, type, op) \
|
||||||
static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \
|
static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \
|
||||||
void * restrict in2, void * restrict out, int *count, \
|
const void * restrict in2, void * restrict out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -674,8 +674,8 @@ LOC_FUNC(minloc, long_double_int, <)
|
|||||||
* This macro is for (out = op(in1, in2))
|
* This macro is for (out = op(in1, in2))
|
||||||
*/
|
*/
|
||||||
#define FUNC_FUNC_3BUF(name, type_name, type) \
|
#define FUNC_FUNC_3BUF(name, type_name, type) \
|
||||||
static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \
|
static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \
|
||||||
void * restrict in2, void * restrict out, int *count, \
|
const void * restrict in2, void * restrict out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -707,8 +707,8 @@ LOC_FUNC(minloc, long_double_int, <)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#define LOC_FUNC_3BUF(name, type_name, op) \
|
#define LOC_FUNC_3BUF(name, type_name, op) \
|
||||||
static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \
|
static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \
|
||||||
void * restrict in2, void * restrict out, int *count, \
|
const void * restrict in2, void * restrict out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -737,8 +737,8 @@ LOC_FUNC(minloc, long_double_int, <)
|
|||||||
* not supports the corresponding complex number type.
|
* not supports the corresponding complex number type.
|
||||||
*/
|
*/
|
||||||
#define COMPLEX_SUM_FUNC_3BUF(type_name, type) \
|
#define COMPLEX_SUM_FUNC_3BUF(type_name, type) \
|
||||||
static void ompi_op_base_3buff_sum_##type_name(void * restrict in1, \
|
static void ompi_op_base_3buff_sum_##type_name(const void * restrict in1, \
|
||||||
void * restrict in2, void * restrict out, int *count, \
|
const void * restrict in2, void * restrict out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
@ -759,8 +759,8 @@ LOC_FUNC(minloc, long_double_int, <)
|
|||||||
* not supports the corresponding complex number type.
|
* not supports the corresponding complex number type.
|
||||||
*/
|
*/
|
||||||
#define COMPLEX_PROD_FUNC_3BUF(type_name, type) \
|
#define COMPLEX_PROD_FUNC_3BUF(type_name, type) \
|
||||||
static void ompi_op_base_3buff_prod_##type_name(void * restrict in1, \
|
static void ompi_op_base_3buff_prod_##type_name(const void * restrict in1, \
|
||||||
void * restrict in2, void * restrict out, int *count, \
|
const void * restrict in2, void * restrict out, int *count, \
|
||||||
struct ompi_datatype_t **dtype, \
|
struct ompi_datatype_t **dtype, \
|
||||||
struct ompi_op_base_module_1_0_0_t *module) \
|
struct ompi_op_base_module_1_0_0_t *module) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -14,6 +14,8 @@
|
|||||||
* rights reserved.
|
* rights reserved.
|
||||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2020 Research Organization for Information Science
|
||||||
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -206,8 +208,8 @@ int ompi_op_base_op_select(ompi_op_t *op)
|
|||||||
static int avail_op_compare(opal_list_item_t **itema,
|
static int avail_op_compare(opal_list_item_t **itema,
|
||||||
opal_list_item_t **itemb)
|
opal_list_item_t **itemb)
|
||||||
{
|
{
|
||||||
avail_op_t *availa = (avail_op_t *) itema;
|
avail_op_t *availa = (avail_op_t *) *itema;
|
||||||
avail_op_t *availb = (avail_op_t *) itemb;
|
avail_op_t *availb = (avail_op_t *) *itemb;
|
||||||
|
|
||||||
if (availa->ao_priority > availb->ao_priority) {
|
if (availa->ao_priority > availb->ao_priority) {
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -255,7 +255,7 @@ typedef struct ompi_op_base_module_1_0_0_t ompi_op_base_module_t;
|
|||||||
* repeated code, but it's better this way (and this typedef will
|
* repeated code, but it's better this way (and this typedef will
|
||||||
* never change, so there's not much of a maintenance worry).
|
* never change, so there's not much of a maintenance worry).
|
||||||
*/
|
*/
|
||||||
typedef void (*ompi_op_base_handler_fn_1_0_0_t)(void *, void *, int *,
|
typedef void (*ompi_op_base_handler_fn_1_0_0_t)(const void *, void *, int *,
|
||||||
struct ompi_datatype_t **,
|
struct ompi_datatype_t **,
|
||||||
struct ompi_op_base_module_1_0_0_t *);
|
struct ompi_op_base_module_1_0_0_t *);
|
||||||
|
|
||||||
@ -264,8 +264,8 @@ typedef ompi_op_base_handler_fn_1_0_0_t ompi_op_base_handler_fn_t;
|
|||||||
/*
|
/*
|
||||||
* Typedef for 3-buffer (two input and one output) op functions.
|
* Typedef for 3-buffer (two input and one output) op functions.
|
||||||
*/
|
*/
|
||||||
typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(void *,
|
typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(const void *,
|
||||||
void *,
|
const void *,
|
||||||
void *, int *,
|
void *, int *,
|
||||||
struct ompi_datatype_t **,
|
struct ompi_datatype_t **,
|
||||||
struct ompi_op_base_module_1_0_0_t *);
|
struct ompi_op_base_module_1_0_0_t *);
|
||||||
|
@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
if PROJECT_OMPI
|
if PROJECT_OMPI
|
||||||
MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data
|
MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data
|
||||||
MPI_CHECKS = to_self
|
MPI_CHECKS = to_self reduce_local
|
||||||
endif
|
endif
|
||||||
TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS)
|
TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS)
|
||||||
|
|
||||||
@ -96,5 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS)
|
|||||||
unpack_hetero_LDADD = \
|
unpack_hetero_LDADD = \
|
||||||
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
||||||
|
|
||||||
|
reduce_local_SOURCES = reduce_local.c
|
||||||
|
reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS)
|
||||||
|
reduce_local_LDADD = \
|
||||||
|
$(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
|
||||||
|
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
||||||
|
|
||||||
distclean:
|
distclean:
|
||||||
rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile
|
rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile
|
||||||
|
81
test/datatype/check_op.sh
Исполняемый файл
81
test/datatype/check_op.sh
Исполняемый файл
@ -0,0 +1,81 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -u
|
||||||
|
|
||||||
|
echo "ompi version with AVX512 -- Usage: arg1: count of elements, args2: 'i'|'u'|'f'|'d' : datatype: signed, unsigned, float, double. args3 size of type. args4 operation"
|
||||||
|
mpirun="mpirun --mca pml ob1 --mca btl vader,self"
|
||||||
|
# For SVE-architecture
|
||||||
|
# echo "$mpirun -mca op_sve_hardware_available 0 -mca op_avx_hardware_available 0 -np 1 Reduce_local_float 1048576 i 8 max"
|
||||||
|
|
||||||
|
# For X86_64 architectures
|
||||||
|
# echo "$mpirun -mca op_avx_support 0 -np 1 Reduce_local_float 1048576 i 8 max"
|
||||||
|
|
||||||
|
Orange="\033[0;33m"
|
||||||
|
Blue="\033[0;34m"
|
||||||
|
Purple="\033[0;35m"
|
||||||
|
Yellow="\e[1;33m"
|
||||||
|
|
||||||
|
NC="\e[m"
|
||||||
|
|
||||||
|
verbose=0
|
||||||
|
|
||||||
|
echo "=========Signed Integer type all operations & all sizes========"
|
||||||
|
echo ""
|
||||||
|
for op in max min sum prod band bor bxor; do
|
||||||
|
echo -e "\n===Operation $op test==="
|
||||||
|
for type_size in 8 16 32 64; do
|
||||||
|
for size in 0 1 7 15 31 63 127 130; do
|
||||||
|
foo=$((1024 * 1024 + $size))
|
||||||
|
echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * $type_size "
|
||||||
|
cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t i -s $type_size -o $op"
|
||||||
|
if test $verbose -eq 1 ; then echo $cmd; fi
|
||||||
|
eval $cmd
|
||||||
|
done
|
||||||
|
echo -e "\n\n"
|
||||||
|
done
|
||||||
|
echo -e "\n\n"
|
||||||
|
done
|
||||||
|
echo "=========Signed Integer type all operations & all sizes========"
|
||||||
|
echo -e "\n\n"
|
||||||
|
|
||||||
|
echo "=========Unsigned Integer type all operations & all sizes========"
|
||||||
|
echo ""
|
||||||
|
for op in max min sum prod band bor bxor; do
|
||||||
|
echo -e "\n===Operation $op test==="
|
||||||
|
for type_size in 8 16 32 64; do
|
||||||
|
for size in 0 1 7 15 31 63 127 130; do
|
||||||
|
foo=$((1024 * 1024 + $size))
|
||||||
|
echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * $type_size"
|
||||||
|
cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t u -s $type_size -o $op"
|
||||||
|
if test $verbose -eq 1 ; then echo $cmd; fi
|
||||||
|
eval $cmd
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
||||||
|
echo "=========Unsigned Integer type all operations & all sizes========"
|
||||||
|
echo -e "\n\n"
|
||||||
|
|
||||||
|
echo "=======Float type all operations========"
|
||||||
|
echo ""
|
||||||
|
for op in max min sum prod; do
|
||||||
|
for size in 1024 127 130; do
|
||||||
|
foo=$((1024 * 1024 + $size))
|
||||||
|
echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * 32"
|
||||||
|
cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t f -s 32 -o $op"
|
||||||
|
if test $verbose -eq 1 ; then echo $cmd; fi
|
||||||
|
eval $cmd
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "========Double type all operations========="
|
||||||
|
echo ""
|
||||||
|
for op in max min sum prod; do
|
||||||
|
for size in 1024 127 130; do
|
||||||
|
foo=$((1024 * 1024 + $size))
|
||||||
|
echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * 64"
|
||||||
|
cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t d -s 64 -o $op"
|
||||||
|
if test $verbose -eq 1 ; then echo $cmd; fi
|
||||||
|
eval $cmd
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
1425
test/datatype/reduce_local.c
Обычный файл
1425
test/datatype/reduce_local.c
Обычный файл
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Загрузка…
Ссылка в новой задаче
Block a user