1
1

Add supports for MPI_OP using AVX512, AVX2 and MMX

Add logic to handle different architectural capabilities
Detect the compiler flags necessary to build specialized
versions of the MPI_OP. Once the different flavors (AVX512,
AVX2, AVX) are built, detect at runtime which is the best
match with the current processor capabilities.

Add validation checks for loadu 256 and 512 bits.
Add validation tests for MPI_Op.

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
Signed-off-by: dongzhong <zhongdong0321@hotmail.com>
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
(cherry picked from commit 14b3c70628)
This commit is contained in:
dongzhong 2019-10-11 11:12:07 -04:00 committed by Jeff Squyres
parent 0d1af43851
commit b4e04bbd8a
12 changed files with 3623 additions and 17 deletions

View File

@ -2,7 +2,7 @@ dnl
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
dnl University Research and Technology
dnl Corporation. All rights reserved.
dnl Copyright (c) 2004-2018 The University of Tennessee and The University
dnl Copyright (c) 2004-2020 The University of Tennessee and The University
dnl of Tennessee Research Foundation. All rights
dnl reserved.
dnl Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -1245,7 +1245,7 @@ AC_MSG_ERROR([Can not continue.])
# Check for RDTSCP support
result=0
AS_IF([test "$opal_cv_asm_arch" = "OPAL_X86_64" || test "$opal_cv_asm_arch" = "OPAL_IA32"],
AS_IF([test "$opal_cv_asm_arch" = "X86_64" || test "$opal_cv_asm_arch" = "IA32"],
[AC_MSG_CHECKING([for RDTSCP assembly support])
AC_LANG_PUSH([C])
AC_TRY_RUN([[

101
ompi/mca/op/avx/Makefile.am Normal file
View File

@ -0,0 +1,101 @@
#
# Copyright (c) 2019-2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2020 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This component provide support for the Advanced Vector Extensions (AVX)
# available in recent versions of x86 processors.
#
# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
# for more details on how to make Open MPI components.
# First, list all .h and .c sources. It is necessary to list all .h
# files so that they will be picked up in the distribution tarball.
sources = op_avx_component.c op_avx.h
sources_extended = op_avx_functions.c
# Open MPI components can be compiled two ways:
#
# 1. As a standalone dynamic shared object (DSO), sometimes called a
# dynamically loadable library (DLL).
#
# 2. As a static library that is slurped up into the upper-level
# libmpi library (regardless of whether libmpi is a static or dynamic
# library). This is called a "Libtool convenience library".
#
# The component needs to create an output library in this top-level
# component directory, and named either mca_<type>_<name>.la (for DSO
# builds) or libmca_<type>_<name>.la (for static builds). The OMPI
# build system will have set the
# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
# which way this component should be built.
# We need to support all processors from early AVX to full AVX512 support, based on
# a decision made at runtime. So, we generate all combinations of capabilities, and
# we will select the most suitable (based on the processor flags) during the
# component initialization.
specialized_op_libs =
if MCA_BUILD_ompi_op_has_avx_support
specialized_op_libs += liblocal_ops_avx.la
liblocal_ops_avx_la_SOURCES = $(sources_extended)
liblocal_ops_avx_la_CFLAGS = @MCA_BUILD_OP_AVX_FLAGS@
liblocal_ops_avx_la_CPPFLAGS = -DGENERATE_AVX_CODE
if MCA_BUILD_ompi_op_has_sse3_support
liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE3_CODE
endif
if MCA_BUILD_ompi_op_has_sse41_support
liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE41_CODE
endif
endif
if MCA_BUILD_ompi_op_has_avx2_support
specialized_op_libs += liblocal_ops_avx2.la
liblocal_ops_avx2_la_SOURCES = $(sources_extended)
liblocal_ops_avx2_la_CFLAGS = @MCA_BUILD_OP_AVX2_FLAGS@
liblocal_ops_avx2_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE
endif
if MCA_BUILD_ompi_op_has_avx512_support
specialized_op_libs += liblocal_ops_avx512.la
liblocal_ops_avx512_la_SOURCES = $(sources_extended)
liblocal_ops_avx512_la_CFLAGS = @MCA_BUILD_OP_AVX512_FLAGS@
liblocal_ops_avx512_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE -DGENERATE_AVX512_CODE
endif
component_noinst = $(specialized_op_libs)
if MCA_BUILD_ompi_op_avx_DSO
component_install = mca_op_avx.la
else
component_install =
component_noinst += libmca_op_avx.la
endif
# Specific information for DSO builds.
#
# The DSO should install itself in $(ompilibdir) (by default,
# $prefix/lib/openmpi).
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_op_avx_la_SOURCES = $(sources)
mca_op_avx_la_LIBADD = $(specialized_op_libs)
mca_op_avx_la_LDFLAGS = -module -avoid-version
# Specific information for static builds.
#
# Note that we *must* "noinst"; the upper-layer Makefile.am's will
# slurp in the resulting .la library into libmpi.
noinst_LTLIBRARIES = $(component_noinst)
libmca_op_avx_la_SOURCES = $(sources)
libmca_op_avx_la_LIBADD = $(specialized_op_libs)
libmca_op_avx_la_LDFLAGS = -module -avoid-version

View File

@ -0,0 +1,265 @@
# -*- shell-script -*-
#
# Copyright (c) 2019-2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_op_avx_CONFIG([action-if-can-compile],
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])
MCA_BUILD_OP_AVX_FLAGS=""
MCA_BUILD_OP_AVX2_FLAGS=""
MCA_BUILD_OP_AVX512_FLAGS=""
op_sse3_support=0
op_sse41_support=0
op_avx_support=0
op_avx2_support=0
op_avx512_support=0
OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
[AC_LANG_PUSH([C])
#
# Check for AVX512 support
#
AC_MSG_CHECKING([for AVX512 support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m512 vA, vB;
_mm512_add_ps(vA, vB)
]])],
[op_avx512_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
AS_IF([test $op_avx512_support -eq 0],
[AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS -march=skylake-avx512"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m512 vA, vB;
_mm512_add_ps(vA, vB)
]])],
[op_avx512_support=1
MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
CFLAGS="$op_avx_cflags_save"
])
#
# Some combination of gcc and older as would not correctly build the code generated by
# _mm256_loadu_si256. Screen them out.
#
AS_IF([test $op_avx512_support -eq 1],
[AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
__m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
]])],
[AC_MSG_RESULT([yes])],
[op_avx512_support=0
MCA_BUILD_OP_AVX512_FLAGS=""
AC_MSG_RESULT([no])])
CFLAGS="$op_avx_cflags_save"
])
#
# Check support for AVX2
#
AC_MSG_CHECKING([for AVX2 support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m256 vA, vB;
_mm256_add_ps(vA, vB)
]])],
[op_avx2_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
AS_IF([test $op_avx2_support -eq 0],
[AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS -mavx2"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m256 vA, vB;
_mm256_add_ps(vA, vB)
]])],
[op_avx2_support=1
MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
CFLAGS="$op_avx_cflags_save"
])
#
# Some combination of gcc and older as would not correctly build the code generated by
# _mm256_loadu_si256. Screen them out.
#
AS_IF([test $op_avx2_support -eq 1],
[AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
__m256i vA = _mm256_loadu_si256((__m256i*)&A)
]])],
[AC_MSG_RESULT([yes])],
[op_avx2_support=0
MCA_BUILD_OP_AVX2_FLAGS=""
AC_MSG_RESULT([no])])
CFLAGS="$op_avx_cflags_save"
])
#
# What about early AVX support. The rest of the logic is slightly different as
# we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
# if we can compile AVX code without a flag, then we validate that we have support
# for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
# the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
# instructions.
#
AC_MSG_CHECKING([for AVX support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m128 vA, vB;
_mm_add_ps(vA, vB)
]])],
[op_avx_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
#
# Check for SSE4.1 support
#
AS_IF([test $op_avx_support -eq 1],
[AC_MSG_CHECKING([for SSE4.1 support])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m128i vA, vB;
(void)_mm_max_epi8(vA, vB)
]])],
[op_sse41_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
])
#
# Check for SSE3 support
#
AS_IF([test $op_avx_support -eq 1],
[AC_MSG_CHECKING([for SSE3 support])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[4] = {0, 1, 2, 3};
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
]])],
[op_sse3_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
])
# Second pass, do we need to add the AVX flag ?
AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
[AC_MSG_CHECKING([for AVX support (with -mavx)])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS -mavx"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m128 vA, vB;
_mm_add_ps(vA, vB)
]])],
[op_avx_support=1
MCA_BUILD_OP_AVX_FLAGS="-mavx"
op_sse41_support=0
op_sse3_support=0
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
AS_IF([test $op_sse41_support -eq 0],
[AC_MSG_CHECKING([for SSE4.1 support])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m128i vA, vB;
(void)_mm_max_epi8(vA, vB)
]])],
[op_sse41_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
])
AS_IF([test $op_sse3_support -eq 0],
[AC_MSG_CHECKING([for SSE3 support])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[4] = {0, 1, 2, 3};
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
]])],
[op_sse3_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
])
CFLAGS="$op_avx_cflags_save"
])
AC_LANG_POP([C])
])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
[$op_avx512_support],
[AVX512 supported in the current build])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX2],
[$op_avx2_support],
[AVX2 supported in the current build])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX],
[$op_avx_support],
[AVX supported in the current build])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE41],
[$op_sse41_support],
[SSE4.1 supported in the current build])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE3],
[$op_sse3_support],
[SSE3 supported in the current build])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx512_support],
[test "$op_avx512_support" == "1"])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx2_support],
[test "$op_avx2_support" == "1"])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx_support],
[test "$op_avx_support" == "1"])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse41_support],
[test "$op_sse41_support" == "1"])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse3_support],
[test "$op_sse3_support" == "1"])
AC_SUBST(MCA_BUILD_OP_AVX512_FLAGS)
AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)
OPAL_VAR_SCOPE_POP
# Enable this component iff we have at least the most basic form of support
# for vectorial ISA
AS_IF([test $op_avx_support -eq 1 || test $op_avx2_support -eq 1 || test $op_avx512_support -eq 1],
[$1],
[$2])
])dnl

65
ompi/mca/op/avx/op_avx.h Normal file
View File

@ -0,0 +1,65 @@
/*
* Copyright (c) 2019-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_OP_AVX_EXPORT_H
#define MCA_OP_AVX_EXPORT_H
#include "ompi_config.h"
#include "ompi/mca/mca.h"
#include "opal/class/opal_object.h"
#include "ompi/mca/op/op.h"
BEGIN_C_DECLS
#define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200
#define OMPI_OP_AVX_HAS_AVX512F_FLAG 0x00000100
#define OMPI_OP_AVX_HAS_AVX2_FLAG 0x00000020
#define OMPI_OP_AVX_HAS_AVX_FLAG 0x00000010
#define OMPI_OP_AVX_HAS_SSE4_1_FLAG 0x00000008
#define OMPI_OP_AVX_HAS_SSE3_FLAG 0x00000004
#define OMPI_OP_AVX_HAS_SSE2_FLAG 0x00000002
#define OMPI_OP_AVX_HAS_SSE_FLAG 0x00000001
/**
* Derive a struct from the base op component struct, allowing us to
* cache some component-specific information on our well-known
* component struct.
*/
typedef struct {
/** The base op component struct */
ompi_op_base_component_1_0_0_t super;
/* What follows is avx-component-specific cached information. We
tend to use this scheme (caching information on the avx
component itself) instead of lots of individual global
variables for the component. The following data fields are
avxs; replace them with whatever is relevant for your
component. */
uint32_t flags; /* AVX capabilities supported by the processor */
} ompi_op_avx_component_t;
/**
* Globally exported variable. Note that it is a *avx* component
* (defined above), which has the ompi_op_base_component_t as its
* first member. Hence, the MCA/op framework will find the data that
* it expects in the first memory locations, but then the component
* itself can cache additional information after that that can be used
* by both the component and modules.
*/
OMPI_DECLSPEC extern ompi_op_avx_component_t
mca_op_avx_component;
END_C_DECLS
#endif /* MCA_OP_AVX_EXPORT_H */

View File

@ -0,0 +1,295 @@
/*
* Copyright (c) 2019-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file
*
* This is the "avx" component source code.
*
*/
#include "ompi_config.h"
#include "opal/util/printf.h"
#include "ompi/constants.h"
#include "ompi/op/op.h"
#include "ompi/mca/op/op.h"
#include "ompi/mca/op/base/base.h"
#include "ompi/mca/op/avx/op_avx.h"
static int avx_component_open(void);
static int avx_component_close(void);
static int avx_component_init_query(bool enable_progress_threads,
bool enable_mpi_thread_multiple);
static struct ompi_op_base_module_1_0_0_t *
avx_component_op_query(struct ompi_op_t *op, int *priority);
static int avx_component_register(void);
/**
* A slightly modified code from
* https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
*/
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
#include <immintrin.h>
static uint32_t has_intel_AVX_features(void)
{
uint32_t flags = 0;
flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0;
flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
flags |= _may_i_use_cpu_feature(_FEATURE_AVX2) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0;
flags |= _may_i_use_cpu_feature(_FEATURE_AVX) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0;
flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
flags |= _may_i_use_cpu_feature(_FEATURE_SSE3) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0;
flags |= _may_i_use_cpu_feature(_FEATURE_SSE2) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0;
flags |= _may_i_use_cpu_feature(_FEATURE_SSE) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0;
return flags;
}
#else /* non-Intel compiler */
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
{
#if defined(_MSC_VER)
__cpuidex(abcd, eax, ecx);
#else
uint32_t ebx = 0, edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ )
/* in case of PIC under 32-bit EBX cannot be clobbered */
__asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
#else
__asm__ ( "cpuid" : "+b" (ebx),
#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
"+a" (eax), "+c" (ecx), "=d" (edx) );
abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif
}
static uint32_t has_intel_AVX_features(void)
{
/* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
const uint32_t avx512f_mask = (1U << 16); // AVX512F (EAX = 7, ECX = 0) : EBX
const uint32_t avx512_bw_mask = (1U << 30); // AVX512BW (EAX = 7, ECX = 0) : EBX
const uint32_t avx2_mask = (1U << 5); // AVX2 (EAX = 7, ECX = 0) : EBX
const uint32_t avx_mask = (1U << 28); // AVX (EAX = 1, ECX = 0) : ECX
const uint32_t sse4_1_mask = (1U << 19); // SSE4.1 (EAX = 1, ECX = 0) : ECX
const uint32_t sse3_mask = (1U << 0); // SSE3 (EAX = 1, ECX = 0) : ECX
const uint32_t sse2_mask = (1U << 26); // SSE2 (EAX = 1, ECX = 0) : EDX
const uint32_t sse_mask = (1U << 15); // SSE (EAX = 1, ECX = 0) : EDX
uint32_t flags = 0, abcd[4];
run_cpuid( 1, 0, abcd );
flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0;
flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0;
flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0;
flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0;
#if defined(__APPLE__)
uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */
// OS supports extended processor state management ?
if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
return 0;
#endif /* defined(__APPLE__) */
run_cpuid( 7, 0, abcd );
flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0;
flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
flags |= (abcd[1] & avx2_mask) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0;
return flags;
}
#endif /* non-Intel compiler */
ompi_op_avx_component_t mca_op_avx_component = {
{
.opc_version = {
OMPI_OP_BASE_VERSION_1_0_0,
.mca_component_name = "avx",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
.mca_open_component = avx_component_open,
.mca_close_component = avx_component_close,
.mca_register_component_params = avx_component_register,
},
.opc_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
.opc_init_query = avx_component_init_query,
.opc_op_query = avx_component_op_query,
},
};
/*
* Component open
*/
static int avx_component_open(void)
{
/* We checked the flags during register, so if they are set to
* zero either the architecture is not suitable or the user disabled
* AVX support.
*
* A first level check to see what level of AVX is available on the
* hardware.
*
* Note that if this function returns non-OMPI_SUCCESS, then this
* component won't even be shown in ompi_info output (which is
* probably not what you want).
*/
return OMPI_SUCCESS;
}
/*
* Component close
*/
static int avx_component_close(void)
{
/* If avx was opened successfully, close it (i.e., release any
resources that may have been allocated on this component).
Note that _component_close() will always be called at the end
of the process, so it may have been after any/all of the other
component functions have been invoked (and possibly even after
modules have been created and/or destroyed). */
return OMPI_SUCCESS;
}
/*
* Register MCA params.
*/
static int
avx_component_register(void)
{
int32_t requested_flags = mca_op_avx_component.flags = has_intel_AVX_features();
(void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
"support",
"Level of SSE/MMX/AVX support to be used (combination of processor capabilities as follow SSE 0x01, SSE2 0x02, SSE3 0x04, SSE4.1 0x08, AVX 0x010, AVX2 0x020, AVX512F 0x100, AVX512BW 0x200) capped by the local architecture capabilities",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_op_avx_component.flags);
mca_op_avx_component.flags &= requested_flags;
return OMPI_SUCCESS;
}
/*
* Query whether this component wants to be used in this process.
*/
static int
avx_component_init_query(bool enable_progress_threads,
bool enable_mpi_thread_multiple)
{
if( 0 == mca_op_avx_component.flags )
return OMPI_ERR_NOT_SUPPORTED;
return OMPI_SUCCESS;
}
#if OMPI_MCA_OP_HAVE_AVX512
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
#if OMPI_MCA_OP_HAVE_AVX2
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
#if OMPI_MCA_OP_HAVE_AVX
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
/*
* Query whether this component can be used for a specific op
*/
static struct ompi_op_base_module_1_0_0_t*
avx_component_op_query(struct ompi_op_t *op, int *priority)
{
ompi_op_base_module_t *module = NULL;
/* Sanity check -- although the framework should never invoke the
_component_op_query() on non-intrinsic MPI_Op's, we'll put a
check here just to be sure. */
if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
return NULL;
}
switch (op->o_f_to_c_index) {
case OMPI_OP_BASE_FORTRAN_MAX:
case OMPI_OP_BASE_FORTRAN_MIN:
case OMPI_OP_BASE_FORTRAN_SUM:
case OMPI_OP_BASE_FORTRAN_PROD:
case OMPI_OP_BASE_FORTRAN_BOR:
case OMPI_OP_BASE_FORTRAN_BAND:
case OMPI_OP_BASE_FORTRAN_BXOR:
module = OBJ_NEW(ompi_op_base_module_t);
for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
#if OMPI_MCA_OP_HAVE_AVX512
if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX512F_FLAG ) {
module->opm_fns[i] = ompi_op_avx_functions_avx512[op->o_f_to_c_index][i];
module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx512[op->o_f_to_c_index][i];
}
#endif
#if OMPI_MCA_OP_HAVE_AVX2
if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX2_FLAG ) {
if( NULL == module->opm_fns[i] ) {
module->opm_fns[i] = ompi_op_avx_functions_avx2[op->o_f_to_c_index][i];
}
if( NULL == module->opm_3buff_fns[i] ) {
module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx2[op->o_f_to_c_index][i];
}
}
#endif
#if OMPI_MCA_OP_HAVE_AVX
if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX_FLAG ) {
if( NULL == module->opm_fns[i] ) {
module->opm_fns[i] = ompi_op_avx_functions_avx[op->o_f_to_c_index][i];
}
if( NULL == module->opm_3buff_fns[i] ) {
module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx[op->o_f_to_c_index][i];
}
}
#endif
if( NULL != module->opm_fns[i] ) {
OBJ_RETAIN(module);
}
if( NULL != module->opm_3buff_fns[i] ) {
OBJ_RETAIN(module);
}
}
break;
case OMPI_OP_BASE_FORTRAN_LAND:
case OMPI_OP_BASE_FORTRAN_LOR:
case OMPI_OP_BASE_FORTRAN_LXOR:
case OMPI_OP_BASE_FORTRAN_MAXLOC:
case OMPI_OP_BASE_FORTRAN_MINLOC:
case OMPI_OP_BASE_FORTRAN_REPLACE:
default:
break;
}
/* If we got a module from above, we'll return it. Otherwise,
we'll return NULL, indicating that this component does not want
to be considered for selection for this MPI_Op. Note that the
functions each returned a *avx* component pointer
(vs. a *base* component pointer -- where an *avx* component
is a base component plus some other module-specific cached
information), so we have to cast it to the right pointer type
before returning. */
if (NULL != module) {
*priority = 50;
}
return (ompi_op_base_module_1_0_0_t *) module;
}

File diff suppressed because it is too large Load Diff

View File

@ -37,7 +37,7 @@
* This macro is for (out op in).
*/
#define OP_FUNC(name, type_name, type, op) \
static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \
static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
@ -57,7 +57,7 @@
* This macro is for (out = op(out, in))
*/
#define FUNC_FUNC(name, type_name, type) \
static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \
static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
@ -85,7 +85,7 @@
} ompi_op_predefined_##type_name##_t;
#define LOC_FUNC(name, type_name, op) \
static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \
static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
@ -102,6 +102,49 @@
} \
}
/*
* Define a function to calculate sum of complex numbers using a real
* number floating-point type (float, double, etc.). This macro is used
* when the compiler supports a real number floating-point type but does
* not supports the corresponding complex number type.
*/
#define COMPLEX_SUM_FUNC(type_name, type) \
static void ompi_op_base_2buff_sum_##type_name(const void *in, void *out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
int i; \
type (*a)[2] = (type (*)[2]) in; \
type (*b)[2] = (type (*)[2]) out; \
for (i = 0; i < *count; ++i, ++a, ++b) { \
(*b)[0] += (*a)[0]; \
(*b)[1] += (*a)[1]; \
} \
}
/*
* Define a function to calculate product of complex numbers using a real
* number floating-point type (float, double, etc.). This macro is used
* when the compiler supports a real number floating-point type but does
* not supports the corresponding complex number type.
*/
#define COMPLEX_PROD_FUNC(type_name, type) \
static void ompi_op_base_2buff_prod_##type_name(const void *in, void *out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
int i; \
type (*a)[2] = (type (*)[2]) in; \
type (*b)[2] = (type (*)[2]) out; \
type c[2]; \
for (i = 0; i < *count; ++i, ++a, ++b) { \
c[0] = (*a)[0] * (*b)[0] - (*a)[1] * (*b)[1]; \
c[1] = (*a)[0] * (*b)[1] + (*a)[1] * (*b)[0]; \
(*b)[0] = c[0]; \
(*b)[1] = c[1]; \
} \
}
/*************************************************************************
* Max
*************************************************************************/
@ -604,8 +647,8 @@ LOC_FUNC(minloc, long_double_int, <)
* routines, needed for some optimizations.
*/
#define OP_FUNC_3BUF(name, type_name, type, op) \
static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \
void * restrict in2, void * restrict out, int *count, \
static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \
const void * restrict in2, void * restrict out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
@ -626,8 +669,8 @@ LOC_FUNC(minloc, long_double_int, <)
* This macro is for (out = op(in1, in2))
*/
#define FUNC_FUNC_3BUF(name, type_name, type) \
static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \
void * restrict in2, void * restrict out, int *count, \
static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \
const void * restrict in2, void * restrict out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
@ -659,8 +702,8 @@ LOC_FUNC(minloc, long_double_int, <)
*/
#define LOC_FUNC_3BUF(name, type_name, op) \
static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \
void * restrict in2, void * restrict out, int *count, \
static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \
const void * restrict in2, void * restrict out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
@ -682,6 +725,50 @@ LOC_FUNC(minloc, long_double_int, <)
} \
}
/*
* Define a function to calculate sum of complex numbers using a real
* number floating-point type (float, double, etc.). This macro is used
* when the compiler supports a real number floating-point type but does
* not supports the corresponding complex number type.
*/
#define COMPLEX_SUM_FUNC_3BUF(type_name, type) \
static void ompi_op_base_3buff_sum_##type_name(const void * restrict in1, \
const void * restrict in2, void * restrict out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
int i; \
type (*a1)[2] = (type (*)[2]) in1; \
type (*a2)[2] = (type (*)[2]) in2; \
type (*b)[2] = (type (*)[2]) out; \
for (i = 0; i < *count; ++i, ++a1, ++a2, ++b) { \
(*b)[0] = (*a1)[0] + (*a2)[0]; \
(*b)[1] = (*a1)[1] + (*a2)[1]; \
} \
}
/*
* Define a function to calculate product of complex numbers using a real
* number floating-point type (float, double, etc.). This macro is used
* when the compiler supports a real number floating-point type but does
* not supports the corresponding complex number type.
*/
#define COMPLEX_PROD_FUNC_3BUF(type_name, type) \
static void ompi_op_base_3buff_prod_##type_name(const void * restrict in1, \
const void * restrict in2, void * restrict out, int *count, \
struct ompi_datatype_t **dtype, \
struct ompi_op_base_module_1_0_0_t *module) \
{ \
int i; \
type (*a1)[2] = (type (*)[2]) in1; \
type (*a2)[2] = (type (*)[2]) in2; \
type (*b)[2] = (type (*)[2]) out; \
for (i = 0; i < *count; ++i, ++a1, ++a2, ++b) { \
(*b)[0] = (*a1)[0] * (*a2)[0] - (*a1)[1] * (*a2)[1]; \
(*b)[1] = (*a1)[0] * (*a2)[1] + (*a1)[1] * (*a2)[0]; \
} \
}
/*************************************************************************
* Max
*************************************************************************/

View File

@ -14,6 +14,8 @@
* rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2020 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -206,8 +208,8 @@ int ompi_op_base_op_select(ompi_op_t *op)
static int avail_op_compare(opal_list_item_t **itema,
opal_list_item_t **itemb)
{
avail_op_t *availa = (avail_op_t *) itema;
avail_op_t *availb = (avail_op_t *) itemb;
avail_op_t *availa = (avail_op_t *) *itema;
avail_op_t *availb = (avail_op_t *) *itemb;
if (availa->ao_priority > availb->ao_priority) {
return 1;

View File

@ -250,7 +250,7 @@ typedef struct ompi_op_base_module_1_0_0_t ompi_op_base_module_t;
* repeated code, but it's better this way (and this typedef will
* never change, so there's not much of a maintenance worry).
*/
typedef void (*ompi_op_base_handler_fn_1_0_0_t)(void *, void *, int *,
typedef void (*ompi_op_base_handler_fn_1_0_0_t)(const void *, void *, int *,
struct ompi_datatype_t **,
struct ompi_op_base_module_1_0_0_t *);
@ -259,8 +259,8 @@ typedef ompi_op_base_handler_fn_1_0_0_t ompi_op_base_handler_fn_t;
/*
* Typedef for 3-buffer (two input and one output) op functions.
*/
typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(void *,
void *,
typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(const void *,
const void *,
void *, int *,
struct ompi_datatype_t **,
struct ompi_op_base_module_1_0_0_t *);

View File

@ -16,7 +16,7 @@
if PROJECT_OMPI
MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data
MPI_CHECKS = to_self
MPI_CHECKS = to_self reduce_local
endif
TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS)
@ -96,5 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS)
unpack_hetero_LDADD = \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
reduce_local_SOURCES = reduce_local.c
reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS)
reduce_local_LDADD = \
$(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
distclean:
rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile

81
test/datatype/check_op.sh Executable file
View File

@ -0,0 +1,81 @@
#!/bin/bash
set -u
echo "ompi version with AVX512 -- Usage: arg1: count of elements, args2: 'i'|'u'|'f'|'d' : datatype: signed, unsigned, float, double. args3 size of type. args4 operation"
mpirun="mpirun --mca pml ob1 --mca btl vader,self"
# For SVE-architecture
# echo "$mpirun -mca op_sve_hardware_available 0 -mca op_avx_hardware_available 0 -np 1 Reduce_local_float 1048576 i 8 max"
# For X86_64 architectures
# echo "$mpirun -mca op_avx_support 0 -np 1 Reduce_local_float 1048576 i 8 max"
Orange="\033[0;33m"
Blue="\033[0;34m"
Purple="\033[0;35m"
Yellow="\e[1;33m"
NC="\e[m"
verbose=0
echo "=========Signed Integer type all operations & all sizes========"
echo ""
for op in max min sum prod band bor bxor; do
echo -e "\n===Operation $op test==="
for type_size in 8 16 32 64; do
for size in 0 1 7 15 31 63 127 130; do
foo=$((1024 * 1024 + $size))
echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * $type_size "
cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t i -s $type_size -o $op"
if test $verbose -eq 1 ; then echo $cmd; fi
eval $cmd
done
echo -e "\n\n"
done
echo -e "\n\n"
done
echo "=========Signed Integer type all operations & all sizes========"
echo -e "\n\n"
echo "=========Unsigned Integer type all operations & all sizes========"
echo ""
for op in max min sum prod band bor bxor; do
echo -e "\n===Operation $op test==="
for type_size in 8 16 32 64; do
for size in 0 1 7 15 31 63 127 130; do
foo=$((1024 * 1024 + $size))
echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * $type_size"
cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t u -s $type_size -o $op"
if test $verbose -eq 1 ; then echo $cmd; fi
eval $cmd
done
done
done
echo "=========Unsigned Integer type all operations & all sizes========"
echo -e "\n\n"
echo "=======Float type all operations========"
echo ""
for op in max min sum prod; do
for size in 1024 127 130; do
foo=$((1024 * 1024 + $size))
echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * 32"
cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t f -s 32 -o $op"
if test $verbose -eq 1 ; then echo $cmd; fi
eval $cmd
done
done
echo "========Double type all operations========="
echo ""
for op in max min sum prod; do
for size in 1024 127 130; do
foo=$((1024 * 1024 + $size))
echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * 64"
cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t d -s 64 -o $op"
if test $verbose -eq 1 ; then echo $cmd; fi
eval $cmd
done
done

1425
test/datatype/reduce_local.c Normal file

File diff suppressed because it is too large Load Diff