From 14b3c706289cfc26e53b13efd3eb85641636e459 Mon Sep 17 00:00:00 2001
From: dongzhong
Date: Fri, 11 Oct 2019 11:12:07 -0400
Subject: [PATCH] Add support for MPI_OP using AVX512, AVX2 and MMX

Add logic to handle different architectural capabilities

Detect the compiler flags necessary to build specialized versions of
the MPI_OP. Once the different flavors (AVX512, AVX2, AVX) are built,
detect at runtime which is the best match with the current processor
capabilities.

Add validation checks for loadu 256 and 512 bits.
Add validation tests for MPI_Op.

Signed-off-by: Jeff Squyres
Signed-off-by: Gilles Gouaillardet
Signed-off-by: dongzhong
Signed-off-by: George Bosilca
---
 config/opal_config_asm.m4            |    4 +-
 ompi/mca/op/avx/Makefile.am          |  101 ++
 ompi/mca/op/avx/configure.m4         |  265 +++++
 ompi/mca/op/avx/op_avx.h             |   65 ++
 ompi/mca/op/avx/op_avx_component.c   |  295 ++++++
 ompi/mca/op/avx/op_avx_functions.c   | 1280 +++++++++++++++++++++++
 ompi/mca/op/base/op_base_functions.c |   30 +-
 ompi/mca/op/base/op_base_op_select.c |    6 +-
 ompi/mca/op/op.h                     |    6 +-
 test/datatype/Makefile.am            |    8 +-
 test/datatype/check_op.sh            |   81 ++
 test/datatype/reduce_local.c         | 1425 ++++++++++++++++++++++++++
 12 files changed, 3543 insertions(+), 23 deletions(-)
 create mode 100644 ompi/mca/op/avx/Makefile.am
 create mode 100644 ompi/mca/op/avx/configure.m4
 create mode 100644 ompi/mca/op/avx/op_avx.h
 create mode 100644 ompi/mca/op/avx/op_avx_component.c
 create mode 100644 ompi/mca/op/avx/op_avx_functions.c
 create mode 100755 test/datatype/check_op.sh
 create mode 100644 test/datatype/reduce_local.c

diff --git a/config/opal_config_asm.m4 b/config/opal_config_asm.m4
index d692e1d349..14ad4e2d81 100644
--- a/config/opal_config_asm.m4
+++ b/config/opal_config_asm.m4
@@ -2,7 +2,7 @@ dnl
 dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 dnl                         University Research and Technology
 dnl                         Corporation.  All rights reserved.
-dnl Copyright (c) 2004-2018 The University of Tennessee and The University
+dnl Copyright (c) 2004-2020 The University of Tennessee and The University
 dnl                         of Tennessee Research Foundation.  All rights
 dnl                         reserved.
 dnl Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -1255,7 +1255,7 @@ AC_DEFUN([OPAL_CONFIG_ASM],[

     # Check for RDTSCP support
     result=0
-    AS_IF([test "$opal_cv_asm_arch" = "OPAL_X86_64" || test "$opal_cv_asm_arch" = "OPAL_IA32"],
+    AS_IF([test "$opal_cv_asm_arch" = "X86_64" || test "$opal_cv_asm_arch" = "IA32"],
          [AC_MSG_CHECKING([for RDTSCP assembly support])
           AC_LANG_PUSH([C])
           AC_TRY_RUN([[
diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am
new file mode 100644
index 0000000000..41dcf2e183
--- /dev/null
+++ b/ompi/mca/op/avx/Makefile.am
@@ -0,0 +1,101 @@
+#
+# Copyright (c) 2019-2020 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2020      Research Organization for Information Science
+#                         and Technology (RIST).  All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# This component provides support for the Advanced Vector Extensions (AVX)
+# available in recent versions of x86 processors.
+#
+# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
+# for more details on how to make Open MPI components.
+
+# First, list all .h and .c sources.  It is necessary to list all .h
+# files so that they will be picked up in the distribution tarball.
+
+sources = op_avx_component.c op_avx.h
+sources_extended = op_avx_functions.c
+
+# Open MPI components can be compiled two ways:
+#
+# 1. As a standalone dynamic shared object (DSO), sometimes called a
+# dynamically loadable library (DLL).
+#
+# 2. As a static library that is slurped up into the upper-level
+# libmpi library (regardless of whether libmpi is a static or dynamic
+# library).  This is called a "Libtool convenience library".
+#
+# The component needs to create an output library in this top-level
+# component directory, and named either mca_<type>_<name>.la (for DSO
+# builds) or libmca_<type>_<name>.la (for static builds).  The OMPI
+# build system will have set the
+# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
+# which way this component should be built.
+
+# We need to support all processors from early AVX to full AVX512 support, based on
+# a decision made at runtime. So, we generate all combinations of capabilities, and
+# we will select the most suitable (based on the processor flags) during the
+# component initialization.
+specialized_op_libs =
+if MCA_BUILD_ompi_op_has_avx_support
+specialized_op_libs += liblocal_ops_avx.la
+liblocal_ops_avx_la_SOURCES = $(sources_extended)
+liblocal_ops_avx_la_CFLAGS = @MCA_BUILD_OP_AVX_FLAGS@
+liblocal_ops_avx_la_CPPFLAGS = -DGENERATE_AVX_CODE
+if MCA_BUILD_ompi_op_has_sse3_support
+liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE3_CODE
+endif
+if MCA_BUILD_ompi_op_has_sse41_support
+liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE41_CODE
+endif
+endif
+if MCA_BUILD_ompi_op_has_avx2_support
+specialized_op_libs += liblocal_ops_avx2.la
+liblocal_ops_avx2_la_SOURCES = $(sources_extended)
+liblocal_ops_avx2_la_CFLAGS = @MCA_BUILD_OP_AVX2_FLAGS@
+liblocal_ops_avx2_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE
+endif
+if MCA_BUILD_ompi_op_has_avx512_support
+specialized_op_libs += liblocal_ops_avx512.la
+liblocal_ops_avx512_la_SOURCES = $(sources_extended)
+liblocal_ops_avx512_la_CFLAGS = @MCA_BUILD_OP_AVX512_FLAGS@
+liblocal_ops_avx512_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE -DGENERATE_AVX512_CODE
+endif
+
+component_noinst = $(specialized_op_libs)
+if MCA_BUILD_ompi_op_avx_DSO
+component_install = mca_op_avx.la
+else
+component_install =
+component_noinst += libmca_op_avx.la
+endif
+
+# Specific information for DSO builds.
+#
+# The DSO should install itself in $(ompilibdir) (by default,
+# $prefix/lib/openmpi).
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_op_avx_la_SOURCES = $(sources)
+mca_op_avx_la_LIBADD = $(specialized_op_libs)
+mca_op_avx_la_LDFLAGS = -module -avoid-version
+
+
+# Specific information for static builds.
+#
+# Note that we *must* "noinst"; the upper-layer Makefile.am's will
+# slurp in the resulting .la library into libmpi.
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_op_avx_la_SOURCES = $(sources)
+libmca_op_avx_la_LIBADD = $(specialized_op_libs)
+libmca_op_avx_la_LDFLAGS = -module -avoid-version
+
diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
new file mode 100644
index 0000000000..889aa85ba5
--- /dev/null
+++ b/ompi/mca/op/avx/configure.m4
@@ -0,0 +1,265 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2019-2020 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2020      Cisco Systems, Inc.  All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_op_avx_CONFIG([action-if-can-compile],
+#                        [action-if-cant-compile])
+# ------------------------------------------------
+# We can always build, unless we were explicitly disabled.
+AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])
+
+    MCA_BUILD_OP_AVX_FLAGS=""
+    MCA_BUILD_OP_AVX2_FLAGS=""
+    MCA_BUILD_OP_AVX512_FLAGS=""
+    op_sse3_support=0
+    op_sse41_support=0
+    op_avx_support=0
+    op_avx2_support=0
+    op_avx512_support=0
+    OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
+
+    AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
+          [AC_LANG_PUSH([C])
+
+           #
+           # Check for AVX512 support
+           #
+           AC_MSG_CHECKING([for AVX512 support (no additional flags)])
+           AC_LINK_IFELSE(
+               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                [[
+    __m512 vA, vB;
+    _mm512_add_ps(vA, vB)
+                                ]])],
+               [op_avx512_support=1
+                AC_MSG_RESULT([yes])],
+               [AC_MSG_RESULT([no])])
+
+           AS_IF([test $op_avx512_support -eq 0],
+                 [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
+                  op_avx_cflags_save="$CFLAGS"
+                  CFLAGS="$CFLAGS -march=skylake-avx512"
+                  AC_LINK_IFELSE(
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                       [[
+    __m512 vA, vB;
+    _mm512_add_ps(vA, vB)
+                                       ]])],
+                      [op_avx512_support=1
+                       MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
+                       AC_MSG_RESULT([yes])],
+                      [AC_MSG_RESULT([no])])
+                  CFLAGS="$op_avx_cflags_save"
+                 ])
+           #
+           # Some combinations of gcc and older versions of as (the GNU assembler) do not
+           # correctly build the code generated by _mm512_loadu_si512. Screen them out.
+           #
+           AS_IF([test $op_avx512_support -eq 1],
+                 [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
+                  op_avx_cflags_save="$CFLAGS"
+                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
+                  AC_LINK_IFELSE(
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                       [[
+    int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+    __m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
+                                       ]])],
+                      [AC_MSG_RESULT([yes])],
+                      [op_avx512_support=0
+                       MCA_BUILD_OP_AVX512_FLAGS=""
+                       AC_MSG_RESULT([no])])
+                  CFLAGS="$op_avx_cflags_save"
+                 ])
+           #
+           # Check support for AVX2
+           #
+           AC_MSG_CHECKING([for AVX2 support (no additional flags)])
+           AC_LINK_IFELSE(
+               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                [[
+    __m256 vA, vB;
+    _mm256_add_ps(vA, vB)
+                                ]])],
+               [op_avx2_support=1
+                AC_MSG_RESULT([yes])],
+               [AC_MSG_RESULT([no])])
+           AS_IF([test $op_avx2_support -eq 0],
+                 [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
+                  op_avx_cflags_save="$CFLAGS"
+                  CFLAGS="$CFLAGS -mavx2"
+                  AC_LINK_IFELSE(
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                       [[
+    __m256 vA, vB;
+    _mm256_add_ps(vA, vB)
+                                       ]])],
+                      [op_avx2_support=1
+                       MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
+                       AC_MSG_RESULT([yes])],
+                      [AC_MSG_RESULT([no])])
+                  CFLAGS="$op_avx_cflags_save"
+                 ])
+           #
+           # Some combinations of gcc and older versions of as (the GNU assembler) do not
+           # correctly build the code generated by _mm256_loadu_si256. Screen them out.
+           #
+           AS_IF([test $op_avx2_support -eq 1],
+                 [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
+                  op_avx_cflags_save="$CFLAGS"
+                  CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
+                  AC_LINK_IFELSE(
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                       [[
+    int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    __m256i vA = _mm256_loadu_si256((__m256i*)&A)
+                                       ]])],
+                      [AC_MSG_RESULT([yes])],
+                      [op_avx2_support=0
+                       MCA_BUILD_OP_AVX2_FLAGS=""
+                       AC_MSG_RESULT([no])])
+                  CFLAGS="$op_avx_cflags_save"
+                 ])
+           #
+           # What about early AVX support. The rest of the logic is slightly different as
+           # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
+           # if we can compile AVX code without a flag, then we validate that we have support
+           # for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
+           # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
+           # instructions.
+           #
+           AC_MSG_CHECKING([for AVX support (no additional flags)])
+           AC_LINK_IFELSE(
+               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                [[
+    __m128 vA, vB;
+    _mm_add_ps(vA, vB)
+                                ]])],
+               [op_avx_support=1
+                AC_MSG_RESULT([yes])],
+               [AC_MSG_RESULT([no])])
+           #
+           # Check for SSE4.1 support
+           #
+           AS_IF([test $op_avx_support -eq 1],
+                 [AC_MSG_CHECKING([for SSE4.1 support])
+                  AC_LINK_IFELSE(
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                       [[
+    __m128i vA, vB;
+    (void)_mm_max_epi8(vA, vB)
+                                       ]])],
+                      [op_sse41_support=1
+                       AC_MSG_RESULT([yes])],
+                      [AC_MSG_RESULT([no])])
+                 ])
+           #
+           # Check for SSE3 support
+           #
+           AS_IF([test $op_avx_support -eq 1],
+                 [AC_MSG_CHECKING([for SSE3 support])
+                  AC_LINK_IFELSE(
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                       [[
+    int A[4] = {0, 1, 2, 3};
+    __m128i vA = _mm_lddqu_si128((__m128i*)&A)
+                                       ]])],
+                      [op_sse3_support=1
+                       AC_MSG_RESULT([yes])],
+                      [AC_MSG_RESULT([no])])
+                 ])
+           # Second pass, do we need to add the AVX flag ?
+           AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
+                 [AC_MSG_CHECKING([for AVX support (with -mavx)])
+                  op_avx_cflags_save="$CFLAGS"
+                  CFLAGS="$CFLAGS -mavx"
+                  AC_LINK_IFELSE(
+                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                       [[
+    __m128 vA, vB;
+    _mm_add_ps(vA, vB)
+                                       ]])],
+                      [op_avx_support=1
+                       MCA_BUILD_OP_AVX_FLAGS="-mavx"
+                       op_sse41_support=0
+                       op_sse3_support=0
+                       AC_MSG_RESULT([yes])],
+                      [AC_MSG_RESULT([no])])
+
+                  AS_IF([test $op_sse41_support -eq 0],
+                        [AC_MSG_CHECKING([for SSE4.1 support])
+                         AC_LINK_IFELSE(
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                              [[
+    __m128i vA, vB;
+    (void)_mm_max_epi8(vA, vB)
+                                              ]])],
+                             [op_sse41_support=1
+                              AC_MSG_RESULT([yes])],
+                             [AC_MSG_RESULT([no])])
+                        ])
+                  AS_IF([test $op_sse3_support -eq 0],
+                        [AC_MSG_CHECKING([for SSE3 support])
+                         AC_LINK_IFELSE(
+                             [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                              [[
+    int A[4] = {0, 1, 2, 3};
+    __m128i vA = _mm_lddqu_si128((__m128i*)&A)
+                                              ]])],
+                             [op_sse3_support=1
+                              AC_MSG_RESULT([yes])],
+                             [AC_MSG_RESULT([no])])
+                        ])
+                  CFLAGS="$op_avx_cflags_save"
+                 ])
+
+           AC_LANG_POP([C])
+          ])
+    AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
+                       [$op_avx512_support],
+                       [AVX512 supported in the current build])
+    AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX2],
+                       [$op_avx2_support],
+                       [AVX2 supported in the current build])
+    AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX],
+                       [$op_avx_support],
+                       [AVX supported in the current build])
+    AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE41],
+                       [$op_sse41_support],
+                       [SSE4.1 supported in the current build])
+    AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE3],
+                       [$op_sse3_support],
+                       [SSE3 supported in the current build])
+    AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx512_support],
+                   [test "$op_avx512_support" = "1"])
+    AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx2_support],
+                   [test "$op_avx2_support" = "1"])
+    AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx_support],
+                   [test "$op_avx_support" = "1"])
+    AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse41_support],
+                   [test "$op_sse41_support" = "1"])
+    AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse3_support],
+                   [test "$op_sse3_support" = "1"])
+    AC_SUBST(MCA_BUILD_OP_AVX512_FLAGS)
+    AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
+    AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)
+
+    OPAL_VAR_SCOPE_POP
+    # Enable this component iff we have at least the most basic form of support
+    # for vector ISA
+    AS_IF([test $op_avx_support -eq 1 || test $op_avx2_support -eq 1 || test $op_avx512_support -eq 1],
+          [$1],
+          [$2])
+
+])dnl
diff --git a/ompi/mca/op/avx/op_avx.h b/ompi/mca/op/avx/op_avx.h
new file mode 100644
index 0000000000..1f2523f887
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_OP_AVX_EXPORT_H
+#define MCA_OP_AVX_EXPORT_H
+
+#include "ompi_config.h"
+
+#include "ompi/mca/mca.h"
+#include "opal/class/opal_object.h"
+
+#include "ompi/mca/op/op.h"
+
+BEGIN_C_DECLS
+
+#define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200
+#define OMPI_OP_AVX_HAS_AVX512F_FLAG  0x00000100
+#define OMPI_OP_AVX_HAS_AVX2_FLAG     0x00000020
+#define OMPI_OP_AVX_HAS_AVX_FLAG      0x00000010
+#define OMPI_OP_AVX_HAS_SSE4_1_FLAG   0x00000008
+#define OMPI_OP_AVX_HAS_SSE3_FLAG     0x00000004
+#define OMPI_OP_AVX_HAS_SSE2_FLAG     0x00000002
+#define OMPI_OP_AVX_HAS_SSE_FLAG      0x00000001
+
+/**
+ * Derive a struct from the base op component struct, allowing us to
+ * cache some component-specific information on our well-known
+ * component struct.
+ */
+typedef struct {
+    /** The base op component struct */
+    ompi_op_base_component_1_0_0_t super;
+
+    /* What follows is avx-component-specific cached information.  We
+       tend to use this scheme (caching information on the avx
+       component itself) instead of lots of individual global
+       variables for the component.  The following data field is
+       specific to the avx component; replace it with whatever is
+       relevant for your component. */
+
+    uint32_t flags; /* AVX capabilities supported by the processor */
+} ompi_op_avx_component_t;
+
+/**
+ * Globally exported variable.  Note that it is an *avx* component
+ * (defined above), which has the ompi_op_base_component_t as its
+ * first member.  Hence, the MCA/op framework will find the data that
+ * it expects in the first memory locations, but then the component
+ * itself can cache additional information after it that can be used
+ * by both the component and modules.
+ */
+OMPI_DECLSPEC extern ompi_op_avx_component_t
+    mca_op_avx_component;
+
+END_C_DECLS
+
+#endif /* MCA_OP_AVX_EXPORT_H */
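
[Editorial note: the capability bits above combine by OR, and a consumer requires a set of
capabilities by checking that every bit of a mask is present at once. A minimal sketch of
that test, assuming op_avx.h is included; the helper names here are hypothetical, but the
OMPI_OP_AVX_HAS_FLAGS macro defined later in op_avx_functions.c follows the same pattern.]

    #include <stdint.h>

    /* All bits of `want` must be set in `flags`; a partial match is not
     * enough, e.g. the AVX512 integer paths need AVX512F *and* AVX512BW. */
    static inline int has_all_flags(uint32_t flags, uint32_t want)
    {
        return (flags & want) == want;
    }

    static inline int can_use_avx512_integer(uint32_t flags)
    {
        return has_all_flags(flags, OMPI_OP_AVX_HAS_AVX512F_FLAG |
                                    OMPI_OP_AVX_HAS_AVX512BW_FLAG);
    }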
diff --git a/ompi/mca/op/avx/op_avx_component.c b/ompi/mca/op/avx/op_avx_component.c
new file mode 100644
index 0000000000..e5c5d76097
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx_component.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2019-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2020      Research Organization for Information Science
+ *                         and Technology (RIST).  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/** @file
+ *
+ * This is the "avx" component source code.
+ *
+ */
+
+#include "ompi_config.h"
+
+#include "opal/util/printf.h"
+
+#include "ompi/constants.h"
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/mca/op/avx/op_avx.h"
+
+static int avx_component_open(void);
+static int avx_component_close(void);
+static int avx_component_init_query(bool enable_progress_threads,
+                                    bool enable_mpi_thread_multiple);
+static struct ompi_op_base_module_1_0_0_t *
+    avx_component_op_query(struct ompi_op_t *op, int *priority);
+static int avx_component_register(void);
+
+/**
+ * Slightly modified code from
+ * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+ */
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
+
+#include <immintrin.h>
+
+static uint32_t has_intel_AVX_features(void)
+{
+    uint32_t flags = 0;
+
+    flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F)  ? OMPI_OP_AVX_HAS_AVX512F_FLAG  : 0;
+    flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
+    flags |= _may_i_use_cpu_feature(_FEATURE_AVX2)     ? OMPI_OP_AVX_HAS_AVX2_FLAG     : 0;
+    flags |= _may_i_use_cpu_feature(_FEATURE_AVX)      ? OMPI_OP_AVX_HAS_AVX_FLAG      : 0;
+    flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1)   ? OMPI_OP_AVX_HAS_SSE4_1_FLAG   : 0;
+    flags |= _may_i_use_cpu_feature(_FEATURE_SSE3)     ? OMPI_OP_AVX_HAS_SSE3_FLAG     : 0;
+    flags |= _may_i_use_cpu_feature(_FEATURE_SSE2)     ? OMPI_OP_AVX_HAS_SSE2_FLAG     : 0;
+    flags |= _may_i_use_cpu_feature(_FEATURE_SSE)      ? OMPI_OP_AVX_HAS_SSE_FLAG      : 0;
+    return flags;
+}
+#else /* non-Intel compiler */
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
+{
+#if defined(_MSC_VER)
+    __cpuidex(abcd, eax, ecx);
+#else
+    uint32_t ebx = 0, edx = 0;
+#if defined( __i386__ ) && defined ( __PIC__ )
+    /* in case of PIC under 32-bit EBX cannot be clobbered */
+    __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
+#else
+    __asm__ ( "cpuid" : "+b" (ebx),
+#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
+              "+a" (eax), "+c" (ecx), "=d" (edx) );
+    abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
+#endif
+}
+
+static uint32_t has_intel_AVX_features(void)
+{
+    /* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
+    const uint32_t avx512f_mask   = (1U << 16);  // AVX512F   (EAX = 7, ECX = 0) : EBX
+    const uint32_t avx512_bw_mask = (1U << 30);  // AVX512BW  (EAX = 7, ECX = 0) : EBX
+    const uint32_t avx2_mask      = (1U << 5);   // AVX2      (EAX = 7, ECX = 0) : EBX
+    const uint32_t avx_mask       = (1U << 28);  // AVX       (EAX = 1, ECX = 0) : ECX
+    const uint32_t sse4_1_mask    = (1U << 19);  // SSE4.1    (EAX = 1, ECX = 0) : ECX
+    const uint32_t sse3_mask      = (1U << 0);   // SSE3      (EAX = 1, ECX = 0) : ECX
+    const uint32_t sse2_mask      = (1U << 26);  // SSE2      (EAX = 1, ECX = 0) : EDX
+    const uint32_t sse_mask       = (1U << 25);  // SSE       (EAX = 1, ECX = 0) : EDX
+    uint32_t flags = 0, abcd[4];
+
+    run_cpuid( 1, 0, abcd );
+    flags |= (abcd[2] & avx_mask)    ? OMPI_OP_AVX_HAS_AVX_FLAG    : 0;
+    flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
+    flags |= (abcd[2] & sse3_mask)   ? OMPI_OP_AVX_HAS_SSE3_FLAG   : 0;
+    flags |= (abcd[3] & sse2_mask)   ? OMPI_OP_AVX_HAS_SSE2_FLAG   : 0;
+    flags |= (abcd[3] & sse_mask)    ? OMPI_OP_AVX_HAS_SSE_FLAG    : 0;
+#if defined(__APPLE__)
+    uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27));  /* FMA(12) + MOVBE(22) + OSXSAVE(27) */
+    // OS supports extended processor state management ?
+    if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
+        return 0;
+#endif /* defined(__APPLE__) */
+
+    run_cpuid( 7, 0, abcd );
+    flags |= (abcd[1] & avx512f_mask)   ? OMPI_OP_AVX_HAS_AVX512F_FLAG  : 0;
+    flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
+    flags |= (abcd[1] & avx2_mask)      ? OMPI_OP_AVX_HAS_AVX2_FLAG     : 0;
+    return flags;
+}
+#endif /* non-Intel compiler */
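
[Editorial note: for orientation, the mask produced by has_intel_AVX_features() can be
inspected in isolation. A hypothetical standalone harness, not part of this patch,
compiled together with the helpers above:]

    #include <stdio.h>

    int main(void)
    {
        /* Print the raw capability mask before the MCA parameter caps it. */
        uint32_t flags = has_intel_AVX_features();
        printf("detected mask 0x%08x (AVX512F %d, AVX2 %d, AVX %d)\n", flags,
               !!(flags & OMPI_OP_AVX_HAS_AVX512F_FLAG),
               !!(flags & OMPI_OP_AVX_HAS_AVX2_FLAG),
               !!(flags & OMPI_OP_AVX_HAS_AVX_FLAG));
        return 0;
    }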
+
+ompi_op_avx_component_t mca_op_avx_component = {
+    {
+        .opc_version = {
+            OMPI_OP_BASE_VERSION_1_0_0,
+
+            .mca_component_name = "avx",
+            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
+                                  OMPI_RELEASE_VERSION),
+            .mca_open_component = avx_component_open,
+            .mca_close_component = avx_component_close,
+            .mca_register_component_params = avx_component_register,
+        },
+        .opc_data = {
+            /* The component is checkpoint ready */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        },
+
+        .opc_init_query = avx_component_init_query,
+        .opc_op_query = avx_component_op_query,
+    },
+};
+
+/*
+ * Component open
+ */
+static int avx_component_open(void)
+{
+    /* We checked the flags during register, so if they are set to
+     * zero either the architecture is not suitable or the user disabled
+     * AVX support.
+     *
+     * This is a first-level check to see what level of AVX is available
+     * on the hardware.
+     *
+     * Note that if this function returns non-OMPI_SUCCESS, then this
+     * component won't even be shown in ompi_info output (which is
+     * probably not what you want).
+     */
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Component close
+ */
+static int avx_component_close(void)
+{
+    /* If avx was opened successfully, close it (i.e., release any
+       resources that may have been allocated on this component).
+       Note that _component_close() will always be called at the end
+       of the process, so it may be invoked after any/all of the other
+       component functions have been invoked (and possibly even after
+       modules have been created and/or destroyed). */
+
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Register MCA params.
+ */
+static int
+avx_component_register(void)
+{
+    int32_t requested_flags = mca_op_avx_component.flags = has_intel_AVX_features();
+    (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
+                                           "support",
+                                           "Level of SSE/MMX/AVX support to be used (combination of processor capabilities as follows: SSE 0x01, SSE2 0x02, SSE3 0x04, SSE4.1 0x08, AVX 0x010, AVX2 0x020, AVX512F 0x100, AVX512BW 0x200), capped by the local architecture capabilities",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_6,
+                                           MCA_BASE_VAR_SCOPE_LOCAL,
+                                           &mca_op_avx_component.flags);
+    mca_op_avx_component.flags &= requested_flags;
+    return OMPI_SUCCESS;
+}
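
[Editorial note: with the standard MCA naming convention (framework_component_parameter),
the variable registered above surfaces as op_avx_support. As a hedged usage example,
something like `mpirun --mca op_avx_support 0x1f ...` would cap the component at AVX plus
the SSE subsets (0x01|0x02|0x04|0x08|0x10) even on AVX512-capable hardware, since the
final mask is the AND of the requested value and the detected capabilities.]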
+
+/*
+ * Query whether this component wants to be used in this process.
+ */
+static int
+avx_component_init_query(bool enable_progress_threads,
+                         bool enable_mpi_thread_multiple)
+{
+    if( 0 == mca_op_avx_component.flags )
+        return OMPI_ERR_NOT_SUPPORTED;
+    return OMPI_SUCCESS;
+}
+
+#if OMPI_MCA_OP_HAVE_AVX512
+extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+#endif
+#if OMPI_MCA_OP_HAVE_AVX2
+extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+#endif
+#if OMPI_MCA_OP_HAVE_AVX
+extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+#endif
+/*
+ * Query whether this component can be used for a specific op
+ */
+static struct ompi_op_base_module_1_0_0_t*
+avx_component_op_query(struct ompi_op_t *op, int *priority)
+{
+    ompi_op_base_module_t *module = NULL;
+    /* Sanity check -- although the framework should never invoke the
+       _component_op_query() on non-intrinsic MPI_Op's, we'll put a
+       check here just to be sure. */
+    if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
+        return NULL;
+    }
+
+    switch (op->o_f_to_c_index) {
+    case OMPI_OP_BASE_FORTRAN_MAX:
+    case OMPI_OP_BASE_FORTRAN_MIN:
+    case OMPI_OP_BASE_FORTRAN_SUM:
+    case OMPI_OP_BASE_FORTRAN_PROD:
+    case OMPI_OP_BASE_FORTRAN_BOR:
+    case OMPI_OP_BASE_FORTRAN_BAND:
+    case OMPI_OP_BASE_FORTRAN_BXOR:
+        module = OBJ_NEW(ompi_op_base_module_t);
+        for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+#if OMPI_MCA_OP_HAVE_AVX512
+            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX512F_FLAG ) {
+                module->opm_fns[i] = ompi_op_avx_functions_avx512[op->o_f_to_c_index][i];
+                module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx512[op->o_f_to_c_index][i];
+            }
+#endif
+#if OMPI_MCA_OP_HAVE_AVX2
+            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX2_FLAG ) {
+                if( NULL == module->opm_fns[i] ) {
+                    module->opm_fns[i] = ompi_op_avx_functions_avx2[op->o_f_to_c_index][i];
+                }
+                if( NULL == module->opm_3buff_fns[i] ) {
+                    module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx2[op->o_f_to_c_index][i];
+                }
+            }
+#endif
+#if OMPI_MCA_OP_HAVE_AVX
+            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX_FLAG ) {
+                if( NULL == module->opm_fns[i] ) {
+                    module->opm_fns[i] = ompi_op_avx_functions_avx[op->o_f_to_c_index][i];
+                }
+                if( NULL == module->opm_3buff_fns[i] ) {
+                    module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx[op->o_f_to_c_index][i];
+                }
+            }
+#endif
+            if( NULL != module->opm_fns[i] ) {
+                OBJ_RETAIN(module);
+            }
+            if( NULL != module->opm_3buff_fns[i] ) {
+                OBJ_RETAIN(module);
+            }
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_LAND:
+    case OMPI_OP_BASE_FORTRAN_LOR:
+    case OMPI_OP_BASE_FORTRAN_LXOR:
+    case OMPI_OP_BASE_FORTRAN_MAXLOC:
+    case OMPI_OP_BASE_FORTRAN_MINLOC:
+    case OMPI_OP_BASE_FORTRAN_REPLACE:
+    default:
+        break;
+    }
+    /* If we got a module from above, we'll return it.  Otherwise,
+       we'll return NULL, indicating that this component does not want
+       to be considered for selection for this MPI_Op.  Note that the
+       functions each returned an *avx* component pointer
+       (vs. a *base* component pointer -- where an *avx* component
+       is a base component plus some other module-specific cached
+       information), so we have to cast it to the right pointer type
+       before returning. */
+    if (NULL != module) {
+        *priority = 50;
+    }
+    return (ompi_op_base_module_1_0_0_t *) module;
+}
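
[Editorial note: the patch also adds test/datatype/check_op.sh and
test/datatype/reduce_local.c, listed in the diffstat but not shown in this excerpt. As a
minimal, hypothetical illustration of the path the selected module serves,
MPI_Reduce_local is the most direct way to drive the 2buff handlers above; the count and
values here are arbitrary.]

    /* Minimal sketch, not the bundled reduce_local.c: MPI_Reduce_local
     * invokes the selected op module's 2buff handler, i.e. the AVX paths
     * when this component wins selection. */
    #include <mpi.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(int argc, char *argv[])
    {
        int32_t in[17], inout[17];   /* an odd count also exercises the scalar tail */
        MPI_Init(&argc, &argv);
        for (int i = 0; i < 17; i++) { in[i] = i; inout[i] = 1; }
        MPI_Reduce_local(in, inout, 17, MPI_INT32_T, MPI_SUM);
        printf("inout[16] = %d (expected 17)\n", inout[16]);
        MPI_Finalize();
        return 0;
    }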
diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
new file mode 100644
index 0000000000..7f9062fa73
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx_functions.c
@@ -0,0 +1,1280 @@
+/*
+ * Copyright (c) 2019-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2020      Research Organization for Information Science
+ *                         and Technology (RIST).  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#include "opal/util/output.h"
+
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/mca/op/avx/op_avx.h"
+
+#include <immintrin.h>
+
+#if defined(GENERATE_AVX512_CODE)
+#define PREPEND _avx512
+#elif defined(GENERATE_AVX2_CODE)
+#define PREPEND _avx2
+#elif defined(GENERATE_AVX_CODE)
+#define PREPEND _avx
+#else
+#error This file should not be compiled under these conditions
+#endif
+
+/*
+ * Concatenate preprocessor tokens A and B without expanding macro definitions
+ * (however, if invoked from a macro, macro arguments are expanded).
+ */
+#define OP_CONCAT_NX(A, B) A ## B
+
+/*
+ * Concatenate preprocessor tokens A and B after macro-expanding them.
+ */
+#define OP_CONCAT(A, B) OP_CONCAT_NX(A, B)
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types.  The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for (out op in).
+ * + * Support ops: max, min, for signed/unsigned 8,16,32,64 + * sum, for integer 8,16,32,64 + * + */ + +#define OMPI_OP_AVX_HAS_FLAGS(_flag) \ + (((_flag) & mca_op_avx_component.flags) == (_flag)) + +#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) +#define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ + int types_per_step = (512 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m512i vecA = _mm512_loadu_si512((__m512*)in); \ + in += types_per_step; \ + __m512i vecB = _mm512_loadu_si512((__m512*)out); \ + __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm512_storeu_si512((__m512*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } +#else +#define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + +#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) +#define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + int types_per_step = (256 / 8) / sizeof(type); /* AVX2 */ \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256((__m256i*)in); \ + in += types_per_step; \ + __m256i vecB = _mm256_loadu_si256((__m256i*)out); \ + __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm256_storeu_si256((__m256i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } +#else +#define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + +#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) +#define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \ + int types_per_step = (128 / 8) / sizeof(type); /* AVX */ \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m128i vecA = _mm_lddqu_si128((__m128i*)in); \ + in += types_per_step; \ + __m128i vecB = _mm_lddqu_si128((__m128i*)out); \ + __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm_storeu_si128((__m128i*)out, res); \ + out += types_per_step; \ + } \ + } +#else +#define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +#define OP_AVX_FUNC(name, type_sign, type_size, type, op) \ +static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int left_over = *count; \ + type *in = (type*)_in, *out = (type*)_out; \ + OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op); \ + OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op); \ + OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op); \ + while( left_over > 0 ) { \ + int how_much = (left_over > 8) ? 
8 : left_over; \
+        switch(how_much) { \
+        case 8:  out[7] = current_func(out[7], in[7]); \
+        case 7:  out[6] = current_func(out[6], in[6]); \
+        case 6:  out[5] = current_func(out[5], in[5]); \
+        case 5:  out[4] = current_func(out[4], in[4]); \
+        case 4:  out[3] = current_func(out[3], in[3]); \
+        case 3:  out[2] = current_func(out[2], in[2]); \
+        case 2:  out[1] = current_func(out[1], in[1]); \
+        case 1:  out[0] = current_func(out[0], in[0]); \
+        } \
+        left_over -= how_much; \
+        out += how_much; \
+        in += how_much; \
+    } \
+}
+
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
+        int types_per_step = (256 / 8) / sizeof(type); \
+        for (; left_over >= types_per_step; left_over -= types_per_step) { \
+            __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \
+            __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \
+            in += types_per_step; \
+            __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \
+            __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \
+            __m512i res = _mm512_##op##_ep##type_sign##16(vecA, vecB); \
+            vecB_tmp = _mm512_cvtepi16_epi8(res); \
+            _mm256_storeu_si256((__m256i*)out, vecB_tmp); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+/**
+ * There is no support for 16 to 8 conversion without AVX512BW and AVX512VL, so
+ * there is no AVX-only optimized function possible for OP_AVX_AVX2_MUL.
+ */
+
+/* special case for int8 mul */
+#define OP_AVX_MUL(name, type_sign, type_size, type, op) \
+static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in, void *_out, int *count, \
+                                                                 struct ompi_datatype_t **dtype, \
+                                                                 struct ompi_op_base_module_1_0_0_t *module) \
+{ \
+    int left_over = *count; \
+    type *in = (type*)_in, *out = (type*)_out; \
+    OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op); \
+    while( left_over > 0 ) { \
+        int how_much = (left_over > 8) ? 8 : left_over; \
+        switch(how_much) { \
+        case 8:  out[7] = current_func(out[7], in[7]); \
+        case 7:  out[6] = current_func(out[6], in[6]); \
+        case 6:  out[5] = current_func(out[5], in[5]); \
+        case 5:  out[4] = current_func(out[4], in[4]); \
+        case 4:  out[3] = current_func(out[3], in[3]); \
+        case 3:  out[2] = current_func(out[2], in[2]); \
+        case 2:  out[1] = current_func(out[1], in[1]); \
+        case 1:  out[0] = current_func(out[0], in[0]); \
+        } \
+        left_over -= how_much; \
+        out += how_much; \
+        in += how_much; \
+    } \
+}
+
+/*
+ * This macro is for bit-wise operations (out op in).
+ * + * Support ops: or, xor, and of 512 bits (representing integer data) + * + */ +#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) +#define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS( OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ + types_per_step = (512 / 8) / sizeof(type); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512((__m512i*)in); \ + in += types_per_step; \ + __m512i vecB = _mm512_loadu_si512((__m512i*)out); \ + __m512i res = _mm512_##op##_si512(vecA, vecB); \ + _mm512_storeu_si512((__m512i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } +#else +#define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + +#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) +#define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + types_per_step = (256 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256((__m256i*)in); \ + in += types_per_step; \ + __m256i vecB = _mm256_loadu_si256((__m256i*)out); \ + __m256i res = _mm256_##op##_si256(vecA, vecB); \ + _mm256_storeu_si256((__m256i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } +#else +#define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + +#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) +#define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \ + types_per_step = (128 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m128i vecA = _mm_lddqu_si128((__m128i*)in); \ + in += types_per_step; \ + __m128i vecB = _mm_lddqu_si128((__m128i*)out); \ + __m128i res = _mm_##op##_si128(vecA, vecB); \ + _mm_storeu_si128((__m128i*)out, res); \ + out += types_per_step; \ + } \ + } +#else +#define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +#define OP_AVX_BIT_FUNC(name, type_size, type, op) \ +static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step, left_over = *count; \ + type *in = (type*)_in, *out = (type*)_out; \ + OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op); \ + OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op); \ + OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op); \ + while( left_over > 0 ) { \ + int how_much = (left_over > 8) ? 
8 : left_over; \
+        switch(how_much) { \
+        case 8:  out[7] = current_func(out[7], in[7]); \
+        case 7:  out[6] = current_func(out[6], in[6]); \
+        case 6:  out[5] = current_func(out[5], in[5]); \
+        case 5:  out[4] = current_func(out[4], in[4]); \
+        case 4:  out[3] = current_func(out[3], in[3]); \
+        case 3:  out[2] = current_func(out[2], in[2]); \
+        case 2:  out[1] = current_func(out[1], in[1]); \
+        case 1:  out[0] = current_func(out[0], in[0]); \
+        } \
+        left_over -= how_much; \
+        out += how_much; \
+        in += how_much; \
+    } \
+}
+
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#define OP_AVX_AVX512_FLOAT_FUNC(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
+        types_per_step = (512 / 8) / sizeof(float); \
+        for (; left_over >= types_per_step; left_over -= types_per_step) { \
+            __m512 vecA = _mm512_loadu_ps(in); \
+            __m512 vecB = _mm512_loadu_ps(out); \
+            in += types_per_step; \
+            __m512 res = _mm512_##op##_ps(vecA, vecB); \
+            _mm512_storeu_ps(out, res); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX512_FLOAT_FUNC(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+
+#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#define OP_AVX_AVX_FLOAT_FUNC(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
+        types_per_step = (256 / 8) / sizeof(float); \
+        for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+            __m256 vecA = _mm256_loadu_ps(in); \
+            in += types_per_step; \
+            __m256 vecB = _mm256_loadu_ps(out); \
+            __m256 res = _mm256_##op##_ps(vecA, vecB); \
+            _mm256_storeu_ps(out, res); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX_FLOAT_FUNC(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+
+#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#define OP_AVX_SSE_FLOAT_FUNC(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
+        types_per_step = (128 / 8) / sizeof(float); \
+        for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+            __m128 vecA = _mm_loadu_ps(in); \
+            in += types_per_step; \
+            __m128 vecB = _mm_loadu_ps(out); \
+            __m128 res = _mm_##op##_ps(vecA, vecB); \
+            _mm_storeu_ps(out, res); \
+            out += types_per_step; \
+        } \
+    }
+#else
+#define OP_AVX_SSE_FLOAT_FUNC(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
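
[Editorial note: the wrapper functions in this file all end with the same
Duff's-device-style remainder switch: after the widest available vector loop has consumed
full blocks, entering the switch at case n applies the scalar current_func to the last n
(at most 8) elements via intentional fall-through. A small self-contained sketch of the
idiom, with a concrete operation substituted for current_func; all names hypothetical.]

    /* Entering at `case n` touches exactly the next n elements,
     * falling through case by case, with no per-element loop test. */
    static void scalar_tail_add(float *out, const float *in, int left_over)
    {
        while (left_over > 0) {
            int how_much = (left_over > 8) ? 8 : left_over;
            switch (how_much) {          /* intentional fall-through */
            case 8: out[7] += in[7];     /* fall through */
            case 7: out[6] += in[6];     /* fall through */
            case 6: out[5] += in[5];     /* fall through */
            case 5: out[4] += in[4];     /* fall through */
            case 4: out[3] += in[3];     /* fall through */
            case 3: out[2] += in[2];     /* fall through */
            case 2: out[1] += in[1];     /* fall through */
            case 1: out[0] += in[0];
            }
            left_over -= how_much;
            out += how_much;
            in  += how_much;
        }
    }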
+
+#define OP_AVX_FLOAT_FUNC(op) \
+static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, void *_out, int *count, \
+                                                              struct ompi_datatype_t **dtype, \
+                                                              struct ompi_op_base_module_1_0_0_t *module) \
+{ \
+    int types_per_step, left_over = *count; \
+    float *in = (float*)_in, *out = (float*)_out; \
+    OP_AVX_AVX512_FLOAT_FUNC(op); \
+    OP_AVX_AVX_FLOAT_FUNC(op); \
+    OP_AVX_SSE_FLOAT_FUNC(op); \
+    while( left_over > 0 ) { \
+        int how_much = (left_over > 8) ? 8 : left_over; \
+        switch(how_much) { \
+        case 8:  out[7] = current_func(out[7], in[7]); \
+        case 7:  out[6] = current_func(out[6], in[6]); \
+        case 6:  out[5] = current_func(out[5], in[5]); \
+        case 5:  out[4] = current_func(out[4], in[4]); \
+        case 4:  out[3] = current_func(out[3], in[3]); \
+        case 3:  out[2] = current_func(out[2], in[2]); \
+        case 2:  out[1] = current_func(out[1], in[1]); \
+        case 1:  out[0] = current_func(out[0], in[0]); \
+        } \
+        left_over -= how_much; \
+        out += how_much; \
+        in += how_much; \
+    } \
+}
+
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#define OP_AVX_AVX512_DOUBLE_FUNC(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
+        types_per_step = (512 / 8) / sizeof(double); \
+        for (; left_over >= types_per_step; left_over -= types_per_step) { \
+            __m512d vecA = _mm512_loadu_pd(in); \
+            in += types_per_step; \
+            __m512d vecB = _mm512_loadu_pd(out); \
+            __m512d res = _mm512_##op##_pd(vecA, vecB); \
+            _mm512_storeu_pd(out, res); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX512_DOUBLE_FUNC(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+
+#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#define OP_AVX_AVX_DOUBLE_FUNC(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
+        types_per_step = (256 / 8) / sizeof(double); \
+        for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+            __m256d vecA = _mm256_loadu_pd(in); \
+            in += types_per_step; \
+            __m256d vecB = _mm256_loadu_pd(out); \
+            __m256d res = _mm256_##op##_pd(vecA, vecB); \
+            _mm256_storeu_pd(out, res); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX_DOUBLE_FUNC(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+
+#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#define OP_AVX_SSE2_DOUBLE_FUNC(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
+        types_per_step = (128 / 8) / sizeof(double); \
+        for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+            __m128d vecA = _mm_loadu_pd(in); \
+            in += types_per_step; \
+            __m128d vecB = _mm_loadu_pd(out); \
+            __m128d res = _mm_##op##_pd(vecA, vecB); \
+            _mm_storeu_pd(out, res); \
+            out += types_per_step; \
+        } \
+    }
+#else
+#define OP_AVX_SSE2_DOUBLE_FUNC(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+
+#define OP_AVX_DOUBLE_FUNC(op) \
+static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, void *_out, int *count, \
+                                                               struct ompi_datatype_t **dtype, \
+                                                               struct ompi_op_base_module_1_0_0_t *module) \
+{ \
+    int types_per_step = (512 / 8) / sizeof(double); \
+    int left_over = *count; \
+    double* in = (double*)_in; \
+    double* out = (double*)_out; \
+    OP_AVX_AVX512_DOUBLE_FUNC(op); \
+    OP_AVX_AVX_DOUBLE_FUNC(op); \
+    OP_AVX_SSE2_DOUBLE_FUNC(op); \
+    while( left_over > 0 ) { \
+        int how_much = (left_over > 8) ? 
8 : left_over; \ + switch(how_much) { \ + case 8: out[7] = current_func(out[7], in[7]); \ + case 7: out[6] = current_func(out[6], in[6]); \ + case 6: out[5] = current_func(out[5], in[5]); \ + case 5: out[4] = current_func(out[4], in[4]); \ + case 4: out[3] = current_func(out[3], in[3]); \ + case 3: out[2] = current_func(out[2], in[2]); \ + case 2: out[1] = current_func(out[1], in[1]); \ + case 1: out[0] = current_func(out[0], in[0]); \ + } \ + left_over -= how_much; \ + out += how_much; \ + in += how_much; \ + } \ +} + + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) + OP_AVX_FUNC(max, i, 8, int8_t, max) + OP_AVX_FUNC(max, u, 8, uint8_t, max) + OP_AVX_FUNC(max, i, 16, int16_t, max) + OP_AVX_FUNC(max, u, 16, uint16_t, max) + OP_AVX_FUNC(max, i, 32, int32_t, max) + OP_AVX_FUNC(max, u, 32, uint32_t, max) +#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) + OP_AVX_FUNC(max, i, 64, int64_t, max) + OP_AVX_FUNC(max, u, 64, uint64_t, max) +#endif + + /* Floating point */ + OP_AVX_FLOAT_FUNC(max) + OP_AVX_DOUBLE_FUNC(max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_AVX_FUNC(min, i, 8, int8_t, min) + OP_AVX_FUNC(min, u, 8, uint8_t, min) + OP_AVX_FUNC(min, i, 16, int16_t, min) + OP_AVX_FUNC(min, u, 16, uint16_t, min) + OP_AVX_FUNC(min, i, 32, int32_t, min) + OP_AVX_FUNC(min, u, 32, uint32_t, min) +#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) + OP_AVX_FUNC(min, i, 64, int64_t, min) + OP_AVX_FUNC(min, u, 64, uint64_t, min) +#endif + + /* Floating point */ + OP_AVX_FLOAT_FUNC(min) + OP_AVX_DOUBLE_FUNC(min) + +/************************************************************************* + * Sum + ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_AVX_FUNC(sum, i, 8, int8_t, adds) + OP_AVX_FUNC(sum, u, 8, uint8_t, adds) + OP_AVX_FUNC(sum, i, 16, int16_t, adds) + OP_AVX_FUNC(sum, u, 16, uint16_t, adds) + OP_AVX_FUNC(sum, i, 32, int32_t, add) + OP_AVX_FUNC(sum, i, 32, uint32_t, add) + OP_AVX_FUNC(sum, i, 64, int64_t, add) + OP_AVX_FUNC(sum, i, 64, uint64_t, add) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(add) + OP_AVX_DOUBLE_FUNC(add) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_AVX_MUL(prod, i, 8, int8_t, mullo) + OP_AVX_MUL(prod, i, 8, uint8_t, mullo) + OP_AVX_FUNC(prod, i, 16, int16_t, mullo) + OP_AVX_FUNC(prod, i, 16, uint16_t, mullo) + OP_AVX_FUNC(prod, i, 32, int32_t, mullo) + OP_AVX_FUNC(prod, i ,32, uint32_t, mullo) +#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) + OP_AVX_FUNC(prod, i, 64, int64_t, mullo) + OP_AVX_FUNC(prod, i, 64, uint64_t, mullo) +#endif + + /* Floating point */ + OP_AVX_FLOAT_FUNC(mul) + OP_AVX_DOUBLE_FUNC(mul) + +/************************************************************************* + * Bitwise AND + 
*************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_AVX_BIT_FUNC(band, 8, int8_t, and) + OP_AVX_BIT_FUNC(band, 8, uint8_t, and) + OP_AVX_BIT_FUNC(band, 16, int16_t, and) + OP_AVX_BIT_FUNC(band, 16, uint16_t, and) + OP_AVX_BIT_FUNC(band, 32, int32_t, and) + OP_AVX_BIT_FUNC(band, 32, uint32_t, and) + OP_AVX_BIT_FUNC(band, 64, int64_t, and) + OP_AVX_BIT_FUNC(band, 64, uint64_t, and) + + // not defined - OP_AVX_FLOAT_FUNC(and) + // not defined - OP_AVX_DOUBLE_FUNC(and) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_AVX_BIT_FUNC(bor, 8, int8_t, or) + OP_AVX_BIT_FUNC(bor, 8, uint8_t, or) + OP_AVX_BIT_FUNC(bor, 16, int16_t, or) + OP_AVX_BIT_FUNC(bor, 16, uint16_t, or) + OP_AVX_BIT_FUNC(bor, 32, int32_t, or) + OP_AVX_BIT_FUNC(bor, 32, uint32_t, or) + OP_AVX_BIT_FUNC(bor, 64, int64_t, or) + OP_AVX_BIT_FUNC(bor, 64, uint64_t, or) + + // not defined - OP_AVX_FLOAT_FUNC(or) + // not defined - OP_AVX_DOUBLE_FUNC(or) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_AVX_BIT_FUNC(bxor, 8, int8_t, xor) + OP_AVX_BIT_FUNC(bxor, 8, uint8_t, xor) + OP_AVX_BIT_FUNC(bxor, 16, int16_t, xor) + OP_AVX_BIT_FUNC(bxor, 16, uint16_t, xor) + OP_AVX_BIT_FUNC(bxor, 32, int32_t, xor) + OP_AVX_BIT_FUNC(bxor, 32, uint32_t, xor) + OP_AVX_BIT_FUNC(bxor, 64, int64_t, xor) + OP_AVX_BIT_FUNC(bxor, 64, uint64_t, xor) + + // not defined - OP_AVX_FLOAT_FUNC(xor) + // not defined - OP_AVX_DOUBLE_FUNC(xor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. 
+ */ +#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) +#define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ + int types_per_step = (512 / 8) / sizeof(type); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in1); \ + __m512i vecB = _mm512_loadu_si512(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm512_storeu_si512((out), res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } +#else +#define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + +#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) +#define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + int types_per_step = (256 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256((__m256i*)in1); \ + __m256i vecB = _mm256_loadu_si256((__m256i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm256_storeu_si256((__m256i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } +#else +#define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + +#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_SSE41) && (1 == OMPI_MCA_OP_HAVE_SSE41) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) +#define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \ + int types_per_step = (128 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m128i vecA = _mm_lddqu_si128((__m128i*)in1); \ + __m128i vecB = _mm_lddqu_si128((__m128i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm_storeu_si128((__m128i*)out, res); \ + out += types_per_step; \ + } \ + } +#else +#define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +#define OP_AVX_FUNC_3(name, type_sign, type_size, type, op) \ +static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * restrict _in1, \ + const void * restrict _in2, \ + void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + type *in1 = (type*)_in1, *in2 = (type*)_in2, *out = (type*)_out; \ + int left_over = *count; \ + OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op); \ + OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op); \ + OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op); \ + while( left_over > 0 ) { \ + int how_much = (left_over > 8) ? 
8 : left_over; \
+        switch(how_much) { \
+        case 8:  out[7] = current_func(in1[7], in2[7]); \
+        case 7:  out[6] = current_func(in1[6], in2[6]); \
+        case 6:  out[5] = current_func(in1[5], in2[5]); \
+        case 5:  out[4] = current_func(in1[4], in2[4]); \
+        case 4:  out[3] = current_func(in1[3], in2[3]); \
+        case 3:  out[2] = current_func(in1[2], in2[2]); \
+        case 2:  out[1] = current_func(in1[1], in2[1]); \
+        case 1:  out[0] = current_func(in1[0], in2[0]); \
+        } \
+        left_over -= how_much; \
+        out += how_much; \
+        in1 += how_much; \
+        in2 += how_much; \
+    } \
+}
+
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
+        int types_per_step = (256 / 8) / sizeof(type); \
+        for (; left_over >= types_per_step; left_over -= types_per_step) { \
+            __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \
+            __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \
+            in1 += types_per_step; \
+            in2 += types_per_step; \
+            __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \
+            __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \
+            __m512i res = _mm512_##op##_ep##type_sign##16(vecA, vecB); \
+            vecB_tmp = _mm512_cvtepi16_epi8(res); \
+            _mm256_storeu_si256((__m256i*)out, vecB_tmp); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+/**
+ * There is no support for 16 to 8 conversion without AVX512BW and AVX512VL, so
+ * there is no AVX-only optimized function possible for OP_AVX_AVX2_MUL.
+ */
+
+/* special case for int8 mul */
+#define OP_AVX_MUL_3(name, type_sign, type_size, type, op) \
+static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * restrict _in1, \
+                                                                 const void * restrict _in2, \
+                                                                 void * restrict _out, int *count, \
+                                                                 struct ompi_datatype_t **dtype, \
+                                                                 struct ompi_op_base_module_1_0_0_t *module) \
+{ \
+    type *in1 = (type*)_in1, *in2 = (type*)_in2, *out = (type*)_out; \
+    int left_over = *count; \
+    OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op); \
+    while( left_over > 0 ) { \
+        int how_much = (left_over > 8) ? 
8 : left_over; \ + switch(how_much) { \ + case 8: out[7] = current_func(in1[7], in2[7]); \ + case 7: out[6] = current_func(in1[6], in2[6]); \ + case 6: out[5] = current_func(in1[5], in2[5]); \ + case 5: out[4] = current_func(in1[4], in2[4]); \ + case 4: out[3] = current_func(in1[3], in2[3]); \ + case 3: out[2] = current_func(in1[2], in2[2]); \ + case 2: out[1] = current_func(in1[1], in2[1]); \ + case 1: out[0] = current_func(in1[0], in2[0]); \ + } \ + left_over -= how_much; \ + out += how_much; \ + in1 += how_much; \ + in2 += how_much; \ + } \ +} + +#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) +#define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ + types_per_step = (512 / 8) / sizeof(type); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in1); \ + __m512i vecB = _mm512_loadu_si512(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i res = _mm512_##op##_si512(vecA, vecB); \ + _mm512_storeu_si512(out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } +#else +#define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + +#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) +#define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + types_per_step = (256 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256((__m256i*)in1); \ + __m256i vecB = _mm256_loadu_si256((__m256i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m256i res = _mm256_##op##_si256(vecA, vecB); \ + _mm256_storeu_si256((__m256i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } +#else +#define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + +#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) +#define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \ + types_per_step = (128 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m128i vecA = _mm_lddqu_si128((__m128i*)in1); \ + __m128i vecB = _mm_lddqu_si128((__m128i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m128i res = _mm_##op##_si128(vecA, vecB); \ + _mm_storeu_si128((__m128i*)out, res); \ + out += types_per_step; \ + } \ + } +#else +#define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) {} +#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +#define OP_AVX_BIT_FUNC_3(name, type_size, type, op) \ +static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, const void *_in2, \ + void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step, left_over = *count; \ + type *in1 = (type*)_in1, *in2 = (type*)_in2, *out = (type*)_out; \ + OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op); \ + OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op); \ + OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op); \ + while( left_over > 0 ) { \ + 
int how_much = (left_over > 8) ? 8 : left_over; \
+        switch(how_much) { \
+        case 8: out[7] = current_func(in1[7], in2[7]); \
+        case 7: out[6] = current_func(in1[6], in2[6]); \
+        case 6: out[5] = current_func(in1[5], in2[5]); \
+        case 5: out[4] = current_func(in1[4], in2[4]); \
+        case 4: out[3] = current_func(in1[3], in2[3]); \
+        case 3: out[2] = current_func(in1[2], in2[2]); \
+        case 2: out[1] = current_func(in1[1], in2[1]); \
+        case 1: out[0] = current_func(in1[0], in2[0]); \
+        } \
+        left_over -= how_much; \
+        out += how_much; \
+        in1 += how_much; \
+        in2 += how_much; \
+    } \
+}
+
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#define OP_AVX_AVX512_FLOAT_FUNC_3(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
+        types_per_step = (512 / 8) / sizeof(float); \
+        for (; left_over >= types_per_step; left_over -= types_per_step) { \
+            __m512 vecA = _mm512_loadu_ps(in1); \
+            __m512 vecB = _mm512_loadu_ps(in2); \
+            in1 += types_per_step; \
+            in2 += types_per_step; \
+            __m512 res = _mm512_##op##_ps(vecA, vecB); \
+            _mm512_storeu_ps(out, res); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX512_FLOAT_FUNC_3(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+
+#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#define OP_AVX_AVX_FLOAT_FUNC_3(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
+        types_per_step = (256 / 8) / sizeof(float); \
+        for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+            __m256 vecA = _mm256_loadu_ps(in1); \
+            __m256 vecB = _mm256_loadu_ps(in2); \
+            in1 += types_per_step; \
+            in2 += types_per_step; \
+            __m256 res = _mm256_##op##_ps(vecA, vecB); \
+            _mm256_storeu_ps(out, res); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX_FLOAT_FUNC_3(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+
+#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#define OP_AVX_SSE_FLOAT_FUNC_3(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
+        types_per_step = (128 / 8) / sizeof(float); \
+        for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+            __m128 vecA = _mm_loadu_ps(in1); \
+            __m128 vecB = _mm_loadu_ps(in2); \
+            in1 += types_per_step; \
+            in2 += types_per_step; \
+            __m128 res = _mm_##op##_ps(vecA, vecB); \
+            _mm_storeu_ps(out, res); \
+            out += types_per_step; \
+        } \
+    }
+#else
+#define OP_AVX_SSE_FLOAT_FUNC_3(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+
+#define OP_AVX_FLOAT_FUNC_3(op) \
+static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, const void *_in2, \
+                                                              void *_out, int *count, \
+                                                              struct ompi_datatype_t **dtype, \
+                                                              struct ompi_op_base_module_1_0_0_t *module) \
+{ \
+    int types_per_step, left_over = *count; \
+    float *in1 = (float*)_in1, *in2 = (float*)_in2, *out = (float*)_out; \
+    OP_AVX_AVX512_FLOAT_FUNC_3(op); \
+    OP_AVX_AVX_FLOAT_FUNC_3(op); \
+    OP_AVX_SSE_FLOAT_FUNC_3(op); \
+    while( left_over > 0 ) { \
+        int how_much = (left_over > 8) ? 8 : left_over; \
+        switch(how_much) { \
+        case 8: out[7] = current_func(in1[7], in2[7]); \
+        case 7: out[6] = current_func(in1[6], in2[6]); \
+        case 6: out[5] = current_func(in1[5], in2[5]); \
+        case 5: out[4] = current_func(in1[4], in2[4]); \
+        case 4: out[3] = current_func(in1[3], in2[3]); \
+        case 3: out[2] = current_func(in1[2], in2[2]); \
+        case 2: out[1] = current_func(in1[1], in2[1]); \
+        case 1: out[0] = current_func(in1[0], in2[0]); \
+        } \
+        left_over -= how_much; \
+        out += how_much; \
+        in1 += how_much; \
+        in2 += how_much; \
+    } \
+}
+
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#define OP_AVX_AVX512_DOUBLE_FUNC_3(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
+        types_per_step = (512 / 8) / sizeof(double); \
+        for (; left_over >= types_per_step; left_over -= types_per_step) { \
+            __m512d vecA = _mm512_loadu_pd(in1); \
+            __m512d vecB = _mm512_loadu_pd(in2); \
+            in1 += types_per_step; \
+            in2 += types_per_step; \
+            __m512d res = _mm512_##op##_pd(vecA, vecB); \
+            _mm512_storeu_pd(out, res); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX512_DOUBLE_FUNC_3(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
+
+#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#define OP_AVX_AVX_DOUBLE_FUNC_3(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
+        types_per_step = (256 / 8) / sizeof(double); \
+        for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+            __m256d vecA = _mm256_loadu_pd(in1); \
+            __m256d vecB = _mm256_loadu_pd(in2); \
+            in1 += types_per_step; \
+            in2 += types_per_step; \
+            __m256d res = _mm256_##op##_pd(vecA, vecB); \
+            _mm256_storeu_pd(out, res); \
+            out += types_per_step; \
+        } \
+        if( 0 == left_over ) return; \
+    }
+#else
+#define OP_AVX_AVX_DOUBLE_FUNC_3(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */
+
+#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#define OP_AVX_SSE2_DOUBLE_FUNC_3(op) \
+    if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
+        types_per_step = (128 / 8) / sizeof(double); \
+        for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
+            __m128d vecA = _mm_loadu_pd(in1); \
+            __m128d vecB = _mm_loadu_pd(in2); \
+            in1 += types_per_step; \
+            in2 += types_per_step; \
+            __m128d res = _mm_##op##_pd(vecA, vecB); \
+            _mm_storeu_pd(out, res); \
+            out += types_per_step; \
+        } \
+    }
+#else
+#define OP_AVX_SSE2_DOUBLE_FUNC_3(op) {}
+#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */
+
+#define OP_AVX_DOUBLE_FUNC_3(op) \
+static void OP_CONCAT(ompi_op_avx_3buff_##op##_double,PREPEND)(const void *_in1, const void *_in2, \
+                                                               void *_out, int *count, \
+                                                               struct ompi_datatype_t **dtype, \
+                                                               struct ompi_op_base_module_1_0_0_t *module) \
+{ \
+    int types_per_step, left_over = *count; \
+    double *in1 = (double*)_in1, *in2 = (double*)_in2, *out = (double*)_out; \
+    OP_AVX_AVX512_DOUBLE_FUNC_3(op); \
+    OP_AVX_AVX_DOUBLE_FUNC_3(op); \
+    OP_AVX_SSE2_DOUBLE_FUNC_3(op); \
+    while( left_over > 0 ) { \
+        int how_much = (left_over > 8) ? 8 : left_over; \
+        switch(how_much) { \
+        case 8: out[7] = current_func(in1[7], in2[7]); \
+        case 7: out[6] = current_func(in1[6], in2[6]); \
+        case 6: out[5] = current_func(in1[5], in2[5]); \
+        case 5: out[4] = current_func(in1[4], in2[4]); \
+        case 4: out[3] = current_func(in1[3], in2[3]); \
+        case 3: out[2] = current_func(in1[2], in2[2]); \
+        case 2: out[1] = current_func(in1[1], in2[1]); \
+        case 1: out[0] = current_func(in1[0], in2[0]); \
+        } \
+        left_over -= how_much; \
+        out += how_much; \
+        in1 += how_much; \
+        in2 += how_much; \
+    } \
+}
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) > (b) ? (a) : (b))
+
+    OP_AVX_FUNC_3(max, i, 8, int8_t, max)
+    OP_AVX_FUNC_3(max, u, 8, uint8_t, max)
+    OP_AVX_FUNC_3(max, i, 16, int16_t, max)
+    OP_AVX_FUNC_3(max, u, 16, uint16_t, max)
+    OP_AVX_FUNC_3(max, i, 32, int32_t, max)
+    OP_AVX_FUNC_3(max, u, 32, uint32_t, max)
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+    OP_AVX_FUNC_3(max, i, 64, int64_t, max)
+    OP_AVX_FUNC_3(max, u, 64, uint64_t, max)
+#endif
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC_3(max)
+    OP_AVX_DOUBLE_FUNC_3(max)
+
+/*************************************************************************
+ * Min
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) < (b) ? (a) : (b))
+    OP_AVX_FUNC_3(min, i, 8, int8_t, min)
+    OP_AVX_FUNC_3(min, u, 8, uint8_t, min)
+    OP_AVX_FUNC_3(min, i, 16, int16_t, min)
+    OP_AVX_FUNC_3(min, u, 16, uint16_t, min)
+    OP_AVX_FUNC_3(min, i, 32, int32_t, min)
+    OP_AVX_FUNC_3(min, u, 32, uint32_t, min)
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+    OP_AVX_FUNC_3(min, i, 64, int64_t, min)
+    OP_AVX_FUNC_3(min, u, 64, uint64_t, min)
+#endif
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC_3(min)
+    OP_AVX_DOUBLE_FUNC_3(min)
+
+/*************************************************************************
+ * Sum
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) + (b))
+
+    OP_AVX_FUNC_3(sum, i, 8, int8_t, add)
+    OP_AVX_FUNC_3(sum, i, 8, uint8_t, add)
+    OP_AVX_FUNC_3(sum, i, 16, int16_t, add)
+    OP_AVX_FUNC_3(sum, i, 16, uint16_t, add)
+    OP_AVX_FUNC_3(sum, i, 32, int32_t, add)
+    OP_AVX_FUNC_3(sum, i, 32, uint32_t, add)
+    OP_AVX_FUNC_3(sum, i, 64, int64_t, add)
+    OP_AVX_FUNC_3(sum, i, 64, uint64_t, add)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC_3(add)
+    OP_AVX_DOUBLE_FUNC_3(add)
+
+/*************************************************************************
+ * Product
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) * (b))
+    OP_AVX_MUL_3(prod, i, 8, int8_t, mullo)
+    OP_AVX_MUL_3(prod, i, 8, uint8_t, mullo)
+    OP_AVX_FUNC_3(prod, i, 16, int16_t, mullo)
+    OP_AVX_FUNC_3(prod, i, 16, uint16_t, mullo)
+    OP_AVX_FUNC_3(prod, i, 32, int32_t, mullo)
+    OP_AVX_FUNC_3(prod, i, 32, uint32_t, mullo)
+#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+    OP_AVX_FUNC_3(prod, i, 64, int64_t, mullo)
+    OP_AVX_FUNC_3(prod, i, 64, uint64_t, mullo)
+#endif
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC_3(mul)
+    OP_AVX_DOUBLE_FUNC_3(mul)
+
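+/*
+ * A note on the pattern shared by all of the generated functions above:
+ * each function first tries the widest ISA level that was both compiled
+ * in and reported available by the OMPI_OP_AVX_HAS_FLAGS() runtime check
+ * (AVX512 first, then AVX2/AVX, then SSE), consuming as many full
+ * vectors as possible; whatever remains is handled by the trailing
+ * scalar loop, at most eight elements per iteration. The missing breaks
+ * in that switch are deliberate: the cases fall through so that exactly
+ * how_much elements are reduced with current_func.
+ */
+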
+/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_AVX_BIT_FUNC_3(band, 8, int8_t, and) + OP_AVX_BIT_FUNC_3(band, 8, uint8_t, and) + OP_AVX_BIT_FUNC_3(band, 16, int16_t, and) + OP_AVX_BIT_FUNC_3(band, 16, uint16_t, and) + OP_AVX_BIT_FUNC_3(band, 32, int32_t, and) + OP_AVX_BIT_FUNC_3(band, 32, uint32_t, and) + OP_AVX_BIT_FUNC_3(band, 64, int64_t, and) + OP_AVX_BIT_FUNC_3(band, 64, uint64_t, and) + + // not defined - OP_AVX_FLOAT_FUNC_3(and) + // not defined - OP_AVX_DOUBLE_FUNC_3(and) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_AVX_BIT_FUNC_3(bor, 8, int8_t, or) + OP_AVX_BIT_FUNC_3(bor, 8, uint8_t, or) + OP_AVX_BIT_FUNC_3(bor, 16, int16_t, or) + OP_AVX_BIT_FUNC_3(bor, 16, uint16_t, or) + OP_AVX_BIT_FUNC_3(bor, 32, int32_t, or) + OP_AVX_BIT_FUNC_3(bor, 32, uint32_t, or) + OP_AVX_BIT_FUNC_3(bor, 64, int64_t, or) + OP_AVX_BIT_FUNC_3(bor, 64, uint64_t, or) + + // not defined - OP_AVX_FLOAT_FUNC_3(or) + // not defined - OP_AVX_DOUBLE_FUNC_3(or) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_AVX_BIT_FUNC_3(bxor, 8, int8_t, xor) + OP_AVX_BIT_FUNC_3(bxor, 8, uint8_t, xor) + OP_AVX_BIT_FUNC_3(bxor, 16, int16_t, xor) + OP_AVX_BIT_FUNC_3(bxor, 16, uint16_t, xor) + OP_AVX_BIT_FUNC_3(bxor, 32, int32_t, xor) + OP_AVX_BIT_FUNC_3(bxor, 32, uint32_t, xor) + OP_AVX_BIT_FUNC_3(bxor, 64, int64_t, xor) + OP_AVX_BIT_FUNC_3(bxor, 64, uint64_t, xor) + + // not defined - OP_AVX_FLOAT_FUNC_3(xor) + // not defined - OP_AVX_DOUBLE_FUNC_3(xor) + +/** C integer ***********************************************************/ +#define C_INTEGER_8_16_32(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_int8_t,PREPEND), \ + [OMPI_OP_BASE_TYPE_UINT8_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_uint8_t,PREPEND), \ + [OMPI_OP_BASE_TYPE_INT16_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_int16_t,PREPEND), \ + [OMPI_OP_BASE_TYPE_UINT16_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_uint16_t,PREPEND), \ + [OMPI_OP_BASE_TYPE_INT32_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_int32_t,PREPEND), \ + [OMPI_OP_BASE_TYPE_UINT32_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_uint32_t,PREPEND) + +#define C_INTEGER(name, ftype) \ + C_INTEGER_8_16_32(name, ftype), \ + [OMPI_OP_BASE_TYPE_INT64_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_int64_t,PREPEND), \ + [OMPI_OP_BASE_TYPE_UINT64_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_uint64_t,PREPEND) + +#if defined(GENERATE_AVX512_CODE) +#define C_INTEGER_OPTIONAL(name, ftype) \ + C_INTEGER_8_16_32(name, ftype), \ + [OMPI_OP_BASE_TYPE_INT64_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_int64_t,PREPEND), \ + [OMPI_OP_BASE_TYPE_UINT64_T] = OP_CONCAT(ompi_op_avx_##ftype##_##name##_uint64_t,PREPEND) +#else +#define C_INTEGER_OPTIONAL(name, ftype) \ + C_INTEGER_8_16_32(name, ftype) +#endif + +/** Floating point, including all the Fortran reals *********************/ +#define FLOAT(name, ftype) OP_CONCAT(ompi_op_avx_##ftype##_##name##_float,PREPEND) +#define DOUBLE(name, ftype) 
OP_CONCAT(ompi_op_avx_##ftype##_##name##_double,PREPEND) + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = NULL, \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_base_handler_fn_t OP_CONCAT(ompi_op_avx_functions, PREPEND)[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER_OPTIONAL(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER_OPTIONAL(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FLOATING_POINT(add, 2buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER_OPTIONAL(prod, 2buff), + FLOATING_POINT(mul, 2buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, + +}; + +ompi_op_base_3buff_handler_fn_t OP_CONCAT(ompi_op_avx_3buff_functions, PREPEND)[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER_OPTIONAL(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER_OPTIONAL(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FLOATING_POINT(add, 3buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER_OPTIONAL(prod, 3buff), + FLOATING_POINT(mul, 3buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(and, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(or, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(xor, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* 
MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, +}; diff --git a/ompi/mca/op/base/op_base_functions.c b/ompi/mca/op/base/op_base_functions.c index f08a9d04be..cf7512a919 100644 --- a/ompi/mca/op/base/op_base_functions.c +++ b/ompi/mca/op/base/op_base_functions.c @@ -38,7 +38,7 @@ * This macro is for (out op in). */ #define OP_FUNC(name, type_name, type, op) \ - static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \ + static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -58,7 +58,7 @@ * This macro is for (out = op(out, in)) */ #define FUNC_FUNC(name, type_name, type) \ - static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \ + static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -86,7 +86,7 @@ } ompi_op_predefined_##type_name##_t; #define LOC_FUNC(name, type_name, op) \ - static void ompi_op_base_2buff_##name##_##type_name(void *in, void *out, int *count, \ + static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -110,7 +110,7 @@ * not supports the corresponding complex number type. */ #define COMPLEX_SUM_FUNC(type_name, type) \ - static void ompi_op_base_2buff_sum_##type_name(void *in, void *out, int *count, \ + static void ompi_op_base_2buff_sum_##type_name(const void *in, void *out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -130,7 +130,7 @@ * not supports the corresponding complex number type. */ #define COMPLEX_PROD_FUNC(type_name, type) \ - static void ompi_op_base_2buff_prod_##type_name(void *in, void *out, int *count, \ + static void ompi_op_base_2buff_prod_##type_name(const void *in, void *out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -652,8 +652,8 @@ LOC_FUNC(minloc, long_double_int, <) * routines, needed for some optimizations. 
*/ #define OP_FUNC_3BUF(name, type_name, type, op) \ - static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \ - void * restrict in2, void * restrict out, int *count, \ + static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \ + const void * restrict in2, void * restrict out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -674,8 +674,8 @@ LOC_FUNC(minloc, long_double_int, <) * This macro is for (out = op(in1, in2)) */ #define FUNC_FUNC_3BUF(name, type_name, type) \ - static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \ - void * restrict in2, void * restrict out, int *count, \ + static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \ + const void * restrict in2, void * restrict out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -707,8 +707,8 @@ LOC_FUNC(minloc, long_double_int, <) */ #define LOC_FUNC_3BUF(name, type_name, op) \ - static void ompi_op_base_3buff_##name##_##type_name(void * restrict in1, \ - void * restrict in2, void * restrict out, int *count, \ + static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \ + const void * restrict in2, void * restrict out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -737,8 +737,8 @@ LOC_FUNC(minloc, long_double_int, <) * not supports the corresponding complex number type. */ #define COMPLEX_SUM_FUNC_3BUF(type_name, type) \ - static void ompi_op_base_3buff_sum_##type_name(void * restrict in1, \ - void * restrict in2, void * restrict out, int *count, \ + static void ompi_op_base_3buff_sum_##type_name(const void * restrict in1, \ + const void * restrict in2, void * restrict out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ @@ -759,8 +759,8 @@ LOC_FUNC(minloc, long_double_int, <) * not supports the corresponding complex number type. */ #define COMPLEX_PROD_FUNC_3BUF(type_name, type) \ - static void ompi_op_base_3buff_prod_##type_name(void * restrict in1, \ - void * restrict in2, void * restrict out, int *count, \ + static void ompi_op_base_3buff_prod_##type_name(const void * restrict in1, \ + const void * restrict in2, void * restrict out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ diff --git a/ompi/mca/op/base/op_base_op_select.c b/ompi/mca/op/base/op_base_op_select.c index 93a72ace23..837bc51580 100644 --- a/ompi/mca/op/base/op_base_op_select.c +++ b/ompi/mca/op/base/op_base_op_select.c @@ -14,6 +14,8 @@ * rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2020 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -206,8 +208,8 @@ int ompi_op_base_op_select(ompi_op_t *op)
 static int avail_op_compare(opal_list_item_t **itema,
                             opal_list_item_t **itemb)
 {
-    avail_op_t *availa = (avail_op_t *) itema;
-    avail_op_t *availb = (avail_op_t *) itemb;
+    avail_op_t *availa = (avail_op_t *) *itema;
+    avail_op_t *availb = (avail_op_t *) *itemb;
 
     if (availa->ao_priority > availb->ao_priority) {
         return 1;
diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h
index 5d1bff93f9..f81127a74f 100644
--- a/ompi/mca/op/op.h
+++ b/ompi/mca/op/op.h
@@ -255,7 +255,7 @@ typedef struct ompi_op_base_module_1_0_0_t ompi_op_base_module_t;
  * repeated code, but it's better this way (and this typedef will
  * never change, so there's not much of a maintenance worry).
  */
-typedef void (*ompi_op_base_handler_fn_1_0_0_t)(void *, void *, int *,
+typedef void (*ompi_op_base_handler_fn_1_0_0_t)(const void *, void *, int *,
                                                 struct ompi_datatype_t **,
                                                 struct ompi_op_base_module_1_0_0_t *);
 
@@ -264,8 +264,8 @@ typedef ompi_op_base_handler_fn_1_0_0_t ompi_op_base_handler_fn_t;
 /*
  * Typedef for 3-buffer (two input and one output) op functions.
  */
-typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(void *,
-                                                      void *,
+typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(const void *,
+                                                      const void *,
                                                       void *, int *,
                                                       struct ompi_datatype_t **,
                                                       struct ompi_op_base_module_1_0_0_t *);
diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am
index 4366724a52..3da8fdffb7 100644
--- a/test/datatype/Makefile.am
+++ b/test/datatype/Makefile.am
@@ -16,7 +16,7 @@ if PROJECT_OMPI
     MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data
-    MPI_CHECKS = to_self
+    MPI_CHECKS = to_self reduce_local
 endif
 
 TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS)
@@ -96,5 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS)
 unpack_hetero_LDADD = \
         $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
 
+reduce_local_SOURCES = reduce_local.c
+reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS)
+reduce_local_LDADD = \
+        $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
+        $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
+
 distclean:
         rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile
diff --git a/test/datatype/check_op.sh b/test/datatype/check_op.sh
new file mode 100755
index 0000000000..820e938eaf
--- /dev/null
+++ b/test/datatype/check_op.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+set -u
+
+echo "ompi version with AVX512 -- Usage: arg1: count of elements, arg2: 'i'|'u'|'f'|'d' (datatype: signed, unsigned, float, double), arg3: size of type, arg4: operation"
+mpirun="mpirun --mca pml ob1 --mca btl vader,self"
+# For the SVE architecture
+# echo "$mpirun -mca op_sve_hardware_available 0 -mca op_avx_hardware_available 0 -np 1 Reduce_local_float 1048576 i 8 max"
+
+# For X86_64 architectures
+# echo "$mpirun -mca op_avx_support 0 -np 1 Reduce_local_float 1048576 i 8 max"
+
+Orange="\033[0;33m"
+Blue="\033[0;34m"
+Purple="\033[0;35m"
+Yellow="\e[1;33m"
+
+NC="\e[m"
+
+verbose=0
+
+echo "=========Signed Integer type all operations & all sizes========"
+echo ""
+for op in max min sum prod band bor bxor; do
+    echo -e "\n===Operation $op test==="
+    for type_size in 8 16 32 64; do
+        for size in 0 1 7 15 31 63 127 130; do
+            foo=$((1024 * 1024 + $size))
+            echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * $type_size"
+            cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t i -s $type_size -o $op"
+            if test $verbose -eq 1 ; then echo $cmd; fi
+            eval $cmd
+        done
+        echo -e "\n\n"
+    done
+    echo -e "\n\n"
+done
+echo "=========Signed Integer type all operations & all sizes========"
+echo -e "\n\n"
+
+echo "=========Unsigned Integer type all operations & all sizes========"
+echo ""
+for op in max min sum prod band bor bxor; do
+    echo -e "\n===Operation $op test==="
+    for type_size in 8 16 32 64; do
+        for size in 0 1 7 15 31 63 127 130; do
+            foo=$((1024 * 1024 + $size))
+            echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * $type_size"
+            cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t u -s $type_size -o $op"
+            if test $verbose -eq 1 ; then echo $cmd; fi
+            eval $cmd
+        done
+    done
+done
+echo "=========Unsigned Integer type all operations & all sizes========"
+echo -e "\n\n"
+
+echo "=======Float type all operations========"
+echo ""
+for op in max min sum prod; do
+    for size in 1024 127 130; do
+        foo=$((1024 * 1024 + $size))
+        echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * 32"
+        cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t f -s 32 -o $op"
+        if test $verbose -eq 1 ; then echo $cmd; fi
+        eval $cmd
+    done
+done
+
+echo "========Double type all operations========="
+echo ""
+for op in max min sum prod; do
+    for size in 1024 127 130; do
+        foo=$((1024 * 1024 + $size))
+        echo -e "Test $Yellow __mm512 instruction for loop $NC Total_num_bits = $foo * 64"
+        cmd="$mpirun -np 1 reduce_local -l $foo -u $foo -t d -s 64 -o $op"
+        if test $verbose -eq 1 ; then echo $cmd; fi
+        eval $cmd
+    done
+done
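+
+# Example of checking a single op/type/size combination by hand (this is
+# the same command line the loops above construct; adjust the count,
+# type, size and op as needed):
+#   $mpirun -np 1 reduce_local -l 1048703 -u 1048703 -t i -s 32 -o sum -v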
diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c
new file mode 100644
index 0000000000..aed0de4010
--- /dev/null
+++ b/test/datatype/reduce_local.c
@@ -0,0 +1,1425 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2019-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2020      Research Organization for Information Science
+ *                         and Technology (RIST).  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "mpi.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/runtime/mpiruntime.h"
+#include "ompi/datatype/ompi_datatype.h"
+
+typedef struct op_name_s {
+    char* name;
+    char* mpi_op_name;
+    MPI_Op op;
+} op_name_t;
+static op_name_t array_of_ops [] = {
+    { "max", "MPI_MAX", MPI_MAX },
+    { "min", "MPI_MIN", MPI_MIN },
+    { "sum", "MPI_SUM", MPI_SUM },
+    { "prod", "MPI_PROD", MPI_PROD },
+    { "land", "MPI_LAND", MPI_LAND },
+    { "band", "MPI_BAND", MPI_BAND },
+    { "lor", "MPI_LOR", MPI_LOR },
+    { "bor", "MPI_BOR", MPI_BOR },
+    { "lxor", "MPI_LXOR", MPI_LXOR },
+    { "bxor", "MPI_BXOR", MPI_BXOR },
+    { "replace", "MPI_REPLACE", MPI_REPLACE },
+    { NULL, "MPI_OP_NULL", MPI_OP_NULL }
+};
+static int do_ops[12] = { -1, };  /* indices of the ops to do, -1 terminated (one slot more than the number of ops) */
+static int verbose = 0;
+static int total_errors = 0;
+
+#define max(a,b) \
+    ({ __typeof__ (a) _a = (a); \
+       __typeof__ (b) _b = (b); \
+       _a > _b ? _a : _b; })
+
+#define min(a,b) \
+    ({ __typeof__ (a) _a = (a); \
+       __typeof__ (b) _b = (b); \
+       _a < _b ? _a : _b; })
+
+static void print_status(char* op, char* type, int type_size,
+                         int count, double duration,
+                         int correct )
+{
+    if(correct) {
+        printf("%-10s %s %-10d%s ", op, type, type_size, (verbose ? " [\033[1;32msuccess\033[0m]" : ""));
+    } else {
+        printf("%-10s %s [\033[1;31mfail\033[0m]", op, type);
+        total_errors++;
+    }
+    printf(" count %-10d time %.6f seconds\n", count, duration);
+}
+
+int main(int argc, char **argv)
+{
+    static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL;
+    int count, type_size = 8, rank, size, provided, correctness = 1;
+    int repeats = 1, i, c;
+    double tstart, tend;
+    bool check = true;
+    char type[5] = "uifd", *op = "sum", *mpi_type;
+    int lower = 1, upper = 1000000, skip_op_type;
+    MPI_Op mpi_op;
+
+    while( -1 != (c = getopt(argc, argv, "l:u:r:t:o:s:vfh")) ) {
+        switch(c) {
+        case 'l':
+            lower = atoi(optarg);
+            if( lower <= 0 ) {
+                fprintf(stderr, "The number of elements must be positive\n");
+                exit(-1);
+            }
+            break;
+        case 'u':
+            upper = atoi(optarg);
+            break;
+        case 'f':
+            check = false;
+            break;
+        case 'v':
+            verbose++;
+            break;
+        case 'r':
+            repeats = atoi(optarg);
+            if( repeats <= 0 ) {
+                fprintf(stderr, "The number of repetitions (%d) must be positive\n", repeats);
+                exit(-1);
+            }
+            break;
+        case 't':
+            for( i = 0; i < (int)strlen(optarg); i++ ) {
+                if( ! (('i' == optarg[i]) || ('u' == optarg[i]) ||
+                       ('f' == optarg[i]) || ('d' == optarg[i])) ) {
+                    fprintf(stderr, "type must be i (signed int), u (unsigned int), f (float) or d (double)\n");
+                    exit(-1);
+                }
+            }
+            strncpy(type, optarg, 4);
+            break;
+        case 'o':
+        {
+            if( 0 == strcmp(optarg, "all") ) {
+                for( i = 0; NULL != array_of_ops[i].name; i++ ) {
+                    do_ops[i] = i;
+                }
+                do_ops[i] = -1;  /* stop */
+            } else {
+                int n, idx = 0;
+                char* token, *arg = optarg;
+                while ((token = strsep(&arg, ",")) != NULL) {
+                    for( i = 0; NULL != array_of_ops[i].name; i++ ) {  /* find the op */
+                        if( 0 == strcmp(array_of_ops[i].name, token) ) {
+                            /* check if the op was not already selected */
+                            for(n = 0; n < idx; n++ ) {
+                                if( i == do_ops[n] ) {
+                                    break;
+                                }
+                            }
+                            if( n >= idx ) {
+                                do_ops[idx++] = i;
+                                do_ops[idx] = -1;
+                            }
+                            break;
+                        }
+                    }
+                    if( NULL == array_of_ops[i].name ) {
+                        fprintf(stderr, "Unknown op %s. Ignored.\n", token);
+                    }
+                }
+            }
+        }
+        break;
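+        /* (the -o argument above is a comma separated list, e.g. "-o sum,prod,band";
+         *  "-o all" selects every operation in array_of_ops) */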
+        case 's':
+            type_size = atoi(optarg);
+            if( !((8 == type_size) || (16 == type_size) || (32 == type_size) || (64 == type_size)) ) {
+                fprintf(stderr, "type_size must be 8, 16, 32 or 64. %d is an invalid value\n",
+                        type_size);
+                exit(-1);
+            }
+            break;
+        case 'h':
+            fprintf(stdout, "%s options are:\n"
+                    " -l <number> : lower number of elements\n"
+                    " -u <number> : upper number of elements\n"
+                    " -s <type_size> : 8, 16, 32 or 64 bits elements\n"
+                    " -t [i,u,f,d] : type of the elements to apply the operations on\n"
+                    " -o <op> : comma separated list of operations to execute among\n"
+                    "           sum, min, max, prod, bor, bxor, band\n"
+                    " -r <number> : number of repetitions\n"
+                    " -f : turn off the checking of the results\n"
+                    " -v : increase the verbosity\n"
+                    " -h: this help message\n", argv[0]);
+            exit(0);
+        }
+    }
+
+    in_buf = malloc(upper * sizeof(double));
+    inout_buf = malloc(upper * sizeof(double));
+    inout_check_buf = malloc(upper * sizeof(double));
+
+    ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false);
+
+    rank = ompi_comm_rank(MPI_COMM_WORLD); (void)rank;
+    size = ompi_comm_size(MPI_COMM_WORLD); (void)size;
+
+    for(uint32_t type_idx = 0; type_idx < strlen(type); type_idx++ ) {
+        for(uint32_t op_idx = 0; do_ops[op_idx] >= 0; op_idx++ ) {
+            op = array_of_ops[do_ops[op_idx]].name;
+            mpi_op = array_of_ops[do_ops[op_idx]].op;
+            skip_op_type = 1;
+
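+            /* For every count between lower and upper (doubling each
+             * time), the blocks below fill in[] with a constant, fill
+             * inout[] and a shadow copy of it with another constant,
+             * apply MPI_Reduce_local, and verify each element against
+             * the scalar definition of the selected op. */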
+            for( count = lower; count <= upper; count += count ) {
+                mpi_type = NULL;
+                correctness = 1;
+                if('i' == type[type_idx]) {
+                    if( 8 == type_size ) {
+                        int8_t *in_int8 = (int8_t*)in_buf,
+                            *inout_int8 = (int8_t*)inout_buf,
+                            *inout_int8_for_check = (int8_t*)inout_check_buf;
+                        for( i = 0; i < count; i++ ) {
+                            in_int8[i] = 5;
+                            inout_int8[i] = inout_int8_for_check[i] = -3;
+                        }
+                        mpi_type = "MPI_INT8_T";
+
+                        if( 0 == strcmp(op, "sum") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int8[i] == (int8_t)(in_int8[i] + inout_int8_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%d %s %d != %d)\n",
+                                           i, in_int8[i], op, inout_int8_for_check[i], inout_int8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "max") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int8[i] == max(inout_int8_for_check[i], in_int8[i]))
+                                        continue;
+                                    printf("First error at position %d (%d != %s(%d))\n",
+                                           i, inout_int8[i], op, in_int8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "min") ) {  // intentionally reversed in and out
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(inout_int8, in_int8, count, MPI_INT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int8[i] == min(inout_int8_for_check[i], in_int8[i]))
+                                        continue;
+                                    printf("First error at position %d (%d != %s(%d))\n",
+                                           i, inout_int8[i], op, in_int8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "bor") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int8[i] == (in_int8[i] | inout_int8_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%d %s %d != %d)\n",
+                                           i, in_int8[i], op, inout_int8_for_check[i], inout_int8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "bxor") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int8[i] == (in_int8[i] ^ inout_int8_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%d %s %d != %d)\n",
+                                           i, in_int8[i], op, inout_int8_for_check[i], inout_int8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "prod") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int8[i] == (int8_t)(in_int8[i] * inout_int8_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%d %s %d != %d)\n",
+                                           i, in_int8[i], op, inout_int8_for_check[i], inout_int8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "band") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int8[i] == (in_int8[i] & inout_int8_for_check[i]) )
+                                        continue;
+                                    printf("First error at position %d (%d %s %d != %d)\n",
+                                           i, in_int8[i], op, inout_int8_for_check[i], inout_int8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                    }
+                    if( 16 == type_size ) {
+                        int16_t *in_int16 = (int16_t*)in_buf,
+                            *inout_int16 = (int16_t*)inout_buf,
+                            *inout_int16_for_check = (int16_t*)inout_check_buf;
+                        for( i = 0; i < count; i++ ) {
+                            in_int16[i] = 5;
+                            inout_int16[i] = inout_int16_for_check[i] = -3;
+                        }
+                        mpi_type = "MPI_INT16_T";
+
+                        if( 0 == strcmp(op, "sum") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int16[i] == (int16_t)(in_int16[i] + inout_int16_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%d %s %d != %d)\n",
+                                           i, in_int16[i], op, inout_int16_for_check[i], inout_int16[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "max") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int16[i] == max(inout_int16_for_check[i], in_int16[i]))
+                                        continue;
+                                    printf("First error at position %d (%d != %s(%d))\n",
+                                           i, inout_int16[i], op, in_int16[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "min") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(inout_int16, in_int16, count, MPI_INT16_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int16[i] == min(inout_int16_for_check[i],in_int16[i]))
+                                        continue;
+                                    printf("First error at position %d (%d != %s(%d))\n",
+                                           i, inout_int16[i], op, in_int16[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "bor") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int16[i] == (in_int16[i] | inout_int16_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%d %s %d != %d)\n",
+                                           i, in_int16[i], op, inout_int16_for_check[i], inout_int16[i]);
+ correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "bxor") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int16[i] == (in_int16[i] ^ inout_int16_for_check[i])) + continue; + printf("First error at position %d (%d %s %d != %d)\n", + i, in_int16[i], op, inout_int16_for_check[i], inout_int16[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "prod") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int16[i] == (int16_t)(in_int16[i] * inout_int16_for_check[i])) + continue; + printf("First error at position %d (%d %s %d != %d)\n", + i, in_int16[i], op, inout_int16_for_check[i], inout_int16[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "band") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int16[i] == (in_int16[i] & inout_int16_for_check[i])) + continue; + printf("First error at position %d (%d %s %d != %d)\n", + i, in_int16[i], op, inout_int16_for_check[i], inout_int16[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + } + if( 32 == type_size ) { + int32_t *in_int32 = (int32_t*)in_buf, + *inout_int32 = (int32_t*)inout_buf, + *inout_int32_for_check = (int32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int32[i] = 5; + inout_int32[i] = inout_int32_for_check[i] = 3; + } + mpi_type = "MPI_INT32_T"; + + if( 0 == strcmp(op, "sum") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int32[i] == (int32_t)(in_int32[i] + inout_int32_for_check[i])) + continue; + printf("First error at position %d (%d %s %d != %d)\n", + i, in_int32[i], op, inout_int32_for_check[i], inout_int32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "max") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int32[i] == max(inout_int32_for_check[i], in_int32[i])) + continue; + printf("First error at position %d (%d != %s(%d))\n", + i, in_int32[i], op, inout_int32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "min") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int32, in_int32, count, MPI_INT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int32[i] == min(inout_int32_for_check[i], in_int32[i])) + continue; + printf("First error at position %d (%d != %s(%d))\n", + i, in_int32[i], op, inout_int32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "bor") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int32[i] == (in_int32[i] | inout_int32_for_check[i])) + continue; + 
printf("First error at position %d (%d %s %d != %d)\n", + i, in_int32[i], op, inout_int32_for_check[i], inout_int32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "prod") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int32[i] == (int32_t)(in_int32[i] * inout_int32_for_check[i])) + continue; + printf("First error at position %d (%d %s %d != %d)\n", + i, in_int32[i], op, inout_int32_for_check[i], inout_int32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "band") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int32[i] == (in_int32[i] & inout_int32_for_check[i])) + continue; + printf("First error at position %d (%d %s %d != %d)\n", + i, in_int32[i], op, inout_int32_for_check[i], inout_int32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "bxor") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int32[i] == (in_int32[i] ^ inout_int32_for_check[i])) + continue; + printf("First error at position %d (%d %s %d != %d)\n", + i, in_int32[i], op, inout_int32_for_check[i], inout_int32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + } + if( 64 == type_size ) { + int64_t *in_int64 = (int64_t*)in_buf, + *inout_int64 = (int64_t*)inout_buf, + *inout_int64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int64[i] = 5; + inout_int64[i] = inout_int64_for_check[i] = 3; + } + mpi_type = "MPI_INT64_T"; + + if( 0 == strcmp(op, "sum") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int64[i] == (int64_t)(in_int64[i] + inout_int64_for_check[i])) + continue; + printf("First error at position %d (%lld %s %lld != %lld)\n", + i, in_int64[i], op, inout_int64_for_check[i], inout_int64[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "max") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int64[i] == max(inout_int64_for_check[i], in_int64[i])) + continue; + printf("First error at position %d (%lld != %s(%lld))\n", + i, inout_int64[i], op, in_int64[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "min") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int64, in_int64, count, MPI_INT64_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_int64[i] == min(inout_int64_for_check[i], in_int64[i])) + continue; + printf("First error at position %d (%lld != %s(%lld))\n", + i, inout_int64[i], op, in_int64[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "min") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int64, in_int64, count, MPI_INT64_T, mpi_op); + tend = MPI_Wtime(); 
+                        if( 0 == strcmp(op, "min") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(inout_int64, in_int64, count, MPI_INT64_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int64[i] == min(inout_int64_for_check[i], in_int64[i]))
+                                        continue;
+                                    printf("First error at position %d (%lld != %s(%lld))\n",
+                                           i, inout_int64[i], op, in_int64[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "bor") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int64[i] == (in_int64[i] | inout_int64_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%lld %s %lld != %lld)\n",
+                                           i, in_int64[i], op, inout_int64_for_check[i], inout_int64[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "bxor") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int64[i] == (in_int64[i] ^ inout_int64_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%lld %s %lld != %lld)\n",
+                                           i, in_int64[i], op, inout_int64_for_check[i], inout_int64[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "prod") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int64,inout_int64,count, MPI_INT64_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int64[i] == (int64_t)(in_int64[i] * inout_int64_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%lld %s %lld != %lld)\n",
+                                           i, in_int64[i], op, inout_int64_for_check[i], inout_int64[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "band") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_int64[i] == (in_int64[i] & inout_int64_for_check[i]) )
+                                        continue;
+                                    printf("First error at position %d (%lld %s %lld != %lld)\n",
+                                           i, in_int64[i], op, inout_int64_for_check[i], inout_int64[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                    }
+                }
+
+                if( 'u' == type[type_idx] ) {
+                    if( 8 == type_size ) {
+                        uint8_t *in_uint8 = (uint8_t*)in_buf,
+                            *inout_uint8 = (uint8_t*)inout_buf,
+                            *inout_uint8_for_check = (uint8_t*)inout_check_buf;
+                        for( i = 0; i < count; i++ ) {
+                            in_uint8[i] = 5;
+                            inout_uint8[i] = inout_uint8_for_check[i] = 121;
+                        }
+                        mpi_type = "MPI_UINT8_T";
+
+                        if( 0 == strcmp(op, "sum") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_uint8[i] == (uint8_t)(in_uint8[i] + inout_uint8_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%u %s %u [%u] != %u)\n",
+                                           i, in_uint8[i], op, inout_uint8_for_check[i], (uint8_t)(in_uint8[i] + inout_uint8_for_check[i]), inout_uint8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "max") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_uint8[i] == max(inout_uint8_for_check[i], in_uint8[i]))
+                                        continue;
+                                    printf("First error at position %d (%u != %s(%u))\n",
+                                           i, inout_uint8[i], op, inout_uint8_for_check[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "min") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_uint8[i] == min(inout_uint8_for_check[i], in_uint8[i]))
+                                        continue;
+                                    printf("First error at position %d (%u != %s(%u))\n",
+                                           i, inout_uint8[i], op, inout_uint8_for_check[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "bor") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_uint8[i] == (in_uint8[i] | inout_uint8_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%u %s %u != %u)\n",
+                                           i, in_uint8[i], op, inout_uint8_for_check[i], inout_uint8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "bxor") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_uint8[i] == (in_uint8[i] ^ inout_uint8_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%u %s %u != %u)\n",
+                                           i, in_uint8[i], op, inout_uint8_for_check[i], inout_uint8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "prod") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_uint8[i] == (uint8_t)(in_uint8[i] * inout_uint8_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%u %s %u != %u)\n",
+                                           i, in_uint8[i], op, inout_uint8_for_check[i], inout_uint8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "band") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_uint8[i] == (in_uint8[i] & inout_uint8_for_check[i]) )
+                                        continue;
+                                    printf("First error at position %d (%u %s %u != %u)\n",
+                                           i, in_uint8[i], op, inout_uint8_for_check[i], inout_uint8[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                    }
+                    if( 16 == type_size ) {
+                        uint16_t *in_uint16 = (uint16_t*)in_buf,
+                            *inout_uint16 = (uint16_t*)inout_buf,
+                            *inout_uint16_for_check = (uint16_t*)inout_check_buf;
+                        for( i = 0; i < count; i++ ) {
+                            in_uint16[i] = 5;
+                            inout_uint16[i] = inout_uint16_for_check[i] = 1234;
+                        }
+                        mpi_type = "MPI_UINT16_T";
+
+                        if( 0 == strcmp(op, "sum") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if( check ) {
+                                for( i = 0; i < count; i++ ) {
+                                    if(inout_uint16[i] == (uint16_t)(in_uint16[i] + inout_uint16_for_check[i]))
+                                        continue;
+                                    printf("First error at position %d (%u %s %u != %u)\n",
+                                           i, in_uint16[i], op, inout_uint16_for_check[i], inout_uint16[i]);
+                                    correctness = 0;
+                                    break;
+                                }
+                            }
+                            goto check_and_continue;
+                        }
+                        if( 0 == strcmp(op, "max") ) {
+                            skip_op_type = 0;
+                            tstart = MPI_Wtime();
+                            MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, mpi_op);
+                            tend = MPI_Wtime();
+                            if(
check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] == max(inout_uint16_for_check[i], in_uint16[i])) + continue; + printf("First error at position %d (%u != %s(%u))\n", + i, inout_uint16[i], op, inout_uint16_for_check[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "min") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] == min(inout_uint16_for_check[i], in_uint16[i])) + continue; + printf("First error at position %d (%u != %s(%u))\n", + i, inout_uint16[i], op, inout_uint16_for_check[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "bor") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] == (in_uint16[i] | inout_uint16_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint16[i], op, inout_uint16_for_check[i], inout_uint16[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "bxor") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] == (in_uint16[i] ^ inout_uint16_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint16[i], op, inout_uint16_for_check[i], inout_uint16[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "prod") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] == (uint16_t)(in_uint16[i] * inout_uint16_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint16[i], op, inout_uint16_for_check[i], inout_uint16[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "band") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] == (in_uint16[i] & inout_uint16_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint16[i], op, inout_uint16_for_check[i], inout_uint16[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + } + if( 32 == type_size ) { + uint32_t *in_uint32 = (uint32_t*)in_buf, + *inout_uint32 = (uint32_t*)inout_buf, + *inout_uint32_for_check = (uint32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint32[i] = 5; + inout_uint32[i] = inout_uint32_for_check[i] = 3; + } + mpi_type = "MPI_UINT32_T"; + + if( 0 == strcmp(op, "sum") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint32[i] == (uint32_t)(in_uint32[i] + inout_uint32_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint32[i], op, inout_uint32_for_check[i], inout_uint32[i]); + correctness = 0; + break; + } 
+ } + goto check_and_continue; + } + if( 0 == strcmp(op, "max") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint32[i] == max(inout_uint32_for_check[i], in_uint32[i])) + continue; + printf("First error at position %d (%u != %s(%u))\n", + i, inout_uint32[i], op, inout_uint32_for_check[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "min") ) { // we reverse the send and recv buffers + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_uint32, in_uint32, count, MPI_UINT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint32[i] == min(inout_uint32_for_check[i], in_uint32[i])) + continue; + printf("First error at position %d (%u != %s(%u))\n", + i, inout_uint32[i], op, inout_uint32_for_check[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "bor") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint32[i] == (in_uint32[i] | inout_uint32_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint32[i], op, inout_uint32_for_check[i], inout_uint32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "prod") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint32[i] == (uint32_t)(in_uint32[i] * inout_uint32_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint32[i], op, inout_uint32_for_check[i], inout_uint32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "band") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint32[i] == (in_uint32[i] & inout_uint32_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint32[i], op, inout_uint32_for_check[i], inout_uint32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + if( 0 == strcmp(op, "bxor") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( i = 0; i < count; i++ ) { + if(inout_uint32[i] == (in_uint32[i] ^ inout_uint32_for_check[i])) + continue; + printf("First error at position %d (%u %s %u != %u)\n", + i, in_uint32[i], op, inout_uint32_for_check[i], inout_uint32[i]); + correctness = 0; + break; + } + } + goto check_and_continue; + } + } + if( 64 == type_size ) { + int64_t *in_uint64 = (int64_t*)in_buf, + *inout_uint64 = (int64_t*)inout_buf, + *inout_uint64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint64[i] = 5; + inout_uint64[i] = inout_uint64_for_check[i] = 32433; + } + mpi_type = "MPI_UINT64_T"; + + if( 0 == strcmp(op, "sum") ) { + skip_op_type = 0; + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, mpi_op); + tend = MPI_Wtime(); + if( check ) { + for( correctness = 1, i = 
+                        if(inout_uint64[i] == (uint64_t)(in_uint64[i] + inout_uint64_for_check[i]))
+                            continue;
+                        printf("First error at position %d (%llu %s %llu != %llu)\n",
+                               i, in_uint64[i], op, inout_uint64_for_check[i], inout_uint64[i]);
+                        correctness = 0;
+                        break;
+                    }
+                }
+                goto check_and_continue;
+            }
+            if( 0 == strcmp(op, "max") ) {
+                skip_op_type = 0;
+                tstart = MPI_Wtime();
+                MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, mpi_op);
+                tend = MPI_Wtime();
+                if( check ) {
+                    for( i = 0; i < count; i++ ) {
+                        if(inout_uint64[i] == max(inout_uint64_for_check[i], in_uint64[i]))
+                            continue;
+                        printf("First error at position %d (%llu != %s(%llu, %llu))\n",
+                               i, inout_uint64[i], op, inout_uint64_for_check[i], in_uint64[i]);
+                        correctness = 0;
+                        break;
+                    }
+                }
+                goto check_and_continue;
+            }
+            if( 0 == strcmp(op, "min") ) {
+                skip_op_type = 0;
+                tstart = MPI_Wtime();
+                MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, mpi_op);
+                tend = MPI_Wtime();
+                if( check ) {
+                    for( i = 0; i < count; i++ ) {
+                        if(inout_uint64[i] == min(inout_uint64_for_check[i], in_uint64[i]))
+                            continue;
+                        printf("First error at position %d (%llu != %s(%llu, %llu))\n",
+                               i, inout_uint64[i], op, inout_uint64_for_check[i], in_uint64[i]);
+                        correctness = 0;
+                        break;
+                    }
+                }
+                goto check_and_continue;
+            }
+            if( 0 == strcmp(op, "bor") ) {
+                skip_op_type = 0;
+                tstart = MPI_Wtime();
+                MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, mpi_op);
+                tend = MPI_Wtime();
+                if( check ) {
+                    for( i = 0; i < count; i++ ) {
+                        if(inout_uint64[i] == (in_uint64[i] | inout_uint64_for_check[i]))
+                            continue;
+                        printf("First error at position %d (%llu %s %llu != %llu)\n",
+                               i, in_uint64[i], op, inout_uint64_for_check[i], inout_uint64[i]);
+                        correctness = 0;
+                        break;
+                    }
+                }
+                goto check_and_continue;
+            }
+            if( 0 == strcmp(op, "bxor") ) {
+                skip_op_type = 0;
+                tstart = MPI_Wtime();
+                MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, mpi_op);
+                tend = MPI_Wtime();
+                if( check ) {
+                    for( i = 0; i < count; i++ ) {
+                        if(inout_uint64[i] == (in_uint64[i] ^ inout_uint64_for_check[i]))
+                            continue;
+                        printf("First error at position %d (%llu %s %llu != %llu)\n",
+                               i, in_uint64[i], op, inout_uint64_for_check[i], inout_uint64[i]);
+                        correctness = 0;
+                        break;
+                    }
+                }
+                goto check_and_continue;
+            }
+            if( 0 == strcmp(op, "prod") ) {
+                skip_op_type = 0;
+                tstart = MPI_Wtime();
+                MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, mpi_op);
+                tend = MPI_Wtime();
+                if( check ) {
+                    for( i = 0; i < count; i++ ) {
+                        if(inout_uint64[i] == (uint64_t)(in_uint64[i] * inout_uint64_for_check[i]))
+                            continue;
+                        printf("First error at position %d (%llu %s %llu != %llu)\n",
+                               i, in_uint64[i], op, inout_uint64_for_check[i], inout_uint64[i]);
+                        correctness = 0;
+                        break;
+                    }
+                }
+                goto check_and_continue;
+            }
+            if( 0 == strcmp(op, "band") ) {
+                skip_op_type = 0;
+                tstart = MPI_Wtime();
+                MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, mpi_op);
+                tend = MPI_Wtime();
+                if( check ) {
+                    for( i = 0; i < count; i++ ) {
+                        if(inout_uint64[i] == (in_uint64[i] & inout_uint64_for_check[i]))
+                            continue;
+                        printf("First error at position %d (%llu %s %llu != %llu)\n",
+                               i, in_uint64[i], op, inout_uint64_for_check[i], inout_uint64[i]);
+                        correctness = 0;
+                        break;
+                    }
+                }
+                goto check_and_continue;
+            }
+        }
+    }
+
+    if( 'f' == type[type_idx] ) {
+        float *in_float = (float*)in_buf,
+            *inout_float = (float*)inout_buf,
+            *inout_float_for_check = (float*)inout_check_buf;
+        for( i = 0; i < count; i++ ) {
+            in_float[i] = 1000.0+1;
+            inout_float[i] = inout_float_for_check[i] = 100.0+2;
+        }
+        mpi_type = "MPI_FLOAT";
+
+        if( 0 == strcmp(op, "sum") ) {
+            skip_op_type = 0;
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, mpi_op);
+            tend = MPI_Wtime();
+            if( check ) {
+                for( i = 0; i < count; i++ ) {
+                    if(inout_float[i] == inout_float_for_check[i]+in_float[i])
+                        continue;
+                    printf("First error at position %d\n", i);
+                    correctness = 0;
+                    break;
+                }
+            }
+            goto check_and_continue;
+        }
+        if( 0 == strcmp(op, "max") ) {
+            skip_op_type = 0;
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, mpi_op);
+            tend = MPI_Wtime();
+            if( check ) {
+                for( i = 0; i < count; i++ ) {
+                    if(inout_float[i] == max(inout_float_for_check[i], in_float[i]))
+                        continue;
+                    printf("First error at position %d\n", i);
+                    correctness = 0;
+                    break;
+                }
+            }
+            goto check_and_continue;
+        }
+        if( 0 == strcmp(op, "min") ) {
+            skip_op_type = 0;
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, mpi_op);
+            tend = MPI_Wtime();
+            if( check ) {
+                for( i = 0; i < count; i++ ) {
+                    if(inout_float[i] == min(inout_float_for_check[i], in_float[i]))
+                        continue;
+                    printf("First error at position %d\n", i);
+                    correctness = 0;
+                    break;
+                }
+            }
+            goto check_and_continue;
+        }
+        if( 0 == strcmp(op, "prod") ) {
+            skip_op_type = 0;
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, mpi_op);
+            tend = MPI_Wtime();
+            if( check ) {
+                for( i = 0; i < count; i++ ) {
+                    if(inout_float[i] == in_float[i] * inout_float_for_check[i])
+                        continue;
+                    printf("First error at position %d\n", i);
+                    correctness = 0;
+                    break;
+                }
+            }
+            goto check_and_continue;
+        }
+    }
+
+    if( 'd' == type[type_idx] ) {
+        double *in_double = (double*)in_buf,
+            *inout_double = (double*)inout_buf,
+            *inout_double_for_check = (double*)inout_check_buf;
+        for( i = 0; i < count; i++ ) {
+            in_double[i] = 10.0+1;
+            inout_double[i] = inout_double_for_check[i] = 1.0+2;
+        }
+        mpi_type = "MPI_DOUBLE";
+
+        if( 0 == strcmp(op, "sum") ) {
+            skip_op_type = 0;
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, mpi_op);
+            tend = MPI_Wtime();
+            if( check ) {
+                for( i = 0; i < count; i++ ) {
+                    if(inout_double[i] == inout_double_for_check[i]+in_double[i])
+                        continue;
+                    printf("First error at position %d\n", i);
+                    correctness = 0;
+                    break;
+                }
+            }
+            goto check_and_continue;
+        }
+        if( 0 == strcmp(op, "max") ) {
+            skip_op_type = 0;
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, mpi_op);
+            tend = MPI_Wtime();
+            if( check ) {
+                for( i = 0; i < count; i++ ) {
+                    if(inout_double[i] == max(inout_double_for_check[i], in_double[i]))
+                        continue;
+                    printf("First error at position %d\n", i);
+                    correctness = 0;
+                    break;
+                }
+            }
+            goto check_and_continue;
+        }
+        if( 0 == strcmp(op, "min") ) {
+            skip_op_type = 0;
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, mpi_op);
+            tend = MPI_Wtime();
+            if( check ) {
+                for( i = 0; i < count; i++ ) {
+                    if(inout_double[i] == min(inout_double_for_check[i], in_double[i]))
+                        continue;
+                    printf("First error at position %d\n", i);
+                    correctness = 0;
+                    break;
+                }
+            }
+            goto check_and_continue;
+        }
+        if( 0 == strcmp(op, "prod") ) {
+            skip_op_type = 0;
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, mpi_op);
+            tend = MPI_Wtime();
+            if( check ) {
+                for( i = 0; i < count; i++ ) {
+                    if(inout_double[i] == inout_double_for_check[i] * in_double[i])
+                        continue;
+                    printf("First error at position %d\n", i);
+                    correctness = 0;
+                    break;
+                }
+            }
+            goto check_and_continue;
+        }
+    }
+ check_and_continue:
+        if( !skip_op_type )
+            print_status(array_of_ops[do_ops[op_idx]].mpi_op_name,
+                         mpi_type, type_size, count, tend-tstart, correctness);
+    }
+    if( !skip_op_type )
+        printf("\n");
+    }
+    }
+    ompi_mpi_finalize();
+
+    free(in_buf);
+    free(inout_buf);
+    free(inout_check_buf);
+
+    return (0 == total_errors) ? 0 : -1;
+}
+
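Note on the check pattern used throughout reduce_local.c: every verification above
relies on the MPI-2.2 semantics of MPI_Reduce_local, which accumulates the result
into its second (inout) argument, i.e. inoutbuf[i] = op(inbuf[i], inoutbuf[i]) for
all i. The snippet below is a minimal, self-contained illustration of that contract
(it is not part of the patch; the buffer length and initial values are arbitrary):

/*
 * Standalone sketch of the MPI_Reduce_local semantics the test depends on:
 * after the call, inout[i] holds op(in[i], inout[i]) while in[] is untouched.
 */
#include <stdio.h>
#include <stdint.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    uint32_t in[4]    = {5, 5, 5, 5};
    uint32_t inout[4] = {3, 3, 3, 3};
    int i, rc = 0;

    MPI_Init(&argc, &argv);
    /* Accumulate into the second buffer: inout[i] = in[i] + inout[i] */
    MPI_Reduce_local(in, inout, 4, MPI_UINT32_T, MPI_SUM);
    for (i = 0; i < 4; i++) {
        if (8 != inout[i]) {  /* MPI_SUM of 5 and 3 */
            printf("First error at position %d (%u != 8)\n", i, (unsigned)inout[i]);
            rc = 1;
            break;
        }
    }
    MPI_Finalize();
    return rc;
}

Because the result lands in the second buffer, the checks in reduce_local.c always
pass the input as the first argument and inspect the inout buffer afterwards;
swapping the two arguments would leave the inout buffer unchanged and make the
comparison vacuous.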