diff --git a/NEWS b/NEWS index f274276516..f31451c398 100644 --- a/NEWS +++ b/NEWS @@ -87,6 +87,11 @@ Trunk (not on release branches yet) - Add support for the MPI tool information interface (MPI_T). - Update ompi_info to support limiting output by opal info level. +- Wrapper compilers now add rpath support by default to generated + executables on systems that support it. This behavior can be + disabled via --disable-wrapper-rpath. See note in README about ABI + issues when using rpath in MPI applications. + 1.7.2 ----- diff --git a/VERSION b/VERSION index e0655836f1..dd3617e467 100644 --- a/VERSION +++ b/VERSION @@ -1,6 +1,8 @@ # Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. # This is the VERSION file for Open MPI, describing the precise # version of Open MPI in this distribution. The various components of @@ -96,6 +98,7 @@ libmpi_usempi_ignore_tkr_so_version=0:0:0 libopen_rte_so_version=0:0:0 libopen_pal_so_version=0:0:0 libmpi_java_so_version=0:0:0 +libshmem_so_version=0:0:0 # "Common" components install standalone libraries that are run-time # linked by one or more components. So they need to be versioned as diff --git a/autogen.pl b/autogen.pl index 6cc752b809..dc2061b173 100755 --- a/autogen.pl +++ b/autogen.pl @@ -2,7 +2,8 @@ # # Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. -# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -41,6 +42,7 @@ my @subdirs; # Command line parameters my $no_ompi_arg = 0; my $no_orte_arg = 0; +my $no_oshmem_arg = 0; my $quiet_arg = 0; my $debug_arg = 0; my $help_arg = 0; @@ -985,6 +987,7 @@ sub patch_autotools_output { my $ok = Getopt::Long::GetOptions("no-ompi" => \$no_ompi_arg, "no-orte" => \$no_orte_arg, + "no-oshmem" => \$no_oshmem_arg, "quiet|q" => \$quiet_arg, "debug|d" => \$debug_arg, "help|h" => \$help_arg, @@ -999,6 +1002,7 @@ if (!$ok || $help_arg) { print "Options: --no-ompi | -no-ompi Do not build the Open MPI layer --no-orte | -no-orte Do not build the ORTE layer + --no-oshmem | -no-oshmem Do not build the OSHMEM layer --quiet | -q Do not display normal verbose output --debug | -d Output lots of debug information --help | -h This help list @@ -1027,6 +1031,10 @@ if (! -e "orte") { $no_orte_arg = 1; debug "No orte subdirectory found - will not build ORTE\n"; } +if (! -e "oshmem") { + $no_oshmem_arg = 1; + debug "No oshmem subdirectory found - will not build OSHMEM\n"; +} if ($no_ompi_arg == 1 && $no_orte_arg == 0) { $project_name_long = "Open MPI Run Time Environment"; @@ -1193,6 +1201,8 @@ push(@{$projects}, { name => "orte", dir => "orte", need_base => 1 }) if (!$no_orte_arg); push(@{$projects}, { name => "ompi", dir => "ompi", need_base => 1 }) if (!$no_ompi_arg); +push(@{$projects}, { name => "oshmem", dir => "oshmem", need_base => 1 }) + if (!$no_ompi_arg && !$no_orte_arg && !$no_oshmem_arg); $m4 .= "dnl Separate m4 define for each project\n"; foreach my $p (@$projects) { diff --git a/config/opal_check_attributes.m4 b/config/opal_check_attributes.m4 index 5bf8263a93..1817347a7f 100644 --- a/config/opal_check_attributes.m4 +++ b/config/opal_check_attributes.m4 @@ -12,6 +12,8 @@ # All rights reserved. # Copyright (c) 2009 Oak Ridge National Labs. 
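A note on the destructor-attribute probe added to config/opal_check_attributes.m4 just below: OPAL_HAVE_ATTRIBUTE_DESTRUCTOR presumably lets the OSHMEM layer arrange for cleanup to run automatically at process exit or shared-library unload. A minimal sketch of the pattern the probe tests for, assuming a GCC-compatible compiler (the cleanup function name here is hypothetical, not an actual OSHMEM symbol):

    #include <stdio.h>

    /* With __attribute__((__destructor__)) the function runs automatically
     * after main() returns or exit() is called -- no explicit atexit()
     * registration is needed. */
    static void example_oshmem_cleanup(void) __attribute__ ((__destructor__));

    static void example_oshmem_cleanup(void)
    {
        fputs("automatic cleanup runs here\n", stderr);
    }

    int main(void)
    {
        puts("main body");
        return 0;   /* example_oshmem_cleanup() fires after this */
    }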
All rights reserved.
 # Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -215,6 +217,7 @@ AC_DEFUN([OPAL_CHECK_ATTRIBUTES], [
     opal_cv___attribute__visibility=0
     opal_cv___attribute__warn_unused_result=0
     opal_cv___attribute__weak_alias=0
+    opal_cv___attribute__destructor=0
   else
     AC_MSG_RESULT([yes])
@@ -533,6 +536,13 @@ AC_DEFUN([OPAL_CHECK_ATTRIBUTES], [
        [],
        [])
+  _OPAL_CHECK_SPECIFIC_ATTRIBUTE([destructor],
+      [
+       void foo(void) __attribute__ ((__destructor__));
+       void foo(void) { return; }
+      ],
+      [],
+      [])
   fi
   # Now that all the values are set, define them
@@ -581,4 +591,6 @@ AC_DEFUN([OPAL_CHECK_ATTRIBUTES], [
       [Whether your compiler has __attribute__ warn unused result or not])
   AC_DEFINE_UNQUOTED(OPAL_HAVE_ATTRIBUTE_WEAK_ALIAS, [$opal_cv___attribute__weak_alias],
       [Whether your compiler has __attribute__ weak alias or not])
+  AC_DEFINE_UNQUOTED(OPAL_HAVE_ATTRIBUTE_DESTRUCTOR, [$opal_cv___attribute__destructor],
+      [Whether your compiler has __attribute__ destructor or not])
 ])
diff --git a/config/oshmem_config_files.m4 b/config/oshmem_config_files.m4
new file mode 100644
index 0000000000..ff3cf30d22
--- /dev/null
+++ b/config/oshmem_config_files.m4
@@ -0,0 +1,26 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AC_DEFUN([OSHMEM_CONFIG_FILES],[
+    AC_CONFIG_FILES([
+    oshmem/Makefile
+    oshmem/include/Makefile
+    oshmem/shmem/c/Makefile
+    oshmem/shmem/f77/Makefile
+
+    oshmem/shmem/c/profile/Makefile
+
+    oshmem/tools/wrappers/Makefile
+    oshmem/tools/wrappers/shmemcc-wrapper-data.txt
+    oshmem/tools/wrappers/shmemf77-wrapper-data.txt
+    oshmem/tools/wrappers/shmemf90-wrapper-data.txt
+    ])
+])
diff --git a/config/oshmem_configure_options.m4 b/config/oshmem_configure_options.m4
new file mode 100644
index 0000000000..4dc5c459e4
--- /dev/null
+++ b/config/oshmem_configure_options.m4
@@ -0,0 +1,164 @@
+dnl -*- shell-script -*-
+dnl
+dnl Copyright (c) 2013      Mellanox Technologies, Inc.
+dnl                         All rights reserved.
+dnl
+dnl $COPYRIGHT$
+dnl
+dnl Additional copyrights may follow
+dnl
+dnl $HEADER$
+dnl
+
+
+
+AC_DEFUN([OSHMEM_CONFIGURE_OPTIONS],[
+ompi_show_subtitle "OSHMEM Configuration options"
+
+
+AC_SUBST(OSHMEM_LIBSHMEM_EXTRA_LIBS)
+AC_SUBST(OSHMEM_LIBSHMEM_EXTRA_LDFLAGS)
+
+#
+# Enable compatibility mode
+#
+AC_MSG_CHECKING([if want SGI/Quadrics compatibility mode])
+AC_ARG_ENABLE(oshmem-compat,
+    AC_HELP_STRING([--enable-oshmem-compat],
+        [enable compatibility mode (default: enabled)]))
+if test "$enable_oshmem_compat" != "no"; then
+    AC_MSG_RESULT([yes])
+    OSHMEM_SPEC_COMPAT=1
+else
+    AC_MSG_RESULT([no])
+    OSHMEM_SPEC_COMPAT=0
+fi
+AC_DEFINE_UNQUOTED([OSHMEM_SPEC_COMPAT], [$OSHMEM_SPEC_COMPAT],
+                   [Whether user wants OSHMEM in compatibility mode or not])
+
+
+
+#
+# Do we want to disable OSHMEM parameter checking at run-time?
+#
+AC_MSG_CHECKING([if want SHMEM API parameter checking])
+AC_ARG_WITH(shmem-param-check,
+    AC_HELP_STRING([--with-shmem-param-check(=VALUE)],
+        [behavior of SHMEM function parameter checking.  Valid values are: always, never.  If --with-shmem-param-check is specified with no VALUE argument, it is equivalent to a VALUE of "always"; --without-shmem-param-check is equivalent to "never" (default: always).]))
+shmem_param_check=1
+if test "$with_shmem_param_check" = "no" -o \
+    "$with_shmem_param_check" = "never"; then
+    shmem_param_check=0
+    AC_MSG_RESULT([never])
+elif test "$with_shmem_param_check" = "yes" -o \
+    "$with_shmem_param_check" = "always" -o \
+    -z "$with_shmem_param_check"; then
+    shmem_param_check=1
+    AC_MSG_RESULT([always])
+else
+    AC_MSG_RESULT([unknown])
+    AC_MSG_WARN([*** Unrecognized --with-shmem-param-check value])
+    AC_MSG_WARN([*** See "configure --help" output])
+    AC_MSG_WARN([*** Defaulting to "always"])
+fi
+AC_DEFINE_UNQUOTED(OSHMEM_PARAM_CHECK, $shmem_param_check,
+    [Whether we want to check SHMEM parameters always or never])
+
+
+#
+# OSHMEM profiling support
+#
+AC_MSG_CHECKING([if want pshmem_ profiling layer])
+AC_ARG_ENABLE(oshmem-profile,
+    AC_HELP_STRING([--enable-oshmem-profile],
+        [enable OSHMEM profiling (default: enabled)]))
+if test "$enable_oshmem_profile" != "no"; then
+    AC_MSG_RESULT([yes])
+    oshmem_profiling_support=1
+else
+    AC_MSG_RESULT([no])
+    oshmem_profiling_support=0
+fi
+AM_CONDITIONAL(OSHMEM_PROFILING, test "$oshmem_profiling_support" = 1)
+#AC_DEFINE_UNQUOTED([OSHMEM_PROFILING], [$oshmem_profiling_support],
+#                   [Whether user wants OSHMEM profiling])
+
+])
+
+
+AC_DEFUN([OSHMEM_SETUP_CFLAGS],[
+
+
+OMPI_C_COMPILER_VENDOR([oshmem_c_vendor])
+
+#
+# OSHMEM force warnings as errors
+#
+#
+# Since SHMEM libraries are not fully ISO C99 compliant,
+# -pedantic and -Wundef raise a bunch of warnings, so
+# we just strip them off for this component
+AC_MSG_WARN([Removed -pedantic and -Wundef from CFLAGS for OSHMEM])
+
+oshmem_CFLAGS="$CFLAGS"
+
+# Strip off problematic arguments
+oshmem_CFLAGS="`echo $oshmem_CFLAGS | sed 's/-pedantic//g'`"
+oshmem_CFLAGS="`echo $oshmem_CFLAGS | sed 's/-Wundef//g'`"
+oshmem_CFLAGS="`echo $oshmem_CFLAGS | sed 's/-Wno-long-double//g'`"
+CFLAGS="$oshmem_CFLAGS"
+
+case "$oshmem_c_vendor" in
+    gnu)
+        OSHMEM_CFLAGS=" -Werror"
+        ;;
+    intel)
+        # we want specifically the warning on format string conversion
+        OSHMEM_CFLAGS=" -Werror "
+        ;;
+esac
+
+AC_SUBST([OSHMEM_CFLAGS])
+
+
+
+OMPI_CHECK_OPENFABRICS([openib],
+                       [openib_happy="yes"],
+                       [openib_happy="no"])
+
+# substitute in the things needed to build MEMHEAP BASE
+AC_SUBST([openib_CFLAGS])
+AC_SUBST([openib_CPPFLAGS])
+AC_SUBST([openib_LDFLAGS])
+AC_SUBST([openib_LIBS])
+
+# If we have the openib stuff available, find out what we've got
+AS_IF(
+    [test "$openib_happy" = "yes"],
+    [
+        OSHMEM_LIBSHMEM_EXTRA_LDFLAGS="$OSHMEM_LIBSHMEM_EXTRA_LDFLAGS $openib_LDFLAGS"
+        OSHMEM_LIBSHMEM_EXTRA_LIBS="$OSHMEM_LIBSHMEM_EXTRA_LIBS $openib_LIBS"
+
+        # ibv_reg_shared_mr was added in MOFED 1.8
+        oshmem_have_mpage=0
+
+        openib_save_CPPFLAGS="$CPPFLAGS"
+        openib_save_LDFLAGS="$LDFLAGS"
+        openib_save_LIBS="$LIBS"
+
+        CPPFLAGS="$CPPFLAGS $openib_CPPFLAGS"
+        LDFLAGS="$LDFLAGS $openib_LDFLAGS"
+        LIBS="$LIBS $openib_LIBS"
+
+        AC_CHECK_DECLS([IBV_ACCESS_ALLOCATE_MR,IBV_ACCESS_SHARED_MR_USER_READ],
+                       [oshmem_have_mpage=2], [],
+                       [#include <infiniband/verbs.h>])
+
+        CPPFLAGS="$openib_save_CPPFLAGS"
+        LDFLAGS="$openib_save_LDFLAGS"
+        LIBS="$openib_save_LIBS"
+
+        AC_DEFINE_UNQUOTED(MPAGE_ENABLE, $oshmem_have_mpage,
+                           [Whether we can use M-PAGE supported since MOFED 1.8])
+    ])
+])dnl
diff --git a/configure.ac b/configure.ac
index 0c333b9abf..6b780c34f8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -17,6 +17,8 @@
 # Copyright (c) 2009      Oak Ridge National Labs.
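To make the OSHMEM_PARAM_CHECK define above concrete: when configure sets it to 1 ("always"), each SHMEM binding can validate its arguments before doing any work; with 0 ("never") the checks compile away entirely. A hedged sketch of that pattern -- the function and helper names are hypothetical, not the actual OSHMEM internals:

    #include <stddef.h>

    #define OSHMEM_PARAM_CHECK 1   /* set by configure: 1 = always, 0 = never */

    /* hypothetical error helper, for illustration only */
    static int example_param_error(const char *func)
    {
        (void)func;
        return -1;
    }

    int example_shmem_put(void *target, const void *source, size_t len, int pe)
    {
    #if OSHMEM_PARAM_CHECK
        if (NULL == target || NULL == source || pe < 0) {
            return example_param_error("shmem_put");
        }
    #endif
        (void)len;  /* the real transfer would happen here */
        return 0;
    }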
All rights reserved. # Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. # Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -99,6 +101,8 @@ AC_SUBST([CONFIGURE_DEPENDENCIES], ['$(top_srcdir)/VERSION']) # Set up project specific AM_CONDITIONALs AM_CONDITIONAL([PROJECT_OMPI], m4_ifdef([project_ompi], [true], [false])) AM_CONDITIONAL([PROJECT_ORTE], m4_ifdef([project_orte], [true], [false])) +AM_CONDITIONAL([PROJECT_OSHMEM], m4_ifdef([project_oshmem], [true], [false])) + ompi_show_subtitle "Checking versions" @@ -113,6 +117,11 @@ m4_ifdef([project_orte], [$srcdir/VERSION], [orte/include/orte/version.h])]) +m4_ifdef([project_oshmem], + [OPAL_SAVE_VERSION([OSHMEM], [Open SHMEM], + [$srcdir/VERSION], + [oshmem/include/oshmem/version.h])]) + OPAL_SAVE_VERSION([OPAL], [Open Portable Access Layer], [$srcdir/VERSION], [opal/include/opal/version.h]) @@ -137,6 +146,8 @@ m4_ifdef([project_ompi], AC_SUBST(libmca_common_portals_so_version)]) m4_ifdef([project_orte], [AC_SUBST(libopen_rte_so_version)]) +m4_ifdef([project_oshmem], + [AC_SUBST(libshmem_so_version)]) AC_SUBST(libmca_opal_common_hwloc_so_version) AC_SUBST(libmca_opal_common_pmi_so_version) AC_SUBST(libopen_pal_so_version) @@ -161,6 +172,8 @@ m4_ifdef([project_orte], [AC_CONFIG_HEADERS([orte/include/orte_config.h])]) m4_ifdef([project_ompi], [AC_CONFIG_HEADERS([ompi/include/ompi_config.h ompi/include/mpi.h])]) +m4_ifdef([project_oshmem], + [AC_CONFIG_HEADER([oshmem/include/oshmem_config.h oshmem/include/shmem.h oshmem/include/shmem_portable_platform.h])]) # override/fixup the version numbers set by AC_INIT, since on # developer builds, there's no good way to know what the version is @@ -241,6 +254,7 @@ OPAL_CONFIGURE_OPTIONS OPAL_CHECK_CUDA m4_ifdef([project_orte], [ORTE_CONFIGURE_OPTIONS]) m4_ifdef([project_ompi], [OMPI_CONFIGURE_OPTIONS]) +m4_ifdef([project_oshmem], [OSHMEM_CONFIGURE_OPTIONS]) if test "$enable_binaries" = "no" -a "$enable_dist" = "yes"; then AC_MSG_WARN([--disable-binaries is incompatible with --enable dist]) @@ -860,6 +874,12 @@ AC_DEFINE_UNQUOTED(OMPI_MPI_OFFSET_TYPE, $MPI_OFFSET_TYPE, [Type of MPI_Offset - AC_DEFINE_UNQUOTED(OMPI_MPI_OFFSET_SIZE, $MPI_OFFSET_SIZE, [Size of the MPI_Offset]) AC_DEFINE_UNQUOTED(OMPI_OFFSET_DATATYPE, $MPI_OFFSET_DATATYPE, [MPI datatype corresponding to MPI_Offset]) +AC_DEFINE_UNQUOTED(OSHMEM_SHMEM_OFFSET_TYPE, $MPI_OFFSET_TYPE, [Type of SHMEM_Offset -- has to be defined here and typedef'ed later because shmem.h does not get AC SUBST's]) +AC_DEFINE_UNQUOTED(OSHMEM_SHMEM_OFFSET_SIZE, $MPI_OFFSET_SIZE, [Size of the SHMEM_Offset]) +AC_DEFINE_UNQUOTED(OSHMEM_OFFSET_DATATYPE, $MPI_OFFSET_DATATYPE, [SHMEM datatype corresponding to SHMEM_Offset]) + +AC_DEFINE_UNQUOTED(OPAL_SIZEOF_LONG, $ac_cv_sizeof_long, "Size of 'long' type") + if test $MPI_COUNT_SIZE -eq 8 ; then MPI_COUNT_MAX="0x7fffffffffffffffll" elif test $MPI_COUNT_SIZE -eq 4 ; then @@ -1205,7 +1225,7 @@ if test "$OMPI_TOP_BUILDDIR" != "$OMPI_TOP_SRCDIR"; then # rather than have successive assignments to these shell # variables, lest the $(foo) names try to get evaluated here. # Yuck! 
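Regarding the OSHMEM_SHMEM_OFFSET_* defines above: because shmem.h is not processed with AC_SUBST, the offset type is exported as a preprocessor symbol and, per the in-line comment, typedef'ed later inside the header. A sketch of what that later step would look like (the typedef name is hypothetical, and the value is hard-coded only so the fragment stands alone):

    /* In the real build this symbol comes from AC_DEFINE in configure.ac. */
    #define OSHMEM_SHMEM_OFFSET_TYPE long long

    /* shmem.h can then pick up the configured type at compile time: */
    typedef OSHMEM_SHMEM_OFFSET_TYPE example_shmem_offset_t;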
-    CPPFLAGS='-I$(top_srcdir) -I$(top_builddir) -I$(top_srcdir)/opal/include m4_ifdef([project_orte], [-I$(top_srcdir)/orte/include]) m4_ifdef([project_ompi], [-I$(top_srcdir)/ompi/include])'" $CPPFLAGS"
+    CPPFLAGS='-I$(top_srcdir) -I$(top_builddir) -I$(top_srcdir)/opal/include m4_ifdef([project_orte], [-I$(top_srcdir)/orte/include]) m4_ifdef([project_ompi], [-I$(top_srcdir)/ompi/include]) m4_ifdef([project_oshmem], [-I$(top_srcdir)/oshmem/include])'" $CPPFLAGS"
 
     # C++ is only relevant if we're building OMPI
     m4_ifdef([project_ompi], [CXXCPPFLAGS='-I$(top_srcdir) -I$(top_builddir) -I$(top_srcdir)/opal/include -I$(top_srcdir)/orte/include -I$(top_srcdir)/ompi/include'" $CXXCPPFLAGS"])
 else
@@ -1219,6 +1239,7 @@ fi
 
 m4_ifdef([project_orte], [ORTE_SETUP_DEBUGGER_FLAGS],
          [m4_ifdef([project_ompi], [ORTE_SETUP_DEBUGGER_FLAGS])])
+m4_ifdef([project_oshmem], [OSHMEM_SETUP_CFLAGS])
 
 #
 # Delayed the substitution of CFLAGS and CXXFLAGS until now because
@@ -1325,5 +1346,6 @@ AC_CONFIG_FILES([
 OPAL_CONFIG_FILES
 m4_ifdef([project_orte], [ORTE_CONFIG_FILES])
 m4_ifdef([project_ompi], [OMPI_CONFIG_FILES])
+m4_ifdef([project_oshmem], [OSHMEM_CONFIG_FILES])
 
 AC_OUTPUT
diff --git a/contrib/dist/linux/openmpi.spec b/contrib/dist/linux/openmpi.spec
index bfcf4c9896..43d81c8e8d 100644
--- a/contrib/dist/linux/openmpi.spec
+++ b/contrib/dist/linux/openmpi.spec
@@ -10,6 +10,8 @@
 # Copyright (c) 2004-2005 The Regents of the University of California.
 #                         All rights reserved.
 # Copyright (c) 2006-2012 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -97,7 +99,7 @@
 # Should we use the default "check_files" RPM step (i.e., check for
 # unpackaged files)?  It is discouraged to disable this, but some
 # installers need it (e.g., older versions of OFED, because they
-# installed lots of other stuff in the BUILD_ROOT before Open MPI).
+# installed lots of other stuff in the BUILD_ROOT before Open MPI/SHMEM).
 # type: bool (0/1)
 %{!?use_check_files: %define use_check_files 1}
@@ -122,7 +124,7 @@
 # type: bool (0/1)
 %{!?disable_auto_requires: %define disable_auto_requires 0}
 
-# On some platforms, Open MPI just flat-out doesn't work with
+# On some platforms, Open MPI/SHMEM just flat-out doesn't work with
 # -D_FORTIFY_SOURCE (e.g., some users have reported that there are
 # problems on ioa64 platforms).  In this case, just turn it off
 # (meaning: this specfile will strip out that flag from the
@@ -194,7 +196,7 @@
 #
 #############################################################################
 
-Summary: A powerful implementation of MPI
+Summary: A powerful implementation of MPI/SHMEM
 Name: %{?_name:%{_name}}%{!?_name:openmpi}
 Version: $VERSION
 Release: 1
@@ -222,8 +224,14 @@ Open MPI is a project combining technologies and resources from several
 other projects (FT-MPI, LA-MPI, LAM/MPI, and PACX-MPI) in order to
 build the best MPI library available.
 
+The project includes an implementation of the SHMEM parallel
+programming library for the Partitioned Global Address Space model.
+This library provides fast inter-processor communication for large
+messages using data passing and one-sided communication techniques.
+The SHMEM API is based on the OpenSHMEM standard (http://www.openshmem.org/).
+
 This RPM contains all the tools necessary to compile, link, and run
-Open MPI jobs.
+Open MPI/SHMEM jobs.
 
 %if !%{build_all_in_one_rpm}
@@ -234,7 +242,7 @@ Open MPI jobs.
 
#############################################################################
 
 %package runtime
-Summary: Tools and plugin modules for running Open MPI jobs
+Summary: Tools and plugin modules for running Open MPI/SHMEM jobs
 Group: Development/Libraries
 Provides: mpi
 %if %{disable_auto_requires}
 AutoReq: no
@@ -249,9 +257,15 @@ Open MPI is a project combining technologies and resources from several
 other projects (FT-MPI, LA-MPI, LAM/MPI, and PACX-MPI) in order to
 build the best MPI library available.
 
+The project includes an implementation of the SHMEM parallel
+programming library for the Partitioned Global Address Space model.
+This library provides fast inter-processor communication for large
+messages using data passing and one-sided communication techniques.
+The SHMEM API is based on the OpenSHMEM standard (http://www.openshmem.org/).
+
 This subpackage provides general tools (mpirun, mpiexec, etc.) and the
 Module Component Architecture (MCA) base and plugins necessary for
-running Open MPI jobs.
+running Open MPI/SHMEM jobs.
 
 %endif
 
@@ -262,7 +276,7 @@ running Open MPI jobs.
 #############################################################################
 
 %package devel
-Summary: Development tools and header files for Open MPI
+Summary: Development tools and header files for Open MPI/SHMEM
 Group: Development/Libraries
 %if %{disable_auto_requires}
 AutoReq: no
 %endif
@@ -274,8 +288,14 @@ Open MPI is a project combining technologies and resources from several
 other projects (FT-MPI, LA-MPI, LAM/MPI, and PACX-MPI) in order to
 build the best MPI library available.
 
-This subpackage provides the development files for Open MPI, such as
-wrapper compilers and header files for MPI development.
+The project includes an implementation of the SHMEM parallel
+programming library for the Partitioned Global Address Space model.
+This library provides fast inter-processor communication for large
+messages using data passing and one-sided communication techniques.
+The SHMEM API is based on the OpenSHMEM standard (http://www.openshmem.org/).
+
+This subpackage provides the development files for Open MPI/SHMEM, such as
+wrapper compilers and header files for MPI/SHMEM development.
 
 #############################################################################
 #
@@ -284,7 +304,7 @@ wrapper compilers and header files for MPI development.
 #############################################################################
 
 %package docs
-Summary: Documentation for Open MPI
+Summary: Documentation for Open MPI/SHMEM
 Group: Development/Documentation
 %if %{disable_auto_requires}
 AutoReq: no
 %endif
@@ -296,7 +316,13 @@ Open MPI is a project combining technologies and resources from several
 other projects (FT-MPI, LA-MPI, LAM/MPI, and PACX-MPI) in order to
 build the best MPI library available.
 
-This subpackage provides the documentation for Open MPI.
+The project includes an implementation of the SHMEM parallel
+programming library for the Partitioned Global Address Space model.
+This library provides fast inter-processor communication for large
+messages using data passing and one-sided communication techniques.
+The SHMEM API is based on the OpenSHMEM standard (http://www.openshmem.org/).
+
+This subpackage provides the documentation for Open MPI/SHMEM.
 
 #############################################################################
 #
@@ -423,14 +449,14 @@ cat <<EOF >$RPM_BUILD_ROOT/%{modulefile_path}/%{modulefile_subdir}/%{modulefile_
 #%Module
 
 # NOTE: This is an automatically-generated file!  (generated by the
-# Open MPI RPM).  Any changes made here will be lost a) if the RPM is
+# Open MPI/SHMEM RPM).  Any changes made here will be lost a) if the RPM is
 # uninstalled, or b) if the RPM is upgraded or uninstalled.
 
 proc ModulesHelp { } {
-   puts stderr "This module adds Open MPI v%{version} to various paths"
+   puts stderr "This module adds Open MPI/SHMEM v%{version} to various paths"
 }
 
-module-whatis   "Sets up Open MPI v%{version} in your enviornment"
+module-whatis   "Sets up Open MPI/SHMEM v%{version} in your environment"
 
 prepend-path PATH "%{_prefix}/bin/"
 prepend-path LD_LIBRARY_PATH %{_libdir}
@@ -445,7 +471,7 @@ EOF
 %{__mkdir_p} $RPM_BUILD_ROOT/%{shell_scripts_path}
 cat <<EOF > $RPM_BUILD_ROOT/%{shell_scripts_path}/%{shell_scripts_basename}.sh
 # NOTE: This is an automatically-generated file!  (generated by the
-# Open MPI RPM).  Any changes made here will be lost if the RPM is
+# Open MPI/SHMEM RPM).  Any changes made here will be lost if the RPM is
 # uninstalled or upgraded.
 
 # PATH
@@ -472,7 +498,7 @@ export MPI_ROOT
 EOF
 cat <<EOF > $RPM_BUILD_ROOT/%{shell_scripts_path}/%{shell_scripts_basename}.csh
 # NOTE: This is an automatically-generated file!  (generated by the
-# Open MPI RPM).  Any changes made here will be lost if the RPM is
+# Open MPI/SHMEM RPM).  Any changes made here will be lost if the RPM is
 # uninstalled or upgraded.
 
 # path
@@ -718,6 +744,9 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
 #
 #############################################################################
 %changelog
+* Mon Jun 24 2013 Igor Ivanov
+- Add the Open SHMEM parallel programming library as part of Open MPI
+
 * Tue Dec 11 2012 Jeff Squyres
 - Re-release 1.6.0-1.6.3 SRPMs (with new SRPM Release numbers) with
   patch for VampirTrace's configure script to make it install the
diff --git a/examples/Makefile b/examples/Makefile
index 3eda2b8804..12ecc53df3 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -28,7 +28,7 @@ CXX = mpic++
 CCC = mpic++
 FC = mpifort
 JAVAC = mpijavac
-
+SHMEM = shmemcc
 # Using -g is not necessary, but it is helpful for example programs,
 # especially if users want to examine them with debuggers.  Note that
 # gmake requires the CXXFLAGS macro, while other versions of make
@@ -47,6 +47,7 @@ EXAMPLES = \
 	 hello_mpifh \
 	 hello_usempi \
 	 hello_usempif08 \
+	 hello_shmem \
 	 Hello.class \
 	 ring_c \
 	 ring_cxx \
@@ -75,6 +76,9 @@ all: hello_c ring_c connectivity_c
 	@ if ompi_info --parsable | grep bindings:java:yes >/dev/null; then \
 	    $(MAKE) Hello.class Ring.class; \
 	fi
+	@ if ompi_info --parsable | grep mca:shmem >/dev/null; then \
+	    $(MAKE) hello_shmem; \
+	fi
 
 # The usual "clean" target
@@ -102,3 +106,8 @@ Hello.class: Hello.java
 	$(JAVAC) Hello.java
 Ring.class: Ring.java
 	$(JAVAC) Ring.java
+
+hello_shmem: hello_shmem.c
+	$(SHMEM) $(CFLAGS) $^ -o $@
+
+
diff --git a/examples/README b/examples/README
index 40869027f9..59bf37d614 100644
--- a/examples/README
+++ b/examples/README
@@ -29,6 +29,7 @@ different MPI interfaces:
   Fortran use mpi:	hello_usempi.f90
   Fortran use mpi_f08:	hello_usempif08.f90
   Java:			Hello.java
+  OSHMEM:		hello_shmem.c
 
 - Send a trivial message around in a ring
   C:			ring_c.c
diff --git a/examples/hello_shmem.c b/examples/hello_shmem.c
new file mode 100644
index 0000000000..d23b6a8652
--- /dev/null
+++ b/examples/hello_shmem.c
@@ -0,0 +1,126 @@
+#include "shmem.h"
+#include <stdio.h>
+
+#define N 100
+static int target[N];
+
+static int source[N];
+
+#define STATIC_CHECK 1
+#define DYNAMIC_CHECK 1
+#define ATOMIC 1
+#define PEER 1
+
+int main()
+{
+    int *source_d,*target_d;
+    int i;
+
+    start_pes(0);
+
+    source_d = shmalloc(sizeof(*source_d)*N);
+    target_d = shmalloc(sizeof(*target_d)*N);
+
+    for (i = 0; i < N; i++)
+    {
+        source_d[i] = source[i] = 1;
+        target[i] = target_d[i] = 9;
+    }
+
+    int peer = PEER;
+    if (_my_pe() == 0)
+    {
+#if STATIC_CHECK
+#if ATOMIC
+        for (i = 0; i < N; i++)
+            target[i] = shmem_int_g(source + i, peer);
+#else
+        shmem_int_get(target, source, N, PEER);
+#endif
+#endif
+
+#if DYNAMIC_CHECK
+#if ATOMIC
+        for (i = 0; i < N; i++)
+        {
+            target_d[i] = shmem_int_g(source_d + i, peer);
+        }
+#else
+        shmem_int_get(target_d, source_d, N, PEER);
+#endif
+
+#endif
+    }
+    if(_my_pe() == 0)
+    {
+        for (i = 0; i < N; i++)
+        {
+#if DYNAMIC_CHECK
+            if(target_d[i] != 1)
+            {
+                printf("Get dynamic error %d, target + i = %p, target[0] = %d, target[1] = %d\n",i, target_d + i,target_d[0], target_d[1]);
+                fflush(stdout);
+                return 1;
+            }
+#endif
+#if STATIC_CHECK
+            if (target[i] != 1)
+            {
+                printf("Get static error %d, target + i = %p, target[i] = %d\n",i, target + i,target[i]);
+                fflush(stdout);
+                return 1;
+            }
+#endif
+        }
+    }
+
+/*put check*/
+
+    for (i = 0; i < N; i++)
+    {
+        source_d[i] = source[i] = 1;
+        target[i] = target_d[i] = -9;
+    }
+
+    shmem_barrier_all();
+
+    if (_my_pe() == 0)
+    {
+#if STATIC_CHECK
+        shmem_int_put(target, source, N, PEER);
+#endif
+#if DYNAMIC_CHECK
+        shmem_int_put(target_d, source_d, N, PEER);
+#endif
+    }
+
+    shmem_barrier_all();
+
+    if(_my_pe() == PEER)
+    {
+        for (i = 0; i < N; i++)
+        {
+#if DYNAMIC_CHECK
+            if(target_d[i] != 1)
+            {
+                printf("Put dynamic error\n");
+                fflush(stdout);
+                return 1;
+            }
+#endif
+#if STATIC_CHECK
+            if (target[i] != 1)
+            {
+                printf("Put static error\n");
+                fflush(stdout);
+                return 1;
+            }
+#endif
+        }
+    }
+    printf("All tests passed\n");fflush(stdout);
+    shmem_finalize();
+
+    return 0;
+}
+
diff --git a/ompi/mca/btl/openib/connect/base.h b/ompi/mca/btl/openib/connect/base.h
index b4de20304d..e17d3c8c67 100644
--- a/ompi/mca/btl/openib/connect/base.h
+++ b/ompi/mca/btl/openib/connect/base.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2007-2008 Cisco Systems, Inc.  All rights reserved.
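hello_shmem.c above is as much a put/get correctness test as a greeting. For a first smoke test of the shmemcc wrapper, a more minimal sketch may be easier to start from; it uses the same classic SHMEM entry points seen above (start_pes, _my_pe, shmem_barrier_all), plus _num_pes from the same traditional API:

    #include <stdio.h>
    #include "shmem.h"

    int main(void)
    {
        start_pes(0);                 /* initialize the SHMEM runtime */
        printf("Hello from PE %d of %d\n", _my_pe(), _num_pes());
        shmem_barrier_all();          /* synchronize before exiting */
        return 0;
    }

It would be compiled the same way as the example above, e.g. "shmemcc hello.c -o hello".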
- * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. * * $COPYRIGHT$ * @@ -12,7 +13,7 @@ #ifndef BTL_OPENIB_CONNECT_BASE_H #define BTL_OPENIB_CONNECT_BASE_H -#include "connect/connect.h" +#include "ompi/mca/btl/openib/connect/connect.h" #ifdef OMPI_HAVE_RDMAOE #define BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl) \ diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c index e8683d3762..3fc428ca71 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c @@ -12,7 +12,7 @@ * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2008-2011 Mellanox Technologies. All rights reserved. + * Copyright (c) 2008-2013 Mellanox Technologies. All rights reserved. * Copyright (c) 2009-2011 IBM Corporation. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved * @@ -375,6 +375,40 @@ static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint) } +static void permute_array(int *permuted_qps, int nqps) +{ + int i; + int idx; + int tmp; + int control[nqps]; + + for (i = 0; i < nqps; i++) { + permuted_qps[i] = i; + control[i] = 0; + } + + for (i = 0; i < nqps - 1; i++) { + idx = i + random() % (nqps - i); + tmp = permuted_qps[i]; + permuted_qps[i] = permuted_qps[idx]; + permuted_qps[idx] = tmp; + } + + /* verify that permutation is ok: */ + for (i = 0; i < nqps; i++) { + control[permuted_qps[i]] ++; + } + for (i = 0; i < nqps; i++) { + if (control[i] != 1) { + BTL_VERBOSE(("bad permutation detected: ")); + for (i = 0; i < nqps; i++) BTL_VERBOSE(("%d ", permuted_qps[i])); + BTL_VERBOSE(("\n")); + abort(); + } + } +} + + /* * Create the local side of all the qp's. The remote sides will be * connected later. @@ -384,6 +418,12 @@ static int qp_create_all(mca_btl_base_endpoint_t* endpoint) int qp, rc, pp_qp_num = 0; int32_t rd_rsv_total = 0; + int rand_qpns[mca_btl_openib_component.num_qps]; + int i; + + permute_array(rand_qpns, mca_btl_openib_component.num_qps); + + for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) if(BTL_OPENIB_QP_TYPE_PP(qp)) { rd_rsv_total += @@ -396,11 +436,12 @@ static int qp_create_all(mca_btl_base_endpoint_t* endpoint) if(0 == pp_qp_num && true == endpoint->use_eager_rdma) pp_qp_num = 1; - for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { + for (i = 0; i < mca_btl_openib_component.num_qps; ++i) { struct ibv_srq *srq = NULL; uint32_t max_recv_wr, max_send_wr; int32_t rd_rsv, rd_num_credits; + qp = rand_qpns[i]; /* QP used for SW flow control need some additional recourses */ if(qp == mca_btl_openib_component.credits_qp) { rd_rsv = rd_rsv_total; @@ -467,7 +508,7 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp, memset(&attr, 0, sizeof(attr)); init_attr.qp_type = IBV_QPT_RC; - init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ]; + init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_RDMA_QP(qp) ? BTL_OPENIB_HP_CQ: BTL_OPENIB_LP_CQ]; init_attr.recv_cq = openib_btl->device->ib_cq[qp_cq_prio(qp)]; init_attr.srq = srq; init_attr.cap.max_inline_data = req_inline = diff --git a/ompi/mca/rte/rte.h b/ompi/mca/rte/rte.h index 0ca20fe960..faf31d48ee 100644 --- a/ompi/mca/rte/rte.h +++ b/ompi/mca/rte/rte.h @@ -1,5 +1,7 @@ /* * Copyright (c) 2012 Los Alamos National Security, LLC. 
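The permute_array() function above is a Fisher-Yates shuffle over QP indices, followed by a sanity check that every index appears exactly once. A standalone illustration of the same technique (note that random() % (nqps - i) carries a slight modulo bias, which is harmless for this kind of load spreading):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Fisher-Yates shuffle of the indices 0..n-1, same technique as
     * permute_array() above. */
    static void shuffle_indices(int *out, int n)
    {
        int i;
        for (i = 0; i < n; i++) {
            out[i] = i;
        }
        for (i = 0; i < n - 1; i++) {
            int idx = i + random() % (n - i);
            int tmp = out[i];
            out[i] = out[idx];
            out[idx] = tmp;
        }
    }

    int main(void)
    {
        enum { NQPS = 8 };
        int qps[NQPS];
        int seen[NQPS] = { 0 };
        int i;

        shuffle_indices(qps, NQPS);
        for (i = 0; i < NQPS; i++) {
            seen[qps[i]]++;           /* count how often each index occurs */
        }
        for (i = 0; i < NQPS; i++) {
            assert(1 == seen[i]);     /* a permutation hits each index once */
            printf("%d ", qps[i]);
        }
        printf("\n");
        return 0;
    }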
All rights reserved.
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
  *
  * $COPYRIGHT$
  *
@@ -210,6 +212,9 @@ BEGIN_C_DECLS
 
 #define OMPI_RML_PCONNECT_TAG        OMPI_RML_TAG_BASE+13
 
+/* Open SHMEM OOB communication */
+#define OMPI_RML_TAG_SHMEM           OMPI_RML_TAG_BASE+14
+
 #define OMPI_RML_TAG_DYNAMIC         OMPI_RML_TAG_BASE+200
 
 /*
diff --git a/opal/include/opal_config_bottom.h b/opal/include/opal_config_bottom.h
index a208231d1a..2b21a26ead 100644
--- a/opal/include/opal_config_bottom.h
+++ b/opal/include/opal_config_bottom.h
@@ -11,6 +11,8 @@
  *                         All rights reserved.
  * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -220,6 +222,12 @@
 #    define __opal_attribute_weak_alias__(a)
 #endif
 
+#if OPAL_HAVE_ATTRIBUTE_DESTRUCTOR
+#    define __opal_attribute_destructor__ __attribute__((__destructor__))
+#else
+#    define __opal_attribute_destructor__
+#endif
+
 #  if OPAL_C_HAVE_VISIBILITY
 #    define OPAL_DECLSPEC           __opal_attribute_visibility__("default")
 #    define OPAL_MODULE_DECLSPEC    __opal_attribute_visibility__("default")
diff --git a/oshmem/Makefile.am b/oshmem/Makefile.am
new file mode 100644
index 0000000000..75bda0a21c
--- /dev/null
+++ b/oshmem/Makefile.am
@@ -0,0 +1,122 @@
+#
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AM_CFLAGS = $(OSHMEM_CFLAGS)
+
+# Do we have profiling?
+if OSHMEM_PROFILING
+c_pshmem_lib = shmem/c/profile/libshmem_c_pshmem.la
+else
+c_pshmem_lib =
+endif
+
+f77_shmem_lib = shmem/f77/libshmem_f77.la
+
+# Note that the ordering of "." in SUBDIRS is important: the C++, F77,
+# and F90 bindings are all in standalone .la files that depend on
+# libshmem.la.  So we must fully build libshmem.la first.
+
+# NOTE: A handful of files in mpi/f77/base must be included in
+# libshmem.la.  But we wanted to keep all the Fortran sources together
+# in the same tree, so we moved those sources to a separate
+# subdirectory with its own Makefile.include that is included in this
+# Makefile.am (NOTE: it did *not* work to put all the files -- base
+# and non-base -- into mpi/f77 and have both a regular Makefile.am for
+# building the f77 bindings library and a separate Makefile.include
+# that was included in this top-level Makefile.am; problems occurred
+# with "make distclean" and files in the ompi/mpi/f77/.deps directory
+# -- it's not clear whether this is an AM bug or whether this behavior
+# is simply not supported).  This ompi/mpi/f77/base/Makefile.include
+# file makes a convenience LT library that is then sucked into
+# libshmem.la (the ompi/mpi/f77/base sources must be compiled with
+# special CPPFLAGS; we can't just add the raw sources to
+# libshmem_la_SOURCES, unfortunately).
+
+# The end result is that libshmem.la -- including a few sources
+# from mpi/f77/base -- is fully built before the C++, F77, and F90
+# bindings are built.  Therefore, the C++, F77 and F90 bindings
+# libraries can all link against libshmem.la.
+
+SUBDIRS = \
+	include \
+	shmem/c \
+	shmem/f77 \
+	$(EXT_oshmem_FRAMEWORKS_SUBDIRS) \
+	$(EXT_oshmem_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \
+	$(MCA_oshmem_FRAMEWORKS_SUBDIRS) \
+	$(MCA_oshmem_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \
+	.
\ + $(MCA_oshmem_FRAMEWORK_COMPONENT_DSO_SUBDIRS) + +DIST_SUBDIRS = \ + include \ + shmem/c \ + shmem/f77 \ + $(EXT_oshmem_FRAMEWORKS_SUBDIRS) \ + $(EXT_oshmem_FRAMEWORK_COMPONENT_ALL_SUBDIRS) \ + $(MCA_oshmem_FRAMEWORKS_SUBDIRS) \ + $(MCA_oshmem_FRAMEWORK_COMPONENT_ALL_SUBDIRS) + +#Build The main SHMEM library +lib_LTLIBRARIES = libshmem.la +libshmem_la_SOURCES = +libshmem_la_LIBADD = \ + shmem/c/libshmem_c.la \ + $(c_pshmem_lib) \ + $(f77_shmem_lib) \ + $(MCA_oshmem_FRAMEWORK_LIBS) \ + $(top_ompi_builddir)/ompi/libmpi.la +libshmem_la_DEPENDENCIES = $(libshmem_la_LIBADD) +libshmem_la_LDFLAGS = \ + -version-info $(libshmem_so_version) \ + $(OSHMEM_LIBSHMEM_EXTRA_LDFLAGS) + +# included subdirectory Makefile.am's and appended-to variables +headers = +noinst_LTLIBRARIES = +include_HEADERS = +nobase_oshmem_HEADERS = +dist_pkgdata_DATA = +libshmem_la_SOURCES += $(headers) +nodist_man_MANS = + +# Conditionally install the header files + +if WANT_INSTALL_HEADERS +oshmemdir = $(includedir)/oshmem/oshmem +nobase_oshmem_HEADERS += $(headers) +else +oshmemdir = $(includedir) +endif + +include op/Makefile.am +include proc/Makefile.am +include request/Makefile.am +include runtime/Makefile.am +include shmem/Makefile.am +include tools/Makefile.am + +# Ensure that the man page directory exists before we try to make man +# page files (because oshmem/shmem/man/man3 has no config.status-generated +# Makefile) +dir_stamp = $(top_builddir)/$(subdir)/shmem/man/man3/.dir-stamp + +# Also ensure that the man pages are rebuilt if the opal_config.h file +# changes (e.g., configure was run again, meaning that the release +# date or version may have changed) +$(nodist_man_MANS): $(dir_stamp) $(top_builddir)/opal/include/opal_config.h + +$(dir_stamp): + $(mkdir_p) `dirname $@` + touch "$@" + +# Remove the generated man pages +distclean-local: + rm -f $(nodist_man_MANS) $(dir_stamp) diff --git a/oshmem/include/Makefile.am b/oshmem/include/Makefile.am new file mode 100644 index 0000000000..161b19196a --- /dev/null +++ b/oshmem/include/Makefile.am @@ -0,0 +1,51 @@ +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# mpif-common.h is not generated, but mpif.h and mpif-config.h are. +# See big comments in these files for an explanation. + +# note - headers and nodist_headers will go in ${includedir}/openmpi, +# not ${includedir}/ +headers = +nodist_headers = \ + oshmem_config.h + +# Install these in $(includedir) +include_HEADERS = + +# Install these in $(includedir) +mppincludedir = $(includedir)/mpp +mppinclude_HEADERS = mpp/shmem.h \ + mpp/shmem.fh + +# Always install these in $(pkgincludedir) +pkginclude_HEADERS = + +include_HEADERS += shmem.fh + +# These files are always installed in $(includedir), but shouldn't be +# shipped since they are generated by configure from their .in +# counterparts (which AM automatically ships). +nodist_include_HEADERS = \ + shmem.h \ + shmem_portable_platform.h + +if WANT_INSTALL_HEADERS +oshmemdir = $(includedir)/oshmem +nobase_dist_oshmem_HEADERS = $(headers) +nobase_nodist_oshmem_HEADERS = $(nodist_headers) +else +oshmemdir = $(includedir) +nobase_dist_noinst_HEADERS = $(headers) +nobase_nodist_noinst_HEADERS = $(nodist_headers) +endif + +distclean-local: + +include oshmem/Makefile.am diff --git a/oshmem/include/mpif-common.h b/oshmem/include/mpif-common.h new file mode 100644 index 0000000000..2a9fe8cab0 --- /dev/null +++ b/oshmem/include/mpif-common.h @@ -0,0 +1,457 @@ +! +! 
Copyright (c) 2013 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! Do ***not*** copy this file to the directory where your Fortran +! fortran application is compiled unless it is absolutely necessary! Most +! modern Fortran compilers now support the -I command line flag, which +! tells the compiler where to find .h files (specifically, this one). For +! example: +! +! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include +! +! will probably do the trick (assuming that you have set OMPI_HOME +! properly). +! +! That being said, OMPI's "mpif77" wrapper compiler should +! automatically include the -I option for you. The following command +! should be equivalent to the command listed above: +! +! shell$ mpif77 foo.f -o foo +! +! You should not copy this file to your local directory because it is +! possible that this file will be changed between versions of Open MPI. +! Indeed, this mpif.h is incompatible with the mpif.f of other +! implementations of MPI. Using this mpif.h with other implementations +! of MPI, or with other versions of Open MPI will result in undefined +! behavior (to include incorrect results, segmentation faults, +! unexplainable "hanging" in your application, etc.). Always use the +! -I command line option instead (or let mpif77 do it for you). +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +! +! This file contains the bulk of the Open MPI Fortran interface. It +! is included as a back-end file to both mpif.h (i.e., the +! standardized MPI Fortran header file) and mpi.f90 (the MPI-2 +! Fortran module source file, found in ompi/mpi/f90). +! +! This file is marginally different than mpif.h. mpif.h includes +! some "external" statements that are not suitable for use with the +! MPI-2 F90 module, and therefore cannot be included in the mpi.f90 +! source file. Hence, this file is essentially everything that +! needs to be in the standardized mpif.h *except* the "external" +! statements, and is therefore suitable to be included in mpi.f90. +! + +! First, however, include some output from configure. +! + include 'mpif-config.h' + +! +! MPI version +! + integer MPI_VERSION, MPI_SUBVERSION + + parameter (MPI_VERSION=2) + parameter (MPI_SUBVERSION=1) +! +! Miscellaneous constants +! 
+      integer MPI_ANY_SOURCE, MPI_ANY_TAG
+      integer MPI_PROC_NULL
+      integer MPI_ROOT
+      integer MPI_UNDEFINED
+      integer MPI_CART, MPI_GRAPH, MPI_KEYVAL_INVALID
+      integer MPI_SOURCE, MPI_TAG, MPI_ERROR
+      integer MPI_TAG_UB, MPI_HOST, MPI_IO, MPI_WTIME_IS_GLOBAL
+      integer MPI_APPNUM, MPI_LASTUSEDCODE, MPI_UNIVERSE_SIZE
+      integer IMPI_CLIENT_SIZE, IMPI_CLIENT_COLOR
+      integer IMPI_HOST_SIZE, IMPI_HOST_COLOR
+      integer MPI_BSEND_OVERHEAD
+      integer MPI_ORDER_C, MPI_ORDER_FORTRAN
+      integer MPI_DISTRIBUTE_BLOCK, MPI_DISTRIBUTE_CYCLIC
+      integer MPI_DISTRIBUTE_NONE, MPI_DISTRIBUTE_DFLT_DARG
+      integer MPI_TYPECLASS_INTEGER, MPI_TYPECLASS_REAL
+      integer MPI_TYPECLASS_COMPLEX
+      integer MPI_MODE_NOCHECK, MPI_MODE_NOPRECEDE, MPI_MODE_NOPUT
+      integer MPI_MODE_NOSTORE, MPI_MODE_NOSUCCEED
+      integer MPI_LOCK_EXCLUSIVE, MPI_LOCK_SHARED
+      integer MPI_WIN_BASE, MPI_WIN_SIZE, MPI_WIN_DISP_UNIT
+
+      parameter (MPI_ANY_SOURCE=-1)
+      parameter (MPI_ANY_TAG=-1)
+      parameter (MPI_PROC_NULL=-2)
+      parameter (MPI_ROOT=-4)
+      parameter (MPI_UNDEFINED=-32766)
+      parameter (MPI_CART=1)
+      parameter (MPI_GRAPH=2)
+      parameter (MPI_KEYVAL_INVALID=-1)
+      parameter (MPI_SOURCE=1)
+      parameter (MPI_TAG=2)
+      parameter (MPI_ERROR=3)
+      parameter (MPI_TAG_UB=0)
+      parameter (MPI_HOST=1)
+      parameter (MPI_IO=2)
+      parameter (MPI_WTIME_IS_GLOBAL=3)
+      parameter (MPI_APPNUM=4)
+      parameter (MPI_LASTUSEDCODE=5)
+      parameter (MPI_UNIVERSE_SIZE=6)
+      parameter (MPI_WIN_BASE=7)
+      parameter (MPI_WIN_SIZE=8)
+      parameter (MPI_WIN_DISP_UNIT=9)
+      parameter (IMPI_CLIENT_SIZE=10)
+      parameter (IMPI_CLIENT_COLOR=11)
+      parameter (IMPI_HOST_SIZE=12)
+      parameter (IMPI_HOST_COLOR=13)
+
+      parameter (MPI_BSEND_OVERHEAD=128)
+      parameter (MPI_ORDER_C=0)
+      parameter (MPI_ORDER_FORTRAN=1)
+      parameter (MPI_DISTRIBUTE_BLOCK=0)
+      parameter (MPI_DISTRIBUTE_CYCLIC=1)
+      parameter (MPI_DISTRIBUTE_NONE=2)
+      parameter (MPI_DISTRIBUTE_DFLT_DARG=-1)
+      parameter (MPI_TYPECLASS_INTEGER=1)
+      parameter (MPI_TYPECLASS_REAL=2)
+      parameter (MPI_TYPECLASS_COMPLEX=3)
+      parameter (MPI_MODE_NOCHECK=1)
+      parameter (MPI_MODE_NOPRECEDE=2)
+      parameter (MPI_MODE_NOPUT=4)
+      parameter (MPI_MODE_NOSTORE=8)
+      parameter (MPI_MODE_NOSUCCEED=16)
+      parameter (MPI_LOCK_EXCLUSIVE=1)
+      parameter (MPI_LOCK_SHARED=2)
+
+!
+! MPI sentinel values
+!
+! Several of these types were chosen with care to match specific
+! overloaded functions in the F90 bindings.  They should also match
+! the types of their corresponding C variables.  Do not arbitrarily
+! change their types without also updating the F90 bindings and
+! their corresponding types in ompi/mpi/f77/constants.h and
+! ompi/mpi/runtime/ompi_init.c!
+!
+! MPI_BOTTOM is only used where choice buffers can be used (meaning
+! that we already have overloaded F90 bindings for all available
+! types), so any type is fine.
+      integer MPI_BOTTOM
+! MPI_IN_PLACE has the same rationale as MPI_BOTTOM.
+      integer MPI_IN_PLACE
+! Making MPI_ARGV_NULL be the same type as the parameter that is
+! expected in the F90 binding for MPI_COMM_SPAWN means that we
+! don't need another binding for MPI_COMM_SPAWN.
+      character MPI_ARGV_NULL(1)
+! The array_of_argv parameter in the F90 bindings for
+! MPI_COMM_SPAWN_MULTIPLE takes a variable number of dimensions
+! (specified by the "count" parameter), so it's not possible to have
+! a single variable match all possible values.  Hence, make it an
+! entirely different type (one that would never likely be used by a
+! correct program, e.g., double) and have a separate F90 binding for
+! matching just this type.
+ double precision MPI_ARGVS_NULL +! MPI_ERRCODES_IGNORE has similar rationale to MPI_ARGV_NULL. The +! F77 functions are all smart enough to check that the errcodes +! parameter is not ERRCODES_IGNORE before assigning values into it +! (hence, the fact that this is an array of only 1 element does not +! matter -- we'll never overrun it because we never assign values +! into it). + integer MPI_ERRCODES_IGNORE(1) +! MPI_STATUS_IGNORE has similar rationale to MPI_ERRCODES_IGNORE. + integer MPI_STATUS_IGNORE(MPI_STATUS_SIZE) +! MPI_STATUSES_IGNORE has similar rationale to MPI_ARGVS_NULL. + double precision MPI_STATUSES_IGNORE + + common/mpi_fortran_bottom/MPI_BOTTOM + common/mpi_fortran_in_place/MPI_IN_PLACE + common/mpi_fortran_argv_null/MPI_ARGV_NULL + common/mpi_fortran_argvs_null/MPI_ARGVS_NULL + common/mpi_fortran_errcodes_ignore/MPI_ERRCODES_IGNORE + common/mpi_fortran_status_ignore/MPI_STATUS_IGNORE + common/mpi_fortran_statuses_ignore/MPI_STATUSES_IGNORE +! +! NULL "handles" (indices) +! + integer MPI_GROUP_NULL, MPI_COMM_NULL, MPI_DATATYPE_NULL + integer MPI_REQUEST_NULL, MPI_OP_NULL, MPI_ERRHANDLER_NULL + integer MPI_INFO_NULL, MPI_WIN_NULL + + parameter (MPI_GROUP_NULL=0) + parameter (MPI_COMM_NULL=2) + parameter (MPI_DATATYPE_NULL=0) + parameter (MPI_REQUEST_NULL=0) + parameter (MPI_OP_NULL=0) + parameter (MPI_ERRHANDLER_NULL=0) + parameter (MPI_INFO_NULL=0) + parameter (MPI_WIN_NULL=0) +! +! MPI_Init_thread constants +! + integer MPI_THREAD_SINGLE, MPI_THREAD_FUNNELED + integer MPI_THREAD_SERIALIZED, MPI_THREAD_MULTIPLE + + parameter (MPI_THREAD_SINGLE=0) + parameter (MPI_THREAD_FUNNELED=1) + parameter (MPI_THREAD_SERIALIZED=2) + parameter (MPI_THREAD_MULTIPLE=3) +! +! error classes +! + integer SHMEM_SUCCESS + integer SHMEM_ERR_BUFFER + integer SHMEM_ERR_COUNT + integer SHMEM_ERR_TYPE + integer SHMEM_ERR_TAG + integer SHMEM_ERR_COMM + integer SHMEM_ERR_RANK + integer SHMEM_ERR_REQUEST + integer SHMEM_ERR_ROOT + integer SHMEM_ERR_GROUP + integer SHMEM_ERR_OP + integer SHMEM_ERR_TOPOLOGY + integer SHMEM_ERR_DIMS + integer SHMEM_ERR_ARG + integer SHMEM_ERR_UNKNOWN + integer SHMEM_ERR_TRUNCATE + integer SHMEM_ERR_OTHER + integer SHMEM_ERR_INTERN + integer SHMEM_ERR_IN_STATUS + integer SHMEM_ERR_PENDING + integer SHMEM_ERR_ACCESS + integer SHMEM_ERR_AMODE + integer SHMEM_ERR_ASSERT + integer SHMEM_ERR_BAD_FILE + integer SHMEM_ERR_BASE + integer SHMEM_ERR_CONVERSION + integer SHMEM_ERR_DISP + integer SHMEM_ERR_DUP_DATAREP + integer SHMEM_ERR_FILE_EXISTS + integer SHMEM_ERR_FILE_IN_USE + integer SHMEM_ERR_FILE + integer SHMEM_ERR_INFO_KEY + integer SHMEM_ERR_INFO_NOKEY + integer SHMEM_ERR_INFO_VALUE + integer SHMEM_ERR_INFO + integer SHMEM_ERR_IO + integer SHMEM_ERR_KEYVAL + integer SHMEM_ERR_LOCKTYPE + integer SHMEM_ERR_NAME + integer SHMEM_ERR_NO_MEM + integer SHMEM_ERR_NOT_SAME + integer SHMEM_ERR_NO_SPACE + integer SHMEM_ERR_NO_SUCH_FILE + integer SHMEM_ERR_PORT + integer SHMEM_ERR_QUOTA + integer SHMEM_ERR_READ_ONLY + integer SHMEM_ERR_RMA_CONFLICT + integer SHMEM_ERR_RMA_SYNC + integer SHMEM_ERR_SERVICE + integer SHMEM_ERR_SIZE + integer SHMEM_ERR_SPAWN + integer SHMEM_ERR_UNSUPPORTED_DATAREP + integer SHMEM_ERR_UNSUPPORTED_OPERATION + integer SHMEM_ERR_WIN + + integer SHMEM_ERR_SYSRESOURCE + integer SHMEM_ERR_LASTCODE + + parameter( SHMEM_SUCCESS = 0) + parameter( SHMEM_ERR_BUFFER = 1) + parameter( SHMEM_ERR_COUNT = 2) + parameter( SHMEM_ERR_TYPE = 3) + parameter( SHMEM_ERR_TAG = 4) + parameter( SHMEM_ERR_COMM = 5) + parameter( SHMEM_ERR_RANK = 6) + parameter( SHMEM_ERR_REQUEST 
= 7) + parameter( SHMEM_ERR_ROOT = 8) + parameter( SHMEM_ERR_GROUP = 9) + parameter( SHMEM_ERR_OP = 10) + parameter( SHMEM_ERR_TOPOLOGY = 11) + parameter( SHMEM_ERR_DIMS = 12) + parameter( SHMEM_ERR_ARG = 13) + parameter( SHMEM_ERR_UNKNOWN = 14) + parameter( SHMEM_ERR_TRUNCATE = 15) + parameter( SHMEM_ERR_OTHER = 16) + parameter( SHMEM_ERR_INTERN = 17) + parameter( SHMEM_ERR_IN_STATUS = 18) + parameter( SHMEM_ERR_PENDING = 19) + parameter( SHMEM_ERR_ACCESS = 20) + parameter( SHMEM_ERR_AMODE = 21) + parameter( SHMEM_ERR_ASSERT = 22) + parameter( SHMEM_ERR_BAD_FILE = 23) + parameter( SHMEM_ERR_BASE = 24) + parameter( SHMEM_ERR_CONVERSION = 25) + parameter( SHMEM_ERR_DISP = 26) + parameter( SHMEM_ERR_DUP_DATAREP = 27) + parameter( SHMEM_ERR_FILE_EXISTS = 28) + parameter( SHMEM_ERR_FILE_IN_USE = 29) + parameter( SHMEM_ERR_FILE = 30) + parameter( SHMEM_ERR_INFO_KEY = 31) + parameter( SHMEM_ERR_INFO_NOKEY = 32) + parameter( SHMEM_ERR_INFO_VALUE = 33) + parameter( SHMEM_ERR_INFO = 34) + parameter( SHMEM_ERR_IO = 35) + parameter( SHMEM_ERR_KEYVAL = 36) + parameter( SHMEM_ERR_LOCKTYPE = 37) + parameter( SHMEM_ERR_NAME = 38) + parameter( SHMEM_ERR_NO_MEM = 39) + parameter( SHMEM_ERR_NOT_SAME = 40) + parameter( SHMEM_ERR_NO_SPACE = 41) + parameter( SHMEM_ERR_NO_SUCH_FILE = 42) + parameter( SHMEM_ERR_PORT = 43) + parameter( SHMEM_ERR_QUOTA = 44) + parameter( SHMEM_ERR_READ_ONLY = 45) + parameter( SHMEM_ERR_RMA_CONFLICT = 46) + parameter( SHMEM_ERR_RMA_SYNC = 47) + parameter( SHMEM_ERR_SERVICE = 48) + parameter( SHMEM_ERR_SIZE = 49) + parameter( SHMEM_ERR_SPAWN = 50) + parameter( SHMEM_ERR_UNSUPPORTED_DATAREP = 51) + parameter( SHMEM_ERR_UNSUPPORTED_OPERATION= 52) + parameter( SHMEM_ERR_WIN = 53) + + parameter( SHMEM_ERR_SYSRESOURCE = -2) + parameter( SHMEM_ERR_LASTCODE = 54) + +! +! comparison results +! + integer MPI_IDENT, MPI_CONGRUENT, MPI_SIMILAR, MPI_UNEQUAL + + parameter (MPI_IDENT=0) + parameter (MPI_CONGRUENT=1) + parameter (MPI_SIMILAR=2) + parameter (MPI_UNEQUAL=3) +! +! datatype combiners +! + integer MPI_COMBINER_NAMED + integer MPI_COMBINER_DUP + integer MPI_COMBINER_CONTIGUOUS + integer MPI_COMBINER_VECTOR + integer MPI_COMBINER_HVECTOR_INTEGER + integer MPI_COMBINER_HVECTOR + integer MPI_COMBINER_INDEXED + integer MPI_COMBINER_HINDEXED_INTEGER + integer MPI_COMBINER_HINDEXED + integer MPI_COMBINER_INDEXED_BLOCK + integer MPI_COMBINER_STRUCT_INTEGER + integer MPI_COMBINER_STRUCT + integer MPI_COMBINER_SUBARRAY + integer MPI_COMBINER_DARRAY + integer MPI_COMBINER_F90_REAL + integer MPI_COMBINER_F90_COMPLEX + integer MPI_COMBINER_F90_INTEGER + integer MPI_COMBINER_RESIZED + + parameter (MPI_COMBINER_NAMED=0) + parameter (MPI_COMBINER_DUP=1) + parameter (MPI_COMBINER_CONTIGUOUS=2) + parameter (MPI_COMBINER_VECTOR=3) + parameter (MPI_COMBINER_HVECTOR_INTEGER=4) + parameter (MPI_COMBINER_HVECTOR=5) + parameter (MPI_COMBINER_INDEXED=6) + parameter (MPI_COMBINER_HINDEXED_INTEGER=7) + parameter (MPI_COMBINER_HINDEXED=8) + parameter (MPI_COMBINER_INDEXED_BLOCK=9) + parameter (MPI_COMBINER_STRUCT_INTEGER=10) + parameter (MPI_COMBINER_STRUCT=11) + parameter (MPI_COMBINER_SUBARRAY=12) + parameter (MPI_COMBINER_DARRAY=13) + parameter (MPI_COMBINER_F90_REAL=14) + parameter (MPI_COMBINER_F90_COMPLEX=15) + parameter (MPI_COMBINER_F90_INTEGER=16) + parameter (MPI_COMBINER_RESIZED=17) +! +! lookup table indices +! 
+ integer MPI_COMM_WORLD, MPI_COMM_SELF + integer MPI_GROUP_EMPTY + integer MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN + + parameter (MPI_COMM_WORLD=0) + parameter (MPI_COMM_SELF=1) + parameter (MPI_GROUP_EMPTY=1) + parameter (MPI_ERRORS_ARE_FATAL=1) + parameter (MPI_ERRORS_RETURN=2) + + integer MPI_BYTE, MPI_PACKED, MPI_UB, MPI_LB + integer MPI_CHARACTER, MPI_LOGICAL + integer MPI_INTEGER, MPI_INTEGER1, MPI_INTEGER2, MPI_INTEGER4 + integer MPI_INTEGER8, MPI_INTEGER16 + integer MPI_REAL, MPI_REAL2, MPI_REAL4, MPI_REAL8, MPI_REAL16 + integer MPI_DOUBLE_PRECISION + integer MPI_COMPLEX, MPI_COMPLEX8, MPI_COMPLEX16, MPI_COMPLEX32 + integer MPI_DOUBLE_COMPLEX + integer MPI_2REAL, MPI_2DOUBLE_PRECISION, MPI_2INTEGER + integer MPI_2COMPLEX, MPI_2DOUBLE_COMPLEX +! Note that MPI_LOGICALx are not defined by the MPI spec, but there are +! other MPI implementations that have them, so it's good for us to have +! as well. + integer MPI_LOGICAL1, MPI_LOGICAL2, MPI_LOGICAL4, MPI_LOGICAL8 + +! +! Do NOT change the order of these parameters +! + parameter (MPI_BYTE=1) + parameter (MPI_PACKED=2) + parameter (MPI_UB=3) + parameter (MPI_LB=4) + parameter (MPI_CHARACTER=5) + parameter (MPI_LOGICAL=6) + parameter (MPI_INTEGER=7) + parameter (MPI_INTEGER1=8) + parameter (MPI_INTEGER2=9) + parameter (MPI_INTEGER4=10) + parameter (MPI_INTEGER8=11) + parameter (MPI_INTEGER16=12) + parameter (MPI_REAL=13) + parameter (MPI_REAL4=14) + parameter (MPI_REAL8=15) + parameter (MPI_REAL16=16) + parameter (MPI_DOUBLE_PRECISION=17) + parameter (MPI_COMPLEX=18) + parameter (MPI_COMPLEX8=19) + parameter (MPI_COMPLEX16=20) + parameter (MPI_COMPLEX32=21) + parameter (MPI_DOUBLE_COMPLEX=22) + parameter (MPI_2REAL=23) + parameter (MPI_2DOUBLE_PRECISION=24) + parameter (MPI_2INTEGER=25) + parameter (MPI_2COMPLEX=26) + parameter (MPI_2DOUBLE_COMPLEX=27) + parameter (MPI_REAL2=28) + parameter (MPI_LOGICAL1=29) + parameter (MPI_LOGICAL2=30) + parameter (MPI_LOGICAL4=31) + parameter (MPI_LOGICAL8=32) + + integer MPI_MAX, MPI_MIN, MPI_SUM, MPI_PROD, MPI_LAND + integer MPI_BAND, MPI_LOR, MPI_BOR, MPI_LXOR, MPI_BXOR + integer MPI_MAXLOC, MPI_MINLOC, MPI_REPLACE + + parameter (MPI_MAX=1) + parameter (MPI_MIN=2) + parameter (MPI_SUM=3) + parameter (MPI_PROD=4) + parameter (MPI_LAND=5) + parameter (MPI_BAND=6) + parameter (MPI_LOR=7) + parameter (MPI_BOR=8) + parameter (MPI_LXOR=9) + parameter (MPI_BXOR=10) + parameter (MPI_MAXLOC=11) + parameter (MPI_MINLOC=12) + parameter (MPI_REPLACE=13) diff --git a/oshmem/include/mpif-config.h.in b/oshmem/include/mpif-config.h.in new file mode 100644 index 0000000000..3971d0ebfc --- /dev/null +++ b/oshmem/include/mpif-config.h.in @@ -0,0 +1,99 @@ +! -*- fortran -*- +! +! Copyright (c) 2013 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! Do ***not*** copy this file to the directory where your Fortran +! fortran application is compiled unless it is absolutely necessary! Most +! modern Fortran compilers now support the -I command line flag, which +! tells the compiler where to find .h files (specifically, this one). For +! example: +! +! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include +! +! will probably do the trick (assuming that you have set OMPI_HOME +! properly). +! +! 
That being said, OMPI's "mpif77" wrapper compiler should +! automatically include the -I option for you. The following command +! should be equivalent to the command listed above: +! +! shell$ mpif77 foo.f -o foo +! +! You should not copy this file to your local directory because it is +! possible that this file will be changed between versions of Open MPI. +! Indeed, this mpif.h is incompatible with the mpif.f of other +! implementations of MPI. Using this mpif.h with other implementations +! of MPI, or with other versions of Open MPI will result in undefined +! behavior (to include incorrect results, segmentation faults, +! unexplainable "hanging" in your application, etc.). Always use the +! -I command line option instead (or let mpif77 do it for you). +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +! +! This file is included as a back-end file to both mpif.h (i.e., the +! standardized MPI Fortran header file) and a bunch of the MPI +! Fortran 90 subroutine implementations found in ompi/mpi/f90. +! +! This file contains the output from configure that is relevant for +! Fortran applications (both 77 and 90) and a few values that are +! necessary to compile the F90 module (e.g., MPI_STATUS_SIZE). +! + +! Include the MPI I/O stuff, if needed + @OMPI_MPIF_MPI_IO_INCLUDE@ + +! +! OMPI version +! This file is generated from configure; do not edit it manually. +! + integer OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION + integer OMPI_RELEASE_VERSION + character*32 OMPI_GREEK_VERSION + character*32 OMPI_SVN_VERSION + parameter (OMPI_MAJOR_VERSION=@OMPI_MAJOR_VERSION@) + parameter (OMPI_MINOR_VERSION=@OMPI_MINOR_VERSION@) + parameter (OMPI_RELEASE_VERSION=@OMPI_RELEASE_VERSION@) + parameter (OMPI_GREEK_VERSION="@OMPI_GREEK_VERSION@") + parameter (OMPI_SVN_VERSION="@OMPI_SVN_R@") +! +! Kind parameters +! + integer MPI_OFFSET_KIND, MPI_ADDRESS_KIND, MPI_INTEGER_KIND + parameter (MPI_INTEGER_KIND=@OMPI_MPI_INTEGER_KIND@) + parameter (MPI_ADDRESS_KIND=@OMPI_MPI_ADDRESS_KIND@) + parameter (MPI_OFFSET_KIND=@OMPI_MPI_OFFSET_KIND@) +! +! Miscellaneous constants +! + integer MPI_STATUS_SIZE + parameter (MPI_STATUS_SIZE=5) +! +! Configurable length constants +! + integer MPI_MAX_PROCESSOR_NAME + integer MPI_MAX_ERROR_STRING + integer MPI_MAX_OBJECT_NAME + integer MPI_MAX_INFO_KEY + integer MPI_MAX_INFO_VAL + integer MPI_MAX_PORT_NAME + integer MPI_MAX_DATAREP_STRING + parameter (MPI_MAX_PROCESSOR_NAME=@OPAL_MAX_PROCESSOR_NAME@-1) + parameter (MPI_MAX_ERROR_STRING=@OPAL_MAX_ERROR_STRING@-1) + parameter (MPI_MAX_OBJECT_NAME=@OPAL_MAX_OBJECT_NAME@-1) + parameter (MPI_MAX_INFO_KEY=@OPAL_MAX_INFO_KEY@-1) + parameter (MPI_MAX_INFO_VAL=@OPAL_MAX_INFO_VAL@-1) + parameter (MPI_MAX_PORT_NAME=@OPAL_MAX_PORT_NAME@-1) + parameter (MPI_MAX_DATAREP_STRING=@OPAL_MAX_DATAREP_STRING@-1) diff --git a/oshmem/include/mpif-mpi-io.h b/oshmem/include/mpif-mpi-io.h new file mode 100644 index 0000000000..771e0aefff --- /dev/null +++ b/oshmem/include/mpif-mpi-io.h @@ -0,0 +1,74 @@ +!Copyright (c) 2013 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! 
+! Do ***not*** copy this file to the directory where your Fortran
+! application is compiled unless it is absolutely necessary!  Most
+! modern Fortran compilers now support the -I command line flag, which
+! tells the compiler where to find .h files (specifically, this one).  For
+! example:
+!
+! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include
+!
+! will probably do the trick (assuming that you have set OMPI_HOME
+! properly).
+!
+! That being said, OMPI's "mpif77" wrapper compiler should
+! automatically include the -I option for you.  The following command
+! should be equivalent to the command listed above:
+!
+! shell$ mpif77 foo.f -o foo
+!
+! You should not copy this file to your local directory because it is
+! possible that this file will be changed between versions of Open MPI.
+! Indeed, this mpif.h is incompatible with the mpif.h of other
+! implementations of MPI.  Using this mpif.h with other implementations
+! of MPI, or with other versions of Open MPI, will result in undefined
+! behavior (to include incorrect results, segmentation faults,
+! unexplainable "hanging" in your application, etc.).  Always use the
+! -I command line option instead (or let mpif77 do it for you).
+!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+!
+! This file is included as a back-end file to both mpif.h (i.e., the
+! standardized MPI Fortran header file) and a bunch of the MPI
+! Fortran 90 subroutine implementations found in ompi/mpi/f90.
+!
+! This file contains the output from configure that is relevant for
+! Fortran applications (both 77 and 90) and a few values that are
+! necessary to compile the F90 module (e.g., MPI_STATUS_SIZE).
+!
+
+ integer MPI_FILE_NULL
+ integer MPI_SEEK_SET, MPI_SEEK_CUR, MPI_SEEK_END
+ integer MPI_MODE_CREATE
+ integer MPI_MODE_RDONLY, MPI_MODE_WRONLY, MPI_MODE_RDWR
+ integer MPI_MODE_DELETE_ON_CLOSE, MPI_MODE_UNIQUE_OPEN
+ integer MPI_MODE_EXCL, MPI_MODE_APPEND, MPI_MODE_SEQUENTIAL
+ integer MPI_DISPLACEMENT_CURRENT
+
+ parameter (MPI_FILE_NULL=0)
+ parameter (MPI_SEEK_SET=600)
+ parameter (MPI_SEEK_CUR=602)
+ parameter (MPI_SEEK_END=604)
+ parameter (MPI_MODE_CREATE=1)
+ parameter (MPI_MODE_RDONLY=2)
+ parameter (MPI_MODE_WRONLY=4)
+ parameter (MPI_MODE_RDWR=8)
+ parameter (MPI_MODE_DELETE_ON_CLOSE=16)
+ parameter (MPI_MODE_UNIQUE_OPEN=32)
+ parameter (MPI_MODE_EXCL=64)
+ parameter (MPI_MODE_APPEND=128)
+ parameter (MPI_MODE_SEQUENTIAL=256)
+ parameter (MPI_DISPLACEMENT_CURRENT=-54278278)
diff --git a/oshmem/include/mpif.h.in b/oshmem/include/mpif.h.in
new file mode 100644
index 0000000000..48751683f3
--- /dev/null
+++ b/oshmem/include/mpif.h.in
@@ -0,0 +1,76 @@
+! -*- fortran -*-
+!
+! Copyright (c) 2013 Mellanox Technologies, Inc.
+! All rights reserved.
+! $COPYRIGHT$
+!
+! Additional copyrights may follow
+!
+! $HEADER$
+!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!
+! Do ***not*** copy this file to the directory where your Fortran
+! application is compiled unless it is absolutely necessary!  Most
+! modern Fortran compilers now support the -I command line flag, which
+! tells the compiler where to find .h files (specifically, this one).  For
+! example:
+!
+! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include
+!
+! will probably do the trick (assuming that you have set OMPI_HOME
+! properly).
+!
+! That being said, OMPI's "mpif77" wrapper compiler should
+! automatically include the -I option for you.  The following command
+! should be equivalent to the command listed above:
+!
+! shell$ mpif77 foo.f -o foo
+!
+! You should not copy this file to your local directory because it is
+! possible that this file will be changed between versions of Open MPI.
+! Indeed, this mpif.h is incompatible with the mpif.h of other
+! implementations of MPI.  Using this mpif.h with other implementations
+! of MPI, or with other versions of Open MPI, will result in undefined
+! behavior (to include incorrect results, segmentation faults,
+! unexplainable "hanging" in your application, etc.).  Always use the
+! -I command line option instead (or let mpif77 do it for you).
+!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+!
+! Include the back-end file that has the bulk of the MPI Fortran
+! interface.
+!
+
+ include 'mpif-common.h'
+
+!
+! These "external" statements are specific to the MPI F77 interface
+! (and are toxic to the MPI F90 interface), and are therefore in the
+! MPI F77-specific header file (i.e., this one).
+!
+ external MPI_NULL_COPY_FN, MPI_NULL_DELETE_FN
+ external MPI_COMM_NULL_COPY_FN, MPI_COMM_NULL_DELETE_FN
+ external MPI_TYPE_NULL_COPY_FN, MPI_TYPE_NULL_DELETE_FN
+ external MPI_DUP_FN, MPI_COMM_DUP_FN, MPI_TYPE_DUP_FN
+ external MPI_WIN_NULL_COPY_FN
+ external MPI_WIN_NULL_DELETE_FN
+ external MPI_WIN_DUP_FN
+! Note that MPI_CONVERSION_FN_NULL is a "constant" (it is only ever
+! checked for comparison; it is never invoked), but it is passed as
+! a function pointer (to MPI_REGISTER_DATAREP) and therefore must be
+! the same size/type.  It is therefore external'ed here, and not
+! defined with an integer value in mpif-common.h.
+ external MPI_CONVERSION_FN_NULL
+
+!
+! double precision functions
+!
+ external MPI_WTIME, MPI_WTICK @MPIF_H_PMPI_W_FUNCS@
+ double precision MPI_WTIME, MPI_WTICK @MPIF_H_PMPI_W_FUNCS@
+
diff --git a/oshmem/include/mpp/shmem.fh b/oshmem/include/mpp/shmem.fh
new file mode 100644
index 0000000000..2b06f34026
--- /dev/null
+++ b/oshmem/include/mpp/shmem.fh
@@ -0,0 +1,11 @@
+!
+! Copyright (c) 2013 Mellanox Technologies, Inc.
+! All rights reserved.
+! $COPYRIGHT$
+!
+! Additional copyrights may follow
+!
+! $HEADER$
+!
+
+ include 'shmem.fh'
diff --git a/oshmem/include/mpp/shmem.h b/oshmem/include/mpp/shmem.h
new file mode 100644
index 0000000000..19b92ea388
--- /dev/null
+++ b/oshmem/include/mpp/shmem.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef __MPP_SHMEM_H__
+#define __MPP_SHMEM_H__
+
+#include <shmem.h>
+
+#endif
\ No newline at end of file
diff --git a/oshmem/include/oshmem/Makefile.am b/oshmem/include/oshmem/Makefile.am
new file mode 100644
index 0000000000..e7d43e600c
--- /dev/null
+++ b/oshmem/include/oshmem/Makefile.am
@@ -0,0 +1,15 @@
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+# All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+
+
+headers += \
+ oshmem/constants.h \
+ oshmem/types.h
+
+nodist_headers += \
+ oshmem/version.h
diff --git a/oshmem/include/oshmem/constants.h b/oshmem/include/oshmem/constants.h
new file mode 100644
index 0000000000..ea22f94b0b
--- /dev/null
+++ b/oshmem/include/oshmem/constants.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OSHMEM_CONSTANTS_H
+#define OSHMEM_CONSTANTS_H
+
+#include "orte/constants.h"
+#if defined(OSHMEM_PROFILING) && (OSHMEM_PROFILING == 1)
+#include "oshmem/shmem/c/profile/defines.h"
+#endif
+#include "oshmem/include/shmem.h"
+
+
+#define OSHMEM_ERR_BASE ORTE_ERR_MAX
+
+/* error codes */
+enum {
+    /* Error codes inherited from ORTE/OPAL.  Still enum values so
+       that we might get nice debugger help */
+    OSHMEM_SUCCESS = ORTE_SUCCESS,
+
+    OSHMEM_ERROR = ORTE_ERROR,
+    OSHMEM_ERR_OUT_OF_RESOURCE = ORTE_ERR_OUT_OF_RESOURCE,
+    OSHMEM_ERR_TEMP_OUT_OF_RESOURCE = ORTE_ERR_TEMP_OUT_OF_RESOURCE,
+    OSHMEM_ERR_RESOURCE_BUSY = ORTE_ERR_RESOURCE_BUSY,
+    OSHMEM_ERR_BAD_PARAM = ORTE_ERR_BAD_PARAM,
+    OSHMEM_ERR_FATAL = ORTE_ERR_FATAL,
+    OSHMEM_ERR_NOT_IMPLEMENTED = ORTE_ERR_NOT_IMPLEMENTED,
+    OSHMEM_ERR_NOT_SUPPORTED = ORTE_ERR_NOT_SUPPORTED,
+    OSHMEM_ERR_INTERUPTED = ORTE_ERR_INTERUPTED,
+    OSHMEM_ERR_WOULD_BLOCK = ORTE_ERR_WOULD_BLOCK,
+    OSHMEM_ERR_IN_ERRNO = ORTE_ERR_IN_ERRNO,
+    OSHMEM_ERR_UNREACH = ORTE_ERR_UNREACH,
+    OSHMEM_ERR_NOT_FOUND = ORTE_ERR_NOT_FOUND,
+    OSHMEM_EXISTS = ORTE_EXISTS, /* indicates that the specified object already exists */
+    OSHMEM_ERR_TIMEOUT = ORTE_ERR_TIMEOUT,
+    OSHMEM_ERR_NOT_AVAILABLE = ORTE_ERR_NOT_AVAILABLE,
+    OSHMEM_ERR_PERM = ORTE_ERR_PERM,
+    OSHMEM_ERR_VALUE_OUT_OF_BOUNDS = ORTE_ERR_VALUE_OUT_OF_BOUNDS,
+    OSHMEM_ERR_FILE_READ_FAILURE = ORTE_ERR_FILE_READ_FAILURE,
+    OSHMEM_ERR_FILE_WRITE_FAILURE = ORTE_ERR_FILE_WRITE_FAILURE,
+    OSHMEM_ERR_FILE_OPEN_FAILURE = ORTE_ERR_FILE_OPEN_FAILURE,
+
+    OSHMEM_ERR_RECV_LESS_THAN_POSTED = ORTE_ERR_RECV_LESS_THAN_POSTED,
+    OSHMEM_ERR_RECV_MORE_THAN_POSTED = ORTE_ERR_RECV_MORE_THAN_POSTED,
+    OSHMEM_ERR_NO_MATCH_YET = ORTE_ERR_NO_MATCH_YET,
+    OSHMEM_ERR_BUFFER = ORTE_ERR_BUFFER,
+    OSHMEM_ERR_REQUEST = ORTE_ERR_REQUEST,
+    OSHMEM_ERR_NO_CONNECTION_ALLOWED = ORTE_ERR_NO_CONNECTION_ALLOWED,
+    OSHMEM_ERR_CONNECTION_REFUSED = ORTE_ERR_CONNECTION_REFUSED,
+    OSHMEM_ERR_CONNECTION_FAILED = ORTE_ERR_CONNECTION_FAILED,
+    OSHMEM_PACK_MISMATCH = ORTE_ERR_PACK_MISMATCH,
+    OSHMEM_ERR_PACK_FAILURE = ORTE_ERR_PACK_FAILURE,
+    OSHMEM_ERR_UNPACK_FAILURE = ORTE_ERR_UNPACK_FAILURE,
+    OSHMEM_ERR_COMM_FAILURE = ORTE_ERR_COMM_FAILURE,
+    OSHMEM_UNPACK_INADEQUATE_SPACE = ORTE_ERR_UNPACK_INADEQUATE_SPACE,
+    OSHMEM_UNPACK_READ_PAST_END_OF_BUFFER = ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER,
+    OSHMEM_ERR_TYPE_MISMATCH = ORTE_ERR_TYPE_MISMATCH,
+    OSHMEM_ERR_COMPARE_FAILURE = ORTE_ERR_COMPARE_FAILURE,
+    OSHMEM_ERR_COPY_FAILURE = ORTE_ERR_COPY_FAILURE,
+    OSHMEM_ERR_UNKNOWN_DATA_TYPE = ORTE_ERR_UNKNOWN_DATA_TYPE,
+    OSHMEM_ERR_DATA_TYPE_REDEF = ORTE_ERR_DATA_TYPE_REDEF,
+    OSHMEM_ERR_DATA_OVERWRITE_ATTEMPT = ORTE_ERR_DATA_OVERWRITE_ATTEMPT
+};
+
+#define OSHMEM_ERR_MAX (OSHMEM_ERR_BASE - 1)
+
+
+/* C datatypes */
+/*
+ * SHMEM datatype constants
+ * Do not change the order of these without also modifying mpif.h.in.
+ */ +enum { + SHMEM_NULL = 0, + SHMEM_CHAR, + SHMEM_UCHAR, + SHMEM_SHORT, + SHMEM_USHORT, + SHMEM_INT, + SHMEM_UINT, + SHMEM_LONG, + SHMEM_ULONG, + SHMEM_LLONG, + SHMEM_ULLONG, + SHMEM_FLOAT, + SHMEM_DOUBLE, + SHMEM_LDOUBLE, + + SHMEM_FINT, + SHMEM_FINT4, + SHMEM_FINT8 +}; + + +/* + * Miscellaneous constants + */ +#define SHMEM_ANY_SOURCE -1 /* match any source rank */ +#define SHMEM_PROC_NULL -2 /* rank of null process */ +#define SHMEM_UNDEFINED -32766 /* undefined stuff */ + + +#ifndef UNREFERENCED_PARAMETER +#define UNREFERENCED_PARAMETER(P) ((void)P) +#endif + +#define OSHMEM_PREDEFINED_GLOBAL(type, global) ((type) ((void *) &(global))) + +#if OMPI_WANT_MEMCHECKER +#define MEMCHECKER(x) do { \ + x; \ + } while(0) +#else +#define MEMCHECKER(x) +#endif /* OMPI_WANT_MEMCHECKER */ + + +#endif /* OSHMEM_CONSTANTS_H */ + diff --git a/oshmem/include/oshmem/types.h b/oshmem/include/oshmem/types.h new file mode 100644 index 0000000000..e78e459521 --- /dev/null +++ b/oshmem/include/oshmem/types.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef SHMEM_TYPES_H +#define SHMEM_TYPES_H + + +/* + * Predefine some internal types so we dont need all the include + * dependencies. + */ + + struct oshmem_proc_t; + struct oshmem_group_t; + struct oshmem_op_t; + +#endif diff --git a/oshmem/include/oshmem/version.h.in b/oshmem/include/oshmem/version.h.in new file mode 100644 index 0000000000..697b0edca4 --- /dev/null +++ b/oshmem/include/oshmem/version.h.in @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * This file should be included by any file that needs full + * version information for the OSHMEM project + */ + +#ifndef OSHMEM_VERSIONS_H +#define OSHMEM_VERSIONS_H + +#define OSHMEM_MAJOR_VERSION @OSHMEM_MAJOR_VERSION@ +#define OSHMEM_MINOR_VERSION @OSHMEM_MINOR_VERSION@ +#define OSHMEM_RELEASE_VERSION @OSHMEM_RELEASE_VERSION@ +#define OSHMEM_GREEK_VERSION "@OSHMEM_GREEK_VERSION@" +#define OSHMEM_WANT_REPO_REV @OSHMEM_WANT_REPO_REV@ +#define OSHMEM_REPO_REV "@OSHMEM_REPO_REV@" +#ifdef OSHMEM_VERSION +/* If we included version.h, we want the real version, not the + stripped (no-r number) version */ +#undef OSHMEM_VERSION +#endif +#define OSHMEM_VERSION "@OSHMEM_VERSION@" + +#endif diff --git a/oshmem/include/oshmem_config.h.in b/oshmem/include/oshmem_config.h.in new file mode 100644 index 0000000000..a2b517e7dc --- /dev/null +++ b/oshmem/include/oshmem_config.h.in @@ -0,0 +1,125 @@ +/* -*- c -*- + * + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * Function: - OS, CPU and compiler dependent configuration + */ + +#ifndef OSHMEM_CONFIG_H +#define OSHMEM_CONFIG_H + +#include "opal_config.h" + +#define OSHMEM_IDENT_STRING OPAL_IDENT_STRING + +/*********************************************************************** + * + * OMPI-specific Fortran code that should be in ompi_config.h, but not + * in the other projects. 
+ * + **********************************************************************/ + +/* MPI_Fint is the same as ompi_fortran_INTEGER_t */ +#define MPI_Fint ompi_fortran_integer_t + +#if OMPI_HAVE_FORTRAN_REAL && OMPI_HAVE_FORTRAN_COMPLEX +/* * C type for Fortran COMPLEX */ +/*typedef struct { + ompi_fortran_real_t real; + ompi_fortran_real_t imag; +} ompi_fortran_complex_t;*/ +#endif + +#if OMPI_HAVE_FORTRAN_REAL4 && OMPI_HAVE_FORTRAN_COMPLEX8 +/* * C type for Fortran COMPLEX*8 */ +/*typedef struct { + ompi_fortran_real4_t real; + ompi_fortran_real4_t imag; +} ompi_fortran_complex8_t;*/ +#endif + +#if OMPI_HAVE_FORTRAN_REAL8 && OMPI_HAVE_FORTRAN_COMPLEX16 +/* * C type for Fortran COMPLEX*16 */ +/*typedef struct { + ompi_fortran_real8_t real; + ompi_fortran_real8_t imag; +} ompi_fortran_complex16_t;*/ +#endif + +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_HAVE_FORTRAN_COMPLEX32 +/* * C type for Fortran COMPLEX*32 */ +/*typedef struct { + ompi_fortran_real16_t real; + ompi_fortran_real16_t imag; +} ompi_fortran_complex32_t;*/ +#endif + +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +/* * C type for Fortran DOUBLE COMPLEX */ +/*typedef struct { + ompi_fortran_double_precision_t real; + ompi_fortran_double_precision_t imag; +} ompi_fortran_double_complex_t;*/ +#endif + +#if OPAL_HAVE_ATTRIBUTE_DESTRUCTOR +# define __opal_attribute_destructor__ __attribute__((__destructor__)) +#else +# define __opal_attribute_destructor__ +#endif + +#if defined(__WINDOWS__) + +# if defined(_USRDLL) /* building shared libraries (.DLL) */ +# if defined(OSHMEM_EXPORTS) +# define OSHMEM_DECLSPEC __declspec(dllexport) +# define OSHMEM_MODULE_DECLSPEC +# else +# define OSHMEM_DECLSPEC __declspec(dllimport) +# if defined(OSHMEM_MODULE_EXPORTS) +# define OSHMEM_MODULE_DECLSPEC __declspec(dllexport) +# else +# define OSHMEM_MODULE_DECLSPEC __declspec(dllimport) +# endif /* defined(OSHMEM_MODULE_EXPORTS) */ +# endif /* defined(OSHMEM_EXPORTS) */ +# else /* building static library */ +# if defined(OSHMEM_IMPORTS) +# define OSHMEM_DECLSPEC __declspec(dllimport) +# else +# define OSHMEM_DECLSPEC +# endif /* defined(OSHMEM_IMPORTS) */ +# define OSHMEM_MODULE_DECLSPEC +# endif /* defined(_USRDLL) */ + +#else + +# if OPAL_C_HAVE_VISIBILITY +# ifndef OSHMEM_DECLSPEC +# define OSHMEM_DECLSPEC __opal_attribute_visibility__("default") +# endif +# ifndef OSHMEM_MODULE_DECLSPEC +# define OSHMEM_MODULE_DECLSPEC __opal_attribute_visibility__("default") +# endif +# ifndef OSHMEM_DESTRUCTOR +# define OSHMEM_DESTRUCTOR __opal_attribute_destructor__ +# endif +# else +# ifndef OSHMEM_DECLSPEC +# define OSHMEM_DECLSPEC +# endif +# ifndef OSHMEM_MODULE_DECLSPEC +# define OSHMEM_MODULE_DECLSPEC +# endif +# ifndef OSHMEM_DESTRUCTOR +# define OSHMEM_DESTRUCTOR +# endif +# endif +#endif /* defined(__WINDOWS__) */ + +#endif diff --git a/oshmem/include/shmem.fh b/oshmem/include/shmem.fh new file mode 100644 index 0000000000..5289c0e54c --- /dev/null +++ b/oshmem/include/shmem.fh @@ -0,0 +1,55 @@ +! Emacs: -*- mode: fortran; -*- +! +! Copyright (c) 2013 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! + +! +! TODO: exact values should be found during configuration +! 
+
+ integer SHMEM_BARRIER_SYNC_SIZE
+ parameter ( SHMEM_BARRIER_SYNC_SIZE = 4 )
+
+ integer SHMEM_BCAST_SYNC_SIZE
+ parameter ( SHMEM_BCAST_SYNC_SIZE = 8 )
+
+
+ integer SHMEM_COLLECT_SYNC_SIZE
+ parameter ( SHMEM_COLLECT_SYNC_SIZE = 8 )
+
+ integer SHMEM_REDUCE_SYNC_SIZE
+ parameter ( SHMEM_REDUCE_SYNC_SIZE = 8 )
+
+ integer SHMEM_SYNC_VALUE
+ parameter ( SHMEM_SYNC_VALUE = -1 )
+
+ integer SHMEM_REDUCE_MIN_WRKDATA_SIZE
+ parameter ( SHMEM_REDUCE_MIN_WRKDATA_SIZE = 8 )
+
+!
+! waits
+!
+ integer SHMEM_CMP_EQ
+ parameter ( SHMEM_CMP_EQ = 0 )
+ integer SHMEM_CMP_NE
+ parameter ( SHMEM_CMP_NE = 1 )
+ integer SHMEM_CMP_GT
+ parameter ( SHMEM_CMP_GT = 2 )
+ integer SHMEM_CMP_LE
+ parameter ( SHMEM_CMP_LE = 3 )
+ integer SHMEM_CMP_LT
+ parameter ( SHMEM_CMP_LT = 4 )
+ integer SHMEM_CMP_GE
+ parameter ( SHMEM_CMP_GE = 5 )
+
+
+ logical shmem_pe_accessible
+ logical shmem_addr_accessible
+
+ integer*8 shmem_ptr
diff --git a/oshmem/include/shmem.h.in b/oshmem/include/shmem.h.in
new file mode 100644
index 0000000000..2d6814a854
--- /dev/null
+++ b/oshmem/include/shmem.h.in
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OSHMEM_SHMEM_H
+#define OSHMEM_SHMEM_H
+
+
+#include <stddef.h> /* include for ptrdiff_t */
+
+#if defined(WIN32) || defined(_WIN32)
+# define OSHMEM_COMPLEX_TYPE(type)
+#else
+# if defined(c_plusplus) || defined(__cplusplus)
+# include <complex>
+# define OSHMEM_COMPLEX_TYPE(type) std::complex<type>
+# else
+# include <complex.h>
+# define OSHMEM_COMPLEX_TYPE(type) type complex
+# endif
+#endif
+
+
+/*
+ * SHMEM version
+ */
+#define SHMEM_VERSION 1
+#define SHMEM_SUBVERSION 5
+
+
+#ifndef OSHMEM_DECLSPEC
+# if defined(WIN32) || defined(_WIN32)
+# if defined(OSHMEM_IMPORTS)
+# define OSHMEM_DECLSPEC __declspec(dllimport)
+# else
+# define OSHMEM_DECLSPEC
+# endif /* defined(OSHMEM_IMPORTS) */
+# else
+# if defined(OPAL_C_HAVE_VISIBILITY) && (OPAL_C_HAVE_VISIBILITY == 1)
+# define OSHMEM_DECLSPEC __attribute__((visibility("default")))
+# else
+# define OSHMEM_DECLSPEC
+# endif
+# endif
+#endif
+
+#ifndef OSHMEM_DESTRUCTOR
+# if defined(OPAL_C_HAVE_VISIBILITY) && (OPAL_C_HAVE_VISIBILITY == 1)
+# define OSHMEM_DESTRUCTOR __attribute__((__destructor__))
+# else
+# define OSHMEM_DESTRUCTOR
+# endif
+#endif
+
+
+#if defined(c_plusplus) || defined(__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+ * OpenSHMEM API (www.openshmem.org)
+ */
+
+/*
+ * Environment variables
+ */
+
+/* Size of the symmetric heap in bytes.
+ * Can be qualified with the letter 'K', 'M', 'G' or 'T' (e.g., 256M).
+ */
+#define SHMEM_HEAP_SIZE "SHMEM_SYMMETRIC_HEAP_SIZE"
+
+/* The following environment variables are Mellanox extensions */
+
+/*
+ * Type of allocator used by the symmetric heap
+ */
+#define SHMEM_HEAP_TYPE "SHMEM_SYMMETRIC_HEAP_ALLOCATOR"
+
+/*
+ * Constants and definitions
+ */
+enum shmem_wait_ops {
+    SHMEM_CMP_EQ,
+    SHMEM_CMP_NE,
+    SHMEM_CMP_GT,
+    SHMEM_CMP_LE,
+    SHMEM_CMP_LT,
+    SHMEM_CMP_GE
+};
+
+#define _SHMEM_BARRIER_SYNC_SIZE (1)
+#define _SHMEM_BCAST_SYNC_SIZE (1 + _SHMEM_BARRIER_SYNC_SIZE)
+#define _SHMEM_COLLECT_SYNC_SIZE (1 + _SHMEM_BCAST_SYNC_SIZE)
+#define _SHMEM_REDUCE_SYNC_SIZE (1 + _SHMEM_BCAST_SYNC_SIZE)
+#define _SHMEM_REDUCE_MIN_WRKDATA_SIZE (1)
+#define _SHMEM_SYNC_VALUE (-1)
+
+#define SHMEM_BARRIER_SYNC_SIZE _SHMEM_BARRIER_SYNC_SIZE
+#define SHMEM_BCAST_SYNC_SIZE _SHMEM_BCAST_SYNC_SIZE
+#define SHMEM_COLLECT_SYNC_SIZE _SHMEM_COLLECT_SYNC_SIZE
+#define SHMEM_REDUCE_SYNC_SIZE _SHMEM_REDUCE_SYNC_SIZE
+#define SHMEM_REDUCE_MIN_WRKDATA_SIZE _SHMEM_REDUCE_MIN_WRKDATA_SIZE
+#define SHMEM_SYNC_VALUE _SHMEM_SYNC_VALUE
+
+
+/*
+ * Initialization routines
+ */
+OSHMEM_DECLSPEC void start_pes(int npes);
+
+
+/*
+ * Query routines
+ */
+OSHMEM_DECLSPEC int _num_pes(void);
+OSHMEM_DECLSPEC int _my_pe(void);
+
+
+/*
+ * Accessibility routines
+ */
+OSHMEM_DECLSPEC int shmem_pe_accessible(int pe);
+OSHMEM_DECLSPEC int shmem_addr_accessible(void *addr, int pe);
+
+/*
+ * Symmetric heap routines
+ */
+OSHMEM_DECLSPEC void* shmalloc(size_t size);
+OSHMEM_DECLSPEC void* shmemalign(size_t align, size_t size);
+OSHMEM_DECLSPEC void* shrealloc(void *ptr, size_t size);
+OSHMEM_DECLSPEC void shfree(void* ptr);
+
+/*
+ * Remote pointer operations
+ */
+OSHMEM_DECLSPEC void *shmem_ptr(void *ptr, int pe);
+
+/*
+ * Elemental put routines
+ */
+OSHMEM_DECLSPEC void shmem_short_p(short* addr, short value, int pe);
+OSHMEM_DECLSPEC void shmem_int_p(int* addr, int value, int pe);
+OSHMEM_DECLSPEC void shmem_long_p(long* addr, long value, int pe);
+OSHMEM_DECLSPEC void shmem_float_p(float* addr, float value, int pe);
+OSHMEM_DECLSPEC void shmem_double_p(double* addr, double value, int pe);
+OSHMEM_DECLSPEC void shmem_longlong_p(long long* addr, long long value, int pe);
+OSHMEM_DECLSPEC void shmem_longdouble_p(long double* addr, long double value, int pe);
+
+/*
+ * Block data put routines
+ */
+OSHMEM_DECLSPEC void shmem_char_put(char *target, const char *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_short_put(short *target, const short *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_int_put(int* target, const int* source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_long_put(long *target, const long *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_float_put(float *target, const float *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_double_put(double *target, const double *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_longlong_put(long long *target, const long long *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_longdouble_put(long double *target, const long double *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_put32(void *target, const void *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_put64(void *target, const void *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_put128(void *target, const void *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_putmem(void *target, const void *source, size_t len, int pe);
+
+/*
+ * Strided put routines
+ */
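+/*
+ * Editor's sketch of the strided-put semantics declared below
+ * (hypothetical symmetric arrays; not part of the installed header):
+ * copy every second element of src into consecutive slots of dst on
+ * processing element pe, then wait for remote completion:
+ *
+ *   long src[8], dst[4];
+ *   shmem_long_iput(dst, src, 1, 2, 4, pe);   target stride 1, source stride 2
+ *   shmem_quiet();
+ */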
+OSHMEM_DECLSPEC void shmem_int_iput(int* target, const int* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_short_iput(short* target, const short* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_float_iput(float* target, const float* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_double_iput(double* target, const double* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longlong_iput(long long* target, const long long* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longdouble_iput(long double* target, const long double* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_long_iput(long* target, const long* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iput32(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iput64(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iput128(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); + +/* + * Elemental get routines + */ +OSHMEM_DECLSPEC short shmem_short_g(short* addr, int pe); +OSHMEM_DECLSPEC int shmem_int_g(int* addr, int pe); +OSHMEM_DECLSPEC long shmem_long_g(long* addr, int pe); +OSHMEM_DECLSPEC float shmem_float_g(float* addr, int pe); +OSHMEM_DECLSPEC double shmem_double_g(double* addr, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_g(long long* addr, int pe); +OSHMEM_DECLSPEC long double shmem_longdouble_g(long double* addr, int pe); + +/* + * Block data get routines + */ +OSHMEM_DECLSPEC void shmem_char_get(char *target, const char *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_short_get(short *target, const short *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_int_get(int *target, const int *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_long_get(long *target, const long *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_float_get(float *target, const float *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_double_get(double *target, const double *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longlong_get(long long *target, const long long *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longdouble_get(long double *target, const long double *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_get32(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_get64(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_get128(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_getmem(void *target, const void *source, size_t len, int pe); + +/* + * Strided get routines + */ +OSHMEM_DECLSPEC void shmem_int_iget(int* target, const int* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_short_iget(short* target, const short* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_float_iget(float* target, const float* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_double_iget(double* target, const double* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longlong_iget(long long* target, const long long* source, ptrdiff_t tst, ptrdiff_t 
sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longdouble_iget(long double* target, const long double* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_long_iget(long* target, const long* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iget32(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iget64(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iget128(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); + +/* + * Atomic operations + */ +/* Atomic swap */ +OSHMEM_DECLSPEC long shmem_swap(long *target, long value, int pe); +OSHMEM_DECLSPEC double shmem_double_swap(double *target, double value, int pe); +OSHMEM_DECLSPEC float shmem_float_swap(float *target, float value, int pe); +OSHMEM_DECLSPEC int shmem_int_swap(int *target, int value, int pe); +OSHMEM_DECLSPEC long shmem_long_swap(long *target, long value, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_swap(long long*target, long long value, int pe); + +/* Atomic conditional swap */ +OSHMEM_DECLSPEC int shmem_int_cswap(int *target, int cond, int value, int pe); +OSHMEM_DECLSPEC long shmem_long_cswap(long *target, long cond, long value, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_cswap(long long *target, long long cond, long long value, int pe); + +/* Atomic Fetch&Add */ +OSHMEM_DECLSPEC int shmem_int_fadd(int *target, int value, int pe); +OSHMEM_DECLSPEC long shmem_long_fadd(long *target, long value, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_fadd(long long *target, long long value, int pe); + +/* Atomic Fetch&Inc */ +OSHMEM_DECLSPEC int shmem_int_finc(int *target, int pe); +OSHMEM_DECLSPEC long shmem_long_finc(long *target, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_finc(long long *target, int pe); + +/* Atomic Add*/ +OSHMEM_DECLSPEC void shmem_int_add(int *target, int value, int pe); +OSHMEM_DECLSPEC void shmem_long_add(long *target, long value, int pe); +OSHMEM_DECLSPEC void shmem_longlong_add(long long *target, long long value, int pe); + +/* Atomic Inc */ +OSHMEM_DECLSPEC void shmem_int_inc(int *target, int pe); +OSHMEM_DECLSPEC void shmem_long_inc(long *target, int pe); +OSHMEM_DECLSPEC void shmem_longlong_inc(long long *target, int pe); + +/* + * Lock functions + */ +OSHMEM_DECLSPEC void shmem_set_lock(long *lock); +OSHMEM_DECLSPEC void shmem_clear_lock(long *lock); +OSHMEM_DECLSPEC int shmem_test_lock(long *lock); + +/* + * P2P sync routines + */ +OSHMEM_DECLSPEC void shmem_short_wait(short *addr, short value); +OSHMEM_DECLSPEC void shmem_int_wait(int *addr, int value); +OSHMEM_DECLSPEC void shmem_long_wait(long *addr, long value); +OSHMEM_DECLSPEC void shmem_longlong_wait(long long *addr, long long value); +OSHMEM_DECLSPEC void shmem_wait(long *addr, long value); + +OSHMEM_DECLSPEC void shmem_short_wait_until(short *addr, int cmp, short value); +OSHMEM_DECLSPEC void shmem_int_wait_until(int *addr, int cmp, int value); +OSHMEM_DECLSPEC void shmem_long_wait_until(long *addr, int cmp, long value); +OSHMEM_DECLSPEC void shmem_longlong_wait_until(long long *addr, int cmp, long long value); +OSHMEM_DECLSPEC void shmem_wait_until(long *addr, int cmp, long value); + +/* + * Barrier sync routines + */ +OSHMEM_DECLSPEC void shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_barrier_all(void); +OSHMEM_DECLSPEC void 
shmem_fence(void); +OSHMEM_DECLSPEC void shmem_quiet(void); + +/* + * Collective routines + */ +OSHMEM_DECLSPEC void shmem_broadcast32(void *target, const void *source, size_t nlong, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_broadcast64(void *target, const void *source, size_t nlong, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_broadcast(void *target, const void *source, size_t nlong, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_collect32(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_collect64(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_fcollect32(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_fcollect64(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync); + +/* + * Reduction routines + */ +OSHMEM_DECLSPEC void shmem_short_and_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_and_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_and_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_and_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_or_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_or_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_or_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_or_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_xor_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_xor_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_xor_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_xor_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_max_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_max_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_max_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int 
PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_max_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_float_max_to_all(float *target, float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_double_max_to_all(double *target, double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longdouble_max_to_all(long double *target, long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_min_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_min_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_min_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_min_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_float_min_to_all(float *target, float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_double_min_to_all(double *target, double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longdouble_min_to_all(long double *target, long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_sum_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_sum_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_sum_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_sum_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_float_sum_to_all(float *target, float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_double_sum_to_all(double *target, double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longdouble_sum_to_all(long double *target, long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_complexf_sum_to_all(OSHMEM_COMPLEX_TYPE(float) *target, OSHMEM_COMPLEX_TYPE(float) *source, int nreduce, int PE_start, int logPE_stride, int PE_size, OSHMEM_COMPLEX_TYPE(float) *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_complexd_sum_to_all(OSHMEM_COMPLEX_TYPE(double) *target, OSHMEM_COMPLEX_TYPE(double) *source, int nreduce, int PE_start, int logPE_stride, int PE_size, OSHMEM_COMPLEX_TYPE(double) *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_prod_to_all(short *target, short *source, 
int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync);
+OSHMEM_DECLSPEC void shmem_int_prod_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync);
+OSHMEM_DECLSPEC void shmem_long_prod_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync);
+OSHMEM_DECLSPEC void shmem_longlong_prod_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync);
+OSHMEM_DECLSPEC void shmem_float_prod_to_all(float *target, float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync);
+OSHMEM_DECLSPEC void shmem_double_prod_to_all(double *target, double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync);
+OSHMEM_DECLSPEC void shmem_longdouble_prod_to_all(long double *target, long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync);
+OSHMEM_DECLSPEC void shmem_complexf_prod_to_all(OSHMEM_COMPLEX_TYPE(float) *target, OSHMEM_COMPLEX_TYPE(float) *source, int nreduce, int PE_start, int logPE_stride, int PE_size, OSHMEM_COMPLEX_TYPE(float) *pWrk, long *pSync);
+OSHMEM_DECLSPEC void shmem_complexd_prod_to_all(OSHMEM_COMPLEX_TYPE(double) *target, OSHMEM_COMPLEX_TYPE(double) *source, int nreduce, int PE_start, int logPE_stride, int PE_size, OSHMEM_COMPLEX_TYPE(double) *pWrk, long *pSync);
+
+/*
+ * Platform specific cache management routines
+ */
+OSHMEM_DECLSPEC void shmem_udcflush(void);
+OSHMEM_DECLSPEC void shmem_udcflush_line(void* target);
+OSHMEM_DECLSPEC void shmem_set_cache_inv(void);
+OSHMEM_DECLSPEC void shmem_set_cache_line_inv(void* target);
+OSHMEM_DECLSPEC void shmem_clear_cache_inv(void);
+OSHMEM_DECLSPEC void shmem_clear_cache_line_inv(void* target);
+
+/*
+ * Legacy API
+ */
+OSHMEM_DECLSPEC int num_pes(void);
+OSHMEM_DECLSPEC int my_pe(void);
+
+/* old init/destruct functions - not in the OpenSHMEM spec but still supported */
+OSHMEM_DECLSPEC void shmem_init(void);
+OSHMEM_DECLSPEC int shmem_finalize(void) OSHMEM_DESTRUCTOR;
+OSHMEM_DECLSPEC int shmem_n_pes(void);
+OSHMEM_DECLSPEC int shmem_my_pe(void);
+
+OSHMEM_DECLSPEC void shmem_put(void *target, const void *source, size_t len, int pe);
+OSHMEM_DECLSPEC void shmem_get(void *target, const void *source, size_t len, int pe);
+
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+
+
+#endif /* OSHMEM_SHMEM_H */
diff --git a/oshmem/include/shmem_portable_platform.h.in b/oshmem/include/shmem_portable_platform.h.in
new file mode 100644
index 0000000000..7658d68eb3
--- /dev/null
+++ b/oshmem/include/shmem_portable_platform.h.in
@@ -0,0 +1,401 @@
+/*
+ * Header file with preprocessor magic to figure out which compiler the
+ * user is calling.
+ *
+ * This code is adapted from the file other/portable_platform.h of GASNet-1.12.0:
+ * - Ripping out the required parts.
+ * - Get rid of brackets, as they mess up autoconf.
+ * - Delete version tests for older PGI versions (#include "omp.h" not acceptable).
+ * - Indent ('#' should be in column 0).
+ *
+ * External packages (vt, romio) depend on top_build_dir/ompi/include;
+ * therefore, although this file is not changed by the configure process,
+ * it has to be shipped as a .in file.
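+ *
+ * Editor's sketch of intended use (hypothetical snippet): after
+ * including this header, code can report the detected toolchain with
+ *
+ *   printf("compiler: %s %s\n", _STRINGIFY(PLATFORM_COMPILER_FAMILYNAME),
+ *          PLATFORM_COMPILER_VERSION_STR);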
+ * --------------------------------------------------------------------------- + */ +#ifndef MPI_PORTABLE_PLATFORM_H +#define MPI_PORTABLE_PLATFORM_H + +/* All files in this directory and all sub-directories (except where otherwise noted) + * are subject to the following licensing terms: + * + * --------------------------------------------------------------------------- + * "Copyright (c) 2013 Mellanox Technologies, Inc. + * " All rights reserved. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose, without fee, and without written agreement is + * hereby granted, provided that the above copyright notice and the following + * two paragraphs appear in all copies of this software. + * + * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT + * OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF + * CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS." + * --------------------------------------------------------------------------- + * + * Please see the license.txt files within the gm-conduit, lapi-conduit and + * vapi-conduit directories for the licensing terms governing those + * contributed components. + * + * The authors/contributors of GASNet include: + * + * Dan Bonachea : + * General infrastructure & documentation + * mpi-conduit + * elan-conduit + * smp-conduit + * udp-conduit + * extended-ref + * template-conduit + * Christian Bell : gm-conduit, shmem-conduit + * Mike Welcome : lapi-conduit, portals-conduit + * Paul H. Hargrove : vapi-conduit, ibv-conduit + * Rajesh Nishtala : collectives, dcmf-conduit + * Parry Husbands (PJRHusbands@lbl.gov): lapi-conduit + * + * For more information about GASNet, visit our home page at: + * http://gasnet.cs.berkeley.edu/ + * Or send email to: + * + * + * Source code contributions (fixes, patches, extensions etc.) should be + * sent to to be reviewed for acceptance into the primary + * distribution. Contributions are most likely to be accepted if they + * are provided as public domain, or under a BSD-style license such as + * the one above. + * + */ +#ifndef _STRINGIFY +#define _STRINGIFY_HELPER(x) #x +#define _STRINGIFY(x) _STRINGIFY_HELPER(x) +#endif + +#if defined(__INTEL_COMPILER) +# define PLATFORM_COMPILER_FAMILYNAME INTEL +# define PLATFORM_COMPILER_FAMILYID 2 +# ifdef __cplusplus +# define PLATFORM_COMPILER_INTEL_CXX 1 +# else +# define PLATFORM_COMPILER_INTEL_C 1 +# endif +# define _PLATFORM_COMPILER_INTEL_MIN_BUILDDATE 19700000 /* year 1970: predates most intel products :) */ +# ifdef __INTEL_COMPILER_BUILD_DATE +# define _PLATFORM_INTEL_COMPILER_BUILD_DATE __INTEL_COMPILER_BUILD_DATE +# else +# define _PLATFORM_INTEL_COMPILER_BUILD_DATE _PLATFORM_COMPILER_INTEL_MIN_BUILDDATE +# endif + /* patch number is a decimal build date: YYYYMMDD */ +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + (((((maj) * 10) | (min)) << 20) | \ + ((pat) < _PLATFORM_COMPILER_INTEL_MIN_BUILDDATE ? 
\
+        _PLATFORM_COMPILER_INTEL_MIN_BUILDDATE : ((pat)-_PLATFORM_COMPILER_INTEL_MIN_BUILDDATE)))
+# define PLATFORM_COMPILER_VERSION \
+        PLATFORM_COMPILER_VERSION_INT(__INTEL_COMPILER/10, __INTEL_COMPILER/100, _PLATFORM_INTEL_COMPILER_BUILD_DATE)
+# define PLATFORM_COMPILER_VERSION_STR \
+        _STRINGIFY(__INTEL_COMPILER)"."_STRINGIFY(_PLATFORM_INTEL_COMPILER_BUILD_DATE)
+
+#elif defined(__PATHSCALE__)
+# define PLATFORM_COMPILER_PATHSCALE 1
+# define PLATFORM_COMPILER_FAMILYNAME PATHSCALE
+# define PLATFORM_COMPILER_FAMILYID 3
+# ifdef __cplusplus
+# define PLATFORM_COMPILER_PATHSCALE_CXX 1
+# else
+# define PLATFORM_COMPILER_PATHSCALE_C 1
+# endif
+# define PLATFORM_COMPILER_VERSION \
+        PLATFORM_COMPILER_VERSION_INT(__PATHCC__,__PATHCC_MINOR__,__PATHCC_PATCHLEVEL__)
+# define PLATFORM_COMPILER_VERSION_STR __PATHSCALE__
+
+#elif defined(__PGI)
+# define PLATFORM_COMPILER_PGI 1
+# define PLATFORM_COMPILER_FAMILYNAME PGI
+# define PLATFORM_COMPILER_FAMILYID 4
+# ifdef __cplusplus
+# define PLATFORM_COMPILER_PGI_CXX 1
+# else
+# define PLATFORM_COMPILER_PGI_C 1
+# endif
+# if __PGIC__ == 99
+    /* bug 2230: PGI versioning was broken for some platforms in 7.0
+       no way to know exact version, but provide something slightly more accurate */
+# define PLATFORM_COMPILER_VERSION 0x070000
+# define PLATFORM_COMPILER_VERSION_STR "7.?-?"
+# elif defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__)
+# define PLATFORM_COMPILER_VERSION \
+        PLATFORM_COMPILER_VERSION_INT(__PGIC__,__PGIC_MINOR__,__PGIC_PATCHLEVEL__)
+# define PLATFORM_COMPILER_VERSION_STR \
+        _STRINGIFY(__PGIC__)"."_STRINGIFY(__PGIC_MINOR__)"-"_STRINGIFY(__PGIC_PATCHLEVEL__)
+# else
+    /* PGI before 6.1-4 lacks any version ID preprocessor macros - so use this filthy hack */
+    /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+     * We cannot do these within mpi.h.in, as we should not include ompi.h
+     * Hopefully, compilers with integrated preprocessors will not analyse code within the #if 0-block
+     * XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+     */
+#if 0
+# ifdef PLATFORM_PGI_IS_ANCIENT
+    /* Include below might fail for ancient versions lacking this header, but testing shows it
+       works back to at least 5.1-3 (Nov 2003), and based on docs probably back to 3.2 (Sep 2000) */
+# define PLATFORM_COMPILER_VERSION 0
+# elif defined(__x86_64__) /* bug 1753 - 64-bit omp.h upgrade happened in <6.0-8,6.1-1) */
+# include "omp.h"
+# if defined(_PGOMP_H)
+    /* 6.1.1 or newer */
+# define PLATFORM_COMPILER_VERSION 0x060101
+# define PLATFORM_COMPILER_VERSION_STR ">=6.1-1"
+# else
+    /* 6.0.8 or older */
+# define PLATFORM_COMPILER_VERSION 0
+# define PLATFORM_COMPILER_VERSION_STR "<=6.0-8"
+# endif
+# else /* 32-bit omp.h upgrade happened in <5.2-4,6.0-8 */
+# include "omp.h"
+# if defined(_PGOMP_H)
+    /* 6.0-8 or newer */
+# define PLATFORM_COMPILER_VERSION 0x060008
+# define PLATFORM_COMPILER_VERSION_STR ">=6.0-8"
+# else
+    /* 5.2-4 or older */
+# define PLATFORM_COMPILER_VERSION 0
+# define PLATFORM_COMPILER_VERSION_STR "<=5.2-4"
+# endif
+# endif
+#endif /* 0 */
+    /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */
+# endif
+
+#elif defined(__xlC__)
+# define PLATFORM_COMPILER_XLC 1
+# define PLATFORM_COMPILER_FAMILYNAME XLC
+# define PLATFORM_COMPILER_FAMILYID 5
+# ifdef __cplusplus
+# define PLATFORM_COMPILER_XLC_CXX 1
+# else
+# define PLATFORM_COMPILER_XLC_C 1
+# endif
+# define PLATFORM_COMPILER_VERSION __xlC__
+# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \
+        ( ((maj) << 8) | ((min) << 4) | (pat) )
+
+#elif defined(__DECC) || defined(__DECCXX)
+#
define PLATFORM_COMPILER_COMPAQ 1 +# define PLATFORM_COMPILER_FAMILYNAME COMPAQ +# define PLATFORM_COMPILER_FAMILYID 6 +# ifdef __cplusplus +# define PLATFORM_COMPILER_COMPAQ_CXX 1 +# else +# define PLATFORM_COMPILER_COMPAQ_C 1 +# endif +# if defined(__DECC_VER) +# define PLATFORM_COMPILER_VERSION __DECC_VER +# elif defined(__DECCXX_VER) +# define PLATFORM_COMPILER_VERSION __DECCXX_VER +# endif + +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) * 10000000) + ((min) * 100000) + (90000) + (pat) ) + /* 90000 = official ver, 80000 = customer special ver, 60000 = field test ver */ + +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# define PLATFORM_COMPILER_SUN 1 +# define PLATFORM_COMPILER_FAMILYNAME SUN +# define PLATFORM_COMPILER_FAMILYID 7 +# ifdef __cplusplus +# define PLATFORM_COMPILER_SUN_CXX 1 +# else +# define PLATFORM_COMPILER_SUN_C 1 +# endif +# if defined(__SUNPRO_C) && __SUNPRO_C > 0 +# define PLATFORM_COMPILER_VERSION __SUNPRO_C +# elif defined(__SUNPRO_CC) && __SUNPRO_CC > 0 +# define PLATFORM_COMPILER_VERSION __SUNPRO_CC +# endif +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) << 8) | ((min) << 4) | (pat) ) + +#elif defined(__HP_cc) || defined(__HP_aCC) +# define PLATFORM_COMPILER_HP 1 +# define PLATFORM_COMPILER_FAMILYNAME HP +# define PLATFORM_COMPILER_FAMILYID 8 +# ifdef __cplusplus +# define PLATFORM_COMPILER_HP_CXX 1 +# else +# define PLATFORM_COMPILER_HP_C 1 +# endif +# if defined(__HP_cc) && __HP_cc > 0 +# define PLATFORM_COMPILER_VERSION __HP_cc +# elif defined(__HP_aCC) && __HP_aCC > 0 +# define PLATFORM_COMPILER_VERSION __HP_aCC +# endif +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) << 16) | ((min) << 8) | (pat) ) + +#elif defined(_SGI_COMPILER_VERSION) || \ + (defined(_COMPILER_VERSION) && defined(__sgi) && !defined(__GNUC__)) /* 7.3.0 and earlier lack _SGI_COMPILER_VERSION */ +# define PLATFORM_COMPILER_SGI 1 +# define PLATFORM_COMPILER_FAMILYNAME SGI +# define PLATFORM_COMPILER_FAMILYID 9 +# ifdef __cplusplus +# define PLATFORM_COMPILER_SGI_CXX 1 +# else +# define PLATFORM_COMPILER_SGI_C 1 +# endif +# if defined(_SGI_COMPILER_VERSION) && _SGI_COMPILER_VERSION > 0 +# define PLATFORM_COMPILER_VERSION _SGI_COMPILER_VERSION +# elif defined(_COMPILER_VERSION) && _COMPILER_VERSION > 0 +# define PLATFORM_COMPILER_VERSION _COMPILER_VERSION +# endif +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) << 8) | ((min) << 4) | (pat) ) + +#elif defined(_CRAYC) +# define PLATFORM_COMPILER_CRAY 1 +# define PLATFORM_COMPILER_FAMILYNAME CRAY +# define PLATFORM_COMPILER_FAMILYID 10 +# ifdef __cplusplus +# define PLATFORM_COMPILER_CRAY_CXX 1 +# else +# define PLATFORM_COMPILER_CRAY_C 1 +# endif +# if defined(_RELEASE) && defined(_RELEASE_MINOR) /* X1 */ +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(_RELEASE,_RELEASE_MINOR,0) +# elif defined(_RELEASE) /* T3E */ +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(_RELEASE,0,0) +# endif +# ifdef _RELEASE_STRING /* X1 */ +# define PLATFORM_COMPILER_VERSION_STR _RELEASE_STRING +# endif + +#elif defined(__KCC) +# define PLATFORM_COMPILER_KAI 1 +# define PLATFORM_COMPILER_FAMILYNAME KAI +# define PLATFORM_COMPILER_FAMILYID 11 +# ifdef __cplusplus +# define PLATFORM_COMPILER_KAI_CXX 1 +# else +# define PLATFORM_COMPILER_KAI_C 1 +# endif + +#elif defined(__MTA__) +# define PLATFORM_COMPILER_MTA 1 +# define PLATFORM_COMPILER_FAMILYNAME MTA +# define PLATFORM_COMPILER_FAMILYID 12 +# ifdef __cplusplus +# define 
PLATFORM_COMPILER_MTA_CXX 1 +# else +# define PLATFORM_COMPILER_MTA_C 1 +# endif + +#elif defined(_SX) +# define PLATFORM_COMPILER_NECSX 1 +# define PLATFORM_COMPILER_FAMILYNAME NECSX +# define PLATFORM_COMPILER_FAMILYID 13 +# ifdef __cplusplus +# define PLATFORM_COMPILER_NECSX_CXX 1 +# else +# define PLATFORM_COMPILER_NECSX_C 1 +# endif + +#elif defined(_MSC_VER) +# define PLATFORM_COMPILER_MICROSOFT 1 +# define PLATFORM_COMPILER_FAMILYNAME MICROSOFT +# define PLATFORM_COMPILER_FAMILYID 14 +# ifdef __cplusplus +# define PLATFORM_COMPILER_MICROSOFT_CXX 1 +# else +# define PLATFORM_COMPILER_MICROSOFT_C 1 +# endif +# define PLATFORM_COMPILER_VERSION _MSC_VER + +#elif defined(__TINYC__) +# define PLATFORM_COMPILER_TINY 1 +# define PLATFORM_COMPILER_FAMILYNAME TINY +# define PLATFORM_COMPILER_FAMILYID 15 +# ifdef __cplusplus +# define PLATFORM_COMPILER_TINY_CXX 1 +# else +# define PLATFORM_COMPILER_TINY_C 1 +# endif + +#elif defined(__LCC__) +# define PLATFORM_COMPILER_LCC 1 +# define PLATFORM_COMPILER_FAMILYNAME LCC +# define PLATFORM_COMPILER_FAMILYID 16 +# ifdef __cplusplus +# define PLATFORM_COMPILER_LCC_CXX 1 +# else +# define PLATFORM_COMPILER_LCC_C 1 +# endif + +#else /* unknown compiler */ +# define PLATFORM_COMPILER_UNKNOWN 1 +#endif + +/* this stanza comes last, because many vendor compilers lie and claim + to be GNU C for compatibility reasons and/or because they share a frontend */ +#if defined(__GNUC__) +# undef PLATFORM_COMPILER_UNKNOWN +# ifndef PLATFORM_COMPILER_FAMILYID +# define PLATFORM_COMPILER_GNU 1 +# define PLATFORM_COMPILER_FAMILYNAME GNU +# define PLATFORM_COMPILER_FAMILYID 1 +# ifdef __cplusplus +# define PLATFORM_COMPILER_GNU_CXX 1 +# else +# define PLATFORM_COMPILER_GNU_C 1 +# endif +# if defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) +# elif defined(__GNUC_MINOR__) /* older versions of egcs lack __GNUC_PATCHLEVEL__ */ +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(__GNUC__,__GNUC_MINOR__,0) +# else +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(__GNUC__,0,0) +# endif +# define PLATFORM_COMPILER_VERSION_STR __PLATFORM_COMPILER_GNU_VERSION_STR +# else +# define _PLATFORM_COMPILER_GNU_VERSION_STR __PLATFORM_COMPILER_GNU_VERSION_STR +# endif + /* gather any advertised GNU version number info, even for non-gcc compilers */ +# if defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) +# define __PLATFORM_COMPILER_GNU_VERSION_STR \ + _STRINGIFY(__GNUC__)"."_STRINGIFY(__GNUC_MINOR__)"."_STRINGIFY(__GNUC_PATCHLEVEL__) +# elif defined(__GNUC_MINOR__) +# define __PLATFORM_COMPILER_GNU_VERSION_STR \ + _STRINGIFY(__GNUC__)"."_STRINGIFY(__GNUC_MINOR__)".?" +# else +# define __PLATFORM_COMPILER_GNU_VERSION_STR \ + _STRINGIFY(__GNUC__)".?.?" 
+# endif +#elif defined(PLATFORM_COMPILER_UNKNOWN) /* unknown compiler */ +# define PLATFORM_COMPILER_FAMILYNAME UNKNOWN +# define PLATFORM_COMPILER_FAMILYID 0 +#endif + +/* Default Values */ +#ifndef PLATFORM_COMPILER_VERSION +# define PLATFORM_COMPILER_VERSION 0 /* don't know */ +#endif + +#ifndef PLATFORM_COMPILER_VERSION_STR +# define PLATFORM_COMPILER_VERSION_STR _STRINGIFY(PLATFORM_COMPILER_VERSION) +#endif + +#ifndef PLATFORM_COMPILER_VERSION_INT +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + (((maj) << 16) | ((min) << 8) | (pat)) +#endif + + +#endif /* MPI_PORTABLE_PLATFORM_H */ diff --git a/oshmem/mca/atomic/Makefile.am b/oshmem/mca/atomic/Makefile.am new file mode 100644 index 0000000000..9a1f7c7c95 --- /dev/null +++ b/oshmem/mca/atomic/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_atomic.la +libmca_atomic_la_SOURCES = + +# header setup +nobase_oshmem_HEADERS = +nobase_nodist_oshmem_HEADERS = + +# local files +headers = atomic.h +libmca_atomic_la_SOURCES += $(headers) $(nodist_headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +nobase_oshmem_HEADERS += $(headers) +nobase_nodist_oshmem_HEADERS += $(nodist_headers) +oshmemdir = $(includedir)/oshmem/oshmem/mca/atomic +else +oshmemdir = $(includedir) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/oshmem/mca/atomic/atomic.h b/oshmem/mca/atomic/atomic.h new file mode 100644 index 0000000000..5d4195f2b3 --- /dev/null +++ b/oshmem/mca/atomic/atomic.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Atomic Operations Interface + * + */ + +#ifndef OSHMEM_MCA_ATOMIC_H +#define OSHMEM_MCA_ATOMIC_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "opal/util/output.h" +#include "mpi.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "oshmem/mca/atomic/base/base.h" + +BEGIN_C_DECLS + +/* ******************************************************************** */ + +struct oshmem_op_t; + +/* ******************************************************************** */ + +typedef int (*mca_atomic_base_component_init_fn_t)(bool enable_progress_threads, + bool enable_threads); + +typedef int (*mca_atomic_base_component_finalize_fn_t)(void); + +typedef struct mca_atomic_base_module_1_0_0_t* (*mca_atomic_base_component_query_fn_t)(int *priority); + +/* ******************************************************************** */ + +/** + * Atomic component interface + * + * Component interface for the atomic framework. A public + * instance of this structure, called + * mca_atomic_[component_name]_component, must exist in any atomic + * component. 
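+ * For example (editor's illustration), a component named "basic"
+ * would export a public symbol mca_atomic_basic_component of this
+ * type.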
+ */ +struct mca_atomic_base_component_1_0_0_t { + /** Base component description */ + mca_base_component_t atomic_version; + /** Base component data block */ + mca_base_component_data_t atomic_data; + + /** Component initialization function */ + mca_atomic_base_component_init_fn_t atomic_init; + mca_atomic_base_component_finalize_fn_t atomic_finalize; + mca_atomic_base_component_query_fn_t atomic_query; +}; +typedef struct mca_atomic_base_component_1_0_0_t mca_atomic_base_component_1_0_0_t; + +/** Per guidence in mca.h, use the unversioned struct name if you just + want to always keep up with the most recent version of the + interace. */ +typedef struct mca_atomic_base_component_1_0_0_t mca_atomic_base_component_t; + +/** + * Atomic module interface + * + */ +struct mca_atomic_base_module_1_0_0_t { + /** Collective modules all inherit from opal_object */ + opal_object_t super; + + /* Collective function pointers */ + int (*atomic_fadd)(void *target, + void *prev, + const void *value, + size_t nlong, + int pe, + struct oshmem_op_t *op); + int (*atomic_cswap)(void *target, + void *prev, + const void *cond, + const void *value, + size_t nlong, + int pe); +}; +typedef struct mca_atomic_base_module_1_0_0_t mca_atomic_base_module_1_0_0_t; + +/** Per guidence in mca.h, use the unversioned struct name if you just + want to always keep up with the most recent version of the + interace. */ +typedef struct mca_atomic_base_module_1_0_0_t mca_atomic_base_module_t; +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_atomic_base_module_t); + +/* ******************************************************************** */ + +/* + * Macro for use in components + */ +#define MCA_ATOMIC_BASE_VERSION_2_0_0 \ + MCA_BASE_VERSION_2_0_0, \ + "atomic", 1, 0, 0 + +/* ******************************************************************** */ + +OSHMEM_DECLSPEC extern mca_atomic_base_component_t mca_atomic_base_selected_component; +OSHMEM_DECLSPEC extern mca_atomic_base_module_t mca_atomic; +#define MCA_ATOMIC_CALL(a) mca_atomic.atomic_ ## a + +END_C_DECLS + +#endif /* OSHMEM_MCA_ATOMIC_H */ diff --git a/oshmem/mca/atomic/base/Makefile.am b/oshmem/mca/atomic/base/Makefile.am new file mode 100644 index 0000000000..b27f0609eb --- /dev/null +++ b/oshmem/mca/atomic/base/Makefile.am @@ -0,0 +1,19 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +headers += \ + base/base.h + +libmca_atomic_la_SOURCES += \ + base/atomic_base_frame.c \ + base/atomic_base_available.c \ + base/atomic_base_select.c diff --git a/oshmem/mca/atomic/base/atomic_base_available.c b/oshmem/mca/atomic/base/atomic_base_available.c new file mode 100644 index 0000000000..6209c63079 --- /dev/null +++ b/oshmem/mca/atomic/base/atomic_base_available.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "oshmem_config.h"
+
+#include "orte/util/show_help.h"
+
+#include "opal/class/opal_list.h"
+#include "opal/util/output.h"
+#include "opal/mca/mca.h"
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_component_repository.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+
+/*
+ * Private functions
+ */
+static int init_query(const mca_base_component_t * component,
+                      bool enable_progress_threads,
+                      bool enable_threads);
+
+/*
+ * Scan down the list of successfully opened components and query each of
+ * them (the opened list will be one or more components.  If the user
+ * requested a specific component, it will be the only component in the
+ * opened list).  Create and populate the available list of all
+ * components that indicate that they want to be considered for selection.
+ * Close all components that do not want to be considered for selection,
+ * and destroy the opened list.
+ *
+ * Also find the basic component while we're doing all of this, and save
+ * it in a global variable so that we can find it easily later (e.g.,
+ * during scope selection).
+ */
+int mca_atomic_base_find_available(bool enable_progress_threads,
+                                   bool enable_threads)
+{
+    mca_base_component_list_item_t *cli, *next;
+    const mca_base_component_t *component;
+
+    OPAL_LIST_FOREACH_SAFE(cli, next, &oshmem_atomic_base_framework.framework_components, mca_base_component_list_item_t) {
+        component = cli->cli_component;
+
+        /* Call a subroutine to do the work, because the component may
+           represent different versions of the atomic MCA. */
+
+        if (OSHMEM_SUCCESS != init_query(component, enable_progress_threads,
+                                         enable_threads)) {
+            /* If the component doesn't want to run, then close it.
+               Now close it out and release it from the DSO repository (if it's there). */
+            opal_list_remove_item(&oshmem_atomic_base_framework.framework_components, &cli->super);
+            mca_base_component_close(component, oshmem_atomic_base_framework.framework_output);
+            OBJ_RELEASE(cli);
+        }
+    }
+
+    /* If we have no atomic components available, it's an error.
+       Thanks for playing! */
+
+    if (opal_list_get_size(&oshmem_atomic_base_framework.framework_components) == 0) {
+        ATOMIC_VERBOSE(10,
+                       "atomic:find_available: no components available!");
+        return OSHMEM_ERROR;
+    }
+
+    /* All done */
+
+    return mca_atomic_base_select();
+}
+
+/*
+ * Query a component, see if it wants to run at all.  If it does, save
+ * some information.  If it doesn't, close it.
+ */
+static int init_query(const mca_base_component_t * component,
+                      bool enable_progress_threads,
+                      bool enable_threads)
+{
+    int ret;
+
+    ATOMIC_VERBOSE(10,
+                   "atomic:find_available: querying atomic component %s",
+                   component->mca_component_name);
+
+    /* This component has already been successfully opened.  So now
+       query it.
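+
+       A component whose init returns anything other than OSHMEM_SUCCESS
+       is closed by the caller above and dropped from the framework's
+       component list.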
+     */
+
+    if (1 == component->mca_type_major_version
+        && 0 == component->mca_type_minor_version
+        && 0 == component->mca_type_release_version) {
+
+        mca_atomic_base_component_t *atomic =
+            (mca_atomic_base_component_t *) component;
+
+        ret = atomic->atomic_init(enable_progress_threads, enable_threads);
+    } else {
+        /* Unrecognized atomic API version */
+
+        ATOMIC_VERBOSE(10,
+                       "atomic:find_available: unrecognized atomic API version (%d.%d.%d, ignored)",
+                       component->mca_type_major_version,
+                       component->mca_type_minor_version,
+                       component->mca_type_release_version);
+        return OSHMEM_ERROR;
+    }
+
+    /* Query done -- look at the return value to see what happened */
+
+    if (OSHMEM_SUCCESS != ret) {
+        ATOMIC_VERBOSE(10,
+                       "atomic:find_available: atomic component %s is not available",
+                       component->mca_component_name);
+        if (NULL != component->mca_close_component) {
+            component->mca_close_component();
+        }
+    } else {
+        ATOMIC_VERBOSE(10,
+                       "atomic:find_available: atomic component %s is available",
+                       component->mca_component_name);
+    }
+
+    /* All done */
+
+    return ret;
+}
diff --git a/oshmem/mca/atomic/base/atomic_base_frame.c b/oshmem/mca/atomic/base/atomic_base_frame.c
new file mode 100644
index 0000000000..d152468563
--- /dev/null
+++ b/oshmem/mca/atomic/base/atomic_base_frame.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <stdio.h>
+
+#include "oshmem_config.h"
+
+#include "oshmem/constants.h"
+
+#include "opal/mca/mca.h"
+#include "opal/util/output.h"
+#include "opal/mca/base/base.h"
+
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+
+/*
+ * The following file was created by configure.  It contains extern
+ * statements and the definition of an array of pointers to each
+ * component's public mca_base_component_t struct.
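+ * For a build in which, say, the basic and mxm components are linked
+ * in statically, the generated file would look roughly like this
+ * (illustrative sketch, not the verbatim configure output):
+ *
+ *   extern const mca_base_component_t mca_atomic_basic_component;
+ *   extern const mca_base_component_t mca_atomic_mxm_component;
+ *
+ *   static const mca_base_component_t *mca_atomic_base_static_components[] = {
+ *       &mca_atomic_basic_component,
+ *       &mca_atomic_mxm_component,
+ *       NULL
+ *   };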
+ */
+
+#include "oshmem/mca/atomic/base/static-components.h"
+
+/*
+ * Global variables, most of which are loaded by back-ends of MCA
+ * variables
+ */
+
+/*
+ * Ensure all function pointers are NULL'ed out to start with
+ */
+static void atomic_base_module_construct(mca_atomic_base_module_t *m)
+{
+    /* Atomic function pointers */
+    m->atomic_fadd = NULL;
+    m->atomic_cswap = NULL;
+}
+
+OBJ_CLASS_INSTANCE(mca_atomic_base_module_t, opal_object_t,
+                   atomic_base_module_construct, NULL);
+
+static int mca_atomic_base_register(mca_base_register_flag_t flags)
+{
+    return OSHMEM_SUCCESS;
+}
+
+static int mca_atomic_base_close(void)
+{
+    mca_base_component_list_item_t *cli, *next;
+    const mca_base_component_t *component;
+
+    OPAL_LIST_FOREACH_SAFE(cli, next, &oshmem_atomic_base_framework.framework_components, mca_base_component_list_item_t) {
+        component = cli->cli_component;
+        mca_atomic_base_component_t *atomic =
+            (mca_atomic_base_component_t *) component;
+
+        if (NULL != atomic->atomic_finalize) {
+            atomic->atomic_finalize();
+        }
+    }
+
+    /* Close all remaining available components */
+    return mca_base_framework_components_close(&oshmem_atomic_base_framework, NULL);
+}
+
+static int mca_atomic_base_open(mca_base_open_flag_t flags)
+{
+    /* Open up all available components */
+    if (OPAL_SUCCESS !=
+        mca_base_framework_components_open(&oshmem_atomic_base_framework, flags)) {
+        return OSHMEM_ERROR;
+    }
+
+    return OSHMEM_SUCCESS;
+}
+
+MCA_BASE_FRAMEWORK_DECLARE(oshmem, atomic,
+                           "OSHMEM ATOMIC",
+                           mca_atomic_base_register,
+                           mca_atomic_base_open,
+                           mca_atomic_base_close,
+                           mca_atomic_base_static_components,
+                           0);
diff --git a/oshmem/mca/atomic/base/atomic_base_select.c b/oshmem/mca/atomic/base/atomic_base_select.c
new file mode 100644
index 0000000000..8e344986e3
--- /dev/null
+++ b/oshmem/mca/atomic/base/atomic_base_select.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "oshmem/constants.h"
+
+#include "opal/class/opal_list.h"
+#include "opal/util/output.h"
+#include "orte/util/show_help.h"
+#include "opal/mca/mca.h"
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_component_repository.h"
+
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+
+/*
+ * Global variables, most of which are loaded by back-ends of MCA
+ * variables
+ */
+mca_atomic_base_module_t mca_atomic;
+
+/*
+ * Local types
+ */
+struct avail_com_t {
+    opal_list_item_t super;
+
+    int ac_priority;
+    mca_atomic_base_module_t *ac_module;
+};
+typedef struct avail_com_t avail_com_t;
+
+/*
+ * Local functions
+ */
+static opal_list_t *check_components(opal_list_t * components);
+static int check_one_component(const mca_base_component_t * component,
+                               mca_atomic_base_module_1_0_0_t ** module);
+
+static int query(const mca_base_component_t * component,
+                 int *priority,
+                 mca_atomic_base_module_1_0_0_t ** module);
+
+static int query_1_0_0(const mca_atomic_base_component_1_0_0_t * atomic_component,
+                       int *priority,
+                       mca_atomic_base_module_1_0_0_t ** module);
+
+/*
+ * Stuff for the OBJ interface
+ */
+static OBJ_CLASS_INSTANCE(avail_com_t, opal_list_item_t, NULL, NULL);
+
+/*
+ * This function is called during initialization.  It is used to select
+ * the atomic component that will be active.
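+ *
+ * Selection is priority driven: every component that returned a module
+ * is sorted by the priority it reported, and the function pointers of
+ * the winner are copied into the global mca_atomic.  With the
+ * components added in this patch, for example, atomic/mxm (default
+ * priority 100, only usable on top of spml:ikrit) outranks
+ * atomic/basic (default priority 75).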
+ */ +int mca_atomic_base_select(void) +{ + opal_list_t *selectable; + opal_list_item_t *item; + + /* Announce */ + ATOMIC_VERBOSE(10, + "atomic:base:atomic_select: Checking all available modules"); + selectable = check_components(&oshmem_atomic_base_framework.framework_components); + + /* Upon return from the above, the modules list will contain the + list of modules that returned (priority >= 0). If we have no + atomic modules available, then print error and return. */ + if (NULL == selectable) { + /* There's no modules available */ + return OSHMEM_ERROR; + } + + /* do the selection loop */ + for (item = opal_list_remove_first(selectable); NULL != item; item = + opal_list_remove_first(selectable)) { + avail_com_t *avail = (avail_com_t *) item; + + /* Set module having the highest priority */ + memcpy(&mca_atomic, avail->ac_module, sizeof(mca_atomic)); + + OBJ_RELEASE(avail->ac_module); + OBJ_RELEASE(avail); + /* check correctness */ + if (!(mca_atomic.atomic_fadd) || !(mca_atomic.atomic_cswap)) { + return OSHMEM_ERR_NOT_FOUND; + } + } + + /* Done with the list from the check_components() call so release it. */ + OBJ_RELEASE(selectable); + + return OSHMEM_SUCCESS; +} + +static int avail_com_compare (opal_list_item_t **a, + opal_list_item_t **b) +{ + avail_com_t *acom = (avail_com_t *) *a; + avail_com_t *bcom = (avail_com_t *) *b; + + if (acom->ac_priority > bcom->ac_priority) { + return 1; + } else if (acom->ac_priority < bcom->ac_priority) { + return -1; + } + + return 0; +} + +/* + * For each module in the list, check and see if it wants to run, and + * do the resulting priority comparison. Make a list of modules to be + * only those who returned that they want to run, and put them in + * priority order. + */ +static opal_list_t *check_components(opal_list_t *components) +{ + int priority; + const mca_base_component_t *component; + mca_base_component_list_item_t *cli; + mca_atomic_base_module_1_0_0_t *module; + opal_list_t *selectable; + avail_com_t *avail; + + /* Make a list of the components that query successfully */ + selectable = OBJ_NEW(opal_list_t); + + /* Scan through the list of components */ + OPAL_LIST_FOREACH(cli, &oshmem_atomic_base_framework.framework_components, mca_base_component_list_item_t) { + component = cli->cli_component; + + priority = check_one_component(component, &module); + if (priority >= 0) { + /* We have a component that indicated that it wants to run + by giving us a module */ + avail = OBJ_NEW(avail_com_t); + avail->ac_priority = priority; + avail->ac_module = module; + + opal_list_append(selectable, &avail->super); + } + } + + /* If we didn't find any available components, return an error */ + if (0 == opal_list_get_size(selectable)) { + OBJ_RELEASE(selectable); + return NULL; + } + + /* Put this list in priority order */ + opal_list_sort(selectable, avail_com_compare); + + /* All done */ + return selectable; +} + +/* + * Check a single component + */ +static int check_one_component(const mca_base_component_t *component, + mca_atomic_base_module_1_0_0_t **module) +{ + int err; + int priority = -1; + + err = query(component, &priority, module); + + if (OSHMEM_SUCCESS == err) { + priority = (priority < 100) ? 
priority : 100; + ATOMIC_VERBOSE(10, + "atomic:base:atomic_select: component available: %s, priority: %d", + component->mca_component_name, priority); + + } else { + priority = -1; + ATOMIC_VERBOSE(10, + "atomic:base:atomic_select: component not available: %s", + component->mca_component_name); + } + + return priority; +} + +/************************************************************************** + * Query functions + **************************************************************************/ + +/* + * Take any version of a atomic module, query it, and return the right + * module struct + */ +static int query(const mca_base_component_t *component, + int *priority, + mca_atomic_base_module_1_0_0_t **module) +{ + *module = NULL; + if (1 == component->mca_type_major_version + && 0 == component->mca_type_minor_version + && 0 == component->mca_type_release_version) { + const mca_atomic_base_component_1_0_0_t *atomic100 = + (mca_atomic_base_component_1_0_0_t *) component; + + return query_1_0_0(atomic100, priority, module); + } + + /* Unknown atomic API version -- return error */ + + return OSHMEM_ERROR; +} + +static int query_1_0_0(const mca_atomic_base_component_1_0_0_t *component, + int *priority, + mca_atomic_base_module_1_0_0_t **module) +{ + mca_atomic_base_module_1_0_0_t *ret; + + /* There's currently no need for conversion */ + + ret = component->atomic_query(priority); + if (NULL != ret) { + *module = ret; + return OSHMEM_SUCCESS; + } + + return OSHMEM_ERROR; +} diff --git a/oshmem/mca/atomic/base/base.h b/oshmem/mca/atomic/base/base.h new file mode 100644 index 0000000000..c751acd514 --- /dev/null +++ b/oshmem/mca/atomic/base/base.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_ATOMIC_BASE_H +#define MCA_ATOMIC_BASE_H + +#include "oshmem_config.h" + +#include "oshmem/mca/atomic/atomic.h" +#include "opal/class/opal_list.h" + +/* + * Global functions for MCA overall atomic open and close + */ + +BEGIN_C_DECLS + +int mca_atomic_base_find_available(bool enable_progress_threads, + bool enable_threads); + +int mca_atomic_base_select(void); + +/* + * MCA framework + */ +OSHMEM_DECLSPEC extern mca_base_framework_t oshmem_atomic_base_framework; + +/* ******************************************************************** */ +#ifdef __BASE_FILE__ +#define __ATOMIC_FILE__ __BASE_FILE__ +#else +#define __ATOMIC_FILE__ __FILE__ +#endif + +#define ATOMIC_VERBOSE(level, format, ...) \ + opal_output_verbose(level, oshmem_atomic_base_framework.framework_output, "%s:%d - %s() " format, \ + __ATOMIC_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define ATOMIC_ERROR(format, ... ) \ + opal_output_verbose(0, oshmem_atomic_base_framework.framework_output, "Error: %s:%d - %s() " format, \ + __ATOMIC_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +END_C_DECLS + +#endif /* MCA_ATOMIC_BASE_H */ diff --git a/oshmem/mca/atomic/basic/Makefile.am b/oshmem/mca/atomic/basic/Makefile.am new file mode 100644 index 0000000000..b5131f540d --- /dev/null +++ b/oshmem/mca/atomic/basic/Makefile.am @@ -0,0 +1,40 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AM_CFLAGS = $(OSHMEM_CFLAGS)
+
+sources = \
+        atomic_basic.h \
+        atomic_basic_module.c \
+        atomic_basic_component.c \
+        atomic_basic_fadd.c \
+        atomic_basic_cswap.c
+
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_oshmem_atomic_basic_DSO
+component_noinst =
+component_install = mca_atomic_basic.la
+else
+component_noinst = libmca_atomic_basic.la
+component_install =
+endif
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_atomic_basic_la_SOURCES = $(sources)
+mca_atomic_basic_la_LDFLAGS = -module -avoid-version
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_atomic_basic_la_SOURCES = $(sources)
+libmca_atomic_basic_la_LDFLAGS = -module -avoid-version
diff --git a/oshmem/mca/atomic/basic/atomic_basic.h b/oshmem/mca/atomic/basic/atomic_basic.h
new file mode 100644
index 0000000000..39dc9f72e1
--- /dev/null
+++ b/oshmem/mca/atomic/basic/atomic_basic.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_ATOMIC_BASIC_H
+#define MCA_ATOMIC_BASIC_H
+
+#include "oshmem_config.h"
+
+#include "opal/mca/mca.h"
+#include "oshmem/mca/atomic/atomic.h"
+
+BEGIN_C_DECLS
+
+/* Globally exported variables */
+
+OSHMEM_MODULE_DECLSPEC extern mca_atomic_base_component_1_0_0_t
+mca_atomic_basic_component;
+
+extern int mca_atomic_basic_priority_param;
+
+OSHMEM_DECLSPEC void atomic_basic_lock(int pe);
+OSHMEM_DECLSPEC void atomic_basic_unlock(int pe);
+
+/* API functions */
+
+int mca_atomic_basic_init(bool enable_progress_threads, bool enable_threads);
+int mca_atomic_basic_finalize(void);
+mca_atomic_base_module_t*
+mca_atomic_basic_query(int *priority);
+
+int mca_atomic_basic_fadd(void *target,
+                          void *prev,
+                          const void *value,
+                          size_t nlong,
+                          int pe,
+                          struct oshmem_op_t *op);
+int mca_atomic_basic_cswap(void *target,
+                           void *prev,
+                           const void *cond,
+                           const void *value,
+                           size_t nlong,
+                           int pe);
+
+struct mca_atomic_basic_module_t {
+    mca_atomic_base_module_t super;
+};
+typedef struct mca_atomic_basic_module_t mca_atomic_basic_module_t;
+OBJ_CLASS_DECLARATION(mca_atomic_basic_module_t);
+
+END_C_DECLS
+
+#endif /* MCA_ATOMIC_BASIC_H */
diff --git a/oshmem/mca/atomic/basic/atomic_basic_component.c b/oshmem/mca/atomic/basic/atomic_basic_component.c
new file mode 100644
index 0000000000..3f47b4e1ff
--- /dev/null
+++ b/oshmem/mca/atomic/basic/atomic_basic_component.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+#include "atomic_basic.h"
+
+/*
+ * Public string showing the atomic basic component version number
+ */
+const char *mca_atomic_basic_component_version_string =
+"Open SHMEM basic atomic MCA component version " OSHMEM_VERSION;
+
+/*
+ * Global variable
+ */
+int mca_atomic_basic_priority_param = -1;
+
+/*
+ * Local function
+ */
+static int __basic_open(void);
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+
+mca_atomic_base_component_t mca_atomic_basic_component = {
+
+    /* First, the mca_component_t struct containing meta information
+       about the component itself */
+
+    {
+        MCA_ATOMIC_BASE_VERSION_2_0_0,
+
+        /* Component name and version */
+        "basic",
+        OSHMEM_MAJOR_VERSION,
+        OSHMEM_MINOR_VERSION,
+        OSHMEM_RELEASE_VERSION,
+
+        /* Component open and close functions */
+        __basic_open,
+        NULL
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    /* Initialization / querying functions */
+
+    mca_atomic_basic_init,
+    mca_atomic_basic_finalize,
+    mca_atomic_basic_query
+};
+
+static int __basic_open(void)
+{
+    mca_atomic_basic_priority_param = 75;
+    (void) mca_base_component_var_register(&mca_atomic_basic_component.atomic_version,
+                                           "priority",
+                                           "Priority of the atomic:basic component",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_atomic_basic_priority_param);
+
+    return OSHMEM_SUCCESS;
+}
+
+OBJ_CLASS_INSTANCE(mca_atomic_basic_module_t,
+                   mca_atomic_base_module_t,
+                   NULL,
+                   NULL);
diff --git a/oshmem/mca/atomic/basic/atomic_basic_cswap.c b/oshmem/mca/atomic/basic/atomic_basic_cswap.c
new file mode 100644
index 0000000000..bb28226586
--- /dev/null
+++ b/oshmem/mca/atomic/basic/atomic_basic_cswap.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include <stdio.h>
+#include <string.h>
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+#include "atomic_basic.h"
+
+int mca_atomic_basic_cswap(void *target,
+                           void *prev,
+                           const void *cond,
+                           const void *value,
+                           size_t nlong,
+                           int pe)
+{
+    int rc = OSHMEM_SUCCESS;
+
+    if (!prev) {
+        rc = OSHMEM_ERROR;
+    }
+
+    if (rc == OSHMEM_SUCCESS) {
+        atomic_basic_lock(pe);
+
+        rc = MCA_SPML_CALL(get(target, nlong, prev, pe));
+
+        if ((rc == OSHMEM_SUCCESS) && (!cond || !memcmp(prev, cond, nlong))) {
+            rc = MCA_SPML_CALL(put(target, nlong, (void*)value, pe));
+            shmem_quiet();
+        }
+
+        atomic_basic_unlock(pe);
+    }
+
+    return rc;
+}
diff --git a/oshmem/mca/atomic/basic/atomic_basic_fadd.c b/oshmem/mca/atomic/basic/atomic_basic_fadd.c
new file mode 100644
index 0000000000..6825b400e5
--- /dev/null
+++ b/oshmem/mca/atomic/basic/atomic_basic_fadd.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include + +#include "oshmem/constants.h" +#include "oshmem/op/op.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" +#include "atomic_basic.h" + +int mca_atomic_basic_fadd(void *target, + void *prev, + const void *value, + size_t nlong, + int pe, + struct oshmem_op_t *op) +{ + int rc = OSHMEM_SUCCESS; + + if (!target || !value) { + rc = OSHMEM_ERROR; + } + + if (rc == OSHMEM_SUCCESS) { + long long temp_value = 0; + + atomic_basic_lock(pe); + + rc = MCA_SPML_CALL(get(target, nlong, (void*)&temp_value, pe)); + + if (prev) + memcpy(prev, (void*) &temp_value, nlong); + + op->o_func.c_fn((void*) value, + (void*) &temp_value, + nlong / op->dt_size); + + if (rc == OSHMEM_SUCCESS) { + rc = MCA_SPML_CALL(put(target, nlong, (void*)&temp_value, pe)); + shmem_quiet(); + } + + atomic_basic_unlock(pe); + } + + return rc; +} diff --git a/oshmem/mca/atomic/basic/atomic_basic_module.c b/oshmem/mca/atomic/basic/atomic_basic_module.c new file mode 100644 index 0000000000..2420e9931a --- /dev/null +++ b/oshmem/mca/atomic/basic/atomic_basic_module.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include + +#include "oshmem/constants.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/proc/proc.h" +#include "atomic_basic.h" + +static char *atomic_lock_sync; +static int *atomic_lock_turn; +static char *local_lock_sync; +static int *local_lock_turn; + +enum { + ATOMIC_LOCK_IDLE = 0, + ATOMIC_LOCK_WAITING = 1, + ATOMIC_LOCK_ACTIVE = 2 +}; + +/* + * Initial query function that is invoked during initialization, allowing + * this module to indicate what level of thread support it provides. 
+ */ +int mca_atomic_basic_init(bool enable_progress_threads, bool enable_threads) +{ + int rc = OSHMEM_SUCCESS; + void* ptr = NULL; + int num_pe = oshmem_num_procs(); + + rc = MCA_MEMHEAP_CALL(private_alloc((num_pe * sizeof(char)), &ptr)); + if (rc == OSHMEM_SUCCESS) { + atomic_lock_sync = (char*) ptr; + memset(atomic_lock_sync, ATOMIC_LOCK_IDLE, sizeof(char) * num_pe); + + rc = MCA_MEMHEAP_CALL(private_alloc(sizeof(int), &ptr)); + if (rc == OSHMEM_SUCCESS) { + atomic_lock_turn = (int*) ptr; + *atomic_lock_turn = 0; + if (rc == OSHMEM_SUCCESS) { + local_lock_sync = (char*) malloc(num_pe * sizeof(char)); + local_lock_turn = (int*) malloc(sizeof(int)); + if (!local_lock_sync || !local_lock_turn) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } else { + memcpy((void*) local_lock_sync, + (void*) atomic_lock_sync, + sizeof(char) * num_pe); + *local_lock_turn = *atomic_lock_turn; + } + } + } + } + + return rc; +} + +int mca_atomic_basic_finalize(void) +{ + void* ptr = NULL; + + ptr = (void*) atomic_lock_sync; + MCA_MEMHEAP_CALL(private_free(ptr)); + atomic_lock_sync = NULL; + + ptr = (void*) atomic_lock_turn; + MCA_MEMHEAP_CALL(private_free(ptr)); + atomic_lock_turn = NULL; + + if (local_lock_sync) { + free((void*) local_lock_sync); + local_lock_sync = NULL; + } + + if (local_lock_turn) { + free((void*) local_lock_turn); + local_lock_turn = NULL; + } + + return OSHMEM_SUCCESS; +} + +mca_atomic_base_module_t * +mca_atomic_basic_query(int *priority) +{ + mca_atomic_basic_module_t *module; + + *priority = mca_atomic_basic_priority_param; + + module = OBJ_NEW(mca_atomic_basic_module_t); + if (module) { + module->super.atomic_fadd = mca_atomic_basic_fadd; + module->super.atomic_cswap = mca_atomic_basic_cswap; + return &(module->super); + } + + return NULL ; +} + +void atomic_basic_lock(int pe) +{ + int index = -1; + int me = oshmem_my_proc_id(); + int num_pe = oshmem_num_procs(); + char lock_required = ATOMIC_LOCK_WAITING; + char lock_active = ATOMIC_LOCK_ACTIVE; + int root_pe = pe; + + do { + /* announce that we need the resource */ + do { + MCA_SPML_CALL(put((void*)(atomic_lock_sync + me), sizeof(lock_required), (void*)&lock_required, root_pe)); + MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe)); + } while (local_lock_sync[me] != lock_required); + + MCA_SPML_CALL(get((void*)atomic_lock_turn, sizeof(index), (void*)&index, root_pe)); + while (index != me) { + if (local_lock_sync[index] != ATOMIC_LOCK_IDLE) { + MCA_SPML_CALL(get((void*)atomic_lock_turn, sizeof(index), (void*)&index, root_pe)); + MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe)); + } else { + index = (index + 1) % num_pe; + } + } + + /* now tentatively claim the resource */ + do { + MCA_SPML_CALL(put((void*)(atomic_lock_sync + me), sizeof(lock_active), (void*)&lock_active, root_pe)); + MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe)); + } while (local_lock_sync[me] != lock_active); + + index = 0; + while ((index < num_pe) + && ((index == me) + || (local_lock_sync[index] != ATOMIC_LOCK_ACTIVE))) { + index = index + 1; + } + + MCA_SPML_CALL(get((void*)atomic_lock_turn, sizeof(*atomic_lock_turn), (void*)local_lock_turn, root_pe)); + } while (!((index >= num_pe) + && ((*local_lock_turn == me) + || (local_lock_sync[*local_lock_turn] == ATOMIC_LOCK_IDLE)))); + + MCA_SPML_CALL(put((void*)atomic_lock_turn, sizeof(me), (void*)&me, root_pe)); +} + +void 
atomic_basic_unlock(int pe)
+{
+    int index = -1;
+    int me = oshmem_my_proc_id();
+    int num_pe = oshmem_num_procs();
+    char lock_idle = ATOMIC_LOCK_IDLE;
+    int root_pe = pe;
+
+    MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe));
+    MCA_SPML_CALL(get((void*)atomic_lock_turn, sizeof(index), (void*)&index, root_pe));
+
+    do {
+        index = (index + 1) % num_pe;
+    } while (local_lock_sync[index] == ATOMIC_LOCK_IDLE);
+
+    MCA_SPML_CALL(put((void*)atomic_lock_turn, sizeof(index), (void*)&index, root_pe));
+
+    do {
+        MCA_SPML_CALL(put((void*)(atomic_lock_sync + me), sizeof(lock_idle), (void*)&lock_idle, root_pe));
+        MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe));
+    } while (local_lock_sync[me] != lock_idle);
+}
diff --git a/oshmem/mca/atomic/basic/configure.params b/oshmem/mca/atomic/basic/configure.params
new file mode 100644
index 0000000000..1b6b5ba51c
--- /dev/null
+++ b/oshmem/mca/atomic/basic/configure.params
@@ -0,0 +1,13 @@
+# -*- shell-script -*-
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# Specific to this module
+
+PARAM_CONFIG_FILES="Makefile"
diff --git a/oshmem/mca/atomic/mxm/Makefile.am b/oshmem/mca/atomic/mxm/Makefile.am
new file mode 100644
index 0000000000..7fade4ee8c
--- /dev/null
+++ b/oshmem/mca/atomic/mxm/Makefile.am
@@ -0,0 +1,42 @@
+#
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AM_CFLAGS = $(OSHMEM_CFLAGS)
+AM_CPPFLAGS = $(atomic_mxm_CPPFLAGS)
+
+mxm_sources = \
+        atomic_mxm.h \
+        atomic_mxm_module.c \
+        atomic_mxm_component.c \
+        atomic_mxm_fadd.c \
+        atomic_mxm_cswap.c
+
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_oshmem_atomic_mxm_DSO
+component_noinst =
+component_install = mca_atomic_mxm.la
+else
+component_noinst = libmca_atomic_mxm.la
+component_install =
+endif
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_atomic_mxm_la_SOURCES = $(mxm_sources)
+mca_atomic_mxm_la_LIBADD = $(atomic_mxm_LIBS)
+mca_atomic_mxm_la_LDFLAGS = -module -avoid-version $(atomic_mxm_LDFLAGS)
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_atomic_mxm_la_SOURCES = $(mxm_sources)
+libmca_atomic_mxm_la_LDFLAGS = -module -avoid-version $(atomic_mxm_LDFLAGS)
diff --git a/oshmem/mca/atomic/mxm/atomic_mxm.h b/oshmem/mca/atomic/mxm/atomic_mxm.h
new file mode 100644
index 0000000000..e5fda9f640
--- /dev/null
+++ b/oshmem/mca/atomic/mxm/atomic_mxm.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_ATOMIC_MXM_H
+#define MCA_ATOMIC_MXM_H
+
+#include "oshmem_config.h"
+
+#include "opal/mca/mca.h"
+#include "oshmem/mca/atomic/atomic.h"
+
+/* This component uses SPML:IKRIT */
+#include "oshmem/mca/spml/ikrit/spml_ikrit.h"
+
+
+BEGIN_C_DECLS
+
+/* Globally exported variables */
+
+OSHMEM_MODULE_DECLSPEC extern mca_atomic_base_component_1_0_0_t
+mca_atomic_mxm_component;
+
+extern int mca_atomic_mxm_priority_param;
+
+/* this component works with spml:ikrit only */
+extern mca_spml_ikrit_t *mca_spml_self;
+
+OSHMEM_DECLSPEC void atomic_mxm_lock(int pe);
+OSHMEM_DECLSPEC void atomic_mxm_unlock(int pe);
+
+/* API functions */
+
+int mca_atomic_mxm_init(bool enable_progress_threads, bool enable_threads);
+int mca_atomic_mxm_finalize(void);
+mca_atomic_base_module_t*
+mca_atomic_mxm_query(int *priority);
+
+int mca_atomic_mxm_fadd(void *target,
+                        void *prev,
+                        const void *value,
+                        size_t nlong,
+                        int pe,
+                        struct oshmem_op_t *op);
+int mca_atomic_mxm_cswap(void *target,
+                         void *prev,
+                         const void *cond,
+                         const void *value,
+                         size_t nlong,
+                         int pe);
+
+struct mca_atomic_mxm_module_t {
+    mca_atomic_base_module_t super;
+};
+typedef struct mca_atomic_mxm_module_t mca_atomic_mxm_module_t;
+OBJ_CLASS_DECLARATION(mca_atomic_mxm_module_t);
+
+END_C_DECLS
+
+#endif /* MCA_ATOMIC_MXM_H */
diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_component.c b/oshmem/mca/atomic/mxm/atomic_mxm_component.c
new file mode 100644
index 0000000000..8fd42af572
--- /dev/null
+++ b/oshmem/mca/atomic/mxm/atomic_mxm_component.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+#include "oshmem/mca/spml/base/base.h"
+
+#include "atomic_mxm.h"
+
+
+/*
+ * Public string showing the atomic mxm component version number
+ */
+const char *mca_atomic_mxm_component_version_string =
+"Open SHMEM mxm atomic MCA component version " OSHMEM_VERSION;
+
+/*
+ * Global variable
+ */
+int mca_atomic_mxm_priority_param = -1;
+mca_spml_ikrit_t *mca_spml_self = NULL;
+
+/*
+ * Local function
+ */
+static int __mxm_open(void);
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+
+mca_atomic_base_component_t mca_atomic_mxm_component = {
+
+    /* First, the mca_component_t struct containing meta information
+       about the component itself */
+
+    {
+        MCA_ATOMIC_BASE_VERSION_2_0_0,
+
+        /* Component name and version */
+        "mxm",
+        OSHMEM_MAJOR_VERSION,
+        OSHMEM_MINOR_VERSION,
+        OSHMEM_RELEASE_VERSION,
+
+        /* Component open and close functions */
+        __mxm_open,
+        NULL
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    /* Initialization / querying functions */
+
+    mca_atomic_mxm_init,
+    mca_atomic_mxm_finalize,
+    mca_atomic_mxm_query
+};
+
+static int __mxm_open(void)
+{
+    /*
+     * This component is able to work using the spml:ikrit component only
+     * (this check is added instead of !mca_spml_ikrit.enabled)
+     */
+    if (strcmp(mca_spml_base_selected_component.spmlm_version.mca_component_name, "ikrit")) {
+        ATOMIC_VERBOSE(5,
+                       "Cannot use atomic/mxm because the spml ikrit component is disabled");
+        return OSHMEM_ERR_NOT_AVAILABLE;
+    }
+    mca_spml_self = (mca_spml_ikrit_t *)mca_spml.self;
+
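+    /* The default priority of 100 deliberately outranks atomic/basic
+       (whose default is 75), so this component wins selection whenever
+       spml:ikrit is in use; the value stays tunable through the MCA
+       variable registered below. */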
+    mca_atomic_mxm_priority_param = 100;
+    (void) mca_base_component_var_register(&mca_atomic_mxm_component.atomic_version,
+                                           "priority",
+                                           "Priority of the atomic:mxm component",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_atomic_mxm_priority_param);
+
+    return OSHMEM_SUCCESS;
+}
+
+OBJ_CLASS_INSTANCE(mca_atomic_mxm_module_t,
+                   mca_atomic_base_module_t,
+                   NULL,
+                   NULL);
diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c
new file mode 100644
index 0000000000..b6da73b4e1
--- /dev/null
+++ b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include <stdio.h>
+#include <string.h>
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/runtime/runtime.h"
+
+#include "atomic_mxm.h"
+
+
+int mca_atomic_mxm_cswap(void *target,
+                         void *prev,
+                         const void *cond,
+                         const void *value,
+                         size_t nlong,
+                         int pe)
+{
+    unsigned my_pe;
+    uint8_t nlong_order;
+    uint64_t remote_addr;
+    int ptl_id;
+    mxm_send_req_t sreq;
+    mxm_error_t mxm_err;
+
+    my_pe = oshmem_my_proc_id();
+    ptl_id = -1;
+    mxm_err = MXM_OK;
+
+    if (!prev || !target || !value) {
+        ATOMIC_ERROR("[#%d] target, value or prev is not defined",
+                     my_pe);
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERR_BAD_PARAM;
+    }
+    if ((pe < 0) || (pe >= oshmem_num_procs())) {
+        ATOMIC_ERROR("[#%d] PE=%d not valid", my_pe, pe);
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERR_BAD_PARAM;
+    }
+
+    switch (nlong) {
+    case 1:
+        nlong_order = 0;
+        break;
+    case 2:
+        nlong_order = 1;
+        break;
+    case 4:
+        nlong_order = 2;
+        break;
+    case 8:
+        nlong_order = 3;
+        break;
+    default:
+        ATOMIC_ERROR("[#%d] Type size must be 1/2/4 or 8 bytes.", my_pe);
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERR_BAD_PARAM;
+    }
+
+    ptl_id = oshmem_proc_group_all(pe)->transport_ids[0];
+    if (MXM_PTL_SHM == ptl_id) {
+        ptl_id = MXM_PTL_RDMA;
+    }
+
+    if (!mca_memheap.memheap_get_cached_mkey(pe,
+                                             (unsigned long) target,
+                                             ptl_id,
+                                             &remote_addr)) {
+        ATOMIC_ERROR("[#%d] %p is not an address of a symmetric variable",
+                     my_pe, target);
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERR_BAD_PARAM;
+    }
+
+    /* mxm request init */
+    sreq.base.state = MXM_REQ_NEW;
+    sreq.base.mq = mca_spml_self->mxm_mq;
+    sreq.base.conn = mca_spml_self->mxm_peers[pe]->mxm_conn;
+    sreq.base.completed_cb = NULL;
+    sreq.base.data_type = MXM_REQ_DATA_BUFFER;
+
+    /* set data */
+    sreq.base.data.buffer.ptr = (void *) value;
+    sreq.base.data.buffer.length = nlong;
+    sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE;
+
+    sreq.op.atomic.remote_vaddr = (uintptr_t) remote_addr;
+#if MXM_API < MXM_VERSION(2,0)
+    sreq.base.flags = 0;
+    sreq.op.atomic.remote_memh = MXM_INVALID_MEM_HANDLE;
+#else
+    sreq.flags = 0;
+    sreq.op.atomic.remote_mkey = MXM_INVALID_MEM_HANDLE;
+#endif
+    sreq.op.atomic.order = nlong_order;
+
+    if (NULL == cond) {
+        sreq.opcode = MXM_REQ_OP_ATOMIC_SWAP;
+    } else {
+#if MXM_API < MXM_VERSION(2,0)
+        memcpy(&sreq.op.atomic.value8, cond, nlong);
+#else
+        memcpy(&sreq.op.atomic.value, cond, nlong);
+#endif
+        sreq.opcode = MXM_REQ_OP_ATOMIC_CSWAP;
+    }
+
+    if (MXM_OK != (mxm_err = mxm_req_send(&sreq))) {
+        ATOMIC_ERROR("[#%d] mxm_req_send failed, mxm_error = %d",
+                     my_pe, mxm_err);
oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + mxm_req_wait(&sreq.base); + if (MXM_OK != sreq.base.error) { + ATOMIC_ERROR("[#%d] mxm_req_wait got non MXM_OK error: %d", + my_pe, sreq.base.error); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + memcpy(prev, value, nlong); + + return OSHMEM_SUCCESS; +} + diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c new file mode 100644 index 0000000000..7475667948 --- /dev/null +++ b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include + +#include "oshmem/constants.h" +#include "oshmem/op/op.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/runtime/runtime.h" + +#include "atomic_mxm.h" + + +int mca_atomic_mxm_fadd(void *target, + void *prev, + const void *value, + size_t nlong, + int pe, + struct oshmem_op_t *op) +{ + unsigned my_pe; + uint8_t nlong_order; + uint64_t remote_addr; + int ptl_id; + mxm_send_req_t sreq; + mxm_error_t mxm_err; + + my_pe = oshmem_my_proc_id(); + ptl_id = -1; + mxm_err = MXM_OK; + + if (!target || !value) { + ATOMIC_ERROR("[#%d] target or value are not defined", my_pe); + oshmem_shmem_abort(-1); + return OSHMEM_ERR_BAD_PARAM; + } + + if ((pe < 0) || (pe >= oshmem_num_procs())) { + ATOMIC_ERROR("[#%d] PE=%d not valid", my_pe, pe); + oshmem_shmem_abort(-1); + return OSHMEM_ERR_BAD_PARAM; + } + + switch (nlong) { + case 1: + nlong_order = 0; + break; + case 2: + nlong_order = 1; + break; + case 4: + nlong_order = 2; + break; + case 8: + nlong_order = 3; + break; + default: + ATOMIC_ERROR("[#%d] Type size must be 1/2/4 or 8 bytes.", my_pe); + oshmem_shmem_abort(-1); + return OSHMEM_ERR_BAD_PARAM; + } + + ptl_id = oshmem_proc_group_all(pe)->transport_ids[0]; + if (MXM_PTL_SHM == ptl_id) { + ptl_id = MXM_PTL_RDMA; + } + + if (!mca_memheap.memheap_get_cached_mkey(pe, + (unsigned long) target, + ptl_id, + &remote_addr)) { + ATOMIC_ERROR("[#%d] %p is not address of symmetric variable", + my_pe, target); + oshmem_shmem_abort(-1); + return OSHMEM_ERR_BAD_PARAM; + } + + /* mxm request init */ + sreq.base.state = MXM_REQ_NEW; + sreq.base.mq = mca_spml_self->mxm_mq; + sreq.base.conn = mca_spml_self->mxm_peers[pe]->mxm_conn; + sreq.base.completed_cb = NULL; + sreq.base.data_type = MXM_REQ_DATA_BUFFER; + + sreq.op.atomic.remote_vaddr = (uintptr_t) remote_addr; +#if MXM_API < MXM_VERSION(2,0) + sreq.op.atomic.remote_memh = MXM_INVALID_MEM_HANDLE; +#else + sreq.op.atomic.remote_mkey = MXM_INVALID_MEM_HANDLE; +#endif + sreq.op.atomic.order = nlong_order; + memcpy(&sreq.op.atomic.value8, value, nlong); + + /* Do we need atomic 'add' or atomic 'fetch and add'? 
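+       If the caller did not supply a 'prev' buffer, this is a plain
+       atomic add: no reply data is expected, the receive buffer stays
+       empty, and the ADD opcode is used.  Otherwise 'prev' becomes the
+       receive buffer for the fetched old value and the FADD opcode is
+       used.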
*/ + if (NULL == prev) { + sreq.base.data.buffer.ptr = NULL; + sreq.base.data.buffer.length = 0; + sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; +#if MXM_API < MXM_VERSION(2,0) + sreq.base.flags = MXM_REQ_FLAG_SEND_SYNC; + sreq.opcode = MXM_REQ_OP_ATOMIC_ADD; +#else + sreq.flags = 0; + sreq.opcode = MXM_REQ_OP_ATOMIC_ADD_SYNC; +#endif + } else { + sreq.base.data.buffer.ptr = prev; + sreq.base.data.buffer.length = nlong; + sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; +#if MXM_API < MXM_VERSION(2,0) + sreq.base.flags = 0; +#else + sreq.flags = 0; +#endif + + sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; + } + + if (MXM_OK != (mxm_err = mxm_req_send(&sreq))) { + ATOMIC_ERROR("[#%d] mxm_req_send failed, mxm_error = %d", + my_pe, mxm_err); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + mxm_req_wait(&sreq.base); + if (MXM_OK != sreq.base.error) { + ATOMIC_ERROR("[#%d] mxm_req_wait got non MXM_OK error: %d", + my_pe, sreq.base.error); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_module.c b/oshmem/mca/atomic/mxm/atomic_mxm_module.c new file mode 100644 index 0000000000..c846bf9526 --- /dev/null +++ b/oshmem/mca/atomic/mxm/atomic_mxm_module.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include + +#include "oshmem/constants.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/proc/proc.h" +#include "atomic_mxm.h" + +/* + * Initial query function that is invoked during initialization, allowing + * this module to indicate what level of thread support it provides. + */ +int mca_atomic_mxm_init(bool enable_progress_threads, bool enable_threads) +{ + return OSHMEM_SUCCESS; +} + +int mca_atomic_mxm_finalize(void) +{ + return OSHMEM_SUCCESS; +} + +mca_atomic_base_module_t * +mca_atomic_mxm_query(int *priority) +{ + mca_atomic_mxm_module_t *module; + + *priority = mca_atomic_mxm_priority_param; + + module = OBJ_NEW(mca_atomic_mxm_module_t); + if (module) { + module->super.atomic_fadd = mca_atomic_mxm_fadd; + module->super.atomic_cswap = mca_atomic_mxm_cswap; + return &(module->super); + } + + return NULL ; +} diff --git a/oshmem/mca/atomic/mxm/configure.m4 b/oshmem/mca/atomic/mxm/configure.m4 new file mode 100644 index 0000000000..af057cd96f --- /dev/null +++ b/oshmem/mca/atomic/mxm/configure.m4 @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+# MCA_oshmem_atomic_mxm_CONFIG([action-if-can-compile],
+#                              [action-if-cant-compile])
+# ------------------------------------------------
+AC_DEFUN([MCA_oshmem_atomic_mxm_CONFIG],[
+    AC_CONFIG_FILES([oshmem/mca/atomic/mxm/Makefile])
+    OMPI_CHECK_MXM([atomic_mxm],
+                   [save_CPPFLAGS="$CPPFLAGS"
+                    save_LDFLAGS="$LDFLAGS"
+                    save_LIBS="$LIBS"
+
+                    CPPFLAGS="$CPPFLAGS -I$ompi_check_mxm_dir/include"
+                    LDFLAGS="$LDFLAGS -L$ompi_check_mxm_dir/lib"
+                    LIBS="$LIBS -lmxm"
+                    AC_TRY_RUN([
+                        #include <mxm/api/mxm_api.h>
+                        int main() {
+                            if (mxm_get_version() < MXM_VERSION(1,5) )
+                                return 1;
+
+                            /* if the compiler sees these constants then mxm has atomic support */
+                            int add_index = MXM_REQ_OP_ATOMIC_ADD;
+                            int swap_index = MXM_REQ_OP_ATOMIC_SWAP;
+                            return 0;
+                        }],
+                        [AC_DEFINE([OSHMEM_HAS_ATOMIC_MXM], [1], [mxm support is available]) atomic_mxm_happy="yes"],
+                        [atomic_mxm_happy="no"])
+                    CPPFLAGS=$save_CPPFLAGS
+                    LDFLAGS=$save_LDFLAGS
+                    LIBS=$save_LIBS
+                   ],
+                   [atomic_mxm_happy="no"])
+
+    AS_IF([test "$atomic_mxm_happy" = "yes"],
+          [atomic_mxm_WRAPPER_EXTRA_LDFLAGS="$atomic_mxm_LDFLAGS"
+           atomic_mxm_WRAPPER_EXTRA_LIBS="$atomic_mxm_LIBS"
+           $1],
+          [$2])
+
+
+    # substitute in the things needed to build mxm
+    AC_SUBST([atomic_mxm_CFLAGS])
+    AC_SUBST([atomic_mxm_CPPFLAGS])
+    AC_SUBST([atomic_mxm_LDFLAGS])
+    AC_SUBST([atomic_mxm_LIBS])
+
+    AC_MSG_CHECKING([if oshmem/atomic/mxm component can be compiled])
+    AC_MSG_RESULT([$atomic_mxm_happy])
+])dnl
diff --git a/oshmem/mca/atomic/mxm/configure.params b/oshmem/mca/atomic/mxm/configure.params
new file mode 100644
index 0000000000..1b6b5ba51c
--- /dev/null
+++ b/oshmem/mca/atomic/mxm/configure.params
@@ -0,0 +1,13 @@
+# -*- shell-script -*-
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# Specific to this module
+
+PARAM_CONFIG_FILES="Makefile"
diff --git a/oshmem/mca/memheap/Makefile.am b/oshmem/mca/memheap/Makefile.am
new file mode 100644
index 0000000000..b8295ce139
--- /dev/null
+++ b/oshmem/mca/memheap/Makefile.am
@@ -0,0 +1,39 @@
+#
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# main library setup
+noinst_LTLIBRARIES = libmca_memheap.la
+libmca_memheap_la_SOURCES =
+libmca_memheap_la_LDFLAGS =
+libmca_memheap_la_LIBADD =
+
+# header setup
+nobase_oshmem_HEADERS =
+nobase_nodist_oshmem_HEADERS =
+
+dist_pkgdata_DATA =
+
+# local files
+headers = memheap.h
+libmca_memheap_la_SOURCES += $(headers) $(nodist_headers)
+
+# Conditionally install the header files
+if WANT_INSTALL_HEADERS
+nobase_oshmem_HEADERS += $(headers)
+nobase_nodist_oshmem_HEADERS += $(nodist_headers)
+oshmemdir = $(includedir)/oshmem/oshmem/mca/memheap
+else
+oshmemdir = $(includedir)
+endif
+
+include base/Makefile.am
+
+distclean-local:
+        rm -f base/static-components.h
diff --git a/oshmem/mca/memheap/README b/oshmem/mca/memheap/README
new file mode 100644
index 0000000000..42edca9238
--- /dev/null
+++ b/oshmem/mca/memheap/README
@@ -0,0 +1,50 @@
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved
+# $COPYRIGHT$
+MEMHEAP Infrastructure documentation
+------------------------------------
+
+MEMHEAP Infrastructure is responsible for managing the symmetric heap.
+The framework currently has the following components: buddy, which uses
+a buddy allocator to manage memory allocations on the symmetric heap,
+and ptmalloc, an adaptation of ptmalloc3.
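+
+For illustration, a sketch of how a selected module is meant to be
+driven (the pointer names here follow the description below and are
+not necessarily the exact prototypes in oshmem/mca/memheap/memheap.h):
+
+    mca_memheap_base_module_t *mh = ...;   /* the selected module */
+    void *p = NULL;
+    mh->memheap_alloc(1024, &p);           /* allocate 1 KB on the
+                                              symmetric heap */
+    mh->memheap_free(p);                   /* release it */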
+Additional components may be added easily to the framework by defining
+the component's and the module's base and extended structures, and their
+functionalities.
+
+The buddy allocator has the following data structures:
+1. Base component - of type struct mca_memheap_base_component_2_0_0_t
+2. Base module - of type struct mca_memheap_base_module_t
+3. Buddy component - of type struct mca_memheap_base_component_2_0_0_t
+4. Buddy module - of type struct mca_memheap_buddy_module_t extending the base module (struct mca_memheap_base_module_t)
+
+Each data structure includes the following fields:
+1. Base component - memheap_version, memheap_data and memheap_init
+2. Base module - Holds pointers to the base component and to the functions: alloc, free and finalize
+3. Buddy component - is a base component.
+4. Buddy module - Extends the base module and holds additional data on the component's priority, buddy allocator,
+   maximal order of the symmetric heap, symmetric heap, pointer to the symmetric heap and a hashtable maintaining the size of each allocated address.
+
+If the user implements additional components, the MEMHEAP infrastructure chooses the component with the maximal priority.
+Handling the component opening is done under the base directory, in three stages:
+1. Open all available components. Implemented by memheap_base_open.c and called from shmem_init.
+2. Select the component with the maximal priority. This procedure involves the initialization of all components and then
+   the finalization of all except the chosen one. It is implemented by memheap_base_select.c and called from shmem_init.
+3. Close the active component with the maximal priority. Implemented by memheap_base_close.c and called from shmem_finalize.
+
+
+Buddy Component/Module
+----------------------
+
+Responsible for handling all activities of the symmetric heap.
+The supported activities are:
+   - buddy_init (initialization)
+   - buddy_alloc (allocates a variable on the symmetric heap)
+   - buddy_free (frees a variable previously allocated on the symmetric heap)
+   - buddy_finalize (finalization).
+
+Data members of the buddy module:
+   - priority: the module's priority.
+   - buddy allocator: bits, num_free, lock and the maximal order (log2 of the maximal size)
+     of a variable on the symmetric heap. The buddy allocator gives the offset in the symmetric heap
+     where a variable should be allocated.
+   - symmetric_heap: a range of reserved addresses (equal in all executing PEs) dedicated to "shared memory" allocation.
+   - symmetric_heap_hashtable (holds the size of an allocated variable on the symmetric heap;
+     used to free an allocated variable on the symmetric heap)
+
diff --git a/oshmem/mca/memheap/base/Makefile.am b/oshmem/mca/memheap/base/Makefile.am
new file mode 100644
index 0000000000..2fe0a74f72
--- /dev/null
+++ b/oshmem/mca/memheap/base/Makefile.am
@@ -0,0 +1,27 @@
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) +AM_CPPFLAGS = $(openib_CPPFLAGS) + +dist_pkgdata_DATA += base/help-shmem-mca.txt + +headers += \ + base/base.h + +libmca_memheap_la_SOURCES += \ + base/memheap_base_frame.c \ + base/memheap_base_select.c \ + base/memheap_base_alloc.c \ + base/memheap_base_static.c \ + base/memheap_base_register.c \ + base/memheap_base_mkey.c + +libmca_memheap_la_LDFLAGS += -module -avoid-version $(openib_LDFLAGS) +libmca_memheap_la_LIBADD += $(openib_LIBS) diff --git a/oshmem/mca/memheap/base/base.h b/oshmem/mca/memheap/base/base.h new file mode 100644 index 0000000000..d7f11f724a --- /dev/null +++ b/oshmem/mca/memheap/base/base.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_MEMHEAP_BASE_H +#define MCA_MEMHEAP_BASE_H + +#include "oshmem_config.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_value_array.h" +#include "opal/mca/mca.h" + +#include "oshmem/mca/memheap/memheap.h" + +BEGIN_C_DECLS + +/* + * Global functions for MCA: overall MEMHEAP open and close + */ +OSHMEM_DECLSPEC int mca_memheap_base_select(void); + +/* + * Globals + */ +OSHMEM_DECLSPEC extern struct mca_memheap_base_module_t* mca_memheap_base_module_initialized; + +/* only used within base -- no need to DECLSPEC */ +#define MEMHEAP_BASE_MIN_ORDER 3 /* forces 64 bit alignment */ +#define MEMHEAP_BASE_PAGE_ORDER 21 +#define MEMHEAP_BASE_PRIVATE_SIZE (1ULL << MEMHEAP_BASE_PAGE_ORDER) /* should be at least the same as a huge page size */ +#define MEMHEAP_BASE_MIN_SIZE (1ULL << MEMHEAP_BASE_PAGE_ORDER) /* must fit into at least one huge page */ + +extern unsigned long long mca_memheap_base_start_address; +extern char* mca_memheap_base_include; +extern char* mca_memheap_base_exclude; +extern int mca_memheap_base_already_opened; +extern int mca_memheap_base_alloc_type; +extern int mca_memheap_base_key_exchange; +extern int mca_memheap_base_mr_interleave_factor; + +#define MCA_MEMHEAP_MAX_SEGMENTS 256 +#define HEAP_SEG_INDEX 0 +#define SYMB_SEG_INDEX 1 + +#define MEMHEAP_SHM_INVALID (-1) + +#define MEMHEAP_SHM_CODE( type, id ) ((((uint64_t)(type)) << 32) | ((uint32_t)(id))) +#define MEMHEAP_SHM_GET_TYPE( x ) (((uint32_t)((x) >> 32)) & 0xFFFFFFFF) +#define MEMHEAP_SHM_GET_ID( x ) ((uint32_t)((x) & 0xFFFFFFFF)) + +typedef enum { + MAP_SEGMENT_STATIC = 0, + MAP_SEGMENT_ALLOC_MMAP, + MAP_SEGMENT_ALLOC_SHM, + MAP_SEGMENT_ALLOC_IBV, + MAP_SEGMENT_UNKNOWN +} segment_type_t; + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) +#include + +typedef struct openib_device_t { + struct ibv_device **ib_devs; + struct ibv_device *ib_dev; + struct ibv_context *ib_dev_context; + struct ibv_device_attr ib_dev_attr; + struct ibv_pd *ib_pd; + opal_value_array_t ib_mr_array;; + struct ibv_mr *ib_mr_shared; +} openib_device_t; +#endif /* MPAGE_ENABLE */ + +typedef struct map_segment_t { + mca_spml_mkey_t **mkeys_cache; /* includes remote segment bases in va_base */ + mca_spml_mkey_t *mkeys; /* includes local segment bases in va_base */ + int is_active; /* enable/disable flag */ + int shmid; + + uint64_t start; /* base address of the segment */ + uint64_t end; /* final address of the segment */ + size_t size; /* length of the segment */ + + segment_type_t type; /* type of the segment */ + void *context; /* additional data related the segment */ +} map_segment_t; + +typedef struct 
mca_memheap_map { + map_segment_t mem_segs[MCA_MEMHEAP_MAX_SEGMENTS]; /* TODO: change into pointer array */ + int n_segments; + int num_transports; +} mca_memheap_map_t; + +extern mca_memheap_map_t mca_memheap_base_map; + +int mca_memheap_base_alloc_init(mca_memheap_map_t *, size_t); +void mca_memheap_base_alloc_exit(mca_memheap_map_t *); +int mca_memheap_base_static_init(mca_memheap_map_t *); +void mca_memheap_base_static_exit(mca_memheap_map_t *); +int mca_memheap_base_reg(mca_memheap_map_t *); +int mca_memheap_base_dereg(mca_memheap_map_t *); +int memheap_oob_init(mca_memheap_map_t *); +void memheap_oob_destruct(void); + +OSHMEM_DECLSPEC uint64_t mca_memheap_base_find_offset(int pe, + int tr_id, + unsigned long va, + uint64_t rva); +OSHMEM_DECLSPEC int mca_memheap_base_is_symmetric_addr(unsigned long va); +OSHMEM_DECLSPEC mca_spml_mkey_t *mca_memheap_base_get_mkey(unsigned long va, + int tr_id); +OSHMEM_DECLSPEC mca_spml_mkey_t * mca_memheap_base_get_cached_mkey(int pe, + unsigned long va, + int btl_id, + uint64_t *rva); +OSHMEM_DECLSPEC void mca_memheap_modex_recv_all(void); + +/* This function is for internal usage only + * return value: + * 0 - addr is not symmetric address + * 1 - addr is part of user memheap + * 2 - addr is part of private memheap + * 3 - addr is static variable + */ +typedef enum { + ADDR_INVALID = 0, ADDR_USER, ADDR_PRIVATE, ADDR_STATIC, +} addr_type_t; + +OSHMEM_DECLSPEC int mca_memheap_base_detect_addr_type(unsigned long va); + +static inline unsigned memheap_log2(unsigned long long val) +{ + /* add 1 if val is NOT a power of 2 (to do the ceil) */ + unsigned int count = (val & (val - 1) ? 1 : 0); + + while (val > 0) { + val = val >> 1; + count++; + } + + return count > 0 ? count - 1 : 0; +} + +static inline void *memheap_down_align_addr(void* addr, unsigned int shift) +{ + return (void*) (((intptr_t) addr) & (~(intptr_t) 0) << shift); +} + +static inline void *memheap_up_align_addr(void*addr, unsigned int shift) +{ + return (void*) ((((intptr_t) addr) | ~((~(intptr_t) 0) << shift))); +} + +static inline unsigned long long memheap_align(unsigned long top) +{ + return ((top + MEMHEAP_BASE_MIN_SIZE - 1) & ~(MEMHEAP_BASE_MIN_SIZE - 1)); +} + +/* + * MCA framework + */ +OSHMEM_DECLSPEC extern mca_base_framework_t oshmem_memheap_base_framework; + +/* ******************************************************************** */ +#ifdef __BASE_FILE__ +#define __SPML_FILE__ __BASE_FILE__ +#else +#define __SPML_FILE__ __FILE__ +#endif + +#define MEMHEAP_VERBOSE(level, format, ...) \ + opal_output_verbose(level, oshmem_memheap_base_framework.framework_output, "%s:%d - %s() " format, \ + __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define MEMHEAP_ERROR(format, ... ) \ + opal_output_verbose(0, oshmem_memheap_base_framework.framework_output, "Error: %s:%d - %s() " format, \ + __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define MEMHEAP_WARN(format, ... ) \ + opal_output_verbose(0, oshmem_memheap_base_framework.framework_output, "Warning: %s:%d - %s() " format, \ + __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) +END_C_DECLS + +#endif /* MCA_MEMHEAP_BASE_H */ diff --git a/oshmem/mca/memheap/base/help-shmem-mca.txt b/oshmem/mca/memheap/base/help-shmem-mca.txt new file mode 100644 index 0000000000..680dbb635a --- /dev/null +++ b/oshmem/mca/memheap/base/help-shmem-mca.txt @@ -0,0 +1,23 @@ +# -*- text -*- +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This is the US/English help file for Open SHMEM MCA error messages.
+#
+[find-available:none-found]
+No available %s components were found!
+
+This means that there are no components of this type installed on your
+system or all the components reported that they could not be used.
+
+This is a fatal error; your SHMEM process is likely to abort.  Check the
+output of the "ompi_info" command and ensure that components of this
+type are available on your system.  You may also wish to check the
+value of the "component_path" MCA parameter and ensure that it has at
+least one directory that contains valid MCA components.
diff --git a/oshmem/mca/memheap/base/memheap_base_alloc.c b/oshmem/mca/memheap/base/memheap_base_alloc.c
new file mode 100644
index 0000000000..6a2bc6f5e8
--- /dev/null
+++ b/oshmem/mca/memheap/base/memheap_base_alloc.c
@@ -0,0 +1,553 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include "opal/util/output.h"
+#include "orte/util/show_help.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#include <sys/ipc.h>
+#include <sys/shm.h>
+
+#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
+#include <infiniband/verbs.h>
+#endif /* MPAGE_ENABLE */
+
+extern char* mca_memheap_base_param_hca_name;
+
+static int __shm_attach(map_segment_t *, size_t, int, int);
+static void __shm_detach(map_segment_t *);
+
+static int __mmap_attach(map_segment_t *, size_t);
+static void __mmap_detach(map_segment_t *);
+
+#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
+static int __ibv_attach(map_segment_t *, size_t);
+static void __ibv_detach(map_segment_t *);
+#endif /* MPAGE_ENABLE */
+
+static int __adaptive_attach(map_segment_t *, size_t);
+
+int mca_memheap_base_alloc_init(mca_memheap_map_t *map, size_t size)
+{
+    int ret = OSHMEM_SUCCESS;
+    int value = mca_memheap_base_alloc_type;
+
+    assert(map);
+    assert(HEAP_SEG_INDEX == map->n_segments);
+
+    MEMHEAP_VERBOSE(5,
+                    "memheap method : %d",
+                    mca_memheap_base_alloc_type);
+
+    map_segment_t *s = &map->mem_segs[map->n_segments];
+    memset(s, 0, sizeof(*s));
+    s->is_active = 0;
+    s->shmid = MEMHEAP_SHM_INVALID;
+    s->start = 0;
+    s->end = 0;
+    s->size = 0;
+    s->type = MAP_SEGMENT_UNKNOWN;
+    s->context = NULL;
+
+    switch (value) {
+    case 0:
+        /* use sysv alloc without hugepages */
+        ret = __shm_attach(s, size, 0, 1);
+        break;
+
+    case 1:
+        ret = __shm_attach(s, size, 1, 1);
+        if (OSHMEM_SUCCESS != ret)
+            ret = __shm_attach(s, size, 0, 1);
+        break;
+
+    case 2:
+        /* huge pages only */
+        ret = __shm_attach(s, size, 1, 1);
+        if (OSHMEM_SUCCESS != ret)
+            MEMHEAP_ERROR("Failed to allocate symmetric heap using huge pages; fallback is disabled, errno=%d",
+                          errno);
+        break;
+
+    case 3:
+        /* huge pages only + cleanup shmid */
+        ret = __shm_attach(s, size, 1, 0);
+        if (OSHMEM_SUCCESS != ret)
+            MEMHEAP_ERROR("Failed to allocate symmetric heap using huge pages; fallback is disabled, errno=%d",
+                          errno);
+        break;
+
+    case 4:
+        /* use sysv alloc without hugepages */
+        ret = __shm_attach(s, size, 0, 0);
+        break;
+
+#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
+    case 5:
+        /* use shared memory registration (mpages) */
+        ret = __ibv_attach(s, size);
+        if (OSHMEM_SUCCESS != ret)
+            ret = __shm_attach(s, size, 0, 1);
+
+        break;
+#endif /* MPAGE_ENABLE */
+
+    case 100:
+        /* use mmap.  It will severely impact performance of intra-node
+           communication */
+    case 100:
+        /* use mmap(); it will severely impact performance of intra-node communication */
+        ret = __mmap_attach(s, size);
+        MEMHEAP_VERBOSE(1,
+                        "mmap() memheap allocation will severely impact performance of intra node communication");
+        break;
+
+    case 101:
+        ret = __shm_attach(s, size, 1, 1);
+        if (OSHMEM_SUCCESS != ret) {
+            MEMHEAP_ERROR("Failed to allocate hugepages. Falling back on regular allocation");
+            ret = __mmap_attach(s, size);
+        } else {
+            s->shmid = MEMHEAP_SHM_INVALID;
+        }
+        MEMHEAP_VERBOSE(1, "SM BTL will always be used for intranode comm\n");
+        break;
+
+    case 102:
+        ret = __shm_attach(s, size, 1, 1);
+        if (OSHMEM_SUCCESS != ret) {
+            MEMHEAP_ERROR("Failed to allocate symmetric heap using huge pages; fallback is disabled, errno=%d",
+                          errno);
+        } else {
+            s->shmid = MEMHEAP_SHM_INVALID;
+        }
+        break;
+
+    default:
+        ret = __adaptive_attach(s, size);
+    }
+
+    if (OSHMEM_SUCCESS == ret) {
+        map->n_segments++;
+        MEMHEAP_VERBOSE(1,
+                        "Memheap alloc memory: %llu byte(s), %d segments by method: %d",
+                        (unsigned long long)size, map->n_segments, s->type);
+    }
+
+    return ret;
+}
+
+void mca_memheap_base_alloc_exit(mca_memheap_map_t *map)
+{
+    if (map) {
+        map_segment_t *s = &map->mem_segs[HEAP_SEG_INDEX];
+
+        assert(s);
+
+        switch (s->type) {
+        case MAP_SEGMENT_ALLOC_SHM:
+            __shm_detach(s);
+            break;
+
+        case MAP_SEGMENT_ALLOC_MMAP:
+            __mmap_detach(s);
+            break;
+
+#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
+        case MAP_SEGMENT_ALLOC_IBV:
+            __ibv_detach(s);
+            break;
+#endif /* MPAGE_ENABLE */
+
+        default:
+            MEMHEAP_ERROR("Unknown segment type: %d", (int)s->type);
+        }
+    }
+}
+
+static int __adaptive_attach(map_segment_t *s, size_t size)
+{
+    /* start from "not attached" so that the sysv/mmap fallback chain below
+     * also runs when MPAGE support is compiled out */
+    int rc = OSHMEM_ERROR;
+
+#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
+    rc = __ibv_attach(s, size);
+#endif /* MPAGE_ENABLE */
+
+    if (rc) {
+        rc = __shm_attach(s, size, 1, 1);
+    }
+
+    if (rc) {
+        rc = __shm_attach(s, size, 0, 1);
+    }
+
+    if (rc) {
+        rc = __shm_attach(s, size, 0, 0);
+    }
+
+    if (rc) {
+        rc = __mmap_attach(s, size);
+    }
+
+    return rc;
+}
+
+static int __shm_attach(map_segment_t *s, size_t size, int use_hp, int do_rmid)
+{
+    static int shm_context = 0;
+    void *addr = NULL;
+    int shmid = MEMHEAP_SHM_INVALID;
+    int flags;
+
+    assert(s);
+
+    shm_context = use_hp;
+
+    flags = IPC_CREAT | IPC_EXCL | SHM_R | SHM_W;
+#if defined (SHM_HUGETLB)
+    flags |= (use_hp ? SHM_HUGETLB : 0);
+#endif
+
+    /* Create a new shared memory segment and save the shmid. */
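+    /*
+     * Sketch of the lifecycle implemented below: shmget(IPC_PRIVATE) creates
+     * a segment that is reachable only through the returned shmid. With
+     * do_rmid=1 the segment is marked for removal (IPC_RMID) right after the
+     * local shmat(), so it disappears once the attached processes exit; with
+     * do_rmid=0 the shmid stays valid and is later shipped to remote PEs
+     * inside the memory key so that they can shmat() the same physical
+     * segment (see memheap_attach_segment() in memheap_base_mkey.c), and the
+     * IPC_RMID cleanup is deferred to mca_memheap_modex_recv_all().
+     */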
+    shmid = shmget(IPC_PRIVATE, size, flags);
+    if (shmid == MEMHEAP_SHM_INVALID) {
+        MEMHEAP_VERBOSE(1, "Failed to get shm segment (errno=%d)", errno);
+        return OSHMEM_ERROR;
+    }
+
+    /* Attach to the segment */
+    addr = shmat(shmid, (void *) mca_memheap_base_start_address, 0);
+    if (addr == (void *) -1L) {
+        MEMHEAP_VERBOSE(1, "Failed to attach to shm segment (errno=%d)", errno);
+
+        shmctl(shmid, IPC_RMID, NULL);
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+
+    MEMHEAP_VERBOSE(5, "got shmid %d", shmid);
+
+    if (do_rmid)
+        shmctl(shmid, IPC_RMID, NULL);
+
+    s->type = MAP_SEGMENT_ALLOC_SHM;
+    s->shmid = shmid;
+    s->start = (uintptr_t) addr;
+    s->size = size;
+    s->end = s->start + s->size;
+    s->context = &shm_context;
+
+    return OSHMEM_SUCCESS;
+}
+
+static void __shm_detach(map_segment_t *s)
+{
+    assert(s);
+
+    if (s->shmid != MEMHEAP_SHM_INVALID) {
+        shmctl(s->shmid, IPC_RMID, NULL);
+    }
+
+    if (s->context && (*((int *) (s->context))) > 0) {
+        /*
+         * Workaround for a kernel panic when detaching huge pages from user
+         * space simultaneously from several processes: do not detach here;
+         * instead let the kernel do it during process cleanup.
+         */
+        /* shmdt((void *)s->start); */
+    }
+}
+
+static int __mmap_attach(map_segment_t *s, size_t size)
+{
+    void *addr = NULL;
+
+    assert(s);
+
+    addr = mmap((void *) mca_memheap_base_start_address,
+                size,
+                PROT_READ | PROT_WRITE,
+                MAP_SHARED |
+#if defined (__APPLE__)
+                MAP_ANON
+#elif defined (__GNUC__)
+                MAP_ANONYMOUS
+#endif
+                | MAP_FIXED,
+                0,
+                0);
+
+    if (MAP_FAILED == addr) {
+        MEMHEAP_ERROR("Failed to mmap() %llu bytes (errno=%d)",
+                      (unsigned long long)size, errno);
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+
+    s->type = MAP_SEGMENT_ALLOC_MMAP;
+    s->shmid = MEMHEAP_SHM_INVALID;
+    s->start = (uintptr_t) addr;
+    s->size = size;
+    s->end = s->start + s->size;
+    s->context = NULL;
+
+    return OSHMEM_SUCCESS;
+}
+
+static void __mmap_detach(map_segment_t *s)
+{
+    assert(s);
+
+    munmap((void *) s->start, s->size);
+}
+
+#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
+
+static int __ibv_attach(map_segment_t *s, size_t size)
+{
+    int rc = OSHMEM_SUCCESS;
+    static openib_device_t memheap_device;
+    openib_device_t *device = &memheap_device;
+    int num_devs = 0;
+
+    assert(s);
+
+    memset(device, 0, sizeof(*device));
+
+#ifdef HAVE_IBV_GET_DEVICE_LIST
+    device->ib_devs = ibv_get_device_list(&num_devs);
+#else
+#error unsupported ibv_get_device_list in infiniband/verbs.h
+#endif
+
+    if (num_devs == 0 || !device->ib_devs)
+    {
+        rc = OSHMEM_ERR_NOT_SUPPORTED;
+    }
+
+    /* Open device */
+    if (!rc)
+    {
+        int i = 0;
+
+        if (num_devs > 1)
+        {
+            if (NULL == mca_memheap_base_param_hca_name)
+            {
+                MEMHEAP_VERBOSE(5, "found %d HCAs, choosing the first", num_devs);
+            }
+            else
+            {
+                MEMHEAP_VERBOSE(5, "found %d HCAs, searching for %s", num_devs, mca_memheap_base_param_hca_name);
+            }
+        }
+
+        for (i = 0; i < num_devs; i++)
+        {
+            device->ib_dev = device->ib_devs[i];
+
+            device->ib_dev_context = ibv_open_device(device->ib_dev);
+            if (NULL == device->ib_dev_context)
+            {
+                MEMHEAP_ERROR("error obtaining device context for %s errno says %d: %s",
+                              ibv_get_device_name(device->ib_dev), errno, strerror(errno));
+                rc = OSHMEM_ERR_RESOURCE_BUSY;
+            }
+            else
+            {
+                if (NULL != mca_memheap_base_param_hca_name)
+                {
+                    if (0 == strcmp(mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev)))
+                    {
+                        MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs);
+                        rc = OSHMEM_SUCCESS;
+                        break;
} + } + else + { + MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs); + rc = OSHMEM_SUCCESS; + break; + } + } + } + } + + /* Obtain device attributes */ + if (!rc) + { + if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) + { + MEMHEAP_ERROR("error obtaining device attributes for %s errno says %d: %s", + ibv_get_device_name(device->ib_dev), errno, strerror(errno)); + rc = OSHMEM_ERR_RESOURCE_BUSY; + } + else + { + MEMHEAP_VERBOSE(5, "ibv device %s", + ibv_get_device_name(device->ib_dev)); + } + } + + /* Allocate the protection domain for the device */ + if (!rc) + { + device->ib_pd = ibv_alloc_pd(device->ib_dev_context); + if (NULL == device->ib_pd) + { + MEMHEAP_ERROR("error allocating protection domain for %s errno says %d: %s", + ibv_get_device_name(device->ib_dev), errno, strerror(errno)); + rc = OSHMEM_ERR_RESOURCE_BUSY; + } + } + + /* Allocate memory */ + if (!rc) + { + void *addr = NULL; + struct ibv_mr *ib_mr = NULL; + int access_flag = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ; + + OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); + opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + access_flag |= IBV_ACCESS_ALLOCATE_MR | + IBV_ACCESS_SHARED_MR_USER_READ | + IBV_ACCESS_SHARED_MR_USER_WRITE; +#endif /* MPAGE_ENABLE */ + + ib_mr = ibv_reg_mr(device->ib_pd, addr, size, access_flag); + if (NULL == ib_mr) + { + MEMHEAP_ERROR("error to ibv_reg_mr() %llu bytes errno says %d: %s", + (unsigned long long)size, errno, strerror(errno)); + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + else + { + device->ib_mr_shared = ib_mr; + opal_value_array_append_item(&device->ib_mr_array, &ib_mr); + } + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + if (!rc) + { + access_flag = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ| + IBV_ACCESS_NO_RDMA; + + addr = (void *)mca_memheap_base_start_address; + ib_mr = ibv_reg_shared_mr(device->ib_mr_shared->handle, + device->ib_pd, addr, access_flag); + if (NULL == ib_mr) + { + MEMHEAP_ERROR("error to ibv_reg_shared_mr() %llu bytes errno says %d: %s", + (unsigned long long)size, errno, strerror(errno)); + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + else + { + opal_value_array_append_item(&device->ib_mr_array, &ib_mr); + } + } +#endif /* MPAGE_ENABLE */ + + if (!rc) + { + assert(size == device->ib_mr_shared->length); + + s->type = MAP_SEGMENT_ALLOC_IBV; + s->shmid = device->ib_mr_shared->handle; + s->start = (intptr_t)ib_mr->addr; + s->size = size; + s->end = s->start + s->size; + s->context = &memheap_device; + } + } + + return rc; +} + +static void __ibv_detach(map_segment_t *s) +{ + int rc = OSHMEM_SUCCESS; + openib_device_t *device = NULL; + + assert(s); + + device = (openib_device_t *)s->context; + + if (device) + { + if(!rc && opal_value_array_get_size(&device->ib_mr_array)) + { + struct ibv_mr** array; + struct ibv_mr* ib_mr = NULL; + array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *); + while (opal_value_array_get_size(&device->ib_mr_array) > 0) + { + ib_mr = array[0]; + if(ibv_dereg_mr(ib_mr)) + { + MEMHEAP_ERROR("error ibv_dereg_mr(): %d: %s", errno, strerror(errno)); + rc = OSHMEM_ERROR; + } + opal_value_array_remove_item(&device->ib_mr_array, 0); + } + + if(!rc && device->ib_mr_shared) + { + device->ib_mr_shared = NULL; + } + OBJ_DESTRUCT(&device->ib_mr_array); + } + + 
if(!rc && device->ib_pd) + { + if(ibv_dealloc_pd(device->ib_pd)) + { + MEMHEAP_ERROR("error ibv_dealloc_pd(): %d: %s", errno, strerror(errno)); + rc = OSHMEM_ERROR; + } + else + { + device->ib_pd = NULL; + } + } + + if(!rc && device->ib_dev_context) + { + if(ibv_close_device(device->ib_dev_context)) + { + MEMHEAP_ERROR("error ibv_close_device(): %d: %s", errno, strerror(errno)); + rc = OSHMEM_ERROR; + } + else + { + device->ib_dev_context = NULL; + } + } + + if(!rc && device->ib_devs) + { + ibv_free_device_list(device->ib_devs); + device->ib_devs = NULL; + } + } +} + +#endif /* MPAGE_ENABLE */ diff --git a/oshmem/mca/memheap/base/memheap_base_frame.c b/oshmem/mca/memheap/base/memheap_base_frame.c new file mode 100644 index 0000000000..a011be28e8 --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_frame.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" + +/* + * The following file was created by configure. It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. + */ + +#include "oshmem/mca/memheap/base/static-components.h" + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) +int mca_memheap_base_alloc_type = 5; +#else +int mca_memheap_base_alloc_type = 1; +#endif /* MPAGE_ENABLE */ + +unsigned long long int mca_memheap_base_start_address = 0xFF000000; +int mca_memheap_base_output = -1; +int mca_memheap_base_key_exchange = 1; +int mca_memheap_base_mr_interleave_factor = 2; +char* mca_memheap_base_include = NULL; +char* mca_memheap_base_exclude = NULL; +char* mca_memheap_base_param_hca_name = NULL; +opal_list_t mca_memheap_base_components_opened; +struct mca_memheap_base_module_t* mca_memheap_base_module_initialized = NULL; +int mca_memheap_base_already_opened = 0; +mca_memheap_map_t mca_memheap_base_map; + +static int mca_memheap_base_register(mca_base_register_flag_t flags) +{ + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + (void) mca_base_var_register("oshmem", + "memheap", + "base", + "alloc_type", + "0|1|2|5 - disabled huge pages, enabled huge pages with fallback to mmap(), do not fallback to mmap(), enabled mpages(default)", + MCA_BASE_VAR_TYPE_INT, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_memheap_base_alloc_type); +#else + (void) mca_base_var_register("oshmem", + "memheap", + "base", + "alloc_type", + "0|1|2 - disabled huge pages, enabled huge pages(default) with fallback to mmap(), do not fallback to mmap()", + MCA_BASE_VAR_TYPE_INT, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_memheap_base_alloc_type); +#endif /* MPAGE_ENABLE */ + + (void) mca_base_var_register("oshmem", + "memheap", + "base", + "start_address", + "Specify base address for shared memory region", + MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_memheap_base_start_address); + + (void) mca_base_var_register("oshmem", + "memheap", + "base", + "key_exchange", + "0|1 - disabled, enabled(default) force memory keys exchange", + MCA_BASE_VAR_TYPE_INT, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_memheap_base_key_exchange); + + (void) 
mca_base_var_register("oshmem", + "memheap", + "base", + "mr_interleave_factor", + "2 - default, try to give at least N Gbytes spaces between mapped memheaps of other pes that are local to me", + MCA_BASE_VAR_TYPE_INT, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_memheap_base_mr_interleave_factor); + + (void) mca_base_var_register("oshmem", + "memheap", + "base", + "include", + "Specify a specific MEMHEAP implementation to use", + MCA_BASE_VAR_TYPE_STRING, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_memheap_base_include); + + if (NULL == mca_memheap_base_include) { + mca_memheap_base_include = getenv(SHMEM_HEAP_TYPE); + if (NULL == mca_memheap_base_include) + mca_memheap_base_include = strdup(""); + else + mca_memheap_base_include = strdup(mca_memheap_base_include); + } + + (void) mca_base_var_register("oshmem", + "memheap", + "base", + "exclude", + "Specify excluded MEMHEAP implementations", + MCA_BASE_VAR_TYPE_STRING, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_memheap_base_exclude); + + (void) mca_base_var_register("oshmem", + "memheap", + "base", + "hca_name", + "Specify excluded memheap implementations", + MCA_BASE_VAR_TYPE_STRING, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_memheap_base_param_hca_name); + + return OSHMEM_SUCCESS; +} + +static int mca_memheap_base_close(void) +{ + if (mca_memheap_base_already_opened <= 0) { + return OSHMEM_ERROR; + } + mca_memheap_base_already_opened--; + if (mca_memheap_base_already_opened > 0) { + return OSHMEM_SUCCESS; + } + + memheap_oob_destruct(); + + mca_memheap_base_dereg(&mca_memheap_base_map); + + mca_memheap_base_alloc_exit(&mca_memheap_base_map); + mca_memheap_base_static_exit(&mca_memheap_base_map); + + /* Close all remaining available components */ + return mca_base_framework_components_close(&oshmem_memheap_base_framework, NULL); +} + +static int mca_memheap_base_open(mca_base_open_flag_t flags) +{ + mca_memheap_base_already_opened = mca_memheap_base_already_opened + 1; + if (mca_memheap_base_already_opened > 1) { + return OSHMEM_SUCCESS; + } + + memset(&mca_memheap_base_map, 0, sizeof(mca_memheap_base_map)); + mca_memheap_base_map.n_segments = 0; + mca_memheap_base_map.num_transports = 0; + + /* Open up all available components */ + if (OPAL_SUCCESS != + mca_base_framework_components_open(&oshmem_memheap_base_framework, flags)) { + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +} + +MCA_BASE_FRAMEWORK_DECLARE(oshmem, memheap, + "OSHMEM MEMHEAP", + mca_memheap_base_register, + mca_memheap_base_open, + mca_memheap_base_close, + mca_memheap_base_static_components, + 0); diff --git a/oshmem/mca/memheap/base/memheap_base_mkey.c b/oshmem/mca/memheap/base/memheap_base_mkey.c new file mode 100644 index 0000000000..cf2fe17052 --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_mkey.c @@ -0,0 +1,630 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include "opal/util/output.h" +#include "opal/dss/dss.h" + +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/runtime/orte_globals.h" + +#include "ompi/mca/bml/bml.h" +#include "ompi/mca/dpm/dpm.h" + +#include "oshmem/proc/proc.h" +#include "oshmem/runtime/runtime.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/mca/spml/spml.h" + +#ifdef HAVE_SYS_MMAN_H +#include +#endif + +#include +#include + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) +#include +#endif /* MPAGE_ENABLE */ + +/* Turn ON/OFF debug output from build (default 0) */ +#ifndef MEMHEAP_BASE_DEBUG +#define MEMHEAP_BASE_DEBUG 0 +#endif + +#define MEMHEAP_RKEY_REQ 0xA1 +#define MEMHEAP_RKEY_RESP 0xA2 +#define MEMHEAP_RKEY_RESP_FAIL 0xA3 + +struct oob_comm { + opal_mutex_t lck; + opal_condition_t cond; + mca_spml_mkey_t *mkeys; + int mkeys_rcvd; +}; + +#define MEMHEAP_VERBOSE_FASTPATH(...) + +static mca_memheap_map_t* memheap_map = NULL; + +struct oob_comm memheap_oob; + +/* pickup list of rkeys and remote va */ +static int memheap_oob_get_mkeys(int pe, + uint32_t va_seg_num, + mca_spml_mkey_t *mkey); + +static inline unsigned long __seg2base_va(int seg) +{ + return memheap_map->mem_segs[seg].start; +} + +static int __seg_cmp(const void *k, const void *v) +{ + unsigned long va = (unsigned long) k; + map_segment_t *s = (map_segment_t *) v; + + if (va < s->start) + return -1; + if (va >= s->end) + return 1; + + return 0; +} + +static inline map_segment_t *__find_va(unsigned long va) +{ + map_segment_t *s; + + if (OPAL_LIKELY(va >= (unsigned long)memheap_map->mem_segs[HEAP_SEG_INDEX].start && + va < (unsigned long)memheap_map->mem_segs[HEAP_SEG_INDEX].end)) { + s = &memheap_map->mem_segs[HEAP_SEG_INDEX]; + } else { + s = bsearch((const void *) va, + &memheap_map->mem_segs[SYMB_SEG_INDEX], + memheap_map->n_segments - 1, + sizeof(*s), + __seg_cmp); + } + +#if MEMHEAP_BASE_DEBUG == 1 + if (s) { + MEMHEAP_VERBOSE(5, "match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p", + s - memheap_map->mem_segs, + (long long)s->start, + (long long)s->end, + (long long)(s->end - s->start), + (void *)va); + } +#endif + return s; +} + +static int do_mkey_req(opal_buffer_t *msg, int pe, int seg) +{ + uint8_t msg_type; + oshmem_proc_t *proc; + int i, n, tr_id; + mca_spml_mkey_t *mkey; + + msg_type = MEMHEAP_RKEY_RESP; + opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8); + + /* go over all transports to remote pe and pack mkeys */ + n = oshmem_get_transport_count(pe); + proc = oshmem_proc_group_find(oshmem_group_all, pe); + opal_dss.pack(msg, &n, 1, OPAL_UINT32); + MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe); + for (i = 0; i < n; i++) { + tr_id = proc->transport_ids[i]; + + mkey = mca_memheap_base_get_mkey(__seg2base_va(seg), tr_id); + if (!mkey) { + MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey", + seg, tr_id); + return OSHMEM_ERROR; + } + opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32); + opal_dss.pack(msg, &mkey->key, 1, OPAL_UINT64); + opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64); + + if (NULL != MCA_SPML_CALL(get_remote_context_size)) { + uint32_t context_size = + (mkey->spml_context == NULL ) ? 
+ 0 : + (uint32_t) MCA_SPML_CALL(get_remote_context_size(mkey->spml_context)); + opal_dss.pack(msg, &context_size, 1, OPAL_UINT32); + if (0 != context_size) { + opal_dss.pack(msg, + MCA_SPML_CALL(get_remote_context(mkey->spml_context)), + context_size, + OPAL_BYTE); + } + } + + MEMHEAP_VERBOSE(5, + "seg#%d tr_id: %d key %llx base_va %llx", + seg, tr_id, (unsigned long long)mkey->key, (unsigned long long)mkey->va_base); + } + return OSHMEM_SUCCESS; +} + +static void memheap_attach_segment(mca_spml_mkey_t *mkey, int tr_id) +{ + /* process special case when va was got using shmget(IPC_PRIVATE) + * this case is notable for: + * - key is set as (type|shmid); + * - va_base is set as 0; + */ + if (!mkey->va_base + && ((int) MEMHEAP_SHM_GET_ID(mkey->key) != MEMHEAP_SHM_INVALID)) { + MEMHEAP_VERBOSE(5, + "shared memory usage tr_id: %d key %llx base_va %llx shmid 0x%X|0x%X", + tr_id, + (unsigned long long)mkey->key, + (unsigned long long)mkey->va_base, + MEMHEAP_SHM_GET_TYPE(mkey->key), + MEMHEAP_SHM_GET_ID(mkey->key)); + + if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_SHM) { + mkey->va_base = (intptr_t) shmat(MEMHEAP_SHM_GET_ID(mkey->key), + 0, + 0); + } else if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_IBV) { +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + openib_device_t *device = NULL; + struct ibv_mr *ib_mr; + void *addr; + static int mr_count; + + int access_flag = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_NO_RDMA; + + device = (openib_device_t *)memheap_map->mem_segs[HEAP_SEG_INDEX].context; + assert(device); + + /* workaround mtt problem - request aligned addresses */ + ++mr_count; + addr = (void *)(mca_memheap_base_start_address + mca_memheap_base_mr_interleave_factor*1024ULL*1024ULL*1024ULL*mr_count); + ib_mr = ibv_reg_shared_mr(MEMHEAP_SHM_GET_ID(mkey->key), + device->ib_pd, addr, access_flag); + if (NULL == ib_mr) + { + mkey->va_base = -1; + MEMHEAP_ERROR("error to ibv_reg_shared_mr() errno says %d: %s", + errno, strerror(errno)); + } + else + { + if (ib_mr->addr != addr) { + MEMHEAP_WARN("Failed to map shared region to address %p got addr %p. 
Try to increase 'memheap_mr_interleave_factor' from %d", addr, ib_mr->addr, mca_memheap_base_mr_interleave_factor); + } + + opal_value_array_append_item(&device->ib_mr_array, &ib_mr); + mkey->va_base = (intptr_t)ib_mr->addr; + } +#endif /* MPAGE_ENABLE */ + } else { + MEMHEAP_ERROR("tr_id: %d key %llx attach failed: incorrect shmid 0x%X|0x%X", + tr_id, + (unsigned long long)mkey->key, + MEMHEAP_SHM_GET_TYPE(mkey->key), + MEMHEAP_SHM_GET_ID(mkey->key)); + oshmem_shmem_abort(-1); + } + + if ((void *) -1 == (void *) mkey->va_base) { + MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d", + tr_id, (unsigned long long)mkey->key, errno); + oshmem_shmem_abort(-1); + } + } +} + +static void do_mkey_resp(opal_buffer_t *msg) +{ + int32_t cnt; + int32_t n; + int32_t tr_id; + int i; + + cnt = 1; + opal_dss.unpack(msg, &n, &cnt, OPAL_UINT32); + for (i = 0; i < n; i++) { + opal_dss.unpack(msg, &tr_id, &cnt, OPAL_UINT32); + opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].key, &cnt, OPAL_UINT64); + opal_dss.unpack(msg, + &memheap_oob.mkeys[tr_id].va_base, + &cnt, + OPAL_UINT64); + + if (NULL != MCA_SPML_CALL(set_remote_context_size)) { + int32_t context_size; + opal_dss.unpack(msg, &context_size, &cnt, OPAL_UINT32); + if (0 != context_size) { + MCA_SPML_CALL(set_remote_context_size(&(memheap_oob.mkeys[tr_id].spml_context), context_size)); + void* context; + context = calloc(1, context_size); + opal_dss.unpack(msg, context, &context_size, OPAL_BYTE); + MCA_SPML_CALL(set_remote_context(&(memheap_oob.mkeys[tr_id].spml_context),context)); + } + } + + memheap_attach_segment(&memheap_oob.mkeys[tr_id], tr_id); + + MEMHEAP_VERBOSE(5, + "tr_id: %d key %llx base_va %llx", + tr_id, (unsigned long long)memheap_oob.mkeys[tr_id].key, (unsigned long long)memheap_oob.mkeys[tr_id].va_base); + } +} + +static void memheap_buddy_rml_recv_cb(int status, + orte_process_name_t* process_name, + opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + MEMHEAP_VERBOSE(5, + "**** get request from %u:%d", + process_name->jobid, process_name->vpid); + int32_t cnt = 1; + int rc; + opal_buffer_t *msg; + uint8_t msg_type; + uint32_t seg; + + MEMHEAP_VERBOSE(5, "unpacking %d of %d", cnt, OPAL_UINT8); + rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8); + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + goto send_fail; + } + + switch (msg_type) { + case MEMHEAP_RKEY_REQ: + cnt = 1; + rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32); + if (ORTE_SUCCESS != rc) { + MEMHEAP_ERROR("bad RKEY_REQ msg"); + goto send_fail; + } + + MEMHEAP_VERBOSE(5, "*** RKEY REQ"); + msg = OBJ_NEW(opal_buffer_t); + if (!msg) { + MEMHEAP_ERROR("failed to get msg buffer"); + ORTE_ERROR_LOG(rc); + return; + } + + if (OSHMEM_SUCCESS != do_mkey_req(msg, process_name->vpid, seg)) { + OBJ_RELEASE(msg); + goto send_fail; + } + + rc = orte_rml.send_buffer_nb(process_name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL); + + if (0 > rc) { + MEMHEAP_ERROR("FAILED to send rml message %d", rc); + ORTE_ERROR_LOG(rc); + goto send_fail; + } + break; + + case MEMHEAP_RKEY_RESP: + MEMHEAP_VERBOSE(5, "*** RKEY RESP"); + OPAL_THREAD_LOCK(&memheap_oob.lck); + do_mkey_resp(buffer); + memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP; + opal_condition_broadcast(&memheap_oob.cond); + OPAL_THREAD_UNLOCK(&memheap_oob.lck); + break; + + case MEMHEAP_RKEY_RESP_FAIL: + MEMHEAP_VERBOSE(5, "*** RKEY RESP FAIL"); + memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL; + opal_condition_broadcast(&memheap_oob.cond); + OPAL_THREAD_UNLOCK(&memheap_oob.lck); + break; + + 
default: + MEMHEAP_VERBOSE(5, "Unknown message type %x", msg_type); + goto send_fail; + } + return; + + send_fail: msg = OBJ_NEW(opal_buffer_t); + if (!msg) { + MEMHEAP_ERROR("failed to get msg buffer"); + ORTE_ERROR_LOG(rc); + return; + } + msg_type = MEMHEAP_RKEY_RESP_FAIL; + opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8); + + rc = orte_rml.send_buffer_nb(process_name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL); + if (0 > rc) { + MEMHEAP_ERROR("FAILED to send rml message %d", rc); + ORTE_ERROR_LOG(rc); + } + +} + +int memheap_oob_init(mca_memheap_map_t *map) +{ + int rc = OSHMEM_SUCCESS; + + memheap_map = map; + + OBJ_CONSTRUCT(&memheap_oob.lck, opal_mutex_t); + OBJ_CONSTRUCT(&memheap_oob.cond, opal_condition_t); + + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + OMPI_RML_TAG_SHMEM, + ORTE_RML_PERSISTENT, + memheap_buddy_rml_recv_cb, + NULL ); + + return rc; +} + +void memheap_oob_destruct(void) +{ + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, OMPI_RML_TAG_SHMEM); + OBJ_DESTRUCT(&memheap_oob.lck); + OBJ_DESTRUCT(&memheap_oob.cond); +} + +static int memheap_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys) +{ + orte_process_name_t name; + opal_buffer_t *msg; + int rc; + uint8_t cmd; + int i; + + if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) { + for (i = 0; i < memheap_map->num_transports; i++) { + mkeys[i].va_base = __seg2base_va(seg); + MEMHEAP_VERBOSE(5, + "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d key %llx base_va %llx", + pe, + i, + (unsigned long long)mkeys[i].key, + (unsigned long long)mkeys[i].va_base); + } + return OSHMEM_SUCCESS; + } + + OPAL_THREAD_LOCK(&memheap_oob.lck); + + memheap_oob.mkeys = mkeys; + memheap_oob.mkeys_rcvd = 0; + + name.jobid = ORTE_PROC_MY_NAME->jobid; + name.vpid = pe; + + msg = OBJ_NEW(opal_buffer_t); + if (!msg) { + OPAL_THREAD_UNLOCK(&memheap_oob.lck); + MEMHEAP_ERROR("failed to get msg buffer"); + return OSHMEM_ERROR; + } + + OPAL_THREAD_LOCK(&memheap_oob.lck); + cmd = MEMHEAP_RKEY_REQ; + opal_dss.pack(msg, &cmd, 1, OPAL_UINT8); + opal_dss.pack(msg, &seg, 1, OPAL_UINT32); + rc = orte_rml.send_buffer_nb(&name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL); + if (0 > rc) { + OBJ_RELEASE(msg); + OPAL_THREAD_UNLOCK(&memheap_oob.lck); + MEMHEAP_ERROR("FAILED to send rml message %d", rc); + return OSHMEM_ERROR; + } + + MEMHEAP_VERBOSE(5, "message sent: %d bytes!", rc); + + while (!memheap_oob.mkeys_rcvd) { + opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck); + } + + if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) { + rc = OSHMEM_SUCCESS; + } else { + MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe); + rc = OSHMEM_ERROR; + } + + OPAL_THREAD_UNLOCK(&memheap_oob.lck); + return rc; +} + +void mca_memheap_modex_recv_all(void) +{ + int i; + int j; + int nprocs, my_pe; + oshmem_proc_t *proc; + mca_spml_mkey_t *mkey; + uint64_t dummy_rva; + + if (!mca_memheap_base_key_exchange) + return; + + /* init rkey cache */ + nprocs = oshmem_num_procs(); + my_pe = oshmem_my_proc_id(); + + /* Note: + * Doing exchange via rml till we figure out problem with grpcomm.modex and barrier + */ + for (i = 0; i < nprocs; i++) { + if (i == my_pe) + continue; + + proc = oshmem_proc_group_find(oshmem_group_all, i); + for (j = 0; j < memheap_map->n_segments; j++) { + mkey = + mca_memheap_base_get_cached_mkey(i, + memheap_map->mem_segs[j].start, + proc->transport_ids[0], + &dummy_rva); + if (!mkey) { + MEMHEAP_ERROR("Failed to receive mkeys"); + oshmem_shmem_abort(-1); + } + } + + } + + /* + * There is an issue with 
orte_grpcomm.barrier here: ess/pmi directs us to use grpcomm/pmi when
+     * running under SLURM srun, and the grpcomm/pmi barrier calls
+     * PMI_Barrier(), a function of an external library that never calls
+     * opal_progress(). As a result, slow PEs send a request
+     * (MEMHEAP_RKEY_REQ) to fast PEs that are already waiting in the barrier
+     * and never get a response (MEMHEAP_RKEY_RESP).
+     *
+     * There are several ways to solve this:
+     * 1. count the requests from remote PEs and do ORTE_PROGRESSED_WAIT for the expected value;
+     * 2. use shmem_barrier_all();
+     * 3. rework pmi/barrier to use opal_progress();
+     * 4. use orte_grpcomm.barrier carefully;
+     *
+     * It seems there is no need to use orte_grpcomm.barrier here.
+     */
+
+    if (memheap_map->mem_segs[HEAP_SEG_INDEX].shmid != MEMHEAP_SHM_INVALID) {
+        /* unfortunately we must do a barrier here to ensure that everyone has
+         * attached to our segment. The good thing is that this code path is
+         * only invoked on older Linux kernels (-mca shmalloc_use_hugepages 3|4);
+         * try to minimize the damage here by waiting 5 seconds and doing progress
+         */
+        shmem_barrier_all();
+        /* keys exchanged, segments attached, now we can safely cleanup */
+        if (memheap_map->mem_segs[HEAP_SEG_INDEX].type
+                == MAP_SEGMENT_ALLOC_SHM) {
+            shmctl(memheap_map->mem_segs[HEAP_SEG_INDEX].shmid,
+                   IPC_RMID,
+                   NULL);
+        }
+    }
+}
+
+static inline uint64_t va2rva(unsigned long va,
+                              uint64_t local_base,
+                              uint64_t remote_base)
+{
+    return remote_base > local_base ? va + (remote_base - local_base) :
+                                      va - (local_base - remote_base);
+}
+
+mca_spml_mkey_t * mca_memheap_base_get_cached_mkey(int pe,
+                                                   unsigned long va,
+                                                   int btl_id,
+                                                   uint64_t *rva)
+{
+    map_segment_t *s;
+    int rc;
+    mca_spml_mkey_t *mkey;
+
+    MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p", pe, (void *)va);
+    s = __find_va(va);
+    if (NULL == s)
+        return NULL;
+
+    if (!s->is_active)
+        return NULL;
+
+    if (pe == oshmem_my_proc_id()) {
+        *rva = va;
+        MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (local) %lx %p", pe, (void *)va,
+                                 s->mkeys[btl_id].key, (void *)*rva);
+        return &s->mkeys[btl_id];
+    }
+
+    if (OPAL_LIKELY(s->mkeys_cache[pe])) {
+        mkey = &s->mkeys_cache[pe][btl_id];
+        *rva = va2rva(va, s->start, mkey->va_base);
+        MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (cached) %lx %p", pe, (void *)va, mkey->key, (void *)*rva);
+        return mkey;
+    }
+
+    s->mkeys_cache[pe] = (mca_spml_mkey_t *) calloc(memheap_map->num_transports,
+                                                    sizeof(mca_spml_mkey_t));
+    if (!s->mkeys_cache[pe])
+        return NULL;
+
+    rc = memheap_oob_get_mkeys(pe,
+                               s - memheap_map->mem_segs,
+                               s->mkeys_cache[pe]);
+    if (OSHMEM_SUCCESS != rc)
+        return NULL;
+
+    mkey = &s->mkeys_cache[pe][btl_id];
+    *rva = va2rva(va, s->start, mkey->va_base);
+
+    MEMHEAP_VERBOSE_FASTPATH(5, "rkey: pe=%d va=%p -> (remote lookup) %lx %p", pe, (void *)va, mkey->key, (void *)*rva);
+    return mkey;
+}
+
+mca_spml_mkey_t *mca_memheap_base_get_mkey(unsigned long va, int tr_id)
+{
+    map_segment_t *s;
+
+    s = __find_va(va);
+
+    return ((s && s->is_active) ? &s->mkeys[tr_id] : NULL);
+}
+
+uint64_t mca_memheap_base_find_offset(int pe,
+                                      int tr_id,
+                                      unsigned long va,
+                                      uint64_t rva)
+{
+    map_segment_t *s;
+
+    s = __find_va(va);
+
+    return ((s && s->is_active) ? (rva - s->mkeys_cache[pe][tr_id].va_base) : 0);
+}
+
+int mca_memheap_base_is_symmetric_addr(unsigned long va)
+{
+    return (__find_va(va) ? 1 : 0);
+}
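+/*
+ * Worked example for va2rva() above (addresses are illustrative only):
+ * with a local heap base of 0xFF000000 and a remote base of 0xFF100000,
+ * the local symmetric address va = 0xFF000040 maps to
+ *     rva = va + (remote_base - local_base) = 0xFF100040
+ * on the remote PE. The two branches in va2rva() just keep the unsigned
+ * arithmetic from underflowing in either direction.
+ */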
1 : 0); +} + +int mca_memheap_base_detect_addr_type(unsigned long va) +{ + int addr_type = ADDR_INVALID; + map_segment_t *s; + + s = __find_va(va); + + if (s) { + if (s->type == MAP_SEGMENT_STATIC) { + addr_type = ADDR_STATIC; + } else if (va >= (unsigned long) s->start + && va < (unsigned long) (s->start + mca_memheap.memheap_size)) { + addr_type = ADDR_USER; + } else { + assert( va >= (unsigned long)(s->start + mca_memheap.memheap_size) && va < (unsigned long)s->end); + addr_type = ADDR_PRIVATE; + } + } + + return addr_type; +} diff --git a/oshmem/mca/memheap/base/memheap_base_register.c b/oshmem/mca/memheap/base/memheap_base_register.c new file mode 100644 index 0000000000..67c123847a --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_register.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "oshmem_config.h" + +#include "oshmem/proc/proc.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" + +#include + +static int __dereg_segment(map_segment_t *s); +static int __reg_segment(map_segment_t *s, int *num_btl); + +int mca_memheap_base_reg(mca_memheap_map_t *memheap_map) +{ + int ret = OSHMEM_SUCCESS; + int i; + + for (i = 0; i < memheap_map->n_segments; i++) { + map_segment_t *s = &memheap_map->mem_segs[i]; + + MEMHEAP_VERBOSE(5, + "register seg#%02d: 0x%llX - 0x%llX %llu bytes type=0x%X id=0x%X", + i, + (long long)s->start, + (long long)s->end, + (long long)(s->end - s->start), + s->type, + s->shmid); + ret = __reg_segment(s, &memheap_map->num_transports); + } + + return ret; +} + +int mca_memheap_base_dereg(mca_memheap_map_t *memheap_map) +{ + int ret = OSHMEM_SUCCESS; + int i; + + for (i = 0; i < memheap_map->n_segments; i++) { + map_segment_t *s = &memheap_map->mem_segs[i]; + + if (!s->is_active) + continue; + + MEMHEAP_VERBOSE(5, + "deregistering segment#%d: %llx - %llx %llu bytes", + i, + (long long)s->start, + (long long)s->end, + (long long)(s->end - s->start)); + ret = __dereg_segment(s); + } + + return ret; +} + +static int __dereg_segment(map_segment_t *s) +{ + int rc = OSHMEM_SUCCESS; + int j; + int nprocs, my_pe; + + nprocs = oshmem_num_procs(); + my_pe = oshmem_my_proc_id(); + + MCA_SPML_CALL(deregister(s->mkeys)); + + if (s->mkeys_cache) { + for (j = 0; j < nprocs; j++) { + if (j == my_pe) + continue; + if (s->mkeys_cache[j]) { + free(s->mkeys_cache[j]); + s->mkeys_cache[j] = NULL; + } + } + free(s->mkeys_cache); + s->mkeys_cache = NULL; + } + + s->is_active = 0; + + return rc; +} + +static int __reg_segment(map_segment_t *s, int *num_btl) +{ + int rc = OSHMEM_SUCCESS; + int my_pe; + int nprocs; + + nprocs = oshmem_num_procs(); + my_pe = oshmem_my_proc_id(); + + s->mkeys_cache = (mca_spml_mkey_t **) calloc(nprocs, + sizeof(mca_spml_mkey_t *)); + if (NULL == s->mkeys_cache) { + MEMHEAP_ERROR("Failed to allocate memory for remote segments"); + rc = OSHMEM_ERROR; + } + + if (!rc) { + s->mkeys = MCA_SPML_CALL(register((void *)(unsigned long)s->start, + s->end - s->start, + MEMHEAP_SHM_CODE(s->type, s->shmid), + num_btl)); + if (NULL == s->mkeys) { + free(s->mkeys_cache); + s->mkeys_cache = NULL; + + MEMHEAP_ERROR("Failed to register segment"); + rc = OSHMEM_ERROR; + } + } + + if (OSHMEM_SUCCESS == rc) { + s->mkeys_cache[my_pe] = s->mkeys; + s->is_active = 1; + } + + return rc; +} diff --git a/oshmem/mca/memheap/base/memheap_base_select.c b/oshmem/mca/memheap/base/memheap_base_select.c new file mode 
diff --git a/oshmem/mca/memheap/base/memheap_base_select.c b/oshmem/mca/memheap/base/memheap_base_select.c
new file mode 100644
index 0000000000..979a7741b4
--- /dev/null
+++ b/oshmem/mca/memheap/base/memheap_base_select.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include "opal/util/argv.h"
+#include "opal/util/output.h"
+#include "orte/util/show_help.h"
+#include "opal/mca/mca.h"
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_component_repository.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "opal/runtime/opal.h"
+
+mca_memheap_base_module_t mca_memheap;
+
+/**
+ * Function for weeding out memheap components that should not be executed.
+ * Implementation inspired by btl/base.
+ *
+ * Call the init function on all available components to find out if
+ * they want to run. Select all components that don't fail. Failing
+ * components will be closed and unloaded. The selected modules will
+ * be pointed to by mca_memheap_base_module_t.
+ */
+
+static memheap_context_t* __memheap_create(void);
+
+/**
+ * Choose the highest-priority component and initialize it. If the include
+ * list is not empty, choose only a component that appears in that list;
+ * otherwise choose the highest-priority component that is not in the
+ * exclude list. Include and exclude lists may be given on the shmem
+ * launcher command line.
+ */
+int mca_memheap_base_select()
+{
+    int priority = 0;
+    int max_priority = 0;
+    mca_base_component_list_item_t *cli, *next;
+    mca_memheap_base_component_t *component = NULL;
+    mca_memheap_base_component_t *max_priority_component = NULL;
+    mca_memheap_base_module_t *module = NULL;
+    memheap_context_t *context = NULL;
+
+    char** include = opal_argv_split(mca_memheap_base_include, ',');
+    char** exclude = opal_argv_split(mca_memheap_base_exclude, ',');
+
+    context = __memheap_create();
+    if (!context) {
+        return OSHMEM_ERROR;
+    }
+
+    OPAL_LIST_FOREACH_SAFE(cli, next, &oshmem_memheap_base_framework.framework_components, mca_base_component_list_item_t) {
+        component = (mca_memheap_base_component_t *) cli->cli_component;
+
+        /* Check whether the component is in the include or the exclude list. */
+        /* If there is an include list, an item must be in the list to be included */
+        if (NULL != include) {
+            char** argv = include;
+            bool found = false;
+            while (argv && *argv) {
+                if (strcmp(component->memheap_version.mca_component_name, *argv)
+                        == 0) {
+                    found = true;
+                    break;
+                }
+                argv++;
+            }
+            /* If not in the list, do not choose this component */
+            if (found == false) {
+                continue;
+            }
+
+            /* Otherwise, check the exclude list to see if this item has been specifically excluded */
+        } else if (NULL != exclude) {
+            char** argv = exclude;
+            bool found = false;
+            while (argv && *argv) {
+                if (strcmp(component->memheap_version.mca_component_name, *argv)
+                        == 0) {
+                    found = true;
+                    break;
+                }
+                argv++;
+            }
+            if (found == true) {
+                continue;
+            }
+        }
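+        /*
+         * For example (hypothetical command lines, assuming the standard
+         * "framework_base_variable" MCA naming for the parameters
+         * registered in memheap_base_frame.c):
+         *     mpirun -mca memheap_base_include buddy ...
+         *     mpirun -mca memheap_base_exclude buddy ...
+         */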
No component selected", + component->memheap_version.mca_component_name); + } else { + + MEMHEAP_VERBOSE(5, + "select: component %s size : user %d private: %d", + component->memheap_version.mca_component_name, (int)context->user_size, (int)context->private_size); + + /* Init the component in order to get its priority */ + module = component->memheap_init(context, &priority); + + /* If the component didn't initialize, remove it from the opened list, remove it from the component repository and return an error */ + if (NULL == module) { + MEMHEAP_VERBOSE(10, + "select: init of component %s returned failure", + component->memheap_version.mca_component_name); + + opal_list_remove_item(&oshmem_memheap_base_framework.framework_components, &cli->super); + mca_base_component_close((mca_base_component_t *) component, + oshmem_memheap_base_framework.framework_output); + } + /* Calculate memheap size in case it was not set during component initialization */ + module->memheap_size = context->user_size; + } + + /* Init max priority component */ + if (NULL == max_priority_component) { + max_priority_component = component; + mca_memheap_base_module_initialized = module; + max_priority = priority; + } + + /* Update max priority component if current component has greater priority */ + if (priority > max_priority) { + max_priority = priority; + max_priority_component = component; + mca_memheap_base_module_initialized = module; + } + } + + opal_argv_free(include); + opal_argv_free(exclude); + + /* Verify that a component was selected */ + if (NULL == max_priority_component) { + MEMHEAP_VERBOSE(10, "select: no component selected"); + return OSHMEM_ERROR; + } + + /* Verify that some module was initialized */ + if (NULL == mca_memheap_base_module_initialized) { + orte_show_help("help-shmem-mca.txt", + "find-available:none-found", + true, + "memheap"); + orte_errmgr.abort(1, NULL ); + } + + MEMHEAP_VERBOSE(10, + "SELECTED %s component %s", + max_priority_component->memheap_version.mca_type_name, max_priority_component->memheap_version.mca_component_name); + + setenv(SHMEM_HEAP_TYPE, + max_priority_component->memheap_version.mca_component_name, + 1); + + mca_memheap = *mca_memheap_base_module_initialized; + + return OSHMEM_SUCCESS; +} + +static size_t memheap_size(void) +{ + char *p; + unsigned long long factor; + int idx; + unsigned long long size; + + p = getenv(SHMEM_HEAP_SIZE); + if (!p) + return SIZE_IN_MEGA_BYTES(DEFAULT_SYMMETRIC_HEAP_SIZE); + + idx = strlen(p) - 1; + if (p[idx] == 'k' || p[idx] == 'K') { + factor = 1024; + } else if (p[idx] == 'm' || p[idx] == 'M') { + factor = 1024 * 1024; + } else if (p[idx] == 'g' || p[idx] == 'G') { + factor = 1024 * 1024 * 1024; + } else if (p[idx] == 't' || p[idx] == 'T') { + factor = 1024UL * 1024UL * 1024UL * 1024UL; + } else + factor = 1; + + size = atoll(p); + if (size == 0) { + MEMHEAP_ERROR("Incorrect symmetric heap size %s. 
+    size = atoll(p);
+    if (size == 0) {
+        MEMHEAP_ERROR("Incorrect symmetric heap size %s. Using default heap size %d M\n",
+                      p, DEFAULT_SYMMETRIC_HEAP_SIZE);
+        return SIZE_IN_MEGA_BYTES(DEFAULT_SYMMETRIC_HEAP_SIZE);
+    }
+    return (size_t) memheap_align(size * factor);
+}
+
+static memheap_context_t* __memheap_create(void)
+{
+    int rc = OSHMEM_SUCCESS;
+    static memheap_context_t context;
+    size_t user_size;
+
+    user_size = memheap_size();
+    if (user_size < MEMHEAP_BASE_MIN_SIZE) {
+        MEMHEAP_ERROR("Requested memheap size is less than the minimal memheap size (%llu < %llu)",
+                      (unsigned long long)user_size, MEMHEAP_BASE_MIN_SIZE);
+        return NULL;
+    }
+    /* Initialize the symmetric area */
+    if (OSHMEM_SUCCESS == rc) {
+        rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
+                                         user_size + MEMHEAP_BASE_PRIVATE_SIZE);
+    }
+
+    /* Initialize the static/global variables area */
+    if (OSHMEM_SUCCESS == rc) {
+        rc = mca_memheap_base_static_init(&mca_memheap_base_map);
+    }
+
+    /* Memory Registration */
+    if (OSHMEM_SUCCESS == rc) {
+        rc = mca_memheap_base_reg(&mca_memheap_base_map);
+    }
+
+    /* Init OOB channel */
+    if (OSHMEM_SUCCESS == rc) {
+        rc = memheap_oob_init(&mca_memheap_base_map);
+    }
+
+    if (OSHMEM_SUCCESS == rc) {
+        context.user_size = user_size;
+        context.private_size = MEMHEAP_BASE_PRIVATE_SIZE;
+        context.user_base_addr =
+                (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].start
+                        + 0);
+        context.private_base_addr =
+                (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].start
+                        + context.user_size);
+    }
+
+    return ((OSHMEM_SUCCESS == rc) ? &context : NULL);
+}
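+
+/*
+ * Layout of the single symmetric segment set up by __memheap_create()
+ * (sketch; field names from the code above):
+ *
+ *     user_base_addr                 private_base_addr
+ *     |<-------- user_size --------->|<-- MEMHEAP_BASE_PRIVATE_SIZE -->|
+ *     ^
+ *     mem_segs[HEAP_SEG_INDEX].start
+ */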
diff --git a/oshmem/mca/memheap/base/memheap_base_static.c b/oshmem/mca/memheap/base/memheap_base_static.c
new file mode 100644
index 0000000000..a496e0dd4a
--- /dev/null
+++ b/oshmem/mca/memheap/base/memheap_base_static.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "oshmem_config.h"
+
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+
+#include
+
+struct map_segment_desc {
+    uint64_t start;
+    uint64_t end;
+    char perms[8];
+    uint64_t offset;
+    char dev[8];
+    uint64_t inode;
+    char pathname[MAXPATHLEN];
+};
+
+typedef struct memheap_static_context {
+    struct {
+        uint64_t start;
+        uint64_t end;
+    } mem_segs[MCA_MEMHEAP_MAX_SEGMENTS];
+    int n_segments;
+} memheap_static_context_t;
+
+static memheap_static_context_t memheap_context;
+
+static int __load_segments(void);
+static int __check_perms(struct map_segment_desc *seg);
+static int __check_address(struct map_segment_desc *seg);
+static int __check_pathname(struct map_segment_desc *seg);
+
+int mca_memheap_base_static_init(mca_memheap_map_t *map)
+{
+    /* read and parse segments from /proc/self/maps */
+    int ret = OSHMEM_SUCCESS;
+
+    assert(map);
+    assert(SYMB_SEG_INDEX <= map->n_segments);
+
+    ret = __load_segments();
+
+    if (OSHMEM_SUCCESS == ret) {
+        int i;
+        size_t total_mem;
+
+        for (i = 0, total_mem = 0; i < memheap_context.n_segments; i++) {
+            map_segment_t *s = &map->mem_segs[map->n_segments];
+
+            memset(s, 0, sizeof(*s));
+            s->is_active = 0;
+            s->shmid = MEMHEAP_SHM_INVALID;
+            s->start = memheap_context.mem_segs[i].start;
+            s->end = memheap_context.mem_segs[i].end;
+            s->size = s->end - s->start;
+            s->type = MAP_SEGMENT_STATIC;
+            s->context = NULL;
+            map->n_segments++;
+
+            total_mem += s->end - s->start;
+        }
+        MEMHEAP_VERBOSE(1,
+                        "Memheap static memory: %llu byte(s), %d segments",
+                        (unsigned long long)total_mem, map->n_segments);
+    }
+
+    return ret;
+}
+
+void mca_memheap_base_static_exit(mca_memheap_map_t *map)
+{
+    assert(map);
+}
+
+static int __check_perms(struct map_segment_desc *seg)
+{
+    if (!strcmp(seg->perms, "rw-p") || !strcmp(seg->perms, "rwxp"))
+        return OSHMEM_SUCCESS;
+
+    return OSHMEM_ERROR;
+}
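+
+/*
+ * For reference, a typical /proc/self/maps line parsed by
+ * __load_segments() looks like (fabricated example):
+ *
+ *     00601000-00602000 rw-p 00001000 08:01 1837443  /home/user/a.out
+ *
+ * __check_perms() above accepts it ("rw-p"), and __check_address() below
+ * keeps it only if it starts at or below the executable's _end symbol.
+ */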
+
+static int __check_address(struct map_segment_desc *seg)
+{
+    extern unsigned _end;
+    unsigned long data_end = (unsigned long) &_end;
+
+    /**
+     * SGI SHMEM only supports globals & statics in the main program;
+     * it does not support them in shared objects or in dlopen()
+     * (clarified at the PGAS 2011 tutorial).
+     *
+     * So ignore any maps that start higher than the process's _end.
+     * FIXME: make sure we do not register the symmetric heap twice
+     * if we decide to allow shared objects
+     */
+    if (seg->start > data_end) {
+        MEMHEAP_VERBOSE(100,
+                        "skip segment: data _end < segment start (%llx < %llx)",
+                        (unsigned long long)data_end, (unsigned long long)seg->start);
+        return OSHMEM_ERROR;
+    }
+    return OSHMEM_SUCCESS;
+}
+
+static int __check_pathname(struct map_segment_desc *seg)
+{
+    /* Probably we should check the path we found, but the following code
+     * is disabled to suppress a Coverity issue
+     */
+#if 0
+    char *p;
+    if ('\0' == seg->pathname[0])
+        return OSHMEM_SUCCESS;
+
+    if (0 == strncmp(seg->pathname, "/lib", 4))
+        return OSHMEM_ERROR;
+
+    if (0 == strncmp(seg->pathname, "/usr/lib", 8))
+        return OSHMEM_ERROR;
+
+    if (0 == strncmp(seg->pathname, "/dev", 4))
+        return OSHMEM_ERROR;
+
+    if (0 == strcmp(seg->pathname, "[stack]"))
+        return OSHMEM_ERROR;
+
+    if (0 == strcmp(seg->pathname, "[vdso]"))
+        return OSHMEM_ERROR;
+
+    if (0 == strcmp(seg->pathname, "[vsyscall]"))
+        return OSHMEM_ERROR;
+
+    p = rindex(seg->pathname, '/');
+    if (p) {
+        if (0 == strncmp(p+1, "libshmem.so", 11))
+            return OSHMEM_ERROR;
+
+        if (0 == strncmp(p+1, "libmpi.so", 9))
+            return OSHMEM_ERROR;
+
+        if (0 == strncmp(p+1, "libmca_common_sm.so", 19))
+            return OSHMEM_ERROR;
+    }
+#endif
+    return OSHMEM_SUCCESS;
+}
+
+static int __load_segments(void)
+{
+    FILE *fp;
+    char line[1024];
+    struct map_segment_desc seg;
+
+    memheap_context.n_segments = 0;
+
+    fp = fopen("/proc/self/maps", "r");
+    if (NULL == fp) {
+        MEMHEAP_ERROR("Failed to open /proc/self/maps");
+        return OSHMEM_ERROR;
+    }
+
+    while (NULL != fgets(line, sizeof(line), fp)) {
+        memset(&seg, 0, sizeof(seg));
+        sscanf(line,
+               "%llx-%llx %s %llx %s %llx %s",
+               (long long *) &seg.start,
+               (long long *) &seg.end,
+               seg.perms,
+               (long long *) &seg.offset,
+               seg.dev,
+               (long long *) &seg.inode,
+               seg.pathname);
+
+        if (OSHMEM_ERROR == __check_address(&seg))
+            continue;
+
+        if (OSHMEM_ERROR == __check_pathname(&seg))
+            continue;
+
+        if (OSHMEM_ERROR == __check_perms(&seg))
+            continue;
+
+        MEMHEAP_VERBOSE(5, "add: %s", line);
+        if (MCA_MEMHEAP_MAX_SEGMENTS <= memheap_context.n_segments) {
+            MEMHEAP_ERROR("too many segments (max = %d): skip %s",
+                          MCA_MEMHEAP_MAX_SEGMENTS, line);
+            continue;
+        }
+        if (memheap_context.n_segments > 0
+                && seg.start
+                        == memheap_context.mem_segs[memheap_context.n_segments
+                                - 1].end) {
+            MEMHEAP_VERBOSE(5, "Coalescing segment");
+            memheap_context.mem_segs[memheap_context.n_segments - 1].end =
+                    seg.end;
+        } else {
+            memheap_context.mem_segs[memheap_context.n_segments].start =
+                    seg.start;
+            memheap_context.mem_segs[memheap_context.n_segments].end = seg.end;
+            memheap_context.n_segments++;
+        }
+    }
+
+    fclose(fp);
+    return OSHMEM_SUCCESS;
+}
diff --git a/oshmem/mca/memheap/buddy/Makefile.am b/oshmem/mca/memheap/buddy/Makefile.am
new file mode 100644
index 0000000000..73a1d4d78f
--- /dev/null
+++ b/oshmem/mca/memheap/buddy/Makefile.am
@@ -0,0 +1,41 @@
+#
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+# All rights reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +buddy_sources = \ + memheap_buddy.c \ + memheap_buddy.h \ + memheap_buddy_component.c \ + memheap_buddy_component.h + +#if OMPI_BUILD_memheap_buddy_DSO +if MCA_BUILD_ompi_pml_ob1_DSO +component_noinst = +component_install = mca_memheap_buddy.la +else +component_noinst = libmca_memheap_buddy.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_memheap_buddy_la_SOURCES = $(buddy_sources) +mca_memheap_buddy_la_LDFLAGS = -module -avoid-version + +#noinst_LTLIBRARIES = $(lib) +noinst_LTLIBRARIES = $(component_noinst) +libmca_memheap_buddy_la_SOURCES = $(buddy_sources) +libmca_memheap_buddy_la_LDFLAGS = -module -avoid-version + + + diff --git a/oshmem/mca/memheap/buddy/configure.params b/oshmem/mca/memheap/buddy/configure.params new file mode 100644 index 0000000000..1b6b5ba51c --- /dev/null +++ b/oshmem/mca/memheap/buddy/configure.params @@ -0,0 +1,13 @@ +# -*- shell-script -*- +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/memheap/buddy/memheap_buddy.c b/oshmem/mca/memheap/buddy/memheap_buddy.c new file mode 100644 index 0000000000..2a7062ff6d --- /dev/null +++ b/oshmem/mca/memheap/buddy/memheap_buddy.c @@ -0,0 +1,718 @@ +/* Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/buddy/memheap_buddy.h" +#include "oshmem/mca/memheap/buddy/memheap_buddy_component.h" +#include "oshmem/mca/memheap/base/base.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "opal/class/opal_hash_table.h" +#include "opal/class/opal_object.h" +#include "orte/util/name_fns.h" + +static int buddy_init(mca_memheap_buddy_module_t* buddy); + +mca_memheap_buddy_module_t memheap_buddy = { + { + &mca_memheap_buddy_component, + mca_memheap_buddy_finalize, + mca_memheap_buddy_alloc, + mca_memheap_buddy_align, + mca_memheap_buddy_realloc, + mca_memheap_buddy_free, + + mca_memheap_buddy_private_alloc, + mca_memheap_buddy_private_free, + + mca_memheap_base_get_cached_mkey, + mca_memheap_base_get_mkey, + mca_memheap_base_find_offset, + mca_memheap_base_is_symmetric_addr, + mca_memheap_modex_recv_all, + + 0 + }, + 1 /* priority */ +}; + +/* Memory Heap Buddy Implementation */ + +/* Static inline functions */ +static inline unsigned int bits_per_long(void) +{ + return BITS_PER_BYTE * sizeof(unsigned long); +} + +static inline void bitmap_zero(unsigned long *dst, unsigned long nbits) +{ + unsigned long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + memset(dst, 0, len); +} + +/* + * WARNING: Non atomic version. + */ +static inline void __clear_bit(unsigned long nr, volatile void * addr) +{ + int *m = ((int *) addr) + (nr >> 5); + *m &= ~(1 << (nr & 31)); +} + +/* + * WARNING: non atomic version. 
+ */ +static inline void __set_bit(unsigned long nr, volatile void * addr) +{ + int *m = ((int *) addr) + (nr >> 5); + *m |= 1 << (nr & 31); +} + +static inline int test_bit(int nr, const volatile void * addr) +{ + return (1UL & (((const int *) addr)[nr >> 5] >> (nr & 31))) != 0UL; +} + +/* + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline __opal_attribute_always_inline__ unsigned long __ffs(unsigned long word) +{ + int num = 0; + + if(bits_per_long() == 64) { + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } + } + + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf) == 0) { + num += 4; + word >>= 4; + } + if ((word & 0x3) == 0) { + num += 2; + word >>= 2; + } + if ((word & 0x1) == 0) + num += 1; + return num; +} + +/* round up to next power of two */ +static inline unsigned memheap_buddy_find_order(unsigned long size) +{ + unsigned order; + + if (size & (size - 1)) + order = 1; + else + order = 0; + + while (size >>= 1) { + order++; + } + return order; +} + +/* + * find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ + +static inline unsigned long find_next_bit(const unsigned long *addr, + unsigned long size, + unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(bits_per_long() - 1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= bits_per_long(); + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < bits_per_long()) + goto found_first; + if (tmp) + goto found_middle; + size -= bits_per_long(); + result += bits_per_long(); + } + while (size & ~(bits_per_long() - 1)) { + if ((tmp = *(p++))) + goto found_middle; + result += bits_per_long(); + size -= bits_per_long(); + } + if (!size) + return result; + tmp = *p; + + found_first: tmp &= (~0UL >> (bits_per_long() - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. 
*/ + found_middle: return result + __ffs(tmp); +} + +/** + * Initialize the Memory Heap + */ +int mca_memheap_buddy_module_init(memheap_context_t *context) +{ + if (!context || !context->user_size || !context->private_size) { + return OSHMEM_ERR_BAD_PARAM; + } + + /* Construct a mutex object */ + OBJ_CONSTRUCT(&memheap_buddy.lock, opal_mutex_t); + + memheap_buddy.heap.max_order = memheap_log2(context->user_size); + memheap_buddy.heap.min_order = MEMHEAP_BASE_MIN_ORDER; + memheap_buddy.private_heap.max_order = memheap_log2(context->private_size); + memheap_buddy.private_heap.min_order = MEMHEAP_BASE_MIN_ORDER; + + if (context->user_size != (1ULL << memheap_buddy.heap.max_order)) { + MEMHEAP_VERBOSE(1, + "Memheap rounded to the nearest power of two: requested %llu bytes, allocated %llu bytes", + (unsigned long long)context->user_size, 1ULL << memheap_buddy.heap.max_order); + } + + assert(context->private_size == (1ULL << memheap_buddy.private_heap.max_order)); + + memheap_buddy.heap.symmetric_heap = context->user_base_addr; + memheap_buddy.private_heap.symmetric_heap = context->private_base_addr; + + memheap_buddy.super.memheap_size = (1ULL << memheap_buddy.heap.max_order); + + MEMHEAP_VERBOSE(1, + "symmetric heap memory (user+private): %llu bytes", + (unsigned long long)(context->user_size + context->private_size)); + + /* Initialize buddy allocator */ + if (OSHMEM_SUCCESS != buddy_init(&memheap_buddy)) { + MEMHEAP_ERROR("Failed to setup MEMHEAP buddy allocator"); + goto err; + } + + return OSHMEM_SUCCESS; + + err: mca_memheap_buddy_finalize(); + return OSHMEM_ERROR; +} + +static int buddy_init(mca_memheap_buddy_module_t* buddy) +{ + unsigned long long total_size; + unsigned i; + unsigned long long s; + + /* Allocate and init Hashtable */ + memheap_buddy.heap.symmetric_heap_hashtable = OBJ_NEW(opal_hash_table_t); + if (NULL == memheap_buddy.heap.symmetric_heap_hashtable) { + MEMHEAP_ERROR("Opal failed to allocate hashtable object"); + goto err; + } + memheap_buddy.private_heap.symmetric_heap_hashtable = + OBJ_NEW(opal_hash_table_t); + if (NULL == memheap_buddy.private_heap.symmetric_heap_hashtable) { + MEMHEAP_ERROR("Opal failed to allocate hashtable object"); + goto err; + } + + opal_hash_table_init(memheap_buddy.heap.symmetric_heap_hashtable, + DEFAULT_HASHTABLE_SIZE); + opal_hash_table_init(memheap_buddy.private_heap.symmetric_heap_hashtable, + DEFAULT_HASHTABLE_SIZE); + /* Init Buddy Allocator */ + buddy->heap.bits = (unsigned long**) calloc((buddy->heap.max_order + 1), + sizeof(unsigned long *)); + buddy->private_heap.bits = + (unsigned long**) calloc((buddy->private_heap.max_order + 1), + sizeof(unsigned long *)); + buddy->heap.num_free = (unsigned int*) calloc((buddy->heap.max_order + 1), + sizeof(unsigned int)); + buddy->private_heap.num_free = + (unsigned int*) calloc((buddy->private_heap.max_order + 1), + sizeof(unsigned int)); + if ((NULL == buddy->heap.bits) || (NULL == buddy->heap.num_free) + || (NULL == buddy->private_heap.bits) + || (NULL == buddy->private_heap.num_free)) { + + MEMHEAP_ERROR("Failed to allocate buddy allocator"); + goto err; + } + + total_size = 0; + for (i = buddy->heap.min_order; i <= buddy->heap.max_order; ++i) { + s = BITS_TO_LONGS(1UL << (buddy->heap.max_order - i)); + MEMHEAP_VERBOSE(20, + "%d: (order=%d) allocating %llu longs (sizeof long = %d)", + i, buddy->heap.max_order, s, (int)sizeof(unsigned long)); + total_size += s * sizeof(unsigned long); + buddy->heap.bits[i] = (unsigned long*) malloc(s + * sizeof(unsigned long)); + if (NULL == 
buddy->heap.bits[i]) {
+            MEMHEAP_ERROR("Failed to allocate the buddy allocator bitmap");
+            goto err;
+        }
+        bitmap_zero(buddy->heap.bits[i], 1UL << (buddy->heap.max_order - i));
+    }
+    MEMHEAP_VERBOSE(5, "MEMHEAP metadata size = %llu bytes", total_size);
+
+    total_size = 0;
+    for (i = buddy->private_heap.min_order; i <= buddy->private_heap.max_order;
+         ++i) {
+        s = BITS_TO_LONGS(1UL << (buddy->private_heap.max_order - i));
+        MEMHEAP_VERBOSE(20,
+                        "%d: (order=%d) allocating %llu longs (sizeof long = %d)",
+                        i, buddy->private_heap.max_order, s, (int)sizeof(unsigned long));
+        total_size += s * sizeof(unsigned long);
+        buddy->private_heap.bits[i] = (unsigned long*) malloc(s
+                * sizeof(unsigned long));
+        if (NULL == buddy->private_heap.bits[i]) {
+            MEMHEAP_ERROR("Failed to allocate the buddy allocator bitmap");
+            goto err;
+        }
+        bitmap_zero(buddy->private_heap.bits[i],
+                    1UL << (buddy->private_heap.max_order - i));
+    }
+    MEMHEAP_VERBOSE(5,
+                    "private MEMHEAP metadata size = %llu bytes",
+                    total_size);
+
+    /* initially each heap is a single free block of maximal order */
+    set_bit(0, buddy->heap.bits[buddy->heap.max_order]);
+    set_bit(0, buddy->private_heap.bits[buddy->private_heap.max_order]);
+    buddy->heap.num_free[buddy->heap.max_order] = 1;
+    buddy->private_heap.num_free[buddy->private_heap.max_order] = 1;
+
+    return OSHMEM_SUCCESS;
+
+err:
+    return OSHMEM_ERROR;
+}
+
+static int buddy_cleanup(mca_memheap_buddy_module_t* buddy)
+{
+    unsigned int i;
+
+    MEMHEAP_VERBOSE(5, "buddy cleanup");
+    if (NULL == buddy) {
+        return OSHMEM_SUCCESS;
+    }
+
+    for (i = 0; i <= buddy->heap.max_order; ++i) {
+        if (NULL != buddy->heap.bits && NULL != buddy->heap.bits[i]) {
+            free(buddy->heap.bits[i]);
+        }
+    }
+
+    for (i = 0; i <= buddy->private_heap.max_order; ++i) {
+        if (NULL != buddy->private_heap.bits
+                && NULL != buddy->private_heap.bits[i]) {
+            free(buddy->private_heap.bits[i]);
+        }
+    }
+
+    if (NULL != buddy->heap.bits) {
+        free(buddy->heap.bits);
+    }
+    if (NULL != buddy->heap.num_free) {
+        free(buddy->heap.num_free);
+    }
+
+    if (NULL != buddy->private_heap.bits) {
+        free(buddy->private_heap.bits);
+    }
+    if (NULL != buddy->private_heap.num_free) {
+        free(buddy->private_heap.num_free);
+    }
+
+    OBJ_DESTRUCT(&buddy->lock);
+    return OSHMEM_SUCCESS;
+}
+
+static int _buddy_alloc(unsigned order,
+                        uint32_t* seg,
+                        mca_memheap_buddy_heap_t *heap)
+{
+    uint32_t o;
+    uint32_t m;
+
+    MEMHEAP_VERBOSE(20, "order=%d size=%d", order, 1<<order);
+    OPAL_THREAD_LOCK(&memheap_buddy.lock);
+
+    /* find the smallest order >= the requested one with a free block */
+    for (o = order; o <= heap->max_order; ++o) {
+        if (heap->num_free[o]) {
+            m = 1 << (heap->max_order - o);
+            *seg = find_first_bit(heap->bits[o], m);
+            MEMHEAP_VERBOSE(20,
+                            "found free bit: order=%d, bits=0x%lx m=%d, *seg=%d",
+                            o, heap->bits[o][0], m, *seg);
+            if (*seg < m)
+                goto found;
+        }
+    }
+
+    OPAL_THREAD_UNLOCK(&memheap_buddy.lock);
+    return OSHMEM_ERROR;
+
+found:
+    clear_bit(*seg, heap->bits[o]);
+    --(heap->num_free[o]);
+
+    /* split the block down to the requested order, marking each buddy free */
+    while (o > order) {
+        --o;
+        *seg <<= 1;
+        set_bit(*seg ^ 1, heap->bits[o]);
+        ++(heap->num_free[o]);
+    }
+
+    OPAL_THREAD_UNLOCK(&memheap_buddy.lock);
+    *seg <<= order;
+
+    return OSHMEM_SUCCESS;
+}
+
+static int _buddy_free(mca_memheap_buddy_module_t* buddy,
+                       uint32_t seg,
+                       unsigned order,
+                       mca_memheap_buddy_heap_t *heap)
+{
+    MEMHEAP_VERBOSE(20, "order=%d size=%d seg=%d", order, 1<<order, seg);
+
+    seg >>= order;
+    OPAL_THREAD_LOCK(&buddy->lock);
+
+    /* coalesce with the buddy block as long as it is also free */
+    while (test_bit(seg ^ 1, heap->bits[order])) {
+        clear_bit(seg ^ 1, heap->bits[order]);
+        --(heap->num_free[order]);
+        seg >>= 1;
+        ++order;
+    }
+
+    set_bit(seg, heap->bits[order]);
+    ++(heap->num_free[order]);
+    OPAL_THREAD_UNLOCK(&buddy->lock);
+    return OSHMEM_SUCCESS;
+}
+
+static int buddy_free(mca_memheap_buddy_module_t* buddy,
+                      uint32_t 
seg, + unsigned order) +{ + return _buddy_free(buddy, seg, order, &buddy->heap); +} + +static int buddy_private_free(mca_memheap_buddy_module_t* buddy, + uint32_t seg, + unsigned order) +{ + return _buddy_free(buddy, seg, order, &buddy->private_heap); +} + +static int _do_alloc(uint32_t order, + void **p_buff, + mca_memheap_buddy_heap_t *heap) +{ + int rc; + unsigned long base; + uint32_t offset; + unsigned long addr; + + if (order < heap->min_order) + order = heap->min_order; + + *p_buff = 0; + if (order > heap->max_order) { + /* Test allocated size overflow */ + MEMHEAP_VERBOSE(5, "Allocation overflow of symmetric heap size"); + return OSHMEM_ERROR; + } + + base = (unsigned long) heap->symmetric_heap; + + if (OSHMEM_SUCCESS != _buddy_alloc(order, &offset, heap)) { + MEMHEAP_VERBOSE(5, "Buddy Allocator failed to return a base address"); + return OSHMEM_ERROR; + } + + /* Save the order of the allocated variable */ + addr = base + offset; + + rc = opal_hash_table_set_value_uint64(heap->symmetric_heap_hashtable, + addr, + (void *) (unsigned long) order); + + if (OPAL_SUCCESS != rc) { + MEMHEAP_VERBOSE(5, "Failed to insert order to hashtable"); + goto alloc_error; + } + + *p_buff = (void*) addr; + /* no barrier because it is not required by spec! */ + return OSHMEM_SUCCESS; + + alloc_error: _buddy_free(&memheap_buddy, offset, order, heap); + return OSHMEM_ERROR; +} + +static int do_alloc(uint32_t order, void **p_buff) +{ + return _do_alloc(order, p_buff, &(memheap_buddy.heap)); +} + +static int do_private_alloc(uint32_t order, void **p_buff) +{ + return _do_alloc(order, p_buff, &(memheap_buddy.private_heap)); +} + +/** + * Allocate size bytes on the symmetric heap. + * The allocated variable is aligned to its size. + */ +int mca_memheap_buddy_alloc(size_t size, void** p_buff) +{ + + uint32_t order; + + order = memheap_buddy_find_order(size); + + return do_alloc(order, p_buff); +} + +int mca_memheap_buddy_private_alloc(size_t size, void** p_buff) +{ + uint32_t order; + int status = 0; + order = memheap_buddy_find_order(size); + + status = do_private_alloc(order, p_buff); + + MEMHEAP_VERBOSE(20, "private alloc addr: %p", *p_buff); + + return status; +} + +int mca_memheap_buddy_private_free(void* ptr) +{ + int rc; + uint32_t offset; + unsigned long addr; + unsigned long base; + void *order; + + if (0 == ptr) { + return OSHMEM_SUCCESS; + } + + base = (unsigned long) memheap_buddy.private_heap.symmetric_heap; + addr = (unsigned long) ptr; + offset = addr - base; + + rc = + opal_hash_table_get_value_uint64(memheap_buddy.private_heap.symmetric_heap_hashtable, + addr, + &order); + if (OPAL_SUCCESS != rc) { + return OSHMEM_ERROR; + } + + buddy_private_free(&memheap_buddy, + offset, + (unsigned) (unsigned long) order); + opal_hash_table_remove_value_uint64(memheap_buddy.private_heap.symmetric_heap_hashtable, + addr); + + return OSHMEM_SUCCESS; +} + +int mca_memheap_buddy_align(size_t align, size_t size, void **p_buff) +{ + uint32_t order; + + if (align == 0) { + *p_buff = 0; + return OSHMEM_ERROR; + } + + /* check that align is power of 2 */ + if (align & (align - 1)) { + *p_buff = 0; + return OSHMEM_ERROR; + } + + order = memheap_buddy_find_order(size); + if ((unsigned long) align > (1UL << order)) + order = memheap_buddy_find_order(align); + + return do_alloc(order, p_buff); +} + +int mca_memheap_buddy_realloc(size_t new_size, void *p_buff, void **p_new_buff) +{ + int rc; + unsigned long addr; + void *order; + size_t old_size; + char *tmp_buf; + + /* equiv to alloc if old ptr is null */ + if (NULL == 
p_buff) + return mca_memheap_buddy_alloc(new_size, p_new_buff); + + addr = (unsigned long) p_buff; + + rc = + opal_hash_table_get_value_uint64(memheap_buddy.heap.symmetric_heap_hashtable, + addr, + &order); + if (OPAL_SUCCESS != rc) { + *p_new_buff = NULL; + return OSHMEM_ERROR; + } + + /* equiv to free if new_size is 0 */ + if (0 == new_size) { + *p_new_buff = NULL; + return mca_memheap_buddy_free(p_buff); + } + + old_size = 1UL << (unsigned long) order; + + /* do nothing if new size is less then current size */ + if (new_size <= old_size) { + *p_new_buff = p_buff; + return OSHMEM_SUCCESS; + } + + if (new_size > (1UL << memheap_buddy.heap.max_order)) { + *p_new_buff = NULL; + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + if (old_size + new_size >= (1UL << memheap_buddy.heap.max_order)) { + /* copy via temporary buffer */ + + tmp_buf = (char *) malloc(old_size); + if (!tmp_buf) + return OSHMEM_ERR_OUT_OF_RESOURCE; + memcpy(tmp_buf, p_buff, old_size); + mca_memheap_buddy_free(p_buff); + } else + tmp_buf = p_buff; + + /* alloc and copy data to new buffer, free old one */ + rc = mca_memheap_buddy_alloc(new_size, p_new_buff); + if (OSHMEM_SUCCESS != rc) { + *p_new_buff = NULL; + if (old_size + new_size >= (1UL << memheap_buddy.heap.max_order) + && tmp_buf) { + free(tmp_buf); + } + return rc; + } + + memcpy(*p_new_buff, tmp_buf, old_size); + + if (old_size + new_size < (1UL << memheap_buddy.heap.max_order)) + mca_memheap_buddy_free(p_buff); + else if (tmp_buf) + free(tmp_buf); + + return OSHMEM_SUCCESS; +} + +/* + * Free a variable allocated on the + * symmetric heap. + */ +int mca_memheap_buddy_free(void* ptr) +{ + int rc; + uint32_t offset; + unsigned long addr; + unsigned long base; + void *order; + + base = (unsigned long) memheap_buddy.heap.symmetric_heap; + addr = (unsigned long) ptr; + offset = addr - base; + + rc = + opal_hash_table_get_value_uint64(memheap_buddy.heap.symmetric_heap_hashtable, + addr, + &order); + if (OPAL_SUCCESS != rc) { + return OSHMEM_ERROR; + } + + buddy_free(&memheap_buddy, offset, (unsigned) (unsigned long) order); + opal_hash_table_remove_value_uint64(memheap_buddy.heap.symmetric_heap_hashtable, + addr); + + return OSHMEM_SUCCESS; +} + +int mca_memheap_buddy_finalize() +{ + MEMHEAP_VERBOSE(5, "deregistering symmetric heap"); + + /* was not initialized - do nothing */ + if (memheap_buddy.heap.max_order == 0) + return OSHMEM_SUCCESS; + + /* Destruct hashtable supporting shfree of symmetric heap variables */ + if (memheap_buddy.heap.symmetric_heap_hashtable) { + OBJ_RELEASE(memheap_buddy.heap.symmetric_heap_hashtable); + } + if (memheap_buddy.private_heap.symmetric_heap_hashtable) { + OBJ_RELEASE(memheap_buddy.private_heap.symmetric_heap_hashtable); + } + + buddy_cleanup(&memheap_buddy); + + return OSHMEM_SUCCESS; +} + +/** + * Return the base address of the symmetric heap. + */ + +static inline void* mca_memheap_buddy_get_symmetric_heap_base_addr(void) +{ + return memheap_buddy.heap.symmetric_heap; +} + +/** + * Return the last address in the symmetric heap. 
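+ * (one byte past the end of the combined user + private region:
+ * symmetric_heap base + 2^heap.max_order bytes of user heap +
+ * 2^private_heap.max_order bytes of private heap, which are laid
+ * out back to back).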
+ */
+static inline void* mca_memheap_buddy_get_symmetric_heap_last_addr(void)
+{
+    return (void*) ((unsigned char*) (memheap_buddy.heap.symmetric_heap)
+            + (1ULL << memheap_buddy.heap.max_order)
+            + (1ULL << memheap_buddy.private_heap.max_order));
+}
+
diff --git a/oshmem/mca/memheap/buddy/memheap_buddy.h b/oshmem/mca/memheap/buddy/memheap_buddy.h
new file mode 100644
index 0000000000..e3b051abf1
--- /dev/null
+++ b/oshmem/mca/memheap/buddy/memheap_buddy.h
@@ -0,0 +1,87 @@
+/**
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ * Buddy allocator implementation of the MEMHEAP framework
+ */
+#ifndef MCA_MEMHEAP_BUDDY_H
+#define MCA_MEMHEAP_BUDDY_H
+
+#include "oshmem_config.h"
+#include "opal/mca/mca.h"
+#include "opal/class/opal_list.h"
+#include "opal/threads/mutex.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "oshmem/mca/spml/spml.h"
+#include "opal/class/opal_hash_table.h"
+#include "ompi/mca/btl/btl.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define BITS_PER_BYTE 8
+#define __BITOPS_WORDSIZE 64
+#define DEFAULT_HASHTABLE_SIZE 100
+
+#define BITOP_WORD(nr) ((nr) / bits_per_long())
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(unsigned long))
+#define __BITOPS_WORDS(bits) (((bits)+__BITOPS_WORDSIZE-1)/__BITOPS_WORDSIZE)
+#define clear_bit(x,y) __clear_bit((x), (y))
+#define set_bit(x,y) __set_bit((x), (y))
+#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
+
+BEGIN_C_DECLS
+
+struct mca_memheap_buddy_heap_t {
+    unsigned long **bits;       /**< per-order bitmaps of free blocks */
+    unsigned *num_free;         /**< number of free blocks per order */
+    unsigned max_order;         /**< log2 of the maximal heap size */
+    unsigned min_order;         /**< minimal allocation order */
+    void* symmetric_heap;       /**< base address of the symmetric heap */
+    opal_hash_table_t* symmetric_heap_hashtable; /**< maps allocated address to its block order (used by free) */
+};
+typedef struct mca_memheap_buddy_heap_t mca_memheap_buddy_heap_t;
+
+/* Structure for managing the SHMEM symmetric heap */
+struct mca_memheap_buddy_module_t {
+    mca_memheap_base_module_t super;
+
+    int priority;               /**< module priority */
+    mca_memheap_buddy_heap_t heap;
+    mca_memheap_buddy_heap_t private_heap;
+    opal_mutex_t lock;          /**< protects the buddy allocator state */
+};
+typedef struct mca_memheap_buddy_module_t mca_memheap_buddy_module_t;
+OSHMEM_DECLSPEC extern mca_memheap_buddy_module_t memheap_buddy;
+
+/*
+ * Buddy interface.
+ * Note the differences from the generic memheap interface.
+ */
+OSHMEM_DECLSPEC extern int mca_memheap_buddy_module_init(memheap_context_t *);
+OSHMEM_DECLSPEC extern int mca_memheap_buddy_alloc(size_t, void**);
+OSHMEM_DECLSPEC extern int mca_memheap_buddy_realloc(size_t, void*, void **);
+OSHMEM_DECLSPEC extern int mca_memheap_buddy_align(size_t, size_t, void**);
+OSHMEM_DECLSPEC extern int mca_memheap_buddy_free(void*);
+OSHMEM_DECLSPEC extern int mca_memheap_buddy_finalize(void);
+
+/* private alloc/free functions */
+OSHMEM_DECLSPEC extern int mca_memheap_buddy_private_alloc(size_t, void**);
+OSHMEM_DECLSPEC extern int mca_memheap_buddy_private_free(void*);
+
+/**
+ * static/global variables support. 
Consider making it a separate component
+ */
+
+END_C_DECLS
+
+#endif /* MCA_MEMHEAP_BUDDY_H */
diff --git a/oshmem/mca/memheap/buddy/memheap_buddy_component.c b/oshmem/mca/memheap/buddy/memheap_buddy_component.c
new file mode 100644
index 0000000000..ca71e861cc
--- /dev/null
+++ b/oshmem/mca/memheap/buddy/memheap_buddy_component.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "oshmem_config.h"
+#include "opal/util/output.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "oshmem/mca/memheap/buddy/memheap_buddy.h"
+#include "memheap_buddy_component.h"
+
+static int mca_memheap_buddy_component_close(void);
+static mca_memheap_base_module_t* mca_memheap_buddy_component_init(memheap_context_t *,
+                                                                   int *);
+
+static int __basic_open(void);
+
+mca_memheap_base_component_t mca_memheap_buddy_component = {
+    {
+        MCA_MEMHEAP_BASE_VERSION_2_0_0,
+
+        "buddy",                /* MCA component name */
+        OSHMEM_MAJOR_VERSION,   /* MCA component major version */
+        OSHMEM_MINOR_VERSION,   /* MCA component minor version */
+        OSHMEM_RELEASE_VERSION, /* MCA component release version */
+
+        __basic_open,
+        mca_memheap_buddy_component_close,
+        NULL
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+    mca_memheap_buddy_component_init
+};
+
+/* Open component */
+static int __basic_open(void)
+{
+    return OSHMEM_SUCCESS;
+}
+
+/* Initialize component */
+static mca_memheap_base_module_t* mca_memheap_buddy_component_init(memheap_context_t *context,
+                                                                   int *priority)
+{
+    int rc;
+
+    *priority = memheap_buddy.priority;
+    rc = mca_memheap_buddy_module_init(context);
+    if (OSHMEM_SUCCESS != rc) {
+        return NULL;
+    }
+
+    return &(memheap_buddy.super);
+}
+
+/*
+ * This function is automatically called from mca_base_components_close.
+ * It releases the component's allocated memory.
+ */
+static int mca_memheap_buddy_component_close(void)
+{
+    mca_memheap_buddy_finalize();
+    return OSHMEM_SUCCESS;
+}
diff --git a/oshmem/mca/memheap/buddy/memheap_buddy_component.h b/oshmem/mca/memheap/buddy/memheap_buddy_component.h
new file mode 100644
index 0000000000..d0fbd093c7
--- /dev/null
+++ b/oshmem/mca/memheap/buddy/memheap_buddy_component.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+
+#ifndef MCA_MEMHEAP_BUDDY_COMPONENT_H
+#define MCA_MEMHEAP_BUDDY_COMPONENT_H
+
+BEGIN_C_DECLS
+
+/*
+ * MEMHEAP module functions.
+ */
+OSHMEM_MODULE_DECLSPEC extern mca_memheap_base_component_2_0_0_t mca_memheap_buddy_component;
+
+END_C_DECLS
+
+#endif
diff --git a/oshmem/mca/memheap/configure.m4 b/oshmem/mca/memheap/configure.m4
new file mode 100644
index 0000000000..9a61468ef4
--- /dev/null
+++ b/oshmem/mca/memheap/configure.m4
@@ -0,0 +1,19 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+#                    All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AC_DEFUN([MCA_oshmem_memheap_CONFIG],[
+    # configure all the components
+    MCA_CONFIGURE_FRAMEWORK($1, $2, 1)
+
+    # this is a direct callable component, so set that up. 
+    MCA_SETUP_DIRECT_CALL($1, $2)
+])
diff --git a/oshmem/mca/memheap/memheap.h b/oshmem/mca/memheap/memheap.h
new file mode 100644
index 0000000000..f1da5b8aa1
--- /dev/null
+++ b/oshmem/mca/memheap/memheap.h
@@ -0,0 +1,156 @@
+/**
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_MEMHEAP_H
+#define MCA_MEMHEAP_H
+#include "opal/mca/mca.h"
+#include "oshmem/constants.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/spml/spml.h"
+
+#define DEFAULT_SYMMETRIC_HEAP_SIZE 256
+#define SIZE_IN_MEGA_BYTES(size_in_mb) ((size_in_mb) * 1024 * 1024)
+
+BEGIN_C_DECLS
+struct mca_memheap_base_module_t;
+
+typedef struct memheap_context
+{
+    void*  user_base_addr;
+    void*  private_base_addr;
+    size_t user_size;
+    size_t private_size;
+} memheap_context_t;
+
+/**
+ * Component initialize
+ */
+typedef struct mca_memheap_base_module_t* (*mca_memheap_base_component_init_fn_t)(memheap_context_t *,
+                                                                                  int *priority);
+
+/*
+ * Symmetric heap allocation. Malloc-like interface.
+ */
+typedef int (*mca_memheap_base_module_alloc_fn_t)(size_t, void**);
+
+typedef int (*mca_memheap_base_module_memalign_fn_t)(size_t align,
+                                                     size_t size,
+                                                     void**);
+
+typedef int (*mca_memheap_base_module_realloc_fn_t)(size_t newsize,
+                                                    void *,
+                                                    void **);
+
+/*
+ * Symmetric heap free.
+ */
+typedef int (*mca_memheap_base_module_free_fn_t)(void*);
+
+/**
+ * Service functions
+ */
+typedef uint64_t (*mca_memheap_base_module_find_offset_fn_t)(int pe,
+                                                             int tr_id,
+                                                             unsigned long va,
+                                                             uint64_t rva);
+
+/**
+ * @return mkey suitable to access pe via the given transport id. rva is set
+ * to the virtual address mapping of (va) on the remote pe.
+ */
+typedef mca_spml_mkey_t * (*mca_memheap_base_module_get_cached_mkey_fn_t)(int pe,
+                                                                          unsigned long va,
+                                                                          int transport_id,
+                                                                          uint64_t *rva);
+typedef mca_spml_mkey_t * (*mca_memheap_base_module_get_local_mkey_fn_t)(unsigned long va,
+                                                                         int transport_id);
+
+/*
+ * Symmetric heap destructor.
+ */
+typedef int (*mca_memheap_base_module_finalize_fn_t)(void);
+
+typedef int (*mca_memheap_base_is_memheap_addr_fn_t)(unsigned long va);
+
+/* get mkeys from all ranks */
+typedef void (*mca_memheap_base_mkey_exchange_fn_t)(void);
+
+/*
+ * memheap component descriptor. Contains component version, information
+ * and init functions.
+ */
+struct mca_memheap_base_component_2_0_0_t {
+    mca_base_component_t memheap_version;       /**< version */
+    mca_base_component_data_t memheap_data;     /**< metadata */
+    mca_memheap_base_component_init_fn_t memheap_init; /**< component init function */
+};
+typedef struct mca_memheap_base_component_2_0_0_t mca_memheap_base_component_2_0_0_t;
+typedef mca_memheap_base_component_2_0_0_t mca_memheap_base_component_t;
+
+END_C_DECLS
+
+#endif /* MCA_MEMHEAP_H */
+/*
+  Note: this file is a modified copy of Doug Lea's malloc
+  (dlmalloc). Therefore, some of the comments below do not apply
+  for this modified version. However, it is the intention to keep
+  differences to Doug Lea's original version minimal, hence the
+  comments were mostly left unchanged.
+
+  -----------------------------------------------------------------------
+
+  This is a version (aka dlmalloc) of malloc/free/realloc written by
+  Doug Lea and released to the public domain, as explained at
+  http://creativecommons.org/licenses/publicdomain.  Send questions,
+  comments, complaints, performance data, etc to dl@cs.oswego.edu
+
+* Version pre-2.8.4 Wed Mar 29 19:46:29 2006    (dl at gee)
+
+   Note: There may be an updated version of this malloc obtainable at
+           ftp://gee.cs.oswego.edu/pub/misc/malloc.c
+         Check before installing!
+
+* Quickstart
+
+  This library is all in one file to simplify the most common usage:
+  ftp it, compile it (-O3), and link it into another program. 
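(As an
+  illustrative sketch of that workflow, not part of the original text:
+
+       cc -O3 -c malloc.c
+       cc -o app app.o malloc.o
+
+  is typically all that is needed.) 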
All of + the compile-time options default to reasonable values for use on + most platforms. You might later want to step through various + compile-time and dynamic tuning options. + + For convenience, an include file for code using this malloc is at: + ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.3.h + You don't really need this .h file unless you call functions not + defined in your system include files. The .h file contains only the + excerpts from this file needed for using this malloc on ANSI C/C++ + systems, so long as you haven't changed compile-time options about + naming and tuning parameters. If you do, then you can create your + own malloc.h that does include all settings by cutting at the point + indicated below. Note that you may already by default be using a C + library containing a malloc that is based on some version of this + malloc (for example in linux). You might still want to use the one + in this file to customize settings or to avoid overheads associated + with library versions. + +* Vital statistics: + + Supported pointer/size_t representation: 4 or 8 bytes + size_t MUST be an unsigned type of the same width as + pointers. (If you are using an ancient system that declares + size_t as a signed type, or need it to be a different width + than pointers, you can use a previous release of this malloc + (e.g. 2.7.2) supporting these.) + + Alignment: 8 bytes (default) + This suffices for nearly all current machines and C compilers. + However, you can define MALLOC_ALIGNMENT to be wider than this + if necessary (up to 128bytes), at the expense of using more space. + + Minimum overhead per allocated chunk: 4 or 8 bytes (if 4byte sizes) + 8 or 16 bytes (if 8byte sizes) + Each malloced chunk has a hidden word of overhead holding size + and status information, and additional cross-check word + if FOOTERS is defined. + + Minimum allocated size: 4-byte ptrs: 16 bytes (including overhead) + 8-byte ptrs: 32 bytes (including overhead) + + Even a request for zero bytes (i.e., malloc(0)) returns a + pointer to something of the minimum allocatable size. + The maximum overhead wastage (i.e., number of extra bytes + allocated than were requested in malloc) is less than or equal + to the minimum size, except for requests >= mmap_threshold that + are serviced via mmap(), where the worst case wastage is about + 32 bytes plus the remainder from a system page (the minimal + mmap unit); typically 4096 or 8192 bytes. + + Security: static-safe; optionally more or less + The "security" of malloc refers to the ability of malicious + code to accentuate the effects of errors (for example, freeing + space that is not currently malloc'ed or overwriting past the + ends of chunks) in code that calls malloc. This malloc + guarantees not to modify any memory locations below the base of + heap, i.e., static variables, even in the presence of usage + errors. The routines additionally detect most improper frees + and reallocs. All this holds as long as the static bookkeeping + for malloc itself is not corrupted by some other means. This + is only one aspect of security -- these checks do not, and + cannot, detect all possible programming errors. + + If FOOTERS is defined nonzero, then each allocated chunk + carries an additional check word to verify that it was malloced + from its space. These check words are the same within each + execution of a program using malloc, but differ across + executions, so externally crafted fake chunks cannot be + freed. 
This improves security by rejecting frees/reallocs that + could corrupt heap memory, in addition to the checks preventing + writes to statics that are always on. This may further improve + security at the expense of time and space overhead. (Note that + FOOTERS may also be worth using with MSPACES.) + + By default detected errors cause the program to abort (calling + "abort()"). You can override this to instead proceed past + errors by defining PROCEED_ON_ERROR. In this case, a bad free + has no effect, and a malloc that encounters a bad address + caused by user overwrites will ignore the bad address by + dropping pointers and indices to all known memory. This may + be appropriate for programs that should continue if at all + possible in the face of programming errors, although they may + run out of memory because dropped memory is never reclaimed. + + If you don't like either of these options, you can define + CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything + else. And if if you are sure that your program using malloc has + no errors or vulnerabilities, you can define INSECURE to 1, + which might (or might not) provide a small performance improvement. + + Thread-safety: NOT thread-safe unless USE_LOCKS defined + When USE_LOCKS is defined, each public call to malloc, free, + etc is surrounded with either a pthread mutex or a win32 + spinlock (depending on WIN32). This is not especially fast, and + can be a major bottleneck. It is designed only to provide + minimal protection in concurrent environments, and to provide a + basis for extensions. If you are using malloc in a concurrent + program, consider instead using nedmalloc + (http://www.nedprod.com/programs/portable/nedmalloc/) or + ptmalloc (See http://www.malloc.de), which are derived + from versions of this malloc. + + System requirements: Any combination of MORECORE and/or MMAP/MUNMAP + This malloc can use unix sbrk or any emulation (invoked using + the CALL_MORECORE macro) and/or mmap/munmap or any emulation + (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system + memory. On most unix systems, it tends to work best if both + MORECORE and MMAP are enabled. On Win32, it uses emulations + based on VirtualAlloc. It also uses common C library functions + like memset. + + Compliance: I believe it is compliant with the Single Unix Specification + (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably + others as well. + +* Overview of algorithms + + This is not the fastest, most space-conserving, most portable, or + most tunable malloc ever written. However it is among the fastest + while also being among the most space-conserving, portable and + tunable. Consistent balance across these factors results in a good + general-purpose allocator for malloc-intensive programs. + + In most ways, this malloc is a best-fit allocator. Generally, it + chooses the best-fitting existing chunk for a request, with ties + broken in approximately least-recently-used order. (This strategy + normally maintains low fragmentation.) However, for requests less + than 256bytes, it deviates from best-fit when there is not an + exactly fitting available chunk by preferring to use space adjacent + to that used for the previous small request, as well as by breaking + ties in approximately most-recently-used order. (These enhance + locality of series of small allocations.) And for very large requests + (>= 256Kb by default), it relies on system memory mapping + facilities, if supported. 
(This helps avoid carrying around and + possibly fragmenting memory used only for large chunks.) + + All operations (except malloc_stats and mallinfo) have execution + times that are bounded by a constant factor of the number of bits in + a size_t, not counting any clearing in calloc or copying in realloc, + or actions surrounding MORECORE and MMAP that have times + proportional to the number of non-contiguous regions returned by + system allocation routines, which is often just 1. In real-time + applications, you can optionally suppress segment traversals using + NO_SEGMENT_TRAVERSAL, which assures bounded execution even when + system allocators return non-contiguous spaces, at the typical + expense of carrying around more memory and increased fragmentation. + + The implementation is not very modular and seriously overuses + macros. Perhaps someday all C compilers will do as good a job + inlining modular code as can now be done by brute-force expansion, + but now, enough of them seem not to. + + Some compilers issue a lot of warnings about code that is + dead/unreachable only on some platforms, and also about intentional + uses of negation on unsigned types. All known cases of each can be + ignored. + + For a longer but out of date high-level description, see + http://gee.cs.oswego.edu/dl/html/malloc.html + +* MSPACES + If MSPACES is defined, then in addition to malloc, free, etc., + this file also defines mspace_malloc, mspace_free, etc. These + are versions of malloc routines that take an "mspace" argument + obtained using create_mspace, to control all internal bookkeeping. + If ONLY_MSPACES is defined, only these versions are compiled. + So if you would like to use this allocator for only some allocations, + and your system malloc for others, you can compile with + ONLY_MSPACES and then do something like... + static mspace mymspace = create_mspace(0,0); // for example + #define mymalloc(bytes) mspace_malloc(mymspace, bytes) + + (Note: If you only need one instance of an mspace, you can instead + use "USE_DL_PREFIX" to relabel the global malloc.) + + You can similarly create thread-local allocators by storing + mspaces as thread-locals. For example: + static __thread mspace tlms = 0; + void* tlmalloc(size_t bytes) { + if (tlms == 0) tlms = create_mspace(0, 0); + return mspace_malloc(tlms, bytes); + } + void tlfree(void* mem) { mspace_free(tlms, mem); } + + Unless FOOTERS is defined, each mspace is completely independent. + You cannot allocate from one and free to another (although + conformance is only weakly checked, so usage errors are not always + caught). If FOOTERS is defined, then each chunk carries around a tag + indicating its originating mspace, and frees are directed to their + originating spaces. + + ------------------------- Compile-time options --------------------------- + +Be careful in setting #define values for numerical constants of type +size_t. On some systems, literal values are not automatically extended +to size_t precision unless they are explicitly casted. You can also +use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below. + +WIN32 default: defined if _WIN32 defined + Defining WIN32 sets up defaults for MS environment and compilers. + Otherwise defaults are for unix. + +MALLOC_ALIGNMENT default: (size_t)8 + Controls the minimum alignment for malloc'ed chunks. It must be a + power of two and at least 8, even on machines for which smaller + alignments would suffice. It may be defined as larger than this + though. 
Note however that code and data structures are optimized for + the case of 8-byte alignment. + +MSPACES default: 0 (false) + If true, compile in support for independent allocation spaces. + This is only supported if DL_HAVE_MMAP is true. + +ONLY_MSPACES default: 0 (false) + If true, only compile in mspace versions, not regular versions. + +USE_LOCKS default: 0 (false) + Causes each call to each public routine to be surrounded with + pthread or WIN32 mutex lock/unlock. (If set true, this can be + overridden on a per-mspace basis for mspace versions.) If set to a + non-zero value other than 1, locks are used, but their + implementation is left out, so lock functions must be supplied manually. + +USE_SPIN_LOCKS default: 1 iff USE_LOCKS and on x86 using gcc or MSC + If true, uses custom spin locks for locking. This is currently + supported only for x86 platforms using gcc or recent MS compilers. + Otherwise, posix locks or win32 critical sections are used. + +FOOTERS default: 0 + If true, provide extra checking and dispatching by placing + information in the footers of allocated chunks. This adds + space and time overhead. + +INSECURE default: 0 + If true, omit checks for usage errors and heap space overwrites. + +USE_DL_PREFIX default: NOT defined + Causes compiler to prefix all public routines with the string 'dl'. + This can be useful when you only want to use this malloc in one part + of a program, using your regular system malloc elsewhere. + +ABORT default: defined as abort() + Defines how to abort on failed checks. On most systems, a failed + check cannot die with an "assert" or even print an informative + message, because the underlying print routines in turn call malloc, + which will fail again. Generally, the best policy is to simply call + abort(). It's not very useful to do more than this because many + errors due to overwriting will show up as address faults (null, odd + addresses etc) rather than malloc-triggered checks, so will also + abort. Also, most compilers know that abort() does not return, so + can better optimize code conditionally calling it. + +PROCEED_ON_ERROR default: defined as 0 (false) + Controls whether detected bad addresses cause them to bypassed + rather than aborting. If set, detected bad arguments to free and + realloc are ignored. And all bookkeeping information is zeroed out + upon a detected overwrite of freed heap space, thus losing the + ability to ever return it from malloc again, but enabling the + application to proceed. If PROCEED_ON_ERROR is defined, the + static variable malloc_corruption_error_count is compiled in + and can be examined to see if errors have occurred. This option + generates slower code than the default abort policy. + +DL_DEBUG default: NOT defined + The DL_DEBUG setting is mainly intended for people trying to modify + this code or diagnose problems when porting to new platforms. + However, it may also be able to better isolate user errors than just + using runtime checks. The assertions in the check routines spell + out in more detail the assumptions and invariants underlying the + algorithms. The checking is fairly extensive, and will slow down + execution noticeably. Calling malloc_stats or mallinfo with DL_DEBUG + set will attempt to check every non-mmapped allocated and free chunk + in the course of computing the summaries. 
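+
+  (Illustrative, not part of the original text: a debugging build might
+  combine several of the options above, e.g.
+
+       cc -O0 -g -DDL_DEBUG -DUSE_DL_PREFIX -DFOOTERS=1 -c malloc.c
+
+  where USE_DL_PREFIX keeps the system malloc available alongside the
+  dl-prefixed routines for comparison.)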
+ +ABORT_ON_ASSERT_FAILURE default: defined as 1 (true) + Debugging assertion failures can be nearly impossible if your + version of the assert macro causes malloc to be called, which will + lead to a cascade of further failures, blowing the runtime stack. + ABORT_ON_ASSERT_FAILURE cause assertions failures to call abort(), + which will usually make debugging easier. + +MALLOC_FAILURE_ACTION default: sets errno to ENOMEM, or no-op on win32 + The action to take before "return 0" when malloc fails to be able to + return memory because there is none available. + +HAVE_MORECORE default: 1 (true) unless win32 or ONLY_MSPACES + True if this system supports sbrk or an emulation of it. + +MORECORE default: sbrk + The name of the sbrk-style system routine to call to obtain more + memory. See below for guidance on writing custom MORECORE + functions. The type of the argument to sbrk/MORECORE varies across + systems. It cannot be size_t, because it supports negative + arguments, so it is normally the signed type of the same width as + size_t (sometimes declared as "intptr_t"). It doesn't much matter + though. Internally, we only call it with arguments less than half + the max value of a size_t, which should work across all reasonable + possibilities, although sometimes generating compiler warnings. See + near the end of this file for guidelines for creating a custom + version of MORECORE. + +MORECORE_CONTIGUOUS default: 1 (true) if HAVE_MORECORE + If true, take advantage of fact that consecutive calls to MORECORE + with positive arguments always return contiguous increasing + addresses. This is true of unix sbrk. It does not hurt too much to + set it true anyway, since malloc copes with non-contiguities. + Setting it false when definitely non-contiguous saves time + and possibly wasted space it would take to discover this though. + +MORECORE_CANNOT_TRIM default: NOT defined + True if MORECORE cannot release space back to the system when given + negative arguments. This is generally necessary only if you are + using a hand-crafted MORECORE function that cannot handle negative + arguments. + +NO_SEGMENT_TRAVERSAL default: 0 + If non-zero, suppresses traversals of memory segments + returned by either MORECORE or CALL_MMAP. This disables + merging of segments that are contiguous, and selectively + releasing them to the OS if unused, but bounds execution times. + +DL_HAVE_MMAP default: 1 (true) + True if this system supports mmap or an emulation of it. If so, and + HAVE_MORECORE is not true, MMAP is used for all system + allocation. If set and HAVE_MORECORE is true as well, MMAP is + primarily used to directly allocate very large blocks. It is also + used as a backup strategy in cases where MORECORE fails to provide + space from system. Note: A single call to MUNMAP is assumed to be + able to unmap memory that may have be allocated using multiple calls + to MMAP, so long as they are adjacent. + +DL_HAVE_MREMAP default: 1 on linux, else 0 + If true realloc() uses mremap() to re-allocate large blocks and + extend or shrink allocation spaces. + +MMAP_CLEARS default: 1 except on WINCE. + True if mmap clears memory so calloc doesn't need to. This is true + for standard unix mmap using /dev/zero and on WIN32 except for WINCE. + +USE_BUILTIN_FFS default: 0 (i.e., not used) + Causes malloc to use the builtin ffs() function to compute indices. + Some compilers may recognize and intrinsify ffs to be faster than the + supplied C version. 
Also, the case of x86 using gcc is special-cased + to an asm instruction, so is already as fast as it can be, and so + this setting has no effect. Similarly for Win32 under recent MS compilers. + (On most x86s, the asm version is only slightly faster than the C version.) + +malloc_getpagesize default: derive from system includes, or 4096. + The system page size. To the extent possible, this malloc manages + memory from the system in page-size units. This may be (and + usually is) a function rather than a constant. This is ignored + if WIN32, where page size is determined using getSystemInfo during + initialization. + +USE_DEV_RANDOM default: 0 (i.e., not used) + Causes malloc to use /dev/random to initialize secure magic seed for + stamping footers. Otherwise, the current time is used. + +NO_MALLINFO default: 0 + If defined, don't compile "mallinfo". This can be a simple way + of dealing with mismatches between system declarations and + those in this file. + +MALLINFO_FIELD_TYPE default: size_t + The type of the fields in the mallinfo struct. This was originally + defined as "int" in SVID etc, but is more usefully defined as + size_t. The value is used only if HAVE_USR_INCLUDE_MALLOC_H is not set + +REALLOC_ZERO_BYTES_FREES default: not defined + This should be set if a call to realloc with zero bytes should + be the same as a call to free. Some people think it should. Otherwise, + since this malloc returns a unique pointer for malloc(0), so does + realloc(p, 0). + +LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H +LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H, LACKS_ERRNO_H +LACKS_STDLIB_H default: NOT defined unless on WIN32 + Define these if your system does not have these header files. + You might need to manually insert some of the declarations they provide. + +DEFAULT_GRANULARITY default: page size if MORECORE_CONTIGUOUS, + system_info.dwAllocationGranularity in WIN32, + otherwise 64K. + Also settable using mallopt(M_GRANULARITY, x) + The unit for allocating and deallocating memory from the system. On + most systems with contiguous MORECORE, there is no reason to + make this more than a page. However, systems with MMAP tend to + either require or encourage larger granularities. You can increase + this value to prevent system allocation functions to be called so + often, especially if they are slow. The value must be at least one + page and must be a power of two. Setting to 0 causes initialization + to either page size or win32 region size. (Note: In previous + versions of malloc, the equivalent of this option was called + "TOP_PAD") + +DEFAULT_TRIM_THRESHOLD default: 2MB + Also settable using mallopt(M_TRIM_THRESHOLD, x) + The maximum amount of unused top-most memory to keep before + releasing via malloc_trim in free(). Automatic trimming is mainly + useful in long-lived programs using contiguous MORECORE. Because + trimming via sbrk can be slow on some systems, and can sometimes be + wasteful (in cases where programs immediately afterward allocate + more large chunks) the value should be high enough so that your + overall system performance would improve by releasing this much + memory. As a rough guide, you might set to a value close to the + average size of a process (program) running on your system. + Releasing this much memory would allow such a process to run in + memory. 
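(Illustrative, not from the
+  original text: a long-lived server that frees large buffers between
+  requests might raise the threshold at startup with
+
+       mallopt(M_TRIM_THRESHOLD, 8*1024*1024);
+
+  so trimming only happens once 8MB of topmost memory is unused.) 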
Generally, it is worth tuning trim thresholds when a + program undergoes phases where several large chunks are allocated + and released in ways that can reuse each other's storage, perhaps + mixed with phases where there are no such chunks at all. The trim + value must be greater than page size to have any useful effect. To + disable trimming completely, you can set to MAX_SIZE_T. Note that the trick + some people use of mallocing a huge space and then freeing it at + program startup, in an attempt to reserve system memory, doesn't + have the intended effect under automatic trimming, since that memory + will immediately be returned to the system. + +DEFAULT_MMAP_THRESHOLD default: 256K + Also settable using mallopt(M_MMAP_THRESHOLD, x) + The request size threshold for using MMAP to directly service a + request. Requests of at least this size that cannot be allocated + using already-existing space will be serviced via mmap. (If enough + normal freed space already exists it is used instead.) Using mmap + segregates relatively large chunks of memory so that they can be + individually obtained and released from the host system. A request + serviced through mmap is never reused by any other request (at least + not directly; the system may just so happen to remap successive + requests to the same locations). Segregating space in this way has + the benefits that: Mmapped space can always be individually released + back to the system, which helps keep the system level memory demands + of a long-lived program low. Also, mapped memory doesn't become + `locked' between other chunks, as can happen with normally allocated + chunks, which means that even trimming via malloc_trim would not + release them. However, it has the disadvantage that the space + cannot be reclaimed, consolidated, and then used to service later + requests, as happens with normal chunks. The advantages of mmap + nearly always outweigh disadvantages for "large" chunks, but the + value of "large" may vary across systems. The default is an + empirically derived value that works well in most systems. You can + disable mmap by setting to MAX_SIZE_T. + +MAX_RELEASE_CHECK_RATE default: 255 unless not DL_HAVE_MMAP + The number of consolidated frees between checks to release + unused segments when freeing. When using non-contiguous segments, + especially with multiple mspaces, checking only for topmost space + doesn't always suffice to trigger trimming. To compensate for this, + free() will, with a period of MAX_RELEASE_CHECK_RATE (or the + current number of segments, if greater) try to release unused + segments to the OS when freeing chunks that result in + consolidation. The best value for this parameter is a compromise + between slowing down frees with relatively costly checks that + rarely trigger versus holding on to unused memory. To effectively + disable, set to MAX_SIZE_T. This may lead to a very slight speed + improvement at the expense of carrying around more memory. 
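+
+  (Illustrative, not from the original text: an embedding along the
+  lines of the OSHMEM memheap would plausibly compile this file with
+
+       -DONLY_MSPACES=1 -DHAVE_MORECORE=0 -DUSE_LOCKS=1
+
+  and then serve symmetric-heap requests from an mspace created over a
+  pre-mapped segment; see create_mspace_with_base further below.)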
+*/
+
+#ifndef WIN32
+#ifdef _WIN32
+#define WIN32 1
+#endif  /* _WIN32 */
+#endif  /* WIN32 */
+#ifdef WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#define DL_HAVE_MMAP 1
+#define HAVE_MORECORE 0
+#define LACKS_UNISTD_H
+#define LACKS_SYS_PARAM_H
+#define LACKS_SYS_MMAN_H
+#define LACKS_STRING_H
+#define LACKS_STRINGS_H
+#define LACKS_SYS_TYPES_H
+#define LACKS_ERRNO_H
+#define MALLOC_FAILURE_ACTION
+#ifdef _WIN32_WCE  /* WINCE reportedly does not clear */
+#define MMAP_CLEARS 0
+#else
+#define MMAP_CLEARS 1
+#endif /* _WIN32_WCE */
+#endif  /* WIN32 */
+
+#if defined(DARWIN) || defined(_DARWIN)
+/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */
+#ifndef HAVE_MORECORE
+#define HAVE_MORECORE 0
+#define DL_HAVE_MMAP 1
+#endif  /* HAVE_MORECORE */
+#endif  /* DARWIN */
+
+#ifndef LACKS_SYS_TYPES_H
+#include <sys/types.h>  /* For size_t */
+#endif  /* LACKS_SYS_TYPES_H */
+
+/* The maximum possible size_t value has all bits set */
+#define MAX_SIZE_T           (~(size_t)0)
+
+#ifndef ONLY_MSPACES
+#define ONLY_MSPACES 0
+#endif  /* ONLY_MSPACES */
+#ifndef MSPACES
+#if ONLY_MSPACES
+#define MSPACES 1
+#else   /* ONLY_MSPACES */
+#define MSPACES 0
+#endif  /* ONLY_MSPACES */
+#endif  /* MSPACES */
+#ifndef MALLOC_ALIGNMENT
+#define MALLOC_ALIGNMENT ((size_t)8U)
+#endif  /* MALLOC_ALIGNMENT */
+#ifndef FOOTERS
+#define FOOTERS 0
+#endif  /* FOOTERS */
+#ifndef ABORT
+#define ABORT  abort()
+#endif  /* ABORT */
+#ifndef ABORT_ON_ASSERT_FAILURE
+#define ABORT_ON_ASSERT_FAILURE 1
+#endif  /* ABORT_ON_ASSERT_FAILURE */
+#ifndef PROCEED_ON_ERROR
+#define PROCEED_ON_ERROR 0
+#endif  /* PROCEED_ON_ERROR */
+#ifndef USE_LOCKS
+#define USE_LOCKS 0
+#endif  /* USE_LOCKS */
+#ifndef USE_SPIN_LOCKS
+#if USE_LOCKS && (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(_MSC_VER) && _MSC_VER>=1310)
+#define USE_SPIN_LOCKS 1
+#else
+#define USE_SPIN_LOCKS 0
+#endif /* USE_LOCKS && ... 
*/ +#endif /* USE_SPIN_LOCKS */ +#ifndef INSECURE +#define INSECURE 0 +#endif /* INSECURE */ +#ifndef DL_HAVE_MMAP +#define DL_HAVE_MMAP 1 +#endif /* DL_HAVE_MMAP */ +#ifndef MMAP_CLEARS +#define MMAP_CLEARS 1 +#endif /* MMAP_CLEARS */ +#ifndef DL_HAVE_MREMAP +#ifdef linux +#define DL_HAVE_MREMAP 1 +#else /* linux */ +#define DL_HAVE_MREMAP 0 +#endif /* linux */ +#endif /* DL_HAVE_MREMAP */ +#ifndef MALLOC_FAILURE_ACTION +#define MALLOC_FAILURE_ACTION errno = ENOMEM; +#endif /* MALLOC_FAILURE_ACTION */ +#ifndef HAVE_MORECORE +#if ONLY_MSPACES +#define HAVE_MORECORE 0 +#else /* ONLY_MSPACES */ +#define HAVE_MORECORE 1 +#endif /* ONLY_MSPACES */ +#endif /* HAVE_MORECORE */ +#if !HAVE_MORECORE +#define MORECORE_CONTIGUOUS 0 +#else /* !HAVE_MORECORE */ +#ifndef MORECORE +#define MORECORE sbrk +#endif /* MORECORE */ +#ifndef MORECORE_CONTIGUOUS +#define MORECORE_CONTIGUOUS 1 +#endif /* MORECORE_CONTIGUOUS */ +#endif /* HAVE_MORECORE */ +#ifndef DEFAULT_GRANULARITY +#if MORECORE_CONTIGUOUS +#define DEFAULT_GRANULARITY (0) /* 0 means to compute in init_mparams */ +#else /* MORECORE_CONTIGUOUS */ +#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U) +#endif /* MORECORE_CONTIGUOUS */ +#endif /* DEFAULT_GRANULARITY */ +#ifndef DEFAULT_TRIM_THRESHOLD +#ifndef MORECORE_CANNOT_TRIM +#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U) +#else /* MORECORE_CANNOT_TRIM */ +#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T +#endif /* MORECORE_CANNOT_TRIM */ +#endif /* DEFAULT_TRIM_THRESHOLD */ +#ifndef DEFAULT_MMAP_THRESHOLD +#if DL_HAVE_MMAP +#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U) +#else /* DL_HAVE_MMAP */ +#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T +#endif /* DL_HAVE_MMAP */ +#endif /* DEFAULT_MMAP_THRESHOLD */ +#ifndef MAX_RELEASE_CHECK_RATE +#if DL_HAVE_MMAP +#define MAX_RELEASE_CHECK_RATE 255 +#else +#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T +#endif /* DL_HAVE_MMAP */ +#endif /* MAX_RELEASE_CHECK_RATE */ +#ifndef USE_BUILTIN_FFS +#define USE_BUILTIN_FFS 0 +#endif /* USE_BUILTIN_FFS */ +#ifndef USE_DEV_RANDOM +#define USE_DEV_RANDOM 0 +#endif /* USE_DEV_RANDOM */ +#ifndef NO_MALLINFO +#define NO_MALLINFO 0 +#endif /* NO_MALLINFO */ +#ifndef MALLINFO_FIELD_TYPE +#define MALLINFO_FIELD_TYPE size_t +#endif /* MALLINFO_FIELD_TYPE */ +#ifndef NO_SEGMENT_TRAVERSAL +#define NO_SEGMENT_TRAVERSAL 0 +#endif /* NO_SEGMENT_TRAVERSAL */ + +/* + mallopt tuning options. SVID/XPG defines four standard parameter + numbers for mallopt, normally defined in malloc.h. None of these + are used in this malloc, so setting them has no effect. But this + malloc does support the following options. +*/ + +#define M_TRIM_THRESHOLD (-1) +#define M_GRANULARITY (-2) +#define M_MMAP_THRESHOLD (-3) + +/* ------------------------ Mallinfo declarations ------------------------ */ + +#if !NO_MALLINFO +/* + This version of malloc supports the standard SVID/XPG mallinfo + routine that returns a struct containing usage properties and + statistics. It should work on any system that has a + /usr/include/malloc.h defining struct mallinfo. The main + declaration needed is the mallinfo struct that is returned (by-copy) + by mallinfo(). The malloinfo struct contains a bunch of fields that + are not even meaningful in this version of malloc. These fields are + are instead filled by mallinfo() with other numbers that might be of + interest. + + HAVE_USR_INCLUDE_MALLOC_H should be set if you have a + /usr/include/malloc.h file that includes a declaration of struct + mallinfo. 
If so, it is included; else a compliant version is + declared below. These must be precisely the same for mallinfo() to + work. The original SVID version of this struct, defined on most + systems with mallinfo, declares all fields as ints. But some others + define as unsigned long. If your system defines the fields using a + type of different width than listed here, you MUST #include your + system version and #define HAVE_USR_INCLUDE_MALLOC_H. +*/ + +/* #define HAVE_USR_INCLUDE_MALLOC_H */ + +#ifdef HAVE_USR_INCLUDE_MALLOC_H +#include "/usr/include/malloc.h" +#else /* HAVE_USR_INCLUDE_MALLOC_H */ + +struct mallinfo { + MALLINFO_FIELD_TYPE arena; /* non-mmapped space allocated from system */ + MALLINFO_FIELD_TYPE ordblks; /* number of free chunks */ + MALLINFO_FIELD_TYPE smblks; /* always 0 */ + MALLINFO_FIELD_TYPE hblks; /* always 0 */ + MALLINFO_FIELD_TYPE hblkhd; /* space in mmapped regions */ + MALLINFO_FIELD_TYPE usmblks; /* maximum total allocated space */ + MALLINFO_FIELD_TYPE fsmblks; /* always 0 */ + MALLINFO_FIELD_TYPE uordblks; /* total allocated space */ + MALLINFO_FIELD_TYPE fordblks; /* total free space */ + MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */ +}; + +#endif /* HAVE_USR_INCLUDE_MALLOC_H */ +#endif /* NO_MALLINFO */ + +/* + Try to persuade compilers to inline. The most critical functions for + inlining are defined as macros, so these aren't used for them. +*/ + +#ifndef FORCEINLINE + #if defined(__GNUC__) +#define FORCEINLINE __inline __attribute__ ((always_inline)) + #elif defined(_MSC_VER) + #define FORCEINLINE __forceinline + #endif +#endif +#ifndef NOINLINE + #if defined(__GNUC__) + #define NOINLINE __attribute__ ((noinline)) + #elif defined(_MSC_VER) + #define NOINLINE __declspec(noinline) + #else + #define NOINLINE + #endif +#endif + +#ifdef __cplusplus +extern "C" { +#ifndef FORCEINLINE + #define FORCEINLINE inline +#endif +#endif /* __cplusplus */ +#ifndef FORCEINLINE + #define FORCEINLINE +#endif + +#if !ONLY_MSPACES + +/* ------------------- Declarations of public routines ------------------- */ + +#ifndef USE_DL_PREFIX +#define dlcalloc calloc +#define dlfree free +#define dlmalloc malloc +#define dlmemalign memalign +#define dlrealloc realloc +#define dlvalloc valloc +#define dlpvalloc pvalloc +#define dlmallinfo mallinfo +#define dlmallopt mallopt +#define dlmalloc_trim malloc_trim +#define dlmalloc_stats malloc_stats +#define dlmalloc_usable_size malloc_usable_size +#define dlmalloc_footprint malloc_footprint +#define dlmalloc_max_footprint malloc_max_footprint +#define dlindependent_calloc independent_calloc +#define dlindependent_comalloc independent_comalloc +#endif /* USE_DL_PREFIX */ + + +/* + malloc(size_t n) + Returns a pointer to a newly allocated chunk of at least n bytes, or + null if no space is available, in which case errno is set to ENOMEM + on ANSI C systems. + + If n is zero, malloc returns a minimum-sized chunk. (The minimum + size is 16 bytes on most 32bit systems, and 32 bytes on 64bit + systems.) Note that size_t is an unsigned type, so calls with + arguments that would be negative if signed are interpreted as + requests for huge amounts of space, which will often fail. The + maximum supported value of n differs across systems, but is in all + cases less than the maximum representable value of a size_t. +*/ +void* dlmalloc(size_t); + +/* + free(void* p) + Releases the chunk of memory pointed to by p, that had been previously + allocated using malloc or a related routine such as realloc. 
+ It has no effect if p is null. If p was not malloced or already + freed, free(p) will by default cause the current program to abort. +*/ +void dlfree(void*); + +/* + calloc(size_t n_elements, size_t element_size); + Returns a pointer to n_elements * element_size bytes, with all locations + set to zero. +*/ +void* dlcalloc(size_t, size_t); + +/* + realloc(void* p, size_t n) + Returns a pointer to a chunk of size n that contains the same data + as does chunk p up to the minimum of (n, p's size) bytes, or null + if no space is available. + + The returned pointer may or may not be the same as p. The algorithm + prefers extending p in most cases when possible, otherwise it + employs the equivalent of a malloc-copy-free sequence. + + If p is null, realloc is equivalent to malloc. + + If space is not available, realloc returns null, errno is set (if on + ANSI) and p is NOT freed. + + if n is for fewer bytes than already held by p, the newly unused + space is lopped off and freed if possible. realloc with a size + argument of zero (re)allocates a minimum-sized chunk. + + The old unix realloc convention of allowing the last-free'd chunk + to be used as an argument to realloc is not supported. +*/ + +void* dlrealloc(void*, size_t); + +/* + memalign(size_t alignment, size_t n); + Returns a pointer to a newly allocated chunk of n bytes, aligned + in accord with the alignment argument. + + The alignment argument should be a power of two. If the argument is + not a power of two, the nearest greater power is used. + 8-byte alignment is guaranteed by normal malloc calls, so don't + bother calling memalign with an argument of 8 or less. + + Overreliance on memalign is a sure way to fragment space. +*/ +void* dlmemalign(size_t, size_t); + +/* + valloc(size_t n); + Equivalent to memalign(pagesize, n), where pagesize is the page + size of the system. If the pagesize is unknown, 4096 is used. +*/ +void* dlvalloc(size_t); + +/* + mallopt(int parameter_number, int parameter_value) + Sets tunable parameters The format is to provide a + (parameter-number, parameter-value) pair. mallopt then sets the + corresponding parameter to the argument value if it can (i.e., so + long as the value is meaningful), and returns 1 if successful else + 0. SVID/XPG/ANSI defines four standard param numbers for mallopt, + normally defined in malloc.h. None of these are use in this malloc, + so setting them has no effect. But this malloc also supports other + options in mallopt. See below for details. Briefly, supported + parameters are as follows (listed defaults are for "typical" + configurations). + + Symbol param # default allowed param values + M_TRIM_THRESHOLD -1 2*1024*1024 any (MAX_SIZE_T disables) + M_GRANULARITY -2 page size any power of 2 >= page size + M_MMAP_THRESHOLD -3 256*1024 any (or 0 if no MMAP support) +*/ +int dlmallopt(int, int); + +/* + malloc_footprint(); + Returns the number of bytes obtained from the system. The total + number of bytes allocated by malloc, realloc etc., is less than this + value. Unlike mallinfo, this function returns only a precomputed + result, so can be called frequently to monitor memory consumption. + Even if locks are otherwise defined, this function does not use them, + so results might not be up to date. +*/ +size_t dlmalloc_footprint(void); + +/* + malloc_max_footprint(); + Returns the maximum number of bytes obtained from the system. This + value will be greater than current footprint if deallocated space + has been reclaimed by the system. 
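(Illustrative, not
+  original text: the invariant
+
+      dlmalloc_max_footprint() >= dlmalloc_footprint()
+
+  always holds, since the maximum is tracked across the whole run.) 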
The peak number of bytes allocated + by malloc, realloc etc., is less than this value. Unlike mallinfo, + this function returns only a precomputed result, so can be called + frequently to monitor memory consumption. Even if locks are + otherwise defined, this function does not use them, so results might + not be up to date. +*/ +size_t dlmalloc_max_footprint(void); + +#if !NO_MALLINFO +/* + mallinfo() + Returns (by copy) a struct containing various summary statistics: + + arena: current total non-mmapped bytes allocated from system + ordblks: the number of free chunks + smblks: always zero. + hblks: current number of mmapped regions + hblkhd: total bytes held in mmapped regions + usmblks: the maximum total allocated space. This will be greater + than current total if trimming has occurred. + fsmblks: always zero + uordblks: current total allocated space (normal or mmapped) + fordblks: total free space + keepcost: the maximum number of bytes that could ideally be released + back to system via malloc_trim. ("ideally" means that + it ignores page restrictions etc.) + + Because these fields are ints, but internal bookkeeping may + be kept as longs, the reported values may wrap around zero and + thus be inaccurate. +*/ +struct mallinfo dlmallinfo(void); +#endif /* NO_MALLINFO */ + +/* + independent_calloc(size_t n_elements, size_t element_size, void* chunks[]); + + independent_calloc is similar to calloc, but instead of returning a + single cleared space, it returns an array of pointers to n_elements + independent elements that can hold contents of size elem_size, each + of which starts out cleared, and can be independently freed, + realloc'ed etc. The elements are guaranteed to be adjacently + allocated (this is not guaranteed to occur with multiple callocs or + mallocs), which may also improve cache locality in some + applications. + + The "chunks" argument is optional (i.e., may be null, which is + probably the most typical usage). If it is null, the returned array + is itself dynamically allocated and should also be freed when it is + no longer needed. Otherwise, the chunks array must be of at least + n_elements in length. It is filled in with the pointers to the + chunks. + + In either case, independent_calloc returns this pointer array, or + null if the allocation failed. If n_elements is zero and "chunks" + is null, it returns a chunk representing an array with zero elements + (which should be freed if not wanted). + + Each element must be individually freed when it is no longer + needed. If you'd like to instead be able to free all at once, you + should instead use regular calloc and assign pointers into this + space to represent elements. (In this case though, you cannot + independently free elements.) + + independent_calloc simplifies and speeds up implementations of many + kinds of pools. It may also be useful when constructing large data + structures that initially have a fixed number of fixed-sized nodes, + but the number is not known at compile time, and some of the nodes + may later need to be freed. For example: + + struct Node { int item; struct Node* next; }; + + struct Node* build_list() { + struct Node** pool; + int n = read_number_of_nodes_needed(); + if (n <= 0) return 0; + pool = (struct Node**)(independent_calloc(n, sizeof(struct Node), 0); + if (pool == 0) die(); + // organize into a linked list... 
+    struct Node* first = pool[0];
+    for (i = 0; i < n-1; ++i)
+      pool[i]->next = pool[i+1];
+    free(pool);     // Can now free the array (or not, if it is needed later)
+    return first;
+  }
+*/
+void** dlindependent_calloc(size_t, size_t, void**);
+
+/*
+  independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]);
+
+  independent_comalloc allocates, all at once, a set of n_elements
+  chunks with sizes indicated in the "sizes" array. It returns
+  an array of pointers to these elements, each of which can be
+  independently freed, realloc'ed etc. The elements are guaranteed to
+  be adjacently allocated (this is not guaranteed to occur with
+  multiple callocs or mallocs), which may also improve cache locality
+  in some applications.
+
+  The "chunks" argument is optional (i.e., may be null). If it is null
+  the returned array is itself dynamically allocated and should also
+  be freed when it is no longer needed. Otherwise, the chunks array
+  must be of at least n_elements in length. It is filled in with the
+  pointers to the chunks.
+
+  In either case, independent_comalloc returns this pointer array, or
+  null if the allocation failed. If n_elements is zero and chunks is
+  null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use a single regular malloc, and assign pointers at
+  particular offsets in the aggregate space. (In this case though, you
+  cannot independently free elements.)
+
+  independent_comalloc differs from independent_calloc in that each
+  element may have a different size, and also that it does not
+  automatically clear elements.
+
+  independent_comalloc can be used to speed up allocation in cases
+  where several structs or objects must always be allocated at the
+  same time. For example:
+
+  struct Head { ... }
+  struct Foot { ... }
+
+  void send_message(char* msg) {
+    int msglen = strlen(msg);
+    size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) };
+    void* chunks[3];
+    if (independent_comalloc(3, sizes, chunks) == 0)
+      die();
+    struct Head* head = (struct Head*)(chunks[0]);
+    char*        body = (char*)(chunks[1]);
+    struct Foot* foot = (struct Foot*)(chunks[2]);
+    // ...
+  }
+
+  In general though, independent_comalloc is worth using only for
+  larger values of n_elements. For small values, you probably won't
+  detect enough difference from series of malloc calls to bother.
+
+  Overuse of independent_comalloc can increase overall memory usage,
+  since it cannot reuse existing noncontiguous small chunks that
+  might be available for some of the elements.
+*/
+void** dlindependent_comalloc(size_t, size_t*, void**);
+
+
+/*
+  pvalloc(size_t n);
+  Equivalent to valloc(minimum-page-that-holds(n)), that is,
+  round up n to nearest pagesize.
+ */
+void*  dlpvalloc(size_t);
+
+/*
+  malloc_trim(size_t pad);
+
+  If possible, gives memory back to the system (via negative arguments
+  to sbrk) if there is unused memory at the `high' end of the malloc
+  pool or in unused MMAP segments. You can call this after freeing
+  large blocks of memory to potentially reduce the system-level memory
+  requirements of a program. However, it cannot guarantee to reduce
+  memory. Under some allocation patterns, some large free blocks of
+  memory will be locked between two used chunks, so they cannot be
+  given back to the system.
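+
+  For example (an illustrative sketch only, assuming a large buffer
+  named cache that was allocated with dlmalloc), a program that has
+  just released such a block might do:
+
+    dlfree(cache);
+    dlmalloc_trim(0);  /* ask that unused top-of-heap memory be returned */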
+ + The `pad' argument to malloc_trim represents the amount of free + trailing space to leave untrimmed. If this argument is zero, only + the minimum amount of memory to maintain internal data structures + will be left. Non-zero arguments can be supplied to maintain enough + trailing space to service future expected allocations without having + to re-obtain memory from the system. + + Malloc_trim returns 1 if it actually released any memory, else 0. +*/ +int dlmalloc_trim(size_t); + +/* + malloc_usable_size(void* p); + + Returns the number of bytes you can actually use in + an allocated chunk, which may be more than you requested (although + often not) due to alignment and minimum size constraints. + You can use this many bytes without worrying about + overwriting other allocated objects. This is not a particularly great + programming practice. malloc_usable_size can be more useful in + debugging and assertions, for example: + + p = malloc(n); + assert(malloc_usable_size(p) >= 256); +*/ +size_t dlmalloc_usable_size(void*); + +/* + malloc_stats(); + Prints on stderr the amount of space obtained from the system (both + via sbrk and mmap), the maximum amount (which may be more than + current if malloc_trim and/or munmap got called), and the current + number of bytes allocated via malloc (or realloc, etc) but not yet + freed. Note that this is the number of bytes allocated, not the + number requested. It will be larger than the number requested + because of alignment and bookkeeping overhead. Because it includes + alignment wastage as being in use, this figure may be greater than + zero even when no user-level chunks are allocated. + + The reported current and maximum system memory can be inaccurate if + a program makes other calls to system memory allocation functions + (normally sbrk) outside of malloc. + + malloc_stats prints only the most commonly interesting statistics. + More information can be obtained by calling mallinfo. +*/ +void dlmalloc_stats(void); + +#endif /* ONLY_MSPACES */ + +#if MSPACES + +/* + mspace is an opaque type representing an independent + region of space that supports mspace_malloc, etc. +*/ +typedef void* mspace; + +/* + create_mspace creates and returns a new independent space with the + given initial capacity, or, if 0, the default granularity size. It + returns null if there is no system memory available to create the + space. If argument locked is non-zero, the space uses a separate + lock to control access. The capacity of the space will grow + dynamically as needed to service mspace_malloc requests. You can + control the sizes of incremental increases of this space by + compiling with a different DEFAULT_GRANULARITY or dynamically + setting with mallopt(M_GRANULARITY, value). +*/ +mspace create_mspace(size_t capacity, int locked); + +/* + destroy_mspace destroys the given space, and attempts to return all + of its memory back to the system, returning the total number of + bytes freed. After destruction, the results of access to all memory + used by the space become undefined. +*/ +size_t destroy_mspace(mspace msp); + +/* + create_mspace_with_base uses the memory supplied as the initial base + of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this + space is used for bookkeeping, so the capacity must be at least this + large. (Otherwise 0 is returned.) When this initial space is + exhausted, additional memory will be obtained from the system. 
+ Destroying this space will deallocate all additionally allocated + space (if possible) but not the initial base. +*/ +mspace create_mspace_with_base(void* base, size_t capacity, int locked); + +/* + mspace_malloc behaves as malloc, but operates within + the given space. +*/ +void* mspace_malloc(mspace msp, size_t bytes); + +/* + mspace_free behaves as free, but operates within + the given space. + + If compiled with FOOTERS==1, mspace_free is not actually needed. + free may be called instead of mspace_free because freed chunks from + any space are handled by their originating spaces. +*/ +void mspace_free(mspace msp, void* mem); + +/* + mspace_realloc behaves as realloc, but operates within + the given space. + + If compiled with FOOTERS==1, mspace_realloc is not actually + needed. realloc may be called instead of mspace_realloc because + realloced chunks from any space are handled by their originating + spaces. +*/ +void* mspace_realloc(mspace msp, void* mem, size_t newsize); + +/* + mspace_calloc behaves as calloc, but operates within + the given space. +*/ +void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size); + +/* + mspace_memalign behaves as memalign, but operates within + the given space. +*/ +void* mspace_memalign(mspace msp, size_t alignment, size_t bytes); + +/* + mspace_independent_calloc behaves as independent_calloc, but + operates within the given space. +*/ +void** mspace_independent_calloc(mspace msp, size_t n_elements, + size_t elem_size, void* chunks[]); + +/* + mspace_independent_comalloc behaves as independent_comalloc, but + operates within the given space. +*/ +void** mspace_independent_comalloc(mspace msp, size_t n_elements, + size_t sizes[], void* chunks[]); + +/* + mspace_footprint() returns the number of bytes obtained from the + system for this space. +*/ +size_t mspace_footprint(mspace msp); + +/* + mspace_max_footprint() returns the peak number of bytes obtained from the + system for this space. +*/ +size_t mspace_max_footprint(mspace msp); + + +#if !NO_MALLINFO +/* + mspace_mallinfo behaves as mallinfo, but reports properties of + the given space. +*/ +struct mallinfo mspace_mallinfo(mspace msp); +#endif /* NO_MALLINFO */ + +/* + mspace_malloc_stats behaves as malloc_stats, but reports + properties of the given space. +*/ +void mspace_malloc_stats(mspace msp); + +/* + mspace_trim behaves as malloc_trim, but + operates within the given space. +*/ +int mspace_trim(mspace msp, size_t pad); + +/* + An alias for mallopt. +*/ +int mspace_mallopt(int, int); + +#endif /* MSPACES */ + +#ifdef __cplusplus +}; /* end of extern "C" */ +#endif /* __cplusplus */ + +/* + ======================================================================== + To make a fully customizable malloc.h header file, cut everything + above this line, put into file malloc.h, edit to suit, and #include it + on the next line, as well as in programs that use this malloc. 
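+
+  For example, a program built against such an extracted malloc.h
+  might look like this (an illustrative sketch only):
+
+    #include "malloc.h"
+
+    int main(void) {
+      void* p = dlmalloc(100);
+      dlfree(p);
+      return 0;
+    }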
+  ========================================================================
+*/
+
+/* #include "malloc.h" */
+
+/*------------------------------ internal #includes ---------------------- */
+
+#ifdef WIN32
+#pragma warning( disable : 4146 ) /* no "unsigned" warnings */
+#endif /* WIN32 */
+
+#include <stdio.h>       /* for printing in malloc_stats */
+
+#ifndef LACKS_ERRNO_H
+#include <errno.h>       /* for MALLOC_FAILURE_ACTION */
+#endif /* LACKS_ERRNO_H */
+#if FOOTERS
+#include <time.h>        /* for magic initialization */
+#endif /* FOOTERS */
+#ifndef LACKS_STDLIB_H
+#include <stdlib.h>      /* for abort() */
+#endif /* LACKS_STDLIB_H */
+#ifdef DL_DEBUG
+#if ABORT_ON_ASSERT_FAILURE
+#define dl_assert(x) if(!(x)) ABORT
+#else /* ABORT_ON_ASSERT_FAILURE */
+#include <assert.h>
+#endif /* ABORT_ON_ASSERT_FAILURE */
+#else /* DL_DEBUG */
+#define dl_assert(x) assert(x)
+#include <assert.h>
+#endif /* DL_DEBUG */
+#ifndef LACKS_STRING_H
+#include <string.h>      /* for memset etc */
+#endif /* LACKS_STRING_H */
+#if USE_BUILTIN_FFS
+#ifndef LACKS_STRINGS_H
+#include <strings.h>     /* for ffs */
+#endif /* LACKS_STRINGS_H */
+#endif /* USE_BUILTIN_FFS */
+#if DL_HAVE_MMAP
+#ifndef LACKS_SYS_MMAN_H
+#include <sys/mman.h>    /* for mmap */
+#endif /* LACKS_SYS_MMAN_H */
+#ifndef LACKS_FCNTL_H
+#include <fcntl.h>
+#endif /* LACKS_FCNTL_H */
+#endif /* DL_HAVE_MMAP */
+#if HAVE_MORECORE
+#ifndef LACKS_UNISTD_H
+#include <unistd.h>      /* for sbrk */
+#else /* LACKS_UNISTD_H */
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
+extern void* sbrk(ptrdiff_t);
+#endif /* FreeBSD etc */
+#endif /* LACKS_UNISTD_H */
+#endif /* HAVE_MORECORE */
+
+/* Declarations for locking */
+#if USE_LOCKS
+#ifndef WIN32
+#include <pthread.h>
+#if defined (__SVR4) && defined (__sun)  /* solaris */
+#include <thread.h>
+#endif /* solaris */
+#else
+#ifndef _M_AMD64
+/* These are already defined on AMD64 builds */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+LONG __cdecl _InterlockedCompareExchange(LPLONG volatile Dest, LONG Exchange, LONG Comp);
+LONG __cdecl _InterlockedExchange(LPLONG volatile Target, LONG Value);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* _M_AMD64 */
+#pragma intrinsic (_InterlockedCompareExchange)
+#pragma intrinsic (_InterlockedExchange)
+#define interlockedcompareexchange _InterlockedCompareExchange
+#define interlockedexchange _InterlockedExchange
+#endif /* Win32 */
+#endif /* USE_LOCKS */
+
+/* Declarations for bit scanning on win32 */
+#if defined(_MSC_VER) && _MSC_VER>=1300
+#ifndef BitScanForward /* Try to avoid pulling in WinNT.h */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
+unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#define BitScanForward _BitScanForward
+#define BitScanReverse _BitScanReverse
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#endif /* BitScanForward */
+#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */
+
+#ifndef WIN32
+#ifndef malloc_getpagesize
+#  ifdef _SC_PAGESIZE         /* some SVR4 systems omit an underscore */
+#    ifndef _SC_PAGE_SIZE
+#      define _SC_PAGE_SIZE _SC_PAGESIZE
+#    endif
+#  endif
+#  ifdef _SC_PAGE_SIZE
+#    define malloc_getpagesize sysconf(_SC_PAGE_SIZE)
+#  else
+#    if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
+       extern size_t getpagesize();
+#      define malloc_getpagesize getpagesize()
+#    else
+#      ifdef WIN32 /* use supplied emulation of getpagesize */
+#        define malloc_getpagesize getpagesize()
+#      else
+#        ifndef LACKS_SYS_PARAM_H
+#          include <sys/param.h>
+#        endif
+#        ifdef EXEC_PAGESIZE
+#
define malloc_getpagesize EXEC_PAGESIZE +# else +# ifdef NBPG +# ifndef CLSIZE +# define malloc_getpagesize NBPG +# else +# define malloc_getpagesize (NBPG * CLSIZE) +# endif +# else +# ifdef NBPC +# define malloc_getpagesize NBPC +# else +# ifdef PAGESIZE +# define malloc_getpagesize PAGESIZE +# else /* just guess */ +# define malloc_getpagesize ((size_t)4096U) +# endif +# endif +# endif +# endif +# endif +# endif +# endif +#endif +#endif + + + +/* ------------------- size_t and alignment properties -------------------- */ + +/* The byte and bit size of a size_t */ +#define SIZE_T_SIZE (sizeof(size_t)) +#define SIZE_T_BITSIZE (sizeof(size_t) << 3) + +/* Some constants coerced to size_t */ +/* Annoying but necessary to avoid errors on some platforms */ +#define SIZE_T_ZERO ((size_t)0) +#define SIZE_T_ONE ((size_t)1) +#define SIZE_T_TWO ((size_t)2) +#define SIZE_T_FOUR ((size_t)4) +#define TWO_SIZE_T_SIZES (SIZE_T_SIZE<<1) +#define FOUR_SIZE_T_SIZES (SIZE_T_SIZE<<2) +#define SIX_SIZE_T_SIZES (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES) +#define HALF_MAX_SIZE_T (MAX_SIZE_T / 2U) + +/* The bit mask value corresponding to MALLOC_ALIGNMENT */ +#define CHUNK_ALIGN_MASK (MALLOC_ALIGNMENT - SIZE_T_ONE) + +/* True if address a has acceptable alignment */ +#define is_aligned(A) (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0) + +/* the number of bytes to offset an address to align it */ +#define align_offset(A)\ + ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\ + ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK)) + +/* -------------------------- MMAP preliminaries ------------------------- */ + +/* + If HAVE_MORECORE or DL_HAVE_MMAP are false, we just define calls and + checks to fail so compiler optimizer can delete code rather than + using so many "#if"s. +*/ + + +/* MORECORE and MMAP must return MFAIL on failure */ +#define MFAIL ((void*)(MAX_SIZE_T)) +#define CMFAIL ((char*)(MFAIL)) /* defined for convenience */ + +#if !DL_HAVE_MMAP +#define IS_MMAPPED_BIT (SIZE_T_ZERO) +#define USE_MMAP_BIT (SIZE_T_ZERO) +#define CALL_MMAP(s) MFAIL +#define CALL_MUNMAP(a, s) (-1) +#define DIRECT_MMAP(s) MFAIL + +#else /* DL_HAVE_MMAP */ +#define IS_MMAPPED_BIT (SIZE_T_ONE) +#define USE_MMAP_BIT (SIZE_T_ONE) + +#ifndef WIN32 +#define CALL_MUNMAP(a, s) munmap((a), (s)) +#define MMAP_PROT (PROT_READ|PROT_WRITE) +#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) +#define MAP_ANONYMOUS MAP_ANON +#endif /* MAP_ANON */ +#ifdef MAP_ANONYMOUS +#define MMAP_FLAGS (MAP_PRIVATE|MAP_ANONYMOUS) +#define CALL_MMAP(s) mmap(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0) +#else /* MAP_ANONYMOUS */ +/* + Nearly all versions of mmap support MAP_ANONYMOUS, so the following + is unlikely to be needed, but is supplied just in case. +*/ +#define MMAP_FLAGS (MAP_PRIVATE) +static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */ +#define CALL_MMAP(s) ((dev_zero_fd < 0) ? \ + (dev_zero_fd = open("/dev/zero", O_RDWR), \ + mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \ + mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) +#endif /* MAP_ANONYMOUS */ + +#define DIRECT_MMAP(s) CALL_MMAP(s) +#else /* WIN32 */ + +/* Win32 MMAP via VirtualAlloc */ +static FORCEINLINE void* win32mmap(size_t size) { + void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); + return (ptr != 0)? 
ptr: MFAIL;
+}
+
+/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */
+static FORCEINLINE void* win32direct_mmap(size_t size) {
+  void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN,
+                           PAGE_READWRITE);
+  return (ptr != 0)? ptr: MFAIL;
+}
+
+/* This function supports releasing coalesced segments */
+static FORCEINLINE int win32munmap(void* ptr, size_t size) {
+  MEMORY_BASIC_INFORMATION minfo;
+  char* cptr = (char*)ptr;
+  while (size) {
+    if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0)
+      return -1;
+    if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr ||
+        minfo.State != MEM_COMMIT || minfo.RegionSize > size)
+      return -1;
+    if (VirtualFree(cptr, 0, MEM_RELEASE) == 0)
+      return -1;
+    cptr += minfo.RegionSize;
+    size -= minfo.RegionSize;
+  }
+  return 0;
+}
+
+#define CALL_MMAP(s)         win32mmap(s)
+#define CALL_MUNMAP(a, s)    win32munmap((a), (s))
+#define DIRECT_MMAP(s)       win32direct_mmap(s)
+#endif /* WIN32 */
+#endif /* DL_HAVE_MMAP */
+
+#if DL_HAVE_MMAP && DL_HAVE_MREMAP
+#define CALL_MREMAP(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
+#else  /* DL_HAVE_MMAP && DL_HAVE_MREMAP */
+#define CALL_MREMAP(addr, osz, nsz, mv) ((void)(addr),(void)(osz), \
+                                         (void)(nsz), (void)(mv),MFAIL)
+#endif /* DL_HAVE_MMAP && DL_HAVE_MREMAP */
+
+#if HAVE_MORECORE
+#define CALL_MORECORE(S)     MORECORE(S)
+#else  /* HAVE_MORECORE */
+#define CALL_MORECORE(S)     MFAIL
+#endif /* HAVE_MORECORE */
+
+/* mstate bit set if contiguous morecore disabled or failed */
+#define USE_NONCONTIGUOUS_BIT (4U)
+
+/* segment bit set in create_mspace_with_base */
+#define EXTERN_BIT            (8U)
+
+
+/* --------------------------- Lock preliminaries ------------------------ */
+
+/*
+  When locks are defined, there are up to two global locks:
+
+  * If HAVE_MORECORE, morecore_mutex protects sequences of calls to
+    MORECORE. In many cases sys_alloc requires two calls that should
+    not be interleaved with calls by other threads. This does not
+    protect against direct calls to MORECORE by other threads not
+    using this lock, so there is still code to cope the best we can on
+    interference.
+
+  * magic_init_mutex ensures that mparams.magic and other
+    unique mparams values are initialized only once.
+
+  To enable use in layered extensions, locks are reentrant.
+
+  Because lock-protected regions generally have bounded times, we use
+  the supplied simple spinlocks in the custom versions for x86.
+
+  If USE_LOCKS is > 1, the definitions of lock routines here are
+  bypassed, in which case you will need to define at least
+  INITIAL_LOCK, ACQUIRE_LOCK, RELEASE_LOCK, and
+  NULL_LOCK_INITIALIZER, and possibly TRY_LOCK and IS_LOCKED
+  (The latter two are not used in this malloc, but are
+  commonly needed in extensions.)
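+
+  As an illustrative sketch only (not part of this file), a
+  USE_LOCKS > 1 build could map these hooks onto plain pthread
+  mutexes roughly as follows:
+
+    #define MLOCK_T               pthread_mutex_t
+    #define INITIAL_LOCK(sl)      pthread_mutex_init(sl, NULL)
+    #define ACQUIRE_LOCK(sl)      pthread_mutex_lock(sl)
+    #define RELEASE_LOCK(sl)      pthread_mutex_unlock(sl)
+    #define TRY_LOCK(sl)          (pthread_mutex_trylock(sl) == 0)
+    #define IS_LOCKED(sl)         0 /* not tracked in this sketch */
+    #define NULL_LOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+
+  Keep in mind that the built-in variants above are reentrant, while
+  a default pthread mutex is not, so a real replacement would need a
+  recursive mutex (PTHREAD_MUTEX_RECURSIVE) or equivalent.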
+*/ + +#if USE_LOCKS == 1 + +#if USE_SPIN_LOCKS +#ifndef WIN32 +/* Custom pthread-style spin locks on x86 and x64 for gcc */ +struct pthread_mlock_t +{ + volatile pthread_t threadid; + volatile unsigned int c; + volatile unsigned int l; +}; +#define MLOCK_T struct pthread_mlock_t +#define CURRENT_THREAD pthread_self() +#define SPINS_PER_YIELD 63 +static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) { + if(CURRENT_THREAD==sl->threadid) + ++sl->c; + else { + int spins = 0; + for (;;) { + int ret; + __asm__ __volatile__ ("lock cmpxchgl %2,(%1)" : "=a" (ret) : "r" (&sl->l), "r" (1), "a" (0)); + if(!ret) { + dl_assert(!sl->threadid); + sl->threadid=CURRENT_THREAD; + sl->c=1; + break; + } + if ((++spins & SPINS_PER_YIELD) == 0) { +#if defined (__SVR4) && defined (__sun) /* solaris */ + thr_yield(); +#else +#ifdef linux + sched_yield(); +#else /* no-op yield on unknown systems */ + ; +#endif /* linux */ +#endif /* solaris */ + } + } + } + + return 0; +} + +static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) { + int ret; + dl_assert(CURRENT_THREAD==sl->threadid); + if (!--sl->c) { + sl->threadid=0; + __asm__ __volatile__ ("xchgl %2,(%1)" : "=r" (ret) : "r" (&sl->l), "0" (0)); + } +} + +static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) { + int ret; + __asm__ __volatile__ ("lock cmpxchgl %2,(%1)" : "=a" (ret) : "r" (&sl->l), "r" (1), "a" (0)); + if(!ret){ + dl_assert(!sl->threadid); + sl->threadid=CURRENT_THREAD; + sl->c=1; + return 1; + } + return 0; +} + +#define INITIAL_LOCK(sl) (memset((sl), 0, sizeof(MLOCK_T)), 0) +#define ACQUIRE_LOCK(sl) pthread_acquire_lock(sl) +#define RELEASE_LOCK(sl) pthread_release_lock(sl) +#define TRY_LOCK(sl) pthread_try_lock(sl) +#define IS_LOCKED(sl) ((sl)->l) + +static MLOCK_T magic_init_mutex = {0, 0, 0 }; +#if HAVE_MORECORE +static MLOCK_T morecore_mutex = {0, 0, 0 }; +#endif /* HAVE_MORECORE */ + +#else /* WIN32 */ +/* Custom win32-style spin locks on x86 and x64 for MSC */ +struct win32_mlock_t +{ + volatile long threadid; + volatile unsigned int c; + long l; +}; +#define MLOCK_T struct win32_mlock_t +#define CURRENT_THREAD GetCurrentThreadId() +#define SPINS_PER_YIELD 63 +static FORCEINLINE int win32_acquire_lock (MLOCK_T *sl) { + long mythreadid=CURRENT_THREAD; + if(mythreadid==sl->threadid) + ++sl->c; + else { + int spins = 0; + for (;;) { + if (!interlockedexchange(&sl->l, 1)) { + dl_assert(!sl->threadid); + sl->threadid=mythreadid; + sl->c=1; + break; + } + if ((++spins & SPINS_PER_YIELD) == 0) + SleepEx(0, FALSE); + } + } + return 0; +} + +static FORCEINLINE void win32_release_lock (MLOCK_T *sl) { + dl_assert(CURRENT_THREAD==sl->threadid); + if (!--sl->c) { + sl->threadid=0; + interlockedexchange (&sl->l, 0); + } +} + +static FORCEINLINE int win32_try_lock (MLOCK_T *sl) { + if (!interlockedexchange(&sl->l, 1)){ + dl_assert(!sl->threadid); + sl->threadid=CURRENT_THREAD; + sl->c=1; + return 1; + } + return 0; +} + +#define INITIAL_LOCK(sl) (memset(sl, 0, sizeof(MLOCK_T)), 0) +#define ACQUIRE_LOCK(sl) win32_acquire_lock(sl) +#define RELEASE_LOCK(sl) win32_release_lock(sl) +#define TRY_LOCK(sl) win32_try_lock(sl) +#define IS_LOCKED(sl) ((sl)->l) + +static MLOCK_T magic_init_mutex = {0, 0 }; +#if HAVE_MORECORE +static MLOCK_T morecore_mutex = {0, 0 }; +#endif /* HAVE_MORECORE */ + +#endif /* WIN32 */ +#else /* USE_SPIN_LOCKS */ + +#ifndef WIN32 +/* pthreads-based locks */ +struct pthread_mlock_t +{ + volatile unsigned int c; + pthread_mutex_t l; +}; +#define MLOCK_T struct pthread_mlock_t +#define CURRENT_THREAD pthread_self() +static 
FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) {
+  if(!pthread_mutex_lock(&(sl)->l)){
+    sl->c++;
+    return 0;
+  }
+  return 1;
+}
+
+static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) {
+  --sl->c;
+  pthread_mutex_unlock(&(sl)->l);
+}
+
+static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) {
+  if(!pthread_mutex_trylock(&(sl)->l)){
+    sl->c++;
+    return 1;
+  }
+  return 0;
+}
+
+static FORCEINLINE int pthread_init_lock (MLOCK_T *sl) {
+  pthread_mutexattr_t attr;
+  sl->c=0;
+  if(pthread_mutexattr_init(&attr)) return 1;
+  if(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) return 1;
+  if(pthread_mutex_init(&sl->l, &attr)) return 1;
+  pthread_mutexattr_destroy(&attr);
+  return 0;
+}
+
+static FORCEINLINE int pthread_islocked (MLOCK_T *sl) {
+  if(pthread_try_lock(sl)){
+    /* acquired: it is "locked" only if we already held it recursively */
+    int ret = (sl->c > 1);
+    pthread_release_lock(sl);
+    return ret;
+  }
+  return 1; /* trylock failed: another thread holds the lock */
+}
+
+#define INITIAL_LOCK(sl) pthread_init_lock(sl)
+#define ACQUIRE_LOCK(sl) pthread_acquire_lock(sl)
+#define RELEASE_LOCK(sl) pthread_release_lock(sl)
+#define TRY_LOCK(sl) pthread_try_lock(sl)
+#define IS_LOCKED(sl) pthread_islocked(sl)
+
+static MLOCK_T magic_init_mutex = {0, PTHREAD_MUTEX_INITIALIZER };
+#if HAVE_MORECORE
+static MLOCK_T morecore_mutex = {0, PTHREAD_MUTEX_INITIALIZER };
+#endif /* HAVE_MORECORE */
+
+#else /* WIN32 */
+/* Win32 critical sections */
+#define MLOCK_T         CRITICAL_SECTION
+#define CURRENT_THREAD  GetCurrentThreadId()
+#define INITIAL_LOCK(s) (!InitializeCriticalSectionAndSpinCount((s), 4000))
+#define ACQUIRE_LOCK(s) ( (!((s))->DebugInfo ? INITIAL_LOCK((s)) : 0), EnterCriticalSection((s)), 0)
+#define RELEASE_LOCK(s) ( LeaveCriticalSection((s)), 0 )
+#define TRY_LOCK(s)     ( TryEnterCriticalSection((s)) )
+#define IS_LOCKED(s)    ( (s)->LockCount >= 0 )
+#define NULL_LOCK_INITIALIZER
+static MLOCK_T magic_init_mutex;
+#if HAVE_MORECORE
+static MLOCK_T morecore_mutex;
+#endif /* HAVE_MORECORE */
+#endif /* WIN32 */
+#endif /* USE_SPIN_LOCKS */
+#endif /* USE_LOCKS == 1 */
+
+/* -----------------------  User-defined locks ------------------------ */
+
+#if USE_LOCKS > 1
+/* Define your own lock implementation here */
+/* #define INITIAL_LOCK(sl)  ... */
+/* #define ACQUIRE_LOCK(sl)  ... */
+/* #define RELEASE_LOCK(sl)  ... */
+/* #define TRY_LOCK(sl) ... */
+/* #define IS_LOCKED(sl) ... */
+/* #define NULL_LOCK_INITIALIZER ... */
+
+static MLOCK_T magic_init_mutex = NULL_LOCK_INITIALIZER;
+#if HAVE_MORECORE
+static MLOCK_T morecore_mutex = NULL_LOCK_INITIALIZER;
+#endif /* HAVE_MORECORE */
+#endif /* USE_LOCKS > 1 */
+
+/* -----------------------  Lock-based state ------------------------ */
+
+
+#if USE_LOCKS
+#define USE_LOCK_BIT               (2U)
+#else  /* USE_LOCKS */
+#define USE_LOCK_BIT               (0U)
+#define INITIAL_LOCK(l)
+#endif /* USE_LOCKS */
+
+#if USE_LOCKS && HAVE_MORECORE
+#define ACQUIRE_MORECORE_LOCK()    ACQUIRE_LOCK(&morecore_mutex);
+#define RELEASE_MORECORE_LOCK()    RELEASE_LOCK(&morecore_mutex);
+#else /* USE_LOCKS && HAVE_MORECORE */
+#define ACQUIRE_MORECORE_LOCK()
+#define RELEASE_MORECORE_LOCK()
+#endif /* USE_LOCKS && HAVE_MORECORE */
+
+#if USE_LOCKS
+#define ACQUIRE_MAGIC_INIT_LOCK()  ACQUIRE_LOCK(&magic_init_mutex);
+#define RELEASE_MAGIC_INIT_LOCK()  RELEASE_LOCK(&magic_init_mutex);
+#else  /* USE_LOCKS */
+#define ACQUIRE_MAGIC_INIT_LOCK()
+#define RELEASE_MAGIC_INIT_LOCK()
+#endif /* USE_LOCKS */
+
+
+/* -----------------------  Chunk representations ------------------------ */
+
+/*
+  (The following includes lightly edited explanations by Colin Plumb.)
+ + The malloc_chunk declaration below is misleading (but accurate and + necessary). It declares a "view" into memory allowing access to + necessary fields at known offsets from a given base. + + Chunks of memory are maintained using a `boundary tag' method as + originally described by Knuth. (See the paper by Paul Wilson + ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such + techniques.) Sizes of free chunks are stored both in the front of + each chunk and at the end. This makes consolidating fragmented + chunks into bigger chunks fast. The head fields also hold bits + representing whether chunks are free or in use. + + Here are some pictures to make it clearer. They are "exploded" to + show that the state of a chunk can be thought of as extending from + the high 31 bits of the head field of its header through the + prev_foot and PINUSE_BIT bit of the following chunk header. + + A chunk that's in use looks like: + + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of previous chunk (if P = 1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P| + | Size of this chunk 1| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + +- -+ + | | + +- -+ + | : + +- size - sizeof(size_t) available payload bytes -+ + : | + chunk-> +- -+ + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1| + | Size of next chunk (may or may not be in use) | +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + And if it's free, it looks like this: + + chunk-> +- -+ + | User payload (must be in use, or we would have merged!) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P| + | Size of this chunk 0| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next pointer | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Prev pointer | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | : + +- size - sizeof(struct chunk) unused bytes -+ + : | + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of this chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0| + | Size of next chunk (must be in use, or we would have merged)| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | : + +- User payload -+ + : | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |0| + +-+ + Note that since we always merge adjacent free chunks, the chunks + adjacent to a free chunk must be in use. + + Given a pointer to a chunk (which can be derived trivially from the + payload pointer) we can, in O(1) time, find out whether the adjacent + chunks are free, and if so, unlink them from the lists that they + are on and merge them with the current chunk. + + Chunks always begin on even word boundaries, so the mem portion + (which is returned to the user) is also on an even word boundary, and + thus at least double-word aligned. + + The P (PINUSE_BIT) bit, stored in the unused low-order bit of the + chunk size (which is always a multiple of two words), is an in-use + bit for the *previous* chunk. 
If that bit is *clear*, then the
+  word before the current chunk size contains the previous chunk
+  size, and can be used to find the front of the previous chunk.
+  The very first chunk allocated always has this bit set, preventing
+  access to non-existent (or non-owned) memory. If pinuse is set for
+  any given chunk, then you CANNOT determine the size of the
+  previous chunk, and might even get a memory addressing fault when
+  trying to do so.
+
+  The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of
+  the chunk size redundantly records whether the current chunk is
+  inuse. This redundancy enables usage checks within free and realloc,
+  and reduces indirection when freeing and consolidating chunks.
+
+  Each freshly allocated chunk must have both cinuse and pinuse set.
+  That is, each allocated chunk borders either a previously allocated
+  and still in-use chunk, or the base of its memory arena. This is
+  ensured by making all allocations from the `lowest' part of any
+  found chunk. Further, no free chunk physically borders another one,
+  so each free chunk is known to be preceded and followed by either
+  inuse chunks or the ends of memory.
+
+  Note that the `foot' of the current chunk is actually represented
+  as the prev_foot of the NEXT chunk. This makes it easier to
+  deal with alignments etc but can be very confusing when trying
+  to extend or adapt this code.
+
+  The exceptions to all this are
+
+     1. The special chunk `top' is the top-most available chunk (i.e.,
+        the one bordering the end of available memory). It is treated
+        specially. Top is never included in any bin, is used only if
+        no other chunk is available, and is released back to the
+        system if it is very large (see M_TRIM_THRESHOLD). In effect,
+        the top chunk is treated as larger (and thus less well
+        fitting) than any other available chunk. The top chunk
+        doesn't update its trailing size field since there is no next
+        contiguous chunk that would have to index off it. However,
+        space is still allocated for it (TOP_FOOT_SIZE) to enable
+        separation or merging when space is extended.
+
+     2. Chunks allocated via mmap, which have the lowest-order bit
+        (IS_MMAPPED_BIT) set in their prev_foot fields, and do not set
+        PINUSE_BIT in their head fields. Because they are allocated
+        one-by-one, each must carry its own prev_foot field, which is
+        also used to hold the offset this chunk has within its mmapped
+        region, which is needed to preserve alignment. Each mmapped
+        chunk is trailed by the first two fields of a fake next-chunk
+        for sake of usage checks.
+
+*/
+
+struct malloc_chunk {
+  size_t               prev_foot;  /* Size of previous chunk (if free).  */
+  size_t               head;       /* Size and inuse bits. */
+  struct malloc_chunk* fd;         /* double links -- used only if free. */
+  struct malloc_chunk* bk;
+};
+
+typedef struct malloc_chunk  mchunk;
+typedef struct malloc_chunk* mchunkptr;
+typedef struct malloc_chunk* sbinptr;  /* The type of bins of chunks */
+typedef unsigned int bindex_t;         /* Described below */
+typedef unsigned int binmap_t;         /* Described below */
+typedef unsigned int flag_t;           /* The type of various bit flag sets */
+
+/* ------------------- Chunks sizes and alignments ----------------------- */
+
+#define MCHUNK_SIZE         (sizeof(mchunk))
+
+#if FOOTERS
+#define CHUNK_OVERHEAD      (TWO_SIZE_T_SIZES)
+#else /* FOOTERS */
+#define CHUNK_OVERHEAD      (SIZE_T_SIZE)
+#endif /* FOOTERS */
+
+/* MMapped chunks need a second word of overhead ... */
+#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES)
+/* ...
and additional padding for fake next-chunk at foot */ +#define MMAP_FOOT_PAD (FOUR_SIZE_T_SIZES) + +/* The smallest size we can malloc is an aligned minimal chunk */ +#define MIN_CHUNK_SIZE\ + ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK) + +/* conversion from malloc headers to user pointers, and back */ +#define chunk2mem(p) ((void*)((char*)(p) + TWO_SIZE_T_SIZES)) +#define mem2chunk(mem) ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES)) +/* chunk associated with aligned address A */ +#define align_as_chunk(A) (mchunkptr)((A) + align_offset(chunk2mem(A))) + +/* Bounds on request (not chunk) sizes. */ +#define MAX_REQUEST ((-MIN_CHUNK_SIZE) << 2) +#define MIN_REQUEST (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE) + +/* pad request bytes into a usable size */ +#define pad_request(req) \ + (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK) + +/* pad request, checking for minimum (but not maximum) */ +#define request2size(req) \ + (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req)) + + +/* ------------------ Operations on head and foot fields ----------------- */ + +/* + The head field of a chunk is or'ed with PINUSE_BIT when previous + adjacent chunk in use, and or'ed with CINUSE_BIT if this chunk is in + use. If the chunk was obtained with mmap, the prev_foot field has + IS_MMAPPED_BIT set, otherwise holding the offset of the base of the + mmapped region to the base of the chunk. + + FLAG4_BIT is not used by this malloc, but might be useful in extensions. +*/ + +#define PINUSE_BIT (SIZE_T_ONE) +#define CINUSE_BIT (SIZE_T_TWO) +#define FLAG4_BIT (SIZE_T_FOUR) +#define INUSE_BITS (PINUSE_BIT|CINUSE_BIT) +#define FLAG_BITS (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT) + +/* Head value for fenceposts */ +#define FENCEPOST_HEAD (INUSE_BITS|SIZE_T_SIZE) + +/* extraction of fields from head words */ +#define cinuse(p) ((p)->head & CINUSE_BIT) +#define pinuse(p) ((p)->head & PINUSE_BIT) +#define chunksize(p) ((p)->head & ~(FLAG_BITS)) + +#define clear_pinuse(p) ((p)->head &= ~PINUSE_BIT) +#define clear_cinuse(p) ((p)->head &= ~CINUSE_BIT) + +/* Treat space at ptr +/- offset as a chunk */ +#define chunk_plus_offset(p, s) ((mchunkptr)(((char*)(p)) + (s))) +#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s))) + +/* Ptr to next or previous physical malloc_chunk. */ +#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS))) +#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) )) + +/* extract next chunk's pinuse bit */ +#define next_pinuse(p) ((next_chunk(p)->head) & PINUSE_BIT) + +/* Get/set size at footer */ +#define get_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_foot) +#define set_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s)) + +/* Set size, pinuse bit, and foot */ +#define set_size_and_pinuse_of_free_chunk(p, s)\ + ((p)->head = (s|PINUSE_BIT), set_foot(p, s)) + +/* Set size, pinuse bit, foot, and clear next pinuse */ +#define set_free_with_pinuse(p, s, n)\ + (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s)) + +#define is_mmapped(p)\ + (!((p)->head & PINUSE_BIT) && ((p)->prev_foot & IS_MMAPPED_BIT)) + +/* Get the internal overhead associated with chunk p */ +#define overhead_for(p)\ + (is_mmapped(p)? 
MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD)
+
+/* Return true if malloced space is not necessarily cleared */
+#if MMAP_CLEARS
+#define calloc_must_clear(p) (!is_mmapped(p))
+#else /* MMAP_CLEARS */
+#define calloc_must_clear(p) (1)
+#endif /* MMAP_CLEARS */
+
+/* ---------------------- Overlaid data structures ----------------------- */
+
+/*
+  When chunks are not in use, they are treated as nodes of either
+  lists or trees.
+
+  "Small" chunks are stored in circular doubly-linked lists, and look
+  like this:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of previous chunk                            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Forward pointer to next chunk in list             |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Back pointer to previous chunk in list            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Unused space (may be 0 bytes long)                .
+            .                                                               .
+            .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+  Larger chunks are kept in a form of bitwise digital trees (aka
+  tries) keyed on chunksizes. Because malloc_tree_chunks are only for
+  free chunks greater than 256 bytes, their size doesn't impose any
+  constraints on user chunk sizes. Each node looks like:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of previous chunk                            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Forward pointer to next chunk of same size        |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Back pointer to previous chunk of same size       |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to left child (child[0])                  |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to right child (child[1])                 |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to parent                                 |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             bin index of this chunk                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Unused space                                      .
+            .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+  Each tree holding treenodes is a tree of unique chunk sizes. Chunks
+  of the same size are arranged in a circularly-linked list, with only
+  the oldest chunk (the next to be used, in our FIFO ordering)
+  actually in the tree. (Tree members are distinguished by a non-null
+  parent pointer.) If a chunk with the same size as an existing node
+  is inserted, it is linked off the existing node using pointers that
+  work in the same way as fd/bk pointers of small chunks.
+
+  Each tree contains a power of 2 sized range of chunk sizes (the
+  smallest is 0x100 <= x < 0x180), which is divided in half at each
+  tree level, with the chunks in the smaller half of the range (0x100
+  <= x < 0x140 for the top node) in the left subtree and the larger
+  half (0x140 <= x < 0x180) in the right subtree. This is, of course,
+  done by inspecting individual bits.
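+
+  As a worked example (illustrative arithmetic only): with this
+  scheme, treebin 0 holds free chunks with 0x100 <= size < 0x180 and
+  treebin 1 holds 0x180 <= size < 0x200; the next power-of-two range,
+  0x200 <= size < 0x400, is likewise split between treebins 2 and 3,
+  and so on. A free chunk of size 0x120 therefore lands in treebin 0,
+  and bit 0x40 of its size (clear here, since 0x120 < 0x140) selects
+  the left subtree at the root level.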
+ + Using these rules, each node's left subtree contains all smaller + sizes than its right subtree. However, the node at the root of each + subtree has no particular ordering relationship to either. (The + dividing line between the subtree sizes is based on trie relation.) + If we remove the last chunk of a given size from the interior of the + tree, we need to replace it with a leaf node. The tree ordering + rules permit a node to be replaced by any leaf below it. + + The smallest chunk in a tree (a common operation in a best-fit + allocator) can be found by walking a path to the leftmost leaf in + the tree. Unlike a usual binary tree, where we follow left child + pointers until we reach a null, here we follow the right child + pointer any time the left one is null, until we reach a leaf with + both child pointers null. The smallest chunk in the tree will be + somewhere along that path. + + The worst case number of steps to add, find, or remove a node is + bounded by the number of bits differentiating chunks within + bins. Under current bin calculations, this ranges from 6 up to 21 + (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case + is of course much better. +*/ + +struct malloc_tree_chunk { + /* The first four fields must be compatible with malloc_chunk */ + size_t prev_foot; + size_t head; + struct malloc_tree_chunk* fd; + struct malloc_tree_chunk* bk; + + struct malloc_tree_chunk* child[2]; + struct malloc_tree_chunk* parent; + bindex_t index; +}; + +typedef struct malloc_tree_chunk tchunk; +typedef struct malloc_tree_chunk* tchunkptr; +typedef struct malloc_tree_chunk* tbinptr; /* The type of bins of trees */ + +/* A little helper macro for trees */ +#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1]) + +/* ----------------------------- Segments -------------------------------- */ + +/* + Each malloc space may include non-contiguous segments, held in a + list headed by an embedded malloc_segment record representing the + top-most space. Segments also include flags holding properties of + the space. Large chunks that are directly allocated by mmap are not + included in this list. They are instead independently created and + destroyed without otherwise keeping track of them. + + Segment management mainly comes into play for spaces allocated by + MMAP. Any call to MMAP might or might not return memory that is + adjacent to an existing segment. MORECORE normally contiguously + extends the current space, so this space is almost always adjacent, + which is simpler and faster to deal with. (This is why MORECORE is + used preferentially to MMAP when both are available -- see + sys_alloc.) When allocating using MMAP, we don't use any of the + hinting mechanisms (inconsistently) supported in various + implementations of unix mmap, or distinguish reserving from + committing memory. Instead, we just ask for space, and exploit + contiguity when we get it. It is probably possible to do + better than this on some systems, but no general scheme seems + to be significantly better. + + Management entails a simpler variant of the consolidation scheme + used for chunks to reduce fragmentation -- new adjacent memory is + normally prepended or appended to an existing segment. 
However, + there are limitations compared to chunk consolidation that mostly + reflect the fact that segment processing is relatively infrequent + (occurring only when getting memory from system) and that we + don't expect to have huge numbers of segments: + + * Segments are not indexed, so traversal requires linear scans. (It + would be possible to index these, but is not worth the extra + overhead and complexity for most programs on most platforms.) + * New segments are only appended to old ones when holding top-most + memory; if they cannot be prepended to others, they are held in + different segments. + + Except for the top-most segment of an mstate, each segment record + is kept at the tail of its segment. Segments are added by pushing + segment records onto the list headed by &mstate.seg for the + containing mstate. + + Segment flags control allocation/merge/deallocation policies: + * If EXTERN_BIT set, then we did not allocate this segment, + and so should not try to deallocate or merge with others. + (This currently holds only for the initial segment passed + into create_mspace_with_base.) + * If IS_MMAPPED_BIT set, the segment may be merged with + other surrounding mmapped segments and trimmed/de-allocated + using munmap. + * If neither bit is set, then the segment was obtained using + MORECORE so can be merged with surrounding MORECORE'd segments + and deallocated/trimmed using MORECORE with negative arguments. +*/ + +struct malloc_segment { + char* base; /* base address */ + size_t size; /* allocated size */ + struct malloc_segment* next; /* ptr to next segment */ + flag_t sflags; /* mmap and extern flag */ +}; + +#define is_mmapped_segment(S) ((S)->sflags & IS_MMAPPED_BIT) +#define is_extern_segment(S) ((S)->sflags & EXTERN_BIT) + +typedef struct malloc_segment msegment; +typedef struct malloc_segment* msegmentptr; + +/* ---------------------------- malloc_state ----------------------------- */ + +/* + A malloc_state holds all of the bookkeeping for a space. + The main fields are: + + Top + The topmost chunk of the currently active segment. Its size is + cached in topsize. The actual size of topmost space is + topsize+TOP_FOOT_SIZE, which includes space reserved for adding + fenceposts and segment records if necessary when getting more + space from the system. The size at which to autotrim top is + cached from mparams in trim_check, except that it is disabled if + an autotrim fails. + + Designated victim (dv) + This is the preferred chunk for servicing small requests that + don't have exact fits. It is normally the chunk split off most + recently to service another small request. Its size is cached in + dvsize. The link fields of this chunk are not maintained since it + is not kept in a bin. + + SmallBins + An array of bin headers for free chunks. These bins hold chunks + with sizes less than MIN_LARGE_SIZE bytes. Each bin contains + chunks of all the same size, spaced 8 bytes apart. To simplify + use in double-linked lists, each bin header acts as a malloc_chunk + pointing to the real first node, if it exists (else pointing to + itself). This avoids special-casing for headers. But to avoid + waste, we allocate only the fd/bk pointers of bins, and then use + repositioning tricks to treat these as the fields of a chunk. + + TreeBins + Treebins are pointers to the roots of trees holding a range of + sizes. There are 2 equally spaced treebins for each power of two + from TREE_SHIFT to TREE_SHIFT+16. The last bin holds anything + larger. 
+
+  Bin maps
+    There is one bit map for small bins ("smallmap") and one for
+    treebins ("treemap"). Each bin sets its bit when non-empty, and
+    clears the bit when empty. Bit operations are then used to avoid
+    bin-by-bin searching -- nearly all "search" is done without ever
+    looking at bins that won't be selected. The bit maps
+    conservatively use 32 bits per map word, even if on 64bit system.
+    For a good description of some of the bit-based techniques used
+    here, see Henry S. Warren Jr's book "Hacker's Delight" (and
+    supplement at http://hackersdelight.org/). Many of these are
+    intended to reduce the branchiness of paths through malloc etc, as
+    well as to reduce the number of memory locations read or written.
+
+  Segments
+    A list of segments headed by an embedded malloc_segment record
+    representing the initial space.
+
+  Address check support
+    The least_addr field is the least address ever obtained from
+    MORECORE or MMAP. Attempted frees and reallocs of any address less
+    than this are trapped (unless INSECURE is defined).
+
+  Magic tag
+    A cross-check field that should always hold same value as mparams.magic.
+
+  Flags
+    Bits recording whether to use MMAP, locks, or contiguous MORECORE
+
+  Statistics
+    Each space keeps track of current and maximum system memory
+    obtained via MORECORE or MMAP.
+
+  Trim support
+    Fields holding the amount of unused topmost memory that should trigger
+    trimming, and a counter to force periodic scanning to release unused
+    non-topmost segments.
+
+  Locking
+    If USE_LOCKS is defined, the "mutex" lock is acquired and released
+    around every public call using this mspace.
+
+  Extension support
+    A void* pointer and a size_t field that can be used to help implement
+    extensions to this malloc.
+*/
+
+/* Bin types, widths and sizes */
+#define NSMALLBINS        (32U)
+#define NTREEBINS         (32U)
+#define SMALLBIN_SHIFT    (3U)
+#define SMALLBIN_WIDTH    (SIZE_T_ONE << SMALLBIN_SHIFT)
+#define TREEBIN_SHIFT     (8U)
+#define MIN_LARGE_SIZE    (SIZE_T_ONE << TREEBIN_SHIFT)
+#define MAX_SMALL_SIZE    (MIN_LARGE_SIZE - SIZE_T_ONE)
+#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD)
+
+struct malloc_state {
+  binmap_t   smallmap;
+  binmap_t   treemap;
+  size_t     dvsize;
+  size_t     topsize;
+  char*      least_addr;
+  mchunkptr  dv;
+  mchunkptr  top;
+  size_t     trim_check;
+  size_t     release_checks;
+  size_t     magic;
+  mchunkptr  smallbins[(NSMALLBINS+1)*2];
+  tbinptr    treebins[NTREEBINS];
+  size_t     footprint;
+  size_t     max_footprint;
+  flag_t     mflags;
+#if USE_LOCKS
+  MLOCK_T    mutex;     /* locate lock among fields that rarely change */
+#endif /* USE_LOCKS */
+  msegment   seg;
+  void*      extp;      /* Unused but available for extensions */
+  size_t     exts;
+};
+
+typedef struct malloc_state*    mstate;
+
+/* ------------- Global malloc_state and malloc_params ------------------- */
+
+/*
+  malloc_params holds global properties, including those that can be
+  dynamically set using mallopt. There is a single instance, mparams,
+  initialized in init_mparams.
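+
+  For instance (an illustrative sketch only), a program that wants
+  eager trimming and earlier use of mmap for large requests could
+  adjust the corresponding mparams fields at run time with:
+
+    dlmallopt(M_TRIM_THRESHOLD, 128*1024);  /* sets trim_threshold */
+    dlmallopt(M_MMAP_THRESHOLD,  64*1024);  /* sets mmap_threshold */
+
+  Each call returns 1 if the new value was accepted, 0 otherwise.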
+*/ + +struct malloc_params { + size_t magic; + size_t page_size; + size_t granularity; + size_t mmap_threshold; + size_t trim_threshold; + flag_t default_mflags; +}; + +static struct malloc_params mparams; + +#if !ONLY_MSPACES + +/* The global malloc_state used for all non-"mspace" calls */ +static struct malloc_state _gm_; +#define gm (&_gm_) +#define is_global(M) ((M) == &_gm_) + +#endif /* !ONLY_MSPACES */ + +#define is_initialized(M) ((M)->top != 0) + +/* -------------------------- system alloc setup ------------------------- */ + +/* Operations on mflags */ + +#define use_lock(M) ((M)->mflags & USE_LOCK_BIT) +#define enable_lock(M) ((M)->mflags |= USE_LOCK_BIT) +#define disable_lock(M) ((M)->mflags &= ~USE_LOCK_BIT) + +#define use_mmap(M) ((M)->mflags & USE_MMAP_BIT) +#define enable_mmap(M) ((M)->mflags |= USE_MMAP_BIT) +#define disable_mmap(M) ((M)->mflags &= ~USE_MMAP_BIT) + +#define use_noncontiguous(M) ((M)->mflags & USE_NONCONTIGUOUS_BIT) +#define disable_contiguous(M) ((M)->mflags |= USE_NONCONTIGUOUS_BIT) + +#define set_lock(M,L)\ + ((M)->mflags = (L)?\ + ((M)->mflags | USE_LOCK_BIT) :\ + ((M)->mflags & ~USE_LOCK_BIT)) + +/* page-align a size */ +#define page_align(S)\ + (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE)) + +/* granularity-align a size */ +#define granularity_align(S)\ + (((S) + (mparams.granularity - SIZE_T_ONE))\ + & ~(mparams.granularity - SIZE_T_ONE)) + + +/* For mmap, use granularity alignment on windows, else page-align */ +#ifdef WIN32 +#define mmap_align(S) granularity_align(S) +#else +#define mmap_align(S) page_align(S) +#endif + +#define is_page_aligned(S)\ + (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0) +#define is_granularity_aligned(S)\ + (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0) + +/* True if segment S holds address A */ +#define segment_holds(S, A)\ + ((char*)(A) >= S->base && (char*)(A) < S->base + S->size) + +/* Return segment holding given address */ +static msegmentptr segment_holding(mstate m, char* addr) { + msegmentptr sp = &m->seg; + for (;;) { + if (addr >= sp->base && addr < sp->base + sp->size) + return sp; + if ((sp = sp->next) == 0) + return 0; + } +} + +/* Return true if segment contains a segment link */ +static int has_segment_link(mstate m, msegmentptr ss) { + msegmentptr sp = &m->seg; + for (;;) { + if ((char*)sp >= ss->base && (char*)sp < ss->base + ss->size) + return 1; + if ((sp = sp->next) == 0) + return 0; + } +} + +#ifndef MORECORE_CANNOT_TRIM +#define should_trim(M,s) ((s) > (M)->trim_check) +#else /* MORECORE_CANNOT_TRIM */ +#define should_trim(M,s) (0) +#endif /* MORECORE_CANNOT_TRIM */ + +/* + TOP_FOOT_SIZE is padding at the end of a segment, including space + that may be needed to place segment records and fenceposts when new + noncontiguous segments are added. +*/ +#define TOP_FOOT_SIZE\ + (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE) + + +/* ------------------------------- Hooks -------------------------------- */ + +/* + PREACTION should be defined to return 0 on success, and nonzero on + failure. If you are not using locking, you can redefine these to do + anything you like. +*/ + +#if USE_LOCKS + +/* Ensure locks are initialized */ +#define GLOBALLY_INITIALIZE() (mparams.page_size == 0 && init_mparams()) + +#define PREACTION(M) ((GLOBALLY_INITIALIZE() || use_lock(M))? 
ACQUIRE_LOCK(&(M)->mutex) : 0) +#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); } +#else /* USE_LOCKS */ + +#ifndef PREACTION +#define PREACTION(M) (0) +#endif /* PREACTION */ + +#ifndef POSTACTION +#define POSTACTION(M) +#endif /* POSTACTION */ + +#endif /* USE_LOCKS */ + +/* + CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses. + USAGE_ERROR_ACTION is triggered on detected bad frees and + reallocs. The argument p is an address that might have triggered the + fault. It is ignored by the two predefined actions, but might be + useful in custom actions that try to help diagnose errors. +*/ + +#if PROCEED_ON_ERROR + +/* A count of the number of corruption errors causing resets */ +int malloc_corruption_error_count; + +/* default corruption action */ +static void reset_on_error(mstate m); + +#define CORRUPTION_ERROR_ACTION(m) reset_on_error(m) +#define USAGE_ERROR_ACTION(m, p) + +#else /* PROCEED_ON_ERROR */ + +#ifndef CORRUPTION_ERROR_ACTION +#define CORRUPTION_ERROR_ACTION(m) ABORT +#endif /* CORRUPTION_ERROR_ACTION */ + +#ifndef USAGE_ERROR_ACTION +#define USAGE_ERROR_ACTION(m,p) ABORT +#endif /* USAGE_ERROR_ACTION */ + +#endif /* PROCEED_ON_ERROR */ + +/* -------------------------- Debugging setup ---------------------------- */ + +#if ! DL_DEBUG + +#define check_free_chunk(M,P) +#define check_inuse_chunk(M,P) +#define check_malloced_chunk(M,P,N) +#define check_mmapped_chunk(M,P) +#define check_malloc_state(M) +#define check_top_chunk(M,P) + +#else /* DL_DEBUG */ +#define check_free_chunk(M,P) do_check_free_chunk(M,P) +#define check_inuse_chunk(M,P) do_check_inuse_chunk(M,P) +#define check_top_chunk(M,P) do_check_top_chunk(M,P) +#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N) +#define check_mmapped_chunk(M,P) do_check_mmapped_chunk(M,P) +#define check_malloc_state(M) do_check_malloc_state(M) + +static void do_check_any_chunk(mstate m, mchunkptr p); +static void do_check_top_chunk(mstate m, mchunkptr p); +static void do_check_mmapped_chunk(mstate m, mchunkptr p); +static void do_check_inuse_chunk(mstate m, mchunkptr p); +static void do_check_free_chunk(mstate m, mchunkptr p); +static void do_check_malloced_chunk(mstate m, void* mem, size_t s); +static void do_check_tree(mstate m, tchunkptr t); +static void do_check_treebin(mstate m, bindex_t i); +static void do_check_smallbin(mstate m, bindex_t i); +static void do_check_malloc_state(mstate m); +static int bin_find(mstate m, mchunkptr x); +static size_t traverse_and_check(mstate m); +#endif /* DL_DEBUG */ + +/* ---------------------------- Indexing Bins ---------------------------- */ + +#define is_small(s) (((s) >> SMALLBIN_SHIFT) < NSMALLBINS) +#define small_index(s) ((s) >> SMALLBIN_SHIFT) +#define small_index2size(i) ((i) << SMALLBIN_SHIFT) +#define MIN_SMALL_INDEX (small_index(MIN_CHUNK_SIZE)) + +/* addressing by index. 
See above about smallbin repositioning */ +#define smallbin_at(M, i) ((sbinptr)((char*)&((M)->smallbins[(i)<<1]))) +#define treebin_at(M,i) (&((M)->treebins[i])) + +/* assign tree index for size S to variable I */ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define compute_tree_index(S, I)\ +{\ + unsigned int X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int K;\ + __asm__("bsrl\t%1, %0\n\t" : "=r" (K) : "g" (X));\ + I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ + }\ +} + +#elif defined(_MSC_VER) && _MSC_VER>=1300 +#define compute_tree_index(S, I)\ +{\ + size_t X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int K;\ + _BitScanReverse((DWORD *) &K, X);\ + I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ + }\ +} +#else /* GNUC */ +#define compute_tree_index(S, I)\ +{\ + size_t X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int Y = (unsigned int)X;\ + unsigned int N = ((Y - 0x100) >> 16) & 8;\ + unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\ + N += K;\ + N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\ + K = 14 - N + ((Y <<= K) >> 15);\ + I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\ + }\ +} +#endif /* GNUC */ + +/* Bit representing maximum resolved size in a treebin at i */ +#define bit_for_tree_index(i) \ + (i == NTREEBINS-1)? (SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2) + +/* Shift placing maximum resolved bit in a treebin at i as sign bit */ +#define leftshift_for_tree_index(i) \ + ((i == NTREEBINS-1)? 0 : \ + ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2))) + +/* The size of the smallest chunk held in bin with index i */ +#define minsize_for_tree_index(i) \ + ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) | \ + (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1))) + + +/* ------------------------ Operations on bin maps ----------------------- */ + +/* bit corresponding to given index */ +#define idx2bit(i) ((binmap_t)(1) << (i)) + +/* Mark/Clear bits with given index */ +#define mark_smallmap(M,i) ((M)->smallmap |= idx2bit(i)) +#define clear_smallmap(M,i) ((M)->smallmap &= ~idx2bit(i)) +#define smallmap_is_marked(M,i) ((M)->smallmap & idx2bit(i)) + +#define mark_treemap(M,i) ((M)->treemap |= idx2bit(i)) +#define clear_treemap(M,i) ((M)->treemap &= ~idx2bit(i)) +#define treemap_is_marked(M,i) ((M)->treemap & idx2bit(i)) + +/* index corresponding to given bit */ + +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define compute_bit2idx(X, I)\ +{\ + unsigned int J;\ + __asm__("bsfl\t%1, %0\n\t" : "=r" (J) : "g" (X));\ + I = (bindex_t)J;\ +} +#elif defined(_MSC_VER) && _MSC_VER>=1300 +#define compute_bit2idx(X, I)\ +{\ + unsigned int J;\ + _BitScanForward((DWORD *) &J, X);\ + I = (bindex_t)J;\ +} + +#else /* GNUC */ +#if USE_BUILTIN_FFS +#define compute_bit2idx(X, I) I = ffs(X)-1 + +#else /* USE_BUILTIN_FFS */ +#define compute_bit2idx(X, I)\ +{\ + unsigned int Y = X - 1;\ + unsigned int K = Y >> (16-4) & 16;\ + unsigned int N = K; Y >>= K;\ + N += K = Y >> (8-3) & 8; Y >>= K;\ + N += K = Y >> (4-2) & 4; Y >>= K;\ + N += K = Y >> (2-1) & 2; Y >>= K;\ + N += K = Y >> (1-0) & 1; Y >>= K;\ + I = (bindex_t)(N + Y);\ +} +#endif /* USE_BUILTIN_FFS */ +#endif /* GNUC */ + +/* isolate the least set bit of a bitmap */ +#define least_bit(x) ((x) & -(x)) + +/* mask with all 
bits to left of least bit of x on */ +#define left_bits(x) ((x<<1) | -(x<<1)) + +/* mask with all bits to left of or equal to least bit of x on */ +#define same_or_left_bits(x) ((x) | -(x)) + + +/* ----------------------- Runtime Check Support ------------------------- */ + +/* + For security, the main invariant is that malloc/free/etc never + writes to a static address other than malloc_state, unless static + malloc_state itself has been corrupted, which cannot occur via + malloc (because of these checks). In essence this means that we + believe all pointers, sizes, maps etc held in malloc_state, but + check all of those linked or offsetted from other embedded data + structures. These checks are interspersed with main code in a way + that tends to minimize their run-time cost. + + When FOOTERS is defined, in addition to range checking, we also + verify footer fields of inuse chunks, which can be used guarantee + that the mstate controlling malloc/free is intact. This is a + streamlined version of the approach described by William Robertson + et al in "Run-time Detection of Heap-based Overflows" LISA'03 + http://www.usenix.org/events/lisa03/tech/robertson.html The footer + of an inuse chunk holds the xor of its mstate and a random seed, + that is checked upon calls to free() and realloc(). This is + (probablistically) unguessable from outside the program, but can be + computed by any code successfully malloc'ing any chunk, so does not + itself provide protection against code that has already broken + security through some other means. Unlike Robertson et al, we + always dynamically check addresses of all offset chunks (previous, + next, etc). This turns out to be cheaper than relying on hashes. +*/ + +#if !INSECURE +/* Check if address a is at least as high as any from MORECORE or MMAP */ +#define ok_address(M, a) ((char*)(a) >= (M)->least_addr) +/* Check if address of next chunk n is higher than base chunk p */ +#define ok_next(p, n) ((char*)(p) < (char*)(n)) +/* Check if p has its cinuse bit on */ +#define ok_cinuse(p) cinuse(p) +/* Check if p has its pinuse bit on */ +#define ok_pinuse(p) pinuse(p) + +#else /* !INSECURE */ +#define ok_address(M, a) (1) +#define ok_next(b, n) (1) +#define ok_cinuse(p) (1) +#define ok_pinuse(p) (1) +#endif /* !INSECURE */ + +#if (FOOTERS && !INSECURE) +/* Check if (alleged) mstate m has expected magic field */ +/* Modified by sasha: also check that address is within memheap */ +#define ok_magic(M) ((char *)(M) >= (char *)gm->least_addr && (M)->magic == mparams.magic) +#else /* (FOOTERS && !INSECURE) */ +#define ok_magic(M) (1) +#endif /* (FOOTERS && !INSECURE) */ + + +/* In gcc, use __builtin_expect to minimize impact of checks */ +#if !INSECURE +#if defined(__GNUC__) && __GNUC__ >= 3 +#define RTCHECK(e) __builtin_expect(e, 1) +#else /* GNUC */ +#define RTCHECK(e) (e) +#endif /* GNUC */ +#else /* !INSECURE */ +#define RTCHECK(e) (1) +#endif /* !INSECURE */ + +/* macros to set up inuse chunks with or without footers */ + +#if !FOOTERS + +#define mark_inuse_foot(M,p,s) + +/* Set cinuse bit and pinuse bit of next chunk */ +#define set_inuse(M,p,s)\ + ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\ + ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT) + +/* Set cinuse and pinuse of this chunk and pinuse of next chunk */ +#define set_inuse_and_pinuse(M,p,s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT) + +/* Set size, cinuse and pinuse bit of this chunk */ +#define 
set_size_and_pinuse_of_inuse_chunk(M, p, s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT)) + +#else /* FOOTERS */ + +/* Set foot of inuse chunk to be xor of mstate and seed */ +#define mark_inuse_foot(M,p,s)\ + (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic)) + +#define get_mstate_for(p)\ + ((mstate)(((mchunkptr)((char*)(p) +\ + (chunksize(p))))->prev_foot ^ mparams.magic)) + +#define set_inuse(M,p,s)\ + ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\ + (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \ + mark_inuse_foot(M,p,s)) + +#define set_inuse_and_pinuse(M,p,s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\ + mark_inuse_foot(M,p,s)) + +#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + mark_inuse_foot(M, p, s)) + +#endif /* !FOOTERS */ + +/* ---------------------------- setting mparams -------------------------- */ + +/* Initialize mparams */ +static int init_mparams(void) { + if (mparams.page_size == 0) { + size_t s; + + mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD; + mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD; +#if MORECORE_CONTIGUOUS + mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT; +#else /* MORECORE_CONTIGUOUS */ + mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT; +#endif /* MORECORE_CONTIGUOUS */ + +#if (FOOTERS && !INSECURE) + { +#if USE_DEV_RANDOM + int fd; + unsigned char buf[sizeof(size_t)]; + /* Try to use /dev/urandom, else fall back on using time */ + if ((fd = open("/dev/urandom", O_RDONLY)) >= 0 && + read(fd, buf, sizeof(buf)) == sizeof(buf)) { + s = *((size_t *) buf); + close(fd); + } + else +#endif /* USE_DEV_RANDOM */ + s = (size_t)(time(0) ^ (size_t)0x55555555U); + + s |= (size_t)8U; /* ensure nonzero */ + s &= ~(size_t)7U; /* improve chances of fault for bad values */ + + } +#else /* (FOOTERS && !INSECURE) */ + s = (size_t)0x58585858U; +#endif /* (FOOTERS && !INSECURE) */ + ACQUIRE_MAGIC_INIT_LOCK(); + if (mparams.magic == 0) { + mparams.magic = s; +#if !ONLY_MSPACES + /* Set up lock for main malloc area */ + INITIAL_LOCK(&gm->mutex); + gm->mflags = mparams.default_mflags; +#endif + } + RELEASE_MAGIC_INIT_LOCK(); + +#ifndef WIN32 + mparams.page_size = malloc_getpagesize; + mparams.granularity = ((DEFAULT_GRANULARITY != 0)? + DEFAULT_GRANULARITY : mparams.page_size); +#else /* WIN32 */ + { + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + mparams.page_size = system_info.dwPageSize; + mparams.granularity = system_info.dwAllocationGranularity; + } +#endif /* WIN32 */ + + /* Sanity-check configuration: + size_t must be unsigned and as wide as pointer type. + ints must be at least 4 bytes. + alignment must be at least 8. + Alignment, min chunk size, and page size must all be powers of 2. 
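+   The power-of-two tests below all rely on the standard x & (x-1) idiom:
+   clearing the lowest set bit of a power of two leaves zero. A tiny
+   self-contained illustration (hypothetical helper name):
+
+       #include <assert.h>
+       #include <stddef.h>
+       // nonzero x is a power of two iff x & (x - 1) == 0
+       static int is_pow2(size_t x) { return x != 0 && (x & (x - 1)) == 0; }
+       int main(void) {
+         assert(is_pow2(4096) && !is_pow2(4095));
+         return 0;
+       }
+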
+ */ + if ((sizeof(size_t) != sizeof(char*)) || + (MAX_SIZE_T < MIN_CHUNK_SIZE) || + (sizeof(int) < 4) || + (MALLOC_ALIGNMENT < (size_t)8U) || + ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) || + ((MCHUNK_SIZE & (MCHUNK_SIZE-SIZE_T_ONE)) != 0) || + ((mparams.granularity & (mparams.granularity-SIZE_T_ONE)) != 0) || + ((mparams.page_size & (mparams.page_size-SIZE_T_ONE)) != 0)) + ABORT; + } + return 0; +} + +/* support for mallopt */ +static int change_mparam(int param_number, int value) { + size_t val = (size_t)value; + init_mparams(); + switch(param_number) { + case M_TRIM_THRESHOLD: + mparams.trim_threshold = val; + return 1; + case M_GRANULARITY: + if (val >= mparams.page_size && ((val & (val-1)) == 0)) { + mparams.granularity = val; + return 1; + } + else + return 0; + case M_MMAP_THRESHOLD: + mparams.mmap_threshold = val; + return 1; + default: + return 0; + } +} + +#if DL_DEBUG +/* ------------------------- Debugging Support --------------------------- */ + +/* Check properties of any chunk, whether free, inuse, mmapped etc */ +static void do_check_any_chunk(mstate m, mchunkptr p) { + dl_assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + dl_assert(ok_address(m, p)); +} + +/* Check properties of top chunk */ +static void do_check_top_chunk(mstate m, mchunkptr p) { + msegmentptr sp = segment_holding(m, (char*)p); + size_t sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! */ + dl_assert(sp != 0); + dl_assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + dl_assert(ok_address(m, p)); + dl_assert(sz == m->topsize); + dl_assert(sz > 0); + dl_assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE); + dl_assert(pinuse(p)); + dl_assert(!pinuse(chunk_plus_offset(p, sz))); +} + +/* Check properties of (inuse) mmapped chunks */ +static void do_check_mmapped_chunk(mstate m, mchunkptr p) { + size_t sz = chunksize(p); + size_t len = (sz + (p->prev_foot & ~IS_MMAPPED_BIT) + MMAP_FOOT_PAD); + dl_assert(is_mmapped(p)); + dl_assert(use_mmap(m)); + dl_assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + dl_assert(ok_address(m, p)); + dl_assert(!is_small(sz)); + dl_assert((len & (mparams.page_size-SIZE_T_ONE)) == 0); + dl_assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD); + dl_assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0); +} + +/* Check properties of inuse chunks */ +static void do_check_inuse_chunk(mstate m, mchunkptr p) { + do_check_any_chunk(m, p); + dl_assert(cinuse(p)); + dl_assert(next_pinuse(p)); + /* If not pinuse and not mmapped, previous chunk has OK offset */ + dl_assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p); + if (is_mmapped(p)) + do_check_mmapped_chunk(m, p); +} + +/* Check properties of free chunks */ +static void do_check_free_chunk(mstate m, mchunkptr p) { + size_t sz = chunksize(p); + mchunkptr next = chunk_plus_offset(p, sz); + do_check_any_chunk(m, p); + dl_assert(!cinuse(p)); + dl_assert(!next_pinuse(p)); + assert (!is_mmapped(p)); + if (p != m->dv && p != m->top) { + if (sz >= MIN_CHUNK_SIZE) { + dl_assert((sz & CHUNK_ALIGN_MASK) == 0); + dl_assert(is_aligned(chunk2mem(p))); + dl_assert(next->prev_foot == sz); + dl_assert(pinuse(p)); + assert (next == m->top || cinuse(next)); + dl_assert(p->fd->bk == p); + dl_assert(p->bk->fd == p); + } + else /* markers are always of size SIZE_T_SIZE */ + dl_assert(sz == SIZE_T_SIZE); + } +} + +/* Check properties of malloced chunks at the point they are malloced */ +static void do_check_malloced_chunk(mstate m, void* mem, size_t 
s) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); + size_t sz = p->head & ~(PINUSE_BIT|CINUSE_BIT); + do_check_inuse_chunk(m, p); + dl_assert((sz & CHUNK_ALIGN_MASK) == 0); + dl_assert(sz >= MIN_CHUNK_SIZE); + dl_assert(sz >= s); + /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */ + dl_assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE)); + } +} + +/* Check a tree and its subtrees. */ +static void do_check_tree(mstate m, tchunkptr t) { + tchunkptr head = 0; + tchunkptr u = t; + bindex_t tindex = t->index; + size_t tsize = chunksize(t); + bindex_t idx; + compute_tree_index(tsize, idx); + dl_assert(tindex == idx); + dl_assert(tsize >= MIN_LARGE_SIZE); + dl_assert(tsize >= minsize_for_tree_index(idx)); + dl_assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1)))); + + do { /* traverse through chain of same-sized nodes */ + do_check_any_chunk(m, ((mchunkptr)u)); + dl_assert(u->index == tindex); + dl_assert(chunksize(u) == tsize); + dl_assert(!cinuse(u)); + dl_assert(!next_pinuse(u)); + dl_assert(u->fd->bk == u); + dl_assert(u->bk->fd == u); + if (u->parent == 0) { + dl_assert(u->child[0] == 0); + dl_assert(u->child[1] == 0); + } + else { + dl_assert(head == 0); /* only one node on chain has parent */ + head = u; + dl_assert(u->parent != u); + assert (u->parent->child[0] == u || + u->parent->child[1] == u || + *((tbinptr*)(u->parent)) == u); + if (u->child[0] != 0) { + dl_assert(u->child[0]->parent == u); + dl_assert(u->child[0] != u); + do_check_tree(m, u->child[0]); + } + if (u->child[1] != 0) { + dl_assert(u->child[1]->parent == u); + dl_assert(u->child[1] != u); + do_check_tree(m, u->child[1]); + } + if (u->child[0] != 0 && u->child[1] != 0) { + dl_assert(chunksize(u->child[0]) < chunksize(u->child[1])); + } + } + u = u->fd; + } while (u != t); + dl_assert(head != 0); +} + +/* Check all the chunks in a treebin. */ +static void do_check_treebin(mstate m, bindex_t i) { + tbinptr* tb = treebin_at(m, i); + tchunkptr t = *tb; + int empty = (m->treemap & (1U << i)) == 0; + if (t == 0) + dl_assert(empty); + if (!empty) + do_check_tree(m, t); +} + +/* Check all the chunks in a smallbin. */ +static void do_check_smallbin(mstate m, bindex_t i) { + sbinptr b = smallbin_at(m, i); + mchunkptr p = b->bk; + unsigned int empty = (m->smallmap & (1U << i)) == 0; + if (p == b) + dl_assert(empty); + if (!empty) { + for (; p != b; p = p->bk) { + size_t size = chunksize(p); + mchunkptr q; + /* each chunk claims to be free */ + do_check_free_chunk(m, p); + /* chunk belongs in bin */ + dl_assert(small_index(size) == i); + dl_assert(p->bk == b || chunksize(p->bk) == chunksize(p)); + /* chunk is followed by an inuse chunk */ + q = next_chunk(p); + if (q->head != FENCEPOST_HEAD) + do_check_inuse_chunk(m, q); + } + } +} + +/* Find x in a bin. Used in other check functions. 
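+   It returns nonzero iff x is currently linked into the smallbin list or
+   treebin chain matching its size, which lets the traversal checks below
+   assert that free chunks are binned and in-use chunks are not.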
*/ +static int bin_find(mstate m, mchunkptr x) { + size_t size = chunksize(x); + if (is_small(size)) { + bindex_t sidx = small_index(size); + sbinptr b = smallbin_at(m, sidx); + if (smallmap_is_marked(m, sidx)) { + mchunkptr p = b; + do { + if (p == x) + return 1; + } while ((p = p->fd) != b); + } + } + else { + bindex_t tidx; + compute_tree_index(size, tidx); + if (treemap_is_marked(m, tidx)) { + tchunkptr t = *treebin_at(m, tidx); + size_t sizebits = size << leftshift_for_tree_index(tidx); + while (t != 0 && chunksize(t) != size) { + t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]; + sizebits <<= 1; + } + if (t != 0) { + tchunkptr u = t; + do { + if (u == (tchunkptr)x) + return 1; + } while ((u = u->fd) != t); + } + } + } + return 0; +} + +/* Traverse each chunk and check it; return total */ +static size_t traverse_and_check(mstate m) { + size_t sum = 0; + if (is_initialized(m)) { + msegmentptr s = &m->seg; + sum += m->topsize + TOP_FOOT_SIZE; + while (s != 0) { + mchunkptr q = align_as_chunk(s->base); + mchunkptr lastq = 0; + dl_assert(pinuse(q)); + while (segment_holds(s, q) && + q != m->top && q->head != FENCEPOST_HEAD) { + sum += chunksize(q); + if (cinuse(q)) { + dl_assert(!bin_find(m, q)); + do_check_inuse_chunk(m, q); + } + else { + dl_assert(q == m->dv || bin_find(m, q)); + dl_assert(lastq == 0 || cinuse(lastq)); /* Not 2 consecutive free */ + do_check_free_chunk(m, q); + } + lastq = q; + q = next_chunk(q); + } + s = s->next; + } + } + return sum; +} + +/* Check all properties of malloc_state. */ +static void do_check_malloc_state(mstate m) { + bindex_t i; + size_t total; + /* check bins */ + for (i = 0; i < NSMALLBINS; ++i) + do_check_smallbin(m, i); + for (i = 0; i < NTREEBINS; ++i) + do_check_treebin(m, i); + + if (m->dvsize != 0) { /* check dv chunk */ + do_check_any_chunk(m, m->dv); + dl_assert(m->dvsize == chunksize(m->dv)); + dl_assert(m->dvsize >= MIN_CHUNK_SIZE); + dl_assert(bin_find(m, m->dv) == 0); + } + + if (m->top != 0) { /* check top chunk */ + do_check_top_chunk(m, m->top); + /*dl_assert(m->topsize == chunksize(m->top)); redundant */ + dl_assert(m->topsize > 0); + dl_assert(bin_find(m, m->top) == 0); + } + + total = traverse_and_check(m); + dl_assert(total <= m->footprint); + dl_assert(m->footprint <= m->max_footprint); +} +#endif /* DL_DEBUG */ + +/* ----------------------------- statistics ------------------------------ */ + +#if !NO_MALLINFO +static struct mallinfo internal_mallinfo(mstate m) { + struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + if (!PREACTION(m)) { + check_malloc_state(m); + if (is_initialized(m)) { + size_t nfree = SIZE_T_ONE; /* top always free */ + size_t mfree = m->topsize + TOP_FOOT_SIZE; + size_t sum = mfree; + msegmentptr s = &m->seg; + while (s != 0) { + mchunkptr q = align_as_chunk(s->base); + while (segment_holds(s, q) && + q != m->top && q->head != FENCEPOST_HEAD) { + size_t sz = chunksize(q); + sum += sz; + if (!cinuse(q)) { + mfree += sz; + ++nfree; + } + q = next_chunk(q); + } + s = s->next; + } + + nm.arena = sum; + nm.ordblks = nfree; + nm.hblkhd = m->footprint - sum; + nm.usmblks = m->max_footprint; + nm.uordblks = m->footprint - mfree; + nm.fordblks = mfree; + nm.keepcost = m->topsize; + } + + POSTACTION(m); + } + return nm; +} +#endif /* !NO_MALLINFO */ + +static void internal_malloc_stats(mstate m) { + if (!PREACTION(m)) { + size_t maxfp = 0; + size_t fp = 0; + size_t used = 0; + check_malloc_state(m); + if (is_initialized(m)) { + msegmentptr s = &m->seg; + maxfp = m->max_footprint; + fp = m->footprint; + 
used = fp - (m->topsize + TOP_FOOT_SIZE); + + while (s != 0) { + mchunkptr q = align_as_chunk(s->base); + while (segment_holds(s, q) && + q != m->top && q->head != FENCEPOST_HEAD) { + if (!cinuse(q)) + used -= chunksize(q); + q = next_chunk(q); + } + s = s->next; + } + } + + fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(maxfp)); + fprintf(stderr, "system bytes = %10lu\n", (unsigned long)(fp)); + fprintf(stderr, "in use bytes = %10lu\n", (unsigned long)(used)); + + POSTACTION(m); + } +} + +/* ----------------------- Operations on smallbins ----------------------- */ + +/* + Various forms of linking and unlinking are defined as macros. Even + the ones for trees, which are very long but have very short typical + paths. This is ugly but reduces reliance on inlining support of + compilers. +*/ + +/* Link a free chunk into a smallbin */ +#define insert_small_chunk(M, P, S) {\ + bindex_t IDX = small_index(S);\ + mchunkptr B = smallbin_at(M, IDX);\ + mchunkptr F = B;\ + dl_assert(S >= MIN_CHUNK_SIZE);\ + if (!smallmap_is_marked(M, IDX))\ + mark_smallmap(M, IDX);\ + else if (RTCHECK(ok_address(M, B->fd)))\ + F = B->fd;\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + B->fd = P;\ + F->bk = P;\ + P->fd = F;\ + P->bk = B;\ +} + +/* Unlink a chunk from a smallbin */ +#define unlink_small_chunk(M, P, S) {\ + mchunkptr F = P->fd;\ + mchunkptr B = P->bk;\ + bindex_t IDX = small_index(S);\ + dl_assert(P != B);\ + dl_assert(P != F);\ + dl_assert(chunksize(P) == small_index2size(IDX));\ + if (F == B)\ + clear_smallmap(M, IDX);\ + else if (RTCHECK((F == smallbin_at(M,IDX) || ok_address(M, F)) &&\ + (B == smallbin_at(M,IDX) || ok_address(M, B)))) {\ + F->bk = B;\ + B->fd = F;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ +} + +/* Unlink the first chunk from a smallbin */ +#define unlink_first_small_chunk(M, B, P, IDX) {\ + mchunkptr F = P->fd;\ + dl_assert(P != B);\ + dl_assert(P != F);\ + dl_assert(chunksize(P) == small_index2size(IDX));\ + if (B == F)\ + clear_smallmap(M, IDX);\ + else if (RTCHECK(ok_address(M, F))) {\ + B->fd = F;\ + F->bk = B;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ +} + +/* Replace dv node, binning the old one */ +/* Used only when dvsize known to be small */ +#define replace_dv(M, P, S) {\ + size_t DVS = M->dvsize;\ + if (DVS != 0) {\ + mchunkptr DV = M->dv;\ + dl_assert(is_small(DVS));\ + insert_small_chunk(M, DV, DVS);\ + }\ + M->dvsize = S;\ + M->dv = P;\ +} + +/* ------------------------- Operations on trees ------------------------- */ + +/* Insert chunk into tree */ +#define insert_large_chunk(M, X, S) {\ + tbinptr* H;\ + bindex_t IDX;\ + compute_tree_index(S, IDX);\ + H = treebin_at(M, IDX);\ + X->index = IDX;\ + X->child[0] = X->child[1] = 0;\ + if (!treemap_is_marked(M, IDX)) {\ + mark_treemap(M, IDX);\ + *H = X;\ + X->parent = (tchunkptr)H;\ + X->fd = X->bk = X;\ + }\ + else {\ + tchunkptr T = *H;\ + size_t K = S << leftshift_for_tree_index(IDX);\ + for (;;) {\ + if (chunksize(T) != S) {\ + tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\ + K <<= 1;\ + if (*C != 0)\ + T = *C;\ + else if (RTCHECK(ok_address(M, C))) {\ + *C = X;\ + X->parent = T;\ + X->fd = X->bk = X;\ + break;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + break;\ + }\ + }\ + else {\ + tchunkptr F = T->fd;\ + if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\ + T->fd = F->bk = X;\ + X->fd = F;\ + X->bk = T;\ + X->parent = 0;\ + break;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + break;\ + }\ + }\ + }\ + }\ +} + +/* + Unlink steps: + + 1. 
If x is a chained node, unlink it from its same-sized fd/bk links + and choose its bk node as its replacement. + 2. If x was the last node of its size, but not a leaf node, it must + be replaced with a leaf node (not merely one with an open left or + right), to make sure that lefts and rights of descendents + correspond properly to bit masks. We use the rightmost descendent + of x. We could use any other leaf, but this is easy to locate and + tends to counteract removal of leftmosts elsewhere, and so keeps + paths shorter than minimally guaranteed. This doesn't loop much + because on average a node in a tree is near the bottom. + 3. If x is the base of a chain (i.e., has parent links) relink + x's parent and children to x's replacement (or null if none). +*/ + +#define unlink_large_chunk(M, X) {\ + tchunkptr XP = X->parent;\ + tchunkptr R;\ + if (X->bk != X) {\ + tchunkptr F = X->fd;\ + R = X->bk;\ + if (RTCHECK(ok_address(M, F))) {\ + F->bk = R;\ + R->fd = F;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + }\ + else {\ + tchunkptr* RP;\ + if (((R = *(RP = &(X->child[1]))) != 0) ||\ + ((R = *(RP = &(X->child[0]))) != 0)) {\ + tchunkptr* CP;\ + while ((*(CP = &(R->child[1])) != 0) ||\ + (*(CP = &(R->child[0])) != 0)) {\ + R = *(RP = CP);\ + }\ + if (RTCHECK(ok_address(M, RP)))\ + *RP = 0;\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + }\ + }\ + if (XP != 0) {\ + tbinptr* H = treebin_at(M, X->index);\ + if (X == *H) {\ + if ((*H = R) == 0) \ + clear_treemap(M, X->index);\ + }\ + else if (RTCHECK(ok_address(M, XP))) {\ + if (XP->child[0] == X) \ + XP->child[0] = R;\ + else \ + XP->child[1] = R;\ + }\ + else\ + CORRUPTION_ERROR_ACTION(M);\ + if (R != 0) {\ + if (RTCHECK(ok_address(M, R))) {\ + tchunkptr C0, C1;\ + R->parent = XP;\ + if ((C0 = X->child[0]) != 0) {\ + if (RTCHECK(ok_address(M, C0))) {\ + R->child[0] = C0;\ + C0->parent = R;\ + }\ + else\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + if ((C1 = X->child[1]) != 0) {\ + if (RTCHECK(ok_address(M, C1))) {\ + R->child[1] = C1;\ + C1->parent = R;\ + }\ + else\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + }\ + else\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + }\ +} + +/* Relays to large vs small bin operations */ + +#define insert_chunk(M, P, S)\ + if (is_small(S)) insert_small_chunk(M, P, S)\ + else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); } + +#define unlink_chunk(M, P, S)\ + if (is_small(S)) unlink_small_chunk(M, P, S)\ + else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); } + + +/* Relays to internal calls to malloc/free from realloc, memalign etc */ + +#if ONLY_MSPACES +#define internal_malloc(m, b) mspace_malloc(m, b) +#define internal_free(m, mem) mspace_free(m,mem); +#else /* ONLY_MSPACES */ +#if MSPACES +#define internal_malloc(m, b)\ + (m == gm)? dlmalloc(b) : mspace_malloc(m, b) +#define internal_free(m, mem)\ + if (m == gm) dlfree(mem); else mspace_free(m,mem); +#else /* MSPACES */ +#define internal_malloc(m, b) dlmalloc(b) +#define internal_free(m, mem) dlfree(mem) +#endif /* MSPACES */ +#endif /* ONLY_MSPACES */ + +/* ----------------------- Direct-mmapping chunks ----------------------- */ + +/* + Directly mmapped chunks are set up with an offset to the start of + the mmapped region stored in the prev_foot field of the chunk. This + allows reconstruction of the required argument to MUNMAP when freed, + and also allows adjustment of the returned chunk to meet alignment + requirements (especially in memalign). 
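+  When such a chunk is freed, the mapping is reconstructed from the stored
+  offset; roughly (mirroring the mmapped-chunk branch of the free path):
+
+      size_t off  = p->prev_foot & ~IS_MMAPPED_BIT;  // distance back to mmap base
+      char*  base = (char*)p - off;                  // original mapping start
+      size_t len  = chunksize(p) + off + MMAP_FOOT_PAD;
+      // CALL_MUNMAP(base, len) then releases the whole region
+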
There is also enough space + allocated to hold a fake next chunk of size SIZE_T_SIZE to maintain + the PINUSE bit so frees can be checked. +*/ + +/* Malloc using mmap */ +static void* mmap_alloc(mstate m, size_t nb) { + size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK); + if (mmsize > nb) { /* Check for wrap around 0 */ + char* mm = (char*)(DIRECT_MMAP(mmsize)); + if (mm != CMFAIL) { + size_t offset = align_offset(chunk2mem(mm)); + size_t psize = mmsize - offset - MMAP_FOOT_PAD; + mchunkptr p = (mchunkptr)(mm + offset); + p->prev_foot = offset | IS_MMAPPED_BIT; + (p)->head = (psize|CINUSE_BIT); + mark_inuse_foot(m, p, psize); + chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD; + chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0; + + if (mm < m->least_addr) + m->least_addr = mm; + if ((m->footprint += mmsize) > m->max_footprint) + m->max_footprint = m->footprint; + dl_assert(is_aligned(chunk2mem(p))); + check_mmapped_chunk(m, p); + return chunk2mem(p); + } + } + return 0; +} + +/* Realloc using mmap */ +static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb) { + size_t oldsize = chunksize(oldp); + if (is_small(nb)) /* Can't shrink mmap regions below small size */ + return 0; + /* Keep old chunk if big enough but not too big */ + if (oldsize >= nb + SIZE_T_SIZE && + (oldsize - nb) <= (mparams.granularity << 1)) + return oldp; + else { + size_t offset = oldp->prev_foot & ~IS_MMAPPED_BIT; + size_t oldmmsize = oldsize + offset + MMAP_FOOT_PAD; + size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK); + char* cp = (char*)CALL_MREMAP((char*)oldp - offset, + oldmmsize, newmmsize, 1); + if (cp != CMFAIL) { + mchunkptr newp = (mchunkptr)(cp + offset); + size_t psize = newmmsize - offset - MMAP_FOOT_PAD; + newp->head = (psize|CINUSE_BIT); + mark_inuse_foot(m, newp, psize); + chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD; + chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0; + + if (cp < m->least_addr) + m->least_addr = cp; + if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint) + m->max_footprint = m->footprint; + check_mmapped_chunk(m, newp); + return newp; + } + } + return 0; +} + +/* -------------------------- mspace management -------------------------- */ + +/* Initialize top chunk and its size */ +static void init_top(mstate m, mchunkptr p, size_t psize) { + /* Ensure alignment */ + size_t offset = align_offset(chunk2mem(p)); + p = (mchunkptr)((char*)p + offset); + psize -= offset; + + m->top = p; + m->topsize = psize; + p->head = psize | PINUSE_BIT; + /* set size of fake trailing chunk holding overhead space only once */ + chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE; + m->trim_check = mparams.trim_threshold; /* reset on each update */ +} + +/* Initialize bins for a new mstate that is otherwise zeroed out */ +static void init_bins(mstate m) { + /* Establish circular links for smallbins */ + bindex_t i; + for (i = 0; i < NSMALLBINS; ++i) { + sbinptr bin = smallbin_at(m,i); + bin->fd = bin->bk = bin; + } +} + +#if PROCEED_ON_ERROR + +/* default corruption action */ +static void reset_on_error(mstate m) { + int i; + ++malloc_corruption_error_count; + /* Reinitialize fields to forget about all memory */ + m->smallbins = m->treebins = 0; + m->dvsize = m->topsize = 0; + m->seg.base = 0; + m->seg.size = 0; + m->seg.next = 0; + m->top = m->dv = 0; + for (i = 0; i < NTREEBINS; ++i) + *treebin_at(m, i) = 0; + init_bins(m); +} +#endif /* PROCEED_ON_ERROR */ + +/* Allocate chunk and prepend remainder with chunk in successor base. 
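+   The new segment sits directly below the old one, so the first nb bytes
+   at newbase become the returned chunk, and the leftover gap up to the old
+   base's first chunk becomes a free remainder that is merged into top, dv,
+   or an ordinary bin like any other freed block.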
*/ +static void* prepend_alloc(mstate m, char* newbase, char* oldbase, + size_t nb) { + mchunkptr p = align_as_chunk(newbase); + mchunkptr oldfirst = align_as_chunk(oldbase); + size_t psize = (char*)oldfirst - (char*)p; + mchunkptr q = chunk_plus_offset(p, nb); + size_t qsize = psize - nb; + set_size_and_pinuse_of_inuse_chunk(m, p, nb); + + dl_assert((char*)oldfirst > (char*)q); + dl_assert(pinuse(oldfirst)); + dl_assert(qsize >= MIN_CHUNK_SIZE); + + /* consolidate remainder with first chunk of old base */ + if (oldfirst == m->top) { + size_t tsize = m->topsize += qsize; + m->top = q; + q->head = tsize | PINUSE_BIT; + check_top_chunk(m, q); + } + else if (oldfirst == m->dv) { + size_t dsize = m->dvsize += qsize; + m->dv = q; + set_size_and_pinuse_of_free_chunk(q, dsize); + } + else { + if (!cinuse(oldfirst)) { + size_t nsize = chunksize(oldfirst); + unlink_chunk(m, oldfirst, nsize); + oldfirst = chunk_plus_offset(oldfirst, nsize); + qsize += nsize; + } + set_free_with_pinuse(q, qsize, oldfirst); + insert_chunk(m, q, qsize); + check_free_chunk(m, q); + } + + check_malloced_chunk(m, chunk2mem(p), nb); + return chunk2mem(p); +} + +/* Add a segment to hold a new noncontiguous region */ +static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) { + /* Determine locations and sizes of segment, fenceposts, old top */ + char* old_top = (char*)m->top; + msegmentptr oldsp = segment_holding(m, old_top); + char* old_end = oldsp->base + oldsp->size; + size_t ssize = pad_request(sizeof(struct malloc_segment)); + char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK); + size_t offset = align_offset(chunk2mem(rawsp)); + char* asp = rawsp + offset; + char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp; + mchunkptr sp = (mchunkptr)csp; + msegmentptr ss = (msegmentptr)(chunk2mem(sp)); + mchunkptr tnext = chunk_plus_offset(sp, ssize); + mchunkptr p = tnext; + int nfences = 0; + + /* reset top to new space */ + init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE); + + /* Set up segment record */ + dl_assert(is_aligned(ss)); + set_size_and_pinuse_of_inuse_chunk(m, sp, ssize); + *ss = m->seg; /* Push current record */ + m->seg.base = tbase; + m->seg.size = tsize; + m->seg.sflags = mmapped; + m->seg.next = ss; + + /* Insert trailing fenceposts */ + for (;;) { + mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE); + p->head = FENCEPOST_HEAD; + ++nfences; + if ((char*)(&(nextp->head)) < old_end) + p = nextp; + else + break; + } + dl_assert(nfences >= 2); + + /* Insert the rest of old top into a bin as an ordinary free chunk */ + if (csp != old_top) { + mchunkptr q = (mchunkptr)old_top; + size_t psize = csp - old_top; + mchunkptr tn = chunk_plus_offset(q, psize); + set_free_with_pinuse(q, psize, tn); + insert_chunk(m, q, psize); + } + + check_top_chunk(m, m->top); +} + +/* -------------------------- System allocation -------------------------- */ + +/* Get memory from system using MORECORE or MMAP */ +static void* sys_alloc(mstate m, size_t nb) { + char* tbase = CMFAIL; + size_t tsize = 0; + flag_t mmap_flag = 0; + + init_mparams(); + + /* Directly map large chunks */ + if (use_mmap(m) && nb >= mparams.mmap_threshold) { + void* mem = mmap_alloc(m, nb); + if (mem != 0) + return mem; + } + + /* + Try getting memory in any of three ways (in most-preferred to + least-preferred order): + 1. A call to MORECORE that can normally contiguously extend memory. 
+ (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE or + or main space is mmapped or a previous contiguous call failed) + 2. A call to MMAP new space (disabled if not DL_HAVE_MMAP). + Note that under the default settings, if MORECORE is unable to + fulfill a request, and DL_HAVE_MMAP is true, then mmap is + used as a noncontiguous system allocator. This is a useful backup + strategy for systems with holes in address spaces -- in this case + sbrk cannot contiguously expand the heap, but mmap may be able to + find space. + 3. A call to MORECORE that cannot usually contiguously extend memory. + (disabled if not HAVE_MORECORE) + */ + + if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) { + char* br = CMFAIL; + msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top); + size_t asize = 0; + ACQUIRE_MORECORE_LOCK(); + + if (ss == 0) { /* First time through or recovery */ + char* base = (char*)CALL_MORECORE(0); + if (base != CMFAIL) { + asize = granularity_align(nb + TOP_FOOT_SIZE + SIZE_T_ONE); + /* Adjust to end on a page boundary */ + if (!is_page_aligned(base)) + asize += (page_align((size_t)base) - (size_t)base); + /* Can't call MORECORE if size is negative when treated as signed */ + if (asize < HALF_MAX_SIZE_T && + (br = (char*)(CALL_MORECORE(asize))) == base) { + tbase = base; + tsize = asize; + } + } + } + else { + /* Subtract out existing available top space from MORECORE request. */ + asize = granularity_align(nb - m->topsize + TOP_FOOT_SIZE + SIZE_T_ONE); + /* Use mem here only if it did continuously extend old space */ + if (asize < HALF_MAX_SIZE_T && + (br = (char*)(CALL_MORECORE(asize))) == ss->base+ss->size) { + tbase = br; + tsize = asize; + } + } + + if (tbase == CMFAIL) { /* Cope with partial failure */ + if (br != CMFAIL) { /* Try to use/extend the space we did get */ + if (asize < HALF_MAX_SIZE_T && + asize < nb + TOP_FOOT_SIZE + SIZE_T_ONE) { + size_t esize = granularity_align(nb + TOP_FOOT_SIZE + SIZE_T_ONE - asize); + if (esize < HALF_MAX_SIZE_T) { + char* end = (char*)CALL_MORECORE(esize); + if (end != CMFAIL) + asize += esize; + else { /* Can't use; try to release */ + (void) CALL_MORECORE(-asize); + br = CMFAIL; + } + } + } + } + if (br != CMFAIL) { /* Use the space we did get */ + tbase = br; + tsize = asize; + } + else + disable_contiguous(m); /* Don't try contiguous path in the future */ + } + + RELEASE_MORECORE_LOCK(); + } + + if (DL_HAVE_MMAP && tbase == CMFAIL) { /* Try MMAP */ + size_t req = nb + TOP_FOOT_SIZE + SIZE_T_ONE; + size_t rsize = granularity_align(req); + if (rsize > nb) { /* Fail if wraps around zero */ + char* mp = (char*)(CALL_MMAP(rsize)); + if (mp != CMFAIL) { + tbase = mp; + tsize = rsize; + mmap_flag = IS_MMAPPED_BIT; + } + } + } + + if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */ + size_t asize = granularity_align(nb + TOP_FOOT_SIZE + SIZE_T_ONE); + if (asize < HALF_MAX_SIZE_T) { + char* br = CMFAIL; + char* end = CMFAIL; + ACQUIRE_MORECORE_LOCK(); + br = (char*)(CALL_MORECORE(asize)); + end = (char*)(CALL_MORECORE(0)); + RELEASE_MORECORE_LOCK(); + if (br != CMFAIL && end != CMFAIL && br < end) { + size_t ssize = end - br; + if (ssize > nb + TOP_FOOT_SIZE) { + tbase = br; + tsize = ssize; + } + } + } + } + + if (tbase != CMFAIL) { + + if ((m->footprint += tsize) > m->max_footprint) + m->max_footprint = m->footprint; + + if (!is_initialized(m)) { /* first-time initialization */ + m->seg.base = m->least_addr = tbase; + m->seg.size = tsize; + m->seg.sflags = mmap_flag; + m->magic = mparams.magic; + 
m->release_checks = MAX_RELEASE_CHECK_RATE; + init_bins(m); +#if !ONLY_MSPACES + if (is_global(m)) + init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE); + else +#endif + { + /* Offset top by embedded malloc_state */ + mchunkptr mn = next_chunk(mem2chunk(m)); + init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE); + } + } + + else { + /* Try to merge with an existing segment */ + msegmentptr sp = &m->seg; + /* Only consider most recent segment if traversal suppressed */ + while (sp != 0 && tbase != sp->base + sp->size) + sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next; + if (sp != 0 && + !is_extern_segment(sp) && + (sp->sflags & IS_MMAPPED_BIT) == mmap_flag && + segment_holds(sp, m->top)) { /* append */ + sp->size += tsize; + init_top(m, m->top, m->topsize + tsize); + } + else { + if (tbase < m->least_addr) + m->least_addr = tbase; + sp = &m->seg; + while (sp != 0 && sp->base != tbase + tsize) + sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next; + if (sp != 0 && + !is_extern_segment(sp) && + (sp->sflags & IS_MMAPPED_BIT) == mmap_flag) { + char* oldbase = sp->base; + sp->base = tbase; + sp->size += tsize; + return prepend_alloc(m, tbase, oldbase, nb); + } + else + add_segment(m, tbase, tsize, mmap_flag); + } + } + + if (nb < m->topsize) { /* Allocate from new or extended top space */ + size_t rsize = m->topsize -= nb; + mchunkptr p = m->top; + mchunkptr r = m->top = chunk_plus_offset(p, nb); + r->head = rsize | PINUSE_BIT; + set_size_and_pinuse_of_inuse_chunk(m, p, nb); + check_top_chunk(m, m->top); + check_malloced_chunk(m, chunk2mem(p), nb); + return chunk2mem(p); + } + } + + MALLOC_FAILURE_ACTION; + return 0; +} + +/* ----------------------- system deallocation -------------------------- */ + +/* Unmap and unlink any mmapped segments that don't contain used chunks */ +static size_t release_unused_segments(mstate m) { + size_t released = 0; + int nsegs = 0; + msegmentptr pred = &m->seg; + msegmentptr sp = pred->next; + while (sp != 0) { + char* base = sp->base; + size_t size = sp->size; + msegmentptr next = sp->next; + ++nsegs; + if (is_mmapped_segment(sp) && !is_extern_segment(sp)) { + mchunkptr p = align_as_chunk(base); + size_t psize = chunksize(p); + /* Can unmap if first chunk holds entire segment and not pinned */ + if (!cinuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) { + tchunkptr tp = (tchunkptr)p; + dl_assert(segment_holds(sp, (char*)sp)); + if (p == m->dv) { + m->dv = 0; + m->dvsize = 0; + } + else { + unlink_large_chunk(m, tp); + } + if (CALL_MUNMAP(base, size) == 0) { + released += size; + m->footprint -= size; + /* unlink obsoleted record */ + sp = pred; + sp->next = next; + } + else { /* back out if cannot unmap */ + insert_large_chunk(m, tp, psize); + } + } + } + if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */ + break; + pred = sp; + sp = next; + } + /* Reset check counter */ + m->release_checks = ((nsegs > (int)MAX_RELEASE_CHECK_RATE)? 
+ (size_t)nsegs : MAX_RELEASE_CHECK_RATE); + return released; +} + +static int sys_trim(mstate m, size_t pad) { + size_t released = 0; + if (pad < MAX_REQUEST && is_initialized(m)) { + pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */ + + if (m->topsize > pad) { + /* Shrink top space in granularity-size units, keeping at least one */ + size_t unit = mparams.granularity; + size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit - + SIZE_T_ONE) * unit; + msegmentptr sp = segment_holding(m, (char*)m->top); + + if (!is_extern_segment(sp)) { + if (is_mmapped_segment(sp)) { + if (DL_HAVE_MMAP && + sp->size >= extra && + !has_segment_link(m, sp)) { /* can't shrink if pinned */ + size_t newsize = sp->size - extra; + /* Prefer mremap, fall back to munmap */ + if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) || + (CALL_MUNMAP(sp->base + newsize, extra) == 0)) { + released = extra; + } + } + } + else if (HAVE_MORECORE) { + if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */ + extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit; + ACQUIRE_MORECORE_LOCK(); + { + /* Make sure end of memory is where we last set it. */ + char* old_br = (char*)(CALL_MORECORE(0)); + if (old_br == sp->base + sp->size) { + char* rel_br = (char*)(CALL_MORECORE(-extra)); + char* new_br = (char*)(CALL_MORECORE(0)); + if (rel_br != CMFAIL && new_br < old_br) + released = old_br - new_br; + } + } + RELEASE_MORECORE_LOCK(); + } + } + + if (released != 0) { + sp->size -= released; + m->footprint -= released; + init_top(m, m->top, m->topsize - released); + check_top_chunk(m, m->top); + } + } + + /* Unmap any unused mmapped segments */ + if (DL_HAVE_MMAP) + released += release_unused_segments(m); + + /* On failure, disable autotrim to avoid repeated failed future calls */ + if (released == 0 && m->topsize > m->trim_check) + m->trim_check = MAX_SIZE_T; + } + + return (released != 0)? 
1 : 0; +} + +/* ---------------------------- malloc support --------------------------- */ + +/* allocate a large request from the best fitting chunk in a treebin */ +static void* tmalloc_large(mstate m, size_t nb) { + tchunkptr v = 0; + size_t rsize = -nb; /* Unsigned negation */ + tchunkptr t; + bindex_t idx; + compute_tree_index(nb, idx); + + if ((t = *treebin_at(m, idx)) != 0) { + /* Traverse tree for this bin looking for node with size == nb */ + size_t sizebits = nb << leftshift_for_tree_index(idx); + tchunkptr rst = 0; /* The deepest untaken right subtree */ + for (;;) { + tchunkptr rt; + size_t trem = chunksize(t) - nb; + if (trem < rsize) { + v = t; + if ((rsize = trem) == 0) + break; + } + rt = t->child[1]; + t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]; + if (rt != 0 && rt != t) + rst = rt; + if (t == 0) { + t = rst; /* set t to least subtree holding sizes > nb */ + break; + } + sizebits <<= 1; + } + } + + if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */ + binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap; + if (leftbits != 0) { + bindex_t i; + binmap_t leastbit = least_bit(leftbits); + compute_bit2idx(leastbit, i); + t = *treebin_at(m, i); + } + } + + while (t != 0) { /* find smallest of tree or subtree */ + size_t trem = chunksize(t) - nb; + if (trem < rsize) { + rsize = trem; + v = t; + } + t = leftmost_child(t); + } + + /* If dv is a better fit, return 0 so malloc will use it */ + if (v != 0 && rsize < (size_t)(m->dvsize - nb)) { + if (RTCHECK(ok_address(m, v))) { /* split */ + mchunkptr r = chunk_plus_offset(v, nb); + dl_assert(chunksize(v) == rsize + nb); + if (RTCHECK(ok_next(v, r))) { + unlink_large_chunk(m, v); + if (rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(m, v, (rsize + nb)); + else { + set_size_and_pinuse_of_inuse_chunk(m, v, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + insert_chunk(m, r, rsize); + } + return chunk2mem(v); + } + } + CORRUPTION_ERROR_ACTION(m); + } + return 0; +} + +/* allocate a small request from the best fitting chunk in a treebin */ +static void* tmalloc_small(mstate m, size_t nb) { + tchunkptr t, v; + size_t rsize; + bindex_t i; + binmap_t leastbit = least_bit(m->treemap); + compute_bit2idx(leastbit, i); + + v = t = *treebin_at(m, i); + rsize = chunksize(t) - nb; + + while ((t = leftmost_child(t)) != 0) { + size_t trem = chunksize(t) - nb; + if (trem < rsize) { + rsize = trem; + v = t; + } + } + + if (RTCHECK(ok_address(m, v))) { + mchunkptr r = chunk_plus_offset(v, nb); + dl_assert(chunksize(v) == rsize + nb); + if (RTCHECK(ok_next(v, r))) { + unlink_large_chunk(m, v); + if (rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(m, v, (rsize + nb)); + else { + set_size_and_pinuse_of_inuse_chunk(m, v, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(m, r, rsize); + } + return chunk2mem(v); + } + } + + CORRUPTION_ERROR_ACTION(m); + return 0; +} + +/* --------------------------- realloc support --------------------------- */ + +static void* internal_realloc(mstate m, void* oldmem, size_t bytes) { + if (bytes >= MAX_REQUEST) { + MALLOC_FAILURE_ACTION; + return 0; + } + if (!PREACTION(m)) { + mchunkptr oldp = mem2chunk(oldmem); + size_t oldsize = chunksize(oldp); + mchunkptr next = chunk_plus_offset(oldp, oldsize); + mchunkptr newp = 0; + void* extra = 0; + + /* Try to either shrink or extend into top. 
Else malloc-copy-free */ + + if (RTCHECK(ok_address(m, oldp) && ok_cinuse(oldp) && + ok_next(oldp, next) && ok_pinuse(next))) { + size_t nb = request2size(bytes); + if (is_mmapped(oldp)) + newp = mmap_resize(m, oldp, nb); + else if (oldsize >= nb) { /* already big enough */ + size_t rsize = oldsize - nb; + newp = oldp; + if (rsize >= MIN_CHUNK_SIZE) { + mchunkptr remainder = chunk_plus_offset(newp, nb); + set_inuse(m, newp, nb); + set_inuse(m, remainder, rsize); + extra = chunk2mem(remainder); + } + } + else if (next == m->top && oldsize + m->topsize > nb) { + /* Expand into top */ + size_t newsize = oldsize + m->topsize; + size_t newtopsize = newsize - nb; + mchunkptr newtop = chunk_plus_offset(oldp, nb); + set_inuse(m, oldp, nb); + newtop->head = newtopsize |PINUSE_BIT; + m->top = newtop; + m->topsize = newtopsize; + newp = oldp; + } + } + else { + USAGE_ERROR_ACTION(m, oldmem); + POSTACTION(m); + return 0; + } + + POSTACTION(m); + + if (newp != 0) { + if (extra != 0) { + internal_free(m, extra); + } + check_inuse_chunk(m, newp); + return chunk2mem(newp); + } + else { + void* newmem = internal_malloc(m, bytes); + if (newmem != 0) { + size_t oc = oldsize - overhead_for(oldp); + memcpy(newmem, oldmem, (oc < bytes)? oc : bytes); + internal_free(m, oldmem); + } + return newmem; + } + } + return 0; +} + +/* --------------------------- memalign support -------------------------- */ + +static void* internal_memalign(mstate m, size_t alignment, size_t bytes) { + if (alignment <= MALLOC_ALIGNMENT) /* Can just use malloc */ + return internal_malloc(m, bytes); + if (alignment < MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */ + alignment = MIN_CHUNK_SIZE; + if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */ + size_t a = MALLOC_ALIGNMENT << 1; + while (a < alignment) a <<= 1; + alignment = a; + } + + if (bytes >= MAX_REQUEST - alignment) { + if (m != 0) { /* Test isn't needed but avoids compiler warning */ + MALLOC_FAILURE_ACTION; + } + } + else { + size_t nb = request2size(bytes); + size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD; + char* mem = (char*)internal_malloc(m, req); + if (mem != 0) { + void* leader = 0; + void* trailer = 0; + mchunkptr p = mem2chunk(mem); + + if (PREACTION(m)) return 0; + if ((((size_t)(mem)) % alignment) != 0) { /* misaligned */ + /* + Find an aligned spot inside chunk. Since we need to give + back leading space in a chunk of at least MIN_CHUNK_SIZE, if + the first calculation places us at a spot with less than + MIN_CHUNK_SIZE leader, we can move to the next aligned spot. + We've allocated enough total room so that this is always + possible. + */ + char* br = (char*)mem2chunk((size_t)(((size_t)(mem + + alignment - + SIZE_T_ONE)) & + -alignment)); + char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)? 
+ br : br+alignment; + mchunkptr newp = (mchunkptr)pos; + size_t leadsize = pos - (char*)(p); + size_t newsize = chunksize(p) - leadsize; + + if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */ + newp->prev_foot = p->prev_foot + leadsize; + newp->head = (newsize|CINUSE_BIT); + } + else { /* Otherwise, give back leader, use the rest */ + set_inuse(m, newp, newsize); + set_inuse(m, p, leadsize); + leader = chunk2mem(p); + } + p = newp; + } + + /* Give back spare room at the end */ + if (!is_mmapped(p)) { + size_t size = chunksize(p); + if (size > nb + MIN_CHUNK_SIZE) { + size_t remainder_size = size - nb; + mchunkptr remainder = chunk_plus_offset(p, nb); + set_inuse(m, p, nb); + set_inuse(m, remainder, remainder_size); + trailer = chunk2mem(remainder); + } + } + + assert (chunksize(p) >= nb); + dl_assert((((size_t)(chunk2mem(p))) % alignment) == 0); + check_inuse_chunk(m, p); + POSTACTION(m); + if (leader != 0) { + internal_free(m, leader); + } + if (trailer != 0) { + internal_free(m, trailer); + } + return chunk2mem(p); + } + } + return 0; +} + +/* ------------------------ comalloc/coalloc support --------------------- */ + +static void** ialloc(mstate m, + size_t n_elements, + size_t* sizes, + int opts, + void* chunks[]) { + /* + This provides common support for independent_X routines, handling + all of the combinations that can result. + + The opts arg has: + bit 0 set if all elements are same size (using sizes[0]) + bit 1 set if elements should be zeroed + */ + + size_t element_size; /* chunksize of each element, if all same */ + size_t contents_size; /* total size of elements */ + size_t array_size; /* request size of pointer array */ + void* mem; /* malloced aggregate space */ + mchunkptr p; /* corresponding chunk */ + size_t remainder_size; /* remaining bytes while splitting */ + void** marray; /* either "chunks" or malloced ptr array */ + mchunkptr array_chunk; /* chunk for malloced ptr array */ + flag_t was_enabled; /* to disable mmap */ + size_t size; + size_t i; + + /* compute array length, if needed */ + if (chunks != 0) { + if (n_elements == 0) + return chunks; /* nothing to do */ + marray = chunks; + array_size = 0; + } + else { + /* if empty req, must still return chunk representing empty array */ + if (n_elements == 0) + return (void**)internal_malloc(m, 0); + marray = 0; + array_size = request2size(n_elements * (sizeof(void*))); + } + + /* compute total element size */ + if (opts & 0x1) { /* all-same-size */ + element_size = request2size(*sizes); + contents_size = n_elements * element_size; + } + else { /* add up all the sizes */ + element_size = 0; + contents_size = 0; + for (i = 0; i != n_elements; ++i) + contents_size += request2size(sizes[i]); + } + + size = contents_size + array_size; + + /* + Allocate the aggregate chunk. First disable direct-mmapping so + malloc won't use it, since we would not be able to later + free/realloc space internal to a segregated mmap region. 
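+    The point of keeping everything in one non-mmapped chunk is that each
+    carved-out piece can later be freed on its own. A typical use of the
+    comalloc entry point built on this helper, for illustration only:
+
+        size_t sizes[3] = { 16, 128, 32 };
+        void*  parts[3];
+        if (dlindependent_comalloc(3, sizes, parts) != 0) {
+          // each parts[i] holds at least sizes[i] bytes and may be
+          // passed to dlfree(parts[i]) independently of the others
+        }
+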
+ */ + was_enabled = use_mmap(m); + disable_mmap(m); + mem = internal_malloc(m, size - CHUNK_OVERHEAD); + if (was_enabled) + enable_mmap(m); + if (mem == 0) + return 0; + + if (PREACTION(m)) return 0; + p = mem2chunk(mem); + remainder_size = chunksize(p); + + dl_assert(!is_mmapped(p)); + + if (opts & 0x2) { /* optionally clear the elements */ + memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size); + } + + /* If not provided, allocate the pointer array as final part of chunk */ + if (marray == 0) { + size_t array_chunk_size; + array_chunk = chunk_plus_offset(p, contents_size); + array_chunk_size = remainder_size - contents_size; + marray = (void**) (chunk2mem(array_chunk)); + set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size); + remainder_size = contents_size; + } + + /* split out elements */ + for (i = 0; ; ++i) { + marray[i] = chunk2mem(p); + if (i != n_elements-1) { + if (element_size != 0) + size = element_size; + else + size = request2size(sizes[i]); + remainder_size -= size; + set_size_and_pinuse_of_inuse_chunk(m, p, size); + p = chunk_plus_offset(p, size); + } + else { /* the final element absorbs any overallocation slop */ + set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size); + break; + } + } + +#if DL_DEBUG + if (marray != chunks) { + /* final element must have exactly exhausted chunk */ + if (element_size != 0) { + dl_assert(remainder_size == element_size); + } + else { + dl_assert(remainder_size == request2size(sizes[i])); + } + check_inuse_chunk(m, mem2chunk(marray)); + } + for (i = 0; i != n_elements; ++i) + check_inuse_chunk(m, mem2chunk(marray[i])); + +#endif /* DL_DEBUG */ + + POSTACTION(m); + return marray; +} + + +/* -------------------------- public routines ---------------------------- */ + +#if !ONLY_MSPACES + +void* dlmalloc(size_t bytes) { + /* + Basic algorithm: + If a small request (< 256 bytes minus per-chunk overhead): + 1. If one exists, use a remainderless chunk in associated smallbin. + (Remainderless means that there are too few excess bytes to + represent as a chunk.) + 2. If it is big enough, use the dv chunk, which is normally the + chunk adjacent to the one used for the most recent small request. + 3. If one exists, split the smallest available chunk in a bin, + saving remainder in dv. + 4. If it is big enough, use the top chunk. + 5. If available, get memory from system and use it + Otherwise, for a large request: + 1. Find the smallest available binned chunk that fits, and use it + if it is better fitting than dv chunk, splitting if necessary. + 2. If better fitting than any binned chunk, use the dv chunk. + 3. If it is big enough, use the top chunk. + 4. If request size >= mmap threshold, try to directly mmap this chunk. + 5. If available, get memory from system and use it + + The ugly goto's here ensure that postaction occurs along all paths. + */ + + if (!PREACTION(gm)) { + void* mem; + size_t nb; + if (bytes <= MAX_SMALL_REQUEST) { + bindex_t idx; + binmap_t smallbits; + nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes); + idx = small_index(nb); + smallbits = gm->smallmap >> idx; + + if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. 
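+          Checking two map bits at once lets either the exact bin or the
+          next one up serve the request; the next bin's chunks are only one
+          granule larger, a surplus below MIN_CHUNK_SIZE that cannot be
+          split off.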
*/ + mchunkptr b, p; + idx += ~smallbits & 1; /* Uses next bin if idx empty */ + b = smallbin_at(gm, idx); + p = b->fd; + dl_assert(chunksize(p) == small_index2size(idx)); + unlink_first_small_chunk(gm, b, p, idx); + set_inuse_and_pinuse(gm, p, small_index2size(idx)); + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (nb > gm->dvsize) { + if (smallbits != 0) { /* Use chunk in next nonempty smallbin */ + mchunkptr b, p, r; + size_t rsize; + bindex_t i; + binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx)); + binmap_t leastbit = least_bit(leftbits); + compute_bit2idx(leastbit, i); + b = smallbin_at(gm, i); + p = b->fd; + dl_assert(chunksize(p) == small_index2size(i)); + unlink_first_small_chunk(gm, b, p, i); + rsize = small_index2size(i) - nb; + /* Fit here cannot be remainderless if 4byte sizes */ + if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(gm, p, small_index2size(i)); + else { + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + r = chunk_plus_offset(p, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(gm, r, rsize); + } + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) { + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + } + } + else if (bytes >= MAX_REQUEST) + nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */ + else { + nb = pad_request(bytes); + if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) { + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + } + + if (nb <= gm->dvsize) { + size_t rsize = gm->dvsize - nb; + mchunkptr p = gm->dv; + if (rsize >= MIN_CHUNK_SIZE) { /* split dv */ + mchunkptr r = gm->dv = chunk_plus_offset(p, nb); + gm->dvsize = rsize; + set_size_and_pinuse_of_free_chunk(r, rsize); + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + } + else { /* exhaust dv */ + size_t dvs = gm->dvsize; + gm->dvsize = 0; + gm->dv = 0; + set_inuse_and_pinuse(gm, p, dvs); + } + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (nb < gm->topsize) { /* Split top */ + size_t rsize = gm->topsize -= nb; + mchunkptr p = gm->top; + mchunkptr r = gm->top = chunk_plus_offset(p, nb); + r->head = rsize | PINUSE_BIT; + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + mem = chunk2mem(p); + check_top_chunk(gm, gm->top); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + mem = sys_alloc(gm, nb); + + postaction: + POSTACTION(gm); + return mem; + } + + return 0; +} + +void dlfree(void* mem) { + /* + Consolidate freed chunks with preceeding or succeeding bordering + free chunks, if they exist, and then place in a bin. Intermixed + with special cases for top, dv, mmapped chunks, and usage errors. 
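+  Backward consolidation is driven by the boundary tags: pinuse(p) says
+  whether the previous chunk is free, and if so p->prev_foot holds that
+  chunk's size, so the two can be merged without any search.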
+ */
+
+  if (mem != 0) {
+    mchunkptr p = mem2chunk(mem);
+#if FOOTERS
+    mstate fm = get_mstate_for(p);
+    if (!ok_magic(fm)) {
+      USAGE_ERROR_ACTION(fm, p);
+      return;
+    }
+#else /* FOOTERS */
+#define fm gm
+#endif /* FOOTERS */
+    if (!PREACTION(fm)) {
+      check_inuse_chunk(fm, p);
+      if (RTCHECK(ok_address(fm, p) && ok_cinuse(p))) {
+        size_t psize = chunksize(p);
+        mchunkptr next = chunk_plus_offset(p, psize);
+        if (!pinuse(p)) {
+          size_t prevsize = p->prev_foot;
+          if ((prevsize & IS_MMAPPED_BIT) != 0) {
+            prevsize &= ~IS_MMAPPED_BIT;
+            psize += prevsize + MMAP_FOOT_PAD;
+            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
+              fm->footprint -= psize;
+            goto postaction;
+          }
+          else {
+            mchunkptr prev = chunk_minus_offset(p, prevsize);
+            psize += prevsize;
+            p = prev;
+            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
+              if (p != fm->dv) {
+                unlink_chunk(fm, p, prevsize);
+              }
+              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
+                fm->dvsize = psize;
+                set_free_with_pinuse(p, psize, next);
+                goto postaction;
+              }
+            }
+            else
+              goto erroraction;
+          }
+        }
+
+        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
+          if (!cinuse(next)) {  /* consolidate forward */
+            if (next == fm->top) {
+              size_t tsize = fm->topsize += psize;
+              fm->top = p;
+              p->head = tsize | PINUSE_BIT;
+              if (p == fm->dv) {
+                fm->dv = 0;
+                fm->dvsize = 0;
+              }
+              if (should_trim(fm, tsize))
+                sys_trim(fm, 0);
+              goto postaction;
+            }
+            else if (next == fm->dv) {
+              size_t dsize = fm->dvsize += psize;
+              fm->dv = p;
+              set_size_and_pinuse_of_free_chunk(p, dsize);
+              goto postaction;
+            }
+            else {
+              size_t nsize = chunksize(next);
+              psize += nsize;
+              unlink_chunk(fm, next, nsize);
+              set_size_and_pinuse_of_free_chunk(p, psize);
+              if (p == fm->dv) {
+                fm->dvsize = psize;
+                goto postaction;
+              }
+            }
+          }
+          else
+            set_free_with_pinuse(p, psize, next);
+
+          if (is_small(psize)) {
+            insert_small_chunk(fm, p, psize);
+            check_free_chunk(fm, p);
+          }
+          else {
+            tchunkptr tp = (tchunkptr)p;
+            insert_large_chunk(fm, tp, psize);
+            check_free_chunk(fm, p);
+            if (--fm->release_checks == 0)
+              release_unused_segments(fm);
+          }
+          goto postaction;
+        }
+      }
+    erroraction:
+      USAGE_ERROR_ACTION(fm, p);
+    postaction:
+      POSTACTION(fm);
+    }
+  }
+#if !FOOTERS
+#undef fm
+#endif /* FOOTERS */
+}
+
+void* dlcalloc(size_t n_elements, size_t elem_size) {
+  void* mem;
+  size_t req = 0;
+  if (n_elements != 0) {
+    req = n_elements * elem_size;
+    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
+        (req / n_elements != elem_size))
+      req = MAX_SIZE_T; /* force downstream failure on overflow */
+  }
+  mem = dlmalloc(req);
+  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
+    memset(mem, 0, req);
+  return mem;
+}
+
+void* dlrealloc(void* oldmem, size_t bytes) {
+  if (oldmem == 0)
+    return dlmalloc(bytes);
+#ifdef REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0) {
+    dlfree(oldmem);
+    return 0;
+  }
+#endif /* REALLOC_ZERO_BYTES_FREES */
+  else {
+#if !FOOTERS
+    mstate m = gm;
+#else /* FOOTERS */
+    mstate m = get_mstate_for(mem2chunk(oldmem));
+    if (!ok_magic(m)) {
+      USAGE_ERROR_ACTION(m, oldmem);
+      return 0;
+    }
+#endif /* FOOTERS */
+    return internal_realloc(m, oldmem, bytes);
+  }
+}
+
+void* dlmemalign(size_t alignment, size_t bytes) {
+  return internal_memalign(gm, alignment, bytes);
+}
+
+void** dlindependent_calloc(size_t n_elements, size_t elem_size,
+                            void* chunks[]) {
+  size_t sz = elem_size; /* serves as 1-element array */
+  return ialloc(gm, n_elements, &sz, 3, chunks);
+}
+
+void** dlindependent_comalloc(size_t n_elements, size_t sizes[],
+                              void* chunks[]) {
+  return ialloc(gm, n_elements, sizes, 0, chunks);
+}
+
+void* dlvalloc(size_t bytes) {
+  size_t pagesz;
+  init_mparams();
+  pagesz = mparams.page_size;
+  return dlmemalign(pagesz, bytes);
+}
+
+void* dlpvalloc(size_t bytes) {
+  size_t pagesz;
+  init_mparams();
+  pagesz = mparams.page_size;
+  return dlmemalign(pagesz,
+                    (bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE));
+}
+
+int dlmalloc_trim(size_t pad) {
+  int result = 0;
+  if (!PREACTION(gm)) {
+    result = sys_trim(gm, pad);
+    POSTACTION(gm);
+  }
+  return result;
+}
+
+size_t dlmalloc_footprint(void) {
+  return gm->footprint;
+}
+
+size_t dlmalloc_max_footprint(void) {
+  return gm->max_footprint;
+}
+
+#if !NO_MALLINFO
+struct mallinfo dlmallinfo(void) {
+  return internal_mallinfo(gm);
+}
+#endif /* NO_MALLINFO */
+
+void dlmalloc_stats(void) {
+  internal_malloc_stats(gm);
+}
+
+size_t dlmalloc_usable_size(void* mem) {
+  if (mem != 0) {
+    mchunkptr p = mem2chunk(mem);
+    if (cinuse(p))
+      return chunksize(p) - overhead_for(p);
+  }
+  return 0;
+}
+
+int dlmallopt(int param_number, int value) {
+  return change_mparam(param_number, value);
+}
+
+#endif /* !ONLY_MSPACES */
+
+/* ----------------------------- user mspaces ---------------------------- */
+
+#if MSPACES
+
+static mstate init_user_mstate(char* tbase, size_t tsize) {
+  size_t msize = pad_request(sizeof(struct malloc_state));
+  mchunkptr mn;
+  mchunkptr msp = align_as_chunk(tbase);
+  mstate m = (mstate)(chunk2mem(msp));
+  memset(m, 0, msize);
+  INITIAL_LOCK(&m->mutex);
+  msp->head = (msize|PINUSE_BIT|CINUSE_BIT);
+  m->seg.base = m->least_addr = tbase;
+  m->seg.size = m->footprint = m->max_footprint = tsize;
+  m->magic = mparams.magic;
+  m->release_checks = MAX_RELEASE_CHECK_RATE;
+  m->mflags = mparams.default_mflags;
+  m->extp = 0;
+  m->exts = 0;
+  disable_contiguous(m);
+  init_bins(m);
+  mn = next_chunk(mem2chunk(m));
+  init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE);
+  check_top_chunk(m, m->top);
+  return m;
+}
+
+mspace create_mspace(size_t capacity, int locked) {
+  mstate m = 0;
+  size_t msize = pad_request(sizeof(struct malloc_state));
+  init_mparams(); /* Ensure pagesize etc initialized */
+
+  if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
+    size_t rs = ((capacity == 0)?
mparams.granularity : + (capacity + TOP_FOOT_SIZE + msize)); + size_t tsize = granularity_align(rs); + char* tbase = (char*)(CALL_MMAP(tsize)); + if (tbase != CMFAIL) { + m = init_user_mstate(tbase, tsize); + m->seg.sflags = IS_MMAPPED_BIT; + set_lock(m, locked); + } + } + return (mspace)m; +} + +mspace create_mspace_with_base(void* base, size_t capacity, int locked) { + mstate m = 0; + size_t msize = pad_request(sizeof(struct malloc_state)); + init_mparams(); /* Ensure pagesize etc initialized */ + + if (capacity > msize + TOP_FOOT_SIZE && + capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { + m = init_user_mstate((char*)base, capacity); + m->seg.sflags = EXTERN_BIT; + set_lock(m, locked); + } + return (mspace)m; +} + +size_t destroy_mspace(mspace msp) { + size_t freed = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + msegmentptr sp = &ms->seg; + while (sp != 0) { + char* base = sp->base; + size_t size = sp->size; + flag_t flag = sp->sflags; + sp = sp->next; + if ((flag & IS_MMAPPED_BIT) && !(flag & EXTERN_BIT) && + CALL_MUNMAP(base, size) == 0) + freed += size; + } + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return freed; +} + +/* + mspace versions of routines are near-clones of the global + versions. This is not so nice but better than the alternatives. +*/ + + +void* mspace_malloc(mspace msp, size_t bytes) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + if (!PREACTION(ms)) { + void* mem; + size_t nb; + if (bytes <= MAX_SMALL_REQUEST) { + bindex_t idx; + binmap_t smallbits; + nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes); + idx = small_index(nb); + smallbits = ms->smallmap >> idx; + + if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */ + mchunkptr b, p; + idx += ~smallbits & 1; /* Uses next bin if idx empty */ + b = smallbin_at(ms, idx); + p = b->fd; + dl_assert(chunksize(p) == small_index2size(idx)); + unlink_first_small_chunk(ms, b, p, idx); + set_inuse_and_pinuse(ms, p, small_index2size(idx)); + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (nb > ms->dvsize) { + if (smallbits != 0) { /* Use chunk in next nonempty smallbin */ + mchunkptr b, p, r; + size_t rsize; + bindex_t i; + binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx)); + binmap_t leastbit = least_bit(leftbits); + compute_bit2idx(leastbit, i); + b = smallbin_at(ms, i); + p = b->fd; + dl_assert(chunksize(p) == small_index2size(i)); + unlink_first_small_chunk(ms, b, p, i); + rsize = small_index2size(i) - nb; + /* Fit here cannot be remainderless if 4byte sizes */ + if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(ms, p, small_index2size(i)); + else { + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + r = chunk_plus_offset(p, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(ms, r, rsize); + } + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) { + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + } + } + else if (bytes >= MAX_REQUEST) + nb = MAX_SIZE_T; /* Too big to allocate. 
Force failure (in sys alloc) */ + else { + nb = pad_request(bytes); + if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) { + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + } + + if (nb <= ms->dvsize) { + size_t rsize = ms->dvsize - nb; + mchunkptr p = ms->dv; + if (rsize >= MIN_CHUNK_SIZE) { /* split dv */ + mchunkptr r = ms->dv = chunk_plus_offset(p, nb); + ms->dvsize = rsize; + set_size_and_pinuse_of_free_chunk(r, rsize); + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + } + else { /* exhaust dv */ + size_t dvs = ms->dvsize; + ms->dvsize = 0; + ms->dv = 0; + set_inuse_and_pinuse(ms, p, dvs); + } + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (nb < ms->topsize) { /* Split top */ + size_t rsize = ms->topsize -= nb; + mchunkptr p = ms->top; + mchunkptr r = ms->top = chunk_plus_offset(p, nb); + r->head = rsize | PINUSE_BIT; + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + mem = chunk2mem(p); + check_top_chunk(ms, ms->top); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + mem = sys_alloc(ms, nb); + + postaction: + POSTACTION(ms); + return mem; + } + + return 0; +} + +void mspace_free(mspace msp, void* mem) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); +#if FOOTERS + mstate fm = get_mstate_for(p); +#else /* FOOTERS */ + mstate fm = (mstate)msp; +#endif /* FOOTERS */ + if (!ok_magic(fm)) { + USAGE_ERROR_ACTION(fm, p); + return; + } + if (!PREACTION(fm)) { + check_inuse_chunk(fm, p); + if (RTCHECK(ok_address(fm, p) && ok_cinuse(p))) { + size_t psize = chunksize(p); + mchunkptr next = chunk_plus_offset(p, psize); + if (!pinuse(p)) { + size_t prevsize = p->prev_foot; + if ((prevsize & IS_MMAPPED_BIT) != 0) { + prevsize &= ~IS_MMAPPED_BIT; + psize += prevsize + MMAP_FOOT_PAD; + if (CALL_MUNMAP((char*)p - prevsize, psize) == 0) + fm->footprint -= psize; + goto postaction; + } + else { + mchunkptr prev = chunk_minus_offset(p, prevsize); + psize += prevsize; + p = prev; + if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */ + if (p != fm->dv) { + unlink_chunk(fm, p, prevsize); + } + else if ((next->head & INUSE_BITS) == INUSE_BITS) { + fm->dvsize = psize; + set_free_with_pinuse(p, psize, next); + goto postaction; + } + } + else + goto erroraction; + } + } + + if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) { + if (!cinuse(next)) { /* consolidate forward */ + if (next == fm->top) { + size_t tsize = fm->topsize += psize; + fm->top = p; + p->head = tsize | PINUSE_BIT; + if (p == fm->dv) { + fm->dv = 0; + fm->dvsize = 0; + } + if (should_trim(fm, tsize)) + sys_trim(fm, 0); + goto postaction; + } + else if (next == fm->dv) { + size_t dsize = fm->dvsize += psize; + fm->dv = p; + set_size_and_pinuse_of_free_chunk(p, dsize); + goto postaction; + } + else { + size_t nsize = chunksize(next); + psize += nsize; + unlink_chunk(fm, next, nsize); + set_size_and_pinuse_of_free_chunk(p, psize); + if (p == fm->dv) { + fm->dvsize = psize; + goto postaction; + } + } + } + else + set_free_with_pinuse(p, psize, next); + + if (is_small(psize)) { + insert_small_chunk(fm, p, psize); + check_free_chunk(fm, p); + } + else { + tchunkptr tp = (tchunkptr)p; + insert_large_chunk(fm, tp, psize); + check_free_chunk(fm, p); + if (--fm->release_checks == 0) + release_unused_segments(fm); + } + goto postaction; + } + } + erroraction: + USAGE_ERROR_ACTION(fm, p); + postaction: + POSTACTION(fm); + } + } +} + +void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) { + void* mem; + size_t req = 0; + mstate ms = 
(mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + if (n_elements != 0) { + req = n_elements * elem_size; + if (((n_elements | elem_size) & ~(size_t)0xffff) && + (req / n_elements != elem_size)) + req = MAX_SIZE_T; /* force downstream failure on overflow */ + } + mem = internal_malloc(ms, req); + if (mem != 0 && calloc_must_clear(mem2chunk(mem))) + memset(mem, 0, req); + return mem; +} + +void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) { + if (oldmem == 0) + return mspace_malloc(msp, bytes); +#ifdef REALLOC_ZERO_BYTES_FREES + if (bytes == 0) { + mspace_free(msp, oldmem); + return 0; + } +#endif /* REALLOC_ZERO_BYTES_FREES */ + else { +#if FOOTERS + mchunkptr p = mem2chunk(oldmem); + mstate ms = get_mstate_for(p); +#else /* FOOTERS */ + mstate ms = (mstate)msp; +#endif /* FOOTERS */ + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return internal_realloc(ms, oldmem, bytes); + } +} + +void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return internal_memalign(ms, alignment, bytes); +} + +void** mspace_independent_calloc(mspace msp, size_t n_elements, + size_t elem_size, void* chunks[]) { + size_t sz = elem_size; /* serves as 1-element array */ + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return ialloc(ms, n_elements, &sz, 3, chunks); +} + +void** mspace_independent_comalloc(mspace msp, size_t n_elements, + size_t sizes[], void* chunks[]) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return ialloc(ms, n_elements, sizes, 0, chunks); +} + +int mspace_trim(mspace msp, size_t pad) { + int result = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + if (!PREACTION(ms)) { + result = sys_trim(ms, pad); + POSTACTION(ms); + } + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + +void mspace_malloc_stats(mspace msp) { + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + internal_malloc_stats(ms); + } + else { + USAGE_ERROR_ACTION(ms,ms); + } +} + +size_t mspace_footprint(mspace msp) { + size_t result = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + result = ms->footprint; + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + + +size_t mspace_max_footprint(mspace msp) { + size_t result = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + result = ms->max_footprint; + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + + +#if !NO_MALLINFO +struct mallinfo mspace_mallinfo(mspace msp) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + } + return internal_mallinfo(ms); +} +#endif /* NO_MALLINFO */ + +size_t mspace_usable_size(void* mem) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); + if (cinuse(p)) + return chunksize(p) - overhead_for(p); + } + return 0; +} + +int mspace_mallopt(int param_number, int value) { + return change_mparam(param_number, value); +} + +#endif /* MSPACES */ + +/* -------------------- Alternative MORECORE functions ------------------- */ + +/* + Guidelines for creating a custom version of MORECORE: + + * For best performance, MORECORE should allocate in multiples of pagesize. + * MORECORE may allocate more memory than requested. (Or even less, + but this will usually result in a malloc failure.) 
+ * MORECORE must not allocate memory when given argument zero, but + instead return one past the end address of memory from previous + nonzero call. + * For best performance, consecutive calls to MORECORE with positive + arguments should return increasing addresses, indicating that + space has been contiguously extended. + * Even though consecutive calls to MORECORE need not return contiguous + addresses, it must be OK for malloc'ed chunks to span multiple + regions in those cases where they do happen to be contiguous. + * MORECORE need not handle negative arguments -- it may instead + just return MFAIL when given negative arguments. + Negative arguments are always multiples of pagesize. MORECORE + must not misinterpret negative args as large positive unsigned + args. You can suppress all such calls from even occurring by defining + MORECORE_CANNOT_TRIM, + + As an example alternative MORECORE, here is a custom allocator + kindly contributed for pre-OSX macOS. It uses virtually but not + necessarily physically contiguous non-paged memory (locked in, + present and won't get swapped out). You can use it by uncommenting + this section, adding some #includes, and setting up the appropriate + defines above: + + #define MORECORE osMoreCore + + There is also a shutdown routine that should somehow be called for + cleanup upon program exit. + + #define MAX_POOL_ENTRIES 100 + #define MINIMUM_MORECORE_SIZE (64 * 1024U) + static int next_os_pool; + void *our_os_pools[MAX_POOL_ENTRIES]; + + void *osMoreCore(int size) + { + void *ptr = 0; + static void *sbrk_top = 0; + + if (size > 0) + { + if (size < MINIMUM_MORECORE_SIZE) + size = MINIMUM_MORECORE_SIZE; + if (CurrentExecutionLevel() == kTaskLevel) + ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0); + if (ptr == 0) + { + return (void *) MFAIL; + } + // save ptrs so they can be freed during cleanup + our_os_pools[next_os_pool] = ptr; + next_os_pool++; + ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK); + sbrk_top = (char *) ptr + size; + return ptr; + } + else if (size < 0) + { + // we don't currently support shrink behavior + return (void *) MFAIL; + } + else + { + return sbrk_top; + } + } + + // cleanup any allocated memory pools + // called as last thing before shutting down driver + + void osCleanupMem(void) + { + void **ptr; + + for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++) + if (*ptr) + { + PoolDeallocate(*ptr); + *ptr = 0; + } + } + +*/ + + +/* ----------------------------------------------------------------------- +History: + V2.8.4 (not yet released) + * Fix bad error check in mspace_footprint + * Adaptations for ptmalloc, courtesy of Wolfram Gloger. + * Reentrant spin locks, courtesy of Earl Chew and others + * Win32 improvements, courtesy of Niall Douglas and Earl Chew + * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options + * Various small adjustments to reduce warnings on some compilers + * Extension hook in malloc_state + + V2.8.3 Thu Sep 22 11:16:32 2005 Doug Lea (dl at gee) + * Add max_footprint functions + * Ensure all appropriate literals are size_t + * Fix conditional compilation problem for some #define settings + * Avoid concatenating segments with the one provided + in create_mspace_with_base + * Rename some variables to avoid compiler shadowing warnings + * Use explicit lock initialization. + * Better handling of sbrk interference. 
+ * Simplify and fix segment insertion, trimming and mspace_destroy + * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x + * Thanks especially to Dennis Flanagan for help on these. + + V2.8.2 Sun Jun 12 16:01:10 2005 Doug Lea (dl at gee) + * Fix memalign brace error. + + V2.8.1 Wed Jun 8 16:11:46 2005 Doug Lea (dl at gee) + * Fix improper #endif nesting in C++ + * Add explicit casts needed for C++ + + V2.8.0 Mon May 30 14:09:02 2005 Doug Lea (dl at gee) + * Use trees for large bins + * Support mspaces + * Use segments to unify sbrk-based and mmap-based system allocation, + removing need for emulation on most platforms without sbrk. + * Default safety checks + * Optional footer checks. Thanks to William Robertson for the idea. + * Internal code refactoring + * Incorporate suggestions and platform-specific changes. + Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas, + Aaron Bachmann, Emery Berger, and others. + * Speed up non-fastbin processing enough to remove fastbins. + * Remove useless cfree() to avoid conflicts with other apps. + * Remove internal memcpy, memset. Compilers handle builtins better. + * Remove some options that no one ever used and rename others. + + V2.7.2 Sat Aug 17 09:07:30 2002 Doug Lea (dl at gee) + * Fix malloc_state bitmap array misdeclaration + + V2.7.1 Thu Jul 25 10:58:03 2002 Doug Lea (dl at gee) + * Allow tuning of FIRST_SORTED_BIN_SIZE + * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte. + * Better detection and support for non-contiguousness of MORECORE. + Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger + * Bypass most of malloc if no frees. Thanks To Emery Berger. + * Fix freeing of old top non-contiguous chunk im sysmalloc. + * Raised default trim and map thresholds to 256K. + * Fix mmap-related #defines. Thanks to Lubos Lunak. + * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield. + * Branch-free bin calculation + * Default trim and mmap thresholds now 256K. + + V2.7.0 Sun Mar 11 14:14:06 2001 Doug Lea (dl at gee) + * Introduce independent_comalloc and independent_calloc. + Thanks to Michael Pachos for motivation and help. + * Make optional .h file available + * Allow > 2GB requests on 32bit systems. + * new WIN32 sbrk, mmap, munmap, lock code from . + Thanks also to Andreas Mueller , + and Anonymous. + * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for + helping test this.) + * memalign: check alignment arg + * realloc: don't try to shift chunks backwards, since this + leads to more fragmentation in some programs and doesn't + seem to help in any others. + * Collect all cases in malloc requiring system memory into sysmalloc + * Use mmap as backup to sbrk + * Place all internal state in malloc_state + * Introduce fastbins (although similar to 2.5.1) + * Many minor tunings and cosmetic improvements + * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK + * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS + Thanks to Tony E. Bennett and others. + * Include errno.h to support default failure action. + + V2.6.6 Sun Dec 5 07:42:19 1999 Doug Lea (dl at gee) + * return null for negative arguments + * Added Several WIN32 cleanups from Martin C. Fong + * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h' + (e.g. 
WIN32 platforms) + * Cleanup header file inclusion for WIN32 platforms + * Cleanup code to avoid Microsoft Visual C++ compiler complaints + * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing + memory allocation routines + * Set 'malloc_getpagesize' for WIN32 platforms (needs more work) + * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to + usage of 'assert' in non-WIN32 code + * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to + avoid infinite loop + * Always call 'fREe()' rather than 'free()' + + V2.6.5 Wed Jun 17 15:57:31 1998 Doug Lea (dl at gee) + * Fixed ordering problem with boundary-stamping + + V2.6.3 Sun May 19 08:17:58 1996 Doug Lea (dl at gee) + * Added pvalloc, as recommended by H.J. Liu + * Added 64bit pointer support mainly from Wolfram Gloger + * Added anonymously donated WIN32 sbrk emulation + * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen + * malloc_extend_top: fix mask error that caused wastage after + foreign sbrks + * Add linux mremap support code from HJ Liu + + V2.6.2 Tue Dec 5 06:52:55 1995 Doug Lea (dl at gee) + * Integrated most documentation with the code. + * Add support for mmap, with help from + Wolfram Gloger (Gloger@lrz.uni-muenchen.de). + * Use last_remainder in more cases. + * Pack bins using idea from colin@nyx10.cs.du.edu + * Use ordered bins instead of best-fit threshhold + * Eliminate block-local decls to simplify tracing and debugging. + * Support another case of realloc via move into top + * Fix error occuring when initial sbrk_base not word-aligned. + * Rely on page size for units instead of SBRK_UNIT to + avoid surprises about sbrk alignment conventions. + * Add mallinfo, mallopt. Thanks to Raymond Nijssen + (raymond@es.ele.tue.nl) for the suggestion. + * Add `pad' argument to malloc_trim and top_pad mallopt parameter. + * More precautions for cases where other routines call sbrk, + courtesy of Wolfram Gloger (Gloger@lrz.uni-muenchen.de). + * Added macros etc., allowing use in linux libc from + H.J. Lu (hjl@gnu.ai.mit.edu) + * Inverted this history list + + V2.6.1 Sat Dec 2 14:10:57 1995 Doug Lea (dl at gee) + * Re-tuned and fixed to behave more nicely with V2.6.0 changes. + * Removed all preallocation code since under current scheme + the work required to undo bad preallocations exceeds + the work saved in good cases for most test programs. + * No longer use return list or unconsolidated bins since + no scheme using them consistently outperforms those that don't + given above changes. + * Use best fit for very large chunks to prevent some worst-cases. + * Added some support for debugging + + V2.6.0 Sat Nov 4 07:05:23 1995 Doug Lea (dl at gee) + * Removed footers when chunks are in use. Thanks to + Paul Wilson (wilson@cs.texas.edu) for the suggestion. + + V2.5.4 Wed Nov 1 07:54:51 1995 Doug Lea (dl at gee) + * Added malloc_trim, with help from Wolfram Gloger + (wmglo@Dent.MED.Uni-Muenchen.DE). 
+ + V2.5.3 Tue Apr 26 10:16:01 1994 Doug Lea (dl at g) + + V2.5.2 Tue Apr 5 16:20:40 1994 Doug Lea (dl at g) + * realloc: try to expand in both directions + * malloc: swap order of clean-bin strategy; + * realloc: only conditionally expand backwards + * Try not to scavenge used bins + * Use bin counts as a guide to preallocation + * Occasionally bin return list chunks in first scan + * Add a few optimizations from colin@nyx10.cs.du.edu + + V2.5.1 Sat Aug 14 15:40:43 1993 Doug Lea (dl at g) + * faster bin computation & slightly different binning + * merged all consolidations to one part of malloc proper + (eliminating old malloc_find_space & malloc_clean_bin) + * Scan 2 returns chunks (not just 1) + * Propagate failure in realloc if malloc returns 0 + * Add stuff to allow compilation on non-ANSI compilers + from kpv@research.att.com + + V2.5 Sat Aug 7 07:41:59 1993 Doug Lea (dl at g.oswego.edu) + * removed potential for odd address access in prev_chunk + * removed dependency on getpagesize.h + * misc cosmetics and a bit more internal documentation + * anticosmetics: mangled names in macros to evade debugger strangeness + * tested on sparc, hp-700, dec-mips, rs6000 + with gcc & native cc (hp, dec only) allowing + Detlefs & Zorn comparison study (in SIGPLAN Notices.) + + Trial version Fri Aug 28 13:14:29 1992 Doug Lea (dl at g.oswego.edu) + * Based loosely on libg++-1.2X malloc. (It retains some of the overall + structure of old version, but most details differ.) + +*/ + + diff --git a/oshmem/mca/memheap/ptmalloc/malloc_defs.h b/oshmem/mca/memheap/ptmalloc/malloc_defs.h new file mode 100644 index 0000000000..a58b72f33b --- /dev/null +++ b/oshmem/mca/memheap/ptmalloc/malloc_defs.h @@ -0,0 +1,32 @@ +#ifndef _MALLOC_DEFS_H +#define _MALLOC_DEFS_H + +#include "oshmem/runtime/runtime.h" + +/* See malloc.c for detailed parameter description */ +#define USE_SPIN_LOCKS 0 +#define USE_DL_PREFIX +#define ABORT oshmem_shmem_abort(-2) +//#define ABORT abort() +#define MORECORE mca_memheap_ptmalloc_sbrk +#define MORECORE_CANNOT_TRIM +#define DL_HAVE_MMAP 0 +#define DL_HAVE_MREMAP 0 +#define malloc_getpagesize mca_memheap_ptmalloc_getpagesize() +#define REALLOC_ZERO_BYTES_FREES +#define ABORT_ON_ASSERT_FAILURE 1 +/* next two are useful for debugging */ +#define DL_DEBUG 0 +#define FOOTERS 0 +/* print error if *alloc() is called with incorrect params */ +#define USAGE_ERROR_ACTION(m, p) do { printf("PTMALLOC: USAGE ERROR DETECTED: m=%p ptr=%p\n", m, p); } while (0) + +int mca_memheap_ptmalloc_getpagesize(void); +void *mca_memheap_ptmalloc_sbrk(size_t size); + +void* dlmalloc(size_t); +void dlfree(void*); +void* dlrealloc(void*, size_t); +void* dlmemalign(size_t, size_t); + +#endif diff --git a/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.c b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.c new file mode 100644 index 0000000000..26dd203ca1 --- /dev/null +++ b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.c @@ -0,0 +1,179 @@ +/* Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h"
+#include "oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "opal/class/opal_hash_table.h"
+#include "opal/class/opal_object.h"
+#include "orte/util/name_fns.h"
+
+mca_memheap_ptmalloc_module_t memheap_ptmalloc = {
+    {
+        &mca_memheap_ptmalloc_component,
+        mca_memheap_ptmalloc_finalize,
+        mca_memheap_ptmalloc_alloc,
+        mca_memheap_ptmalloc_align,
+        mca_memheap_ptmalloc_realloc,
+        mca_memheap_ptmalloc_free,
+
+        mca_memheap_ptmalloc_alloc,
+        mca_memheap_ptmalloc_free,
+
+        mca_memheap_base_get_cached_mkey,
+        mca_memheap_base_get_mkey,
+        mca_memheap_base_find_offset,
+        mca_memheap_base_is_symmetric_addr,
+        mca_memheap_modex_recv_all,
+
+        0
+    },
+    100 /* priority */
+};
+
+/* Memory Heap ptmalloc Implementation */
+/**
+ * Initialize the Memory Heap
+ */
+int mca_memheap_ptmalloc_module_init(memheap_context_t *context)
+{
+    if (!context || !context->user_size || !context->private_size) {
+        return OSHMEM_ERR_BAD_PARAM;
+    }
+
+    /* Construct a mutex object */
+    OBJ_CONSTRUCT(&memheap_ptmalloc.lock, opal_mutex_t);
+    memheap_ptmalloc.base = context->user_base_addr;
+    memheap_ptmalloc.cur_size = 0;
+    memheap_ptmalloc.max_size = context->user_size + context->private_size;
+    memheap_ptmalloc.max_alloc_size = context->user_size;
+
+    MEMHEAP_VERBOSE(1,
+                    "symmetric heap memory (user+private): %llu bytes",
+                    (unsigned long long)(context->user_size + context->private_size));
+
+    /* disabled until we figure out the double modex & grpcomm.bad problem */
+    // memheap_modex_mkey_exchange();
+    return OSHMEM_SUCCESS;
+}
+
+/**
+ * Allocate size bytes on the symmetric heap.
+ * The returned address is suitably aligned for any data type.
+ */
+int mca_memheap_ptmalloc_alloc(size_t size, void** p_buff)
+{
+    if (size > memheap_ptmalloc.max_alloc_size) {
+        *p_buff = 0;
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+
+    OPAL_THREAD_LOCK(&memheap_ptmalloc.lock);
+    *p_buff = dlmalloc(size);
+    OPAL_THREAD_UNLOCK(&memheap_ptmalloc.lock);
+
+    if (NULL == *p_buff)
+        return OSHMEM_ERROR;
+
+    return OSHMEM_SUCCESS;
+}
+
+int mca_memheap_ptmalloc_align(size_t align, size_t size, void **p_buff)
+{
+    if (size > memheap_ptmalloc.max_alloc_size) {
+        *p_buff = 0;
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (align == 0) {
+        *p_buff = 0;
+        return OSHMEM_ERROR;
+    }
+
+    /* check that align is a power of 2 */
+    if (align & (align - 1)) {
+        *p_buff = 0;
+        return OSHMEM_ERROR;
+    }
+
+    OPAL_THREAD_LOCK(&memheap_ptmalloc.lock);
+    *p_buff = dlmemalign(align, size);
+    OPAL_THREAD_UNLOCK(&memheap_ptmalloc.lock);
+
+    if (NULL == *p_buff)
+        return OSHMEM_ERROR;
+
+    return OSHMEM_SUCCESS;
+}
+
+int mca_memheap_ptmalloc_realloc(size_t new_size,
+                                 void *p_buff,
+                                 void **p_new_buff)
+{
+    if (new_size > memheap_ptmalloc.max_alloc_size) {
+        *p_new_buff = 0;
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+
+    OPAL_THREAD_LOCK(&memheap_ptmalloc.lock);
+    *p_new_buff = dlrealloc(p_buff, new_size);
+    OPAL_THREAD_UNLOCK(&memheap_ptmalloc.lock);
+
+    if (!*p_new_buff)
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+
+    return OSHMEM_SUCCESS;
+}
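+
+/*
+ * Illustrative sketch (not part of the component API): the dl* calls above
+ * reach the symmetric heap through the defines in malloc_defs.h
+ * (MORECORE -> mca_memheap_ptmalloc_sbrk, MORECORE_CANNOT_TRIM). A request
+ * that dlmalloc cannot satisfy from its free lists bottoms out in the
+ * sbrk-like bump allocator defined later in this file:
+ *
+ *     void *buf = NULL;
+ *     mca_memheap_ptmalloc_alloc(1024, &buf);
+ *         -> dlmalloc(1024)
+ *         -> sys_alloc() -> MORECORE(nb)  == mca_memheap_ptmalloc_sbrk(nb)
+ *         -> returns base + cur_size and advances cur_size by nb
+ *
+ * Because MORECORE_CANNOT_TRIM is set, cur_size only grows (up to max_size);
+ * nothing is ever handed back to the OS.
+ */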
+/*
+ * Free a variable allocated on the symmetric heap.
+ */
+int mca_memheap_ptmalloc_free(void* ptr)
+{
+    OPAL_THREAD_LOCK(&memheap_ptmalloc.lock);
+    dlfree(ptr);
+    OPAL_THREAD_UNLOCK(&memheap_ptmalloc.lock);
+    return OSHMEM_SUCCESS;
+}
+
+int mca_memheap_ptmalloc_finalize(void)
+{
+    MEMHEAP_VERBOSE(5, "deregistering symmetric heap");
+    return OSHMEM_SUCCESS;
+}
+
+/* reported to dlmalloc as the page size via the malloc_getpagesize define
+ * in malloc_defs.h */
+int mca_memheap_ptmalloc_getpagesize(void)
+{
+    return 2 * 1024 * 1024;
+}
+
+/* must match the definitions in malloc.c */
+#define PTMALLOC_MAX_SIZE_T (~(size_t)0)
+#define PTMALLOC_MFAIL      ((void*)(PTMALLOC_MAX_SIZE_T))
+
+/* sbrk-like bump allocator: hands out consecutive portions of the
+ * preallocated symmetric heap until max_size is exhausted */
+void *mca_memheap_ptmalloc_sbrk(size_t size)
+{
+    char *ret;
+
+    if (memheap_ptmalloc.cur_size + size > memheap_ptmalloc.max_size) {
+        return PTMALLOC_MFAIL;
+    }
+
+    ret = (char *) memheap_ptmalloc.base + memheap_ptmalloc.cur_size;
+    memheap_ptmalloc.cur_size += size;
+
+    return ret;
+}
+
diff --git a/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h
new file mode 100644
index 0000000000..d18ec0a52f
--- /dev/null
+++ b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h
@@ -0,0 +1,71 @@
+/**
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ * ptmalloc-based implementation of the symmetric heap (MEMHEAP) component
+ */
+#ifndef MCA_MEMHEAP_PTMALLOC_H
+#define MCA_MEMHEAP_PTMALLOC_H
+
+#include "oshmem_config.h"
+#include "opal/mca/mca.h"
+#include "opal/class/opal_list.h"
+#include "opal/threads/mutex.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "oshmem/mca/spml/spml.h"
+#include "opal/class/opal_hash_table.h"
+#include "ompi/mca/btl/btl.h"
+#include
+#include
+#include
+
+BEGIN_C_DECLS
+
+#include "malloc_defs.h"
+/*
+ * At the moment we use only the dlmalloc part of ptmalloc3. Thread safety is
+ * implemented by taking a lock around alloc operations; since all shmem
+ * allocation calls are collectives, malloc performance is not a concern, so
+ * it makes sense to use the simpler algorithm.
+ *
+ * The heap is allocated in one chunk, and we implement our own sbrk-like
+ * function that serves portions of this memory to malloc.
+ *
+ * At the moment we do not support growing the heap or returning heap memory
+ * to the OS.
+ */
+
+/* Structure for managing the shmem symmetric heap */
+struct mca_memheap_ptmalloc_module_t {
+    mca_memheap_base_module_t super;
+    int priority;      /**< Module's priority */
+    void *base;
+    size_t cur_size;
+    size_t max_size;
+    size_t max_alloc_size;
+    opal_mutex_t lock; /**< Part of the allocator */
+};
+
+typedef struct mca_memheap_ptmalloc_module_t mca_memheap_ptmalloc_module_t;
+OSHMEM_DECLSPEC extern mca_memheap_ptmalloc_module_t memheap_ptmalloc;
+
+/*
+ * Module interface.
+ */
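+/*
+ * Usage sketch (hypothetical caller; error paths abbreviated). All entry
+ * points below return an OSHMEM status code and hand memory back through an
+ * output pointer:
+ *
+ *     void *p = NULL;
+ *     if (OSHMEM_SUCCESS == mca_memheap_ptmalloc_align(64, 1024, &p)) {
+ *         ... p points to 64-byte aligned symmetric heap memory ...
+ *         mca_memheap_ptmalloc_free(p);
+ *     }
+ */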
+OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_module_init(memheap_context_t *);
+OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_alloc(size_t, void**);
+OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_realloc(size_t, void*, void **);
+OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_align(size_t, size_t, void**);
+OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_free(void*);
+OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_finalize(void);
+
+END_C_DECLS
+
+#endif /* MCA_MEMHEAP_PTMALLOC_H */
diff --git a/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.c b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.c
new file mode 100644
index 0000000000..cd5af8a904
--- /dev/null
+++ b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "oshmem_config.h"
+#include "opal/util/output.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h"
+#include "memheap_ptmalloc_component.h"
+
+static int mca_memheap_ptmalloc_component_close(void);
+static mca_memheap_base_module_t* mca_memheap_ptmalloc_component_init(memheap_context_t *,
+                                                                      int *);
+
+static int __basic_open(void);
+
+mca_memheap_base_component_t mca_memheap_ptmalloc_component = {
+    {
+        MCA_MEMHEAP_BASE_VERSION_2_0_0,
+
+        "ptmalloc",              /* MCA component name */
+        OSHMEM_MAJOR_VERSION,    /* MCA component major version */
+        OSHMEM_MINOR_VERSION,    /* MCA component minor version */
+        OSHMEM_RELEASE_VERSION,  /* MCA component release version */
+
+        __basic_open,
+        mca_memheap_ptmalloc_component_close,
+        NULL
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+    mca_memheap_ptmalloc_component_init
+};
+
+/* Open component */
+static int __basic_open(void)
+{
+    return OSHMEM_SUCCESS;
+}
+
+/* Initialize component */
+static mca_memheap_base_module_t* mca_memheap_ptmalloc_component_init(memheap_context_t *context,
+                                                                      int *priority)
+{
+    int rc;
+
+    *priority = memheap_ptmalloc.priority;
+    rc = mca_memheap_ptmalloc_module_init(context);
+    if (OSHMEM_SUCCESS != rc) {
+        return NULL;
+    }
+
+    return &(memheap_ptmalloc.super);
+}
+
+/*
+ * This function is automatically called from mca_base_components_close.
+ * It releases the component's allocated memory.
+ */
+static int mca_memheap_ptmalloc_component_close(void)
+{
+    mca_memheap_ptmalloc_finalize();
+    return OSHMEM_SUCCESS;
+}
diff --git a/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h
new file mode 100644
index 0000000000..0f7dc39b2d
--- /dev/null
+++ b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+
+#ifndef MCA_MEMHEAP_PTMALLOC_COMPONENT_H
+#define MCA_MEMHEAP_PTMALLOC_COMPONENT_H
+
+BEGIN_C_DECLS
+
+/*
+ * MEMHEAP module functions.
+ */
+OSHMEM_MODULE_DECLSPEC extern mca_memheap_base_component_2_0_0_t mca_memheap_ptmalloc_component;
+
+END_C_DECLS
+
+#endif /* MCA_MEMHEAP_PTMALLOC_COMPONENT_H */
diff --git a/oshmem/mca/scoll/Makefile.am b/oshmem/mca/scoll/Makefile.am
new file mode 100644
index 0000000000..4463de944e
--- /dev/null
+++ b/oshmem/mca/scoll/Makefile.am
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2013      Mellanox Technologies, Inc.
+#                         All rights reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_scoll.la +libmca_scoll_la_SOURCES = + +# header setup +nobase_oshmem_HEADERS = +nobase_nodist_oshmem_HEADERS = + +# local files +headers = scoll.h +libmca_scoll_la_SOURCES += $(headers) $(nodist_headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +nobase_oshmem_HEADERS += $(headers) +nobase_nodist_oshmem_HEADERS += $(nodist_headers) +oshmemdir = $(includedir)/oshmem/oshmem/mca/scoll +else +oshmemdir = $(includedir) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/oshmem/mca/scoll/base/Makefile.am b/oshmem/mca/scoll/base/Makefile.am new file mode 100644 index 0000000000..4a6f608b16 --- /dev/null +++ b/oshmem/mca/scoll/base/Makefile.am @@ -0,0 +1,19 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +headers += \ + base/base.h + +libmca_scoll_la_SOURCES += \ + base/scoll_base_frame.c \ + base/scoll_base_available.c \ + base/scoll_base_select.c diff --git a/oshmem/mca/scoll/base/base.h b/oshmem/mca/scoll/base/base.h new file mode 100644 index 0000000000..db61ec1ee5 --- /dev/null +++ b/oshmem/mca/scoll/base/base.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_SCOLL_BASE_H +#define MCA_SCOLL_BASE_H + +#include "oshmem_config.h" + +#include "oshmem/mca/memheap/memheap.h" +#include "opal/class/opal_list.h" + +/* + * Global functions for MCA overall collective open and close + */ + +BEGIN_C_DECLS + +/* + * Globals + */ +/** + * Special synchronization array to do barrier all. + */ +OSHMEM_DECLSPEC extern long* mca_scoll_sync_array; + +OSHMEM_DECLSPEC int mca_scoll_base_find_available(bool enable_progress_threads, + bool enable_threads); + +OSHMEM_DECLSPEC int mca_scoll_base_select(struct oshmem_group_t *group); + +int mca_scoll_base_group_unselect(struct oshmem_group_t *group); + +OSHMEM_DECLSPEC int mca_scoll_enable(void); + +/* + * MCA framework + */ +OSHMEM_DECLSPEC extern mca_base_framework_t oshmem_scoll_base_framework; + +/* ******************************************************************** */ +#ifdef __BASE_FILE__ +#define __SCOLL_FILE__ __BASE_FILE__ +#else +#define __SCOLL_FILE__ __FILE__ +#endif + +#define SCOLL_VERBOSE(level, format, ...) \ + opal_output_verbose(level, oshmem_scoll_base_framework.framework_output, "%s:%d - %s() " format, \ + __SCOLL_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define SCOLL_ERROR(format, ... ) \ + opal_output_verbose(0, oshmem_scoll_base_framework.framework_output, "Error: %s:%d - %s() " format, \ + __SCOLL_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +END_C_DECLS + +#endif /* MCA_SCOLL_BASE_H */ diff --git a/oshmem/mca/scoll/base/scoll_base_available.c b/oshmem/mca/scoll/base/scoll_base_available.c new file mode 100644 index 0000000000..f1a36fec0b --- /dev/null +++ b/oshmem/mca/scoll/base/scoll_base_available.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include + +#include "oshmem_config.h" + +#include "orte/util/show_help.h" + +#include "opal/class/opal_list.h" +#include "opal/util/output.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_component_repository.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" + +/* + * Private functions + */ +static int init_query(const mca_base_component_t * ls, + bool enable_progress_threads, + bool enable_threads); + +/* + * Scan down the list of successfully opened components and query each of + * them (the opened list will be one or more components. If the user + * requested a specific component, it will be the only component in the + * opened list). Create and populate the available list of all + * components who indicate that they want to be considered for selection. + * Close all components who do not want to be considered for selection, + * and destroy the opened list. + * + * Also find the basic component while we're doing all of this, and save + * it in a global variable so that we can find it easily later (e.g., + * during scope selection). + */ +int mca_scoll_base_find_available(bool enable_progress_threads, + bool enable_threads) +{ + mca_base_component_list_item_t *cli, *next; + const mca_base_component_t *component; + + OPAL_LIST_FOREACH_SAFE(cli, next, &oshmem_scoll_base_framework.framework_components, mca_base_component_list_item_t) { + component = cli->cli_component; + + /* Call a subroutine to do the work, because the component may + represent different versions of the coll MCA. */ + + if (OSHMEM_SUCCESS != init_query(component, enable_progress_threads, + enable_threads)) { + /* If the component doesn't want to run, then close it. + Now close it out and release it from the DSO repository (if it's there). */ + opal_list_remove_item(&oshmem_scoll_base_framework.framework_components, &cli->super); + mca_base_component_close(component, oshmem_scoll_base_framework.framework_output); + OBJ_RELEASE(cli); + } + } + + /* If we have no collective components available, it's an error. + Thanks for playing! */ + + if (opal_list_get_size(&oshmem_scoll_base_framework.framework_components) == 0) { + SCOLL_VERBOSE(10, + "scoll:find_available: no components available!"); + return OSHMEM_ERROR; + } + + /* All done */ + + return OSHMEM_SUCCESS; +} + +/* + * Query a component, see if it wants to run at all. If it does, save + * some information. If it doesn't, close it. + */ +static int init_query(const mca_base_component_t * component, + bool enable_progress_threads, + bool enable_threads) +{ + int ret; + + SCOLL_VERBOSE(10, + "scoll:find_available: querying scoll component %s", + component->mca_component_name); + + /* This component has already been successfully opened. So now + query it. 
*/ + + if (1 == component->mca_type_major_version + && 0 == component->mca_type_minor_version + && 0 == component->mca_type_release_version) { + + mca_scoll_base_component_t *scoll = + (mca_scoll_base_component_t *) component; + + ret = scoll->scoll_init(enable_progress_threads, enable_threads); + } else { + /* Unrecognized coll API version */ + + SCOLL_VERBOSE(10, + "scoll:find_available: unrecognized scoll API version (%d.%d.%d, ignored)", + component->mca_type_major_version, + component->mca_type_minor_version, + component->mca_type_release_version); + return OSHMEM_ERROR; + } + + /* Query done -- look at the return value to see what happened */ + + if (OSHMEM_SUCCESS != ret) { + SCOLL_VERBOSE(10, + "scoll:find_available: scoll component %s is not available", + component->mca_component_name); + if (NULL != component->mca_close_component) { + component->mca_close_component(); + } + } else { + SCOLL_VERBOSE(10, + "scoll:find_available: scoll component %s is available", + component->mca_component_name); + } + + /* All done */ + + return ret; +} diff --git a/oshmem/mca/scoll/base/scoll_base_frame.c b/oshmem/mca/scoll/base/scoll_base_frame.c new file mode 100644 index 0000000000..12857c62b8 --- /dev/null +++ b/oshmem/mca/scoll/base/scoll_base_frame.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + +#include "oshmem_config.h" + +#include "oshmem/constants.h" + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" + +/* + * The following file was created by configure. It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. 
+ */ + +#include "oshmem/mca/scoll/base/static-components.h" + +/* + * Global variables; most of which are loaded by back-ends of MCA + * variables + */ +long* mca_scoll_sync_array = NULL; + +/* + * Ensure all function pointers are NULL'ed out to start with + */ +static void scoll_base_module_construct(mca_scoll_base_module_t *m) +{ + /* Collective function pointers */ + m->scoll_barrier = NULL; + m->scoll_broadcast = NULL; + m->scoll_collect = NULL; + m->scoll_reduce = NULL; + m->scoll_module_enable = NULL; +} + +OBJ_CLASS_INSTANCE(mca_scoll_base_module_t, opal_object_t, + scoll_base_module_construct, NULL); + +int mca_scoll_enable(void) +{ + int ret = OSHMEM_SUCCESS; + + if (!mca_scoll_sync_array) { + void* ptr = (void*) mca_scoll_sync_array; + int i = 0; + + MCA_MEMHEAP_CALL(private_alloc((_SHMEM_BARRIER_SYNC_SIZE * sizeof(*mca_scoll_sync_array)), &ptr)); + mca_scoll_sync_array = ptr; + + for (i = 0; i < _SHMEM_BARRIER_SYNC_SIZE; i++) { + mca_scoll_sync_array[i] = _SHMEM_SYNC_VALUE; + } + } + + /* Note: it is done to support FCA only and we need to consider possibility to + * find a way w/o this ugly hack + */ + if (OSHMEM_SUCCESS != (ret = mca_scoll_base_select(oshmem_group_all))) { + return ret; + } + if (OSHMEM_SUCCESS != (ret = mca_scoll_base_select(oshmem_group_self))) { + return ret; + } + + return OSHMEM_SUCCESS; +} + +static int mca_scoll_base_register(mca_base_register_flag_t flags) +{ + return OSHMEM_SUCCESS; +} + +static int mca_scoll_base_close(void) +{ + /* This call should be done before memheap close */ + if (mca_scoll_sync_array) { + void* ptr = (void*) mca_scoll_sync_array; + + MCA_MEMHEAP_CALL(private_free(ptr)); + mca_scoll_sync_array = NULL; + } + + return OSHMEM_SUCCESS; +} + +static int mca_scoll_base_open(mca_base_open_flag_t flags) +{ + return OSHMEM_SUCCESS; +} + +MCA_BASE_FRAMEWORK_DECLARE(oshmem, scoll, + "OSHMEM SCOLL", + mca_scoll_base_register, + mca_scoll_base_open, + mca_scoll_base_close, + mca_scoll_base_static_components, + 0); diff --git a/oshmem/mca/scoll/base/scoll_base_select.c b/oshmem/mca/scoll/base/scoll_base_select.c new file mode 100644 index 0000000000..78d8020a84 --- /dev/null +++ b/oshmem/mca/scoll/base/scoll_base_select.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include +#include +#include + +#include "oshmem/constants.h" + +#include "opal/class/opal_list.h" +#include "opal/util/output.h" +#include "orte/util/show_help.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_component_repository.h" + +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" +#include "oshmem/proc/proc.h" +#include "oshmem/runtime/runtime.h" + +/* + * Local types + */ +struct avail_com_t { + opal_list_item_t super; + + int ac_priority; + mca_scoll_base_module_t *ac_module; +}; +typedef struct avail_com_t avail_com_t; + +/* + * Local functions + */ +static opal_list_t *check_components(opal_list_t * components, + oshmem_group_t * group); +static int check_one_component(oshmem_group_t * group, + const mca_base_component_t * component, + mca_scoll_base_module_1_0_0_t ** module); + +static int query(const mca_base_component_t * component, + oshmem_group_t * group, + int *priority, + mca_scoll_base_module_1_0_0_t ** module); + +static int query_1_0_0(const mca_scoll_base_component_1_0_0_t * scoll_component, + oshmem_group_t * group, + int *priority, + mca_scoll_base_module_1_0_0_t ** module); + +static int scoll_null_barrier(struct oshmem_group_t *group, + long *pSync, + int alg) +{ + if (oshmem_proc_group_is_member(group)) { + SCOLL_ERROR("internal error"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static int scoll_null_broadcast(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync, + int alg) +{ + if (oshmem_proc_group_is_member(group)) { + SCOLL_ERROR("internal error"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static int scoll_null_collect(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync, + bool nlong_type, + int alg) +{ + if (oshmem_proc_group_is_member(group)) { + SCOLL_ERROR("internal error"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static int scoll_null_reduce(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk, + int alg) +{ + if (oshmem_proc_group_is_member(group)) { + SCOLL_ERROR("internal error"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +/* + * Stuff for the OBJ interface + */ +static OBJ_CLASS_INSTANCE(avail_com_t, opal_list_item_t, NULL, NULL); + +#define COPY(module, group, func) \ + do { \ + if (NULL != module->scoll_ ## func) { \ + if (NULL != group->g_scoll.scoll_ ## func ## _module) { \ + OBJ_RELEASE(group->g_scoll.scoll_ ## func ## _module); \ + } \ + group->g_scoll.scoll_ ## func = module->scoll_ ## func; \ + group->g_scoll.scoll_ ## func ## _module = module; \ + OBJ_RETAIN(module); \ + } \ + } while (0) + +#define CLOSE(group, func) \ + do { \ + if (NULL != group->g_scoll.scoll_ ## func ## _module) { \ + OBJ_RELEASE(group->g_scoll.scoll_ ## func ## _module); \ + group->g_scoll.scoll_## func = NULL; \ + group->g_scoll.scoll_## func ## _module = NULL; \ + } \ + } while (0) + +int mca_scoll_base_group_unselect(struct oshmem_group_t * group) +{ + /* + * scoll close() is called before group destructors, so + * do close group collectives if scoll modules are no longer + * valid + * + * there is a memory leak here, because not 
doing close means + * that we leaving object with dangling ref counts + */ + SCOLL_VERBOSE(10, "scoll:base:group_unselect: group: %d", group->id); + + CLOSE(group, barrier); + CLOSE(group, broadcast); + CLOSE(group, collect); + CLOSE(group, reduce); + + /* All done */ + return OSHMEM_SUCCESS; +} +/* + * This function is called at the initialization time of every + * group. It is used to select which coll component will be + * active for a given group. + */ +int mca_scoll_base_select(struct oshmem_group_t *group) +{ + opal_list_t *selectable; + opal_list_item_t *item; + int ret; + + /* Announce */ + SCOLL_VERBOSE(10, "scoll:base:group_select: new group: %d", group->id); + mca_scoll_base_group_unselect(group); + memset(&group->g_scoll, 0, sizeof(mca_scoll_base_group_scoll_t)); + if (!oshmem_proc_group_is_member(group)) { + group->g_scoll.scoll_barrier = scoll_null_barrier; + group->g_scoll.scoll_broadcast = scoll_null_broadcast; + group->g_scoll.scoll_collect = scoll_null_collect; + group->g_scoll.scoll_reduce = scoll_null_reduce; + return OSHMEM_SUCCESS; + } + SCOLL_VERBOSE(10, + "scoll:base:group_select: Checking all available modules"); + selectable = check_components(&oshmem_scoll_base_framework.framework_components, group); + + /* Upon return from the above, the modules list will contain the + list of modules that returned (priority >= 0). If we have no + collective modules available, then print error and return. */ + if (NULL == selectable) { + /* There's no modules available */ + return OSHMEM_ERROR; + } + + /* do the selection loop */ + for (item = opal_list_remove_first(selectable); NULL != item; item = + opal_list_remove_first(selectable)) { + avail_com_t *avail = (avail_com_t *) item; + ret = avail->ac_module->scoll_module_enable(avail->ac_module, group); + if (OSHMEM_SUCCESS != ret) { + mca_scoll_base_group_unselect(group); + } else { + COPY(avail->ac_module, group, broadcast); + COPY(avail->ac_module, group, collect); + COPY(avail->ac_module, group, reduce); + COPY(avail->ac_module, group, barrier); + } + OBJ_RELEASE(avail->ac_module); + OBJ_RELEASE(avail); + } + + /* Done with the list from the check_components() call so release it. */ + OBJ_RELEASE(selectable); + if ((NULL == group->g_scoll.scoll_barrier) + || (NULL == group->g_scoll.scoll_broadcast) + || (NULL == group->g_scoll.scoll_collect) + || (NULL == group->g_scoll.scoll_reduce)) { + mca_scoll_base_group_unselect(group); + return OSHMEM_ERR_NOT_FOUND; + } + + return OSHMEM_SUCCESS; +} + +static int avail_coll_compare (opal_list_item_t **a, + opal_list_item_t **b) +{ + avail_com_t *acom = (avail_com_t *) *a; + avail_com_t *bcom = (avail_com_t *) *b; + + if (acom->ac_priority > bcom->ac_priority) { + return 1; + } else if (acom->ac_priority < bcom->ac_priority) { + return -1; + } + + return 0; +} + +/* + * For each module in the list, check and see if it wants to run, and + * do the resulting priority comparison. Make a list of modules to be + * only those who returned that they want to run, and put them in + * priority order. 
+ */ +static opal_list_t *check_components(opal_list_t *components, + oshmem_group_t *group) +{ + int priority; + const mca_base_component_t *component; + mca_base_component_list_item_t *cli; + mca_scoll_base_module_1_0_0_t *module; + opal_list_t *selectable; + avail_com_t *avail; + + /* Make a list of the components that query successfully */ + selectable = OBJ_NEW(opal_list_t); + + /* Scan through the list of components */ + OPAL_LIST_FOREACH(cli, &oshmem_scoll_base_framework.framework_components, mca_base_component_list_item_t) { + component = cli->cli_component; + + priority = check_one_component(group, component, &module); + if (priority >= 0) { + /* We have a component that indicated that it wants to run + by giving us a module */ + avail = OBJ_NEW(avail_com_t); + avail->ac_priority = priority; + avail->ac_module = module; + + opal_list_append(selectable, &avail->super); + } + } + + /* If we didn't find any available components, return an error */ + if (0 == opal_list_get_size(selectable)) { + OBJ_RELEASE(selectable); + return NULL; + } + + /* Put this list in priority order */ + opal_list_sort(selectable, avail_coll_compare); + + /* All done */ + return selectable; +} + +/* + * Check a single component + */ +static int check_one_component(oshmem_group_t *group, + const mca_base_component_t *component, + mca_scoll_base_module_1_0_0_t **module) +{ + int err; + int priority = -1; + + err = query(component, group, &priority, module); + + if (OSHMEM_SUCCESS == err) { + priority = (priority < 100) ? priority : 100; + SCOLL_VERBOSE(10, + "scoll:base:group_select: component available: %s, priority: %d", + component->mca_component_name, priority); + + } else { + priority = -1; + SCOLL_VERBOSE(10, + "scoll:base:group_select: component not available: %s", + component->mca_component_name); + } + + return priority; +} + +/************************************************************************** + * Query functions + **************************************************************************/ + +/* + * Take any version of a coll module, query it, and return the right + * module struct + */ +static int query(const mca_base_component_t * component, + oshmem_group_t *group, + int *priority, + mca_scoll_base_module_1_0_0_t **module) +{ + *module = NULL; + if (1 == component->mca_type_major_version + && 0 == component->mca_type_minor_version + && 0 == component->mca_type_release_version) { + const mca_scoll_base_component_1_0_0_t *coll100 = + (mca_scoll_base_component_1_0_0_t *) component; + + return query_1_0_0(coll100, group, priority, module); + } + + /* Unknown coll API version -- return error */ + + return OSHMEM_ERROR; +} + +static int query_1_0_0(const mca_scoll_base_component_1_0_0_t *component, + oshmem_group_t *group, + int *priority, + mca_scoll_base_module_1_0_0_t **module) +{ + mca_scoll_base_module_1_0_0_t *ret; + + /* There's currently no need for conversion */ + + ret = component->scoll_query(group, priority); + if (NULL != ret) { + *module = ret; + return OSHMEM_SUCCESS; + } + + return OSHMEM_ERROR; +} diff --git a/oshmem/mca/scoll/basic/Makefile.am b/oshmem/mca/scoll/basic/Makefile.am new file mode 100644 index 0000000000..150e6bb150 --- /dev/null +++ b/oshmem/mca/scoll/basic/Makefile.am @@ -0,0 +1,42 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +sources = \ + scoll_basic.h \ + scoll_basic_module.c \ + scoll_basic_component.c \ + scoll_basic_barrier.c \ + scoll_basic_broadcast.c \ + scoll_basic_collect.c \ + scoll_basic_reduce.c + + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_oshmem_scoll_basic_DSO +component_noinst = +component_install = mca_scoll_basic.la +else +component_noinst = libmca_scoll_basic.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_scoll_basic_la_SOURCES = $(sources) +mca_scoll_basic_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_scoll_basic_la_SOURCES =$(sources) +libmca_scoll_basic_la_LDFLAGS = -module -avoid-version diff --git a/oshmem/mca/scoll/basic/configure.params b/oshmem/mca/scoll/basic/configure.params new file mode 100644 index 0000000000..1b6b5ba51c --- /dev/null +++ b/oshmem/mca/scoll/basic/configure.params @@ -0,0 +1,13 @@ +# -*- shell-script -*- +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/scoll/basic/scoll_basic.h b/oshmem/mca/scoll/basic/scoll_basic.h new file mode 100644 index 0000000000..315a66f3c1 --- /dev/null +++ b/oshmem/mca/scoll/basic/scoll_basic.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_SCOLL_BASIC_H +#define MCA_SCOLL_BASIC_H + +#include "oshmem_config.h" + +#include "opal/mca/mca.h" +#include "oshmem/mca/scoll/scoll.h" + +BEGIN_C_DECLS + +/* Globally exported variables */ + +OSHMEM_MODULE_DECLSPEC extern mca_scoll_base_component_1_0_0_t +mca_scoll_basic_component; + +extern int mca_scoll_basic_priority_param; +OSHMEM_DECLSPEC extern int mca_scoll_basic_param_barrier_algorithm; +extern int mca_scoll_basic_param_broadcast_algorithm; +extern int mca_scoll_basic_param_collect_algorithm; +extern int mca_scoll_basic_param_reduce_algorithm; + +/* API functions */ + +int mca_scoll_basic_init(bool enable_progress_threads, bool enable_threads); +mca_scoll_base_module_t* +mca_scoll_basic_query(struct oshmem_group_t *group, int *priority); + +enum { + SHMEM_SYNC_INIT = _SHMEM_SYNC_VALUE, + SHMEM_SYNC_WAIT = -2, + SHMEM_SYNC_RUN = -3, + SHMEM_SYNC_READY = -4, +}; + +int mca_scoll_basic_barrier(struct oshmem_group_t *group, long *pSync, int alg); +int mca_scoll_basic_broadcast(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync, + int alg); +int mca_scoll_basic_collect(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync, + bool nlong_type, + int alg); +int mca_scoll_basic_reduce(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk, + int alg); + +static inline unsigned int scoll_log2(unsigned long val) +{ + unsigned int count = 0; + + while (val > 0) { + val = val >> 1; + count++; + } + + return count > 0 ? 
count - 1 : 0;
+}
+
+struct mca_scoll_basic_module_t {
+    mca_scoll_base_module_t super;
+};
+typedef struct mca_scoll_basic_module_t mca_scoll_basic_module_t;
+OBJ_CLASS_DECLARATION(mca_scoll_basic_module_t);
+
+END_C_DECLS
+
+#endif /* MCA_SCOLL_BASIC_H */
diff --git a/oshmem/mca/scoll/basic/scoll_basic_barrier.c b/oshmem/mca/scoll/basic/scoll_basic_barrier.c
new file mode 100644
index 0000000000..5876095d49
--- /dev/null
+++ b/oshmem/mca/scoll/basic/scoll_basic_barrier.c
@@ -0,0 +1,585 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "orte/mca/grpcomm/grpcomm.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "oshmem/proc/proc.h"
+#include "scoll_basic.h"
+
+static int __algorithm_central_counter(struct oshmem_group_t *group,
+                                       long *pSync);
+static int __algorithm_tournament(struct oshmem_group_t *group, long *pSync);
+static int __algorithm_recursive_doubling(struct oshmem_group_t *group,
+                                          long *pSync);
+static int __algorithm_dissemination(struct oshmem_group_t *group, long *pSync);
+static int __algorithm_basic(struct oshmem_group_t *group, long *pSync);
+static int __algorithm_adaptive(struct oshmem_group_t *group, long *pSync);
+
+int mca_scoll_basic_barrier(struct oshmem_group_t *group, long *pSync, int alg)
+{
+    int rc = OSHMEM_SUCCESS;
+
+    /* Arguments validation */
+    if (!group) {
+        SCOLL_ERROR("Active set (group) of PE is not defined");
+        rc = OSHMEM_ERR_BAD_PARAM;
+    }
+
+    if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) {
+        if (pSync) {
+            alg = (alg == SCOLL_DEFAULT_ALG ?
+                    mca_scoll_basic_param_barrier_algorithm : alg);
+            switch (alg) {
+            case SCOLL_ALG_BARRIER_CENTRAL_COUNTER:
+                {
+                    rc = __algorithm_central_counter(group, pSync);
+                    break;
+                }
+            case SCOLL_ALG_BARRIER_TOURNAMENT:
+                {
+                    rc = __algorithm_tournament(group, pSync);
+                    break;
+                }
+            case SCOLL_ALG_BARRIER_RECURSIVE_DOUBLING:
+                {
+                    rc = __algorithm_recursive_doubling(group, pSync);
+                    break;
+                }
+            case SCOLL_ALG_BARRIER_DISSEMINATION:
+                {
+                    rc = __algorithm_dissemination(group, pSync);
+                    break;
+                }
+            case SCOLL_ALG_BARRIER_BASIC:
+                {
+                    rc = __algorithm_basic(group, pSync);
+                    break;
+                }
+            case SCOLL_ALG_BARRIER_ADAPTIVE:
+                {
+                    rc = __algorithm_adaptive(group, pSync);
+                    break;
+                }
+            default:
+                {
+                    rc = __algorithm_recursive_doubling(group, pSync);
+                }
+            }
+        } else {
+            SCOLL_ERROR("Incorrect argument pSync");
+            rc = OSHMEM_ERR_BAD_PARAM;
+        }
+    }
+
+    return rc;
+}
+
+/*
+   This algorithm is quite simple and straightforward. But because of its
+   obvious simplicity and its naive proof of correctness it is implemented
+   quite often. One node asks the peers whether they have reached the barrier;
+   when all processors are ready, it signals them to go ahead.
+   Outlay:
+   NP-1 competing network transfers are needed to implement the counter.
+   The memory usage is constant (1 byte) per node.
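+
+   Sketch of the exchange (illustrative only; the code below is authoritative):
+
+     every PE:  pSync[0] = SHMEM_SYNC_WAIT;
+     root:      for each peer PE: get() its pSync[0] until SHMEM_SYNC_WAIT is
+                seen, then put() SHMEM_SYNC_RUN into every peer's pSync[0];
+     non-root:  wait until the local pSync[0] becomes SHMEM_SYNC_RUN;
+     every PE:  restore pSync[] to _SHMEM_SYNC_VALUE.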
+ */
+static int __algorithm_central_counter(struct oshmem_group_t *group,
+                                       long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    long value = SHMEM_SYNC_INIT;
+    int root_id = 0;
+    int PE_root = oshmem_proc_pe(group->proc_array[root_id]);
+    int i = 0;
+
+    SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Central Counter", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    /* Set current state as WAIT */
+    pSync[0] = SHMEM_SYNC_WAIT;
+
+    /* Root process synchronization */
+    if (PE_root == group->my_pe) {
+        int pe_cur = 0;
+        long wait_pe_count = 0;
+        int* wait_pe_array = NULL;
+
+        wait_pe_array = malloc(sizeof(*wait_pe_array) * group->proc_count);
+        if (wait_pe_array) {
+            SCOLL_VERBOSE(14, "[#%d] PE is the root", group->my_pe);
+
+            wait_pe_count = group->proc_count;
+            for (i = 0; i < group->proc_count; i++) {
+                wait_pe_array[i] = oshmem_proc_pe(group->proc_array[i]);
+            }
+            wait_pe_array[root_id] = OSHMEM_PE_INVALID;
+            wait_pe_count--;
+
+            while (wait_pe_count) {
+                for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS);
+                        i++) {
+                    pe_cur = wait_pe_array[i];
+                    if (pe_cur != OSHMEM_PE_INVALID) {
+                        rc = MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, pe_cur));
+                        if ((rc == OSHMEM_SUCCESS)
+                                && (value == SHMEM_SYNC_WAIT)) {
+                            wait_pe_array[i] = OSHMEM_PE_INVALID;
+                            wait_pe_count--;
+                            SCOLL_VERBOSE(14,
+                                          "[#%d] PE#%d is ready (wait list counter: %d)",
+                                          group->my_pe, pe_cur, (int)wait_pe_count);
+                        }
+                    }
+                }
+            }
+
+            SCOLL_VERBOSE(14, "[#%d] PE signals to all", group->my_pe);
+            value = SHMEM_SYNC_RUN;
+            for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS);
+                    i++) {
+                pe_cur = oshmem_proc_pe(group->proc_array[i]);
+                if (pe_cur != PE_root) {
+                    rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, pe_cur));
+                }
+            }
+
+            free(wait_pe_array);
+        } else {
+            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
+        }
+
+        /* Possibly this is unnecessary...
+           But imagine the scenario when you have 2 sequential barriers and the root PE is the fastest one.
+           The root could leave the first barrier, and in the second barrier it could read a SHMEM_SYNC_WAIT value on
+           a remote node before that node receives its SHMEM_SYNC_RUN value from the first barrier.
+         */
+        /* TODO: actually it must be quiet */
+        MCA_SPML_CALL(fence());
+    }
+    /* Wait for RUN signal */
+    else {
+        SCOLL_VERBOSE(14,
+                      "[#%d] PE waits for a signal from root",
+                      group->my_pe);
+
+        value = SHMEM_SYNC_RUN;
+        rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
+    }
+
+    /* Restore initial values */
+    SCOLL_VERBOSE(12,
+                  "[#%d] Restore special synchronization array",
+                  group->my_pe);
+    for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) {
+        pSync[i] = _SHMEM_SYNC_VALUE;
+    }
+
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    return rc;
+}
+
+/*
+ The Tournament Barrier, proposed by Hensgen, Finkel and Manber, is mostly suitable for shared-memory
+ multiprocessors because it benefits from several caching mechanisms.
+ The algorithm is similar to a tournament game. In each round two
+ nodes play against each other. The winner is known in advance and waits until the loser arrives. The
+ winners play against each other in the next round. The overall winner (the champion) notifies all others
+ about the end of the barrier.
+ Outlay:
+ The game scales with log2(NP) and uses 1 byte of memory.
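+
+ As a rough sketch only (simplified; my_id is the rank within the group),
+ each round pairs
+
+     peer_id = my_id ^ (1 << round);
+
+ the smaller id waits for its peer's arrival signal and advances to the
+ next round, while the larger id signals and then blocks until the
+ champion (id 0) broadcasts SHMEM_SYNC_RUN to everyone.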
+ */
+static int __algorithm_tournament(struct oshmem_group_t *group, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    int round = 0;
+    int exit_flag = group->proc_count - 1;
+    long value = SHMEM_SYNC_INIT;
+    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
+    int peer_id = 0;
+    int peer_pe = 0;
+    int i = 0;
+
+    SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Tournament", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    /* Set current state as WAIT */
+    pSync[0] = SHMEM_SYNC_WAIT;
+
+    while (exit_flag && (rc == OSHMEM_SUCCESS)) {
+        /* Define a peer for competition */
+        peer_id = my_id ^ (1 << round);
+
+        /* Update exit condition and round counter */
+        exit_flag >>= 1;
+        round++;
+
+        /* Do not have a peer for this round of the tournament */
+        if (peer_id >= group->proc_count)
+            continue;
+
+        if (my_id < peer_id) {
+            pSync[0] = peer_id;
+            value = my_id;
+
+            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
+        } else {
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+#if 1 /* It is an ugly implementation of the compare-and-swap operation.
+         Usage of this hack does not give a performance improvement, but
+         it is expected that shmem_long_cswap() will make it faster.
+       */
+            do {
+                MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+            } while (value != my_id);
+
+            SCOLL_VERBOSE(14,
+                          "[#%d] round = %d signals to #%d",
+                          group->my_pe, round, peer_pe);
+            value = peer_id;
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+#else
+            SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe);
+            do
+            {
+                rc = MCA_ATOMIC_CALL(cswap((void*)pSync, (void*)&value, (const void*)&my_id, (const void*)&peer_id, sizeof(value), peer_pe));
+            } while (value != my_id);
+#endif
+            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
+            value = SHMEM_SYNC_RUN;
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
+
+            break;
+        }
+    }
+
+    /* Restore initial values */
+    SCOLL_VERBOSE(12,
+                  "[#%d] Restore special synchronization array",
+                  group->my_pe);
+    for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) {
+        pSync[i] = _SHMEM_SYNC_VALUE;
+    }
+
+    /* Send result to all PEs in the group */
+    if ((my_id == 0) && (rc == OSHMEM_SUCCESS)) {
+        SCOLL_VERBOSE(14, "[#%d] signals to all", group->my_pe);
+
+        value = SHMEM_SYNC_RUN;
+        for (peer_id = 1;
+                (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS);
+                peer_id++) {
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+        }
+    }
+
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    return rc;
+}
+
+/*
+ Pairwise Exchange With Recursive Doubling.
+ Rinka Gupta, Vinod Tipparaju, Jarek Nieplocha, and Dhabaleswar Panda. Efficient Barrier
+ using Remote Memory Operations on VIA-Based Clusters. In 2002 IEEE International
+ Conference on Cluster Computing (CLUSTER 2002), page 83. IEEE Computer Society, 2002.
+ Outlay:
+ The algorithm uses a maximum of log2(NP) + 2 network writes and P bytes of memory per node.
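+
+ Sketch only (simplified): with floor2_proc the largest power of two not
+ exceeding NP, ranks >= floor2_proc first signal their partner
+ (my_id - floor2_proc) and wait to be released; the remaining floor2_proc
+ ranks run log2(floor2_proc) pairwise rounds with partner
+ my_id ^ (1 << round) and finally release the extra ranks.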
+ */ +static int __algorithm_recursive_doubling(struct oshmem_group_t *group, + long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int floor2_proc = 0; + int exit_flag = 0; + long value = SHMEM_SYNC_INIT; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int i = 0; + + floor2_proc = 1; + i = group->proc_count; + i >>= 1; + while (i) { + i >>= 1; + floor2_proc <<= 1; + } + + SCOLL_VERBOSE(12, + "[#%d] Barrier algorithm: Recursive Doubling", + group->my_pe); + SCOLL_VERBOSE(15, + "[#%d] pSync[0] = %ld floor2_proc = %d", + group->my_pe, pSync[0], floor2_proc); + + if (my_id >= floor2_proc) { + /* I am in extra group, my partner is node (my_id-y) in basic group */ + peer_id = my_id - floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, + "[#%d] is extra and signal to #%d", + group->my_pe, peer_pe); + value = SHMEM_SYNC_WAIT; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + + SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + /* Restore initial values */ + SCOLL_VERBOSE(12, + "[#%d] Restore special synchronization array", + group->my_pe); + for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) { + pSync[i] = _SHMEM_SYNC_VALUE; + } + } else { + /* Wait for a peer from extra group */ + if ((group->proc_count - floor2_proc) > my_id) { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, + "[#%d] wait a signal from #%d", + group->my_pe, peer_pe); + value = SHMEM_SYNC_WAIT; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } + + /* Pairwise exchange */ + exit_flag = floor2_proc - 1; + pSync[0] = round; + while (exit_flag && (rc == OSHMEM_SUCCESS)) { + /* Define a peer for competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. 
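+      In other words, the loop below emulates cswap by polling the remote
+      flag with get() until it holds the expected value, and only then
+      put()s the new one; that is, a non-atomic read-check-write.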
+ */
+            do {
+                MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+            } while (value != (round - 1));
+
+            SCOLL_VERBOSE(14,
+                          "[#%d] round = %d signals to #%d",
+                          group->my_pe, round, peer_pe);
+            value = round;
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+#else
+            SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe);
+            {
+                long cond = round - 1;
+                do
+                {
+                    rc = MCA_ATOMIC_CALL(cswap((void*)pSync, (void*)&value, (const void*)&cond, (const void*)&round, sizeof(value), peer_pe));
+                } while (value != (round - 1));
+            }
+#endif
+
+            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
+            value = round;
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_GE, (void*)&value, SHMEM_LONG));
+        }
+
+        /* Restore initial values */
+        SCOLL_VERBOSE(12,
+                      "[#%d] Restore special synchronization array",
+                      group->my_pe);
+        for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) {
+            pSync[i] = _SHMEM_SYNC_VALUE;
+        }
+
+        /* Notify a peer from the extra group */
+        if ((group->proc_count - floor2_proc) > my_id) {
+            /* I am in the basic group, my partner is node (my_id+y) in the extra group */
+            peer_id = my_id + floor2_proc;
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+            SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe);
+            value = SHMEM_SYNC_RUN;
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+        }
+    }
+
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    return rc;
+}
+
+/*
+ The Dissemination Barrier, introduced by Hensgen, Finkel and Manber in 1988.
+ The algorithm is mostly an improvement of the Butterfly Barrier for non-power-of-two processor counts.
+ It uses the same pairwise synchronization but with other partners.
+ Outlay:
+ The game scales with log2(NP) and uses 1 byte of memory.
+ */
+static int __algorithm_dissemination(struct oshmem_group_t *group, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    int round = 0;
+    int log2_proc = 0;
+    long value = SHMEM_SYNC_INIT;
+    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
+    int peer_id = 0;
+    int peer_pe = 0;
+    int i = 0;
+
+    log2_proc = scoll_log2((unsigned long) group->proc_count);
+
+    SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Dissemination", group->my_pe);
+    SCOLL_VERBOSE(15,
+                  "[#%d] pSync[0] = %ld log2_proc = %d",
+                  group->my_pe, pSync[0], log2_proc);
+
+    pSync[0] = round;
+    for (round = 0; (round <= log2_proc) && (rc == OSHMEM_SUCCESS); round++) {
+        /* Define a peer to send a signal to */
+        peer_id = (my_id + (1 << round)) % group->proc_count;
+
+        peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+#if 1 /* It is an ugly implementation of the compare-and-swap operation.
+         Usage of this hack does not give a performance improvement, but
+         it is expected that shmem_long_cswap() will make it faster.
+ */ + do { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != round); + + SCOLL_VERBOSE(14, + "[#%d] round = %d signals to #%d", + group->my_pe, round, peer_pe); + value = round + 1; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#endif + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = round + 1; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_GE, (void*)&value, SHMEM_LONG)); + } + + /* Restore initial values */ + SCOLL_VERBOSE(12, + "[#%d] Restore special synchronization array", + group->my_pe); + for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) { + pSync[i] = _SHMEM_SYNC_VALUE; + } + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + +static int __algorithm_basic(struct oshmem_group_t *group, long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int root_id = 0; + int PE_root = oshmem_proc_pe(group->proc_array[root_id]); + int i = 0; + + SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Basic", group->my_pe); + + if (PE_root != group->my_pe) { + rc = MCA_SPML_CALL(send(NULL, 0, PE_root, MCA_SPML_BASE_PUT_STANDARD)); + if (OSHMEM_SUCCESS != rc) { + return rc; + } + + rc = MCA_SPML_CALL(recv(NULL, 0, PE_root)); + if (OSHMEM_SUCCESS != rc) { + return rc; + } + } + + /* The root collects and broadcasts the messages. */ + + else { + int pe_cur = 0; + + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) { + pe_cur = oshmem_proc_pe(group->proc_array[i]); + if (pe_cur != PE_root) { + rc = MCA_SPML_CALL(recv(NULL, 0, SHMEM_ANY_SOURCE)); + } + if (OSHMEM_SUCCESS != rc) { + return rc; + } + } + + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) { + pe_cur = oshmem_proc_pe(group->proc_array[i]); + if (pe_cur != PE_root) { + rc = MCA_SPML_CALL(send(NULL, 0, pe_cur, MCA_SPML_BASE_PUT_STANDARD)); + } + if (OSHMEM_SUCCESS != rc) { + return rc; + } + } + } + + return rc; +} + +static int __algorithm_adaptive(struct oshmem_group_t *group, long *pSync) +{ + int rc = OSHMEM_SUCCESS; + bool local_peers_only = true; + + SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Adaptive", group->my_pe); + + /* check if we have only local peers */ + { + int i = 0; + + for (i = 0; i < group->proc_count; i++) { + if (i == group->id) + continue; + + if (!OPAL_PROC_ON_LOCAL_NODE(group->proc_array[i]->proc_flags)) { + local_peers_only = false; + break; + } + } + } + + /* Select algorithm we use: + * use send/recv way for group in the same node and for np < 32 + * otherwise use put/get way + */ + if (local_peers_only || (group->proc_count < 32)) { + rc = __algorithm_basic(group, pSync); + } else { + rc = __algorithm_recursive_doubling(group, pSync); + } + + return rc; +} diff --git a/oshmem/mca/scoll/basic/scoll_basic_broadcast.c b/oshmem/mca/scoll/basic/scoll_basic_broadcast.c new file mode 100644 index 0000000000..b93d63eabe --- /dev/null +++ b/oshmem/mca/scoll/basic/scoll_basic_broadcast.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include + +#include "orte/mca/grpcomm/grpcomm.h" + +#include "opal/util/bit_ops.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" +#include "scoll_basic.h" + +static int __algorithm_central_counter(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync); +static int __algorithm_binomial_tree(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync); + +int mca_scoll_basic_broadcast(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync, + int alg) +{ + int rc = OSHMEM_SUCCESS; + + /* Arguments validation */ + if (!group) { + SCOLL_ERROR("Active set (group) of PE is not defined"); + rc = OSHMEM_ERR_BAD_PARAM; + } + + /* Check if this PE is part of the group */ + if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) { + int i = 0; + + if (pSync) { + alg = (alg == SCOLL_DEFAULT_ALG ? + mca_scoll_basic_param_broadcast_algorithm : alg); + switch (alg) { + case SCOLL_ALG_BROADCAST_CENTRAL_COUNTER: + { + rc = __algorithm_central_counter(group, + PE_root, + target, + source, + nlong, + pSync); + break; + } + case SCOLL_ALG_BROADCAST_BINOMIAL: + { + rc = __algorithm_binomial_tree(group, + PE_root, + target, + source, + nlong, + pSync); + break; + } + default: + { + rc = __algorithm_binomial_tree(group, + PE_root, + target, + source, + nlong, + pSync); + } + } + } else { + SCOLL_ERROR("Incorrect argument pSync"); + rc = OSHMEM_ERR_BAD_PARAM; + } + + /* Restore initial values */ + SCOLL_VERBOSE(12, + "[#%d] Restore special synchronization array", + group->my_pe); + for (i = 0; pSync && (i < _SHMEM_BCAST_SYNC_SIZE); i++) { + pSync[i] = _SHMEM_SYNC_VALUE; + } + } + + return rc; +} + +/* + This algorithm is quite simple and straightforward. But because of it’s obvious simplicity and + the naive prove for correctness it is implemented quite often. The root send data to all. + Outlay: + NP-1 competing network transfers are needed to implement the counter + The memory usage is constant (1 byte) per node. + */ +static int __algorithm_central_counter(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int i = 0; + + SCOLL_VERBOSE(12, + "[#%d] Broadcast algorithm: Central Counter", + group->my_pe); + SCOLL_VERBOSE(15, + "[#%d] pSync[0] = %ld root = #%d", + group->my_pe, pSync[0], PE_root); + + /* Check if this PE is the root */ + if (PE_root == group->my_pe) { + int pe_cur = 0; + + SCOLL_VERBOSE(14, + "[#%d] send data to all PE in the group", + group->my_pe); + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) { + pe_cur = oshmem_proc_pe(group->proc_array[i]); + if (pe_cur != PE_root) { + SCOLL_VERBOSE(15, + "[#%d] send data to #%d", + group->my_pe, pe_cur); + rc = MCA_SPML_CALL(put(target, nlong, (void *)source, pe_cur)); + } + } + } + + /* Wait for operation completion to set needed size */ + if (rc == OSHMEM_SUCCESS) { + SCOLL_VERBOSE(14, "[#%d] Wait for operation completion", group->my_pe); + rc = group->g_scoll.scoll_barrier(group, + (pSync + 1), + SCOLL_DEFAULT_ALG); + } + + return rc; +} + +/* + The Binomial Spanning Tree algorithm. 
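+ Sketch only (simplified): ranks are renumbered so that the root becomes
+ virtual rank 0; a process with virtual rank vrank receives the data once
+ from its parent and then forwards it to each child
+
+     child = vrank | (1 << i)   for every bit i above vrank's highest set bit,
+
+ so the broadcast completes in at most log2(NP) steps.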
+ Outlay: + The game scales with log2(NP) and uses 1 byte of memory. + */ +static int __algorithm_binomial_tree(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync) +{ + int rc = OSHMEM_SUCCESS; + long value = SHMEM_SYNC_INIT; + int root_id = oshmem_proc_group_find_id(group, PE_root); + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int vrank; + int dim = opal_cube_dim(group->proc_count); + int hibit; + int mask; + int i = 0; + + SCOLL_VERBOSE(12, "[#%d] Broadcast algorithm: Tree", group->my_pe); + SCOLL_VERBOSE(15, + "[#%d] pSync[0] = %ld root = #%d", + group->my_pe, pSync[0], PE_root); + + vrank = (my_id + group->proc_count - root_id) % group->proc_count; + hibit = opal_hibit(vrank, dim); + + SCOLL_VERBOSE(15, + "[#%d] dim = %d vrank = %d hibit = %d", + group->my_pe, dim, vrank, hibit); + + dim--; + + pSync[0] = SHMEM_SYNC_READY; + /* Receive data from parent in the tree. */ + if (vrank > 0) { + value = SHMEM_SYNC_READY; + + SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe); + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG)); + while ((value = pSync[0]) < 0) { + SCOLL_VERBOSE(14, + "[#%d] Broadcast size is a negative value (%li)\n", + group->my_pe, pSync[0]); + MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG)); + } + if (OSHMEM_SUCCESS != rc) { + return rc; + } + nlong = (size_t) pSync[0]; + } + + /* Send data to the children. */ + for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) { + peer_id = vrank | mask; + + if (peer_id < group->proc_count) { + /* Wait for the child to be ready to receive (pSync must have the initial value) */ + peer_id = (peer_id + root_id) % group->proc_count; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, + "[#%d] check remote pe is ready to receive #%d", + group->my_pe, peer_pe); + do { + rc = MCA_SPML_CALL(get((void*)pSync, sizeof(long), (void*)pSync, peer_pe)); + } while ((OSHMEM_SUCCESS == rc) && (pSync[0] != SHMEM_SYNC_READY)); + + SCOLL_VERBOSE(14, "[#%d] send data to #%d", group->my_pe, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, (my_id == root_id ? (void *)source : target), peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe); + value = nlong; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + if (OSHMEM_SUCCESS != rc) { + break; + } + } + } + + return rc; +} diff --git a/oshmem/mca/scoll/basic/scoll_basic_collect.c b/oshmem/mca/scoll/basic/scoll_basic_collect.c new file mode 100644 index 0000000000..148546d297 --- /dev/null +++ b/oshmem/mca/scoll/basic/scoll_basic_collect.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include + +#include "orte/mca/grpcomm/grpcomm.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" +#include "scoll_basic.h" + +static int __algorithm_central_collector(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync); +static int __algorithm_f_central_counter(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync); +static int __algorithm_f_tournament(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync); +static int __algorithm_f_recursive_doubling(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync); +static int __algorithm_f_ring(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync); + +int mca_scoll_basic_collect(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync, + bool nlong_type, + int alg) +{ + int rc = OSHMEM_SUCCESS; + + /* Arguments validation */ + if (!group) { + SCOLL_ERROR("Active set (group) of PE is not defined"); + rc = OSHMEM_ERR_BAD_PARAM; + } + + /* Check if this PE is part of the group */ + if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) { + int i = 0; + + if (nlong_type) { + alg = (alg == SCOLL_DEFAULT_ALG ? + mca_scoll_basic_param_collect_algorithm : alg); + switch (alg) { + case SCOLL_ALG_COLLECT_CENTRAL_COUNTER: + { + rc = __algorithm_f_central_counter(group, + target, + source, + nlong, + pSync); + break; + } + case SCOLL_ALG_COLLECT_TOURNAMENT: + { + rc = __algorithm_f_tournament(group, + target, + source, + nlong, + pSync); + break; + } + case SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING: + { + rc = __algorithm_f_recursive_doubling(group, + target, + source, + nlong, + pSync); + break; + } + case SCOLL_ALG_COLLECT_RING: + { + rc = __algorithm_f_ring(group, + target, + source, + nlong, + pSync); + break; + } + default: + { + rc = __algorithm_f_central_counter(group, + target, + source, + nlong, + pSync); + } + } + } else { + rc = __algorithm_central_collector(group, + target, + source, + nlong, + pSync); + } + + /* Restore initial values */ + SCOLL_VERBOSE(12, + "[#%d] Restore special synchronization array", + group->my_pe); + for (i = 0; pSync && (i < _SHMEM_COLLECT_SYNC_SIZE); i++) { + pSync[i] = _SHMEM_SYNC_VALUE; + } + } + + return rc; +} + +/* + This algorithm is quite simple and straightforward for PEs with identical data size. + One node gathers data from peers and send final result to them. + Outlay: + NP-1 competing network transfers are needed. 
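+
+ Sketch only (simplified): the root copies its own block into target,
+ get()s block i from peer i into (target + i * nlong), and finally
+ broadcasts the assembled NP * nlong bytes back to the whole group.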
+ */ +static int __algorithm_f_central_counter(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int i = 0; + int PE_root = oshmem_proc_pe(group->proc_array[0]); + + SCOLL_VERBOSE(12, + "[#%d] Collect algorithm: Central Counter (identical size)", + group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + if (PE_root == group->my_pe) { + int pe_cur = 0; + + memcpy((void*) ((unsigned char*) target + 0 * nlong), + (void *) source, + nlong); + + SCOLL_VERBOSE(14, + "[#%d] Gather data from all PEs in the group", + group->my_pe); + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) { + /* Get PE ID of a peer from the group */ + pe_cur = oshmem_proc_pe(group->proc_array[i]); + + if (pe_cur == group->my_pe) + continue; + + SCOLL_VERBOSE(14, + "[#%d] Gather data (%d bytes) from #%d", + group->my_pe, (int)nlong, pe_cur); + + /* Get data from the current peer */ + rc = MCA_SPML_CALL(get((void *)source, nlong, (void*)((unsigned char*)target + i * nlong), pe_cur)); + } + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) { + SCOLL_VERBOSE(14, + "[#%d] Broadcast from the root #%d", + group->my_pe, PE_root); + rc = group->g_scoll.scoll_broadcast(group, + PE_root, + target, + target, + group->proc_count * nlong, + (pSync + 1), + SCOLL_DEFAULT_ALG); + } + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + +static int __algorithm_f_tournament(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int exit_flag = group->proc_count - 1; + long value = SHMEM_SYNC_INIT; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int PE_root = oshmem_proc_pe(group->proc_array[0]); + + SCOLL_VERBOSE(12, + "[#%d] Collect algorithm: Tournament (identical size)", + group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + /* Set current state as WAIT */ + pSync[0] = SHMEM_SYNC_WAIT; + + /* Copy data to itself */ + memcpy((void*) ((unsigned char*) target + my_id * nlong), + (void *) source, + nlong); + + while (exit_flag && (rc == OSHMEM_SUCCESS)) { + /* Define a peer for competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + /* Do not have peer for tournament */ + if (peer_id >= group->proc_count) + continue; + + if (my_id < peer_id) { + pSync[0] = peer_id; + value = my_id; + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } else { + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. 
+ */ + do { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != my_id); + + SCOLL_VERBOSE(14, + "[#%d] round = %d send data to #%d", + group->my_pe, round, peer_pe); + rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + my_id * nlong), (1 << (round - 1)) * nlong, (void*)((unsigned char*)target + my_id * nlong), peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, + "[#%d] round = %d signals to #%d", + group->my_pe, round, peer_pe); + value = peer_id; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#endif + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + break; + } + } + + /* Send result to all PE in group */ + if ((my_id == 0) && (rc == OSHMEM_SUCCESS)) { + SCOLL_VERBOSE(14, "[#%d] signals to all", group->my_pe); + + value = SHMEM_SYNC_RUN; + for (peer_id = 1; + (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS); + peer_id++) { + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) { + SCOLL_VERBOSE(14, + "[#%d] Broadcast from the root #%d", + group->my_pe, PE_root); + rc = group->g_scoll.scoll_broadcast(group, + PE_root, + target, + target, + group->proc_count * nlong, + (pSync + 1), + SCOLL_DEFAULT_ALG); + } + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + +static int __algorithm_f_ring(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int i = 0; + long value = SHMEM_SYNC_INIT; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int data_index = 0; + int peer_id = 0; + int peer_pe = 0; + + SCOLL_VERBOSE(12, + "[#%d] Collect algorithm: Ring (identical size)", + group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + peer_id = (my_id + 1) % group->proc_count; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + memcpy((void*) ((unsigned char*) target + my_id * nlong), + (void *) source, + nlong); + data_index = my_id; + + for (i = 0; (i < (group->proc_count - 1)) && (rc == OSHMEM_SUCCESS); i++) { + SCOLL_VERBOSE(14, + "[#%d] round = %d send data to #%d by index = %d", + group->my_pe, i, peer_pe, data_index); + rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + data_index * nlong), nlong, (void*)((unsigned char*)target + data_index * nlong), peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, + "[#%d] round = %d signals to #%d", + group->my_pe, i, peer_pe); + value = i; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + + data_index = (data_index ? 
(data_index - 1) : (group->proc_count - 1)); + + SCOLL_VERBOSE(14, + "[#%d] round = %d wait for data by index = %d", + group->my_pe, i, data_index); + if (i == 0) { + value = _SHMEM_SYNC_VALUE; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG)); + } else { + value = i; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_GE, (void*)&value, SHMEM_LONG)); + } + } + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + +static int __algorithm_f_recursive_doubling(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int floor2_proc = 0; + int exit_flag = 0; + long value = SHMEM_SYNC_INIT; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int data_index = 0; + int peer_id = 0; + int peer_pe = 0; + int i = 0; + + floor2_proc = 1; + i = group->proc_count; + i >>= 1; + while (i) { + i >>= 1; + floor2_proc <<= 1; + } + + SCOLL_VERBOSE(12, + "[#%d] Collect algorithm: Recursive Doubling (identical size)", + group->my_pe); + SCOLL_VERBOSE(15, + "[#%d] pSync[0] = %ld floor2_proc = %d", + group->my_pe, pSync[0], floor2_proc); + + memcpy((void*) ((unsigned char*) target + my_id * nlong), + (void *) source, + nlong); + data_index = my_id; + + if (my_id >= floor2_proc) { + int pe_cur = 0; + + /* I am in extra group, my partner is node (my_id-y) in basic group */ + peer_id = my_id - floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) { + if (i == my_id) + continue; + + pe_cur = oshmem_proc_pe(group->proc_array[i]); + + SCOLL_VERBOSE(14, + "[#%d] is extra send data to #%d", + group->my_pe, pe_cur); + rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + data_index * nlong), nlong, (void *)source, pe_cur)); + } + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, + "[#%d] is extra and signal to #%d", + group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + + SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } else { + /* Wait for a peer from extra group */ + if ((group->proc_count - floor2_proc) > my_id) { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, + "[#%d] wait a signal from #%d", + group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } + + /* Pairwise exchange */ + exit_flag = floor2_proc - 1; + pSync[0] = round; + while (exit_flag && (rc == OSHMEM_SUCCESS)) { + /* Define a peer for competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. 
+ */ + do { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != (round - 1)); + + SCOLL_VERBOSE(14, + "[#%d] round = %d send data to #%d by index = %d", + group->my_pe, round, peer_pe, data_index); + rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + data_index * nlong), (1 << (round - 1)) * nlong, (void*)((unsigned char*)target + data_index * nlong), peer_pe)); + + MCA_SPML_CALL(fence()); + + data_index = (my_id / (1 << round)) * (1 << round); + + SCOLL_VERBOSE(14, + "[#%d] round = %d signals to #%d", + group->my_pe, round, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#endif + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + pSync[0] = round; + } + + /* Notify a peer from extra group */ + if ((group->proc_count - floor2_proc) > my_id) { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, + "[#%d] is extra send data to #%d", + group->my_pe, peer_pe); + rc = MCA_SPML_CALL(put(target, group->proc_count * nlong, target, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + } + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + +/* + This algorithm is quite simple and straightforward. It allows to have different data size on PEs. + One node gathers data from peers and send final result to them. + Outlay: + NP-1 competing network transfers are needed. 
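+
+ Sketch only (simplified): every PE first publishes its contribution size
+ in pSync[0]; the root polls those sizes with get(), then fetches each
+ peer's data at the running offset and broadcasts the packed result.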
+ */ +static int __algorithm_central_collector(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync) +{ + int rc = OSHMEM_SUCCESS; + size_t offset = 0; + int i = 0; + int PE_root = oshmem_proc_pe(group->proc_array[0]); + + SCOLL_VERBOSE(12, + "[#%d] Collect algorithm: Central Counter (vary size)", + group->my_pe); + + /* Set own data size */ + pSync[0] = nlong; + + if (PE_root == group->my_pe) { + long value = 0; + int pe_cur = 0; + long wait_pe_count = 0; + size_t* wait_pe_array = NULL; + + wait_pe_count = group->proc_count; + wait_pe_array = malloc(sizeof(*wait_pe_array) * wait_pe_count); + if (wait_pe_array) { + memset((void*) wait_pe_array, + 0, + sizeof(*wait_pe_array) * wait_pe_count); + wait_pe_array[0] = nlong; + wait_pe_count--; + + while (wait_pe_count) { + SCOLL_VERBOSE(14, + "[#%d] Gather data size info from all PEs in the group", + group->my_pe); + for (i = 1; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); + i++) { + if (wait_pe_array[i] == 0) { + pe_cur = oshmem_proc_pe(group->proc_array[i]); + value = 0; + rc = MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, pe_cur)); + if ((rc == OSHMEM_SUCCESS) + && (value != _SHMEM_SYNC_VALUE) + && (value > 0)) { + wait_pe_array[i] = (size_t) value; + wait_pe_count--; + SCOLL_VERBOSE(14, + "Got source data size as %d from #%d (wait list counter: %d)", + (int)value, pe_cur, (int)wait_pe_count); + } + } + } + } + + memcpy((void*) ((unsigned char*) target + 0 * nlong), + (void *) source, + nlong); + offset += nlong; + + for (i = 1; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); + i++) { + /* Get PE ID of a peer from the group */ + pe_cur = oshmem_proc_pe(group->proc_array[i]); + + /* Get data from the current peer */ + rc = MCA_SPML_CALL(get((void *)source, wait_pe_array[i], (void*)((unsigned char*)target + offset), pe_cur)); + + SCOLL_VERBOSE(14, + "Got %d bytes of data from #%d (offset: %d)", + (int)wait_pe_array[i], pe_cur, (int)offset); + + offset += wait_pe_array[i]; + } + + free(wait_pe_array); + } else { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) { + SCOLL_VERBOSE(14, + "[#%d] Broadcast from the root #%d", + group->my_pe, PE_root); + rc = group->g_scoll.scoll_broadcast(group, + PE_root, + target, + target, + offset, + (pSync + 1), + SCOLL_DEFAULT_ALG); + } + + return rc; +} diff --git a/oshmem/mca/scoll/basic/scoll_basic_component.c b/oshmem/mca/scoll/basic/scoll_basic_component.c new file mode 100644 index 0000000000..78bf909109 --- /dev/null +++ b/oshmem/mca/scoll/basic/scoll_basic_component.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "scoll_basic.h"
+
+/*
+ * Public string showing the scoll basic component version number
+ */
+const char *mca_scoll_basic_component_version_string =
+    "Open SHMEM basic collective MCA component version " OSHMEM_VERSION;
+
+/*
+ * Global variables
+ */
+int mca_scoll_basic_priority_param = -1;
+int mca_scoll_basic_param_barrier_algorithm = SCOLL_ALG_BARRIER_ADAPTIVE;
+int mca_scoll_basic_param_broadcast_algorithm = SCOLL_ALG_BROADCAST_BINOMIAL;
+int mca_scoll_basic_param_collect_algorithm =
+    SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING;
+int mca_scoll_basic_param_reduce_algorithm = SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING;
+
+/*
+ * Local functions
+ */
+static int basic_register(void);
+static int basic_open(void);
+static int basic_close(void);
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+
+mca_scoll_base_component_t mca_scoll_basic_component = {
+
+    /* First, the mca_component_t struct containing meta information
+       about the component itself */
+
+    {
+        MCA_SCOLL_BASE_VERSION_2_0_0,
+
+        /* Component name and version */
+        "basic",
+        OSHMEM_MAJOR_VERSION,
+        OSHMEM_MINOR_VERSION,
+        OSHMEM_RELEASE_VERSION,
+
+        /* Component open and close functions */
+        basic_open,
+        basic_close,
+        NULL,
+        basic_register
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    /* Initialization / querying functions */
+
+    mca_scoll_basic_init,
+    mca_scoll_basic_query
+};
+
+static int basic_register(void)
+{
+    char help_msg[200];
+    mca_base_component_t *comp = &mca_scoll_basic_component.scoll_version;
+
+    mca_scoll_basic_priority_param = 75;
+    (void) mca_base_component_var_register(comp,
+                                           "priority",
+                                           "Priority of the scoll:basic component",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_basic_priority_param);
+
+    sprintf(help_msg,
+            "Algorithm selection for Barrier (%d - Central Counter, %d - Tournament, %d - Recursive Doubling, %d - Dissemination, %d - Basic, %d - Adaptive)",
+            SCOLL_ALG_BARRIER_CENTRAL_COUNTER,
+            SCOLL_ALG_BARRIER_TOURNAMENT,
+            SCOLL_ALG_BARRIER_RECURSIVE_DOUBLING,
+            SCOLL_ALG_BARRIER_DISSEMINATION,
+            SCOLL_ALG_BARRIER_BASIC,
+            SCOLL_ALG_BARRIER_ADAPTIVE);
+    (void) mca_base_component_var_register(comp,
+                                           "barrier_alg",
+                                           help_msg,
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_basic_param_barrier_algorithm);
+
+    sprintf(help_msg,
+            "Algorithm selection for Broadcast (%d - Central Counter, %d - Binomial)",
+            SCOLL_ALG_BROADCAST_CENTRAL_COUNTER,
+            SCOLL_ALG_BROADCAST_BINOMIAL);
+    (void) mca_base_component_var_register(comp,
+                                           "broadcast_alg",
+                                           help_msg,
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_basic_param_broadcast_algorithm);
+
+    sprintf(help_msg,
+            "Algorithm selection for Collect (%d - Central Counter, %d - Tournament, %d - Recursive Doubling, %d - Ring)",
+            SCOLL_ALG_COLLECT_CENTRAL_COUNTER,
+            SCOLL_ALG_COLLECT_TOURNAMENT,
+            SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING,
+            SCOLL_ALG_COLLECT_RING);
+    (void) mca_base_component_var_register(comp,
+                                           "collect_alg",
+                                           help_msg,
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_basic_param_collect_algorithm);
+
+    sprintf(help_msg,
+            "Algorithm selection for Reduce (%d - Central Counter, %d - Tournament, %d - Recursive Doubling, %d - Linear, %d - Log)",
+            SCOLL_ALG_REDUCE_CENTRAL_COUNTER,
+            SCOLL_ALG_REDUCE_TOURNAMENT,
+            SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING,
+            SCOLL_ALG_REDUCE_LEGACY_LINEAR,
+            SCOLL_ALG_REDUCE_LEGACY_LOG);
+    (void) mca_base_component_var_register(comp,
+                                           "reduce_alg",
+                                           help_msg,
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_basic_param_reduce_algorithm);
+
+    return OSHMEM_SUCCESS;
+}
+
+static int basic_open(void)
+{
+    return OSHMEM_SUCCESS;
+}
+
+static int basic_close(void)
+{
+    return OSHMEM_SUCCESS;
+}
+
+OBJ_CLASS_INSTANCE(mca_scoll_basic_module_t,
+                   mca_scoll_base_module_t,
+                   NULL,
+                   NULL);
diff --git a/oshmem/mca/scoll/basic/scoll_basic_module.c b/oshmem/mca/scoll/basic/scoll_basic_module.c
new file mode 100644
index 0000000000..4306026b01
--- /dev/null
+++ b/oshmem/mca/scoll/basic/scoll_basic_module.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <stdio.h>
+
+#include "oshmem_config.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "scoll_basic.h"
+
+/*
+ * Initial query function that is invoked during initialization, allowing
+ * this module to indicate what level of thread support it provides.
+ */
+int mca_scoll_basic_init(bool enable_progress_threads, bool enable_threads)
+{
+    /* Nothing to do */
+    return OSHMEM_SUCCESS;
+}
+
+/*
+ * Invoked when a new group has been created.
+ * Look at the group and decide which set of functions and
+ * priority we want to return.
+ */
+static int mca_scoll_basic_enable(mca_scoll_base_module_t *module,
+                                  struct oshmem_group_t *comm)
+{
+    /* Nothing to do here */
+    return OSHMEM_SUCCESS;
+}
+
+mca_scoll_base_module_t *
+mca_scoll_basic_query(struct oshmem_group_t *group, int *priority)
+{
+    mca_scoll_basic_module_t *module;
+
+    *priority = mca_scoll_basic_priority_param;
+
+    module = OBJ_NEW(mca_scoll_basic_module_t);
+    if (module) {
+        module->super.scoll_barrier = mca_scoll_basic_barrier;
+        module->super.scoll_broadcast = mca_scoll_basic_broadcast;
+        module->super.scoll_collect = mca_scoll_basic_collect;
+        module->super.scoll_reduce = mca_scoll_basic_reduce;
+        module->super.scoll_module_enable = mca_scoll_basic_enable;
+        return &(module->super);
+    }
+
+    return NULL;
+}
diff --git a/oshmem/mca/scoll/basic/scoll_basic_reduce.c b/oshmem/mca/scoll/basic/scoll_basic_reduce.c
new file mode 100644
index 0000000000..114ae873da
--- /dev/null
+++ b/oshmem/mca/scoll/basic/scoll_basic_reduce.c
@@ -0,0 +1,810 @@
+/*
+ * Copyright (c) 2013      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include + +#include "opal/util/bit_ops.h" + +#include "oshmem/constants.h" +#include "oshmem/op/op.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" +#include "scoll_basic.h" + +static int __algorithm_central_counter(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk); +static int __algorithm_tournament(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk); +static int __algorithm_recursive_doubling(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk); +static int __algorithm_linear(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk); +static int __algorithm_log(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk); + +int mca_scoll_basic_reduce(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk, + int alg) +{ + int rc = OSHMEM_SUCCESS; + + /* Arguments validation */ + if (!group) { + SCOLL_ERROR("Active set (group) of PE is not defined"); + rc = OSHMEM_ERR_BAD_PARAM; + } + + /* Check if this PE is part of the group */ + if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) { + int i = 0; + + if (pSync) { + alg = (alg == SCOLL_DEFAULT_ALG ? + mca_scoll_basic_param_reduce_algorithm : alg); + switch (alg) { + case SCOLL_ALG_REDUCE_CENTRAL_COUNTER: + { + rc = __algorithm_central_counter(group, + op, + target, + source, + nlong, + pSync, + pWrk); + break; + } + case SCOLL_ALG_REDUCE_TOURNAMENT: + { + rc = __algorithm_tournament(group, + op, + target, + source, + nlong, + pSync, + pWrk); + break; + } + case SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING: + { + rc = __algorithm_recursive_doubling(group, + op, + target, + source, + nlong, + pSync, + pWrk); + break; + } + case SCOLL_ALG_REDUCE_LEGACY_LINEAR: + { + rc = __algorithm_linear(group, + op, + target, + source, + nlong, + pSync, + pWrk); + break; + } + case SCOLL_ALG_REDUCE_LEGACY_LOG: + { + rc = __algorithm_log(group, + op, + target, + source, + nlong, + pSync, + pWrk); + break; + } + default: + { + rc = __algorithm_central_counter(group, + op, + target, + source, + nlong, + pSync, + pWrk); + } + } + } else { + SCOLL_ERROR("Incorrect argument pSync"); + rc = OSHMEM_ERR_BAD_PARAM; + } + + /* Restore initial values */ + SCOLL_VERBOSE(12, + "PE#%d Restore special synchronization array", + group->my_pe); + for (i = 0; pSync && (i < _SHMEM_REDUCE_SYNC_SIZE); i++) { + pSync[i] = _SHMEM_SYNC_VALUE; + } + } + + return rc; +} + +/* + This algorithm is quite simple and straightforward for PEs with identical data size. + One node gathers data from peers and send final result to them. + Outlay: + NP-1 competing network transfers are needed. 
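+
+ Sketch only (simplified): the root copies source into target, then for
+ each peer get()s that peer's source into a scratch buffer and folds it
+ in with op->o_func.c_fn(scratch, target, nlong / op->dt_size), finally
+ broadcasting the reduced result to the whole group.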
+ */ +static int __algorithm_central_counter(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int i = 0; + int PE_root = oshmem_proc_pe(group->proc_array[0]); + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Central Counter", group->my_pe); + + if (PE_root == group->my_pe) { + int pe_cur = 0; + void *target_cur = NULL; + + target_cur = malloc(nlong); + if (target_cur) { + memcpy(target, (void *) source, nlong); + + SCOLL_VERBOSE(14, + "[#%d] Gather data from all PEs in the group", + group->my_pe); + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); + i++) { + /* Get PE ID of a peer from the group */ + pe_cur = oshmem_proc_pe(group->proc_array[i]); + + if (pe_cur == group->my_pe) + continue; + + SCOLL_VERBOSE(14, + "[#%d] Gather data (%d bytes) from #%d", + group->my_pe, (int)nlong, pe_cur); + + /* Clean up temporary buffer */ + memset(target_cur, 0, nlong); + + /* Get data from the current peer */ + rc = MCA_SPML_CALL(get((void *)source, nlong, target_cur, pe_cur)); + + /* Do reduction operation */ + if (rc == OSHMEM_SUCCESS) { + op->o_func.c_fn(target_cur, target, nlong / op->dt_size); + } + } + + free(target_cur); + } else { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) { + SCOLL_VERBOSE(14, + "[#%d] Broadcast from the root #%d", + group->my_pe, PE_root); + rc = group->g_scoll.scoll_broadcast(group, + PE_root, + target, + target, + nlong, + (pSync + 1), + SCOLL_DEFAULT_ALG); + } + + return rc; +} + +static int __algorithm_tournament(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int exit_flag = group->proc_count - 1; + long value = SHMEM_SYNC_INIT; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + void *target_cur = NULL; + int PE_root = oshmem_proc_pe(group->proc_array[0]); + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Tournament", group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + /* Set current state as WAIT */ + pSync[0] = SHMEM_SYNC_WAIT; + + target_cur = malloc(nlong); + if (target_cur) { + memcpy(target_cur, (void *) source, nlong); + } else { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + while (exit_flag && (rc == OSHMEM_SUCCESS)) { + /* Define a peer for competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + /* Do not have peer for tournament */ + if (peer_id >= group->proc_count) + continue; + + if (my_id < peer_id) { + pSync[0] = peer_id; + value = my_id; + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + /* Do reduction operation */ + if (rc == OSHMEM_SUCCESS) { + op->o_func.c_fn(target, target_cur, nlong / op->dt_size); + } + } else { + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. 
+ */ + do { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != my_id); + + SCOLL_VERBOSE(14, + "[#%d] round = %d send data to #%d", + group->my_pe, round, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, + "[#%d] round = %d signals to #%d", + group->my_pe, round, peer_pe); + value = peer_id; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#endif + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + break; + } + } + + /* Send result to all PE in group */ + if ((my_id == 0) && (rc == OSHMEM_SUCCESS)) { + SCOLL_VERBOSE(14, "[#%d] signals to all", group->my_pe); + + memcpy(target, target_cur, nlong); + + value = SHMEM_SYNC_RUN; + for (peer_id = 1; + (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS); + peer_id++) { + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) { + SCOLL_VERBOSE(14, + "[#%d] Broadcast from the root #%d", + group->my_pe, PE_root); + rc = group->g_scoll.scoll_broadcast(group, + PE_root, + target, + target, + nlong, + (pSync + 1), + SCOLL_DEFAULT_ALG); + } + + free(target_cur); + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + +static int __algorithm_recursive_doubling(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int floor2_proc = 0; + int exit_flag = 0; + long value = SHMEM_SYNC_INIT; + void *target_cur = NULL; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int i = 0; + + floor2_proc = 1; + i = group->proc_count; + i >>= 1; + while (i) { + i >>= 1; + floor2_proc <<= 1; + } + + target_cur = malloc(nlong); + if (target_cur) { + memcpy(target_cur, (void *) source, nlong); + } else { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + SCOLL_VERBOSE(12, + "[#%d] Reduce algorithm: Recursive Doubling", + group->my_pe); + SCOLL_VERBOSE(15, + "[#%d] pSync[0] = %ld floor2_proc = %d", + group->my_pe, pSync[0], floor2_proc); + + if (my_id >= floor2_proc) { + /* I am in extra group, my partner is node (my_id-y) in basic group */ + peer_id = my_id - floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + /* Special procedure is needed in case target and source are the same */ + if (source == target) { + SCOLL_VERBOSE(14, + "[#%d] wait for peer #%d is ready", + group->my_pe, peer_pe); + value = SHMEM_SYNC_WAIT; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } + + SCOLL_VERBOSE(14, + "[#%d] is extra send data to #%d", + group->my_pe, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, + "[#%d] is extra and signal to #%d", + group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + + SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } else { + /* Wait for a peer from extra group */ + if ((group->proc_count - floor2_proc) > 
my_id) { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + /* Special procedure is needed in case target and source are the same */ + if (source == target) { + SCOLL_VERBOSE(14, + "[#%d] signal to #%d that I am ready", + group->my_pe, peer_pe); + value = SHMEM_SYNC_WAIT; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + + SCOLL_VERBOSE(14, + "[#%d] wait a signal from #%d", + group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + /* Do reduction operation */ + if (rc == OSHMEM_SUCCESS) { + op->o_func.c_fn(target, target_cur, nlong / op->dt_size); + } + } + + /* Pairwise exchange */ + exit_flag = floor2_proc - 1; + pSync[0] = round; + while (exit_flag && (rc == OSHMEM_SUCCESS)) { + /* Define a peer for competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. + */ + do { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != (round - 1)); + + SCOLL_VERBOSE(14, + "[#%d] round = %d send data to #%d", + group->my_pe, round, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, + "[#%d] round = %d signals to #%d", + group->my_pe, round, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#endif + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + /* Do reduction operation */ + if (rc == OSHMEM_SUCCESS) { + op->o_func.c_fn(target, target_cur, nlong / op->dt_size); + } + + pSync[0] = round; + } + + memcpy(target, target_cur, nlong); + + /* Notify a peer from extra group */ + if ((group->proc_count - floor2_proc) > my_id) { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, + "[#%d] is extra send data to #%d", + group->my_pe, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + } + + free(target_cur); + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + +static int __algorithm_linear(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int i, rank, size; + char *free_buffer = NULL; + char *pml_buffer = NULL; + char *inbuf; + int peer_id = 0; + int peer_pe = 0; + + /* Initialize */ + rank = group->my_pe; + size = group->proc_count; + int root_id = size - 1; + int root_pe = oshmem_proc_pe(group->proc_array[root_id]); + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Basic", group->my_pe); + + /* If not root, send data to the root. 
*/ + + if (rank != root_pe) { + rc = MCA_SPML_CALL(send((void*)source, nlong, root_pe, MCA_SPML_BASE_PUT_STANDARD)); + } else { + + /* for reducing buffer allocation lengths.... */ + + if (size > 1) { + free_buffer = (char*) malloc(nlong); + if (NULL == free_buffer) { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + pml_buffer = free_buffer; + } + + /* Initialize the receive buffer. */ + + if (root_id == (size - 1)) { + memcpy(target, (void *) source, nlong); + } else { + peer_id = size - 1; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + rc = MCA_SPML_CALL(recv(target, nlong, peer_pe)); + } + if (OSHMEM_SUCCESS != rc) { + if (NULL != free_buffer) { + free(free_buffer); + } + return rc; + } + + /* Loop receiving and calling reduction function (C or Fortran). */ + + for (i = size - 2; i >= 0; --i) { + if (root_id == i) { + inbuf = (char*) source; + } else { + peer_id = i; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + rc = MCA_SPML_CALL(recv(pml_buffer, nlong, peer_pe)); + if (OSHMEM_SUCCESS != rc) { + if (NULL != free_buffer) { + free(free_buffer); + } + return rc; + } + + inbuf = pml_buffer; + } + + /* Perform the reduction */ + op->o_func.c_fn(inbuf, target, nlong / op->dt_size); + } + + if (NULL != free_buffer) { + free(free_buffer); + } + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) { + SCOLL_VERBOSE(14, + "[#%d] Broadcast from the root #%d", + group->my_pe, root_pe); + rc = group->g_scoll.scoll_broadcast(group, + root_pe, + target, + target, + nlong, + (pSync + 1), + SCOLL_DEFAULT_ALG); + } + + /* All done */ + return rc; +} + +static int __algorithm_log(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int i, size, rank, vrank; + int mask; + void *sbuf = (void*) source; + void *rbuf = target; + char *free_buffer = NULL; + char *free_rbuf = NULL; + char *pml_buffer = NULL; + char *snd_buffer = NULL; + char *rcv_buffer = (char*) rbuf; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int root_id = 0; + int root_pe = oshmem_proc_pe(group->proc_array[root_id]); + int dim = 0; + + /* Initialize */ + rank = group->my_pe; + size = group->proc_count; + dim = opal_cube_dim(group->proc_count); + vrank = (my_id + size - root_id) % size; + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Log", rank); + + /* Allocate the incoming and resulting message buffers. See lengthy + * rationale above. */ + + free_buffer = (char*) malloc(nlong); + if (NULL == free_buffer) { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + pml_buffer = free_buffer; + rcv_buffer = pml_buffer; + + /* Allocate sendbuf in case the MPI_IN_PLACE option has been used. See lengthy + * rationale above. */ + + snd_buffer = (char*) sbuf; + + if (my_id != root_id && 0 == (vrank & 1)) { + /* root is the only one required to provide a valid rbuf. + * Assume rbuf is invalid for all other ranks, so fix it up + * here to be valid on all non-leaf ranks */ + free_rbuf = (char*) malloc(nlong); + if (NULL == free_rbuf) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + goto cleanup_and_return; + } + rbuf = free_rbuf; + } + + /* Loop over cube dimensions. High processes send to low ones in the + * dimension. */ + + for (i = 0, mask = 1; i < dim; ++i, mask <<= 1) { + + /* A high-proc sends to low-proc and stops. 
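 *
 * Worked example with assumed values size = 8, root_id = 0: a PE with
 * vrank = 5 (binary 101) hits mask = 1 in round 0, sends its buffer to
 * vrank 4 (5 & ~1) and leaves the loop; vrank 4 receives, reduces, and
 * continues to round 1 with mask = 2, eventually sending its partial
 * result down the cube toward vrank 0.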
*/ + if (vrank & mask) { + peer_id = vrank & ~mask; + peer_id = (peer_id + root_id) % size; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + rc = MCA_SPML_CALL(send((void*)snd_buffer, nlong, peer_pe, MCA_SPML_BASE_PUT_STANDARD)); + if (OSHMEM_SUCCESS != rc) { + goto cleanup_and_return; + } + snd_buffer = (char*) rbuf; + break; + } + + /* A low-proc receives, reduces, and moves to a higher + * dimension. */ + + else { + peer_id = vrank | mask; + if (peer_id >= size) { + continue; + } + peer_id = (peer_id + root_id) % size; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + /* Most of the time (all except the first one for commutative + * operations) we receive in the user provided buffer + * (rbuf). But the exception is here to allow us to dont have + * to copy from the sbuf to a temporary location. If the + * operation is commutative we dont care in which order we + * apply the operation, so for the first time we can receive + * the data in the pml_buffer and then apply to operation + * between this buffer and the user provided data. */ + + rc = MCA_SPML_CALL(recv(rcv_buffer, nlong, peer_pe)); + if (OSHMEM_SUCCESS != rc) { + goto cleanup_and_return; + } + /* Perform the operation. The target is always the user + * provided buffer We do the operation only if we receive it + * not in the user buffer */ + if (snd_buffer != sbuf) { + /* the target buffer is the locally allocated one */ + op->o_func.c_fn(rcv_buffer, pml_buffer, nlong / op->dt_size); + } else { + /* If we're commutative, we don't care about the order of + * operations and we can just reduce the operations now. + * If we are not commutative, we have to copy the send + * buffer into a temp buffer (pml_buffer) and then reduce + * what we just received against it. */ + { + op->o_func.c_fn(sbuf, pml_buffer, nlong / op->dt_size); + } + /* now we have to send the buffer containing the computed data */ + snd_buffer = pml_buffer; + /* starting from now we always receive in the user + * provided buffer */ + rcv_buffer = (char*) rbuf; + } + } + } + + /* Get the result to the root if needed. */ + rc = OSHMEM_SUCCESS; + if (0 == vrank) { + if (root_id == my_id) { + memcpy(rbuf, snd_buffer, nlong); + } else { + rc = MCA_SPML_CALL(send((void*)snd_buffer, nlong, root_pe, MCA_SPML_BASE_PUT_STANDARD)); + } + } else if (my_id == root_id) { + rc = MCA_SPML_CALL(recv(rcv_buffer, nlong, root_pe)); + if (rcv_buffer != rbuf) { + op->o_func.c_fn(rcv_buffer, rbuf, nlong / op->dt_size); + } + } + + cleanup_and_return: if (NULL != free_buffer) { + free(free_buffer); + } + if (NULL != free_rbuf) { + free(free_rbuf); + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) { + SCOLL_VERBOSE(14, + "[#%d] Broadcast from the root #%d", + rank, root_pe); + rc = group->g_scoll.scoll_broadcast(group, + root_pe, + target, + target, + nlong, + (pSync + 1), + SCOLL_DEFAULT_ALG); + } + + /* All done */ + return rc; +} diff --git a/oshmem/mca/scoll/fca/Makefile.am b/oshmem/mca/scoll/fca/Makefile.am new file mode 100644 index 0000000000..0d0b1c2aa1 --- /dev/null +++ b/oshmem/mca/scoll/fca/Makefile.am @@ -0,0 +1,38 @@ +# -*- shell-script -*- +# +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# +AM_CPPFLAGS = $(OSHMEM_CFLAGS) $(coll_fca_CPPFLAGS) -DCOLL_FCA_HOME=\"$(coll_fca_HOME)\" -I$(coll_fca_HOME)/include/fca -I$(coll_fca_HOME)/include/fca_core +scoll_fca_sources = \ + scoll_fca.h \ + scoll_fca_debug.h \ + scoll_fca_api.h \ + scoll_fca_module.c \ + scoll_fca_component.c \ + scoll_fca_ops.c +if MCA_BUILD_oshmem_scoll_fca_DSO +component_noinst = +component_install = mca_scoll_fca.la +else +component_noinst = libmca_scoll_fca.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_scoll_fca_la_SOURCES = $(scoll_fca_sources) +mca_scoll_fca_la_LIBADD = $(scoll_fca_LIBS) +mca_scoll_fca_la_LDFLAGS = -module -avoid-version $(scoll_fca_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_scoll_fca_la_SOURCES =$(scoll_fca_sources) +libmca_scoll_fca_la_LIBADD = $(scoll_fca_LIBS) +libmca_scoll_fca_la_LDFLAGS = -module -avoid-version $(scoll_fca_LDFLAGS) diff --git a/oshmem/mca/scoll/fca/configure.m4 b/oshmem/mca/scoll/fca/configure.m4 new file mode 100644 index 0000000000..d585547ab6 --- /dev/null +++ b/oshmem/mca/scoll/fca/configure.m4 @@ -0,0 +1,39 @@ +# -*- shell-script -*- +# +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +# MCA_oshmem_scoll_fca_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_oshmem_scoll_fca_CONFIG],[ + AC_CONFIG_FILES([oshmem/mca/scoll/fca/Makefile]) + + OMPI_CHECK_FCA([scoll_fca], + [scoll_fca_happy="yes"], + [scoll_fca_happy="no"]) + + AS_IF([test "$scoll_fca_happy" = "yes"], + [scoll_fca_WRAPPER_EXTRA_LDFLAGS="$scoll_fca_LDFLAGS" + scoll_fca_CPPFLAGS="$scoll_fca_CPPFLAGS" + scoll_fca_WRAPPER_EXTRA_CPPFLAGS="$scoll_fca_CPPFLAGS" + scoll_fca_WRAPPER_EXTRA_LIBS="$scoll_fca_LIBS" + $1], + [$2]) + + # substitute in the things needed to build fca + AC_SUBST([scoll_fca_CFLAGS]) + AC_SUBST([scoll_fca_CPPFLAGS]) + AC_SUBST([scoll_fca_LDFLAGS]) + AC_SUBST([scoll_fca_LIBS]) + AC_SUBST(scoll_fca_HOME, "$ompi_check_fca_dir") +])dnl + diff --git a/oshmem/mca/scoll/fca/configure.params b/oshmem/mca/scoll/fca/configure.params new file mode 100644 index 0000000000..1b6b5ba51c --- /dev/null +++ b/oshmem/mca/scoll/fca/configure.params @@ -0,0 +1,13 @@ +# -*- shell-script -*- +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/scoll/fca/scoll_fca.h b/oshmem/mca/scoll/fca/scoll_fca.h new file mode 100644 index 0000000000..f688c14b90 --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca.h @@ -0,0 +1,137 @@ +/** + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * */ + +#ifndef MCA_SCOLL_FCA_H +#define MCA_SCOLL_FCA_H +#include "oshmem_config.h" +#include "oshmem/constants.h" +#include "shmem.h" +#include "opal/mca/mca.h" +#include "oshmem/mca/scoll/scoll.h" +#include "scoll_fca_api.h" +#include "scoll_fca_debug.h" + +#ifdef OMPI_PROC_FLAG_LOCAL +#define FCA_IS_LOCAL_PROCESS(n) ((n) & OMPI_PROC_FLAG_LOCAL) +#else +#define FCA_IS_LOCAL_PROCESS(n) OPAL_PROC_ON_LOCAL_NODE(n) +#endif + +BEGIN_C_DECLS +struct mca_scoll_fca_component_t { + /** Base coll component */ + mca_scoll_base_component_1_0_0_t super; + + /** MCA parameter: Priority of this component */ + int fca_priority; + + /** MCA parameter: Verbose level of this component */ + int fca_verbose; + + /** MCA parameter: Path to fca spec file */ + char* fca_spec_file; + + /** MCA parameter: FCA device */ + char* fca_dev; + + /** MCA parameter: Enable FCA */ + int fca_enable; + + /** MCA parameter: Enable FCA Barrier */ + int fca_enable_barrier; + + /** MCA parameter: Enable FCA Bcast */ + int fca_enable_bcast; + + /** MCA parameter: Enable FCA Allreduce */ + int fca_enable_allreduce; + + /** MCA parameter: Enable FCA Allgather */ + int fca_enable_allgather; + + /** MCA parameter: Enable FCA Allgatherv */ + int fca_enable_allgatherv; + + /** MCA parameter: FCA NP */ + int fca_np; + + /* FCA global stuff */ + fca_t *fca_context; /* FCA context handle */ + + /*These vars are used as symmetric objects during __fca_comm_new. The proper amount of memory + is allocated only once during fca_comm_query*/ + int *ret; + int *rcounts; + void *my_info_exchangeable; + void *fca_comm_desc_exchangeable; +}; +typedef struct mca_scoll_fca_component_t mca_scoll_fca_component_t; + +OSHMEM_MODULE_DECLSPEC extern mca_scoll_fca_component_t mca_scoll_fca_component; + +struct mca_scoll_fca_module_t { + mca_scoll_base_module_t super; + struct oshmem_group_t *comm; + int rank; + int local_proc_idx; + int num_local_procs; + int *local_ranks; + fca_comm_t *fca_comm; + fca_comm_desc_t fca_comm_desc; + fca_comm_caps_t fca_comm_caps; + + /* Saved handlers - for fallback */ + mca_scoll_base_module_barrier_fn_t previous_barrier; + mca_scoll_base_module_t *previous_barrier_module; + mca_scoll_base_module_broadcast_fn_t previous_broadcast; + mca_scoll_base_module_t *previous_broadcast_module; + mca_scoll_base_module_collect_fn_t previous_collect; + mca_scoll_base_module_t *previous_collect_module; + mca_scoll_base_module_reduce_fn_t previous_reduce; + mca_scoll_base_module_t *previous_reduce_module; +}; +typedef struct mca_scoll_fca_module_t mca_scoll_fca_module_t; +OBJ_CLASS_DECLARATION(mca_scoll_fca_module_t); + +/* API functions */ +int mca_scoll_fca_init_query(bool enable_progress_threads, + bool enable_mpi_threads); +mca_scoll_base_module_t *mca_scoll_fca_comm_query(struct oshmem_group_t *comm, + int *priority); +int mca_scoll_fca_get_fca_lib(struct oshmem_group_t *comm); + +int mca_scoll_fca_barrier(struct oshmem_group_t *group, + long *pSync, + int algorithm_type); +int mca_scoll_fca_broadcast(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync, + int algorithm_type); +int mca_scoll_fca_collect(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync, + bool nlong_type, + int algorithm_type); +int mca_scoll_fca_reduce(struct oshmem_group_t *group, + struct oshmem_op_t *op, + void *target, + const void *source, + size_t nlong, + long *pSync, + void 
*pWrk, + int algorithm_type); +OBJ_CLASS_DECLARATION(mca_coll_fca_module_t); +END_C_DECLS +#endif diff --git a/oshmem/mca/scoll/fca/scoll_fca_api.h b/oshmem/mca/scoll/fca/scoll_fca_api.h new file mode 100644 index 0000000000..1d9f820a69 --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_api.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "oshmem_config.h" + +#include +#include +#include + +#ifndef FCA_API +#define OSHMEM_FCA_VERSION 12 +#else +#define OSHMEM_FCA_VERSION FCA_API +#endif + +/* + * * FCA API compatibility layer. + * * MPI build must define an FCA version macro. + * */ + +#define OSHMEM_FCA_BARRIER 1 +#define OSHMEM_FCA_BCAST 1 +#define OSHMEM_FCA_ALLREDUCE 1 + +#if OSHMEM_FCA_VERSION == 12 + +#define OSHMEM_FCA_ALLGATHER 0 +#define FCA_API_ABI_MAJOR 1 +#define FCA_API_ABI_MINOR 2 +#define FCA_MAJOR_BIT 24ul +#define FCA_MINOR_BIT 16ul +#define EUSESHMEM 287 + +static inline int mca_scoll_fca_comm_init(fca_t *fca_context, + int rank, + int comm_size, + int local_proc_idx, + int num_local_procs, + fca_comm_desc_t *comm_desc, + fca_comm_t **fca_comm) +{ + return fca_comm_init(fca_context, + local_proc_idx, + num_local_procs, + comm_size, + comm_desc, + fca_comm); +} +#elif OSHMEM_FCA_VERSION >= 20 + +#define OSHMEM_FCA_ALLGATHER 1 +#define OSHMEM_FCA_ALLGATHERV 1 + +#define OSHMEM_FCA_PROGRESS 1 +#define EUSESHMEM 287 + +static inline int mca_scoll_fca_comm_init(fca_t *fca_context, int rank, int comm_size, + int local_proc_idx, int num_local_procs, + fca_comm_desc_t *comm_desc, + fca_comm_t **fca_comm) +{ + fca_comm_init_spec_t spec; + + spec.rank = rank; + spec.size = comm_size; + spec.desc = *comm_desc; + spec.proc_idx = local_proc_idx; + spec.num_procs = num_local_procs; + return fca_comm_init(fca_context, &spec, fca_comm); +} +#else + +#error "FCA API version is unsupported" + +#endif diff --git a/oshmem/mca/scoll/fca/scoll_fca_component.c b/oshmem/mca/scoll/fca/scoll_fca_component.c new file mode 100644 index 0000000000..ab9e240fac --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_component.c @@ -0,0 +1,301 @@ +/** + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ * */
+#define _GNU_SOURCE
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "scoll_fca.h"
+
+#include "opal/runtime/opal_progress.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/memheap/memheap.h"
+/*
+ * * Public string showing the oshmem scoll fca component version number
+ * */
+const char *mca_scoll_fca_component_version_string =
+"Open SHMEM FCA collective MCA component version " OSHMEM_VERSION;
+
+/*
+ * * Global variable
+ * */
+int mca_scoll_fca_output = -1;
+
+/*
+ * * Instantiate the public struct with all of our public information
+ * * and pointers to our public functions in it
+ * */
+static int fca_open(void);
+static int fca_close(void);
+static int fca_register(void);
+
+mca_scoll_fca_component_t mca_scoll_fca_component = {
+
+    /* First, the mca_component_t struct containing meta information
+     * about the component itself */
+    {
+        {
+            MCA_SCOLL_BASE_VERSION_2_0_0,
+
+            /* Component name and version */
+            "fca",
+            OSHMEM_MAJOR_VERSION,
+            OSHMEM_MINOR_VERSION,
+            OSHMEM_RELEASE_VERSION,
+
+            /* Component open and close functions */
+            fca_open,
+            fca_close,
+            NULL,
+            fca_register
+        },
+        {
+            /* The component is not checkpoint ready */
+            MCA_BASE_METADATA_PARAM_NONE
+        },
+
+        /* Initialization / querying functions */
+
+        mca_scoll_fca_init_query,
+        mca_scoll_fca_comm_query,
+    }
+};
+
+#define FCA_API_CLEAR_MICRO(__x) ((__x>>FCA_MINOR_BIT)<<FCA_MINOR_BIT)
+
+static void mca_scoll_fca_progress_cb(void *arg)
+{
+    opal_progress();
+}
+
+static int mca_scoll_fca_mpi_progress_cb(void)
+{
+#ifdef OSHMEM_FCA_PROGRESS
+    if (!mca_scoll_fca_component.fca_context)
+        return 0;
+    fca_progress(mca_scoll_fca_component.fca_context);
+#endif
+    return 0;
+}
+
+int mca_scoll_fca_get_fca_lib(struct oshmem_group_t *comm)
+{
+    struct fca_init_spec *spec;
+    int ret;
+    unsigned long fca_ver, major, minor, detected_ver;
+    char x[3];
+
+    if (mca_scoll_fca_component.fca_context)
+        return OSHMEM_SUCCESS;
+
+    fca_ver = FCA_API_CLEAR_MICRO(fca_get_version());
+    major = (fca_ver >> FCA_MAJOR_BIT);
+    minor = (fca_ver >> FCA_MINOR_BIT) & 0xf;
+    sprintf(x, "%ld%ld", major, minor);
+    detected_ver = atol(x);
+
+    if (detected_ver != OSHMEM_FCA_VERSION) {
+        FCA_ERROR("Unsupported FCA version: %s, please update FCA to v%d, detected v%ld",
+                  fca_get_version_string(), OSHMEM_FCA_VERSION, fca_ver);
+        return OSHMEM_ERROR;
+    }
+
+    spec = fca_parse_spec_file(mca_scoll_fca_component.fca_spec_file);
+    if (!spec) {
+        FCA_ERROR("Failed to parse FCA spec file `%s'",
+                  mca_scoll_fca_component.fca_spec_file);
+        return OSHMEM_ERROR;
+    }
+    spec->job_id = oshmem_proc_local()->proc_name.jobid;
+    spec->rank_id = oshmem_proc_pe(oshmem_proc_local());
+    spec->progress.func = mca_scoll_fca_progress_cb;
+    spec->progress.arg = NULL;
+
+    ret = fca_init(spec, &mca_scoll_fca_component.fca_context);
+    if (ret < 0) {
+        FCA_ERROR("Failed to initialize FCA: %s", fca_strerror(ret));
+        return OSHMEM_ERROR;
+    }
+    fca_free_init_spec(spec);
+
+    opal_progress_register(mca_scoll_fca_mpi_progress_cb);
+    return OSHMEM_SUCCESS;
+}
+
+static void mca_scoll_fca_close_fca_lib(void)
+{
+    opal_progress_unregister(mca_scoll_fca_mpi_progress_cb);
+    fca_cleanup(mca_scoll_fca_component.fca_context);
+    mca_scoll_fca_component.fca_context = NULL;
+}
+
+static int fca_register(void)
+{
+    mca_base_component_t *c;
+
+    FCA_VERBOSE(2, "==>");
+
+    c = &mca_scoll_fca_component.super.scoll_version;
+
+    mca_scoll_fca_component.fca_priority = 80;
+    (void) mca_base_component_var_register(c,
+                                           "priority",
+                                           "Priority of the scoll:fca component",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_priority);
+
+    mca_scoll_fca_component.fca_verbose = 0;
+    (void) mca_base_component_var_register(c,
+                                           "verbose",
+                                           "Verbose level of the fca coll component",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_verbose);
+
+    mca_scoll_fca_component.fca_enable = 1;
+    (void) mca_base_component_var_register(c,
+                                           "enable",
+                                           "[1|0|] Enable/Disable Fabric Collective Accelerator",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_enable);
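+
+    /*
+     * Illustrative run-time overrides for the variables registered in this
+     * function, assuming the standard Open MPI MCA environment naming (an
+     * assumption for illustration, not shown in this patch):
+     *
+     *   OMPI_MCA_scoll_fca_enable=0      disable FCA collectives
+     *   OMPI_MCA_scoll_fca_priority=100  prefer FCA over other scoll components
+     */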
+
+    mca_scoll_fca_component.fca_spec_file = ""COLL_FCA_HOME"/etc/fca_mpi_spec.ini";
+    (void) mca_base_component_var_register(c,
+                                           "spec_file",
+                                           "Path to the FCA configuration file fca_mpi_spec.ini",
+                                           MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_spec_file);
+
+    mca_scoll_fca_component.fca_np = 64;
+    (void) mca_base_component_var_register(c,
+                                           "np",
+                                           "[integer] Minimal allowed job's NP to activate FCA",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_np);
+
+    mca_scoll_fca_component.fca_enable_barrier = OSHMEM_FCA_BARRIER;
+    (void) mca_base_component_var_register(c,
+                                           "enable_barrier",
+                                           "[1|0|] Enable/Disable FCA Barrier support",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_enable_barrier);
+
+    mca_scoll_fca_component.fca_enable_bcast = OSHMEM_FCA_BCAST;
+    (void) mca_base_component_var_register(c,
+                                           "enable_bcast",
+                                           "[1|0|] Enable/Disable FCA Bcast support",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_enable_bcast);
+
+    mca_scoll_fca_component.fca_enable_allreduce = OSHMEM_FCA_ALLREDUCE;
+    (void) mca_base_component_var_register(c,
+                                           "enable_allreduce",
+                                           "[1|0|] Enable/Disable FCA Allreduce support",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_enable_allreduce);
+
+    mca_scoll_fca_component.fca_enable_allgather = OSHMEM_FCA_ALLGATHER;
+    (void) mca_base_component_var_register(c,
+                                           "enable_allgather",
+                                           "[1|0|] Enable/Disable FCA Allgather support",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_enable_allgather);
+
+    mca_scoll_fca_component.fca_enable_allgatherv = OSHMEM_FCA_ALLGATHERV;
+    (void) mca_base_component_var_register(c,
+                                           "enable_allgatherv",
+                                           "[1|0|] Enable/Disable FCA Allgatherv support",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_scoll_fca_component.fca_enable_allgatherv);
+
+    return OSHMEM_SUCCESS;
+}
+
+static int fca_open(void)
+{
+    FCA_VERBOSE(2, "==>");
+
+    mca_scoll_fca_output = opal_output_open(NULL);
+    opal_output_set_verbosity(mca_scoll_fca_output,
+                              mca_scoll_fca_component.fca_verbose);
+    mca_scoll_fca_component.fca_context = NULL;
+    mca_scoll_fca_component.ret = NULL;
+    mca_scoll_fca_component.rcounts = NULL;
+    mca_scoll_fca_component.fca_comm_desc_exchangeable = NULL;
+    mca_scoll_fca_component.my_info_exchangeable = NULL;
+    return OSHMEM_SUCCESS;
+}
+
+static int fca_close(void)
+{
+    FCA_VERBOSE(2, "==>");
+
+    if (!mca_scoll_fca_component.fca_context)
+        return OSHMEM_SUCCESS;
+
+    mca_scoll_fca_close_fca_lib();
+
+    if (NULL != mca_scoll_fca_component.ret)
+        MCA_MEMHEAP_CALL(private_free(mca_scoll_fca_component.ret));
+
+    if (NULL != mca_scoll_fca_component.rcounts)
+        MCA_MEMHEAP_CALL(private_free(mca_scoll_fca_component.rcounts));
+
+    if (NULL != mca_scoll_fca_component.fca_comm_desc_exchangeable)
+        MCA_MEMHEAP_CALL(private_free(mca_scoll_fca_component.fca_comm_desc_exchangeable));
+
+    if (NULL != mca_scoll_fca_component.my_info_exchangeable)
+        MCA_MEMHEAP_CALL(private_free(mca_scoll_fca_component.my_info_exchangeable));
+    return OSHMEM_SUCCESS;
+}
+
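A minimal sketch of the component life cycle wired up above (hypothetical
driver shown for illustration; in reality the MCA framework makes these
calls around shmem_init()/shmem_finalize()):

    fca_register();   /* expose the scoll_fca_* parameters             */
    fca_open();       /* open verbose output, clear cached state       */
    /* per group: mca_scoll_fca_comm_query() + module enable           */
    fca_close();      /* close FCA and free symmetric scratch buffers  */

diff --git 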
a/oshmem/mca/scoll/fca/scoll_fca_debug.h b/oshmem/mca/scoll/fca/scoll_fca_debug.h new file mode 100644 index 0000000000..a0ab3ddc81 --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_debug.h @@ -0,0 +1,35 @@ +/** + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * */ + +#ifndef MCA_SCOLL_FCA_DEBUG_H +#define MCA_SCOLL_FCA_DEBUG_H +#pragma GCC system_header + +#ifdef __BASE_FILE__ +#define __FCA_FILE__ __BASE_FILE__ +#else +#define __FCA_FILE__ __FILE__ +#endif + +#define FCA_VERBOSE(level, format, ...) \ + opal_output_verbose(level, mca_scoll_fca_output, "%s:%d - %s() " format, \ + __FCA_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define FCA_ERROR(format, ... ) \ + opal_output_verbose(0, mca_scoll_fca_output, "Error: %s:%d - %s() " format, \ + __FCA_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define FCA_MODULE_VERBOSE(fca_module, level, format, ...) \ + FCA_VERBOSE(level, "[%p:%d] " format, (void*)(fca_module)->comm, (fca_module)->rank, ## __VA_ARGS__) + +extern int mca_scoll_fca_output; + +#endif + diff --git a/oshmem/mca/scoll/fca/scoll_fca_module.c b/oshmem/mca/scoll/fca/scoll_fca_module.c new file mode 100644 index 0000000000..564867de8f --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_module.c @@ -0,0 +1,557 @@ +/** + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * */ +#include "oshmem_config.h" +#include "scoll_fca.h" +#include +#include +#include "oshmem/constants.h" +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/runtime/runtime.h" + +/* + * * Initial query function that is invoked during MPI_INIT, allowing + * * this module to indicate what level of thread support it provides. + * */ + +static const int root_id = 0; + +#define __INTERNAL_BARRIER_FROM_SCOLL_BASIC 1 +static int __internal_barrier(mca_scoll_fca_module_t *fca_module) +{ +#if !__INTERNAL_BARRIER_FROM_SCOLL_BASIC + struct oshmem_group_t *group = fca_module->comm; + int rc = OSHMEM_SUCCESS; + int root_id = 0; + int PE_root = oshmem_proc_pe(group->proc_array[root_id]); + int i = 0; + + if (PE_root != group->my_pe) + { + rc = MCA_SPML_CALL(send(NULL, 0, PE_root, MCA_SPML_BASE_PUT_STANDARD)); + if (OSHMEM_SUCCESS != rc) { + return rc; + } + + rc = MCA_SPML_CALL(recv(NULL, 0, PE_root)); + if (OSHMEM_SUCCESS != rc) { + return rc; + } + } + + /* The root collects and broadcasts the messages. */ + + else + { + int pe_cur = 0; + + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) + { + pe_cur = oshmem_proc_pe(group->proc_array[i]); + if (pe_cur != PE_root) + { + rc = MCA_SPML_CALL(recv(NULL, 0, SHMEM_ANY_SOURCE)); + } + if (OSHMEM_SUCCESS != rc) { + return rc; + } + } + + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) + { + pe_cur = oshmem_proc_pe(group->proc_array[i]); + if (pe_cur != PE_root) + { + rc = MCA_SPML_CALL(send(NULL, 0, pe_cur, MCA_SPML_BASE_PUT_STANDARD)); + } + if (OSHMEM_SUCCESS != rc) { + return rc; + } + } + } + + return rc; +#else + long pSync = _SHMEM_SYNC_VALUE; + /*we use 4th algorithm for barrier from scoll/basic. 
It does not use pSync, + * so we pass to that function just regular long value in order to meet function defenition requirements*/ + return fca_module->previous_barrier(fca_module->comm, + &pSync, + SCOLL_ALG_BARRIER_BASIC); +#endif +} +int mca_scoll_fca_init_query(bool enable_progress_threads, + bool enable_mpi_threads) +{ + return OSHMEM_SUCCESS; +} + +static int have_remote_peers(struct oshmem_group_t *group, + size_t size, + int *local_peers) +{ + struct oshmem_proc_t *proc; + size_t i; + int ret; + + *local_peers = 0; + ret = 0; + for (i = 0; i < size; ++i) { + proc = group->proc_array[i]; + if (FCA_IS_LOCAL_PROCESS(proc->proc_flags)) { + ++*local_peers; + } else { + ret = 1; + } + } + return ret; +} + +/** + * * Fills local rank information in fca_module. + * */ + +static int __get_local_ranks(mca_scoll_fca_module_t *fca_module) +{ + struct oshmem_group_t *comm = fca_module->comm; + oshmem_proc_t* proc; + int i, rank; + + /* Count the local ranks */ + fca_module->num_local_procs = 0; + for (rank = 0; rank < comm->proc_count; ++rank) { + proc = comm->proc_array[rank]; + if (FCA_IS_LOCAL_PROCESS(proc->proc_flags)) { + if (proc->proc_name.vpid == (uint32_t) fca_module->rank) { + fca_module->local_proc_idx = fca_module->num_local_procs; + } + ++fca_module->num_local_procs; + } + } + /* Make a list of local ranks */ + fca_module->local_ranks = calloc(fca_module->num_local_procs, + sizeof *fca_module->local_ranks); + if (!fca_module->local_ranks) { + FCA_ERROR("Failed to allocate memory for %d local ranks", + fca_module->num_local_procs); + return OSHMEM_ERROR; + } + + i = 0; + for (rank = 0; rank < comm->proc_count; ++rank) { + proc = comm->proc_array[rank]; + if (FCA_IS_LOCAL_PROCESS(proc->proc_flags)) { + fca_module->local_ranks[i++] = rank; + } + } + + FCA_MODULE_VERBOSE(fca_module, + 3, + "i am %d/%d", + fca_module->local_proc_idx, fca_module->num_local_procs); + + return OSHMEM_SUCCESS; +} + +static int __fca_comm_new(mca_scoll_fca_module_t *fca_module) +{ + struct oshmem_group_t *comm = fca_module->comm; + fca_comm_new_spec_t spec; + int info_size = 0, all_info_size = 0; + void *all_info = NULL, *my_info = NULL; + int *disps = NULL; + int i; + const int root_pe = oshmem_proc_pe(comm->proc_array[root_id]); + const int my_id = oshmem_proc_group_find_id(comm, comm->my_pe); + /* call fca_get_rank_info() on node managers only*/ + + if (fca_module->local_proc_idx == 0) { + my_info = fca_get_rank_info(mca_scoll_fca_component.fca_context, + &info_size); + if (!my_info) { + FCA_ERROR("fca_get_rank_info returned NULL"); + return OSHMEM_ERROR; + } + + } else { + info_size = 0; + } + + FCA_MODULE_VERBOSE(fca_module, 1, "Info size: %d", info_size); + for (i = 0; i < comm->proc_count; i++) { + mca_scoll_fca_component.rcounts[i] = -1; + } + __internal_barrier(fca_module); + MCA_SPML_CALL(put((void *)&mca_scoll_fca_component.rcounts[my_id], (size_t)sizeof(info_size), (void *)&info_size, root_pe)); + + if (root_pe == comm->my_pe) { + int value = -1; + for (i = 0; i < comm->proc_count; i++) { + MCA_SPML_CALL(wait((void *)&mca_scoll_fca_component.rcounts[i], SHMEM_CMP_NE, &value, SHMEM_INT)); + } + } + + /* Allocate buffer for gathering rank information on rank0 */ + if (root_pe == comm->my_pe) { + all_info_size = 0; + disps = calloc(comm->proc_count, sizeof *disps); + for (i = 0; i < comm->proc_count; ++i) { + disps[i] = all_info_size; + all_info_size += mca_scoll_fca_component.rcounts[i]; + } + all_info = NULL; + FCA_MODULE_VERBOSE(fca_module, + 1, + "Total rank_info size: %d", + all_info_size); + 
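        /*
         * Worked example with assumed sizes: if rcounts = {8, 8, 12}, the
         * loop above produces disps = {0, 8, 16} and all_info_size = 28,
         * so PE i's rank info is later copied to all_info + disps[i].
         */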
all_info = malloc(all_info_size); + memset(all_info, 0, all_info_size); + } + + if (my_info) { + memcpy(mca_scoll_fca_component.my_info_exchangeable, + my_info, + info_size); + } + __internal_barrier(fca_module); + if (root_pe == comm->my_pe) { + for (i = 0; i < comm->proc_count; i++) { + if (mca_scoll_fca_component.rcounts[i] > 0) { + MCA_SPML_CALL(get((void *)mca_scoll_fca_component.my_info_exchangeable, mca_scoll_fca_component.rcounts[i], (void*)(((char*)all_info)+disps[i]),comm->proc_array[i]->proc_name.vpid)); + } + } + } + + /* Rank0 calls fca_comm_new() and fills fca_comm_spec filed */ + if (root_pe == comm->my_pe) { + spec.rank_info = all_info; + spec.is_comm_world = comm == oshmem_group_all; + spec.rank_count = 0; + for (i = 0; i < comm->proc_count; ++i) { + FCA_MODULE_VERBOSE(fca_module, + 1, + "rcounts[%d]=%d disps[%d]=%d", + i, mca_scoll_fca_component.rcounts[i], i, disps[i]); + if (mca_scoll_fca_component.rcounts[i] > 0) + ++spec.rank_count; + } + + FCA_MODULE_VERBOSE(fca_module, + 1, + "starting fca_comm_new(), rank_count: %d", + spec.rank_count); + + *mca_scoll_fca_component.ret = + fca_comm_new(mca_scoll_fca_component.fca_context, + &spec, + &fca_module->fca_comm_desc); + + free(disps); + free(all_info); + } + + __internal_barrier(fca_module); + + if (root_pe != comm->my_pe) { + MCA_SPML_CALL(get((void *)mca_scoll_fca_component.ret,sizeof(int), (void *)mca_scoll_fca_component.ret, root_pe)); + } + + /* Examine comm_new return value */ + __internal_barrier(fca_module); + if (*mca_scoll_fca_component.ret < 0) { + FCA_ERROR("rank %i: COMM_NEW failed: %s", + fca_module->rank, fca_strerror(*mca_scoll_fca_component.ret)); + return OSHMEM_ERROR; + } + + /* Release allocate rank_info on node managers */ + if (fca_module->local_proc_idx == 0) { + fca_free_rank_info(my_info); + } + + { + if (root_pe == comm->my_pe) { + memcpy(mca_scoll_fca_component.fca_comm_desc_exchangeable, + &fca_module->fca_comm_desc, + sizeof(fca_module->fca_comm_desc)); + } + + __internal_barrier(fca_module); + if (root_pe != comm->my_pe) { + MCA_SPML_CALL(get((void *)mca_scoll_fca_component.fca_comm_desc_exchangeable, sizeof(fca_module->fca_comm_desc), (void *)&fca_module->fca_comm_desc, root_pe)); + } + + __internal_barrier(fca_module); + + } + FCA_MODULE_VERBOSE(fca_module, + 1, + "Received FCA communicator spec, comm_id %d", + fca_module->fca_comm_desc.comm_id); + return OSHMEM_SUCCESS; +} + +static int __create_fca_comm(mca_scoll_fca_module_t *fca_module) +{ + int comm_size; + int rc, ret; + + rc = __fca_comm_new(fca_module); + if (rc != OSHMEM_SUCCESS) + return rc; + + /* allocate comm_init_spec */ + FCA_MODULE_VERBOSE(fca_module, + 1, + "Starting COMM_INIT comm_id %d proc_idx %d num_procs %d", + fca_module->fca_comm_desc.comm_id, fca_module->local_proc_idx, fca_module->num_local_procs); + + comm_size = fca_module->comm->proc_count; + ret = mca_scoll_fca_comm_init(mca_scoll_fca_component.fca_context, + oshmem_proc_group_find_id(fca_module->comm, + fca_module->rank), + comm_size, + fca_module->local_proc_idx, + fca_module->num_local_procs, + &fca_module->fca_comm_desc, + &fca_module->fca_comm); + if (ret < 0) { + FCA_ERROR("COMM_INIT failed: %s", fca_strerror(ret)); + return OSHMEM_ERROR; + } + + /* get communicator capabilities */ + ret = fca_comm_get_caps(fca_module->fca_comm, &fca_module->fca_comm_caps); + if (ret < 0) { + FCA_ERROR("GET_COMM_CAPS failed: %s", fca_strerror(ret)); + return OSHMEM_ERROR; + } + + /* by this point every rank in the communicator is set up */ + FCA_MODULE_VERBOSE(fca_module, 
+ 1, + "Initialized FCA communicator, comm_id %d", + fca_module->fca_comm_desc.comm_id); + + return OSHMEM_SUCCESS; +} + +static void __destroy_fca_comm(mca_scoll_fca_module_t *fca_module) +{ + int ret; + struct oshmem_group_t *comm = fca_module->comm; + const int root_pe = oshmem_proc_pe(comm->proc_array[root_id]); + + fca_comm_destroy(fca_module->fca_comm); + if (comm->my_pe == root_pe && mca_scoll_fca_component.fca_context) { + ret = fca_comm_end(mca_scoll_fca_component.fca_context, + fca_module->fca_comm_desc.comm_id); + if (ret < 0) { + FCA_ERROR("COMM_END failed: %s", fca_strerror(ret)); + } + } + + FCA_MODULE_VERBOSE(fca_module, + 1, + "Destroyed FCA communicator, comm_id %d", + fca_module->fca_comm_desc.comm_id); +} + +#define FCA_SAVE_PREV_SCOLL_API(__api) do {\ + fca_module->previous_ ## __api = comm->g_scoll.scoll_ ## __api;\ + fca_module->previous_ ## __api ## _module = comm->g_scoll.scoll_ ## __api ## _module;\ + if (!comm->g_scoll.scoll_ ## __api || !comm->g_scoll.scoll_ ## __api ## _module) {\ + FCA_VERBOSE(1, "no underlying " # __api"; disqualifying myself");\ + return OSHMEM_ERROR;\ + }\ + OBJ_RETAIN(fca_module->previous_ ## __api ## _module);\ +} while(0) + +static int __save_coll_handlers(mca_scoll_fca_module_t *fca_module) +{ + struct oshmem_group_t *comm = fca_module->comm; + + FCA_SAVE_PREV_SCOLL_API(barrier); + FCA_SAVE_PREV_SCOLL_API(broadcast); + FCA_SAVE_PREV_SCOLL_API(collect); + FCA_SAVE_PREV_SCOLL_API(reduce); + + return OSHMEM_SUCCESS; +} + +/* + * * Initialize module on the communicator + * */ +static int mca_scoll_fca_module_enable(mca_scoll_base_module_t *module, + struct oshmem_group_t *comm) +{ + + mca_scoll_fca_module_t *fca_module = (mca_scoll_fca_module_t*) module; + int rc; + + fca_module->comm = comm; + fca_module->rank = comm->my_pe; + + rc = mca_scoll_fca_get_fca_lib(comm); + if (rc != OSHMEM_SUCCESS) + goto exit_fatal; + + rc = __save_coll_handlers(fca_module); + if (rc != OSHMEM_SUCCESS) + goto exit_fatal; + + rc = __get_local_ranks(fca_module); + if (rc != OSHMEM_SUCCESS) + goto exit_fatal; + + rc = __create_fca_comm(fca_module); + if (rc != OSHMEM_SUCCESS) + goto exit_fatal; + + FCA_MODULE_VERBOSE(fca_module, 1, "FCA Module initialized"); + return OMPI_SUCCESS; + + exit_fatal: + /* it is possible that other pe(s) succesfully enabled fca. 
+ * So differnt frameworks will be used for collective ops + */ + FCA_ERROR("FCA module enable failed - aborting to prevent inconsistent application state"); + oshmem_shmem_abort(-1); + return OMPI_ERROR; +} + +static void mca_scoll_fca_module_clear(mca_scoll_fca_module_t *fca_module) +{ + fca_module->num_local_procs = 0; + fca_module->local_ranks = NULL; + fca_module->fca_comm = NULL; + + fca_module->previous_barrier = NULL; + fca_module->previous_broadcast = NULL; + fca_module->previous_collect = NULL; + fca_module->previous_reduce = NULL; +} + +static void mca_scoll_fca_module_construct(mca_scoll_fca_module_t *fca_module) +{ + FCA_VERBOSE(5, "==>"); + mca_scoll_fca_module_clear(fca_module); +} + +static void mca_scoll_fca_module_destruct(mca_scoll_fca_module_t *fca_module) +{ + FCA_VERBOSE(5, "==>"); + OBJ_RELEASE(fca_module->previous_barrier_module); + OBJ_RELEASE(fca_module->previous_broadcast_module); + OBJ_RELEASE(fca_module->previous_collect_module); + OBJ_RELEASE(fca_module->previous_reduce_module); + if (fca_module->fca_comm) + __destroy_fca_comm(fca_module); + free(fca_module->local_ranks); + mca_scoll_fca_module_clear(fca_module); +} + +/* + * * Invoked when there's a new communicator that has been created. + * * Look at the communicator and decide which set of functions and + * * priority we want to return. + * */ +mca_scoll_base_module_t * +mca_scoll_fca_comm_query(struct oshmem_group_t *comm, int *priority) +{ + mca_scoll_base_module_t *module; + int size = comm->proc_count; + int local_peers = 0; + + mca_scoll_fca_module_t *fca_module; + + *priority = 0; + module = NULL; + + if (!mca_scoll_fca_component.fca_enable) { + FCA_VERBOSE(20, "FCA is disable on user request => exiting"); + goto exit; + } + + if (mca_memheap.memheap_component == NULL ) { + FCA_VERBOSE(20, "No memheap => exiting"); + goto exit; + } + + if (NULL == mca_scoll_fca_component.ret) { + MCA_MEMHEAP_CALL(private_alloc(sizeof(int),(void **)&mca_scoll_fca_component.ret)); + MCA_MEMHEAP_CALL(private_alloc(oshmem_group_all->proc_count*sizeof(*mca_scoll_fca_component.rcounts), (void **)&mca_scoll_fca_component.rcounts )); + MCA_MEMHEAP_CALL(private_alloc(/*info_size*/20,&mca_scoll_fca_component.my_info_exchangeable)); + MCA_MEMHEAP_CALL(private_alloc(sizeof(fca_comm_desc_t), &mca_scoll_fca_component.fca_comm_desc_exchangeable)); + } + if (size < mca_scoll_fca_component.fca_np) { + FCA_VERBOSE(20, + "size(%d) < fca_np(%d)", + size, mca_scoll_fca_component.fca_np); + goto exit; + } + + if (size < 2) { + FCA_VERBOSE(20, "size(%d) < 2", size); + goto exit; + } + + if (!have_remote_peers(comm, + size, + &local_peers) /* || OMPI_COMM_IS_INTER(comm)*/) { + FCA_VERBOSE(1, + "all peers in group are on the same node, fca disabled\n"); + goto exit; + } + + fca_module = OBJ_NEW(mca_scoll_fca_module_t); + if (!fca_module) { + goto exit_fatal; + } + fca_module->super.scoll_module_enable = mca_scoll_fca_module_enable; + fca_module->super.scoll_collect = + mca_scoll_fca_component.fca_enable_allgather ? + mca_scoll_fca_collect : NULL; + fca_module->super.scoll_reduce = + mca_scoll_fca_component.fca_enable_allreduce ? + mca_scoll_fca_reduce : NULL; + fca_module->super.scoll_barrier = + mca_scoll_fca_component.fca_enable_barrier ? mca_scoll_fca_barrier : + NULL; + fca_module->super.scoll_broadcast = + mca_scoll_fca_component.fca_enable_bcast ? 
mca_scoll_fca_broadcast : + NULL; + + *priority = mca_scoll_fca_component.fca_priority; + module = &fca_module->super; + + exit: + FCA_VERBOSE(4, + "Query FCA module for comm %p size %d rank %d local_peers=%d: priority=%d %s", + (void *)comm, size, comm->my_pe, local_peers, *priority, module ? "enabled" : "disabled"); + return module; + + exit_fatal: + /* it is possible that other pe(s) succesfully initialized fca. + * So differnt frameworks will be used for collective ops + */ + FCA_ERROR("FCA module query failed - aborting"); + oshmem_shmem_abort(-1); + return NULL ; +} + +OBJ_CLASS_INSTANCE(mca_scoll_fca_module_t, + mca_scoll_base_module_t, + mca_scoll_fca_module_construct, + mca_scoll_fca_module_destruct); + diff --git a/oshmem/mca/scoll/fca/scoll_fca_ops.c b/oshmem/mca/scoll/fca/scoll_fca_ops.c new file mode 100644 index 0000000000..41834ad1eb --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_ops.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "oshmem_config.h" +#include "oshmem/constants.h" +#include "scoll_fca.h" +#include +#include "oshmem/proc/proc.h" +#include "oshmem/op/op.h" +int mca_scoll_fca_barrier(struct oshmem_group_t *group, long *pSync, int alg) +{ + mca_scoll_fca_module_t *fca_module = + (mca_scoll_fca_module_t *) group->g_scoll.scoll_barrier_module; + int ret; + + FCA_VERBOSE(5, "Using FCA Barrier"); + ret = fca_do_barrier(fca_module->fca_comm); + if (ret < 0) { + if (ret == -EUSESHMEM) { + FCA_VERBOSE(5, "FCA Barrier failed, using original barrier"); + goto orig_barrier; + } + FCA_ERROR("Barrier failed: %s", fca_strerror(ret)); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; + orig_barrier: return fca_module->previous_barrier(group, + pSync, + SCOLL_DEFAULT_ALG); +} + +int mca_scoll_fca_broadcast(struct oshmem_group_t *group, + int PE_root, + void *target, + const void *source, + size_t nlong, + long *pSync, + int alg) +{ + mca_scoll_fca_module_t *fca_module = + (mca_scoll_fca_module_t *) group->g_scoll.scoll_broadcast_module; + fca_bcast_spec_t spec; + int ret; + + FCA_VERBOSE(5, "rank %i, DOING FCA BCAST\n", group->my_pe); + spec.root = oshmem_proc_group_find_id(group, PE_root); + if (group->my_pe == PE_root) + spec.buf = (void *) source; + else + spec.buf = target; + spec.size = nlong; + if (spec.size > fca_module->fca_comm_caps.max_payload) { + FCA_VERBOSE(5, + "Unsupported bcast operation size %d, using fallback", + spec.size); + goto orig_bcast; + } + ret = fca_do_bcast(fca_module->fca_comm, &spec); + if (ret < 0) { + if (ret == -EUSESHMEM) { + FCA_VERBOSE(5, "FCA Broadcast failed, using original Broadcast"); + goto orig_bcast; + } + FCA_ERROR("Bcast failed: %s", fca_strerror(ret)); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; + orig_bcast: return fca_module->previous_broadcast(group, + PE_root, + target, + source, + nlong, + pSync, + SCOLL_DEFAULT_ALG); +} + +int mca_scoll_fca_collect(struct oshmem_group_t *group, + void *target, + const void *source, + size_t nlong, + long *pSync, + bool nlong_type, + int alg) +{ + mca_scoll_fca_module_t *fca_module = + (mca_scoll_fca_module_t *) group->g_scoll.scoll_collect_module; + + FCA_VERBOSE(5, + "rank %i, DOING FCA_COLLECT, nlong_type = %i\n", + group->my_pe, (int)nlong_type); +#if OSHMEM_FCA_ALLGATHER + if (nlong_type == true) { + fca_gather_spec_t spec = {0,}; + int ret; + spec.size = (int)nlong; + spec.sbuf = (void *)source; + spec.rbuf = target; + ret = 
fca_do_allgather(fca_module->fca_comm, &spec);
+        if (ret < 0) {
+            if (ret == -EUSESHMEM) {
+                FCA_VERBOSE(5,"FCA Fcollect(allgather) failed, using original Fcollect");
+                goto orig_collect;
+            }
+            FCA_ERROR("Fcollect(allgather) failed: %s", fca_strerror(ret));
+            return OSHMEM_ERROR;
+        }
+        return OSHMEM_SUCCESS;
+    }
+    else
+    {
+        int i, ret;
+        size_t *sendcounts = (size_t *)malloc(group->proc_count*sizeof(size_t));
+        if (NULL == sendcounts) {
+            return OSHMEM_ERR_OUT_OF_RESOURCE;
+        }
+        mca_scoll_fca_collect(group,sendcounts,(void *)&nlong,sizeof(size_t),pSync,true,SCOLL_DEFAULT_ALG);
+        fca_gatherv_spec_t spec;
+        spec.sendsize = (int)nlong;
+        spec.sbuf = (void *)source;
+        spec.rbuf = target;
+        spec.recvsizes = alloca(sizeof(*spec.recvsizes) * group->proc_count);
+        spec.displs = alloca(sizeof(*spec.displs) * group->proc_count);
+        for (i=0; i<group->proc_count; i++) {
+            spec.recvsizes[i] = (int)sendcounts[i];
+        }
+        spec.displs[0] = 0;
+        for (i=1; i<group->proc_count; i++) {
+            spec.displs[i] = spec.displs[i-1]+spec.recvsizes[i-1];
+        }
+        ret = fca_do_allgatherv(fca_module->fca_comm, &spec);
+        /* free on every exit path, not only on success */
+        free(sendcounts);
+        if (ret < 0) {
+            if (ret == -EUSESHMEM) {
+                FCA_VERBOSE(5,"FCA Collect(allgatherv) failed, using original Collect");
+                goto orig_collect;
+            }
+            FCA_ERROR("Collect(allgatherv) failed: %s", fca_strerror(ret));
+            return OSHMEM_ERROR;
+        }
+        return OSHMEM_SUCCESS;
+    }
+    orig_collect:
+#endif
+    return fca_module->previous_collect(group,
+                                        target,
+                                        source,
+                                        nlong,
+                                        pSync,
+                                        nlong_type,
+                                        SCOLL_DEFAULT_ALG);
+}
+
+#define FCA_DTYPE_8_SIGNED 1
+#define FCA_DTYPE_16_SIGNED 2
+#define FCA_DTYPE_32_SIGNED 3
+#define FCA_DTYPE_64_SIGNED 4
+#define FCA_DTYPE_32_FLOAT 9
+#define FCA_DTYPE_64_FLOAT 10
+#define UNSUPPORTED_OP -1
+
+static bool if_floating_type(oshmem_op_t *op)
+{
+    if ((op->dt == OSHMEM_OP_TYPE_FLOAT) || (op->dt == OSHMEM_OP_TYPE_DOUBLE)
+            || (op->dt == OSHMEM_OP_TYPE_LDOUBLE))
+        return true;
+    else
+        return false;
+}
+static int shmem_dtype_to_fca_dtype(oshmem_op_t *op)
+{
+    if ((op->dt == OSHMEM_OP_TYPE_FCOMPLEX)
+            || (op->dt == OSHMEM_OP_TYPE_DCOMPLEX)) {
+        return UNSUPPORTED_OP;
+    }
+    switch (op->dt_size * 8) {
+    case 64:
+        if (if_floating_type(op))
+            return FCA_DTYPE_64_FLOAT;
+        else
+            return FCA_DTYPE_64_SIGNED;
+        break;
+    case 32:
+        if (if_floating_type(op))
+            return FCA_DTYPE_32_FLOAT;
+        else
+            return FCA_DTYPE_32_SIGNED;
+        break;
+    case 16:
+        if (OPAL_UNLIKELY(if_floating_type(op)))
+            return UNSUPPORTED_OP;
+        else
+            return FCA_DTYPE_16_SIGNED;
+        break;
+    case 8:
+        if (OPAL_UNLIKELY(if_floating_type(op)))
+            return UNSUPPORTED_OP;
+        else
+            return FCA_DTYPE_8_SIGNED;
+        break;
+    default:
+        return UNSUPPORTED_OP;
+    }
+}
+
+static int shmem_op_to_fca_op(oshmem_op_t *op)
+{
+    switch (op->op) {
+    case OSHMEM_OP_AND:
+        return FCA_OP_BAND;
+        break;
+    case OSHMEM_OP_OR:
+        return FCA_OP_BOR;
+        break;
+    case OSHMEM_OP_XOR:
+        return FCA_OP_BXOR;
+    case OSHMEM_OP_MAX:
+        return FCA_OP_MAX;
+        break;
+    case OSHMEM_OP_MIN:
+        return FCA_OP_MIN;
+        break;
+    case OSHMEM_OP_SUM:
+        return FCA_OP_SUM;
+        break;
+    case OSHMEM_OP_PROD:
+        return FCA_OP_PROD;
+        break;
+    default:
+        return UNSUPPORTED_OP;
+    }
+}
+int mca_scoll_fca_reduce(struct oshmem_group_t *group,
+                         struct oshmem_op_t *op,
+                         void *target,
+                         const void *source,
+                         size_t nlong,
+                         long *pSync,
+                         void *pWrk,
+                         int alg)
+{
+    mca_scoll_fca_module_t *fca_module =
+            (mca_scoll_fca_module_t *) group->g_scoll.scoll_reduce_module;
+    int fca_dtype;
+    int fca_op;
+    int ret;
+    fca_reduce_spec_t spec;
+
+    FCA_VERBOSE(5, "rank %i, DOING FCA_REDUCE\n", group->my_pe);
+    if ((fca_dtype = shmem_dtype_to_fca_dtype(op)) < 0) {
+        FCA_VERBOSE(5,
"SHMEM_DATA_TYPE = %i is unsupported in the current version of FCA library; using original reduce", + op->dt); + goto orig_reduce; + } + if ((fca_op = shmem_op_to_fca_op(op)) < 0) { + FCA_VERBOSE(5, + "SHMEM_OPERATION_TYPE = %i is unsupported; using original reduce", + op->op); + goto orig_reduce; + } + spec.sbuf = (void *) source; + spec.rbuf = target; + spec.dtype = (enum fca_reduce_dtype_t) fca_dtype; + spec.op = (enum fca_reduce_op_t) fca_op; + spec.length = (int) (nlong / op->dt_size); + ret = fca_do_all_reduce(fca_module->fca_comm, &spec); + if (ret < 0) { + if (ret == -EUSESHMEM) { + FCA_VERBOSE(5, + "FCA Reduce(allreduce) failed, using original Reduce"); + goto orig_reduce; + } + FCA_ERROR("Reduce (allreduce) failed: %s", fca_strerror(ret)); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; + orig_reduce: return fca_module->previous_reduce(group, + op, + target, + source, + nlong, + pSync, + pWrk, + SCOLL_DEFAULT_ALG); +} diff --git a/oshmem/mca/scoll/scoll.h b/oshmem/mca/scoll/scoll.h new file mode 100644 index 0000000000..b8dbd159e6 --- /dev/null +++ b/oshmem/mca/scoll/scoll.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Collective Communication Interface + * + */ + +#ifndef OSHMEM_MCA_SCOLL_H +#define OSHMEM_MCA_SCOLL_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "opal/util/output.h" +#include "mpi.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" + +BEGIN_C_DECLS + +/* ******************************************************************** */ + +struct oshmem_group_t; +struct oshmem_op_t; + +/* ******************************************************************** */ + +typedef int (*mca_scoll_base_component_init_fn_t)(bool enable_progress_threads, + bool enable_threads); + +typedef struct mca_scoll_base_module_1_0_0_t* (*mca_scoll_base_component_query_fn_t)(struct oshmem_group_t *group, + int *priority); + +/* ******************************************************************** */ + +/** + * Collective component interface + * + * Component interface for the collective framework. A public + * instance of this structure, called + * mca_scoll_[component_name]_component, must exist in any collective + * component. + */ +struct mca_scoll_base_component_1_0_0_t { + /** Base component description */ + mca_base_component_t scoll_version; + /** Base component data block */ + mca_base_component_data_t scoll_data; + + /** Component initialization function */ + mca_scoll_base_component_init_fn_t scoll_init; + mca_scoll_base_component_query_fn_t scoll_query; +}; +typedef struct mca_scoll_base_component_1_0_0_t mca_scoll_base_component_1_0_0_t; + +/** Per guidence in mca.h, use the unversioned struct name if you just + want to always keep up with the most recent version of the + interace. */ +typedef struct mca_scoll_base_component_1_0_0_t mca_scoll_base_component_t; + +/** + * Collective module interface + * + * Module interface to the Collective framework. Modules are + * reference counted based on the number of functions from the module + * used on the commuicator. There is at most one module per component + * on a given communicator, and there can be many component modules on + * a given communicator. 
+ *
+ * @note The collective framework and the
+ * communicator functionality only stores a pointer to the module
+ * function, so the component is free to create a structure that
+ * inherits from this one for use as the module structure.
+ */
+typedef int
+(*mca_scoll_base_module_enable_1_0_0_fn_t)(struct mca_scoll_base_module_1_0_0_t* module,
+                                           struct oshmem_group_t *comm);
+typedef int (*mca_scoll_base_module_ft_event_fn_t)(int state);
+
+#define SCOLL_DEFAULT_ALG (-1)
+
+#define SCOLL_ALG_BARRIER_CENTRAL_COUNTER 0
+#define SCOLL_ALG_BARRIER_TOURNAMENT 1
+#define SCOLL_ALG_BARRIER_RECURSIVE_DOUBLING 2
+#define SCOLL_ALG_BARRIER_DISSEMINATION 3
+#define SCOLL_ALG_BARRIER_BASIC 4
+#define SCOLL_ALG_BARRIER_ADAPTIVE 5
+
+#define SCOLL_ALG_BROADCAST_CENTRAL_COUNTER 0
+#define SCOLL_ALG_BROADCAST_BINOMIAL 1
+
+#define SCOLL_ALG_COLLECT_CENTRAL_COUNTER 0
+#define SCOLL_ALG_COLLECT_TOURNAMENT 1
+#define SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING 2
+#define SCOLL_ALG_COLLECT_RING 3
+
+#define SCOLL_ALG_REDUCE_CENTRAL_COUNTER 0
+#define SCOLL_ALG_REDUCE_TOURNAMENT 1
+#define SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING 2
+#define SCOLL_ALG_REDUCE_LEGACY_LINEAR 3 /* Based on the linear algorithm from OMPI coll:basic */
+#define SCOLL_ALG_REDUCE_LEGACY_LOG 4 /* Based on the log algorithm from OMPI coll:basic */
+
+typedef int (*mca_scoll_base_module_barrier_fn_t)(struct oshmem_group_t *group,
+                                                  long *pSync,
+                                                  int alg);
+typedef int (*mca_scoll_base_module_broadcast_fn_t)(struct oshmem_group_t *group,
+                                                    int PE_root,
+                                                    void *target,
+                                                    const void *source,
+                                                    size_t nlong,
+                                                    long *pSync,
+                                                    int alg);
+typedef int (*mca_scoll_base_module_collect_fn_t)(struct oshmem_group_t *group,
+                                                  void *target,
+                                                  const void *source,
+                                                  size_t nlong,
+                                                  long *pSync,
+                                                  bool nlong_type,
+                                                  int alg);
+typedef int (*mca_scoll_base_module_reduce_fn_t)(struct oshmem_group_t *group,
+                                                 struct oshmem_op_t *op,
+                                                 void *target,
+                                                 const void *source,
+                                                 size_t nlong,
+                                                 long *pSync,
+                                                 void *pWrk,
+                                                 int alg);
+
+struct mca_scoll_base_module_1_0_0_t {
+    /** Collective modules all inherit from opal_object */
+    opal_object_t super;
+
+    /* Collective function pointers */
+    mca_scoll_base_module_barrier_fn_t scoll_barrier;
+    mca_scoll_base_module_broadcast_fn_t scoll_broadcast;
+    mca_scoll_base_module_collect_fn_t scoll_collect;
+    mca_scoll_base_module_reduce_fn_t scoll_reduce;
+    mca_scoll_base_module_enable_1_0_0_fn_t scoll_module_enable;
+};
+typedef struct mca_scoll_base_module_1_0_0_t mca_scoll_base_module_1_0_0_t;
+
+/** Per guidance in mca.h, use the unversioned struct name if you just
+    want to always keep up with the most recent version of the
+    interface. */
+typedef struct mca_scoll_base_module_1_0_0_t mca_scoll_base_module_t;
+OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_scoll_base_module_t);
+
+/* ******************************************************************** */
+
+/*
+ * Macro for use in components that are of type scoll
+ */
+#define MCA_SCOLL_BASE_VERSION_2_0_0 \
+    MCA_BASE_VERSION_2_0_0, \
+    "scoll", 1, 0, 0
+
+/* ******************************************************************** */
+/*
+ * Collectives group cache structure
+ *
+ * Collectives group cache structure, used to find functions to
+ * implement collective algorithms and their associated modules.
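+ *
+ * Hypothetical population sketch (the actual assignments are made by the
+ * scoll base during selection and are not part of this hunk):
+ *
+ *   group->g_scoll.scoll_reduce        = fca_module->super.scoll_reduce;
+ *   group->g_scoll.scoll_reduce_module = &fca_module->super;
+ *
+ * with the previous pointers retained for fallback (see scoll/fca).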
+ */ +struct mca_scoll_base_group_scoll_t { + mca_scoll_base_module_barrier_fn_t scoll_barrier; + mca_scoll_base_module_1_0_0_t *scoll_barrier_module; + mca_scoll_base_module_broadcast_fn_t scoll_broadcast; + mca_scoll_base_module_1_0_0_t *scoll_broadcast_module; + mca_scoll_base_module_collect_fn_t scoll_collect; + mca_scoll_base_module_1_0_0_t *scoll_collect_module; + mca_scoll_base_module_reduce_fn_t scoll_reduce; + mca_scoll_base_module_1_0_0_t *scoll_reduce_module; +}; +typedef struct mca_scoll_base_group_scoll_t mca_scoll_base_group_scoll_t; +END_C_DECLS + +#endif /* OSHMEM_MCA_SCOLL_H */ diff --git a/oshmem/mca/spml/Makefile.am b/oshmem/mca/spml/Makefile.am new file mode 100644 index 0000000000..1e500f07fd --- /dev/null +++ b/oshmem/mca/spml/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_spml.la +libmca_spml_la_SOURCES = + +# header setup +nobase_oshmem_HEADERS = +nobase_nodist_oshmem_HEADERS = + +# local files +headers = spml.h +libmca_spml_la_SOURCES += $(headers) $(nodist_headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +nobase_oshmem_HEADERS += $(headers) +nobase_nodist_oshmem_HEADERS += $(nodist_headers) +oshmemdir = $(includedir)/oshmem/oshmem/mca/spml +else +oshmemdir = $(includedir) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/oshmem/mca/spml/base/Makefile.am b/oshmem/mca/spml/base/Makefile.am new file mode 100644 index 0000000000..f05bbdb017 --- /dev/null +++ b/oshmem/mca/spml/base/Makefile.am @@ -0,0 +1,28 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) $(btl_sm_CPPFLAGS) + +headers += \ + base/base.h \ + base/spml_base_request.h \ + base/spml_base_request_dbg.h \ + base/spml_base_getreq.h \ + base/spml_base_atomicreq.h \ + base/spml_base_putreq.h + +libmca_spml_la_SOURCES += \ + base/spml_base_frame.c \ + base/spml_base_select.c \ + base/spml_base_request.c \ + base/spml_base_atomicreq.c \ + base/spml_base_getreq.c \ + base/spml_base_putreq.c \ + base/spml_base.c diff --git a/oshmem/mca/spml/base/base.h b/oshmem/mca/spml/base/base.h new file mode 100644 index 0000000000..e48aef8ed3 --- /dev/null +++ b/oshmem/mca/spml/base/base.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_SPML_BASE_H +#define MCA_SPML_BASE_H + +#include "oshmem_config.h" + +#include "opal/mca/mca.h" +#include "opal/mca/base/mca_base_framework.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_pointer_array.h" + +#include "oshmem/mca/spml/spml.h" + +/* + * Global functions for the PML + */ + +BEGIN_C_DECLS + +/* + * This is the base priority for a SPML wrapper component + * If there exists more than one then it is undefined + * which one is picked. + */ +#define SPML_SELECT_WRAPPER_PRIORITY -128 + +/* + * Globals + */ +OSHMEM_DECLSPEC extern mca_spml_base_component_t mca_spml_base_selected_component; +OSHMEM_DECLSPEC extern opal_pointer_array_t mca_spml_base_spml; + +OSHMEM_DECLSPEC int mca_spml_base_finalize(void); + +/* + * Select an available component. 
+ */
+OSHMEM_DECLSPEC int mca_spml_base_select(bool enable_progress_threads,
+                                         bool enable_threads);
+
+/*
+ * Share in modex the name of the selected component
+ */
+OSHMEM_DECLSPEC int mca_spml_base_spml_selected(const char *name);
+
+/*
+ * Verify that all new procs are using the currently selected component
+ */
+OSHMEM_DECLSPEC int mca_spml_base_spml_check_selected(const char *my_spml,
+                                                      oshmem_proc_t **procs,
+                                                      size_t nprocs);
+
+OSHMEM_DECLSPEC int mca_spml_base_wait(void* addr,
+                                       int cmp,
+                                       void* value,
+                                       int datatype);
+OSHMEM_DECLSPEC int mca_spml_base_wait_nb(void* handle);
+OSHMEM_DECLSPEC int mca_spml_base_oob_get_mkeys(int pe,
+                                                uint32_t seg,
+                                                mca_spml_mkey_t *mkeys);
+
+/*
+ * MCA framework
+ */
+OSHMEM_DECLSPEC extern mca_base_framework_t oshmem_spml_base_framework;
+
+/* ******************************************************************** */
+#ifdef __BASE_FILE__
+#define __SPML_FILE__ __BASE_FILE__
+#else
+#define __SPML_FILE__ __FILE__
+#endif
+
+#define SPML_VERBOSE(level, format, ...) \
+    opal_output_verbose(level, oshmem_spml_base_framework.framework_output, "%s:%d - %s() " format, \
+                        __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
+
+#define SPML_ERROR(format, ... ) \
+    opal_output_verbose(0, oshmem_spml_base_framework.framework_output, "Error: %s:%d - %s() " format, \
+                        __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
+
+END_C_DECLS
+
+#endif /* MCA_SPML_BASE_H */
diff --git a/oshmem/mca/spml/base/spml_base.c b/oshmem/mca/spml/base/spml_base.c
new file mode 100644
index 0000000000..b2b6d7d3cd
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include "ompi/mca/bml/base/base.h"
+#include "opal/datatype/opal_convertor.h"
+#include "orte/include/orte/types.h"
+#include "orte/runtime/orte_globals.h"
+#include "oshmem/mca/spml/yoda/spml_yoda.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/spml/base/base.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_putreq.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_getreq.h"
+#include "ompi/mca/btl/btl.h"
+
+#define SPML_BASE_DO_CMP(res, addr, op, val) \
+    switch((op)) { \
+    case SHMEM_CMP_EQ: \
+        res = *(addr) == (val) ? 1 : 0; \
+        break; \
+    case SHMEM_CMP_NE: \
+        res = *(addr) != (val) ? 1 : 0; \
+        break; \
+    case SHMEM_CMP_GT: \
+        res = *(addr) > (val) ? 1 : 0; \
+        break; \
+    case SHMEM_CMP_LE: \
+        res = *(addr) <= (val) ? 1 : 0; \
+        break; \
+    case SHMEM_CMP_LT: \
+        res = *(addr) < (val) ? 1 : 0; \
+        break; \
+    case SHMEM_CMP_GE: \
+        res = *(addr) >= (val) ? 1 : 0; \
+        break; \
+    }
+
+/* Arguments are passed positionally as (result, address, compare-op, value),
+ * matching SPML_BASE_DO_CMP above. */
+#define SPML_BASE_DO_WAIT(cond, addr, op, val) \
+    do { \
+        SPML_BASE_DO_CMP(cond, addr, op, val); \
+        opal_progress(); \
+    } while (cond == 0);
+
+/**
+ * Wait for data delivery.
+ * Poll on the variable at addr until its comparison (under cmp) with value
+ * becomes true.
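+ *
+ * Illustrative use (hypothetical values): with a symmetric int flag that a
+ * remote PE sets to 1,
+ *
+ *   int one = 1;
+ *   mca_spml_base_wait(&flag, SHMEM_CMP_EQ, &one, SHMEM_INT);
+ *
+ * spins, driving opal_progress(), until flag == 1; this is the machinery
+ * behind shmem_int_wait_until(&flag, SHMEM_CMP_EQ, 1).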
+ */
+int mca_spml_base_wait(void* addr, int cmp, void* value, int datatype)
+{
+    int *int_addr, int_value;
+    long *long_addr, long_value;
+    short *short_addr, short_value;
+    long long *longlong_addr, longlong_value;
+    ompi_fortran_integer_t *fint_addr, fint_value;
+    ompi_fortran_integer4_t *fint4_addr, fint4_value;
+    ompi_fortran_integer8_t *fint8_addr, fint8_value;
+    int res = 0;
+
+    switch (datatype) {
+
+    /* Int */
+    case SHMEM_INT:
+        int_value = *(int*) value;
+        int_addr = (int*) addr;
+        SPML_BASE_DO_WAIT(res, int_addr, cmp, int_value);
+        break;
+
+    /* Short */
+    case SHMEM_SHORT:
+        short_value = *(short*) value;
+        short_addr = (short*) addr;
+        SPML_BASE_DO_WAIT(res, short_addr, cmp, short_value);
+        break;
+
+    /* Long */
+    case SHMEM_LONG:
+        long_value = *(long*) value;
+        long_addr = (long*) addr;
+        SPML_BASE_DO_WAIT(res, long_addr, cmp, long_value);
+        break;
+
+    /* Long-Long */
+    case SHMEM_LLONG:
+        longlong_value = *(long long*) value;
+        longlong_addr = (long long*) addr;
+        SPML_BASE_DO_WAIT(res, longlong_addr, cmp, longlong_value);
+        break;
+
+    /* C equivalent of Fortran integer type */
+    case SHMEM_FINT:
+        fint_value = *(ompi_fortran_integer_t *) value;
+        fint_addr = (ompi_fortran_integer_t *) addr;
+        SPML_BASE_DO_WAIT(res, fint_addr, cmp, fint_value);
+        break;
+
+    /* C equivalent of Fortran int4 type */
+    case SHMEM_FINT4:
+        fint4_value = *(ompi_fortran_integer4_t *) value;
+        fint4_addr = (ompi_fortran_integer4_t *) addr;
+        SPML_BASE_DO_WAIT(res, fint4_addr, cmp, fint4_value);
+        break;
+
+    /* C equivalent of Fortran int8 type */
+    case SHMEM_FINT8:
+        fint8_value = *(ompi_fortran_integer8_t *) value;
+        fint8_addr = (ompi_fortran_integer8_t *) addr;
+        SPML_BASE_DO_WAIT(res, fint8_addr, cmp, fint8_value);
+        break;
+    }
+
+    return OSHMEM_SUCCESS;
+}
+
+/**
+ * Waits for completion of a non-blocking put or get issued by the calling PE.
+ * This function waits for completion of a single non-blocking transfer issued by
+ * shmem_put_nb() or shmem_get_nb() (or related functions) when called with the
+ * address of a completion handle.
+ * Completion of the call to shmem_wait_nb() ensures that a non-blocking transfer has
+ * completed. The source buffer may then be reused.
+ */
+int mca_spml_base_wait_nb(void* handle)
+{
+    /* TODO: fence() is a stopgap here; a more accurate implementation
+     * would complete only the transfer behind this handle.
+     * Use shmem_quiet() (or a function calling shmem_quiet()) or
+     * shmem_wait_nb() to force completion of transfers for non-blocking
+     * operations.
+     */
+    MCA_SPML_CALL(fence());
+
+    return OSHMEM_SUCCESS;
+}
+
+int mca_spml_base_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys)
+{
+    return OSHMEM_ERROR;
+}
+
diff --git a/oshmem/mca/spml/base/spml_base_atomicreq.c b/oshmem/mca/spml/base/spml_base_atomicreq.c
new file mode 100644
index 0000000000..9652022fd5
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_atomicreq.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+
+#include "oshmem_config.h"
+#include "oshmem/types.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/spml_base_atomicreq.h"
+
+static void mca_spml_base_atomic_request_construct(mca_spml_base_atomic_request_t*);
+static void mca_spml_base_atomic_request_destruct(mca_spml_base_atomic_request_t*);
+
+OBJ_CLASS_INSTANCE( mca_spml_base_atomic_request_t,
+                    mca_spml_base_request_t,
+                    mca_spml_base_atomic_request_construct,
+                    mca_spml_base_atomic_request_destruct);
+
+static void mca_spml_base_atomic_request_construct(mca_spml_base_atomic_request_t* request)
+{
+    /* no need to reinit for every atomic -- never changes */
+    request->req_base.req_type = MCA_SPML_REQUEST_ATOMIC_CAS;
+    OBJ_CONSTRUCT(&request->req_base.req_convertor, opal_convertor_t);
+}
+
+static void mca_spml_base_atomic_request_destruct(mca_spml_base_atomic_request_t* request)
+{
+    /* For each request the convertor gets cleaned after each message
+     * (in the base _FINI macro). Therefore, as the convertor is a static object
+     * we don't have to call OBJ_DESTRUCT here.
+     */
+}
+
diff --git a/oshmem/mca/spml/base/spml_base_atomicreq.h b/oshmem/mca/spml/base/spml_base_atomicreq.h
new file mode 100644
index 0000000000..99600a0827
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_atomicreq.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_SPML_BASE_ATOMIC_REQUEST_H
+#define MCA_SPML_BASE_ATOMIC_REQUEST_H
+
+#include "oshmem_config.h"
+#include "oshmem/mca/spml/base/spml_base_request.h"
+#include "ompi/peruse/peruse-internal.h"
+
+BEGIN_C_DECLS
+
+/**
+ * Base type for atomic requests.
+ */
+struct mca_spml_base_atomic_request_t {
+    mca_spml_base_request_t req_base; /**< base request */
+    size_t req_bytes_packed; /**< size of virtual heap memory variable operated on */
+};
+typedef struct mca_spml_base_atomic_request_t mca_spml_base_atomic_request_t;
+
+OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_spml_base_atomic_request_t);
+
+/**
+ * Initialize an atomic request with call parameters.
+ *
+ * @param request (IN)    Atomic request.
+ * @param addr (IN)       User buffer.
+ * @param count (IN)      Number of bytes.
+ * @param src (IN)        Source rank w/in the communicator.
+ * @param comm (IN)       Communicator.
+ * @param persistent (IN) Is this a persistent request.
+ */
+
+#define MCA_SPML_BASE_ATOMIC_REQUEST_INIT( \
+    request, \
+    addr, \
+    count, \
+    src, \
+    comm, \
+    persistent) \
+{ \
+    /* increment reference count on communicator */ \
+    OBJ_RETAIN(comm); \
+ \
+    OSHMEM_REQUEST_INIT(&(request)->req_base.req_oshmem, persistent); \
+    (request)->req_base.req_oshmem.req_shmem_object.comm = comm; \
+    (request)->req_bytes_packed = 0; \
+    (request)->req_base.req_addr = addr; \
+    (request)->req_base.req_count = count; \
+    (request)->req_base.req_peer = src; \
+    (request)->req_base.req_comm = comm; \
+    (request)->req_base.req_proc = NULL; \
+    (request)->req_base.req_sequence = 0; \
+    /* What about req_type ? */ \
+    (request)->req_base.req_spml_complete = OPAL_INT_TO_BOOL(persistent); \
+    (request)->req_base.req_free_called = false; \
+}
+/**
+ *
+ *
+ */
+#define MCA_SPML_BASE_ATOMIC_START( request ) \
+    do { \
+        (request)->req_spml_complete = false; \
+ \
+        (request)->req_oshmem.req_status.SHMEM_SOURCE = SHMEM_ANY_SOURCE; \
+        (request)->req_oshmem.req_status.SHMEM_ERROR = OSHMEM_SUCCESS; \
+        (request)->req_oshmem.req_status._count = 0; \
+        (request)->req_oshmem.req_status._cancelled = 0; \
+ \
+        (request)->req_oshmem.req_complete = false; \
+        (request)->req_oshmem.req_state = OSHMEM_REQUEST_ACTIVE; \
+    } while (0)
+
+/**
+ * Return an atomic request. Handle the release of the communicator and the
+ * attached datatype.
+ *
+ * @param request (IN) Atomic request.
+ */
+#define MCA_SPML_BASE_ATOMIC_REQUEST_FINI( request ) \
+    do { \
+        OSHMEM_REQUEST_FINI(&(request)->req_base.req_oshmem); \
+        OBJ_RELEASE( (request)->req_base.req_comm); \
+        opal_convertor_cleanup( &((request)->req_base.req_convertor) ); \
+    } while (0)
+
+END_C_DECLS
+
+#endif
+
diff --git a/oshmem/mca/spml/base/spml_base_frame.c b/oshmem/mca/spml/base/spml_base_frame.c
new file mode 100644
index 0000000000..2edcd1414e
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_frame.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include <stdio.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif /* HAVE_UNISTD_H */
+#include "opal/mca/mca.h"
+#include "opal/util/output.h"
+#include "opal/mca/base/base.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/base.h"
+#include "oshmem/mca/spml/base/spml_base_request.h"
+
+/*
+ * The following file was created by configure. It contains extern
+ * statements and the definition of an array of pointers to each
+ * component's public mca_base_component_t struct.
+ */
+
+#include "oshmem/mca/spml/base/static-components.h"
+
+#define xstringify(spml) #spml
+#define stringify(spml) xstringify(spml)
+
+/*
+ * Global variables
+ */
+mca_spml_base_module_t mca_spml;
+
+mca_spml_base_component_t mca_spml_base_selected_component;
+opal_pointer_array_t mca_spml_base_spml;
+
+
+static int mca_spml_base_register(mca_base_register_flag_t flags)
+{
+    return OMPI_SUCCESS;
+}
+
+int mca_spml_base_finalize(void)
+{
+    if (NULL != mca_spml_base_selected_component.spmlm_finalize) {
+        return mca_spml_base_selected_component.spmlm_finalize();
+    }
+    return OSHMEM_SUCCESS;
+}
+
+static int mca_spml_base_close(void)
+{
+    int i, j;
+
+    /**
+     * Destruct the send and receive queues. The ompi_free_list_t destructor
+     * will return the memory to the mpool, so this has to be done before the
+     * mpool gets released by the SPML close function.
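+     * (The OBJ_DESTRUCT calls below drain each list and hand every item
+     * back to the mpool it was drawn from; only afterwards is it safe for
+     * the selected component's close path to tear the mpool down.)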
+     */
+    OBJ_DESTRUCT(&mca_spml_base_put_requests);
+    OBJ_DESTRUCT(&mca_spml_base_get_requests);
+
+    /* Free all the strings in the array */
+    j = opal_pointer_array_get_size(&mca_spml_base_spml);
+    for (i = 0; i < j; i++) {
+        char * tmp_val;
+        tmp_val = (char *) opal_pointer_array_get_item(&mca_spml_base_spml, i);
+        if (NULL == tmp_val) {
+            continue;
+        }
+        free(tmp_val);
+    }
+    OBJ_DESTRUCT(&mca_spml_base_spml);
+
+    /* Close all remaining available components */
+    return mca_base_framework_components_close(&oshmem_spml_base_framework, NULL);
+}
+
+/**
+ * Function for finding and opening either all MCA components, or the one
+ * that was specifically requested via a MCA parameter.
+ */
+static int mca_spml_base_open(mca_base_open_flag_t flags)
+{
+    /**
+     * Construct the send and receive request queues. There are 2 reasons to do it
+     * here. First, as they are globals it's better to construct them in one common
+     * place. Second, in order to be able to allow the external debuggers to show
+     * their content, they should get constructed as soon as possible once the MPI
+     * process is started.
+     */
+    OBJ_CONSTRUCT(&mca_spml_base_put_requests, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_spml_base_get_requests, ompi_free_list_t);
+
+    OBJ_CONSTRUCT(&mca_spml_base_spml, opal_pointer_array_t);
+
+    /* Open up all available components */
+    if (OPAL_SUCCESS !=
+        mca_base_framework_components_open(&oshmem_spml_base_framework, flags)) {
+        return OSHMEM_ERROR;
+    }
+
+    /* Set a sentinel in case we don't select any components (e.g.,
+       ompi_info) */
+
+    mca_spml_base_selected_component.spmlm_finalize = NULL;
+
+    /**
+     * Right now our selection of BTLs is completely broken. If we have
+     * multiple SPMLs that use BTLs, then we will open all BTLs several times,
+     * leading to undefined behavior. The simplest solution, at least until we
+     * figure out the correct way to do it, is to seed the mca_spml_base_spml
+     * array with a default SPML that uses BTLs, plus any other SPMLs that do
+     * not.
+     */
+
+#if MCA_ompi_pml_DIRECT_CALL
+    opal_pointer_array_add(&mca_spml_base_spml,
+                           strdup(stringify(MCA_oshmem_spml_DIRECT_CALL_COMPONENT)));
+#else
+    {
+        const char **default_spml = NULL;
+        int var_id;
+
+        var_id = mca_base_var_find("oshmem", "spml", NULL, NULL);
+        mca_base_var_get_value(var_id, &default_spml, NULL, NULL);
+
+        if( (NULL == default_spml || NULL == default_spml[0] ||
+             0 == strlen(default_spml[0])) || (default_spml[0][0] == '^') ) {
+#ifdef OSHMEM_HAS_IKRIT
+            opal_pointer_array_add(&mca_spml_base_spml, strdup("ikrit"));
+#endif
+            opal_pointer_array_add(&mca_spml_base_spml, strdup("yoda"));
+        } else {
+            opal_pointer_array_add(&mca_spml_base_spml, strdup(default_spml[0]));
+        }
+    }
+#endif
+
+    return OSHMEM_SUCCESS;
+}
+
+MCA_BASE_FRAMEWORK_DECLARE(oshmem, spml,
+                           "OSHMEM SPML",
+                           mca_spml_base_register,
+                           mca_spml_base_open,
+                           mca_spml_base_close,
+                           mca_spml_base_static_components,
+                           0);
diff --git a/oshmem/mca/spml/base/spml_base_getreq.c b/oshmem/mca/spml/base/spml_base_getreq.c
new file mode 100644
index 0000000000..8df201a600
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_getreq.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+
+#include "oshmem_config.h"
+#include "oshmem/types.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/spml_base_getreq.h"
+
+static void mca_spml_base_get_request_construct(mca_spml_base_get_request_t*);
+static void mca_spml_base_get_request_destruct(mca_spml_base_get_request_t*);
+
+OBJ_CLASS_INSTANCE( mca_spml_base_get_request_t,
+                    mca_spml_base_request_t,
+                    mca_spml_base_get_request_construct,
+                    mca_spml_base_get_request_destruct);
+
+static void mca_spml_base_get_request_construct(mca_spml_base_get_request_t* request)
+{
+    /* no need to reinit for every get -- never changes */
+    request->req_base.req_type = MCA_SPML_REQUEST_GET;
+    OBJ_CONSTRUCT(&request->req_base.req_convertor, opal_convertor_t);
+}
+
+static void mca_spml_base_get_request_destruct(mca_spml_base_get_request_t* request)
+{
+    /* For each request the convertor gets cleaned after each message
+     * (in the base _FINI macro). Therefore, as the convertor is a static object
+     * we don't have to call OBJ_DESTRUCT here.
+     */
+}
+
diff --git a/oshmem/mca/spml/base/spml_base_getreq.h b/oshmem/mca/spml/base/spml_base_getreq.h
new file mode 100644
index 0000000000..39e3948303
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_getreq.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_SPML_BASE_GET_REQUEST_H
+#define MCA_SPML_BASE_GET_REQUEST_H
+
+#include "oshmem_config.h"
+#include "oshmem/mca/spml/base/spml_base_request.h"
+#include "ompi/peruse/peruse-internal.h"
+
+BEGIN_C_DECLS
+
+/**
+ * Base type for get requests.
+ */
+struct mca_spml_base_get_request_t {
+    mca_spml_base_request_t req_base; /**< base request */
+    void *req_addr; /**< pointer to recv buffer on the local PE - not necessarily an application buffer */
+    size_t req_bytes_packed; /**< size of message being read */
+};
+typedef struct mca_spml_base_get_request_t mca_spml_base_get_request_t;
+OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_spml_base_get_request_t);
+
+/**
+ * Initialize a get request.
+ *
+ * @param request (IN)    Pointer to the Get request.
+ * @param addr (IN)       User buffer.
+ * @param count (IN)      Number of bytes.
+ * @param peer (IN)       Rank w/in the communicator where the data is read from.
+ * @param persistent (IN) Is this a persistent request.
+ */
+#define MCA_SPML_BASE_GET_REQUEST_INIT( request, \
+                                        addr, \
+                                        count, \
+                                        peer, \
+                                        persistent) \
+    { \
+        OSHMEM_REQUEST_INIT(&(request)->req_base.req_oshmem, persistent); \
+        (request)->req_addr = addr; \
+        (request)->req_base.req_addr = addr; \
+        (request)->req_base.req_count = count; \
+        (request)->req_base.req_peer = (int32_t)peer; \
+        (request)->req_base.req_spml_complete = OPAL_INT_TO_BOOL(persistent); \
+        (request)->req_base.req_free_called = false; \
+        (request)->req_base.req_oshmem.req_status._cancelled = 0; \
+        (request)->req_bytes_packed = 0; \
+}
+
+/**
+ *
+ *
+ */
+#define MCA_SPML_BASE_GET_START( request ) \
+    do { \
+        (request)->req_spml_complete = false; \
+ \
+        (request)->req_oshmem.req_status.SHMEM_SOURCE = SHMEM_ANY_SOURCE; \
+        (request)->req_oshmem.req_status.SHMEM_ERROR = OSHMEM_SUCCESS; \
+        (request)->req_oshmem.req_status._count = 0; \
+        (request)->req_oshmem.req_status._cancelled = 0; \
+ \
+        (request)->req_oshmem.req_complete = false; \
+        (request)->req_oshmem.req_state = OSHMEM_REQUEST_ACTIVE; \
+    } while (0)
+
+/**
+ * Return a Get request and clean up the convertor attached to it.
+ *
+ * @param request (IN) Get request.
+ */
+#define MCA_SPML_BASE_GET_REQUEST_FINI( request ) \
+    do { \
+        OSHMEM_REQUEST_FINI(&(request)->req_base.req_oshmem); \
+        opal_convertor_cleanup( &((request)->req_base.req_convertor) ); \
+    } while (0)
+
+END_C_DECLS
+
+#endif
+
diff --git a/oshmem/mca/spml/base/spml_base_putreq.c b/oshmem/mca/spml/base/spml_base_putreq.c
new file mode 100644
index 0000000000..2353699493
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_putreq.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "oshmem_config.h"
+#include <string.h>
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/spml_base_putreq.h"
+
+static void mca_spml_base_put_request_construct(mca_spml_base_put_request_t* req);
+static void mca_spml_base_put_request_destruct(mca_spml_base_put_request_t* req);
+
+OBJ_CLASS_INSTANCE( mca_spml_base_put_request_t,
+                    mca_spml_base_request_t,
+                    mca_spml_base_put_request_construct,
+                    mca_spml_base_put_request_destruct);
+
+static void mca_spml_base_put_request_construct(mca_spml_base_put_request_t* request)
+{
+    /* no need to reinit for every send -- never changes */
+    request->req_base.req_type = MCA_SPML_REQUEST_PUT;
+}
+
+static void mca_spml_base_put_request_destruct(mca_spml_base_put_request_t* req)
+{
+    /* For each request the convertor gets cleaned after each message
+     * (in the base _FINI macro). Therefore, as the convertor is a static object
+     * we don't have to call OBJ_DESTRUCT here.
+     */
+}
+
diff --git a/oshmem/mca/spml/base/spml_base_putreq.h b/oshmem/mca/spml/base/spml_base_putreq.h
new file mode 100644
index 0000000000..adf7481453
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_putreq.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_SPML_BASE_PUT_REQUEST_H
+#define MCA_SPML_BASE_PUT_REQUEST_H
+
+#include "oshmem_config.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/spml_base_request.h"
+#include "ompi/peruse/peruse-internal.h"
+
+BEGIN_C_DECLS
+
+/**
+ * Base type for send requests
+ */
+struct mca_spml_base_put_request_t {
+    mca_spml_base_request_t req_base; /**< base request type - common data structure for use by wait/test */
+    void *req_addr; /**< pointer to send buffer - may not be application buffer */
+    size_t req_bytes_packed; /**< packed size of a message given the datatype and count */
+};
+typedef struct mca_spml_base_put_request_t mca_spml_base_put_request_t;
+
+OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION( mca_spml_base_put_request_t);
+
+/**
+ * Initialize a send request with call parameters.
+ *
+ * @param request (IN)    Send request
+ * @param addr (IN)       User buffer
+ * @param count (IN)      Number of bytes.
+ * @param peer (IN)       Destination rank
+ * @param persistent (IN) Is request persistent.
+ *
+ * Perform any one-time initialization. Note that per-use initialization
+ * is done in the send request start routine.
+ */
+
+#define MCA_SPML_BASE_PUT_REQUEST_INIT( request, \
+                                        addr, \
+                                        count, \
+                                        peer, \
+                                        persistent) \
+    { \
+        OSHMEM_REQUEST_INIT(&(request)->req_base.req_oshmem, persistent); \
+        (request)->req_addr = addr; \
+        (request)->req_base.req_addr = addr; \
+        (request)->req_base.req_count = count; \
+        (request)->req_base.req_peer = (int32_t)peer; \
+        (request)->req_base.req_spml_complete = OPAL_INT_TO_BOOL(persistent); \
+        (request)->req_base.req_free_called = false; \
+        (request)->req_base.req_oshmem.req_status._cancelled = 0; \
+        (request)->req_bytes_packed = 0; \
+ \
+    }
+
+/**
+ * Mark the request as started from the SPML base point of view.
+ *
+ * @param request (IN)    The put request.
+ */
+
+#define MCA_SPML_BASE_PUT_START( request ) \
+    do { \
+        (request)->req_spml_complete = false; \
+        (request)->req_oshmem.req_complete = false; \
+        (request)->req_oshmem.req_state = OSHMEM_REQUEST_ACTIVE; \
+        (request)->req_oshmem.req_status._cancelled = 0; \
+    } while (0)
+
+/**
+ * Finalize the request and clean up the attached convertor.
+ *
+ * @param request (IN)    The put request.
+ */
+
+#define MCA_SPML_BASE_PUT_REQUEST_FINI( request ) \
+    do { \
+        OSHMEM_REQUEST_FINI(&(request)->req_base.req_oshmem); \
+        opal_convertor_cleanup( &((request)->req_base.req_convertor) ); \
+    } while (0)
+
+END_C_DECLS
+
+#endif
+
diff --git a/oshmem/mca/spml/base/spml_base_request.c b/oshmem/mca/spml/base/spml_base_request.c
new file mode 100644
index 0000000000..80f7b2f0ed
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_request.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/spml_base_request.h"
+
+/**
+ * If you wonder why these 2 freelists are declared here, read the comment
+ * in the spml_base_request.h file.
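+ * In short: the lists live here so that every SPML shares a single
+ * definition, but their sizing is only known to the component that wins
+ * the election, which initializes them along these lines (a sketch --
+ * the request type and list parameters belong to the selected SPML and
+ * are illustrative here):
+ *
+ *   ompi_free_list_init_new(&mca_spml_base_put_requests,
+ *                           sizeof(my_spml_put_request_t),
+ *                           opal_cache_line_size,
+ *                           OBJ_CLASS(my_spml_put_request_t),
+ *                           0, opal_cache_line_size,
+ *                           free_list_num, free_list_max, free_list_inc,
+ *                           NULL);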
+ */
+ompi_free_list_t mca_spml_base_put_requests/* = {{{0}}}*/;
+ompi_free_list_t mca_spml_base_get_requests /*= {{{0}}}*/;
+ompi_free_list_t mca_spml_base_atomic_requests = { { { 0 } } };
+
+static void mca_spml_base_request_construct(mca_spml_base_request_t* req)
+{
+    req->req_oshmem.req_type = OSHMEM_REQUEST_SPML;
+}
+
+static void mca_spml_base_request_destruct(mca_spml_base_request_t* req)
+{
+}
+
+OBJ_CLASS_INSTANCE(mca_spml_base_request_t,
+                   oshmem_request_t,
+                   mca_spml_base_request_construct,
+                   mca_spml_base_request_destruct);
+
diff --git a/oshmem/mca/spml/base/spml_base_request.h b/oshmem/mca/spml/base/spml_base_request.h
new file mode 100644
index 0000000000..63df3b8f56
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_request.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_SPML_BASE_REQUEST_H
+#define MCA_SPML_BASE_REQUEST_H
+
+#include "oshmem_config.h"
+#include "oshmem/request/request.h" /* TODO: define */
+
+#include "opal/datatype/opal_convertor.h"
+
+#include "ompi/class/ompi_free_list.h"
+#include "ompi/mca/pml/ob1/pml_ob1_comm.h"
+
+BEGIN_C_DECLS
+
+/**
+ * External lists for the requests. They are declared as lists of
+ * the basic request type, which allows each SPML to overload
+ * the list. Beware: these free lists have to be initialized
+ * directly by the SPML that wins the SPML election.
+ */
+OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_put_requests;
+OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_get_requests;
+OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_send_requests;
+OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_recv_requests;
+OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_atomic_requests;
+
+/* TODO: Consider adding request lists for
+ * 1. Non-blocking requests with a NULL handle.
+ * 2. Non-blocking requests with a non-NULL handle.
+ * 3. Non-completed puts (for small msgs).
+ */
+
+/**
+ * Types of one sided requests.
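+ * Each request records its own type in req_type at construction time;
+ * the put request constructor above, for instance, does
+ *
+ *   request->req_base.req_type = MCA_SPML_REQUEST_PUT;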
+ */
+typedef enum {
+    MCA_SPML_REQUEST_NULL,
+    MCA_SPML_REQUEST_PUT,        /* Put request */
+    MCA_SPML_REQUEST_GET,        /* Get Request */
+    MCA_SPML_REQUEST_SEND,       /* Send Request */
+    MCA_SPML_REQUEST_RECV,       /* Receive Request */
+    MCA_SPML_REQUEST_ATOMIC_CAS, /* Atomic Compare-And-Swap request */
+    MCA_SPML_REQUEST_ATOMIC_FAAD /* Atomic Fetch-And-Add request */
+} mca_spml_base_request_type_t;
+
+/**
+ * Base type for SPML one sided requests
+ */
+struct mca_spml_base_request_t {
+
+    oshmem_request_t req_oshmem; /**< base request */
+    volatile bool req_spml_complete; /**< flag indicating if the one sided layer is done with this request */
+    mca_spml_base_request_type_t req_type; /**< SHMEM request type */
+    volatile bool req_free_called; /**< flag indicating if the user has freed this request */
+    opal_convertor_t req_convertor; /**< always need the convertor */
+
+    void *req_addr; /**< pointer to application buffer */
+    size_t req_count; /**< count of user datatype elements *//* TODO: Need to remove since we are going to remove datatype*/
+    int32_t req_peer; /**< peer process - rank of process executing the parallel program */
+    oshmem_proc_t* req_proc; /**< peer process */
+    uint64_t req_sequence; /**< sequence number for shmem one sided ordering */
+};
+typedef struct mca_spml_base_request_t mca_spml_base_request_t;
+
+OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_spml_base_request_t);
+
+END_C_DECLS
+
+#endif
+
diff --git a/oshmem/mca/spml/base/spml_base_request_dbg.h b/oshmem/mca/spml/base/spml_base_request_dbg.h
new file mode 100644
index 0000000000..cc0ea3c9fd
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_request_dbg.h
@@ -0,0 +1,25 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#ifndef MCA_SPML_BASE_REQUEST_DBG_H
+#define MCA_SPML_BASE_REQUEST_DBG_H
+
+/**
+ * Type of request.
+ */
+typedef enum {
+    MCA_SPML_REQUEST_NULL,
+    MCA_SPML_REQUEST_PUT,        /* Added */
+    MCA_SPML_REQUEST_GET,        /* Added */
+    MCA_SPML_REQUEST_ATOMIC_CAS, /* Added */
+    MCA_SPML_REQUEST_ATOMIC_FAAD /* Added */
+} mca_spml_base_request_type_t;
+
+#endif /* MCA_SPML_BASE_REQUEST_DBG_H */
diff --git a/oshmem/mca/spml/base/spml_base_select.c b/oshmem/mca/spml/base/spml_base_select.c
new file mode 100644
index 0000000000..270c6e5d5a
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_select.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#include "opal/util/output.h"
+#include "opal/mca/base/base.h"
+#include "opal/runtime/opal.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/util/show_help.h"
+#include "orte/util/name_fns.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/base.h"
+
+
+typedef struct opened_component_t {
+    opal_list_item_t super;
+    mca_spml_base_component_t *om_component;
+} opened_component_t;
+
+
+/**
+ * Function for selecting one component from all those that are
+ * available.
+ *
+ * Call the init function on all available components and get their
+ * priorities. Select the component with the highest priority. All
+ * other components will be closed and unloaded.
The selected component + * will have all of its function pointers saved and returned to the + * caller. + */ +int mca_spml_base_select(bool enable_progress_threads, bool enable_mpi_threads) +{ + int i, priority = 0, best_priority = 0, num_spml = 0; + opal_list_item_t *item = NULL; + mca_base_component_list_item_t *cli = NULL; + mca_spml_base_component_t *component = NULL, *best_component = NULL; + mca_spml_base_module_t *module = NULL, *best_module = NULL; + opal_list_t opened; + opened_component_t *om = NULL; + bool found_spml; + + /* Traverse the list of available components; call their init + functions. */ + + best_priority = -1; + best_component = NULL; + module = NULL; + OBJ_CONSTRUCT(&opened, opal_list_t); + OPAL_LIST_FOREACH(cli, &oshmem_spml_base_framework.framework_components, mca_base_component_list_item_t) { + component = (mca_spml_base_component_t *) cli->cli_component; + + /* if there is an include list - item must be in the list to be included */ + found_spml = false; + for( i = 0; i < opal_pointer_array_get_size(&mca_spml_base_spml); i++) { + char * tmp_val = NULL; + tmp_val = (char *) opal_pointer_array_get_item(&mca_spml_base_spml, i); + if( NULL == tmp_val) { + continue; + } + + if(0 == strncmp(component->spmlm_version.mca_component_name, + tmp_val, strlen(component->spmlm_version.mca_component_name)) ) { + found_spml = true; + break; + } + } + + if (!found_spml + && opal_pointer_array_get_size(&mca_spml_base_spml)) { + SPML_VERBOSE( 10, + "select: component %s not in the include list", + component->spmlm_version.mca_component_name); + + continue; + } + + /* if there is no init function - ignore it */ + if (NULL == component->spmlm_init) { + SPML_VERBOSE( 10, + "select: no init function; ignoring component %s", + component->spmlm_version.mca_component_name); + continue; + } + + /* this is a spml that could be considered */ + num_spml++; + + /* Init component to get its priority */ + SPML_VERBOSE( 10, + "select: initializing %s component %s", + component->spmlm_version.mca_type_name, component->spmlm_version.mca_component_name); + priority = best_priority; + module = component->spmlm_init(&priority, + enable_progress_threads, + enable_mpi_threads); + if (NULL == module) { + SPML_VERBOSE( 10, + "select: init returned failure for component %s", + component->spmlm_version.mca_component_name); + continue; + } + + SPML_VERBOSE( 10, "select: init returned priority %d", priority); + + if (priority > best_priority) { + best_priority = priority; + best_component = component; + best_module = module; + } + + om = (opened_component_t*) malloc(sizeof(opened_component_t)); + if (NULL == om) { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + OBJ_CONSTRUCT(om, opal_list_item_t); + om->om_component = component; + opal_list_append(&opened, (opal_list_item_t*) om); + } + + /* Finished querying all components. Check for the bozo case. */ + + if (NULL == best_component) { + orte_show_help("help-shmem-mca.txt", + "find-available:none-found", + true, + "spml"); + for (i = 0; i < opal_pointer_array_get_size(&mca_spml_base_spml); i++) { + char * tmp_val = NULL; + tmp_val = (char *) opal_pointer_array_get_item(&mca_spml_base_spml, + i); + if (NULL == tmp_val) { + continue; + } + orte_errmgr.abort(1, "SPML %s cannot be selected", tmp_val); + } + if (0 == i) { + orte_errmgr.abort(2, + "No spml component available. 
This shouldn't happen."); + } + } + + SPML_VERBOSE( 10, + "selected %s best priority %d\n", + best_component->spmlm_version.mca_component_name, best_priority); + + /* Finalize all non-selected components */ + for (item = opal_list_remove_first(&opened); + NULL != item; + item = opal_list_remove_first(&opened)) { + om = (opened_component_t *) item; + + if (om->om_component != best_component) { + /* Finalize */ + + if (NULL != om->om_component->spmlm_finalize) { + + /* Blatently ignore the return code (what would we do to + recover, anyway? This component is going away, so errors + don't matter anymore) */ + + om->om_component->spmlm_finalize(); + SPML_VERBOSE(10, + "select: component %s not selected / finalized", + om->om_component->spmlm_version.mca_component_name); + } + } + OBJ_DESTRUCT( om); + free(om); + } + OBJ_DESTRUCT( &opened); + + /* Save the winner */ + + mca_spml_base_selected_component = *best_component; + mca_spml = *best_module; + SPML_VERBOSE( 10, + "select: component %s selected", + mca_spml_base_selected_component.spmlm_version.mca_component_name); + + /* This base function closes, unloads, and removes from the + available list all unselected components. The available list will + contain only the selected component. */ + + mca_base_components_close(oshmem_spml_base_framework.framework_output, + &oshmem_spml_base_framework.framework_components, + (mca_base_component_t *) best_component); + + /* All done */ + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/spml/configure.m4 b/oshmem/mca/spml/configure.m4 new file mode 100644 index 0000000000..fb2227e1b3 --- /dev/null +++ b/oshmem/mca/spml/configure.m4 @@ -0,0 +1,19 @@ +# -*- shell-script -*- +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AC_DEFUN([MCA_oshmem_spml_CONFIG],[ + # configure all the components + MCA_CONFIGURE_FRAMEWORK($1, $2, 1) + + # this is a direct callable component, so set that up. + MCA_SETUP_DIRECT_CALL($1, $2) +]) diff --git a/oshmem/mca/spml/ikrit/Makefile.am b/oshmem/mca/spml/ikrit/Makefile.am new file mode 100644 index 0000000000..9f811a999f --- /dev/null +++ b/oshmem/mca/spml/ikrit/Makefile.am @@ -0,0 +1,42 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = \ + help-shmem-spml-ikrit.txt + +AM_CFLAGS = $(OSHMEM_CFLAGS) +AM_CPPFLAGS = $(spml_ikrit_CPPFLAGS) + +ikrit_sources = \ + spml_ikrit.c \ + spml_ikrit.h \ + spml_ikrit_component.c \ + spml_ikrit_component.h + +if MCA_BUILD_oshmem_spml_ikrit_DSO +component_noinst = +component_install = mca_spml_ikrit.la +else +component_noinst = libmca_spml_ikrit.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_spml_ikrit_la_SOURCES = $(ikrit_sources) +mca_spml_ikrit_la_LIBADD = $(spml_ikrit_LIBS) +mca_spml_ikrit_la_LDFLAGS = -module -avoid-version $(spml_ikrit_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_spml_ikrit_la_SOURCES = $(ikrit_sources) +libmca_spml_ikrit_la_LIBADD = $(spml_ikrit_LIBS) +libmca_spml_ikrit_la_LDFLAGS = -module -avoid-version $(spml_ikrit_LDFLAGS) + diff --git a/oshmem/mca/spml/ikrit/configure.m4 b/oshmem/mca/spml/ikrit/configure.m4 new file mode 100644 index 0000000000..2d877e6f07 --- /dev/null +++ b/oshmem/mca/spml/ikrit/configure.m4 @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. 
+ * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +# MCA_oshmem_mtl_mxm_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_oshmem_spml_ikrit_CONFIG],[ + AC_CONFIG_FILES([oshmem/mca/spml/ikrit/Makefile]) + + OMPI_CHECK_MXM([spml_ikrit], + [AC_DEFINE([OSHMEM_HAS_IKRIT], [1], [mxm support is available]) + spml_ikrit_happy="yes"], + [spml_ikrit_happy="no"]) + + AS_IF([test "$spml_ikrit_happy" = "yes"], + [spml_ikrit_WRAPPER_EXTRA_LDFLAGS="$spml_ikrit_LDFLAGS" + spml_ikrit_WRAPPER_EXTRA_LIBS="$spml_ikrit_LIBS" + $1], + [$2]) + + + # substitute in the things needed to build mxm + AC_SUBST([spml_ikrit_CFLAGS]) + AC_SUBST([spml_ikrit_CPPFLAGS]) + AC_SUBST([spml_ikrit_LDFLAGS]) + AC_SUBST([spml_ikrit_LIBS]) +])dnl + diff --git a/oshmem/mca/spml/ikrit/configure.params b/oshmem/mca/spml/ikrit/configure.params new file mode 100644 index 0000000000..9da11ca9e3 --- /dev/null +++ b/oshmem/mca/spml/ikrit/configure.params @@ -0,0 +1,14 @@ +# -*- shell-script -*- +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/spml/ikrit/help-shmem-spml-ikrit.txt b/oshmem/mca/spml/ikrit/help-shmem-spml-ikrit.txt new file mode 100644 index 0000000000..edffaa846c --- /dev/null +++ b/oshmem/mca/spml/ikrit/help-shmem-spml-ikrit.txt @@ -0,0 +1,73 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +[no uuid present] +Error obtaining unique transport key from ORTE (orte_precondition_transports %s +the environment). + + Local host: %s + +[unable to create endpoint] +MXM was unable to create an endpoint. Please make sure that the network link is +active on the node and the hardware is functioning. + + Error: %s + +[unable to get endpoint address] +MXM was unable to get endpoint address + + Error: %s + +[unable to extract endpoint ib address] +MXM was unable to read IB settings for endpoint + + Error: %s + +[unable to extract endpoint local address] +MXM was unable to read shmem settings for endpoint + + Error: %s + +[mxm mq create] +Failed to create MQ for endpoint + + Error: %s + +[errors during mxm_progress] + +Error %s occurred in attempting to make network progress (mxm_progress). + + +[mxm init] +Initialization of MXM library failed. + + Error: %s + +[error polling network] +Error %s occurred in attempting to make network progress (mxm_mq_ipeek). + +[error posting receive] +Unable to post application receive buffer + + Error: %s + Buffer: %p + Length: %d + +[error posting send] +Unable to post application send buffer + + Error: %s + +[error while waiting in send] +Unable while waiting in send + + Error: %s + \ No newline at end of file diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c new file mode 100644 index 0000000000..a2ec191c4f --- /dev/null +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -0,0 +1,1492 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#define _GNU_SOURCE +#include + +#include +#include +#include + +#include "oshmem_config.h" +#include "opal/datatype/opal_convertor.h" +#include "orte/include/orte/types.h" +#include "orte/runtime/orte_globals.h" +#include "oshmem/mca/spml/ikrit/spml_ikrit.h" +#include "oshmem/include/shmem.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/mca/spml/base/spml_base_putreq.h" +#include "oshmem/runtime/runtime.h" +#include "orte/util/show_help.h" + +#include "oshmem/mca/spml/ikrit/spml_ikrit_component.h" + +/* Turn ON/OFF debug output from build (default 0) */ +#ifndef SPML_IKRIT_PUT_DEBUG +#define SPML_IKRIT_PUT_DEBUG 0 +#endif + +typedef struct spml_ikrit_am_hdr { + uint64_t va; +} spml_ikrit_am_hdr_t; + +struct mca_spml_ikrit_put_request { + mca_spml_base_put_request_t req_put; + mxm_send_req_t mxm_req; + int pe; + mxm_req_buffer_t iov[2]; + spml_ikrit_am_hdr_t am_pkt; +}; + +typedef struct mca_spml_ikrit_put_request mca_spml_ikrit_put_request_t; +OBJ_CLASS_DECLARATION(mca_spml_ikrit_put_request_t); + +#if MXM_API < MXM_VERSION(2,0) +static int spml_ikrit_get_ep_address(spml_ikrit_mxm_ep_conn_info_t *ep_info, + mxm_ptl_id_t ptlid) +{ + size_t addrlen; + mxm_error_t err; + + addrlen = sizeof(ep_info->ptl_addr[ptlid]); + err = mxm_ep_address(mca_spml_ikrit.mxm_ep, + ptlid, + (struct sockaddr *) &ep_info->ptl_addr[ptlid], + &addrlen); + if (MXM_OK != err) { + orte_show_help("help-spml-ikrit.txt", + "unable to extract endpoint address", + true, + mxm_error_string(err)); + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +} +#endif + +static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req) +{ + while (!mxm_req_test(req)) + opal_progress(); +} + +static int mca_spml_ikrit_put_request_free(struct oshmem_request_t** request) +{ + mca_spml_ikrit_put_request_t *put_req = + *(mca_spml_ikrit_put_request_t **) request; + + assert(false == put_req->req_put.req_base.req_free_called); + OPAL_THREAD_LOCK(&oshmem_request_lock); + put_req->req_put.req_base.req_free_called = true; + OMPI_FREE_LIST_RETURN_MT( &mca_spml_base_put_requests, + (ompi_free_list_item_t*)put_req); + OPAL_THREAD_UNLOCK(&oshmem_request_lock); + + *request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/ + + return OSHMEM_SUCCESS; +} + +static int mca_spml_ikrit_put_request_cancel(struct oshmem_request_t * request, + int complete) +{ + return OSHMEM_SUCCESS; +} + +static void mca_spml_ikrit_put_request_construct(mca_spml_ikrit_put_request_t* req) +{ + req->req_put.req_base.req_type = MCA_SPML_REQUEST_PUT; + req->req_put.req_base.req_oshmem.req_free = mca_spml_ikrit_put_request_free; + req->req_put.req_base.req_oshmem.req_cancel = + mca_spml_ikrit_put_request_cancel; +} + +static void mca_spml_ikrit_put_request_destruct(mca_spml_ikrit_put_request_t* req) +{ +} + +OBJ_CLASS_INSTANCE( mca_spml_ikrit_put_request_t, + mca_spml_base_put_request_t, + mca_spml_ikrit_put_request_construct, + mca_spml_ikrit_put_request_destruct); + +struct mca_spml_ikrit_get_request { + mca_spml_base_get_request_t req_get; + mxm_send_req_t mxm_req; +}; + +typedef struct mca_spml_ikrit_get_request mca_spml_ikrit_get_request_t; +OBJ_CLASS_DECLARATION(mca_spml_ikrit_get_request_t); + +static int mca_spml_ikrit_get_request_free(struct oshmem_request_t** request) +{ + mca_spml_ikrit_get_request_t *get_req = + *(mca_spml_ikrit_get_request_t **) request; + + 
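+
+    /* "Freeing" a request never calls free(3): the item is returned to
+     * the mca_spml_base_get_requests free list below for later reuse. */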
assert(false == get_req->req_get.req_base.req_free_called); + OPAL_THREAD_LOCK(&oshmem_request_lock); + get_req->req_get.req_base.req_free_called = true; + OMPI_FREE_LIST_RETURN_MT( &mca_spml_base_get_requests, + (ompi_free_list_item_t*)get_req); + OPAL_THREAD_UNLOCK(&oshmem_request_lock); + + *request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/ + + return OSHMEM_SUCCESS; +} + +static int mca_spml_ikrit_get_request_cancel(struct oshmem_request_t * request, + int complete) +{ + return OSHMEM_SUCCESS; +} + +static void mca_spml_ikrit_get_request_construct(mca_spml_ikrit_get_request_t* req) +{ + req->req_get.req_base.req_type = MCA_SPML_REQUEST_PUT; + req->req_get.req_base.req_oshmem.req_free = mca_spml_ikrit_get_request_free; + req->req_get.req_base.req_oshmem.req_cancel = + mca_spml_ikrit_get_request_cancel; +} + +static void mca_spml_ikrit_get_request_destruct(mca_spml_ikrit_get_request_t* req) +{ +} + +OBJ_CLASS_INSTANCE( mca_spml_ikrit_get_request_t, + mca_spml_base_get_request_t, + mca_spml_ikrit_get_request_construct, + mca_spml_ikrit_get_request_destruct); + +int mca_spml_ikrit_put_simple(void* dst_addr, + size_t size, + void* src_addr, + int dst); + +static void mxm_setup_relays(oshmem_proc_t **procs, size_t nprocs); + +mca_spml_ikrit_t mca_spml_ikrit = { + { + /* Init mca_spml_base_module_t */ + mca_spml_ikrit_add_procs, + mca_spml_ikrit_del_procs, + mca_spml_ikrit_enable, + mca_spml_ikrit_register, + mca_spml_ikrit_deregister, + mca_spml_ikrit_oob_get_mkeys, + mca_spml_ikrit_put, + mca_spml_ikrit_put_nb, + mca_spml_ikrit_get, + mca_spml_ikrit_recv, + mca_spml_ikrit_send, + mca_spml_base_wait, + mca_spml_base_wait_nb, + mca_spml_ikrit_fence, + NULL, + NULL, + NULL, + NULL, + + (void*)&mca_spml_ikrit + } +}; + +#if MXM_API < MXM_VERSION(2,0) +void mca_spml_ikrit_dump_stats(void); +void mca_spml_ikrit_dump_stats() +{ + int num_procs; + int i; + char sbuf[1024]; + FILE *fp; + + fp = fmemopen(sbuf, sizeof(sbuf), "rw"); + num_procs = oshmem_num_procs(); + for (i = 0; i < num_procs; i++) { + mxm_print_conn_state(mca_spml_ikrit.mxm_peers[i]->mxm_conn, + MXM_STATE_DETAIL_LEVEL_DATA, + "", + fp); + printf("=========== pe:%d conn:%p stats:\n %s==================\n", + i, + mca_spml_ikrit.mxm_peers[i]->mxm_conn, + sbuf); + rewind(fp); + } + fclose(fp); +} +#endif + +static inline mca_spml_ikrit_put_request_t *alloc_put_req(void) +{ + mca_spml_ikrit_put_request_t *req; + ompi_free_list_item_t* item; + + OMPI_FREE_LIST_WAIT_MT(&mca_spml_base_put_requests, item); + + req = (mca_spml_ikrit_put_request_t *) item; + req->req_put.req_base.req_free_called = false; + req->req_put.req_base.req_oshmem.req_complete = false; + + return req; +} + +static inline mca_spml_ikrit_get_request_t *alloc_get_req(void) +{ + mca_spml_ikrit_get_request_t *req; + ompi_free_list_item_t* item; + + OMPI_FREE_LIST_WAIT_MT(&mca_spml_base_get_requests, item); + + req = (mca_spml_ikrit_get_request_t *) item; + req->req_get.req_base.req_free_called = false; + req->req_get.req_base.req_oshmem.req_complete = false; + + return req; +} + +int mca_spml_ikrit_enable(bool enable) +{ + SPML_VERBOSE(50, "*** ikrit ENABLED ****"); + if (false == enable) { + return OSHMEM_SUCCESS; + } + + ompi_free_list_init_new(&mca_spml_base_put_requests, + sizeof(mca_spml_ikrit_put_request_t), + opal_cache_line_size, + OBJ_CLASS(mca_spml_ikrit_put_request_t), + 0, + opal_cache_line_size, + mca_spml_ikrit.free_list_num, + mca_spml_ikrit.free_list_max, + mca_spml_ikrit.free_list_inc, + NULL ); + + ompi_free_list_init_new(&mca_spml_base_get_requests, + 
                            sizeof(mca_spml_ikrit_get_request_t),
+                            opal_cache_line_size,
+                            OBJ_CLASS(mca_spml_ikrit_get_request_t),
+                            0,
+                            opal_cache_line_size,
+                            mca_spml_ikrit.free_list_num,
+                            mca_spml_ikrit.free_list_max,
+                            mca_spml_ikrit.free_list_inc,
+                            NULL );
+
+    mca_spml_ikrit.enabled = true;
+
+    return OSHMEM_SUCCESS;
+}
+
+static int create_ptl_idx(int dst_pe)
+{
+    oshmem_proc_t *proc;
+
+    proc = oshmem_proc_group_find(oshmem_group_all, dst_pe);
+
+    proc->transport_ids = (char *) malloc(MXM_PTL_LAST * sizeof(char));
+    if (!proc->transport_ids)
+        return OSHMEM_ERROR;
+
+    proc->num_transports = 1;
+    if (oshmem_my_proc_id() == dst_pe)
+        proc->transport_ids[0] = MXM_PTL_SELF;
+    else
+        proc->transport_ids[0] = MXM_PTL_RDMA;
+    return OSHMEM_SUCCESS;
+}
+
+static void destroy_ptl_idx(int dst_pe)
+{
+    oshmem_proc_t *proc;
+
+    proc = oshmem_proc_group_find(oshmem_group_all, dst_pe);
+    if (proc->transport_ids)
+        free(proc->transport_ids);
+}
+
+static void mxm_peer_construct(mxm_peer_t *p)
+{
+    p->pe = -1;
+    p->n_active_puts = 0;
+    p->need_fence = 0;
+    p->pe_relay = -1;
+    p->n_slaves = 0;
+}
+
+static void mxm_peer_destruct(mxm_peer_t *p)
+{
+    /* maybe we need to remove the item from the list */
+}
+
+OBJ_CLASS_INSTANCE( mxm_peer_t,
+                    opal_list_item_t,
+                    mxm_peer_construct,
+                    mxm_peer_destruct);
+
+int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs)
+{
+    size_t i;
+    opal_list_item_t *item;
+
+    if (mca_spml_ikrit.mxm_ep) {
+        mxm_ep_destroy(mca_spml_ikrit.mxm_ep);
+        mca_spml_ikrit.mxm_ep = 0;
+    }
+
+    while (NULL != (item = opal_list_remove_first(&mca_spml_ikrit.active_peers))) {
+    };
+    OBJ_DESTRUCT(&mca_spml_ikrit.active_peers);
+
+    for (i = 0; i < nprocs; i++) {
+        destroy_ptl_idx(i);
+        if (mca_spml_ikrit.mxm_peers[i]) {
+            OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]);
+        }
+    }
+    if (mca_spml_ikrit.mxm_peers)
+        free(mca_spml_ikrit.mxm_peers);
+
+    return OSHMEM_SUCCESS;
+}
+
+int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
+{
+    spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL;
+#if MXM_API < MXM_VERSION(2,0)
+    mxm_conn_req_t *conn_reqs;
+    int timeout;
+#else
+    size_t mxm_addr_len = MXM_MAX_ADDR_LEN;
+#endif
+    mxm_error_t err;
+    size_t i;
+    int rc = OSHMEM_ERROR;
+    oshmem_proc_t *proc_self;
+    int my_rank = oshmem_my_proc_id();
+
+    OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t);
+    /* Allocate connection requests */
+#if MXM_API < MXM_VERSION(2,0)
+    conn_reqs = malloc(nprocs * sizeof(mxm_conn_req_t));
+    if (NULL == conn_reqs) {
+        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
+        goto bail;
+    }
+    /* zero all nprocs entries, not just the first one */
+    memset(conn_reqs, 0x0, nprocs * sizeof(mxm_conn_req_t));
+#endif
+    ep_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t));
+    if (NULL == ep_info) {
+        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
+        goto bail;
+    }
+    memset(ep_info, 0x0, nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t));
+
+    mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs
+            * sizeof(*(mca_spml_ikrit.mxm_peers)));
+    if (NULL == mca_spml_ikrit.mxm_peers) {
+        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
+        goto bail;
+    }
+
+#if MXM_API < MXM_VERSION(2,0)
+    if (OSHMEM_SUCCESS
+            != spml_ikrit_get_ep_address(&ep_info[my_rank], MXM_PTL_SELF)) {
+        return OSHMEM_ERROR;
+    }
+    if (OSHMEM_SUCCESS
+            != spml_ikrit_get_ep_address(&ep_info[my_rank], MXM_PTL_RDMA)) {
+        return OSHMEM_ERROR;
+    }
+#else
+    err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, ep_info[my_rank].ep_addr, &mxm_addr_len);
+    if (MXM_OK != err) {
+        orte_show_help("help-shmem-spml-ikrit.txt", "unable to get endpoint address", true,
+                       mxm_error_string(err));
+        return OSHMEM_ERROR;
+    }
+#endif
+
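+    /* Each PE has published its endpoint address in ep_info[my_rank];
+     * the allgather below fills ep_info[i] with the address published
+     * by PE i, which is exactly what the connect loop consumes. */
+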
opal_progress_register(spml_ikrit_progress); + + oshmem_shmem_exchange_allgather(ep_info, + sizeof(spml_ikrit_mxm_ep_conn_info_t)); + + /* Get the EP connection requests for all the processes from modex */ + for (i = 0; i < nprocs; ++i) { + + mca_spml_ikrit.mxm_peers[i] = OBJ_NEW(mxm_peer_t); + if (NULL == mca_spml_ikrit.mxm_peers[i]) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + goto bail; + } + mca_spml_ikrit.mxm_peers[i]->pe = i; + +#if MXM_API < MXM_VERSION(2,0) + conn_reqs[i].ptl_addr[MXM_PTL_SELF] = + (struct sockaddr *) &ep_info[i].ptl_addr[MXM_PTL_SELF]; + conn_reqs[i].ptl_addr[MXM_PTL_SHM] = NULL; + conn_reqs[i].ptl_addr[MXM_PTL_RDMA] = + (struct sockaddr *) &ep_info[i].ptl_addr[MXM_PTL_RDMA]; +#else + err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_conn); + if (MXM_OK != err) { + SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); + goto bail; + } + if (OSHMEM_SUCCESS != create_ptl_idx(i)) + goto bail; + mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i]->mxm_conn, mca_spml_ikrit.mxm_peers[i]); +#endif + } + +#if MXM_API < MXM_VERSION(2,0) + /* Connect to remote peers */ + if (mxm_get_version() < MXM_VERSION(1,5)) { + timeout = 1000; + } else { + timeout = -1; + } + err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, conn_reqs, nprocs, timeout); + if (MXM_OK != err) { + SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); + for (i = 0; i < nprocs; ++i) { + if (MXM_OK != conn_reqs[i].error) { + SPML_ERROR("MXM EP connect to %s error: %s\n", + procs[i]->proc_hostname, mxm_error_string(conn_reqs[i].error)); + } + } + rc = OSHMEM_ERR_CONNECTION_FAILED; + goto bail; + } + + /* Save returned connections */ + for (i = 0; i < nprocs; ++i) { + mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn; + if (OSHMEM_SUCCESS != create_ptl_idx(i)) + goto bail; + + mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]); + } + + if (ep_info) + free(ep_info); + if (conn_reqs) + free(conn_reqs); +#endif + + proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank); + /* identify local processes and change transport to SHM */ + for (i = 0; i < nprocs; i++) { + if (procs[i]->proc_name.jobid != proc_self->proc_name.jobid|| + !OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) { + continue; + } + if (procs[i] == proc_self) + continue; + + /* use zcopy for put/get via sysv shared memory */ + procs[i]->transport_ids[0] = MXM_PTL_SHM; + procs[i]->transport_ids[1] = MXM_PTL_RDMA; + procs[i]->num_transports = 2; + } + + mxm_setup_relays(procs, nprocs); + + SPML_VERBOSE(50, "*** ADDED PROCS ***"); + return OSHMEM_SUCCESS; + + bail: + #if MXM_API < MXM_VERSION(2,0) + if (conn_reqs) + free(conn_reqs); + if (ep_info) + free(ep_info); + #endif + SPML_ERROR("add procs FAILED rc=%d", rc); + + return rc; + +} + +mca_spml_mkey_t *mca_spml_ikrit_register(void* addr, + size_t size, + uint64_t shmid, + int *count) +{ + int i; + mca_spml_mkey_t *mkeys; + + *count = 0; + mkeys = (mca_spml_mkey_t *) calloc(1, MXM_PTL_LAST * sizeof(*mkeys)); + if (!mkeys) { + return NULL ; + } + + for (i = 0; i < MXM_PTL_LAST; i++) { + switch (i) { + case MXM_PTL_SHM: + if ((int) MEMHEAP_SHM_GET_ID(shmid) != MEMHEAP_SHM_INVALID) { + mkeys[i].key = shmid; + mkeys[i].va_base = 0; + } else { + mkeys[i].key = 0; + mkeys[i].va_base = (unsigned long) addr; + } + mkeys[i].spml_context = 0; + break; + case MXM_PTL_SELF: + mkeys[i].key = 0; + mkeys[i].spml_context = 0; + mkeys[i].va_base = (unsigned long) addr; + break; + case MXM_PTL_RDMA: +#if MXM_API < MXM_VERSION(1,5) 
+ mkeys[i].ib.lkey = mkeys[i].ib.rkey = MXM_MKEY_NONE; +#else + mkeys[i].ib.lkey = mkeys[i].ib.rkey = 0; +#endif + mkeys[i].spml_context = 0; + mkeys[i].va_base = (unsigned long) addr; + break; + + default: + SPML_ERROR("unsupported PTL: %d", i); + goto err; + } + SPML_VERBOSE(5, + "rank %d ptl %d rkey %x lkey %x key %llx address 0x%llX len %llu shmid 0x%X|0x%X", + oshmem_proc_local_proc->proc_name.vpid, i, mkeys[i].ib.rkey, mkeys[i].ib.lkey, (unsigned long long)mkeys[i].key, (unsigned long long)mkeys[i].va_base, (unsigned long long)size, MEMHEAP_SHM_GET_TYPE(shmid), MEMHEAP_SHM_GET_ID(shmid)); + + } + *count = MXM_PTL_LAST; + + return mkeys; + + err: mca_spml_ikrit_deregister(mkeys); + return NULL ; +} + +int mca_spml_ikrit_deregister(mca_spml_mkey_t *mkeys) +{ + int i; + + if (!mkeys) + return OSHMEM_SUCCESS; + + for (i = 0; i < MXM_PTL_LAST; i++) { + switch (i) { + case MXM_PTL_SELF: + case MXM_PTL_SHM: + break; + case MXM_PTL_RDMA: + /* dereg memory */ + if (!mkeys[i].spml_context) + break; +#if MXM_API < MXM_VERSION(1,5) + mxm_dereg_mr(mca_spml_ikrit.mxm_ep, + MXM_PTL_RDMA, + (void *) mkeys[i].va_base, + (unsigned long) mkeys[i].spml_context); +#endif + break; + } + } + return OSHMEM_SUCCESS; + +} + +static inline int get_ptl_id(int dst) +{ + oshmem_proc_t *proc; + + /* get endpoint and btl */ + proc = oshmem_proc_group_all(dst); + if (!proc) { + SPML_ERROR("Can not find destination proc for pe=%d", dst); + oshmem_shmem_abort(-1); + return -1; + } + return proc->transport_ids[0]; +} + +int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys) +{ + int ptl; + + ptl = get_ptl_id(pe); + if (ptl < 0) + return OSHMEM_ERROR; + + if (ptl != MXM_PTL_RDMA) + return OSHMEM_ERROR; + + if (seg > 1) + return OSHMEM_ERROR; + +#if MXM_API < MXM_VERSION(1,5) + mkeys[ptl].ib.rkey = MXM_MKEY_NONE; +#endif + + return OSHMEM_SUCCESS; +} + +static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, + void *src_addr, + size_t size, + void *dst_addr, + int src) +{ + /* shmem spec states that get() operations are blocking. So it is enough + to have single mxm request. Also we count on mxm doing copy */ + uint64_t rva; + mca_spml_mkey_t *r_mkey; + int ptl_id; + + ptl_id = get_ptl_id(src); + /* already tried to send via shm and failed. go via rdma */ + if (ptl_id == MXM_PTL_SHM) + ptl_id = MXM_PTL_RDMA; + + /** + * Get the address to the remote rkey. + **/ + r_mkey = mca_memheap.memheap_get_cached_mkey(src, + (unsigned long) src_addr, + ptl_id, + &rva); + if (!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", + src, src_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + SPML_VERBOSE(100, + "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. 
src_rva=%p, src_rkey=0x%lx", + src, ptl_id, src_addr, dst_addr, (int)size, (void *)rva, r_mkey->key); + + /* mxm does not really cares for get lkey */ + sreq->base.mq = mca_spml_ikrit.mxm_mq; + sreq->base.conn = mca_spml_ikrit.mxm_peers[src]->mxm_conn; + sreq->base.data_type = MXM_REQ_DATA_BUFFER; + sreq->base.data.buffer.ptr = dst_addr; + sreq->base.data.buffer.length = size; +#if MXM_API < MXM_VERSION(1,5) + sreq->base.data.buffer.mkey = MXM_MKEY_NONE; + sreq->op.mem.remote_mkey = r_mkey->ib.rkey; +#elif MXM_API < MXM_VERSION(2,0) + sreq->base.data.buffer.memh = NULL; + sreq->op.mem.remote_memh = NULL; +#else + sreq->op.mem.remote_mkey = NULL; +#endif + sreq->opcode = MXM_REQ_OP_GET; + sreq->op.mem.remote_vaddr = (intptr_t) rva; + sreq->base.state = MXM_REQ_NEW; + + return OSHMEM_SUCCESS; +} + +static inline int mca_spml_ikrit_get_shm(void *src_addr, + size_t size, + void *dst_addr, + int src) +{ + int ptl_id; + uint64_t rva; + mca_spml_mkey_t *r_mkey; + + ptl_id = get_ptl_id(src); + /** + * Get the address to the remote rkey. + **/ + if (ptl_id != MXM_PTL_SHM) + return OSHMEM_ERROR; + + r_mkey = mca_memheap.memheap_get_cached_mkey(src, + (unsigned long) src_addr, + ptl_id, + &rva); + if (!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", + src, src_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + if (OPAL_UNLIKELY(!mca_memheap.memheap_is_symmetric_addr((unsigned long)src_addr) || (unsigned long)src_addr == rva)) + return OSHMEM_ERROR; + + SPML_VERBOSE(100, + "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, src_rkey=0x%lx", + src, src_addr, dst_addr, (int)size, (void *)rva, r_mkey->key); + memcpy(dst_addr, (void *) (unsigned long) rva, size); + opal_progress(); + return OSHMEM_SUCCESS; +} + +int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src) +{ + mxm_send_req_t sreq; + + if (0 >= size) { + return OSHMEM_SUCCESS; + } + + if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src)) + return OSHMEM_SUCCESS; + + if (OSHMEM_SUCCESS + != mca_spml_ikrit_get_helper(&sreq, + src_addr, + size, + dst_addr, + src)) { + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + +#if MXM_API < MXM_VERSION(2,0) + sreq.base.flags = MXM_REQ_FLAG_BLOCKING; +#else + sreq.flags = MXM_REQ_SEND_FLAG_BLOCKING; +#endif + sreq.base.completed_cb = NULL; + + mxm_req_send(&sreq); + opal_progress(); + mca_spml_irkit_req_wait(&sreq.base); + + if (MXM_OK != sreq.base.error) { + SPML_ERROR("get request failed: %s - aborting", + mxm_error_string(sreq.base.error)); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static inline void get_completion_cb(void *ctx) +{ + mca_spml_ikrit_get_request_t *get_req = (mca_spml_ikrit_get_request_t *) ctx; + + OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, -1); + get_req->req_get.req_base.req_spml_complete = true; + get_req->req_get.req_base.req_oshmem.req_status.SHMEM_ERROR = + OSHMEM_SUCCESS; + oshmem_request_complete(&get_req->req_get.req_base.req_oshmem, 1); + oshmem_request_free((oshmem_request_t**) &get_req); +} + +/* extension. 
used for fence implementation before fence was added to mxm */
+int mca_spml_ikrit_get_async(void *src_addr,
+                             size_t size,
+                             void *dst_addr,
+                             int src)
+{
+    mca_spml_ikrit_get_request_t *get_req;
+
+    if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src))
+        return OSHMEM_SUCCESS;
+
+    get_req = alloc_get_req();
+    if (NULL == get_req) {
+        SPML_ERROR("out of get requests - aborting");
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERROR;
+    }
+
+    if (OSHMEM_SUCCESS
+            != mca_spml_ikrit_get_helper(&get_req->mxm_req,
+                                         src_addr,
+                                         size,
+                                         dst_addr,
+                                         src)) {
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERROR;
+    }
+
+#if MXM_API < MXM_VERSION(2,0)
+    get_req->mxm_req.base.flags = 0;
+#else
+    get_req->mxm_req.flags = 0;
+#endif
+    get_req->mxm_req.base.completed_cb = get_completion_cb;
+    get_req->mxm_req.base.context = get_req;
+    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1);
+
+    mxm_req_send(&get_req->mxm_req);
+
+    if (MXM_OK != get_req->mxm_req.base.error) {
+        SPML_ERROR("get request failed: %s - aborting",
+                   mxm_error_string(get_req->mxm_req.base.error));
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERROR;
+    }
+    return OSHMEM_SUCCESS;
+}
+
+static inline void fence_completion_cb(void *ctx)
+{
+    mca_spml_ikrit_get_request_t *fence_req =
+            (mca_spml_ikrit_get_request_t *) ctx;
+
+    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, -1);
+    fence_req->req_get.req_base.req_spml_complete = true;
+    fence_req->req_get.req_base.req_oshmem.req_status.SHMEM_ERROR =
+            OSHMEM_SUCCESS;
+    oshmem_request_complete(&fence_req->req_get.req_base.req_oshmem, 1);
+    oshmem_request_free((oshmem_request_t**) &fence_req);
+}
+
+static int mca_spml_ikrit_mxm_fence(int dst)
+{
+    mca_spml_ikrit_get_request_t *fence_req;
+
+    fence_req = alloc_get_req();
+    if (NULL == fence_req) {
+        SPML_ERROR("out of get requests - aborting");
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERROR;
+    }
+
+    fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
+    fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn;
+#if MXM_API < MXM_VERSION(2,0)
+    fence_req->mxm_req.opcode = MXM_REQ_OP_FENCE;
+    fence_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC;
+#else
+    fence_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
+    fence_req->mxm_req.flags = MXM_REQ_SEND_FLAG_FENCE;
+    fence_req->mxm_req.op.mem.remote_vaddr = 0;
+    fence_req->mxm_req.op.mem.remote_mkey = &mxm_empty_mem_key;
+#endif
+    fence_req->mxm_req.base.state = MXM_REQ_NEW;
+    fence_req->mxm_req.base.completed_cb = fence_completion_cb;
+    fence_req->mxm_req.base.context = fence_req;
+    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, 1);
+
+    mxm_req_send(&fence_req->mxm_req);
+    return OSHMEM_SUCCESS;
+}
+
+static inline void put_completion_cb(void *ctx)
+{
+    mca_spml_ikrit_put_request_t *put_req = (mca_spml_ikrit_put_request_t *) ctx;
+    mxm_peer_t *peer;
+
+    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1);
+    peer = mca_spml_ikrit.mxm_peers[put_req->pe];
+
+    /* this was the last put in progress. 
Remove peer from the list so that we do not need explicit fence */ +#if SPML_IKRIT_PUT_DEBUG == 1 + if (peer) { + if (peer->n_active_puts <= 0) { + /* actually this can happen because fence forces ref count to 0 while puts still may be in flight */ + SPML_VERBOSE(1, "pe %d n_active_puts %d", put_req->pe, peer->n_active_puts); + } + } + + if (put_req->mxm_req.base.state != MXM_REQ_COMPLETED) + SPML_ERROR("oops: pe %d uncompleted request state %d", put_req->pe, put_req->mxm_req.base.state); +#endif + + if (0 < peer->n_active_puts) { + peer->n_active_puts--; +#if MXM_API < MXM_VERSION(2,0) + if (0 == peer->n_active_puts && + (put_req->mxm_req.base.flags & MXM_REQ_FLAG_SEND_SYNC)) { + //SPML_VERBOSE(20, "removed pe %d from active list", put_req->pe); + opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->super); + peer->need_fence = 0; + } +#else + if (0 == peer->n_active_puts && + (put_req->mxm_req.opcode == MXM_REQ_OP_PUT_SYNC)) { + //SPML_VERBOSE(20, "removed pe %d from active list", put_req->pe); + opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->super); + peer->need_fence = 0; + } +#endif + } + + put_req->req_put.req_base.req_spml_complete = true; + put_req->req_put.req_base.req_oshmem.req_status.SHMEM_ERROR = + OSHMEM_SUCCESS; + oshmem_request_complete(&put_req->req_put.req_base.req_oshmem, 1); + oshmem_request_free((oshmem_request_t**) &put_req); +} + +/** + * TODO: using put request as handle is not good. + */ +static inline int mca_spml_ikrit_put_internal(void* dst_addr, + size_t size, + void* src_addr, + int dst, + void **handle, + int zcopy) +{ + uint64_t rva; + mca_spml_ikrit_put_request_t *put_req; + int ptl_id; + mca_spml_mkey_t *r_mkey; + static int count; + int need_progress = 0; + + if (0 >= size) { + return OSHMEM_SUCCESS; + } + + ptl_id = get_ptl_id(dst); + /* Get rkey of remote PE (dst proc) which must be on memheap */ + r_mkey = mca_memheap.memheap_get_cached_mkey(dst, + (unsigned long) dst_addr, + ptl_id, + &rva); + if (!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", + dst, dst_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + +#if SPML_IKRIT_PUT_DEBUG == 1 + + SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, dst_rkey=0x%lx", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key); +#endif + if (ptl_id == MXM_PTL_SHM) { + + if (OPAL_LIKELY(mca_memheap.memheap_is_symmetric_addr((unsigned long)dst_addr) && (unsigned long)dst_addr != rva)) { + memcpy((void *) (unsigned long) rva, src_addr, size); + /* call progress as often as we would have with regular put */ + if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) + mxm_progress(mca_spml_ikrit.mxm_context); + return OSHMEM_SUCCESS; + } + /* segment not mapped - fallback to rmda */ + ptl_id = MXM_PTL_RDMA; + r_mkey = mca_memheap.memheap_get_cached_mkey(dst, + (unsigned long) dst_addr, + ptl_id, + &rva); + if (!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", + dst, dst_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + } + +#if SPML_IKRIT_PUT_DEBUG == 1 + SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, dst_rkey=0x%lx",
+                 dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key);
+#endif
+
+    put_req = alloc_put_req();
+    if (NULL == put_req) {
+        SPML_ERROR("out of put requests - aborting");
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERROR;
+    }
+    if (handle)
+        *handle = put_req;
+
+    /* fill out request */
+    put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
+    /* request an immediate response if we are getting low on send buffers; otherwise
+     * the remote side responds only on ack timeout. Also request an explicit ack once in a while */
+#if MXM_API < MXM_VERSION(2, 0)
+    put_req->mxm_req.opcode = MXM_REQ_OP_PUT;
+    if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER ||
+            (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) {
+        put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC;
+        need_progress = 1;
+    } else {
+        put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_LAZY|MXM_REQ_FLAG_SEND_SYNC;
+    }
+#else
+    put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
+    if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER ||
+            (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) {
+        put_req->mxm_req.flags = 0;
+        need_progress = 1;
+    } else {
+        put_req->mxm_req.flags = MXM_REQ_SEND_FLAG_LAZY;
+    }
+#endif
+
+    put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn;
+    put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER;
+    put_req->mxm_req.base.data.buffer.ptr = src_addr;
+    put_req->mxm_req.base.data.buffer.length = size;
+    put_req->mxm_req.base.completed_cb = put_completion_cb;
+    put_req->mxm_req.base.context = put_req;
+    put_req->mxm_req.opcode = MXM_REQ_OP_PUT;
+    put_req->mxm_req.op.mem.remote_vaddr = (intptr_t) rva;
+    put_req->mxm_req.base.state = MXM_REQ_NEW;
+    put_req->pe = dst;
+
+#if MXM_API < MXM_VERSION(1,5)
+    put_req->mxm_req.base.data.buffer.mkey = MXM_MKEY_NONE;
+    put_req->mxm_req.op.mem.remote_mkey = r_mkey->ib.rkey;
+#elif MXM_API < MXM_VERSION(2, 0)
+    put_req->mxm_req.base.data.buffer.memh = NULL;
+    put_req->mxm_req.op.mem.remote_memh = NULL;
+#else
+    put_req->mxm_req.op.mem.remote_mkey = NULL;
+#endif
+
+    if (mca_spml_ikrit.mxm_peers[dst]->pe_relay >= 0
+            && mca_memheap_base_detect_addr_type((unsigned long) dst_addr)
+               == ADDR_USER) {
+        put_req->mxm_req.op.am.hid = 0;
+        put_req->mxm_req.op.am.imm_data = dst;
+        put_req->pe = mca_spml_ikrit.mxm_peers[dst]->pe_relay;
+        put_req->mxm_req.base.conn =
+                mca_spml_ikrit.mxm_peers[put_req->pe]->mxm_conn;
+        put_req->mxm_req.opcode = MXM_REQ_OP_AM;
+
+        /* set up iov */
+        put_req->mxm_req.base.data_type = MXM_REQ_DATA_IOV;
+        put_req->mxm_req.base.data.iov.count = 2;
+        put_req->mxm_req.base.data.iov.vector = put_req->iov;
+
+        put_req->iov[0].ptr = &put_req->am_pkt.va;
+        put_req->iov[0].length = sizeof(uint64_t);
+        put_req->am_pkt.va = (uint64_t) rva;
+
+        put_req->iov[1].ptr = src_addr;
+        put_req->iov[1].length = size;
+
+#if MXM_API < MXM_VERSION(1,5)
+        put_req->iov[0].mkey = MXM_MKEY_NONE;
+        put_req->iov[1].mkey = MXM_MKEY_NONE;
+#else
+        put_req->iov[0].memh = NULL;
+        put_req->iov[1].memh = NULL;
+#endif
+    }
+
+    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1);
+    if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) {
+        opal_list_append(&mca_spml_ikrit.active_peers,
+                         &mca_spml_ikrit.mxm_peers[dst]->super);
+        mca_spml_ikrit.mxm_peers[dst]->need_fence = 1;
+    }
+
+    mca_spml_ikrit.mxm_peers[dst]->n_active_puts++;
+
+    mxm_req_send(&put_req->mxm_req);
+
+    if (MXM_OK != put_req->mxm_req.base.error) {
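+        /* the send failed right away: undo the active-put accounting
+         * before aborting */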
+        OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1);
+        SPML_ERROR("put request %p failed: %s - aborting",
+                   put_req, mxm_error_string(put_req->mxm_req.base.error));
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERROR;
+    }
+    if (need_progress)
+        mxm_progress(mca_spml_ikrit.mxm_context);
+
+    return OSHMEM_SUCCESS;
+}
+
+/* simple buffered put implementation. NOT IN USE
+ * Problems:
+ *  - slightly worse performance than the implementation based on non-buffered put
+ *  - fence complexity is O(n_active_connections) instead of O(n_connections_with_outstanding_puts).
+ *    The latter is bounded by the network RTT & mxm ack timer.
+ */
+int mca_spml_ikrit_put_simple(void* dst_addr,
+                              size_t size,
+                              void* src_addr,
+                              int dst)
+{
+    uint64_t rva;
+    mxm_send_req_t mxm_req;
+    mxm_wait_t wait;
+    int ptl_id;
+    mca_spml_mkey_t *r_mkey;
+    static int count;
+
+    ptl_id = get_ptl_id(dst);
+    /* Get rkey of remote PE (dst proc) which must be on memheap */
+    r_mkey = mca_memheap.memheap_get_cached_mkey(dst,
+                                                 (unsigned long) dst_addr,
+                                                 ptl_id,
+                                                 &rva);
+    if (!r_mkey) {
+        SPML_ERROR("pe=%d: %p is not address of shared variable",
+                   dst, dst_addr);
+        oshmem_shmem_abort(-1);
+        return OSHMEM_ERROR;
+    }
+
+#if SPML_IKRIT_PUT_DEBUG == 1
+    SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, dst_rkey=0x%lx",
+                 dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key);
+#endif
+    if (ptl_id == MXM_PTL_SHM) {
+
+        if (OPAL_LIKELY(mca_memheap.memheap_is_symmetric_addr((unsigned long)dst_addr) && (unsigned long)dst_addr != rva)) {
+            memcpy((void *) (unsigned long) rva, src_addr, size);
+            /* call progress as often as we would have with regular put */
+            if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0)
+                mxm_progress(mca_spml_ikrit.mxm_context);
+            return OSHMEM_SUCCESS;
+        }
+        /* segment not mapped - fall back to rdma */
+        ptl_id = MXM_PTL_RDMA;
+        r_mkey = mca_memheap.memheap_get_cached_mkey(dst,
+                                                     (unsigned long) dst_addr,
+                                                     ptl_id,
+                                                     &rva);
+        if (!r_mkey) {
+            SPML_ERROR("pe=%d: %p is not address of shared variable",
+                       dst, dst_addr);
+            oshmem_shmem_abort(-1);
+            return OSHMEM_ERROR;
+        }
+    }
+
+#if SPML_IKRIT_PUT_DEBUG == 1
+    SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, dst_rkey=0x%lx", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key); +#endif + + /* fill out request */ + mxm_req.base.mq = mca_spml_ikrit.mxm_mq; +#if MXM_API < MXM_VERSION(2, 0) + mxm_req.base.flags = MXM_REQ_FLAG_BLOCKING; +#else + mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING; +#endif + mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; + mxm_req.base.data.buffer.ptr = src_addr; + mxm_req.base.data.buffer.length = size; + mxm_req.base.completed_cb = 0; + mxm_req.base.context = 0; + mxm_req.opcode = MXM_REQ_OP_PUT; + mxm_req.op.mem.remote_vaddr = (intptr_t) rva; + mxm_req.base.state = MXM_REQ_NEW; + mxm_req.base.error = MXM_OK; + +#if MXM_API < MXM_VERSION(1,5) + mxm_req.base.data.buffer.mkey = MXM_MKEY_NONE; + mxm_req.op.mem.remote_mkey = MXM_MKEY_NONE; +#elif MXM_API < MXM_VERSION(2, 0) + mxm_req.base.data.buffer.memh = NULL; + mxm_req.op.mem.remote_memh = NULL; +#else + mxm_req.op.mem.remote_mkey = NULL; +#endif + + if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { + opal_list_append(&mca_spml_ikrit.active_peers, + &mca_spml_ikrit.mxm_peers[dst]->super); + mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; + } + + mxm_req_send(&mxm_req); + if (MXM_OK != mxm_req.base.error) { + SPML_ERROR("put request failed: %s(%d) - aborting", + mxm_error_string(mxm_req.base.error), mxm_req.base.error); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + wait.req = &mxm_req.base; + wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED); + wait.progress_cb = NULL; + wait.progress_arg = NULL; + mxm_wait(&wait); + + return OSHMEM_SUCCESS; +} + +int mca_spml_ikrit_put_nb(void* dst_addr, + size_t size, + void* src_addr, + int dst, + void **handle) +{ + int err; + err = mca_spml_ikrit_put_internal(dst_addr, size, src_addr, dst, handle, 1); + if (OSHMEM_SUCCESS != err) { + SPML_ERROR("put failed - aborting"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +int mca_spml_ikrit_put(void* dst_addr, size_t size, void* src_addr, int dst) +{ + int err; + mca_spml_ikrit_put_request_t *put_req; + mxm_wait_t wait; + + put_req = 0; + err = mca_spml_ikrit_put_internal(dst_addr, + size, + src_addr, + dst, + (void **) &put_req, + 0); + if (OSHMEM_SUCCESS != err) { + SPML_ERROR("put failed - aborting"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + if (!put_req) + return OSHMEM_SUCCESS; + + wait.req = &put_req->mxm_req.base; + wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED); + wait.progress_cb = NULL; + wait.progress_arg = NULL; + mxm_wait(&wait); + + return OSHMEM_SUCCESS; +} + +static void mxm_relay_handler(mxm_conn_h conn, + mxm_imm_t imm, + void *data, + size_t len, + size_t offset, + int is_lf) +{ + uint64_t va, rva; + char *pkt_data; + mca_spml_mkey_t *r_mkey; + int ptl_id; + mxm_peer_t *peer; + + if (offset == 0) { + va = *(uint64_t *) data; + pkt_data = (char *) data + sizeof(va); + len -= sizeof(va); + if (!is_lf) { + /* we expect more fragments: save destination virtual address */ + peer = mxm_conn_ctx_get(conn); + peer->dst_va = va; + } + } else { + /* next fragment: use saved va and offset to compute va */ + pkt_data = data; + peer = mxm_conn_ctx_get(conn); + va = peer->dst_va + offset - sizeof(va); + } + + ptl_id = get_ptl_id(imm); + if (ptl_id != MXM_PTL_SHM) { + SPML_ERROR("relay req to non local PE recvd dst=%d va=%llx len=%d - aborting", + (int)imm, (unsigned long long)va, (int)len); + oshmem_shmem_abort(-1); + return; + } + + /* Get rkey of 
remote PE (dst proc) which must be on memheap */
+    r_mkey = mca_memheap.memheap_get_cached_mkey(imm, va, ptl_id, &rva);
+    if (!r_mkey) {
+        SPML_ERROR("relay to PE=%d: %p is not address of shared variable",
+                   imm, (void *)va);
+        oshmem_shmem_abort(-1);
+        return;
+    }
+
+    memcpy((void *) (unsigned long) rva, pkt_data, len);
+}
+
+static void mxm_setup_relays(oshmem_proc_t **procs, size_t nprocs)
+{
+    size_t i;
+    opal_hash_table_t h;
+    int pe_relay;
+    int ret;
+    int r_i, r;
+
+    if (mca_spml_ikrit.n_relays <= 0)
+        return;
+
+    OBJ_CONSTRUCT(&h, opal_hash_table_t);
+    opal_hash_table_init(&h, 128);
+
+    /* lowest rank on node will be used to relay to everyone on that node */
+    for (i = 0; i < nprocs; i++) {
+        if (OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags))
+            continue;
+
+        ret = opal_hash_table_get_value_ptr(&h,
+                                            procs[i]->proc_hostname,
+                                            strlen(procs[i]->proc_hostname),
+                                            (void **) &pe_relay);
+        if (ret != OPAL_SUCCESS) {
+            opal_hash_table_set_value_ptr(&h,
+                                          procs[i]->proc_hostname,
+                                          strlen(procs[i]->proc_hostname),
+                                          (void *) i);
+            mca_spml_ikrit.mxm_peers[i]->n_relays = 1;
+            mca_spml_ikrit.mxm_peers[i]->pe_relays[0] = i;
+            continue;
+        }
+
+        /* first allocate relays */
+        if (mca_spml_ikrit.mxm_peers[pe_relay]->n_relays
+                < mca_spml_ikrit.n_relays) {
+            mca_spml_ikrit.mxm_peers[pe_relay]->pe_relays[mca_spml_ikrit.mxm_peers[pe_relay]->n_relays] =
+                    i;
+            mca_spml_ikrit.mxm_peers[pe_relay]->n_relays++;
+            continue;
+        }
+
+        /* now assign slave to relay */
+        r_i = mca_spml_ikrit.mxm_peers[pe_relay]->n_relays - 1;
+        while (r_i >= 0) {
+            r = mca_spml_ikrit.mxm_peers[pe_relay]->pe_relays[r_i];
+            if (mca_spml_ikrit.mxm_peers[r]->n_slaves >= 1) {
+                r_i--;
+                continue;
+            }
+            mca_spml_ikrit.mxm_peers[r]->n_slaves++;
+            mca_spml_ikrit.mxm_peers[i]->pe_relay = r;
+            break;
+
+        }
+    }
+
+    OBJ_DESTRUCT(&h);
+    mxm_set_am_handler(mca_spml_ikrit.mxm_context,
+                       0,
+                       mxm_relay_handler,
+                       MXM_AM_FLAG_THREAD_SAFE);
+}
+
+int mca_spml_ikrit_fence(void)
+{
+    mxm_peer_t *peer;
+    opal_list_item_t *item;
+
+    SPML_VERBOSE(20,
+                 "Into fence with %d active puts on %d pes",
+                 mca_spml_ikrit.n_active_puts, (int)opal_list_get_size(&mca_spml_ikrit.active_peers));
+
+    /* puts (unless they are send-sync) are completed by the remote side lazily,
+     * i.e. only when the remote side decides to ack the window, which can take
+     * hundreds of ms. So speed things up by issuing an explicit fence */
+    while (NULL != (item = opal_list_remove_first(&mca_spml_ikrit.active_peers))) {
+        peer = (mxm_peer_t *) item;
+        peer->n_active_puts = 0;
+        peer->need_fence = 0;
+        mca_spml_ikrit_mxm_fence(peer->pe);
+    }
+
+    while (0 < mca_spml_ikrit.n_mxm_fences) {
+        oshmem_request_wait_any_completion();
+    }
+
+    SPML_VERBOSE(20, "fence completed");
+    return OSHMEM_SUCCESS;
+}
+
+/* blocking receive */
+int mca_spml_ikrit_recv(void* buf, size_t size, int src)
+{
+    mxm_recv_req_t req;
+    char dummy_buf[1];
+
+    /* tag mask 0 matches any tag */
+    SPML_VERBOSE(100,
+                 "want to recv from src %d, size %d buf %p",
+                 src, (int)size, buf);
+    req.tag = src == SHMEM_ANY_SOURCE ? 0 : src;
+    req.tag_mask = src == SHMEM_ANY_SOURCE ? 0 : 0xFFFFFFFF;
+
+    req.base.state = MXM_REQ_NEW;
+    req.base.mq = mca_spml_ikrit.mxm_mq;
+    req.base.conn = NULL;
+#if MXM_API < MXM_VERSION(2, 0)
+    req.base.flags = MXM_REQ_FLAG_BLOCKING;
+#endif
+    req.base.completed_cb = NULL;
+
+    req.base.data_type = MXM_REQ_DATA_BUFFER;
+    req.base.data.buffer.ptr = buf == NULL ? dummy_buf : buf;
+    req.base.data.buffer.length = size == 0 ? 
sizeof(dummy_buf) : size; +#if MXM_API < MXM_VERSION(1,5) + req.base.data.buffer.mkey = MXM_MKEY_NONE; +#else + req.base.data.buffer.memh = NULL; +#endif + + mxm_req_recv(&req); + mca_spml_irkit_req_wait(&req.base); + if (req.base.error != MXM_OK) { + return OSHMEM_ERROR; + } + SPML_VERBOSE(100, + "recvd from tag %d len %d", + req.completion.sender_tag, (int)req.completion.actual_len); + + return OSHMEM_SUCCESS; +} + +/* for now only do blocking copy send */ +int mca_spml_ikrit_send(void* buf, + size_t size, + int dst, + mca_spml_base_put_mode_t mode) +{ + mxm_send_req_t req; + char dummy_buf[1]; + + SPML_VERBOSE(100, + "sending %p size %d to %d, mode %d", + buf, (int)size, dst, (int)mode); + req.opcode = MXM_REQ_OP_SEND; + + req.op.send.tag = oshmem_my_proc_id(); + + req.base.state = MXM_REQ_NEW; + req.base.mq = mca_spml_ikrit.mxm_mq; + req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; +#if MXM_API < MXM_VERSION(2, 0) + req.base.flags = MXM_REQ_FLAG_BLOCKING; +#else + req.flags = MXM_REQ_SEND_FLAG_BLOCKING; +#endif + req.base.completed_cb = NULL; + + req.base.data_type = MXM_REQ_DATA_BUFFER; + req.base.data.buffer.ptr = buf == NULL ? dummy_buf : buf; + req.base.data.buffer.length = size == 0 ? sizeof(dummy_buf) : size; +#if MXM_API < MXM_VERSION(1,5) + req.base.data.buffer.mkey = MXM_MKEY_NONE; +#else + req.base.data.buffer.memh = NULL; +#endif + + mxm_req_send(&req); + mca_spml_irkit_req_wait(&req.base); + if (req.base.error != MXM_OK) { + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h new file mode 100644 index 0000000000..af6f7e41c9 --- /dev/null +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+
+#ifndef MCA_SPML_UD_MXM_H
+#define MCA_SPML_UD_MXM_H
+
+#include "oshmem_config.h"
+#include "oshmem/request/request.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/spml_base_putreq.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/spml/base/spml_base_request.h"
+#include "oshmem/mca/spml/base/spml_base_getreq.h"
+
+#include "ompi/mca/bml/base/base.h"
+#include "ompi/class/ompi_free_list.h"
+#include "opal/class/opal_list.h"
+
+#include "orte/runtime/orte_globals.h"
+
+#include <mxm/api/mxm_api.h>
+
+#ifndef MXM_VERSION
+#define MXM_VERSION(major, minor) (((major)<<MXM_MAJOR_BIT)|((minor)<<MXM_MINOR_BIT))
+#endif
+
+#if MXM_API < MXM_VERSION(2,0)
+#include <mxm/api/mxm_addr.h>
+#include <mxm/api/mxm_stats.h>
+#endif
+
+#define MXM_SHMEM_MQ_ID 0x7119
+#define MXM_SHMEM_TAG 0x7
+
+/* start request explicit ack once our buffer pool is less than watermark */
+#define SPML_IKRIT_PUT_LOW_WATER 16
+/* request explicit ack (SYNC) per every X put requests per connection */
+#define SPML_IKRIT_PACKETS_PER_SYNC 64
+
+BEGIN_C_DECLS
+
+/**
+ * UD MXM SPML module
+ */
+struct mxm_peer {
+    opal_list_item_t super;
+    mxm_conn_h mxm_conn;
+    int pe;
+    uint32_t n_active_puts;
+    int need_fence;
+    /* if >= 0, data will be sent to pe_relay, which forwards it to the destination pe */
+    int pe_relay;
+    uint64_t dst_va; /* virtual address on the final destination */
+    int n_slaves;
+    int pe_relays[16];
+    int n_relays;
+};
+
+typedef struct mxm_peer mxm_peer_t;
+OBJ_CLASS_DECLARATION(mxm_peer_t);
+
+struct mca_spml_ikrit_t {
+    mca_spml_base_module_t super;
+
+    mxm_h mxm_context;
+    mxm_ep_h mxm_ep;
+    mxm_mq_h mxm_mq;
+    mxm_peer_t **mxm_peers;
+
+    uint32_t n_active_puts;
+    uint32_t n_active_gets;
+    uint32_t n_mxm_fences;
+
+    int priority; /* component priority */
+    int free_list_num; /* initial size of free list */
+    int free_list_max; /* maximum size of free list */
+    int free_list_inc; /* number of elements to grow free list */
+
+    bool enabled;
+    opal_list_t active_peers;
+    int n_relays; /* number of procs/node serving as relays */
+};
+
+typedef struct mca_spml_ikrit_t mca_spml_ikrit_t;
+
+#define MXM_MAX_ADDR_LEN 512
+
+#if MXM_API >= MXM_VERSION(2,0)
+#define MXM_PTL_SHM 0
+#define MXM_PTL_SELF 1
+#define MXM_PTL_RDMA 2
+#define MXM_PTL_LAST 3
+#endif
+
+typedef struct spml_ikrit_mxm_ep_conn_info_t {
+    union {
+        struct sockaddr_storage ptl_addr[MXM_PTL_LAST];
+        char ep_addr[MXM_MAX_ADDR_LEN];
+    };
+} spml_ikrit_mxm_ep_conn_info_t;
+
+extern mca_spml_ikrit_t mca_spml_ikrit;
+
+extern int mca_spml_ikrit_enable(bool enable);
+extern int mca_spml_ikrit_get(void* dst_addr,
+                              size_t size,
+                              void* src_addr,
+                              int src);
+/* extension. used for fence implementation before fence was added to mxm */
+extern int mca_spml_ikrit_get_async(void *src_addr,
+                                    size_t size,
+                                    void *dst_addr,
+                                    int src);
+
+extern int mca_spml_ikrit_put(void* dst_addr,
+                              size_t size,
+                              void* src_addr,
+                              int dst);
+extern int mca_spml_ikrit_put_nb(void* dst_addr,
+                                 size_t size,
+                                 void* src_addr,
+                                 int dst,
+                                 void **handle);
+
+extern int mca_spml_ikrit_recv(void* buf, size_t size, int src);
+extern int mca_spml_ikrit_send(void* buf,
+                               size_t size,
+                               int dst,
+                               mca_spml_base_put_mode_t mode);
+
+extern mca_spml_mkey_t *mca_spml_ikrit_register(void* addr,
+                                                size_t size,
+                                                uint64_t shmid,
+                                                int *count);
+extern int mca_spml_ikrit_deregister(mca_spml_mkey_t *mkeys);
+extern int mca_spml_ikrit_oob_get_mkeys(int pe,
+                                        uint32_t seg,
+                                        mca_spml_mkey_t *mkeys);
+
+extern int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs);
+extern int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs);
+extern int mca_spml_ikrit_fence(void);
+extern int spml_ikrit_progress(void);
+
+static inline oshmem_proc_t *mca_spml_ikrit_proc_find(int dst)
+{
+    orte_process_name_t name;
+
+    name.jobid = ORTE_PROC_MY_NAME->jobid;
+    name.vpid = dst;
+    return oshmem_proc_find(&name);
+}
+
+END_C_DECLS
+
+#endif
+
diff --git a/oshmem/mca/spml/ikrit/spml_ikrit_component.c b/oshmem/mca/spml/ikrit/spml_ikrit_component.c
new file mode 100644
index 0000000000..dc14f15f88
--- /dev/null
+++ b/oshmem/mca/spml/ikrit/spml_ikrit_component.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "oshmem_config.h"
+#include "orte/util/show_help.h"
+#include "shmem.h"
+#include "oshmem/runtime/params.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/base.h"
+#include "spml_ikrit_component.h"
+#include "oshmem/mca/spml/ikrit/spml_ikrit.h"
+
+#include "orte/util/show_help.h"
+
+static int mca_spml_ikrit_component_register(void);
+static int mca_spml_ikrit_component_open(void);
+static int mca_spml_ikrit_component_close(void);
+static mca_spml_base_module_t*
+mca_spml_ikrit_component_init(int* priority,
+                              bool enable_progress_threads,
+                              bool enable_mpi_threads);
+static int mca_spml_ikrit_component_fini(void);
+mca_spml_base_component_2_0_0_t mca_spml_ikrit_component = {
+
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        MCA_SPML_BASE_VERSION_2_0_0,
+
+        "ikrit", /* MCA component name */
+        OSHMEM_MAJOR_VERSION, /* MCA component major version */
+        OSHMEM_MINOR_VERSION, /* MCA component minor version */
+        OSHMEM_RELEASE_VERSION, /* MCA component release version */
+        mca_spml_ikrit_component_open, /* component open */
+        mca_spml_ikrit_component_close, /* component close */
+        NULL,
+        mca_spml_ikrit_component_register
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    mca_spml_ikrit_component_init, /* component init */
+    mca_spml_ikrit_component_fini /* component finalize */
+
+};
+
+static inline int mca_spml_ikrit_param_register_int(const char* param_name,
+                                                    int default_value,
+                                                    const char *help_msg)
+{
+    int param_value;
+
+    param_value = default_value;
+    (void) mca_base_component_var_register(&mca_spml_ikrit_component.spmlm_version,
+                                           param_name,
+                                           help_msg,
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &param_value);
+
+    return 
param_value;
+}
+
+static int mca_spml_ikrit_component_register(void)
+{
+    int np;
+
+    mca_spml_ikrit.free_list_num =
+            mca_spml_ikrit_param_register_int("free_list_num", 1024, 0);
+    mca_spml_ikrit.free_list_max =
+            mca_spml_ikrit_param_register_int("free_list_max", 1024, 0);
+    mca_spml_ikrit.free_list_inc =
+            mca_spml_ikrit_param_register_int("free_list_inc", 16, 0);
+    mca_spml_ikrit.priority =
+            mca_spml_ikrit_param_register_int("priority",
+                                              20,
+                                              "[integer] ikrit priority");
+
+    mca_spml_ikrit.n_relays =
+            mca_spml_ikrit_param_register_int("use_relays",
+                                              -1,
+                                              "[integer] First N ranks on a host will receive and forward put messages to the other ranks running on it. Can be used as a workaround for the Sandy Bridge far-socket problem");
+
+    np = mca_spml_ikrit_param_register_int("np",
+                                           128,
+                                           "[integer] Minimal job NP required to activate ikrit");
+    if (oshmem_num_procs() < np) {
+        SPML_VERBOSE(1,
+                     "Not enough ranks (%d<%d), disqualifying spml/ikrit",
+                     oshmem_num_procs(), np);
+        return OSHMEM_ERR_NOT_AVAILABLE;
+    }
+
+    return OSHMEM_SUCCESS;
+}
+
+int spml_ikrit_progress(void)
+{
+    mxm_error_t err;
+
+    err = mxm_progress(mca_spml_ikrit.mxm_context);
+    if ((MXM_OK != err) && (MXM_ERR_NO_PROGRESS != err)) {
+        orte_show_help("help-shmem-spml-ikrit.txt",
+                       "errors during mxm_progress",
+                       true,
+                       mxm_error_string(err));
+    }
+    return 1;
+}
+
+static int mca_spml_ikrit_component_open(void)
+{
+    mxm_error_t err;
+    unsigned long cur_ver;
+
+    cur_ver = mxm_get_version();
+    if (cur_ver != MXM_API) {
+        char *str;
+        if (asprintf(&str,
+                     "SHMEM was compiled with MXM version %d.%d but "
+                     "version %ld.%ld detected.",
+                     MXM_VERNO_MAJOR,
+                     MXM_VERNO_MINOR,
+                     (cur_ver >> MXM_MAJOR_BIT) & 0xff,
+                     (cur_ver >> MXM_MINOR_BIT) & 0xff) > 0) {
+            orte_show_help("help-shmem-spml-ikrit.txt", "mxm init", true, str);
+
+            free(str);
+        }
+        return OSHMEM_ERROR;
+    }
+
+#if MXM_API < MXM_VERSION(1,5)
+    mxm_context_opts_t mxm_opts;
+
+    mxm_fill_context_opts(&mxm_opts);
+    /* only enable rdma and self ptls */
+    mxm_opts.ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA));
+
+#else
+    mxm_context_opts_t *mxm_opts;
+
+    err = mxm_config_read_context_opts(&mxm_opts);
+    if (MXM_OK != err) {
+        SPML_ERROR("Failed to parse MXM configuration");
+        return OSHMEM_ERROR;
+    }
+#if MXM_API < MXM_VERSION(2, 0)
+    mxm_opts->ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA));
+#endif
+#endif
+
+#if MXM_API < MXM_VERSION(1,5)
+    err = mxm_init(&mxm_opts, &mca_spml_ikrit.mxm_context);
+#else
+    err = mxm_init(mxm_opts, &mca_spml_ikrit.mxm_context);
+#if MXM_API < MXM_VERSION(2, 0)
+    mxm_config_free(mxm_opts);
+#else
+    mxm_config_free_context_opts(mxm_opts);
+#endif
+#endif
+
+    if (MXM_OK != err) {
+        if (MXM_ERR_NO_DEVICE == err) {
+            SPML_VERBOSE(1,
+                         "No supported device found, disqualifying spml/ikrit");
+        } else {
+            orte_show_help("help-shmem-spml-ikrit.txt",
+                           "mxm init",
+                           true,
+                           mxm_error_string(err));
+        }
+        return OSHMEM_ERR_NOT_AVAILABLE;
+    }
+
+    err = mxm_mq_create(mca_spml_ikrit.mxm_context,
+                        MXM_SHMEM_MQ_ID,
+                        &mca_spml_ikrit.mxm_mq);
+    if (MXM_OK != err) {
+        orte_show_help("help-shmem-spml-ikrit.txt",
+                       "mxm mq create",
+                       true,
+                       mxm_error_string(err));
+        return OSHMEM_ERROR;
+    }
+
+    return OSHMEM_SUCCESS;
+}
+
+static int mca_spml_ikrit_component_close(void)
+{
+    if (mca_spml_ikrit.mxm_context)
+        mxm_cleanup(mca_spml_ikrit.mxm_context);
+    mca_spml_ikrit.mxm_context = NULL;
+    return OSHMEM_SUCCESS;
+}
+
+static int spml_ikrit_mxm_init(void)
+{
+    mxm_error_t err;
+    mxm_ep_opts_t *p_ep_opts;
+
+#if MXM_API < MXM_VERSION(1,5)
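+    /* MXM older than 1.5 has no option parser: fill the endpoint options
+     * with mxm_fill_ep_opts() and bind the self and rdma ptls explicitly */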
+ mxm_ep_opts_t ep_opt; + struct sockaddr_mxm_local_proc sa_bind_self; + struct sockaddr_mxm_ib_local sa_bind_rdma; + + p_ep_opts = &ep_opt; + /* Setup the endpoint options and local addresses to bind to. */ + mxm_fill_ep_opts(&ep_opt); + + sa_bind_self.sa_family = AF_MXM_LOCAL_PROC; + sa_bind_self.context_id = 0; + sa_bind_self.process_id = oshmem_proc_local()->proc_name.vpid; + + sa_bind_rdma.sa_family = AF_MXM_IB_LOCAL; + sa_bind_rdma.lid = 0; + sa_bind_rdma.pkey = 0; + sa_bind_rdma.qp_num = 0; + sa_bind_rdma.sl = 0; + + ep_opt.ptl_bind_addr[MXM_PTL_SELF] = (struct sockaddr*) &sa_bind_self; + ep_opt.ptl_bind_addr[MXM_PTL_RDMA] = (struct sockaddr*) &sa_bind_rdma; + +#else + err = mxm_config_read_ep_opts(&p_ep_opts); + if (err != MXM_OK) { + SPML_ERROR("Failed to parse MXM configuration"); + return OSHMEM_ERROR; + } + +#if MXM_API < MXM_VERSION(2,0) + /* Only relevant for SHM PTL - ignore */ + p_ep_opts->job_id = 0; + p_ep_opts->local_rank = 0; + p_ep_opts->num_local_procs = 0; + p_ep_opts->rdma.drain_cq = 1; +#endif +#endif + + /* Open MXM endpoint */ + err = mxm_ep_create(mca_spml_ikrit.mxm_context, + p_ep_opts, + &mca_spml_ikrit.mxm_ep); + if (MXM_OK != err) { + orte_show_help("help-shmem-spml-ikrit.txt", + "unable to create endpoint", + true, + mxm_error_string(err)); + return OSHMEM_ERROR; + } + +#if MXM_API >= MXM_VERSION(1,5) +#if MXM_API < MXM_VERSION(2,0) + mxm_config_free(p_ep_opts); +#else + mxm_config_free_ep_opts(p_ep_opts); +#endif +#endif + + return OSHMEM_SUCCESS; +} + +static mca_spml_base_module_t* +mca_spml_ikrit_component_init(int* priority, + bool enable_progress_threads, + bool enable_mpi_threads) +{ + SPML_VERBOSE( 10, "in ikrit, my priority is %d\n", mca_spml_ikrit.priority); + + if ((*priority) > mca_spml_ikrit.priority) { + *priority = mca_spml_ikrit.priority; + return NULL ; + } + *priority = mca_spml_ikrit.priority; + + if (OSHMEM_SUCCESS != spml_ikrit_mxm_init()) + return NULL ; + + mca_spml_ikrit.n_active_puts = 0; + mca_spml_ikrit.n_active_gets = 0; + mca_spml_ikrit.n_mxm_fences = 0; + SPML_VERBOSE(50, "*** ikrit initialized ****"); + return &mca_spml_ikrit.super; +} + +static int mca_spml_ikrit_component_fini(void) +{ + opal_progress_unregister(spml_ikrit_progress); + if (NULL != mca_spml_ikrit.mxm_ep) { + mxm_ep_destroy(mca_spml_ikrit.mxm_ep); + } + return OSHMEM_SUCCESS; +} + diff --git a/oshmem/mca/spml/ikrit/spml_ikrit_component.h b/oshmem/mca/spml/ikrit/spml_ikrit_component.h new file mode 100644 index 0000000000..3c1a4b874b --- /dev/null +++ b/oshmem/mca/spml/ikrit/spml_ikrit_component.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_SPML_YODA_COMPONENT_H +#define MCA_SPML_YODA_COMPONENT_H + +BEGIN_C_DECLS + +/* + * SPML module functions. + */ +OSHMEM_MODULE_DECLSPEC extern mca_spml_base_component_2_0_0_t mca_spml_ikrit_component; +END_C_DECLS + +#endif diff --git a/oshmem/mca/spml/spml.h b/oshmem/mca/spml/spml.h new file mode 100644 index 0000000000..6bdeaf7732 --- /dev/null +++ b/oshmem/mca/spml/spml.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_SPML_H
+#define MCA_SPML_H
+
+#include "oshmem_config.h"
+#include "oshmem/types.h"
+#include "oshmem/constants.h"
+
+#include "opal/mca/mca.h"
+#include "oshmem/proc/proc.h"
+#include "ompi/mca/btl/btl.h"
+
+BEGIN_C_DECLS
+
+/*
+ * SPML component types
+ */
+
+/**
+ * MCA->SPML Called by MCA framework to initialize the component.
+ *
+ * @param priority (OUT) Relative priority or ranking used by MCA to
+ * select a component.
+ *
+ * @param enable_progress_threads (IN) Whether this component is
+ * allowed to run a hidden/progress thread or not.
+ *
+ * @param enable_mpi_threads (IN) Whether support for multiple MPI
+ * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which
+ * indicates whether multiple threads may invoke this component
+ * simultaneously or not.
+ */
+typedef enum {
+    MCA_SPML_BASE_PUT_SYNCHRONOUS,
+    MCA_SPML_BASE_PUT_COMPLETE,
+    MCA_SPML_BASE_PUT_BUFFERED,
+    MCA_SPML_BASE_PUT_READY,
+    MCA_SPML_BASE_PUT_STANDARD,
+    MCA_SPML_BASE_PUT_SIZE
+} mca_spml_base_put_mode_t;
+
+typedef struct mca_spml_base_module_1_0_0_t * (*mca_spml_base_component_init_fn_t)(int *priority,
+                                                                                   bool enable_progress_threads,
+                                                                                   bool enable_mpi_threads);
+
+typedef int (*mca_spml_base_component_finalize_fn_t)(void);
+
+/**
+ * SPML component version and interface functions.
+ */
+struct mca_spml_base_component_2_0_0_t {
+    mca_base_component_t spmlm_version;
+    mca_base_component_data_t spmlm_data;
+    mca_spml_base_component_init_fn_t spmlm_init;
+    mca_spml_base_component_finalize_fn_t spmlm_finalize;
+};
+typedef struct mca_spml_base_component_2_0_0_t mca_spml_base_component_2_0_0_t;
+typedef mca_spml_base_component_2_0_0_t mca_spml_base_component_t;
+
+/**
+ * MCA management functions.
+ */
+/**
+ * memory key
+ */
+typedef struct mca_spml_mkey {
+    union {
+        struct {
+            uint32_t rkey;
+            uint32_t lkey;
+        } ib;
+        uint64_t key;
+    };
+    uint64_t va_base;
+    void *spml_context; /* spml module can attach internal structures here */
+} mca_spml_mkey_t;
+
+/**
+ * Downcall from MCA layer to enable the PML/BTLs.
+ *
+ * @param enable Enable/Disable SPML forwarding
+ * @return OSHMEM_SUCCESS or failure status.
+ */
+typedef int (*mca_spml_base_module_enable_fn_t)(bool enable);
+
+/**
+ * Waits for an int variable to change on the local PE.
+ * Blocks until the variable is not equal to value.
+ *
+ * @param addr Address of the variable to poll on.
+ * @param value The value to poll on. Poll until the value held in addr is different from value.
+ * @return OSHMEM_SUCCESS or failure status.
+ */
+typedef int (*mca_spml_base_module_wait_fn_t)(void* addr,
+                                              int cmp,
+                                              void* value,
+                                              int datatype);
+
+/**
+ * Register (pin) a buffer of 'size' bytes starting at address addr.
+ *
+ * @param addr base address of the registered buffer.
+ * @param size the size of the buffer to be registered.
+ * @param shmid sysv segment id
+ * @param count number of internal transports (btls) that registered memory
+ * @return array of mkeys (one mkey per "btl") or NULL on failure
+ *
+ */
+typedef mca_spml_mkey_t * (*mca_spml_base_module_register_fn_t)(void *addr,
+                                                                size_t size,
+                                                                uint64_t shmid,
+                                                                int *count);
+
+/**
+ * deregister memory pinned by register()
+ */
+typedef int (*mca_spml_base_module_deregister_fn_t)(mca_spml_mkey_t *mkeys);
+
+/**
+ * try to fill up mkeys that can be used to reach the remote pe.
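+ * The keys are obtained out of band, without touching the transport data
+ * path (the ikrit spml, for instance, computes them locally).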
+ * @param pe remote pe
+ * @param seg 0 - symmetric heap, 1 - static data, everything else is static data in .so
+ * @param mkeys mkeys array
+ *
+ * @return OSHMEM_SUCCESS if keys are found
+ */
+typedef int (*mca_spml_base_module_oob_get_mkeys_fn_t)(int pe,
+                                                       uint32_t seg,
+                                                       mca_spml_mkey_t *mkeys);
+
+/**
+ * For each proc set up a data structure that indicates the BTLs
+ * that can be used to reach the destination.
+ *
+ * @param procs A list of all procs participating in the parallel application.
+ * @param nprocs The number of procs in the parallel application.
+ * @return OSHMEM_SUCCESS or failure status.
+ *
+ */
+typedef int (*mca_spml_base_module_add_procs_fn_t)(oshmem_proc_t** procs,
+                                                   size_t nprocs);
+typedef int (*mca_spml_base_module_del_procs_fn_t)(oshmem_proc_t** procs,
+                                                   size_t nprocs);
+
+/**
+ * Transfer data to a remote pe.
+ *
+ * @param dst_addr The address in the remote PE of the object being written.
+ * @param size The number of bytes to be written.
+ * @param src_addr An address on the local PE holding the value to be written.
+ * @param dst The remote PE to be written to.
+ * @return OSHMEM_SUCCESS or failure status.
+ */
+typedef int (*mca_spml_base_module_put_fn_t)(void *dst_addr,
+                                             size_t size,
+                                             void *src_addr,
+                                             int dst);
+
+/**
+ * These routines provide the means for copying contiguous data to another PE without
+ * blocking the caller. These routines return before the data has been delivered to the
+ * remote PE.
+ *
+ * @param dst_addr The address in the remote PE of the object being written.
+ * @param size The number of bytes to be written.
+ * @param src_addr An address on the local PE holding the value to be written.
+ * @param dst The remote PE to be written to.
+ * @param handle The address of a handle to be passed to shmem_wait_nb() or
+ *               shmem_test_nb() to wait or poll for the completion of the transfer.
+ * @return OSHMEM_SUCCESS or failure status.
+ */
+typedef int (*mca_spml_base_module_put_nb_fn_t)(void *dst_addr,
+                                                size_t size,
+                                                void *src_addr,
+                                                int dst,
+                                                void **handle);
+
+/**
+ * Blocking data transfer from a remote PE.
+ * Read data from a remote PE.
+ *
+ * @param dst_addr - The address on the local PE, to write the result of the get operation to.
+ * @param size - The number of bytes to be read.
+ * @param src_addr - The address on the remote PE, to read from.
+ * @param src - The ID of the remote PE.
+ * @return - OSHMEM_SUCCESS or failure status.
+ */
+typedef int (*mca_spml_base_module_get_fn_t)(void *dst_addr,
+                                             size_t size,
+                                             void *src_addr,
+                                             int src);
+
+/**
+ * Post a receive and wait for completion.
+ *
+ * @param buf (IN) User buffer.
+ * @param count (IN) The number of bytes to be received.
+ * @param src (IN) The ID of the remote PE.
+ * @return OSHMEM_SUCCESS or failure status.
+ */
+typedef int (*mca_spml_base_module_recv_fn_t)(void *buf, size_t count, int src);
+
+/**
+ * Post a send request and wait for completion.
+ *
+ * @param buf (IN) User buffer.
+ * @param count (IN) The number of bytes to be sent.
+ * @param dst (IN) The ID of the remote PE.
+ * @param mode (IN) Send mode (STANDARD,BUFFERED,SYNCHRONOUS,READY)
+ * @return OSHMEM_SUCCESS or failure status.
+ */
+typedef int (*mca_spml_base_module_send_fn_t)(void *buf,
+                                              size_t count,
+                                              int dst,
+                                              mca_spml_base_put_mode_t mode);
+
+/**
+ * Wait for completion of all outstanding put() requests.
+ *
+ * @return - OSHMEM_SUCCESS or failure status.
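+ * Completion here means the data of every outstanding put has been
+ * delivered to its target PE, not merely queued locally.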
+ */ +typedef int (*mca_spml_base_module_fence_fn_t)(void); + +/** + * Waits for completion of a non-blocking put or get issued by the calling PE. + * + * @return - OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_wait_nb_fn_t)(void*); + +typedef void* (*mca_spml_base_module_get_remote_context_fn_t)(void*); + +typedef void (*mca_spml_base_module_set_remote_context_fn_t)(void**, void*); + +typedef int (*mca_spml_base_module_get_remote_context_size_fn_t)(void*); + +typedef void (*mca_spml_base_module_set_remote_context_size_fn_t)(void**, int); + +/** + * SPML instance. + */ +struct mca_spml_base_module_1_0_0_t { + + mca_spml_base_module_add_procs_fn_t spml_add_procs; + mca_spml_base_module_del_procs_fn_t spml_del_procs; + + mca_spml_base_module_enable_fn_t spml_enable; + mca_spml_base_module_register_fn_t spml_register; + mca_spml_base_module_deregister_fn_t spml_deregister; + mca_spml_base_module_oob_get_mkeys_fn_t spml_oob_get_mkeys; + + mca_spml_base_module_put_fn_t spml_put; + mca_spml_base_module_put_nb_fn_t spml_put_nb; + mca_spml_base_module_get_fn_t spml_get; + + mca_spml_base_module_recv_fn_t spml_recv; + mca_spml_base_module_send_fn_t spml_send; + + mca_spml_base_module_wait_fn_t spml_wait; + mca_spml_base_module_wait_nb_fn_t spml_wait_nb; + mca_spml_base_module_fence_fn_t spml_fence; + mca_spml_base_module_get_remote_context_fn_t spml_get_remote_context; + mca_spml_base_module_set_remote_context_fn_t spml_set_remote_context; + mca_spml_base_module_get_remote_context_size_fn_t spml_get_remote_context_size; + mca_spml_base_module_set_remote_context_size_fn_t spml_set_remote_context_size; + + void *self; +}; + +typedef struct mca_spml_base_module_1_0_0_t mca_spml_base_module_1_0_0_t; +typedef mca_spml_base_module_1_0_0_t mca_spml_base_module_t; + +/* + * Macro for use in components that are of type spml + */ +#define MCA_SPML_BASE_VERSION_2_0_0 \ + MCA_BASE_VERSION_2_0_0, \ +"spml", 2, 0, 0 + +/* + * macro for doing direct call / call through struct + */ +#if MCA_oshmem_spml_DIRECT_CALL + +#include MCA_oshmem_spml_DIRECT_CALL_HEADER + +#define MCA_SPML_CALL_STAMP(a, b) mca_spml_ ## a ## _ ## b +#define MCA_SPML_CALL_EXPANDER(a, b) MCA_SPML_CALL_STAMP(a,b) +#define MCA_SPML_CALL(a) MCA_SPML_CALL_EXPANDER(MCA_oshmem_spml_DIRECT_CALL_COMPONENT, a) + +#else +#define MCA_SPML_CALL(a) mca_spml.spml_ ## a +#endif + +OSHMEM_DECLSPEC extern mca_spml_base_module_t mca_spml; + +END_C_DECLS +#endif /* MCA_SPML_H */ diff --git a/oshmem/mca/spml/yoda/Makefile.am b/oshmem/mca/spml/yoda/Makefile.am new file mode 100644 index 0000000000..c2a4aa5dc1 --- /dev/null +++ b/oshmem/mca/spml/yoda/Makefile.am @@ -0,0 +1,45 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. 
+# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = \ + help-shmem-spml-yoda.txt + +EXTRA_DIST = post_configure.sh + +AM_CFLAGS = $(OSHMEM_CFLAGS) $(btl_sm_CPPFLAGS) + +yoda_sources = \ + spml_yoda.c \ + spml_yoda.h \ + spml_yoda_component.c \ + spml_yoda_component.h \ + spml_yoda_rdmafrag.h \ + spml_yoda_putreq.c \ + spml_yoda_putreq.h \ + spml_yoda_getreq.c \ + spml_yoda_getreq.h + +if MCA_BUILD_ompi_pml_ob1_DSO +component_noinst = +component_install = mca_spml_yoda.la +else +component_noinst = libmca_spml_yoda.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_spml_yoda_la_SOURCES = $(yoda_sources) +mca_spml_yoda_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_spml_yoda_la_SOURCES = $(yoda_sources) +libmca_spml_yoda_la_LDFLAGS = -module -avoid-version diff --git a/oshmem/mca/spml/yoda/configure.params b/oshmem/mca/spml/yoda/configure.params new file mode 100644 index 0000000000..9da11ca9e3 --- /dev/null +++ b/oshmem/mca/spml/yoda/configure.params @@ -0,0 +1,14 @@ +# -*- shell-script -*- +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/spml/yoda/help-shmem-spml-yoda.txt b/oshmem/mca/spml/yoda/help-shmem-spml-yoda.txt new file mode 100644 index 0000000000..e57f6efdf7 --- /dev/null +++ b/oshmem/mca/spml/yoda/help-shmem-spml-yoda.txt @@ -0,0 +1,21 @@ +# -*- text -*- +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +[eager_limit_too_small] +The "eager limit" MCA parameter in the %s BTL was set to a value which +is too low for Open SHMEM to function properly. Please re-run your job +with a higher eager limit value for this BTL; the exact MCA parameter +name and its corresponding minimum value is shown below. + + Local host: %s + BTL name: %s + BTL eager limit value: %d (set via btl_%s_eager_limit) + BTL eager limit minimum: %d + MCA parameter name: btl_%s_eager_limit diff --git a/oshmem/mca/spml/yoda/post_configure.sh b/oshmem/mca/spml/yoda/post_configure.sh new file mode 100644 index 0000000000..d7d3db8278 --- /dev/null +++ b/oshmem/mca/spml/yoda/post_configure.sh @@ -0,0 +1,4 @@ +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved +# $COPYRIGHT$ +DIRECT_CALL_HEADER="oshmem/mca/spml/yoda/spml_yoda.h" diff --git a/oshmem/mca/spml/yoda/spml_yoda.c b/oshmem/mca/spml/yoda/spml_yoda.c new file mode 100644 index 0000000000..c2b34bc97a --- /dev/null +++ b/oshmem/mca/spml/yoda/spml_yoda.c @@ -0,0 +1,1249 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include "orte/include/orte/types.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "opal/datatype/opal_convertor.h"
+
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/btl/base/base.h"
+#include "ompi/mca/btl/sm/btl_sm_frag.h"
+
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "oshmem/mca/spml/spml.h"
+
+#include "spml_yoda.h"
+#include "spml_yoda_putreq.h"
+#include "spml_yoda_getreq.h"
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#define ILLEGAL_ORDER -1
+#include "oshmem/runtime/runtime.h"
+
+/* Turn ON/OFF debug output from build (default 0) */
+#ifndef SPML_YODA_DEBUG
+#define SPML_YODA_DEBUG 0
+#endif
+
+mca_spml_yoda_module_t mca_spml_yoda = {
+    {
+        /* Init mca_spml_base_module_t */
+        mca_spml_yoda_add_procs,
+        mca_spml_yoda_del_procs,
+        mca_spml_yoda_enable,
+        mca_spml_yoda_register,
+        mca_spml_yoda_deregister,
+        mca_spml_base_oob_get_mkeys,
+        mca_spml_yoda_put,
+        mca_spml_yoda_put_nb,
+        mca_spml_yoda_get,
+        mca_spml_yoda_recv,
+        mca_spml_yoda_send,
+        mca_spml_base_wait,
+        mca_spml_base_wait_nb,
+        mca_spml_yoda_fence,
+        mca_spml_yoda_get_remote_context,
+        mca_spml_yoda_set_remote_context,
+        mca_spml_yoda_get_remote_context_size,
+        mca_spml_yoda_set_remote_context_size,
+
+        (void *)&mca_spml_yoda
+    }
+};
+
+static inline mca_bml_base_btl_t *get_next_btl(int dst, int *btl_id);
+
+static inline void spml_yoda_prepare_for_get(void* buffer, size_t size, void* p_src, int dst, void* p_dst, void* p_getreq);
+static int btl_name_to_id(char *btl_name)
+{
+    if (0 == strcmp(btl_name, "sm")) {
+        return YODA_BTL_SM;
+    } else if (0 == strcmp(btl_name, "openib")) {
+        return YODA_BTL_OPENIB;
+    } else if (0 == strcmp(btl_name, "self")) {
+        return YODA_BTL_SELF;
+    }
+    return YODA_BTL_UNKNOWN;
+}
+
+static char *btl_type2str(int btl_type)
+{
+    switch (btl_type) {
+    case YODA_BTL_UNKNOWN:
+        return "unknown btl";
+    case YODA_BTL_SELF:
+        return "self";
+    case YODA_BTL_OPENIB:
+        return "openib";
+    case YODA_BTL_SM:
+        return "sm";
+    }
+    return "bad_btl_type";
+}
+
+static inline void calc_nfrags(mca_bml_base_btl_t* bml_btl,
+                               size_t size,
+                               int *frag_size,
+                               int *nfrags,
+                               int use_send)
+{
+    if (use_send){
+        *frag_size = bml_btl->btl->btl_max_send_size - SPML_YODA_SEND_CONTEXT_SIZE;
+    }
+    else{
+        *frag_size = bml_btl->btl->btl_max_send_size;
+    }
+    *nfrags = 1 + (size - 1) / (*frag_size);
+}
+
+static inline void mca_spml_yoda_bml_alloc( mca_bml_base_btl_t* bml_btl,
+                                            mca_btl_base_descriptor_t** des,
+                                            uint8_t order, size_t size, uint32_t flags,
+                                            int use_send)
+{
+    if (use_send){
+        size = (0 == size ? 
size : size + SPML_YODA_SEND_CONTEXT_SIZE); + } + mca_bml_base_alloc(bml_btl, + des, + MCA_BTL_NO_ORDER, + size, + flags); +} + +static inline void spml_yoda_prepare_for_put(void* buffer, size_t size, void* p_src, void* p_dst, int use_send) +{ + if (use_send){ + memcpy((void*) buffer, &size, sizeof(size)); + memcpy((void*) ( ((char*) buffer) + sizeof(size)), &p_dst, sizeof(p_dst)); + memcpy((void*) ( ((char*) buffer) + sizeof(size) + sizeof(p_dst)), p_src, size); + } + else{ + memcpy((void*) ( (unsigned char*) buffer), p_src, size); + } +} + +static inline void spml_yoda_prepare_for_get_response(void* buffer, size_t size, void* p_src, void* p_dst, void* p_getreq, int use_send) +{ + if (use_send){ + memcpy((void*) buffer, &size, sizeof(size)); + memcpy((void*) ( ((char*) buffer) + sizeof(size)), &p_dst, sizeof(p_dst)); + memcpy((void*) ( ((char*) buffer) + sizeof(size) + sizeof(p_dst)), p_src, size); + memcpy((void*) ( ((char*) buffer) + sizeof(size) + sizeof(p_dst) + size), &p_getreq, sizeof(p_getreq)); + } + else{ + memcpy((void*) ( (unsigned char*) buffer), p_src, size); + } +} + +static inline void spml_yoda_prepare_for_get(void* buffer, size_t size, void* p_src, int dst, void* p_dst, void* p_getreq) +{ + memcpy((void*) buffer, &p_src, sizeof(p_src)); + memcpy((void*) ( ((unsigned char*) buffer) + sizeof(p_src) ), &size, sizeof(size)); + memcpy((void*) ( ((unsigned char*) buffer) + sizeof(p_src) + sizeof(size) ), &dst, sizeof(dst)); + memcpy((void*) ( ((unsigned char*) buffer) + sizeof(p_src) + sizeof(size) + sizeof(dst)), &p_dst, sizeof(p_dst)); + memcpy((void*) ( ((unsigned char*) buffer) + sizeof(p_src) + sizeof(size) + sizeof(dst) + sizeof(p_dst)), &p_getreq, sizeof(p_getreq)); +} + +static void mca_yoda_put_callback(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + size_t* size; + void** l_addr; + + size = (size_t *) des->des_dst->seg_addr.pval; + l_addr = (void**) ( ((char*)size) + sizeof(*size)); + memcpy(*l_addr, ((char*)l_addr) + sizeof(*l_addr), *size); +} + +static void mca_yoda_get_callback(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + void** p, ** p_src, **p_dst; + size_t* size; + int* dst; + void** p_getreq; + mca_btl_base_descriptor_t* des_loc; + int rc; + mca_bml_base_btl_t* bml_btl; + mca_spml_yoda_rdma_frag_t* frag; + int btl_id; + mca_spml_yoda_put_request_t *putreq; + + rc = OSHMEM_SUCCESS; + btl_id = 0; + putreq = NULL; + + /* Unpack data */ + p = (void **)des->des_dst->seg_addr.pval; + p_src = (void*) p; + + size = (size_t*)((char*)p_src + sizeof(*p_src) ); + dst = (int*)( (char*)size + sizeof(*size)); + p_dst = (void*) ((char*)dst + sizeof(*dst)); + p_getreq =(void**) ( (char*)p_dst + sizeof(*p_dst)); + + /* Prepare put via send*/ + bml_btl = get_next_btl(*dst, &btl_id); + + putreq = mca_spml_yoda_putreq_alloc(*dst); + frag = &putreq->put_frag; + + mca_spml_yoda_bml_alloc(bml_btl, + &des_loc, + MCA_BTL_NO_ORDER, + *size, + MCA_BTL_DES_SEND_ALWAYS_CALLBACK, + 1); + + if (OPAL_UNLIKELY(!des_loc || !des_loc->des_src)) { + SPML_ERROR("shmem OOM error need %d bytes", (int)*size); + oshmem_shmem_abort(-1); + } + spml_yoda_prepare_for_get_response((void*)des_loc->des_src->seg_addr.pval, *size, (void*)*p_src, (void*) *p_dst,(void*)*p_getreq,1); + + frag->rdma_req = putreq; + + /* Initialize callback data for put*/ + des_loc->des_cbdata = frag; + des_loc->des_cbfunc = mca_spml_yoda_put_completion; + des_loc->des_src_cnt = 1; + + 
OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, 1); + + /* Put via send*/ + rc = mca_bml_base_send(bml_btl, des_loc, MCA_SPML_YODA_GET_RESPONSE); + if (1 == rc) + rc = OSHMEM_SUCCESS; + + if (OPAL_UNLIKELY(OSHMEM_SUCCESS != rc)) { + if (OSHMEM_ERR_OUT_OF_RESOURCE == rc) { + /* No free resources, Block on completion here */ + SPML_ERROR("shmem error: OSHMEM_ERR_OUT_OF_RESOURCE"); + oshmem_request_wait_completion(&putreq->req_put.req_base.req_oshmem); + } else { + SPML_ERROR("shmem error"); + } + /* exit with errro */ + SPML_ERROR("shmem error: ret = %i, send_pe = %i, dest_pe = %i", + rc, oshmem_my_proc_id(), *dst); + oshmem_shmem_abort(-1); + rc = OSHMEM_ERROR; + } +} + +static void mca_yoda_get_response_callback(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + size_t* size; + void** l_addr; + mca_spml_yoda_get_request_t* getreq; + + /* unpacking data*/ + size = (size_t *) ( ((char*)des->des_dst->seg_addr.pval) ); + l_addr = (void**)( ((char*)size) + sizeof(*size)); + getreq = (mca_spml_yoda_get_request_t*)*(void**)((void*)l_addr + sizeof(*l_addr) + *size); + + /* Complete get request*/ + OPAL_THREAD_ADD32(&getreq->parent->active_count, -1); + getreq->req_get.req_base.req_spml_complete = true; + oshmem_request_complete(&getreq->req_get.req_base.req_oshmem, 1); + oshmem_request_free((oshmem_request_t**) &getreq); + + memcpy(*l_addr, (void*)l_addr + sizeof(*l_addr), *size); +} + +/** + * note: we have to reg memory directly with btl because no proc will have a full btl list in proc_bml + */ +int mca_spml_yoda_deregister(mca_spml_mkey_t *mkeys) +{ + int i; + struct yoda_btl *ybtl; + mca_spml_yoda_context_t* yoda_context; + + if (!mkeys) { + return OSHMEM_SUCCESS; + } + + for (i = 0; i < mca_spml_yoda.n_btls; i++) { + ybtl = &mca_spml_yoda.btl_type_map[i]; + yoda_context = (mca_spml_yoda_context_t*) mkeys[i].spml_context; + if (NULL == yoda_context) { + continue; + } + if (yoda_context->btl_src_descriptor) { + ybtl->btl->btl_free(ybtl->btl, yoda_context->btl_src_descriptor); + yoda_context->btl_src_descriptor = NULL; + } + if (yoda_context->btl_src_segment) { + free(yoda_context->btl_src_segment); + yoda_context->btl_src_segment = NULL; + } + yoda_context->btl_src_segment_size = 0; + + if (yoda_context->registration) + { + ybtl->btl->btl_mpool->mpool_deregister(ybtl->btl->btl_mpool, + yoda_context->registration); + } + + } + free(mkeys); + + return OSHMEM_SUCCESS; +} + +mca_spml_mkey_t *mca_spml_yoda_register(void* addr, + size_t size, + uint64_t shmid, + int *count) +{ + int i; + mca_btl_base_descriptor_t* des = NULL; + const opal_datatype_t *datatype = &opal_datatype_wchar; + opal_convertor_t convertor; + mca_spml_mkey_t *mkeys; + struct yoda_btl *ybtl; + oshmem_proc_t *proc_self; + mca_spml_yoda_context_t* yoda_context; + struct iovec iov; + uint32_t iov_count = 1; + + + SPML_VERBOSE(10, "address %p len %llu", addr, (unsigned long long)size); + *count = 0; + /* make sure everything is initialized to 0 */ + mkeys = (mca_spml_mkey_t *) calloc(1, + mca_spml_yoda.n_btls * sizeof(*mkeys)); + if (!mkeys) { + return NULL ; + } + + proc_self = oshmem_proc_group_find(oshmem_group_all, oshmem_my_proc_id()); + /* create convertor */ + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + + mca_bml.bml_register( MCA_SPML_YODA_PUT, + mca_yoda_put_callback, + NULL ); + mca_bml.bml_register( MCA_SPML_YODA_GET, + mca_yoda_get_callback, + NULL ); + mca_bml.bml_register( MCA_SPML_YODA_GET_RESPONSE, + mca_yoda_get_response_callback, + NULL ); + /* Register 
proc memory in every rdma BTL. */ + for (i = 0; i < mca_spml_yoda.n_btls; i++) { + + ybtl = &mca_spml_yoda.btl_type_map[i]; + + if (!ybtl->use_cnt) { + SPML_VERBOSE(10, + "%s: present but not in use. SKIP registration", + btl_type2str(ybtl->btl_type)); + continue; + } + + /* If we have shared memory just save its id*/ + if ((ybtl->btl_type == YODA_BTL_SM) + && ((int) MEMHEAP_SHM_GET_ID(shmid) != MEMHEAP_SHM_INVALID)) { + mkeys[i].key = shmid; + continue; + } + + yoda_context = calloc(1, sizeof(*yoda_context)); + mkeys[i].spml_context = (void*) yoda_context; + + yoda_context->registration = NULL; + if (NULL != ybtl->btl->btl_prepare_src) { + /* initialize convertor for source descriptor*/ + opal_convertor_copy_and_prepare_for_recv(proc_self->proc_convertor, + datatype, + size, + addr, + 0, + &convertor); + + if (NULL != ybtl->btl->btl_mpool && NULL != ybtl->btl->btl_mpool->mpool_register) + { + iov.iov_len = size; + iov.iov_base = NULL; + + opal_convertor_pack(&convertor, &iov, &iov_count, &size); + ybtl->btl->btl_mpool->mpool_register(ybtl->btl->btl_mpool, + iov.iov_base, size, 0, &yoda_context->registration); + } + /* initialize convertor for source descriptor*/ + opal_convertor_copy_and_prepare_for_recv(proc_self->proc_convertor, + datatype, + size, + addr, + 0, + &convertor); + + /* register source memory */ + des = ybtl->btl->btl_prepare_src(ybtl->btl, + 0, + yoda_context->registration, + &convertor, + MCA_BTL_NO_ORDER, + 0, + &size, + 0); + if (NULL == des) { + SPML_ERROR("%s: failed to register source memory. ", + btl_type2str(ybtl->btl_type)); + } + /* copy source descriptor to local structures*/ + yoda_context->btl_src_descriptor = des; + yoda_context->btl_src_segment_size = ybtl->btl->btl_seg_size; + if (0 != yoda_context->btl_src_segment_size) { + yoda_context->btl_src_segment = + malloc(yoda_context->btl_src_segment_size); + memcpy(yoda_context->btl_src_segment, + des->des_src, + yoda_context->btl_src_segment_size); + } + } + + mkeys[i].va_base = (unsigned long) addr; + + SPML_VERBOSE(5, + "rank %d btl %s rkey %x lkey %x key %llx address 0x%llX len %llu shmid 0x%X|0x%X", + oshmem_proc_local_proc->proc_name.vpid, btl_type2str(ybtl->btl_type), mkeys[i].ib.rkey, mkeys[i].ib.lkey, (unsigned long long)mkeys[i].key, (unsigned long long)mkeys[i].va_base, (unsigned long long)size, MEMHEAP_SHM_GET_TYPE(shmid), MEMHEAP_SHM_GET_ID(shmid)); + } + OBJ_DESTRUCT(&convertor); + *count = mca_spml_yoda.n_btls; + return mkeys; +} + +/* + * For each proc setup a datastructure that indicates the BTLs + * that can be used to reach the destination. 
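+ * The error handler registered below simply aborts the whole job on any
+ * BTL failure.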
+ */
+static void mca_spml_yoda_error_handler(struct mca_btl_base_module_t* btl,
+                                        int32_t flags,
+                                        ompi_proc_t* errproc,
+                                        char* btlinfo)
+{
+    oshmem_shmem_abort(-1);
+}
+
+/* make the global btl list & map */
+static int create_btl_list(void)
+{
+    int btl_id;
+    char *btl_name;
+    int size;
+    opal_list_item_t *item;
+    mca_btl_base_selected_module_t *btl_sm;
+    int i;
+
+    size = opal_list_get_size(&mca_btl_base_modules_initialized);
+    if (0 >= size) {
+        SPML_ERROR("no btl(s) available");
+        return OSHMEM_ERROR;
+    }
+    SPML_VERBOSE(50, "found %d capable btls", size);
+
+    mca_spml_yoda.btl_type_map =
+        (struct yoda_btl *) calloc(size, sizeof(struct yoda_btl));
+    if (!mca_spml_yoda.btl_type_map)
+        return OSHMEM_ERROR;
+
+    mca_spml_yoda.n_btls = 0;
+    for (i = 0, item = opal_list_get_first(&mca_btl_base_modules_initialized);
+         item != opal_list_get_end(&mca_btl_base_modules_initialized);
+         item = opal_list_get_next(item), i++) {
+
+        btl_sm = (mca_btl_base_selected_module_t *) item;
+        btl_name = btl_sm->btl_component->btl_version.mca_component_name;
+        btl_id = btl_name_to_id(btl_name);
+
+        SPML_VERBOSE(50, "found btl (%s) btl_id=%d", btl_name, btl_id);
+        mca_spml_yoda.btl_type_map[mca_spml_yoda.n_btls].btl =
+            btl_sm->btl_module;
+        mca_spml_yoda.btl_type_map[mca_spml_yoda.n_btls].btl_type = btl_id;
+        mca_spml_yoda.n_btls++;
+    }
+
+    if (0 == mca_spml_yoda.n_btls) {
+        SPML_ERROR("cannot find any suitable btl");
+        return OSHMEM_ERROR;
+    }
+
+    return OSHMEM_SUCCESS;
+}
+
+static int _find_btl_id(mca_bml_base_btl_t *bml_btl)
+{
+    int i;
+
+    for (i = 0; i < mca_spml_yoda.n_btls; i++) {
+        if (mca_spml_yoda.btl_type_map[i].btl == bml_btl->btl)
+            return i;
+    }
+    return -1;
+}
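+
+/*
+ * Indexing sketch (assumed usage, for illustration only): the per-proc
+ * table built by create_btl_idx() below is one level of indirection into
+ * the global map built by create_btl_list() above:
+ *
+ *     int id = proc->transport_ids[0];                       // per-proc slot
+ *     struct yoda_btl *yb = &mca_spml_yoda.btl_type_map[id]; // global entry
+ */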
+
+/* For each proc create transport ids, which are indexes into the global
+ * btl list & map
+ */
+static int create_btl_idx(int dst_pe)
+{
+    oshmem_proc_t *proc;
+    int btl_id;
+    mca_bml_base_endpoint_t* endpoint;
+    mca_bml_base_btl_t* bml_btl = 0;
+    int i, size;
+    mca_bml_base_btl_array_t *btl_array;
+    int shmem_index = -1;
+
+    proc = oshmem_proc_group_find(oshmem_group_all, dst_pe);
+    endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
+    assert(endpoint);
+    size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_rdma);
+
+    if (0 >= size) {
+        /* Possibly this is the SM BTL with KNEM disabled? Then we should use send-based get/put. */
+        /*
+         * This hack is necessary for the case when KNEM is not available.
+         * In this case we still want to use send/recv of the SM BTL for put
+         * and get, but the SM BTL is not in the rdma list anymore.
+         */
+        size = mca_bml_base_btl_array_get_size(btl_array =
+                                                   &endpoint->btl_eager);
+        if (size > 0) {
+            /* Choose a SHMEM-capable btl from the eager array. No filtering
+               for now: take the first one (filtering could be added on demand). */
+            for (shmem_index = 0; shmem_index < size; shmem_index++) {
+                bml_btl = mca_bml_base_btl_array_get_index(btl_array, shmem_index);
+                _find_btl_id(bml_btl);
+                size = 1;
+                break;
+            }
+        } else {
+            SPML_ERROR("no SHMEM capable transport for dest pe=%d", dst_pe);
+            return OSHMEM_ERROR;
+        }
+    }
+
+    proc->transport_ids = (char *) malloc(size * sizeof(char));
+    if (!proc->transport_ids)
+        return OSHMEM_ERROR;
+
+    proc->num_transports = size;
+
+    for (i = 0; i < size; i++) {
+        bml_btl = mca_bml_base_btl_array_get_index(btl_array,
+                                                   (shmem_index >= 0) ?
+                                                       (shmem_index) : (i));
+        btl_id = _find_btl_id(bml_btl);
+        SPML_VERBOSE(50,
+                     "dst_pe(%d) use btl (%s) btl_id=%d",
+                     dst_pe, bml_btl->btl->btl_component->btl_version.mca_component_name, btl_id);
+        if (btl_id < 0) {
+            SPML_ERROR("unknown btl: dst_pe(%d) use btl (%s) btl_id=%d",
+                       dst_pe, bml_btl->btl->btl_component->btl_version.mca_component_name, btl_id);
+            return OSHMEM_ERROR;
+        }
+        proc->transport_ids[i] = btl_id;
+        mca_spml_yoda.btl_type_map[btl_id].use_cnt++;
+    }
+    return OSHMEM_SUCCESS;
+}
+
+static int destroy_btl_list(void)
+{
+    if (mca_spml_yoda.btl_type_map)
+        free(mca_spml_yoda.btl_type_map);
+
+    return OSHMEM_SUCCESS;
+}
+
+static int destroy_btl_idx(int dst_pe)
+{
+    oshmem_proc_t *proc;
+
+    proc = oshmem_proc_group_find(oshmem_group_all, dst_pe);
+    if (proc->transport_ids)
+        free(proc->transport_ids);
+
+    return OSHMEM_SUCCESS;
+}
+
+int mca_spml_yoda_add_procs(oshmem_proc_t** procs, size_t nprocs)
+{
+    opal_bitmap_t reachable;
+    int rc;
+    size_t i;
+
+    if (nprocs == 0)
+        return OSHMEM_SUCCESS;
+
+    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
+    rc = opal_bitmap_init(&reachable, (int) nprocs);
+    if (OSHMEM_SUCCESS != rc)
+        return rc;
+
+    rc = mca_bml.bml_add_procs(nprocs, (ompi_proc_t**) procs, &reachable);
+
+    if (OSHMEM_SUCCESS != rc) {
+        SPML_ERROR("SPML YODA: shmem error\n");
+        goto cleanup_and_return;
+    }
+
+    rc = mca_bml.bml_register_error(mca_spml_yoda_error_handler);
+    if (OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    /* create the btl index and map */
+    rc = create_btl_list();
+    if (OSHMEM_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    for (i = 0; i < nprocs; i++) {
+        rc = create_btl_idx(i);
+        if (OSHMEM_SUCCESS != rc)
+            goto cleanup_and_return;
+    }
+
+cleanup_and_return:
+    OBJ_DESTRUCT(&reachable);
+
+    return rc;
+}
+
+int mca_spml_yoda_del_procs(oshmem_proc_t** procs, size_t nprocs)
+{
+    size_t i;
+
+    mca_bml.bml_del_procs(nprocs, (ompi_proc_t**) procs);
+    for (i = 0; i < nprocs; i++) {
+        destroy_btl_idx(i);
+    }
+    destroy_btl_list();
+
+    return OSHMEM_SUCCESS;
+}
+
+static inline mca_bml_base_btl_t *get_next_btl(int dst, int *btl_id)
+{
+    mca_bml_base_endpoint_t* endpoint;
+    mca_bml_base_btl_t* bml_btl;
+    oshmem_proc_t *proc;
+    mca_bml_base_btl_array_t *btl_array = 0;
+    int size = 0;
+    int shmem_index = 0;
+
+    /* get the endpoint and btl */
+    proc = oshmem_proc_group_all(dst);
+    if (!proc) {
+        SPML_ERROR("Cannot find destination proc for pe=%d", dst);
+        return NULL;
+    }
+
+    endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
+    if (!endpoint) {
+        SPML_ERROR("pe=%d proc has no endpoint", dst);
+        return NULL;
+    }
+
+    /* At the moment always return the first transport */
+    size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_rdma);
+
+    if (0 >= size) {
+        /* Possibly this is the SM BTL with KNEM disabled? Then we should use send-based get/put. */
+        /*
+         * This hack is necessary for the case when KNEM is not available.
+         * In this case we still want to use send/recv of the SM BTL for put
+         * and get, but the SM BTL is not in the rdma list anymore.
+         */
+        size = mca_bml_base_btl_array_get_size(btl_array =
+                                                   &endpoint->btl_eager);
+        if (size > 0) {
+            /* Choose a SHMEM-capable btl from the eager array. No filtering
+               for now: take the first one (filtering could be added on demand). */
+            for (shmem_index = 0; shmem_index < size; shmem_index++) {
+                bml_btl = mca_bml_base_btl_array_get_index(btl_array, shmem_index);
+                _find_btl_id(bml_btl);
+                size = 1;
+                break;
+            }
+        }
+    }
+
+    bml_btl = mca_bml_base_btl_array_get_index(btl_array, shmem_index);
+    *btl_id = proc->transport_ids[0];
+
+#if SPML_YODA_DEBUG == 1
+    assert(*btl_id >= 0 && *btl_id < YODA_BTL_MAX);
+    SPML_VERBOSE(100, "pe=%d reachable via btl %s %d", dst,
+                 bml_btl->btl->btl_component->btl_version.mca_component_name, *btl_id);
+#endif
+    return bml_btl;
+}
+
+static inline int mca_spml_yoda_put_internal(void *dst_addr,
+                                             size_t size,
+                                             void *src_addr,
+                                             int dst,
+                                             int is_nb)
+{
+    int rc = OSHMEM_SUCCESS;
+    mca_spml_yoda_put_request_t *putreq = NULL;
+    mca_bml_base_btl_t* bml_btl;
+    mca_btl_base_descriptor_t* des = NULL;
+    mca_btl_base_segment_t* segment;
+    mca_spml_yoda_rdma_frag_t* frag;
+    int nfrags;
+    int i;
+    unsigned ncopied = 0;
+    int frag_size = 0;
+    char *p_src, *p_dst;
+    mca_spml_yoda_context_t* yoda_context;
+    uint64_t rva;
+    mca_spml_mkey_t *r_mkey;
+    int btl_id = 0;
+    struct yoda_btl *ybtl;
+    int put_via_send;
+
+    /* If there is nothing to put, it is OK. */
+    if (0 >= size) {
+        return OSHMEM_SUCCESS;
+    }
+
+    /* Find bml_btl and its global btl_id */
+    bml_btl = get_next_btl(dst, &btl_id);
+    if (!bml_btl) {
+        SPML_ERROR("cannot reach %d pe: no appropriate btl found", oshmem_my_proc_id());
+        oshmem_shmem_abort(-1);
+    }
+    /* Check if the btl has a PUT method. If it doesn't - use SEND */
+    put_via_send = !(bml_btl->btl->btl_flags & MCA_BTL_FLAGS_PUT);
+
+    /* Get the rkey of the remote PE (dst proc), which must be on the memheap */
+    r_mkey = mca_memheap.memheap_get_cached_mkey(dst,
+                                                 (unsigned long) dst_addr,
+                                                 btl_id,
+                                                 &rva);
+    if (!r_mkey) {
+        SPML_ERROR("pe=%d: %p is not address of shared variable",
+                   dst, dst_addr);
+        oshmem_shmem_abort(-1);
+    }
+
+#if SPML_YODA_DEBUG == 1
+    SPML_VERBOSE(100, "put: pe:%d dst=%p <- src: %p sz=%d. dst_rva=%p, dst_rkey=0x%lx",
+                 dst, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key);
+#endif
+
+    ybtl = &mca_spml_yoda.btl_type_map[btl_id];
+
+    /* check if we are doing a put into an shm-attached segment and, if so,
+     * just do a memcpy
+     */
+    if ((ybtl->btl_type == YODA_BTL_SM)
+            && OPAL_LIKELY(mca_memheap.memheap_is_symmetric_addr((unsigned long)dst_addr) && (unsigned long)dst_addr != rva)) {
+        memcpy((void *) (unsigned long) rva, src_addr, size);
+        return OSHMEM_SUCCESS;
+    }
+
+    /* We support only blocking PUT for now => we always need a copy of the src buffer */
+    calc_nfrags(bml_btl, size, &frag_size, &nfrags, put_via_send);
+
+    p_src = (char*) src_addr;
+    p_dst = (char*) (unsigned long) rva;
+    for (i = 0; i < nfrags; i++) {
+        /* Allocate a send request from the free list */
+        putreq = mca_spml_yoda_putreq_alloc(dst);
+        frag = &putreq->put_frag;
+        ncopied = i < nfrags - 1 ?
+                      (unsigned)frag_size : (char *) src_addr + size - p_src;
+
+        /* Preparing the source buffer */
+
+        /* allocate buffer */
+        mca_spml_yoda_bml_alloc(bml_btl,
+                                &des,
+                                MCA_BTL_NO_ORDER,
+                                ncopied,
+                                MCA_BTL_DES_SEND_ALWAYS_CALLBACK,
+                                put_via_send);
+
+        if (OPAL_UNLIKELY(!des || !des->des_src)) {
+            SPML_ERROR("shmem OOM error need %d bytes", ncopied);
+            SPML_ERROR("src=%p nfrags = %d frag_size=%d",
+                       src_addr, nfrags, frag_size);
+            oshmem_shmem_abort(-1);
+        }
+
+        /* copy data to the allocated buffer */
+        segment = des->des_src;
+        spml_yoda_prepare_for_put((void*)segment->seg_addr.pval, ncopied,
+                                  (void*)p_src, (void*)p_dst, put_via_send);
+
+        /* Preparing the destination buffer */
+
+        yoda_context = (mca_spml_yoda_context_t*) r_mkey->spml_context;
+        assert((NULL != yoda_context) && (0 != yoda_context->btl_src_segment_size));
+
+        memcpy(&frag->rdma_segs[0].base_seg,
+               yoda_context->btl_src_segment,
+               yoda_context->btl_src_segment_size);
+
+        frag->rdma_segs[0].base_seg.seg_addr.lval = (uintptr_t) p_dst;
+        frag->rdma_segs[0].base_seg.seg_len = (put_via_send ?
+                                                   ncopied + SPML_YODA_SEND_CONTEXT_SIZE :
+                                                   ncopied);
+        des->des_dst = &frag->rdma_segs[0].base_seg;
+
+        frag->rdma_req = putreq;
+
+        /* initialize callback data for the put */
+        des->des_cbdata = frag;
+        des->des_cbfunc = mca_spml_yoda_put_completion;
+        des->des_dst_cnt = 1;
+
+        OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, 1);
+        /* put the data to the remote side */
+        if (!put_via_send) {
+            rc = mca_bml_base_put(bml_btl, des);
+        } else {
+            rc = mca_bml_base_send(bml_btl, des, MCA_SPML_YODA_PUT);
+            if (1 == rc)
+                rc = OSHMEM_SUCCESS;
+        }
+
+        if (OPAL_UNLIKELY(OSHMEM_SUCCESS != rc)) {
+            if (OSHMEM_ERR_OUT_OF_RESOURCE == rc) {
+                /* No free resources, block on completion here */
+                SPML_ERROR("shmem error: OSHMEM_ERR_OUT_OF_RESOURCE");
+                oshmem_request_wait_completion(&putreq->req_put.req_base.req_oshmem);
+            } else {
+                SPML_ERROR("shmem error");
+            }
+            /* exit with error */
+            SPML_ERROR("shmem error: ret = %i, send_pe = %i, dest_pe = %i",
+                       rc, oshmem_my_proc_id(), dst);
+            oshmem_shmem_abort(-1);
+            rc = OSHMEM_ERROR;
+        }
+        p_src += ncopied;
+        p_dst += ncopied;
+    }
+
+    return rc;
+}
+
+int mca_spml_yoda_put(void *dst_addr, size_t size, void *src_addr, int dst)
+{
+    return mca_spml_yoda_put_internal(dst_addr, size, src_addr, dst, 0);
+}
+
+int mca_spml_yoda_put_nb(void* dst_addr,
+                         size_t size,
+                         void* src_addr,
+                         int dst,
+                         void **handle)
+{
+    UNREFERENCED_PARAMETER(handle);
+
+    return mca_spml_yoda_put_internal(dst_addr, size, src_addr, dst, 1);
+}
+
+int mca_spml_yoda_fence(void)
+{
+    while (0 < mca_spml_yoda.n_active_puts) {
+        oshmem_request_wait_any_completion();
+    }
+    return OSHMEM_SUCCESS;
+}
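+
+/*
+ * The accessors below (de)serialize the BTL source segment that travels
+ * with an mkey. A sketch of the assumed receive-side call order (setting
+ * the size first is the safer order, since it zero-allocates the context
+ * before the segment pointer is attached):
+ *
+ *     mca_spml_yoda_set_remote_context_size(&mkey->spml_context, seg_size);
+ *     mca_spml_yoda_set_remote_context(&mkey->spml_context, seg_copy);
+ */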
+
+void* mca_spml_yoda_get_remote_context(void* spml_context)
+{
+    return ((mca_spml_yoda_context_t*) spml_context)->btl_src_segment;
+}
+
+void mca_spml_yoda_set_remote_context(void** spml_context,
+                                      void* spml_remote_context)
+{
+    mca_spml_yoda_context_t *yoda_context;
+    yoda_context = *(spml_context);
+
+    if (NULL == yoda_context) {
+        yoda_context = (mca_spml_yoda_context_t*) malloc(sizeof(*yoda_context));
+    }
+    yoda_context->btl_src_segment =
+        (mca_btl_base_segment_t*) spml_remote_context;
+    *(spml_context) = yoda_context;
+}
+
+int mca_spml_yoda_get_remote_context_size(void* spml_context)
+{
+    return ((mca_spml_yoda_context_t*) spml_context)->btl_src_segment_size;
+}
+
+void mca_spml_yoda_set_remote_context_size(void** spml_context,
+                                           int spml_remote_context_size)
+{
+    mca_spml_yoda_context_t *yoda_context;
+    yoda_context = *(spml_context);
+
+    if (NULL == yoda_context) {
+        yoda_context = calloc(1, sizeof(*yoda_context));
+    }
+    yoda_context->btl_src_segment_size = spml_remote_context_size;
+    *(spml_context) = yoda_context;
+}
+
+int mca_spml_yoda_enable(bool enable)
+{
+    SPML_VERBOSE(50, "*** yoda ENABLED ****");
+    if (false == enable) {
+        return OSHMEM_SUCCESS;
+    }
+
+    OBJ_CONSTRUCT(&mca_spml_yoda.lock, opal_mutex_t);
+
+    /**
+     * If we get here, this is the SPML that was selected for the run. We
+     * should take ownership of the put and get request lists and
+     * initialize them with the size of our own requests.
+     */
+    ompi_free_list_init_new(&mca_spml_base_put_requests,
+                            sizeof(mca_spml_yoda_put_request_t),
+                            opal_cache_line_size,
+                            OBJ_CLASS(mca_spml_yoda_put_request_t),
+                            0,
+                            opal_cache_line_size,
+                            mca_spml_yoda.free_list_num,
+                            mca_spml_yoda.free_list_max,
+                            mca_spml_yoda.free_list_inc,
+                            NULL);
+
+    ompi_free_list_init_new(&mca_spml_base_get_requests,
+                            sizeof(mca_spml_yoda_get_request_t),
+                            opal_cache_line_size,
+                            OBJ_CLASS(mca_spml_yoda_get_request_t),
+                            0,
+                            opal_cache_line_size,
+                            mca_spml_yoda.free_list_num,
+                            mca_spml_yoda.free_list_max,
+                            mca_spml_yoda.free_list_inc,
+                            NULL);
+
+    mca_spml_yoda.enabled = true;
+
+#if OSHMEM_WAIT_COMPLETION_DEBUG == 1
+    condition_dbg_init();
+#endif
+
+    return OSHMEM_SUCCESS;
+}
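+
+/*
+ * Request life-cycle fed by the free lists initialized above (the flow is
+ * an illustration; the names are from this patch):
+ *
+ *     mca_spml_yoda_put_request_t *r = mca_spml_yoda_putreq_alloc(dst);
+ *     ...        // descriptor posted; completion callback fires later
+ *     oshmem_request_free(...);  // returns r to mca_spml_base_put_requests
+ */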
+
+/**
+ * shmem_get reads data from a remote address
+ * in the symmetric heap via RDMA READ.
+ * Get operation:
+ * 1. Get the rkey to the remote address.
+ * 2. Allocate a get request.
+ * 3. Allocate a temporary pre-registered buffer
+ *    to copy the data to.
+ * 4. Init the request descriptor with remote side
+ *    data and local side data.
+ * 5. Read the remote buffer to a pre-registered
+ *    buffer on the local PE using RDMA READ.
+ * 6. Copy the received data to dst_addr if an
+ *    intermediate pre-registered buffer was used.
+ * 7. Clear the request and return.
+ *
+ * src_addr - address on the remote pe.
+ * size - the number of bytes to be read.
+ * dst_addr - address on the local pe.
+ * src - the pe of the remote process.
+ */
+int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src)
+{
+    int rc = OSHMEM_SUCCESS;
+    mca_spml_mkey_t *r_mkey, *l_mkey;
+    uint64_t rva;
+    unsigned ncopied = 0;
+    int frag_size = 0;
+    char *p_src, *p_dst;
+    int i;
+    int nfrags;
+    mca_bml_base_btl_t* bml_btl = NULL;
+    mca_btl_base_segment_t* segment;
+    mca_btl_base_descriptor_t* des = NULL;
+    mca_spml_yoda_rdma_frag_t* frag = NULL;
+    struct mca_spml_yoda_getreq_parent get_holder;
+    struct yoda_btl *ybtl;
+    int btl_id = 0;
+    mca_spml_yoda_context_t* yoda_context;
+    int get_via_send;
+    const opal_datatype_t *datatype = &opal_datatype_wchar;
+    opal_convertor_t convertor;
+    oshmem_proc_t *proc_self;
+    size_t prepare_size;
+    mca_mpool_base_registration_t* registration;
+    mca_spml_yoda_get_request_t* getreq = NULL;
+
+    /* If there is nothing to get, it is OK. */
+    if (0 >= size) {
+        return rc;
+    }
+
+    /* Find bml_btl and its global btl_id */
+    bml_btl = get_next_btl(src, &btl_id);
+    if (!bml_btl) {
+        SPML_ERROR("cannot reach %d pe: no appropriate btl found", oshmem_my_proc_id());
+        oshmem_shmem_abort(-1);
+    }
+    /* Check if the btl has a GET method. If it doesn't - use SEND */
+    get_via_send = !((bml_btl->btl->btl_flags & (MCA_BTL_FLAGS_GET)) &&
+                     (bml_btl->btl->btl_flags & (MCA_BTL_FLAGS_PUT)));
+
+    /* Get the rkey of the remote PE (src proc), which must be on the memheap */
+    r_mkey = mca_memheap.memheap_get_cached_mkey(src,
+                                                 (unsigned long) src_addr,
+                                                 btl_id,
+                                                 &rva);
+    if (!r_mkey) {
+        SPML_ERROR("pe=%d: %p is not address of shared variable",
+                   src, src_addr);
+        oshmem_shmem_abort(-1);
+    }
+#if SPML_YODA_DEBUG == 1
+    SPML_VERBOSE(100, "get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, src_rkey=0x%lx",
+                 src, src_addr, dst_addr, (int)size, (void *)rva, r_mkey->key);
+#endif
+
+    ybtl = &mca_spml_yoda.btl_type_map[btl_id];
+
+    nfrags = 1;
+
+    /* check if we are doing a get from an shm-attached segment and, if so,
+     * just do a memcpy
+     */
+    if ((ybtl->btl_type == YODA_BTL_SM)
+            && OPAL_LIKELY(mca_memheap.memheap_is_symmetric_addr((unsigned long)src_addr) && (unsigned long)src_addr != rva)) {
+        memcpy(dst_addr, (void *) (unsigned long) rva, size);
+        /* We must call progress here to avoid deadlock. Scenario:
+         * pe1 polls pe2 via shm get. pe2 tries to get a static variable from
+         * node one, which goes to the sm btl.
+         * In this case pe2 is stuck forever because pe1 never calls opal_progress.
+         * Maybe we do not need to call progress on every get() here but rather
+         * once in a while.
+         */
+        opal_progress();
+        return OSHMEM_SUCCESS;
+    }
+
+    l_mkey = mca_memheap.memheap_get_local_mkey((unsigned long) dst_addr,
+                                                btl_id);
+    /*
+     * We need a copy if local memory has not been registered or
+     * we do the GET via SEND
+     */
+    frag_size = (int)ncopied;
+    if ((NULL == l_mkey) || get_via_send) {
+        calc_nfrags(bml_btl, size, &frag_size, &nfrags, get_via_send);
+    }
+
+    p_src = (char*) (unsigned long) rva;
+    p_dst = (char*) dst_addr;
+    get_holder.active_count = 0;
+
+    for (i = 0; i < nfrags; i++) {
+        /**
+         * Allocate a get request from a pre-allocated
+         * and pre-registered free list.
+         */
+        getreq = mca_spml_yoda_getreq_alloc(src);
+        assert(getreq);
+        getreq->p_dst = NULL;
+        frag = &getreq->get_frag;
+        getreq->parent = &get_holder;
+
+        ncopied = i < nfrags - 1 ? (unsigned)frag_size : (char *) dst_addr + size - p_dst;
+        frag->allocated = 0;
+        /* Prepare the destination descriptor */
+        yoda_context = r_mkey->spml_context;
+        assert(0 != yoda_context->btl_src_segment_size);
+        memcpy(&frag->rdma_segs[0].base_seg,
+               yoda_context->btl_src_segment,
+               yoda_context->btl_src_segment_size);
+
+        frag->rdma_segs[0].base_seg.seg_len = (get_via_send ? ncopied + SPML_YODA_SEND_CONTEXT_SIZE : ncopied);
+        if (get_via_send) {
+            frag->use_send = 1;
+            frag->allocated = 1;
+            /**
+             * Allocate a temporary buffer on the local PE.
+             * The local buffer will store the data read
+             * from the remote address.
+ */ + mca_spml_yoda_bml_alloc(bml_btl, + &des, + MCA_BTL_NO_ORDER, + (int)frag_size, + MCA_BTL_DES_SEND_ALWAYS_CALLBACK, + get_via_send); + if (OPAL_UNLIKELY(!des || !des->des_src)) { + SPML_ERROR("shmem OOM error need %d bytes", ncopied); + SPML_ERROR("src=%p nfrags = %d frag_size=%d", + src_addr, nfrags, frag_size); + oshmem_shmem_abort(-1); + } + + segment = des->des_src; + spml_yoda_prepare_for_get((void*)segment->seg_addr.pval, ncopied, (void*)p_src, oshmem_my_proc_id(), (void*)p_dst, (void*) getreq); + des->des_cbfunc = mca_spml_yoda_get_response_completion; + } + else{ + /* + * Register src memory if do GET via GET + */ + proc_self = oshmem_proc_group_find(oshmem_group_all, oshmem_my_proc_id()); + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + + prepare_size = ncopied; + opal_convertor_copy_and_prepare_for_recv(proc_self->proc_convertor, + datatype, + prepare_size, + p_dst, + 0, + &convertor); + + registration = (NULL == l_mkey ? NULL : ((mca_spml_yoda_context_t*)l_mkey->spml_context)->registration); + des = ybtl->btl->btl_prepare_dst(ybtl->btl, + bml_btl->btl_endpoint, + registration, + &convertor, + MCA_BTL_NO_ORDER, + 0, + &prepare_size, + 0); + if (NULL == des) { + SPML_ERROR("%s: failed to register destination memory %p.", + btl_type2str(ybtl->btl_type), p_dst); + } + OBJ_DESTRUCT(&convertor); + frag->rdma_segs[0].base_seg.seg_addr.lval = (uintptr_t) p_src; + getreq->p_dst = (uint64_t*) p_dst; + frag->size = ncopied; + des->des_cbfunc = mca_spml_yoda_get_completion; + des->des_src = &frag->rdma_segs[0].base_seg; + } + + /** + * Initialize the remote data fragment + * with remote address data required for + * executing RDMA READ from a remote buffer. + */ + + frag->rdma_req = getreq; + + /** + * Init remote side descriptor. + */ + des->des_src_cnt = 1; + des->des_cbdata = frag; + + /** + * Do GET operation + */ + if (get_via_send){ + rc = mca_bml_base_send(bml_btl, des, MCA_SPML_YODA_GET); + if (1 == rc) + rc = OSHMEM_SUCCESS; + } else { + rc = mca_bml_base_get(bml_btl, des); + } + + if (OPAL_UNLIKELY(OSHMEM_SUCCESS != rc)) { + if (OSHMEM_ERR_OUT_OF_RESOURCE == rc) { + /* No free resources, Block on completion here */ + oshmem_request_wait_completion(&getreq->req_get.req_base.req_oshmem); + return OSHMEM_SUCCESS; + } else { + SPML_ERROR("oshmem_get: error %d", rc); + oshmem_shmem_abort(-1); + return rc; + } + } + p_dst += ncopied; + p_src += ncopied; + OPAL_THREAD_ADD32(&get_holder.active_count, 1); + } + + /* revisit if we really need this for self and sm */ + /* if (YODA_BTL_SELF == ybtl->btl_type) */ + opal_progress(); + + /* Wait for completion on request */ + while (get_holder.active_count > 0) + oshmem_request_wait_completion(&getreq->req_get.req_base.req_oshmem); + + return rc; +} + +int mca_spml_yoda_send(void* buf, + size_t size, + int dst, + mca_spml_base_put_mode_t sendmode) +{ + int rc = OSHMEM_SUCCESS; + + rc = MCA_PML_CALL(send(buf, + size, + &(ompi_mpi_unsigned_char.dt), + dst, + 0, + (mca_pml_base_send_mode_t)sendmode, + &(ompi_mpi_comm_world.comm))); + + return rc; +} + +int mca_spml_yoda_recv(void* buf, size_t size, int src) +{ + int rc = OSHMEM_SUCCESS; + + rc = MCA_PML_CALL(recv(buf, + size, + &(ompi_mpi_unsigned_char.dt), + src, + 0, + &(ompi_mpi_comm_world.comm), + NULL)); + + return rc; +} + diff --git a/oshmem/mca/spml/yoda/spml_yoda.h b/oshmem/mca/spml/yoda/spml_yoda.h new file mode 100644 index 0000000000..b249e5ac93 --- /dev/null +++ b/oshmem/mca/spml/yoda/spml_yoda.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. 
+ * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_SPML_YODA_H +#define MCA_SPML_YODA_H + +#include "oshmem_config.h" +#include "oshmem/request/request.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/spml_base_putreq.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/base/spml_base_request.h" +#include "oshmem/mca/spml/base/spml_base_getreq.h" + +#include "orte/runtime/orte_globals.h" + +#include "ompi/mca/bml/base/base.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/class/ompi_free_list.h" + +/* Turn ON/OFF debug output from build (default 0) */ +#ifndef OSHMEM_WAIT_COMPLETION_DEBUG +#define OSHMEM_WAIT_COMPLETION_DEBUG 0 +#endif + +#define MCA_SPML_YODA_PUT (MCA_BTL_TAG_USR + 0x0A) +#define MCA_SPML_YODA_GET (MCA_BTL_TAG_USR + 0x0B) +#define MCA_SPML_YODA_GET_RESPONSE (MCA_BTL_TAG_USR + 0x0C) + +#define SPML_YODA_SEND_CONTEXT_SIZE (sizeof(size_t) + 3*sizeof(void*) + sizeof(int)) +BEGIN_C_DECLS + +/** + * YODA SPML module + */ + +enum { + YODA_BTL_UNKNOWN = -1, + YODA_BTL_SELF = 0, + YODA_BTL_SM, + YODA_BTL_OPENIB, + YODA_BTL_MAX +}; + +struct yoda_btl { + mca_btl_base_module_t *btl; + int btl_type; + int use_cnt; +}; + +struct mca_spml_yoda_t { + mca_spml_base_module_t super; + + int priority; + int free_list_num; /* initial size of free list */ + int free_list_max; /* maximum size of free list */ + int free_list_inc; /* number of elements to grow free list */ + + /* lock queue access */ + opal_mutex_t lock; + + /* free lists */ + ompi_free_list_t rdma_frags; + /* number of outstanding put requests */ + uint32_t n_active_puts; + bool enabled; + struct yoda_btl *btl_type_map; + int n_btls; +}; +typedef struct mca_spml_yoda_t mca_spml_yoda_module_t; + +struct mca_spml_yoda_context_t { + mca_btl_base_descriptor_t* btl_src_descriptor; + int btl_src_segment_size; + mca_btl_base_segment_t* btl_src_segment; + mca_mpool_base_registration_t* registration; +}; +typedef struct mca_spml_yoda_context_t mca_spml_yoda_context_t; + +extern mca_spml_yoda_module_t mca_spml_yoda; + +extern int mca_spml_yoda_enable(bool enable); +extern int mca_spml_yoda_get(void* dst_addr, + size_t size, + void* src_addr, + int src); +extern int mca_spml_yoda_put(void* dst_addr, + size_t size, + void* src_addr, + int dst); +extern int mca_spml_yoda_put_nb(void* dst_addr, + size_t size, + void* src_addr, + int dst, + void **handle); +extern int mca_spml_yoda_recv(void* buf, size_t size, int src); +extern int mca_spml_yoda_send(void* buf, + size_t size, + int dst, + mca_spml_base_put_mode_t mode); +extern mca_spml_mkey_t *mca_spml_yoda_register(void* addr, + size_t size, + uint64_t shmid, + int *count); +extern int mca_spml_yoda_deregister(mca_spml_mkey_t *mkeys); +extern int mca_spml_yoda_add_procs(oshmem_proc_t** procs, size_t nprocs); +extern int mca_spml_yoda_del_procs(oshmem_proc_t** procs, size_t nprocs); +extern int mca_spml_yoda_fence(void); +extern void* mca_spml_yoda_get_remote_context(void*); +extern void mca_spml_yoda_set_remote_context(void**, void*); +extern int mca_spml_yoda_get_remote_context_size(void*); +extern void mca_spml_yoda_set_remote_context_size(void**, int); + +#if OSHMEM_WAIT_COMPLETION_DEBUG == 1 +extern void condition_dbg_init(void); +extern void condition_dbg_finalize(void); +#endif + +END_C_DECLS + +#endif + diff --git a/oshmem/mca/spml/yoda/spml_yoda_component.c b/oshmem/mca/spml/yoda/spml_yoda_component.c new file mode 100644 index 0000000000..fd9824371f --- /dev/null +++ 
b/oshmem/mca/spml/yoda/spml_yoda_component.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include "oshmem/runtime/params.h" +#include "oshmem/mca/spml/spml.h" +#include "ompi/mca/bml/base/base.h" +#include "spml_yoda_component.h" +#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h" +#include "oshmem/mca/spml/yoda/spml_yoda_putreq.h" +#include "oshmem/mca/spml/yoda/spml_yoda.h" + +static int mca_spml_yoda_component_register(void); +static int mca_spml_yoda_component_open(void); +static int mca_spml_yoda_component_close(void); +static mca_spml_base_module_t* +mca_spml_yoda_component_init(int* priority, + bool enable_progress_threads, + bool enable_mpi_threads); +static int mca_spml_yoda_component_fini(void); +mca_spml_base_component_2_0_0_t mca_spml_yoda_component = { + + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + + { + MCA_SPML_BASE_VERSION_2_0_0, + + "yoda", /* MCA component name */ + OSHMEM_MAJOR_VERSION, /* MCA component major version */ + OSHMEM_MINOR_VERSION, /* MCA component minor version */ + OSHMEM_RELEASE_VERSION, /* MCA component release version */ + mca_spml_yoda_component_open, /* component open */ + mca_spml_yoda_component_close, /* component close */ + NULL, + mca_spml_yoda_component_register + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + mca_spml_yoda_component_init, /* component init */ + mca_spml_yoda_component_fini /* component finalize */ + +}; + +static inline int mca_spml_yoda_param_register_int(const char *param_name, + int default_value, + const char *help_msg) +{ + int param_value; + + param_value = default_value; + (void) mca_base_component_var_register(&mca_spml_yoda_component.spmlm_version, + param_name, + help_msg, + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + ¶m_value); + + return param_value; +} + +static int mca_spml_yoda_component_register(void) +{ + mca_spml_yoda.free_list_num = + mca_spml_yoda_param_register_int("free_list_num", 1024, 0); + mca_spml_yoda.free_list_max = + mca_spml_yoda_param_register_int("free_list_max", 1024, 0); + mca_spml_yoda.free_list_inc = + mca_spml_yoda_param_register_int("free_list_inc", 16, 0); + mca_spml_yoda.priority = + mca_spml_yoda_param_register_int("priority", + 20, + "[integer] yoda priority"); + return OSHMEM_SUCCESS; +} + +static int mca_spml_yoda_component_open(void) +{ + return mca_base_framework_open(&ompi_bml_base_framework, 0); +} + +static int mca_spml_yoda_component_close(void) +{ + int rc; + if (OMPI_SUCCESS != (rc = mca_base_framework_close(&ompi_bml_base_framework))) { + return rc; + } + return OSHMEM_SUCCESS; +} + +static mca_spml_base_module_t* +mca_spml_yoda_component_init(int* priority, + bool enable_progress_threads, + bool enable_mpi_threads) +{ + SPML_VERBOSE( 10, "in yoda, my priority is %d\n", mca_spml_yoda.priority); + + *priority = mca_spml_yoda.priority; + if ((*priority) > mca_spml_yoda.priority) { + return NULL ; + } + + /* We use BML/BTL and need to start it */ + if (!mca_bml_base_inited()) { + SPML_VERBOSE(10, "starting bml\n"); + if (OSHMEM_SUCCESS + != mca_bml_base_init(enable_progress_threads, + enable_mpi_threads)) { + return NULL ; + } + } + + mca_spml_yoda.n_active_puts = 0; + + return &mca_spml_yoda.super; +} + +int mca_spml_yoda_component_fini(void) +{ + int rc; + + /* 
Shutdown BML */
+    if (OMPI_SUCCESS != (rc = mca_bml.bml_finalize()))
+        return rc;
+
+    OBJ_DESTRUCT(&mca_spml_yoda.lock);
+#if OSHMEM_WAIT_COMPLETION_DEBUG == 1
+    condition_dbg_finalize();
+#endif
+
+    return OSHMEM_SUCCESS;
+}
+
diff --git a/oshmem/mca/spml/yoda/spml_yoda_component.h b/oshmem/mca/spml/yoda/spml_yoda_component.h
new file mode 100644
index 0000000000..b6b6e11c37
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_component.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+
+#ifndef MCA_SPML_YODA_COMPONENT_H
+#define MCA_SPML_YODA_COMPONENT_H
+
+BEGIN_C_DECLS
+
+/*
+ * SPML module functions.
+ */
+OSHMEM_MODULE_DECLSPEC extern mca_spml_base_component_2_0_0_t mca_spml_yoda_component;
+END_C_DECLS
+
+#endif
diff --git a/oshmem/mca/spml/yoda/spml_yoda_getreq.c b/oshmem/mca/spml/yoda/spml_yoda_getreq.c
new file mode 100644
index 0000000000..87143d5c28
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_getreq.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include "opal/prefetch.h"
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "ompi/mca/btl/btl.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "ompi/mca/mpool/mpool.h"
+#include "ompi/mca/bml/base/base.h"
+#include "oshmem/mca/spml/yoda/spml_yoda.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_putreq.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_getreq.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h"
+
+/*
+ * The free call marks the final stage in a request life-cycle. Starting from
+ * this point the request is completed at both the SPML and user level, and
+ * can be used for other one-sided communications. Therefore, in the case of
+ * the YODA SPML it should be added to the free request list.
+ */
+static int mca_spml_yoda_get_request_free(struct oshmem_request_t** request)
+{
+    mca_spml_yoda_get_request_t* getreq =
+        *(mca_spml_yoda_get_request_t**) request;
+
+    assert(false == getreq->req_get.req_base.req_free_called);
+
+    OPAL_THREAD_LOCK(&oshmem_request_lock);
+    getreq->req_get.req_base.req_free_called = true;
+
+    OMPI_FREE_LIST_RETURN_MT(&mca_spml_base_get_requests,
+                             (ompi_free_list_item_t*)getreq);
+
+    OPAL_THREAD_UNLOCK(&oshmem_request_lock);
+
+    *request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/
+    return OSHMEM_SUCCESS;
+}
+
+static int mca_spml_yoda_get_request_cancel(struct oshmem_request_t* request,
+                                            int complete)
+{
+    /* we do not cancel get requests for now */
+    return OSHMEM_SUCCESS;
+}
+
+static void mca_spml_yoda_get_request_construct(mca_spml_yoda_get_request_t* req)
+{
+    req->req_get.req_base.req_type = MCA_SPML_REQUEST_GET;
+    req->req_get.req_base.req_oshmem.req_free = mca_spml_yoda_get_request_free;
+    req->req_get.req_base.req_oshmem.req_cancel =
+        mca_spml_yoda_get_request_cancel;
+}
+
+static void mca_spml_yoda_get_request_destruct(mca_spml_yoda_get_request_t* req)
+{
+}
+
+OBJ_CLASS_INSTANCE(mca_spml_yoda_get_request_t,
+                   mca_spml_base_get_request_t,
+                   mca_spml_yoda_get_request_construct,
+                   mca_spml_yoda_get_request_destruct);
+
+void mca_spml_yoda_get_completion(mca_btl_base_module_t* btl,
+                                  struct mca_btl_base_endpoint_t* ep,
+                                  struct mca_btl_base_descriptor_t* des,
+                                  int status)
+{
+    mca_spml_yoda_rdma_frag_t* frag =
+        (mca_spml_yoda_rdma_frag_t*) des->des_cbdata;
+    mca_spml_yoda_get_request_t* getreq =
+        (mca_spml_yoda_get_request_t*) frag->rdma_req;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    /* check completion status */
+    if (OPAL_UNLIKELY(OSHMEM_SUCCESS != status)) {
+        /* shmem has no way to propagate errors. cry&die */
+        SPML_ERROR("FATAL get completion error");
+        abort();
+    }
+    /* decide if we need to copy the buffer */
+    if (getreq->p_dst) {
+        memcpy(getreq->p_dst,
+               des->des_dst->seg_addr.pval,
+               frag->size);
+    }
+
+    if (getreq->parent) {
+        OPAL_THREAD_ADD32(&getreq->parent->active_count, -1);
+    }
+    getreq->req_get.req_base.req_spml_complete = true;
+    oshmem_request_complete(&getreq->req_get.req_base.req_oshmem, 1);
+    oshmem_request_free((oshmem_request_t**) &getreq);
+
+    mca_bml_base_free(bml_btl, des);
+}
+
+void mca_spml_yoda_get_response_completion(mca_btl_base_module_t* btl,
+                                           struct mca_btl_base_endpoint_t* ep,
+                                           struct mca_btl_base_descriptor_t* des,
+                                           int status)
+{
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    /* check completion status */
+    if (OPAL_UNLIKELY(OSHMEM_SUCCESS != status)) {
+        /* shmem has no way to propagate errors. cry&die */
+        SPML_ERROR("FATAL get completion error");
+        abort();
+    }
+
+    mca_bml_base_free(bml_btl, des);
+}
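+
+/*
+ * How the callbacks above tie back into mca_spml_yoda_get() in
+ * spml_yoda.c: on completion any staged data is copied out to
+ * getreq->p_dst and parent->active_count is decremented, which releases
+ * the wait loop there:
+ *
+ *     while (get_holder.active_count > 0)
+ *         oshmem_request_wait_completion(&getreq->req_get.req_base.req_oshmem);
+ */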
diff --git a/oshmem/mca/spml/yoda/spml_yoda_getreq.h b/oshmem/mca/spml/yoda/spml_yoda_getreq.h
new file mode 100644
index 0000000000..257e5342d5
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_getreq.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OSHMEM_SPML_YODA_GET_REQUEST_H
+#define OSHMEM_SPML_YODA_GET_REQUEST_H
+
+#include "ompi/mca/btl/btl.h"
+#include "oshmem/mca/spml/base/spml_base_putreq.h"
+#include "ompi/mca/mpool/base/base.h"
+#include "ompi/mca/pml/ob1/pml_ob1_comm.h"
+#include "ompi/mca/bml/bml.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h"
+#include "oshmem/mca/spml/yoda/spml_yoda.h"
+#include "orte/runtime/orte_globals.h"
+#include "oshmem/mca/spml/base/spml_base_getreq.h"
+
+BEGIN_C_DECLS
+
+struct mca_spml_yoda_getreq_parent {
+    uint32_t active_count;
+};
+
+struct mca_spml_yoda_get_request_t {
+    mca_spml_base_get_request_t req_get;
+    uint64_t *p_dst;
+    struct mca_spml_yoda_getreq_parent *parent;
+    mca_spml_yoda_rdma_frag_t get_frag;
+};
+
+typedef struct mca_spml_yoda_get_request_t mca_spml_yoda_get_request_t;
+OBJ_CLASS_DECLARATION(mca_spml_yoda_get_request_t);
+
+static inline mca_spml_yoda_get_request_t *mca_spml_yoda_getreq_alloc(int dst)
+{
+    ompi_free_list_item_t *item;
+    mca_spml_yoda_get_request_t *getreq;
+
+    OMPI_FREE_LIST_WAIT_MT(&mca_spml_base_get_requests, item);
+    getreq = (mca_spml_yoda_get_request_t*) item;
+    assert(getreq);
+    getreq->req_get.req_base.req_free_called = false;
+    getreq->req_get.req_base.req_oshmem.req_complete = false;
+
+    return getreq;
+}
+
+void mca_spml_yoda_get_completion(mca_btl_base_module_t* btl,
+                                  struct mca_btl_base_endpoint_t* ep,
+                                  struct mca_btl_base_descriptor_t* des,
+                                  int status);
+
+void mca_spml_yoda_get_response_completion(mca_btl_base_module_t* btl,
+                                           struct mca_btl_base_endpoint_t* ep,
+                                           struct mca_btl_base_descriptor_t* des,
+                                           int status);
+
+END_C_DECLS
+#endif /* OSHMEM_SPML_YODA_GET_REQUEST_H */
diff --git a/oshmem/mca/spml/yoda/spml_yoda_putreq.c b/oshmem/mca/spml/yoda/spml_yoda_putreq.c
new file mode 100644
index 0000000000..4a2c9da2ab
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_putreq.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include "opal/prefetch.h"
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "ompi/mca/btl/btl.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "ompi/mca/mpool/mpool.h"
+#include "ompi/mca/bml/base/base.h"
+#include "oshmem/mca/spml/yoda/spml_yoda.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_putreq.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h"
+#include "oshmem/runtime/runtime.h"
+
+/*
+ * The free call marks the final stage in a request life-cycle. Starting from
+ * this point the request is completed at both the SPML and user level, and
+ * can be used for other p2p communications. Therefore, in the case of the
+ * YODA SPML it should be added to the free request list.
+ */
+static int mca_spml_yoda_put_request_free(struct oshmem_request_t** request)
+{
+    mca_spml_yoda_put_request_t* putreq =
+        *(mca_spml_yoda_put_request_t**) request;
+
+    assert(false == putreq->req_put.req_base.req_free_called);
+
+    OPAL_THREAD_LOCK(&oshmem_request_lock);
+    putreq->req_put.req_base.req_free_called = true;
+    OMPI_FREE_LIST_RETURN_MT(&mca_spml_base_put_requests,
+                             (ompi_free_list_item_t*)putreq);
+    OPAL_THREAD_UNLOCK(&oshmem_request_lock);
+
+    *request = SHMEM_REQUEST_NULL;
+    return OSHMEM_SUCCESS;
+}
+
+static int mca_spml_yoda_put_request_cancel(struct oshmem_request_t* request,
+                                            int complete)
+{
+    /* we do not cancel put requests for now */
+    return OSHMEM_SUCCESS;
+}
+
+static void mca_spml_yoda_put_request_construct(mca_spml_yoda_put_request_t* req)
+{
+    req->req_put.req_base.req_type = MCA_SPML_REQUEST_PUT;
+    req->req_put.req_base.req_oshmem.req_free = mca_spml_yoda_put_request_free;
+    req->req_put.req_base.req_oshmem.req_cancel =
+        mca_spml_yoda_put_request_cancel;
+}
+
+static void mca_spml_yoda_put_request_destruct(mca_spml_yoda_put_request_t* req)
+{
+}
+
+OBJ_CLASS_INSTANCE(mca_spml_yoda_put_request_t,
+                   mca_spml_base_put_request_t,
+                   mca_spml_yoda_put_request_construct,
+                   mca_spml_yoda_put_request_destruct);
+
+void mca_spml_yoda_put_completion(mca_btl_base_module_t* btl,
+                                  struct mca_btl_base_endpoint_t* ep,
+                                  struct mca_btl_base_descriptor_t* des,
+                                  int status)
+{
+    mca_spml_yoda_rdma_frag_t* frag =
+        (mca_spml_yoda_rdma_frag_t*) des->des_cbdata;
+    mca_spml_yoda_put_request_t* putreq =
+        (mca_spml_yoda_put_request_t*) frag->rdma_req;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, -1);
+    /* check completion status */
+    if (OPAL_UNLIKELY(OSHMEM_SUCCESS != status)) {
+        /* no way to propagate errors. die */
+        SPML_ERROR("FATAL put completion error");
+        oshmem_shmem_abort(-1);
+    }
+
+    putreq->req_put.req_base.req_spml_complete = true;
+    oshmem_request_complete(&putreq->req_put.req_base.req_oshmem, 1);
+    oshmem_request_free((oshmem_request_t**) &putreq);
+    mca_bml_base_free(bml_btl, des);
+}
+
diff --git a/oshmem/mca/spml/yoda/spml_yoda_putreq.h b/oshmem/mca/spml/yoda/spml_yoda_putreq.h
new file mode 100644
index 0000000000..898d971a53
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_putreq.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSHMEM_SPML_YODA_PUT_REQUEST_H +#define OSHMEM_SPML_YODA_PUT_REQUEST_H + +#include "ompi/mca/btl/btl.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/mca/spml/base/spml_base_putreq.h" +#include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/bml/bml.h" +#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h" +#include "oshmem/mca/spml/yoda/spml_yoda.h" +#include "orte/runtime/orte_globals.h" + +BEGIN_C_DECLS + +struct mca_spml_yoda_put_request_t { + mca_spml_base_put_request_t req_put; + mca_spml_yoda_rdma_frag_t put_frag; +}; + +typedef struct mca_spml_yoda_put_request_t mca_spml_yoda_put_request_t; + +OBJ_CLASS_DECLARATION(mca_spml_yoda_put_request_t); + +static inline mca_spml_yoda_put_request_t *mca_spml_yoda_putreq_alloc(int dst) +{ + ompi_free_list_item_t *item; + mca_spml_yoda_put_request_t *putreq; + + OMPI_FREE_LIST_WAIT_MT(&mca_spml_base_put_requests, item); + putreq = (mca_spml_yoda_put_request_t*) item; + assert(putreq); + putreq->req_put.req_base.req_free_called = false; + putreq->req_put.req_base.req_oshmem.req_complete = false; + + return putreq; +} + +void mca_spml_yoda_put_completion(mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status); + +END_C_DECLS + +#endif /* OSHMEM_SPML_YODA_PUT_REQUEST_H */ diff --git a/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h b/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h new file mode 100644 index 0000000000..2a36ea7129 --- /dev/null +++ b/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_SPML_YODA_RDMAFRAG_H +#define MCA_SPML_YODA_RDMAFRAG_H + +#include "ompi/mca/btl/btl.h" +#include "opal/types.h" +#include "opal/util/arch.h" +#include "oshmem/proc/proc.h" + +BEGIN_C_DECLS + +typedef enum { + MCA_SPML_YODA_RDMA_PUT, + MCA_SPML_YODA_RDMA_GET +} mca_spml_yoda_rdma_state_t; + +typedef union mca_spml_yoda_segment_t { + mca_btl_base_segment_t base_seg; +} mca_spml_yoda_segment_t; + +struct mca_spml_yoda_rdma_frag_t { + mca_spml_yoda_segment_t rdma_segs[2]; + mca_btl_base_segment_t *btl_seg; /* save pointer to btl allocated descriptor segment */ + void *rdma_req; + int allocated; + int use_send; + int size; +}; + +typedef struct mca_spml_yoda_rdma_frag_t mca_spml_yoda_rdma_frag_t; +END_C_DECLS +#endif + diff --git a/oshmem/op/Makefile.am b/oshmem/op/Makefile.am new file mode 100644 index 0000000000..ba7223c28f --- /dev/null +++ b/oshmem/op/Makefile.am @@ -0,0 +1,19 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# # $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# +# This makefile.am does not stand on its own - it is included from oshmem/Makefile.am + + +headers += \ + op/op.h + +libshmem_la_SOURCES += \ + op/op.c + diff --git a/oshmem/op/op.c b/oshmem/op/op.c new file mode 100644 index 0000000000..0da8fe29e0 --- /dev/null +++ b/oshmem/op/op.c @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include <complex.h>   /* for float complex / double complex */
+
+#include "orte/runtime/orte_globals.h"
+
+#include "opal/datatype/opal_datatype_internal.h"
+#include "opal/class/opal_pointer_array.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/op/op.h"
+
+/*
+ * Table for op handle conversion
+ */
+opal_pointer_array_t oshmem_op_array;
+
+/*
+ * Class information
+ */
+static void oshmem_op_construct(oshmem_op_t *object);
+static void oshmem_op_destruct(oshmem_op_t *object);
+
+/*
+ * Class instance
+ */
+OBJ_CLASS_INSTANCE(oshmem_op_t,
+                   opal_object_t,
+                   oshmem_op_construct,
+                   oshmem_op_destruct);
+
+/*
+ * Intrinsic Operation objects
+ */
+/* Bitwise AND */
+oshmem_op_t* oshmem_op_and_short = NULL;
+oshmem_op_t* oshmem_op_and_int = NULL;
+oshmem_op_t* oshmem_op_and_long = NULL;
+oshmem_op_t* oshmem_op_and_longlong = NULL;
+oshmem_op_t* oshmem_op_and_fint4 = NULL;
+oshmem_op_t* oshmem_op_and_fint8 = NULL;
+
+/* Bitwise OR */
+oshmem_op_t* oshmem_op_or_short = NULL;
+oshmem_op_t* oshmem_op_or_int = NULL;
+oshmem_op_t* oshmem_op_or_long = NULL;
+oshmem_op_t* oshmem_op_or_longlong = NULL;
+oshmem_op_t* oshmem_op_or_fint4 = NULL;
+oshmem_op_t* oshmem_op_or_fint8 = NULL;
+
+/* Bitwise XOR */
+oshmem_op_t* oshmem_op_xor_short = NULL;
+oshmem_op_t* oshmem_op_xor_int = NULL;
+oshmem_op_t* oshmem_op_xor_long = NULL;
+oshmem_op_t* oshmem_op_xor_longlong = NULL;
+oshmem_op_t* oshmem_op_xor_fint4 = NULL;
+oshmem_op_t* oshmem_op_xor_fint8 = NULL;
+
+/* MAX */
+oshmem_op_t* oshmem_op_max_short = NULL;
+oshmem_op_t* oshmem_op_max_int = NULL;
+oshmem_op_t* oshmem_op_max_long = NULL;
+oshmem_op_t* oshmem_op_max_longlong = NULL;
+oshmem_op_t* oshmem_op_max_float = NULL;
+oshmem_op_t* oshmem_op_max_double = NULL;
+oshmem_op_t* oshmem_op_max_longdouble = NULL;
+oshmem_op_t* oshmem_op_max_fint4 = NULL;
+oshmem_op_t* oshmem_op_max_fint8 = NULL;
+oshmem_op_t* oshmem_op_max_freal4 = NULL;
+oshmem_op_t* oshmem_op_max_freal8 = NULL;
+oshmem_op_t* oshmem_op_max_freal16 = NULL;
+
+/* MIN */
+oshmem_op_t* oshmem_op_min_short = NULL;
+oshmem_op_t* oshmem_op_min_int = NULL;
+oshmem_op_t* oshmem_op_min_long = NULL;
+oshmem_op_t* oshmem_op_min_longlong = NULL;
+oshmem_op_t* oshmem_op_min_float = NULL;
+oshmem_op_t* oshmem_op_min_double = NULL;
+oshmem_op_t* oshmem_op_min_longdouble = NULL;
+oshmem_op_t* oshmem_op_min_fint4 = NULL;
+oshmem_op_t* oshmem_op_min_fint8 = NULL;
+oshmem_op_t* oshmem_op_min_freal4 = NULL;
+oshmem_op_t* oshmem_op_min_freal8 = NULL;
+oshmem_op_t* oshmem_op_min_freal16 = NULL;
+
+/* SUM */
+oshmem_op_t* oshmem_op_sum_short = NULL;
+oshmem_op_t* oshmem_op_sum_int = NULL;
+oshmem_op_t* oshmem_op_sum_long = NULL;
+oshmem_op_t* oshmem_op_sum_longlong = NULL;
+oshmem_op_t* oshmem_op_sum_float = NULL;
+oshmem_op_t* oshmem_op_sum_double = NULL;
+oshmem_op_t* oshmem_op_sum_longdouble = NULL;
+oshmem_op_t* oshmem_op_sum_complexf = NULL;
+oshmem_op_t* oshmem_op_sum_complexd = NULL;
+oshmem_op_t* oshmem_op_sum_fint4 = NULL;
+oshmem_op_t* oshmem_op_sum_fint8 = NULL;
+oshmem_op_t* oshmem_op_sum_freal4 = NULL;
+oshmem_op_t* oshmem_op_sum_freal8 = NULL;
+oshmem_op_t* oshmem_op_sum_freal16 = NULL;
+
+/* PROD */
+oshmem_op_t* oshmem_op_prod_short = NULL;
+oshmem_op_t* oshmem_op_prod_int = NULL;
+oshmem_op_t* oshmem_op_prod_long = NULL;
+oshmem_op_t* oshmem_op_prod_longlong = NULL;
+oshmem_op_t* oshmem_op_prod_float = NULL;
+oshmem_op_t* oshmem_op_prod_double = NULL;
+oshmem_op_t* oshmem_op_prod_longdouble = NULL;
+oshmem_op_t* oshmem_op_prod_complexf = NULL;
+oshmem_op_t* oshmem_op_prod_complexd = NULL;
+oshmem_op_t* oshmem_op_prod_fint4 = NULL;
+oshmem_op_t* oshmem_op_prod_fint8 = NULL;
+oshmem_op_t* oshmem_op_prod_freal4 = NULL;
+oshmem_op_t* oshmem_op_prod_freal8 = NULL;
+oshmem_op_t* oshmem_op_prod_freal16 = NULL;
+
+#define FUNC_OP_CREATE(name, type_name, type, calc) \
+    void oshmem_op_##name##_##type_name##_func(void *in, void *out, int count); \
+    void oshmem_op_##name##_##type_name##_func(void *in, void *out, int count) \
+    { \
+        int i; \
+        type *a = (type *) in; \
+        type *b = (type *) out; \
+        for (i = 0; i < count; ++i) { \
+            *(b) = calc(*(b), *(a)); \
+            ++b; \
+            ++a; \
+        } \
+    }
+
+#define OBJ_OP_CREATE(name, type_name, type, op_id, dt_id) \
+    oshmem_op_##name##_##type_name = OBJ_NEW(oshmem_op_t); \
+    if (oshmem_op_##name##_##type_name) \
+    { \
+        oshmem_op_##name##_##type_name->op = op_id; \
+        oshmem_op_##name##_##type_name->dt = dt_id; \
+        oshmem_op_##name##_##type_name->dt_size = sizeof(type); \
+        oshmem_op_##name##_##type_name->o_func.c_fn = oshmem_op_##name##_##type_name##_func; \
+    } \
+
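+/*
+ * For reference, FUNC_OP_CREATE(sum, int, int, __sum_op) expands to
+ * (modulo whitespace):
+ *
+ *     void oshmem_op_sum_int_func(void *in, void *out, int count);
+ *     void oshmem_op_sum_int_func(void *in, void *out, int count)
+ *     {
+ *         int i;
+ *         int *a = (int *) in;
+ *         int *b = (int *) out;
+ *         for (i = 0; i < count; ++i) {
+ *             *(b) = ((*(b)) + (*(a)));
+ *             ++b;
+ *             ++a;
+ *         }
+ *     }
+ *
+ * OBJ_OP_CREATE then allocates the matching oshmem_op_t and points its
+ * o_func.c_fn at that function.
+ */
+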
+/* Bitwise AND */
+#define __and_op(a, b) ((a) & (b))
+FUNC_OP_CREATE(and, short, short, __and_op);
+FUNC_OP_CREATE(and, int, int, __and_op);
+FUNC_OP_CREATE(and, long, long, __and_op);
+FUNC_OP_CREATE(and, longlong, long long, __and_op);
+FUNC_OP_CREATE(and, fint4, ompi_fortran_integer4_t, __and_op);
+FUNC_OP_CREATE(and, fint8, ompi_fortran_integer8_t, __and_op);
+
+/* Bitwise OR */
+#define __or_op(a, b) ((a) | (b))
+FUNC_OP_CREATE(or, short, short, __or_op);
+FUNC_OP_CREATE(or, int, int, __or_op);
+FUNC_OP_CREATE(or, long, long, __or_op);
+FUNC_OP_CREATE(or, longlong, long long, __or_op);
+FUNC_OP_CREATE(or, fint4, ompi_fortran_integer4_t, __or_op);
+FUNC_OP_CREATE(or, fint8, ompi_fortran_integer8_t, __or_op);
+
+/* Bitwise XOR */
+#define __xor_op(a, b) ((a) ^ (b))
+FUNC_OP_CREATE(xor, short, short, __xor_op);
+FUNC_OP_CREATE(xor, int, int, __xor_op);
+FUNC_OP_CREATE(xor, long, long, __xor_op);
+FUNC_OP_CREATE(xor, longlong, long long, __xor_op);
+FUNC_OP_CREATE(xor, fint4, ompi_fortran_integer4_t, __xor_op);
+FUNC_OP_CREATE(xor, fint8, ompi_fortran_integer8_t, __xor_op);
+
+/* MAX */
+#define __max_op(a, b) ((a) > (b) ? (a) : (b))
+FUNC_OP_CREATE(max, short, short, __max_op);
+FUNC_OP_CREATE(max, int, int, __max_op);
+FUNC_OP_CREATE(max, long, long, __max_op);
+FUNC_OP_CREATE(max, longlong, long long, __max_op);
+FUNC_OP_CREATE(max, float, float, __max_op);
+FUNC_OP_CREATE(max, double, double, __max_op);
+FUNC_OP_CREATE(max, longdouble, long double, __max_op);
+FUNC_OP_CREATE(max, fint4, ompi_fortran_integer4_t, __max_op);
+FUNC_OP_CREATE(max, fint8, ompi_fortran_integer8_t, __max_op);
+FUNC_OP_CREATE(max, freal4, ompi_fortran_real4_t, __max_op);
+FUNC_OP_CREATE(max, freal8, ompi_fortran_real8_t, __max_op);
+FUNC_OP_CREATE(max, freal16, ompi_fortran_real16_t, __max_op);
+
+/* MIN */
+#define __min_op(a, b) ((a) < (b) ? 
(a) : (b)) +FUNC_OP_CREATE(min, short, short, __min_op); +FUNC_OP_CREATE(min, int, int, __min_op); +FUNC_OP_CREATE(min, long, long, __min_op); +FUNC_OP_CREATE(min, longlong, long long, __min_op); +FUNC_OP_CREATE(min, float, float, __min_op); +FUNC_OP_CREATE(min, double, double, __min_op); +FUNC_OP_CREATE(min, longdouble, long double, __min_op); +FUNC_OP_CREATE(min, fint4, ompi_fortran_integer4_t, __min_op); +FUNC_OP_CREATE(min, fint8, ompi_fortran_integer8_t, __min_op); +FUNC_OP_CREATE(min, freal4, ompi_fortran_real4_t, __min_op); +FUNC_OP_CREATE(min, freal8, ompi_fortran_real8_t, __min_op); +FUNC_OP_CREATE(min, freal16, ompi_fortran_real16_t, __min_op); + +/* SUM */ +#define __sum_op(a, b) ((a) + (b)) +FUNC_OP_CREATE(sum, short, short, __sum_op); +FUNC_OP_CREATE(sum, int, int, __sum_op); +FUNC_OP_CREATE(sum, long, long, __sum_op); +FUNC_OP_CREATE(sum, longlong, long long, __sum_op); +FUNC_OP_CREATE(sum, float, float, __sum_op); +FUNC_OP_CREATE(sum, double, double, __sum_op); +FUNC_OP_CREATE(sum, longdouble, long double, __sum_op); +FUNC_OP_CREATE(sum, complexf, float complex, __sum_op); +FUNC_OP_CREATE(sum, complexd, double complex, __sum_op); +FUNC_OP_CREATE(sum, fint4, ompi_fortran_integer4_t, __sum_op); +FUNC_OP_CREATE(sum, fint8, ompi_fortran_integer8_t, __sum_op); +FUNC_OP_CREATE(sum, freal4, ompi_fortran_real4_t, __sum_op); +FUNC_OP_CREATE(sum, freal8, ompi_fortran_real8_t, __sum_op); +FUNC_OP_CREATE(sum, freal16, ompi_fortran_real16_t, __sum_op); + +/* PROD */ +#define __prod_op(a, b) ((a) * (b)) +FUNC_OP_CREATE(prod, short, short, __prod_op); +FUNC_OP_CREATE(prod, int, int, __prod_op); +FUNC_OP_CREATE(prod, long, long, __prod_op); +FUNC_OP_CREATE(prod, longlong, long long, __prod_op); +FUNC_OP_CREATE(prod, float, float, __prod_op); +FUNC_OP_CREATE(prod, double, double, __prod_op); +FUNC_OP_CREATE(prod, longdouble, long double, __prod_op); +FUNC_OP_CREATE(prod, complexf, float complex, __prod_op); +FUNC_OP_CREATE(prod, complexd, double complex, __prod_op); +FUNC_OP_CREATE(prod, fint4, ompi_fortran_integer4_t, __prod_op); +FUNC_OP_CREATE(prod, fint8, ompi_fortran_integer8_t, __prod_op); +FUNC_OP_CREATE(prod, freal4, ompi_fortran_real4_t, __prod_op); +FUNC_OP_CREATE(prod, freal8, ompi_fortran_real8_t, __prod_op); +FUNC_OP_CREATE(prod, freal16, ompi_fortran_real16_t, __prod_op); + +int oshmem_op_init(void) +{ + + /* Setup operation array */ + OBJ_CONSTRUCT(&oshmem_op_array, opal_pointer_array_t); + if (OPAL_SUCCESS + != opal_pointer_array_init(&oshmem_op_array, + 0, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + 1)) { + return OSHMEM_ERROR; + } + + /* Bitwise AND */ + OBJ_OP_CREATE(and, short, short, OSHMEM_OP_AND, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(and, int, int, OSHMEM_OP_AND, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(and, long, long, OSHMEM_OP_AND, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(and, longlong, long long, OSHMEM_OP_AND, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(and, fint4, ompi_fortran_integer4_t, OSHMEM_OP_AND, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(and, fint8, ompi_fortran_integer8_t, OSHMEM_OP_AND, OSHMEM_OP_TYPE_FINT8); + + /* Bitwise OR */ + OBJ_OP_CREATE(or, short, short, OSHMEM_OP_OR, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(or, int, int, OSHMEM_OP_OR, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(or, long, long, OSHMEM_OP_OR, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(or, longlong, long long, OSHMEM_OP_OR, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(or, fint4, ompi_fortran_integer4_t, OSHMEM_OP_OR, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(or, fint8, ompi_fortran_integer8_t, OSHMEM_OP_OR, 
OSHMEM_OP_TYPE_FINT8); + + /* Bitwise XOR */ + OBJ_OP_CREATE(xor, short, short, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(xor, int, int, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(xor, long, long, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(xor, longlong, long long, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(xor, fint4, ompi_fortran_integer4_t, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(xor, fint8, ompi_fortran_integer8_t, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_FINT8); + + /* MAX */ + OBJ_OP_CREATE(max, short, short, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(max, int, int, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(max, long, long, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(max, longlong, long long, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(max, float, float, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FLOAT); + OBJ_OP_CREATE(max, double, double, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_DOUBLE); + OBJ_OP_CREATE(max, longdouble, long double, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_LDOUBLE); + OBJ_OP_CREATE(max, fint4, ompi_fortran_integer4_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(max, fint8, ompi_fortran_integer8_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FINT8); + OBJ_OP_CREATE(max, freal4, ompi_fortran_real4_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FREAL4); + OBJ_OP_CREATE(max, freal8, ompi_fortran_real8_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FREAL8); + OBJ_OP_CREATE(max, freal16, ompi_fortran_real16_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FREAL16); + + /* MIN */ + OBJ_OP_CREATE(min, short, short, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(min, int, int, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(min, long, long, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(min, longlong, long long, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(min, float, float, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FLOAT); + OBJ_OP_CREATE(min, double, double, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_DOUBLE); + OBJ_OP_CREATE(min, longdouble, long double, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_LDOUBLE); + OBJ_OP_CREATE(min, fint4, ompi_fortran_integer4_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(min, fint8, ompi_fortran_integer8_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FINT8); + OBJ_OP_CREATE(min, freal4, ompi_fortran_real4_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FREAL4); + OBJ_OP_CREATE(min, freal8, ompi_fortran_real8_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FREAL8); + OBJ_OP_CREATE(min, freal16, ompi_fortran_real16_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FREAL16); + + /* SUM */ + OBJ_OP_CREATE(sum, short, short, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(sum, int, int, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(sum, long, long, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(sum, longlong, long long, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(sum, float, float, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FLOAT); + OBJ_OP_CREATE(sum, double, double, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_DOUBLE); + OBJ_OP_CREATE(sum, longdouble, long double, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_LDOUBLE); + OBJ_OP_CREATE(sum, complexf, float complex, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FCOMPLEX); + OBJ_OP_CREATE(sum, complexd, double complex, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_DCOMPLEX); + OBJ_OP_CREATE(sum, fint4, ompi_fortran_integer4_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(sum, fint8, ompi_fortran_integer8_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FINT8); + OBJ_OP_CREATE(sum, freal4, ompi_fortran_real4_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FREAL4); + OBJ_OP_CREATE(sum, freal8, ompi_fortran_real8_t, 
OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FREAL8); + OBJ_OP_CREATE(sum, freal16, ompi_fortran_real16_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FREAL16); + + /* PROD */ + OBJ_OP_CREATE(prod, short, short, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(prod, int, int, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(prod, long, long, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(prod, longlong, long long, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(prod, float, float, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FLOAT); + OBJ_OP_CREATE(prod, double, double, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_DOUBLE); + OBJ_OP_CREATE(prod, longdouble, long double, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_LDOUBLE); + OBJ_OP_CREATE(prod, complexf, float complex, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FCOMPLEX); + OBJ_OP_CREATE(prod, complexd, double complex, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_DCOMPLEX); + OBJ_OP_CREATE(prod, fint4, ompi_fortran_integer4_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(prod, fint8, ompi_fortran_integer8_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FINT8); + OBJ_OP_CREATE(prod, freal4, ompi_fortran_real4_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FREAL4); + OBJ_OP_CREATE(prod, freal8, ompi_fortran_real8_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FREAL8); + OBJ_OP_CREATE(prod, freal16, ompi_fortran_real16_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FREAL16); + + return OSHMEM_SUCCESS; +} + +int oshmem_op_finalize(void) +{ + int max, i; + oshmem_op_t *op; + + /* Check whether we have some left */ + max = opal_pointer_array_get_size(&oshmem_op_array); + for (i = 0; i < max; i++) { + op = (oshmem_op_t *) opal_pointer_array_get_item(&oshmem_op_array, i); + if (NULL != op) { + OBJ_RELEASE(op); + } + } + + OBJ_DESTRUCT(&oshmem_op_array); + + return OSHMEM_SUCCESS; +} + +/************************************************************************** + * + * Static functions + * + **************************************************************************/ + +/* + * Op constructor + */ +static void oshmem_op_construct(oshmem_op_t *object) +{ + object->id = opal_pointer_array_add(&oshmem_op_array, object); +} + +/* + * Op destructor + */ +static void oshmem_op_destruct(oshmem_op_t *object) +{ + if (NULL != opal_pointer_array_get_item(&oshmem_op_array, object->id)) { + opal_pointer_array_set_item(&oshmem_op_array, object->id, NULL ); + } +} diff --git a/oshmem/op/op.h b/oshmem/op/op.h new file mode 100644 index 0000000000..d102061f52 --- /dev/null +++ b/oshmem/op/op.h @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef OSHMEM_OP_H +#define OSHMEM_OP_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "oshmem/mca/scoll/scoll.h" + +#include "opal/class/opal_list.h" +#include "opal/dss/dss_types.h" + +#include "orte/types.h" + +BEGIN_C_DECLS + +/* ******************************************************************** */ + +/** + * Corresponding to the types that we can reduce over. 
+ */ +enum { + OSHMEM_OP_TYPE_SHORT, /** C integer: short */ + OSHMEM_OP_TYPE_INT, /** C integer: int */ + OSHMEM_OP_TYPE_LONG, /** C integer: long */ + OSHMEM_OP_TYPE_LLONG, /** C integer: long long */ + + OSHMEM_OP_TYPE_FLOAT, /** Floating point: float */ + OSHMEM_OP_TYPE_DOUBLE, /** Floating point: double */ + OSHMEM_OP_TYPE_LDOUBLE, /** Floating point: long double */ + + OSHMEM_OP_TYPE_FCOMPLEX, /** Complex: float */ + OSHMEM_OP_TYPE_DCOMPLEX, /** Complex: double */ + + OSHMEM_OP_TYPE_FINT4, /** Fortran integer: int4 */ + OSHMEM_OP_TYPE_FINT8, /** Fortran integer: int8 */ + OSHMEM_OP_TYPE_FREAL4, /** Fortran real: real4 */ + OSHMEM_OP_TYPE_FREAL8, /** Fortran real: real8 */ + OSHMEM_OP_TYPE_FREAL16, /** Fortran real: real16 */ + + /** Sentinel: the number of types */ + OSHMEM_OP_TYPE_NUMBER +}; + +/** + * Supported reduce operations. + */ +enum { + OSHMEM_OP_AND, /** AND */ + OSHMEM_OP_OR, /** OR */ + OSHMEM_OP_XOR, /** XOR */ + OSHMEM_OP_MAX, /** MAX */ + OSHMEM_OP_MIN, /** MIN */ + OSHMEM_OP_SUM, /** SUM */ + OSHMEM_OP_PROD, /** PROD */ + + /** Sentinel: the number of operations */ + OSHMEM_OP_NUMBER +}; + +typedef void (oshmem_op_c_handler_fn_t)(void *, void *, int); + +/** + * Back-end type of OSHMEM reduction operations + */ +struct oshmem_op_t { + opal_object_t base; + int id; /**< index in global array */ + int op; /**< operation type */ + int dt; /**< datatype */ + size_t dt_size; /**< datatype size */ + union { + /** C handler function pointer */ + oshmem_op_c_handler_fn_t *c_fn; + } o_func; +}; +typedef struct oshmem_op_t oshmem_op_t; +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(oshmem_op_t);
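+ +/* + * Editorial example (a hedged sketch, not part of the original patch): a + * handler matching oshmem_op_c_handler_fn_t combines the elements of one + * buffer into another, here an element-wise integer sum. The argument order + * (source first, destination second) and the names sum_int_sketch, in, out + * and count are illustrative assumptions. + * + * static void sum_int_sketch(void *in, void *out, int count) + * { + * int i; + * int *src = (int *) in; + * int *dst = (int *) out; + * + * for (i = 0; i < count; i++) { + * dst[i] += src[i]; + * } + * } + * + * A collective backend would invoke it through op->o_func.c_fn(in, out, count), + * with op->dt_size giving the per-element size. + */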
+ +/* Bitwise AND */ +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_short; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_int; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_long; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_longlong; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_fint4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_fint8; + +/* Bitwise OR */ +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_short; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_int; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_long; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_longlong; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_fint4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_fint8; + +/* Bitwise XOR */ +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_short; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_int; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_long; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_longlong; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_fint4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_fint8; + +/* MAX */ +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_short; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_int; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_long; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_longlong; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_float; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_double; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_longdouble; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_fint4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_fint8; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_freal4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_freal8; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_freal16; + +/* MIN */ +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_short; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_int; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_long; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_longlong; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_float; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_double; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_longdouble; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_fint4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_fint8; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_freal4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_freal8; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_freal16; + +/* SUM */ +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_short; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_int; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_long; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_longlong; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_float; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_double; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_longdouble; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_complexf; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_complexd; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_fint4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_fint8; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_freal4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_freal8; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_freal16; + +/* PROD */ +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_short; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_int; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_long; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_longlong; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_float; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_double; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_longdouble; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_complexf; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_complexd; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_fint4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_fint8; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_freal4; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_freal8; +OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_freal16; + +/** + * Initialize the op interface. + * + * @returns OSHMEM_SUCCESS Upon success + * @returns OSHMEM_ERROR Otherwise + * + * Invoked from oshmem_shmem_init(); sets up the op interface, creates + * the predefined operations. + */ +int oshmem_op_init(void); + +/** + * Finalize the op interface. + * + * @returns OSHMEM_SUCCESS Always + * + * Invoked from oshmem_shmem_finalize(); tears down the op interface. + */ +int oshmem_op_finalize(void); + +END_C_DECLS + +#endif /* OSHMEM_OP_H */ diff --git a/oshmem/proc/Makefile.am b/oshmem/proc/Makefile.am new file mode 100644 index 0000000000..a8c3e681a4 --- /dev/null +++ b/oshmem/proc/Makefile.am @@ -0,0 +1,21 @@ +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# +# This makefile.am does not stand on its own - it is included from oshmem/Makefile.am + + +headers += \ + proc/proc.h \ + proc/proc_group_cache.h + +libshmem_la_SOURCES += \ + proc/proc.c \ + proc/proc_group_cache.c + diff --git a/oshmem/proc/proc.c b/oshmem/proc/proc.c new file mode 100644 index 0000000000..df4e276a54 --- /dev/null +++ b/oshmem/proc/proc.c @@ -0,0 +1,785 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include "oshmem/proc/proc.h" +#include "oshmem/constants.h" +#include "oshmem/runtime/runtime.h" +#include "oshmem/mca/scoll/base/base.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/util/proc_info.h" +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/runtime/orte_globals.h" + +#include "opal/datatype/opal_convertor.h" +#include "opal/threads/mutex.h" +#include "opal/dss/dss.h" +#include "opal/util/arch.h" +#include "opal/class/opal_list.h" + +#include "ompi/proc/proc.h" + +opal_convertor_t* oshmem_shmem_local_convertor = NULL; + +opal_list_t oshmem_proc_list; +static opal_mutex_t oshmem_proc_lock; +oshmem_proc_t* oshmem_proc_local_proc = NULL; + +static void oshmem_proc_construct(oshmem_proc_t* proc); +static void oshmem_proc_destruct(oshmem_proc_t* proc); + +OBJ_CLASS_INSTANCE( oshmem_proc_t, + opal_list_item_t, + oshmem_proc_construct, + oshmem_proc_destruct); + +void oshmem_proc_construct(oshmem_proc_t* proc) +{ + bzero(proc->proc_endpoints, sizeof(proc->proc_endpoints)); + + /* By default all processes are assumed to have the same architecture as the + * local one, i.e. we start out assuming a homogeneous environment. Later, + * once the RTE can tell us the architecture of the remote nodes, we will + * have to set the convertors to the correct architecture. + */ + proc->proc_arch = opal_local_arch; + proc->proc_convertor = oshmem_shmem_local_convertor; + OBJ_RETAIN( oshmem_shmem_local_convertor); + + proc->proc_flags = 0; + proc->num_transports = 0; + + /* initialize this pointer to NULL */ + proc->proc_hostname = NULL; +} + +void oshmem_proc_destruct(oshmem_proc_t* proc) +{ + /* Since all the convertors are created with OBJ_NEW we can just call OBJ_RELEASE. + * All of them, except the local convertor, will get destroyed at some point here. + * If the reference counts are correct, the local convertor (whose reference count + * was increased by the datatype engine) will not get destroyed here; it will be + * destroyed later when ompi_datatype_finalize is called. + */ + OBJ_RELEASE( proc->proc_convertor); + + /* DO NOT FREE THE HOSTNAME FIELD AS THIS POINTS + * TO AN AREA ALLOCATED/FREE'D ELSEWHERE + */ + OPAL_THREAD_LOCK(&oshmem_proc_lock); + opal_list_remove_item(&oshmem_proc_list, (opal_list_item_t*) proc); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); +} + +int oshmem_proc_init(void) +{ + orte_vpid_t i; + + OBJ_CONSTRUCT(&oshmem_proc_list, opal_list_t); + OBJ_CONSTRUCT(&oshmem_proc_lock, opal_mutex_t); + oshmem_shmem_local_convertor = opal_convertor_create(opal_local_arch, 0); + + size_t ompi_num_procs; + ompi_proc_t **ompi_procs = ompi_proc_world(&ompi_num_procs); + /* create proc structures and find self */ + for (i = 0; i < orte_process_info.num_procs; i++) { + oshmem_proc_t *proc = OBJ_NEW(oshmem_proc_t); + opal_list_append(&oshmem_proc_list, (opal_list_item_t*)proc); + + proc->proc_name.jobid = ompi_procs[i]->proc_name.jobid; + proc->proc_name.vpid = ompi_procs[i]->proc_name.vpid; + proc->proc_arch = ompi_procs[i]->proc_arch; + proc->proc_flags = ompi_procs[i]->proc_flags; + proc->proc_hostname = ompi_procs[i]->proc_hostname; + + if (i == ORTE_PROC_MY_NAME->vpid) { + oshmem_proc_local_proc = proc; + } + } + + if (ompi_procs) + free(ompi_procs); + + return OSHMEM_SUCCESS; +} + +/* in some cases, all PE procs are required to do a modex so they + * can (at the least) exchange their architecture. Since we cannot + * know in advance if this was required, we provide a separate function + * to set the arch (instead of doing it inside of oshmem_proc_init) that + * can be called after the modex completes in oshmem_shmem_init. Thus, we + * know that - regardless of how the arch is known, whether via modex + * or dropped in from a local daemon - the arch can be set correctly + * at this time. + */ +int oshmem_proc_set_arch(void) +{ + oshmem_proc_t *proc = NULL; + opal_list_item_t *item = NULL; + int ret = OSHMEM_SUCCESS; + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + + for (item = opal_list_get_first(&oshmem_proc_list); + item != opal_list_get_end(&oshmem_proc_list); + item = opal_list_get_next(item)) { + proc = (oshmem_proc_t*) item; + + if (proc->proc_name.vpid != ORTE_PROC_MY_NAME->vpid) { + /* if arch is different than mine, create a new convertor for this proc */ + if (proc->proc_arch != opal_local_arch) { +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + OBJ_RELEASE(proc->proc_convertor); + proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0); +#else + orte_show_help("help-shmem-runtime.txt", + "heterogeneous-support-unavailable", + true, + orte_process_info.nodename, + proc->proc_hostname == NULL ? + "" : + proc->proc_hostname); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return OSHMEM_ERR_NOT_SUPPORTED; +#endif + } + } + } + + /* Set predefined groups */ + ret = oshmem_proc_group_init(); + + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + return ret; +} + +int oshmem_proc_finalize(void) +{ + opal_list_item_t *item; + + /* Destroy all groups */ + oshmem_proc_group_finalize(); + + /* remove all items from list and destroy them. Since we cannot know + * the reference count of the procs for certain, it is possible that + * a single OBJ_RELEASE won't drive the count to zero, and hence will + * not release the memory. Accordingly, we cycle through the list here, + * calling release on each item. + * + * This will cycle until it forces the reference count of each item + * to zero, thus causing the destructor to run - which will remove + * the item from the list! + * + * We cannot do this under the thread lock as the destructor will + * call it when removing the item from the list. However, this function + * is ONLY called from oshmem_shmem_finalize, and all threads are prohibited + * from calling a SHMEM function once ANY thread has called shmem_finalize. Of + * course, multiple threads are allowed to call shmem_finalize, so this + * function may get called multiple times by various threads. We believe + * it is thread safe to do so...though it may not -appear- to be so + * without walking through the entire list/destructor sequence.
+ */ + while (opal_list_get_end(&oshmem_proc_list) + != (item = opal_list_get_first(&oshmem_proc_list))) { + OBJ_RELEASE(item); + } + OBJ_RELEASE( oshmem_shmem_local_convertor); + /* now destruct the list and thread lock */ + OBJ_DESTRUCT(&oshmem_proc_list); + OBJ_DESTRUCT(&oshmem_proc_lock); + + return OSHMEM_SUCCESS; +} + +oshmem_proc_t** oshmem_proc_world(size_t *size) +{ + oshmem_proc_t **procs; + oshmem_proc_t *proc; + size_t count = 0; + orte_ns_cmp_bitmask_t mask; + orte_process_name_t my_name; + + /* check bozo case */ + if (NULL == oshmem_proc_local_proc) { + return NULL ; + } + mask = ORTE_NS_CMP_JOBID; + my_name = oshmem_proc_local_proc->proc_name; + + /* First count how many match this jobid */ + OPAL_THREAD_LOCK(&oshmem_proc_lock); + for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*) opal_list_get_next(proc)) { + if (OPAL_EQUAL + == orte_util_compare_name_fields(mask, + &proc->proc_name, + &my_name)) { + ++count; + } + } + + /* allocate an array */ + procs = (oshmem_proc_t**) malloc(count * sizeof(oshmem_proc_t*)); + if (NULL == procs) { + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return NULL ; + } + + /* now save only the procs that match this jobid */ + count = 0; + for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*) opal_list_get_next(proc)) { + if (OPAL_EQUAL + == orte_util_compare_name_fields(mask, + &proc->proc_name, + &my_name)) { + /* DO NOT RETAIN THIS OBJECT - the reference count on this + * object will be adjusted by external callers. The intent + * here is to allow the reference count to drop to zero if + * the app no longer desires to communicate with this proc. + * For example, the proc may call comm_disconnect on all + * communicators involving this proc. In such cases, we want + * the proc object to be removed from the list. By not incrementing + * the reference count here, we allow this to occur. + * + * We don't implement that yet, but we are still safe for now as + * the OBJ_NEW in oshmem_proc_init owns the initial reference + * count which cannot be released until oshmem_proc_finalize is + * called. + */ + procs[count++] = proc; + } + } OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + *size = count; + return procs; +} + +oshmem_proc_t** oshmem_proc_all(size_t* size) +{ + oshmem_proc_t **procs = + (oshmem_proc_t**) malloc(opal_list_get_size(&oshmem_proc_list) + * sizeof(oshmem_proc_t*)); + oshmem_proc_t *proc; + size_t count = 0; + + if (NULL == procs) { + return NULL ; + } + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); + ((proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list)) + && (proc != NULL )); + proc = (oshmem_proc_t*)opal_list_get_next(proc)) { + /* We know this isn't consistent with the behavior in oshmem_proc_world, + * but we are leaving the RETAIN for now because the code using this function + * assumes that the results need to be released when done. 
It will + * be cleaned up later as the "fix" will impact other places in + * the code + */ + OBJ_RETAIN(proc); + procs[count++] = proc; + } + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + *size = count; + + return procs; +} + +oshmem_proc_t** oshmem_proc_self(size_t* size) +{ + oshmem_proc_t **procs = (oshmem_proc_t**) malloc(sizeof(oshmem_proc_t*)); + if (NULL == procs) { + return NULL ; + } + /* We know this isn't consistent with the behavior in oshmem_proc_world, + * but we are leaving the RETAIN for now because the code using this function + * assumes that the results need to be released when done. It will + * be cleaned up later as the "fix" will impact other places in + * the code + */ + OBJ_RETAIN(oshmem_proc_local_proc); + + *procs = oshmem_proc_local_proc; + *size = 1; + return procs; +} + +oshmem_proc_t * oshmem_proc_find(const orte_process_name_t * name) +{ + oshmem_proc_t *proc, *rproc = NULL; + orte_ns_cmp_bitmask_t mask; + + /* return the proc-struct which matches this jobid+process id */ + mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; + OPAL_THREAD_LOCK(&oshmem_proc_lock); + for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*) opal_list_get_next(proc)) { + if (OPAL_EQUAL + == orte_util_compare_name_fields(mask, + &proc->proc_name, + name)) { + rproc = proc; + break; + } + } OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + return rproc; +} + +int oshmem_proc_refresh(void) +{ + oshmem_proc_t *proc = NULL; + opal_list_item_t *item = NULL; + orte_vpid_t i = 0; + int hostname_length = 0; + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + + for (item = opal_list_get_first(&oshmem_proc_list), i = 0; + item != opal_list_get_end(&oshmem_proc_list); + item = opal_list_get_next(item), ++i) { + proc = (oshmem_proc_t*) item; + + /* Does not change: proc->proc_name.vpid */ + proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; + + /* Make sure to clear the local flag before we set it below */ + proc->proc_flags = 0; + + proc->proc_arch = opal_local_arch; + oshmem_shmem_exchange_bcast(&proc->proc_arch, + sizeof(uint32_t), + i); + + /* count the terminating NUL so receivers get a valid C string */ + hostname_length = (int) strlen(orte_process_info.nodename) + 1; + oshmem_shmem_exchange_bcast(&hostname_length, + sizeof(int), + i); + + if (proc->proc_hostname) + free(proc->proc_hostname); + + proc->proc_hostname = ( + i == ORTE_PROC_MY_NAME->vpid ? + strdup(orte_process_info.nodename) : + (char *) malloc(hostname_length)); + oshmem_shmem_exchange_bcast(proc->proc_hostname, + hostname_length, + i); + + if (i == ORTE_PROC_MY_NAME->vpid) { + oshmem_proc_local_proc = proc; + } else { + /* if arch is different than mine, create a new convertor for this proc */ + if (proc->proc_arch != opal_local_arch) { +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + OBJ_RELEASE(proc->proc_convertor); + proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0); +#else + orte_show_help("help-shmem-runtime.txt", + "heterogeneous-support-unavailable", + true, + orte_process_info.nodename, + proc->proc_hostname == NULL ? + "" : + proc->proc_hostname); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return OSHMEM_ERR_NOT_SUPPORTED; +#endif + } + } + } + + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + return OSHMEM_SUCCESS; +} + +int oshmem_proc_pack(oshmem_proc_t **proclist, + int proclistsize, + opal_buffer_t* buf) +{ + int i, rc; + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + + /* cycle through the provided array, packing the OSHMEM level + * data for each proc.
This data may or may not be included + * in any subsequent modex operation, so we include it here + * to ensure completion of a connect/accept handshake. See + * the ompi/mca/dpm framework for an example of where and how + * this info is used. + * + * Eventually, we will review the procedures that call this + * function to see if duplication of communication can be + * reduced. For now, just go ahead and pack the info so it + * can be sent. + */ + for (i = 0; i < proclistsize; i++) { + rc = opal_dss.pack(buf, &(proclist[i]->proc_name), 1, ORTE_NAME); + if (rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return rc; + } + rc = opal_dss.pack(buf, &(proclist[i]->proc_arch), 1, OPAL_UINT32); + if (rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return rc; + } + rc = opal_dss.pack(buf, &(proclist[i]->proc_hostname), 1, OPAL_STRING); + if (rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return rc; + } + } OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return OSHMEM_SUCCESS; +} + +static oshmem_proc_t * +oshmem_proc_find_and_add(const orte_process_name_t * name, bool* isnew) +{ + oshmem_proc_t *proc, *rproc = NULL; + orte_ns_cmp_bitmask_t mask; + + /* return the proc-struct which matches this jobid+process id */ + mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; + OPAL_THREAD_LOCK(&oshmem_proc_lock); + for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*) opal_list_get_next(proc)) { + if (OPAL_EQUAL + == orte_util_compare_name_fields(mask, + &proc->proc_name, + name)) { + rproc = proc; + *isnew = false; + break; + } + } + + /* if we didn't find this proc in the list, create a new + * proc_t and append it to the list + */ + if (NULL == rproc) { + *isnew = true; + rproc = OBJ_NEW(oshmem_proc_t); + if (NULL != rproc) { + opal_list_append(&oshmem_proc_list, (opal_list_item_t*)rproc); + rproc->proc_name = *name; + } + /* caller had better fill in the rest of the proc, or there's + going to be pain later... 
*/ + } + + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + return rproc; +} + +int oshmem_proc_unpack(opal_buffer_t* buf, + int proclistsize, + oshmem_proc_t ***proclist, + int *newproclistsize, + oshmem_proc_t ***newproclist) +{ + int i; + size_t newprocs_len = 0; + oshmem_proc_t **plist = NULL, **newprocs = NULL; + + /* do not free plist *ever*, since it is used in the remote group + structure of a communicator */ + plist = (oshmem_proc_t **) calloc(proclistsize, sizeof(oshmem_proc_t *)); + if (NULL == plist) { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + /* free this on the way out */ + newprocs = (oshmem_proc_t **) calloc(proclistsize, sizeof(oshmem_proc_t *)); + if (NULL == newprocs) { + free(plist); + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + /* cycle through the array of provided procs and unpack + * their info - as packed by oshmem_proc_pack + */ + for (i = 0; i < proclistsize; i++) { + orte_std_cntr_t count = 1; + orte_process_name_t new_name; + uint32_t new_arch; + char *new_hostname; + bool isnew = false; + int rc; + + rc = opal_dss.unpack(buf, &new_name, &count, ORTE_NAME); + if (rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + free(plist); + free(newprocs); + return rc; + } + rc = opal_dss.unpack(buf, &new_arch, &count, OPAL_UINT32); + if (rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + free(plist); + free(newprocs); + return rc; + } + rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING); + if (rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + free(plist); + free(newprocs); + return rc; + } + + /* see if this proc is already on our oshmem_proc_list */ + plist[i] = oshmem_proc_find_and_add(&new_name, &isnew); + if (isnew) { + /* if not, then it was added, so update the values + * in the proc_t struct with the info that was passed + * to us + */ + newprocs[newprocs_len++] = plist[i]; + + /* update all the values */ + plist[i]->proc_arch = new_arch; + /* if arch is different than mine, create a new convertor for this proc */ + if (plist[i]->proc_arch != opal_local_arch) { +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + OBJ_RELEASE(plist[i]->proc_convertor); + plist[i]->proc_convertor = opal_convertor_create(plist[i]->proc_arch, 0); +#else + orte_show_help("help-shmem-runtime.txt", + "heterogeneous-support-unavailable", + true, + orte_process_info.nodename, + new_hostname == NULL ? "" : + new_hostname); + free(plist); + free(newprocs); + return OSHMEM_ERR_NOT_SUPPORTED; +#endif + } + if (0 + == strcmp(oshmem_proc_local_proc->proc_hostname, + new_hostname)) { + plist[i]->proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU + | OPAL_PROC_ON_CLUSTER); + } + + /* Save the hostname */ + plist[i]->proc_hostname = new_hostname; + + /* eventually, we will update the orte/mca/ess framework's data + * to contain the info for the new proc. 
For now, we ignore + * this step since the MPI layer already has all the info + * it requires + */ + } + } + + if (NULL != newproclistsize) + *newproclistsize = newprocs_len; + if (NULL != newproclist) { + *newproclist = newprocs; + } else if (newprocs != NULL ) { + free(newprocs); + } + + *proclist = plist; + return OSHMEM_SUCCESS; +} + +opal_pointer_array_t oshmem_group_array; + +oshmem_group_t* oshmem_group_all = NULL; +oshmem_group_t* oshmem_group_self = NULL; +oshmem_group_t* oshmem_group_null = NULL; + +OBJ_CLASS_INSTANCE(oshmem_group_t, opal_object_t, NULL, NULL); + +OSHMEM_DECLSPEC int oshmem_proc_group_init(void) +{ + + /* Setup communicator array */ + OBJ_CONSTRUCT(&oshmem_group_array, opal_pointer_array_t); + if (OPAL_SUCCESS + != opal_pointer_array_init(&oshmem_group_array, + 0, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + 1)) { + return OSHMEM_ERROR; + } + + /* Setup SHMEM_GROUP_ALL */ + if (NULL + == (oshmem_group_all = + oshmem_proc_group_create(0, + 1, + opal_list_get_size(&oshmem_proc_list)))) { + oshmem_proc_group_destroy(oshmem_group_all); + return OSHMEM_ERROR; + } + + /* Setup SHMEM_GROUP_SELF */ + if (NULL + == (oshmem_group_self = oshmem_proc_group_create(oshmem_proc_local() + ->proc_name + .vpid, + 0, + 1))) { + oshmem_proc_group_destroy(oshmem_group_self); + return OSHMEM_ERROR; + } + + /* Setup SHMEM_GROUP_NULL */ + oshmem_group_null = NULL; + + return OSHMEM_SUCCESS; +} + +OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void) +{ + int max, i; + oshmem_group_t *group; + + /* Check whether we have some left */ + max = opal_pointer_array_get_size(&oshmem_group_array); + for (i = 0; i < max; i++) { + group = + (oshmem_group_t *) opal_pointer_array_get_item(&oshmem_group_array, + i); + if (NULL != group) { + /* Group has not been freed before finalize */ + oshmem_proc_group_destroy(group); + } + } + + OBJ_DESTRUCT(&oshmem_group_array); + + return OSHMEM_SUCCESS; +} + +OSHMEM_DECLSPEC oshmem_group_t* oshmem_proc_group_create(int pe_start, + int pe_stride, + size_t pe_size) +{ + oshmem_group_t* group = NULL; + + group = OBJ_NEW(oshmem_group_t); + + if (group) { + int cur_pe = 0; + int count_pe = 0; + oshmem_proc_t** proc_array = NULL; + oshmem_proc_t* proc = NULL; + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + + /* allocate an array */ + proc_array = (oshmem_proc_t**) malloc(pe_size * sizeof(oshmem_proc_t*)); + if (NULL == proc_array) { + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return NULL ; + } + + group->my_pe = oshmem_proc_local()->proc_name.vpid; + group->is_member = 0; + /* now save only the procs that match this jobid */ + for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*) opal_list_get_next(proc)) { + if (count_pe >= (int) pe_size) { + break; + } else if ((cur_pe >= pe_start) + && ((pe_stride == 0) + || (((cur_pe - pe_start) % pe_stride) == 0))) { + proc_array[count_pe++] = proc; + if (oshmem_proc_pe(proc) == group->my_pe) + group->is_member = 1; + } + cur_pe++; + } + group->proc_array = proc_array; + group->proc_count = (int) count_pe; + + /* Prepare peers list */ + OBJ_CONSTRUCT(&(group->peer_list), opal_list_t); + { + int i = 0; + orte_namelist_t *peer = NULL; + + for (i = 0; i < group->proc_count; i++) { + peer = OBJ_NEW(orte_namelist_t); + peer->name.jobid = group->proc_array[i]->proc_name.jobid; + peer->name.vpid = group->proc_array[i]->proc_name.vpid; + opal_list_append(&(group->peer_list), &peer->super); + } + } + group->id = 
opal_pointer_array_add(&oshmem_group_array, group); + + memset(&group->g_scoll, 0, sizeof(mca_scoll_base_group_scoll_t)); + + if (OSHMEM_SUCCESS != mca_scoll_base_select(group)) { + opal_output(0, + "Error: No collective modules are available: group is not created, returning NULL"); + oshmem_proc_group_destroy(group); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return NULL ; + } OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + } + + return group; +} + +OSHMEM_DECLSPEC void oshmem_proc_group_destroy(oshmem_group_t* group) +{ + if (group) { + mca_scoll_base_group_unselect(group); + + /* Destroy proc array */ + if (group->proc_array) { + free(group->proc_array); + } + + /* Destroy peer list */ + { + opal_list_item_t *item; + + while (NULL != (item = opal_list_remove_first(&(group->peer_list)))) { + /* destruct the item (we constructed it), then free the memory chunk */ + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&(group->peer_list)); + } + + /* reset the oshmem_group_array entry - make sure that the + * entry is in the table */ + if (NULL + != opal_pointer_array_get_item(&oshmem_group_array, + group->id)) { + opal_pointer_array_set_item(&oshmem_group_array, group->id, NULL ); + } + + OBJ_RELEASE(group); + } +} diff --git a/oshmem/proc/proc.h b/oshmem/proc/proc.h new file mode 100644 index 0000000000..868154b917 --- /dev/null +++ b/oshmem/proc/proc.h @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef OSHMEM_PROC_PROC_H +#define OSHMEM_PROC_PROC_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "oshmem/mca/scoll/scoll.h" + +#include "opal/class/opal_list.h" +#include "opal/dss/dss_types.h" +#include "opal/mca/hwloc/hwloc.h" + +#include "orte/types.h" +#include "orte/runtime/orte_globals.h" +#include "ompi/mca/bml/bml.h" + +BEGIN_C_DECLS + +/* ******************************************************************** */ + +struct oshmem_group_t; + +#define OSHMEM_PE_INVALID (-1) + +/** + * Remote Open SHMEM process structure + * + * Remote Open SHMEM process structure. Each process contains exactly + * one oshmem_proc_t structure for each remote process it knows about. + */ +struct oshmem_proc_t { + /** allow proc to be placed on a list */ + opal_list_item_t super; + /** this process' name */ + orte_process_name_t proc_name; + /* endpoint data */ + void *proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MAX]; + /** architecture of this process */ + uint32_t proc_arch; + /** flags for this proc */ + opal_hwloc_locality_t proc_flags; + /** Base convertor for the proc described by this process */ + struct opal_convertor_t *proc_convertor; + /** A pointer to the name of this host - data is + * actually stored in the RTE + */ + char *proc_hostname; + + /* + * All transport channels are globally ordered. + * pe(s) can talk to each other via subset of transports + * these holds indexes of each transport into global array + * proc -> id, where id can be btl id in yoda or mxm ptl id + * in ikrit + * spml is supposed to fill this during add_procs() + **/ + int num_transports; + char *transport_ids; +}; + +typedef struct oshmem_proc_t oshmem_proc_t; +OBJ_CLASS_DECLARATION(oshmem_proc_t); + +/** + * Group of Open SHMEM processes structure + * + * Set of processes used in collective operations. 
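+ * + * Editorial usage sketch (hedged, not part of the original patch; npes and + * grp are illustrative names): with the stride semantics implemented by + * oshmem_proc_group_create() below, a group over the even PEs could be + * built and released as follows: + * + * oshmem_group_t *grp = oshmem_proc_group_create(0, 2, npes / 2); + * if (NULL != grp) { + * ... run collectives over grp ... + * oshmem_proc_group_destroy(grp); + * }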
+ */ +struct oshmem_group_t { + opal_object_t base; + int id; /**< index in global array */ + int my_pe; + int proc_count; /**< number of processes in group */ + int is_member; /* true if my_pe is part of the group and participates in collectives */ + struct oshmem_proc_t **proc_array; /**< list of pointers to oshmem_proc_t structures + for each process in the group */ + opal_list_t peer_list; + + /* Collectives module interface and data */ + mca_scoll_base_group_scoll_t g_scoll; +}; +typedef struct oshmem_group_t oshmem_group_t; +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(oshmem_group_t); + +OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_all; +OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_self; +OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_null; + +/** + * @private + * + * Pointer to the oshmem_proc_t structure for the local process + * + * Pointer to the oshmem_proc_t structure for the local process. + * + * @note This pointer is declared here to allow inline functions + * within this header file to access the local process quickly. + * Please use oshmem_proc_local() instead. + */ +OSHMEM_DECLSPEC extern oshmem_proc_t* oshmem_proc_local_proc; + +/* ******************************************************************** */ + +/** + * Initialize the OSHMEM process subsystem + * + * Initialize the Open SHMEM process subsystem. This function will + * query the run-time environment and build a list of the proc + * instances in the current pe set. The local information not + * easily determined by the run-time ahead of time (architecture and + * hostname) will be published during this call. + * + * @note While an oshmem_proc_t will exist with mostly valid information + * for each process in the pe set at the conclusion of this + * call, some information will not be immediately available. This + * includes the architecture and hostname, which will be available by + * the conclusion of the stage gate. + * + * @retval OSHMEM_SUCCESS System successfully initialized + * @retval OSHMEM_ERROR Initialization failed due to unspecified error + */ +OSHMEM_DECLSPEC int oshmem_proc_init(void); + +/** + * Set the arch of each proc in the oshmem_proc_list + * + * In some environments, SHMEM procs are required to exchange their + * arch via a modex operation during shmem_init. In other environments, + * the arch is determined by other mechanisms and provided to the + * proc directly. To support both mechanisms, we provide a separate + * function to set the arch of the procs -after- the modex operation + * has completed in shmem_init. + * + * @retval OSHMEM_SUCCESS Archs successfully set + * @retval OSHMEM_ERROR Archs could not be initialized + */ +OSHMEM_DECLSPEC int oshmem_proc_set_arch(void); + +/** + * Finalize the OSHMEM Process subsystem + * + * Finalize the Open SHMEM process subsystem. This function will + * release all memory created during the life of the application, + * including all oshmem_proc_t structures. + * + * @retval OSHMEM_SUCCESS System successfully finalized + */ +OSHMEM_DECLSPEC int oshmem_proc_finalize(void); + +/** + * Returns the list of proc instances associated with this job. + * + * Returns the list of proc instances associated with this job. Given + * the current association between a job and a pe set, this + * function provides the process instances for the current + * pe set. + * + * @note The reference count of each process in the array is + * NOT incremented - the caller is responsible for ensuring the + * correctness of the reference count once they are done with + * the array. + * + * @param[out] size Number of processes in the oshmem_proc_t array + * + * @return Array of pointers to proc instances in the current + * pe set, or NULL if there is an internal failure. + */ +OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_world(size_t* size);
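+ +/* + * Editorial usage sketch (hedged, not part of the original patch): the + * returned array is allocated by oshmem_proc_world() while the reference + * counts of the procs are left untouched, so the caller frees only the + * array itself: + * + * size_t nprocs; + * oshmem_proc_t **procs = oshmem_proc_world(&nprocs); + * if (NULL != procs) { + * ... inspect procs[0] .. procs[nprocs - 1] ... + * free(procs); + * } + */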
+ * + * @note The reference count of each process in the array is + * NOT incremented - the caller is responsible for ensuring the + * correctness of the reference count once they are done with + * the array. + * + * @param[in] size Number of processes in the oshmem_proc_t array + * + * @return Array of pointers to proc instances in the current + * pe set, or NULL if there is an internal failure. + */ +OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_world(size_t* size); + +/** + * Returns the list of all known proc instances. + * + * Returns the list of all known proc instances, including those in + * other pe sets. It is possible that we may no longer be + * connected to some of the procs returned (in the SHMEM sense of the + * word connected). In a strictly SHMEM-1 application, this function + * will return the same information as oshmem_proc_world(). + * + * @note The reference count of each process in the array is + * incremented and the caller is responsible for releasing each + * process in the array, as well as freeing the array. + * + * @param[in] size Number of processes in the oshmem_proc_t array + * + * @return Array of pointers to proc instances in the current + * known universe, or NULL if there is an internal failure. + */ +OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_all(size_t* size); + +/** + * Returns a list of the local process + * + * Returns a list containing the local process (and only the local + * process). Has calling semantics similar to oshmem_proc_world() and + * oshmem_proc_all(). + * + * @note The reference count of each process in the array is + * incremented and the caller is responsible for releasing each + * process in the array, as well as freeing the array. + * + * @param[in] size Number of processes in the oshmem_proc_t array + * + * @return Array of pointers to proc instances in the current + * known universe, or NULL if there is an internal failure. + */ +OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_self(size_t* size); + +/** + * Returns a pointer to the local process + * + * Returns a pointer to the local process. Unlike oshmem_proc_self(), + * the reference count on the local proc instance is not modified by + * this function. + * + * @return Pointer to the local process structure + */ +static inline oshmem_proc_t* oshmem_proc_local(void) +{ + return oshmem_proc_local_proc; +} + +/** + * Returns the proc instance for a given name + * + * Returns the proc instance for the specified process name. The + * reference count for the proc instance is not incremented by this + * function. + * + * @param[in] name The process name to look for + * + * @return Pointer to the process instance for \c name + */ +OSHMEM_DECLSPEC oshmem_proc_t * oshmem_proc_find(const orte_process_name_t* name); + +/** + * Pack proc list into portable buffer + * + * This function takes a list of oshmem_proc_t pointers (e.g. as given + * in groups) and returns a orte buffer containing all information + * needed to add the proc to a remote list. This includes the ORTE + * process name, the architecture, and the hostname. Ordering is + * maintained. The buffer is packed to be sent to a remote node with + * different architecture (endian or word size). The buffer can be + * dss unloaded to be sent using SHMEM or send using rml_send_packed(). + * + * @param[in] proclist List of process pointers + * @param[in] proclistsize Length of the proclist array + * @param[in,out] buf An orte_buffer containing the packed names. 
+ * The buffer must be constructed but empty when + * passed to this function. + * @retval OSHMEM_SUCCESS Success + * @retval OSHMEM_ERROR Unspecified error + */ +OSHMEM_DECLSPEC int oshmem_proc_pack(oshmem_proc_t **proclist, + int proclistsize, + opal_buffer_t *buf); + +/** + * Unpack a portable buffer of procs + * + * This function unpacks a packed list of oshmem_proc_t structures and + * returns the ordered list of proc structures. If the given proc is + * already "known", the architecture and hostname information in the + * buffer is ignored. If the proc is "new" to this process, it will + * be added to the global list of known procs, with information + * provided in the buffer. The lookup actions are always entirely + * local. The proclist returned is a list of pointers to all procs in + * the buffer, whether they were previously known or are new to this + * process. + * + * @note In previous versions of this function, the PML's add_procs() + * function was called for any new processes discovered as a result of + * this operation. That is no longer the case -- the caller must use + * the newproclist information to call add_procs() if necessary. + * + * @note The reference count for procs created as a result of this + * operation will be set to 1. Existing procs will not have their + * reference count changed. The reference count of a proc at the + * return of this function is the same regardless of whether NULL is + * provided for newproclist. The user is responsible for freeing the + * newproclist array. + * + * @param[in] buf orte_buffer containing the packed names + * @param[in] proclistsize number of expected proc-pointers + * @param[out] proclist list of process pointers + * @param[out] newproclistsize Number of new procs added as a result + * of the unpack operation. NULL may be + * provided if information is not needed. + * @param[out] newproclist List of new procs added as a result of + * the unpack operation. NULL may be + * provided if information is not needed. + * + * Return value: + * OSHMEM_SUCCESS on success + * OSHMEM_ERROR otherwise + */ +OSHMEM_DECLSPEC int oshmem_proc_unpack(opal_buffer_t *buf, + int proclistsize, + oshmem_proc_t ***proclist, + int *newproclistsize, + oshmem_proc_t ***newproclist); + +/** + * Refresh the OSHMEM process subsystem + * + * Refresh the Open SHMEM process subsystem. This function will update + * the list of proc instances in the current pe set with + * data from the run-time environment. + * + * @note This is primarily used when restarting a process and thus + * needs to update the jobid and node name. + * + * @retval OSHMEM_SUCCESS System successfully refreshed + * @retval OSHMEM_ERROR Refresh failed due to unspecified error + */ +OSHMEM_DECLSPEC int oshmem_proc_refresh(void); + +static inline int oshmem_proc_pe(oshmem_proc_t *proc) +{ + return (proc ? (int) proc->proc_name.vpid : -1); +} + +/** + * Initialize the OSHMEM process predefined groups + * + * Initialize the Open SHMEM process predefined groups. This function will + * query the run-time environment and build a list of the proc + * instances in the current pe set. The local information not + * easily determined by the run-time ahead of time (architecture and + * hostname) will be published during this call. + * + * @note This is primarily used once during SHMEM setup. + * + * @retval OSHMEM_SUCCESS System successfully initialized + * @retval OSHMEM_ERROR Initialization failed due to unspecified error + */ +OSHMEM_DECLSPEC int oshmem_proc_group_init(void);
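+ +/* + * Editorial usage sketch (hedged, not part of the original patch): once + * oshmem_proc_group_init() has returned OSHMEM_SUCCESS, the predefined + * groups can be queried directly, e.g.: + * + * if (OSHMEM_SUCCESS == oshmem_proc_group_init()) { + * int me = oshmem_group_self->my_pe; + * int npes = oshmem_group_all->proc_count; + * ... + * } + */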
+ +/** + * Finalize the OSHMEM process predefined groups + * + * Finalize the Open SHMEM process predefined groups. This function + * releases the predefined groups created by oshmem_proc_group_init() + * and destroys any groups still registered in the global group array. + * + * @note This is primarily used once during SHMEM tear-down. + * + * @retval OSHMEM_SUCCESS System successfully finalized + * @retval OSHMEM_ERROR Finalization failed due to unspecified error + */ +OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void); + +/** + * Create processes group. + * + * Builds a group from the proc instances that belong to the given + * active set and returns it. + * + * @param[in] pe_start The lowest PE in the active set. + * @param[in] pe_stride The stride between consecutive PEs in the + * active set (0 selects consecutive PEs). + * @param[in] pe_size The number of PEs in the active set. + * + * @return The newly created group, or NULL if there is an + * internal failure. + */ +OSHMEM_DECLSPEC oshmem_group_t* oshmem_proc_group_create(int pe_start, + int pe_stride, + size_t pe_size); + +/** + * Destroy processes group. + * + */ +OSHMEM_DECLSPEC void oshmem_proc_group_destroy(oshmem_group_t* group); + +static inline oshmem_proc_t *oshmem_proc_group_all(int pe) +{ + return oshmem_group_all->proc_array[pe]; +} + +static inline oshmem_proc_t* oshmem_proc_group_find(oshmem_group_t* group, + int pe) +{ + int i = 0; + oshmem_proc_t* proc = NULL; + + if (OPAL_LIKELY(group)) { + if (OPAL_LIKELY(group == oshmem_group_all)) { + /* To improve performance use the direct index;
it is a feature of oshmem_group_all. */ + proc = group->proc_array[pe]; + } else { + for (i = 0; i < group->proc_count; i++) { + if (pe == oshmem_proc_pe(group->proc_array[i])) { + proc = group->proc_array[i]; + break; + } + } + } + } else { + orte_process_name_t name; + + name.jobid = ORTE_PROC_MY_NAME->jobid; + name.vpid = pe; + proc = oshmem_proc_find(&name); + } + + return proc; +} + +static inline int oshmem_proc_group_find_id(oshmem_group_t* group, int pe) +{ + int i = 0; + int id = -1; + + if (group) { + for (i = 0; i < group->proc_count; i++) { + if (pe == oshmem_proc_pe(group->proc_array[i])) { + id = i; + break; + } + } + } + + return id; +} + +static inline int oshmem_proc_group_is_member(oshmem_group_t *group) +{ + return group->is_member; +} + +static inline int oshmem_num_procs(void) +{ + extern opal_list_t oshmem_proc_list; + + if (!oshmem_group_all) + return opal_list_get_size(&oshmem_proc_list); + + return oshmem_group_all->proc_count; +} + +static inline int oshmem_my_proc_id(void) +{ + return oshmem_group_self->my_pe; +} + +static inline int oshmem_get_transport_id(int pe) +{ + oshmem_proc_t *proc; + + proc = oshmem_proc_group_find(oshmem_group_all, pe); + + return (int) proc->transport_ids[0]; +} + +static inline int oshmem_get_transport_count(int pe) +{ + oshmem_proc_t *proc; + proc = oshmem_proc_group_find(oshmem_group_all, pe); + return proc->num_transports; +} + +END_C_DECLS + +#endif /* OSHMEM_PROC_PROC_H */ diff --git a/oshmem/proc/proc_group_cache.c b/oshmem/proc/proc_group_cache.c new file mode 100644 index 0000000000..a5682abc98 --- /dev/null +++ b/oshmem/proc/proc_group_cache.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "oshmem/proc/proc_group_cache.h" +#include "oshmem/constants.h" +#include "oshmem/runtime/runtime.h" + +OBJ_CLASS_INSTANCE(oshmem_group_cache_t, opal_object_t, NULL, NULL); +opal_list_t oshmem_group_cache_list; +unsigned int oshmem_group_cache_size; +oshmem_group_t* find_group_in_cache(int PE_start, int logPE_stride, int PE_size) +{ + int cache_look_up_id[3] = { PE_start, logPE_stride, PE_size }; + opal_list_item_t *item; + if (opal_list_is_empty(&oshmem_group_cache_list)) { + return NULL ; + } + + for (item = opal_list_get_first(&oshmem_group_cache_list); + item != opal_list_get_end(&oshmem_group_cache_list); + item = opal_list_get_next(item)) { + if (!memcmp(((oshmem_group_cache_t *) item)->cache_id, + cache_look_up_id, + 3 * sizeof(int))) { + return ((oshmem_group_cache_t *) item)->group; + } + } + return NULL ; +} + +int cache_group(oshmem_group_t *group, + int PE_start, + int logPE_stride, + int PE_size) +{ + oshmem_group_cache_t *cached_group = NULL; + cached_group = OBJ_NEW(oshmem_group_cache_t); +#if OPAL_ENABLE_DEBUG + cached_group->item.opal_list_item_belong_to = NULL; + cached_group->item.opal_list_item_refcount = 0; +#endif + cached_group->group = group; + cached_group->cache_id[0] = PE_start; + cached_group->cache_id[1] = logPE_stride; + cached_group->cache_id[2] = PE_size; + if (opal_list_get_size(&oshmem_group_cache_list) + < oshmem_group_cache_size) { + opal_list_append(&oshmem_group_cache_list, + (opal_list_item_t *)cached_group); + } else { +#if ABORT_ON_CACHE_OVERFLOW + opal_output(0, + "error: group cache overflow on rank %i: cache_size = %u: try increasing the oshmem_group_cache_size mca parameter", + group->my_pe, + oshmem_group_cache_size); + oshmem_shmem_abort(-1); +#else + /* This part of the code implements FIFO group cache management; define + * ABORT_ON_CACHE_OVERFLOW as 0 to enable it. */ + oshmem_group_cache_t *cached_group_to_remove = (oshmem_group_cache_t *)opal_list_remove_first(&oshmem_group_cache_list); + oshmem_proc_group_destroy(cached_group_to_remove->group); + OBJ_RELEASE(cached_group_to_remove); + opal_list_append(&oshmem_group_cache_list,(opal_list_item_t *)cached_group); +#endif + } + return OSHMEM_SUCCESS; +} + +int oshmem_group_cache_list_init(void) +{ + int mca_value; + int cache_size_default = 100; + OBJ_CONSTRUCT(&oshmem_group_cache_list, opal_list_t); + + mca_value = cache_size_default; + (void) mca_base_var_register("oshmem", + "proc", + NULL, + "group_cache_size", + "The depth of the oshmem_group cache list used to speed up collective operations", + MCA_BASE_VAR_TYPE_INT, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_value); + if (mca_value < 0) { + opal_output(0, + "error: the oshmem_group_cache_size mca parameter was set to %i while it has to be a positive value. The default value %i will be used.", + mca_value, + cache_size_default); + mca_value = cache_size_default; + } + oshmem_group_cache_size = (unsigned int) mca_value; + return OSHMEM_SUCCESS; +} + +int oshmem_group_cache_list_free(void) +{ + oshmem_group_cache_t *cached_group = NULL; + opal_list_item_t *item; + while (NULL != (item = opal_list_remove_first(&oshmem_group_cache_list))) { + cached_group = (oshmem_group_cache_t *) item; + oshmem_proc_group_destroy(cached_group->group); + OBJ_RELEASE(cached_group); + } + return OSHMEM_SUCCESS; +} + diff --git a/oshmem/proc/proc_group_cache.h b/oshmem/proc/proc_group_cache.h new file mode 100644 index 0000000000..d97cfa8380 --- /dev/null +++ b/oshmem/proc/proc_group_cache.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef _PROC_GROUP_CACHE_H +#define _PROC_GROUP_CACHE_H + +#include "oshmem_config.h" +#include "proc.h" + +#define OSHMEM_GROUP_CACHE_ENABLED 1 +#define ABORT_ON_CACHE_OVERFLOW 1 +BEGIN_C_DECLS +struct oshmem_group_cache_t { + opal_list_item_t item; + oshmem_group_t *group; + int cache_id[3]; +}; + +typedef struct oshmem_group_cache_t oshmem_group_cache_t; +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(oshmem_group_cache_t); +OSHMEM_DECLSPEC extern opal_list_t oshmem_group_cache_list; + +oshmem_group_t* find_group_in_cache(int PE_start, int logPE_stride, int PE_size); + +int cache_group(oshmem_group_t *group, + int PE_start, + int logPE_stride, + int PE_size); +int oshmem_group_cache_list_init(void); +int oshmem_group_cache_list_free(void); + +extern unsigned int oshmem_group_cache_size; +END_C_DECLS + +#endif diff --git a/oshmem/request/Makefile.am b/oshmem/request/Makefile.am new file mode 100644 index 0000000000..24afed3305 --- /dev/null +++ b/oshmem/request/Makefile.am @@ -0,0 +1,19 @@ +# -*- makefile -*- +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This makefile.am does not stand on its own - it is included from oshmem/Makefile.am + + +headers += \ + request/request.h \ + request/request_dbg.h + +libshmem_la_SOURCES += \ + request/request.c diff --git a/oshmem/request/request.c b/oshmem/request/request.c new file mode 100644 index 0000000000..564765abbd --- /dev/null +++ b/oshmem/request/request.c @@ -0,0 +1,177 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include "ompi/communicator/communicator.h" /* TODO: ompi_predefined_communicator_t*/ +#include "opal/class/opal_object.h" +#include "oshmem/request/request.h" +#include "oshmem/constants.h" +#include "oshmem/proc/proc.h" + +opal_pointer_array_t oshmem_request_f_to_c_table; +size_t oshmem_request_waiting = 0; +size_t oshmem_request_completed = 0; +opal_mutex_t oshmem_request_lock; +opal_condition_t oshmem_request_cond; +oshmem_predefined_request_t oshmem_request_null; +oshmem_request_t oshmem_request_empty; +oshmem_status_public_t oshmem_status_empty; +oshmem_request_fns_t oshmem_request_functions = { + NULL, /*oshmem_request_default_test,*/ + NULL, /*oshmem_request_default_test_any,*/ + NULL, /*oshmem_request_default_test_all,*/ + NULL, /*oshmem_request_default_test_some,*/ + NULL, /*oshmem_request_default_wait,*/ + NULL, /*oshmem_request_default_wait_any,*/ + NULL, /*oshmem_request_default_wait_all,*/ + NULL, /*oshmem_request_default_wait_some*/ +}; + +static void oshmem_request_construct(oshmem_request_t* req) +{ + /* don't call _INIT; we don't want to set the request to _INACTIVE, and there + * will be no matching _FINI invocation */ + req->req_state = OSHMEM_REQUEST_INVALID; + req->req_complete = false; + req->req_persistent = false; + req->req_free = NULL; + req->req_cancel = NULL; + req->req_complete_cb = NULL; + req->req_complete_cb_data = NULL; + req->req_f_to_c_index = SHMEM_UNDEFINED; + req->req_shmem_object.comm = (oshmem_group_t*) NULL; /* TODO: Implement*/ +} + +static void oshmem_request_destruct(oshmem_request_t* req) +{ + assert( SHMEM_UNDEFINED == req->req_f_to_c_index); + assert( OSHMEM_REQUEST_INVALID == req->req_state); +} + +static int oshmem_request_null_free(oshmem_request_t** request) +{ + return OSHMEM_SUCCESS; +} + +static int oshmem_request_null_cancel(oshmem_request_t* request, int flag) +{ + return OSHMEM_SUCCESS; +} + +static int oshmem_request_empty_free(oshmem_request_t** request) +{ + *request = &oshmem_request_null.request; + return OSHMEM_SUCCESS; +} + +int oshmem_request_persistent_proc_null_free(oshmem_request_t** request) +{ + OSHMEM_REQUEST_FINI(*request); + (*request)->req_state = OSHMEM_REQUEST_INVALID; + OBJ_RELEASE(*request); + *request = &oshmem_request_null.request; + return OSHMEM_SUCCESS; +} + +/*TODO: define under class oshmem_free_list_item_t */ +OBJ_CLASS_INSTANCE( oshmem_request_t, + ompi_free_list_item_t, + oshmem_request_construct, + oshmem_request_destruct); + +int oshmem_request_init(void) +{ + OBJ_CONSTRUCT(&oshmem_request_lock, opal_mutex_t); + OBJ_CONSTRUCT(&oshmem_request_cond, opal_condition_t); + + OBJ_CONSTRUCT(&oshmem_request_null, oshmem_request_t); + OBJ_CONSTRUCT(&oshmem_request_f_to_c_table, opal_pointer_array_t); + if (OPAL_SUCCESS + != opal_pointer_array_init(&oshmem_request_f_to_c_table, + 0, + OMPI_FORTRAN_HANDLE_MAX, + 64)) { + return OSHMEM_ERROR; + }
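+ + /* Editorial note (not part of the original patch): the two predefined + * requests set up below are expected to land at fixed Fortran-to-C + * indices - oshmem_request_null in slot 0 and oshmem_request_empty in + * slot 1; the sanity check after each setup returns OSHMEM_ERR_REQUEST + * if the pointer array handed out any other index. */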
+ oshmem_request_null.request.req_type = OSHMEM_REQUEST_NULL; + oshmem_request_null.request.req_status.SHMEM_SOURCE = SHMEM_PROC_NULL; + oshmem_request_null.request.req_status.SHMEM_ERROR = SHMEM_SUCCESS; + oshmem_request_null.request.req_status._count = 0; + oshmem_request_null.request.req_status._cancelled = 0; + + oshmem_request_null.request.req_complete = true; + oshmem_request_null.request.req_state = OSHMEM_REQUEST_INACTIVE; + oshmem_request_null.request.req_persistent = false; + oshmem_request_null.request.req_f_to_c_index = + opal_pointer_array_add(&oshmem_request_f_to_c_table, + &oshmem_request_null); + oshmem_request_null.request.req_free = oshmem_request_null_free; + oshmem_request_null.request.req_cancel = oshmem_request_null_cancel; + oshmem_request_null.request.req_shmem_object.comm = + (oshmem_group_t*) &ompi_mpi_comm_world.comm; + + if (0 != oshmem_request_null.request.req_f_to_c_index) { + return OSHMEM_ERR_REQUEST; + } + + /* We need a way to distinguish between a user-provided + * SHMEM_REQUEST_NULL passed to SHMEM_Wait* and a non-active (SHMEM_PROC_NULL) + * request passed to any P2P non-blocking function. + * + * The main differences from oshmem_request_null are + * req_state being OSHMEM_REQUEST_ACTIVE (so that SHMEM_Waitall + * does not set the status to oshmem_status_empty) and a different + * req_free function, which resets the + * request to SHMEM_REQUEST_NULL. + * The req_cancel function need not be changed. + */ + OBJ_CONSTRUCT(&oshmem_request_empty, oshmem_request_t); + oshmem_request_empty.req_type = OSHMEM_REQUEST_NULL; + oshmem_request_empty.req_status.SHMEM_SOURCE = SHMEM_PROC_NULL; + oshmem_request_empty.req_status.SHMEM_ERROR = SHMEM_SUCCESS; + oshmem_request_empty.req_status._count = 0; + oshmem_request_empty.req_status._cancelled = 0; + + oshmem_request_empty.req_complete = true; + oshmem_request_empty.req_state = OSHMEM_REQUEST_ACTIVE; + oshmem_request_empty.req_persistent = false; + oshmem_request_empty.req_f_to_c_index = + opal_pointer_array_add(&oshmem_request_f_to_c_table, + &oshmem_request_empty); + oshmem_request_empty.req_free = oshmem_request_empty_free; + oshmem_request_empty.req_cancel = oshmem_request_null_cancel; + oshmem_request_empty.req_shmem_object.comm = + (oshmem_group_t*) &ompi_mpi_comm_world.comm; + + if (1 != oshmem_request_empty.req_f_to_c_index) { + return OSHMEM_ERR_REQUEST; + } + + oshmem_status_empty.SHMEM_SOURCE = SHMEM_ANY_SOURCE; + oshmem_status_empty.SHMEM_ERROR = SHMEM_SUCCESS; + oshmem_status_empty._count = 0; + oshmem_status_empty._cancelled = 0; + + return OSHMEM_SUCCESS; +} + +int oshmem_request_finalize(void) +{ + OSHMEM_REQUEST_FINI( &oshmem_request_null.request); + OBJ_DESTRUCT( &oshmem_request_null.request); + OSHMEM_REQUEST_FINI( &oshmem_request_empty); + OBJ_DESTRUCT( &oshmem_request_empty); + OBJ_DESTRUCT( &oshmem_request_cond); + OBJ_DESTRUCT( &oshmem_request_lock); + OBJ_DESTRUCT( &oshmem_request_f_to_c_table); + return OSHMEM_SUCCESS; +} diff --git a/oshmem/request/request.h b/oshmem/request/request.h new file mode 100644 index 0000000000..5f87029ac9 --- /dev/null +++ b/oshmem/request/request.h @@ -0,0 +1,444 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ *
+ * Top-level description of requests
+ */
+
+#ifndef OSHMEM_REQUEST_H
+#define OSHMEM_REQUEST_H
+
+#include "oshmem_config.h"
+#include "oshmem/constants.h"
+
+#include "ompi/class/ompi_free_list.h"
+
+#include "opal/class/opal_pointer_array.h"
+#include "opal/threads/condition.h"
+
+BEGIN_C_DECLS
+
+/**
+ * Request class
+ */
+/*OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(oshmem_request_t);*/
+OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(oshmem_request_t);
+
+/*
+ * The following include pulls in shared typedefs with debugger plugins.
+ * For more information on why we do this see the Notice to developers
+ * comment at the top of the oshmem_msgq_dll.c file.
+ */
+
+#include "request_dbg.h"
+
+struct oshmem_request_t;
+
+typedef struct oshmem_request_t *SHMEM_Request;
+typedef struct oshmem_status_public_t SHMEM_Status;
+
+/* These constants are used to check the status of request->req_status.SHMEM_ERROR */
+#define SHMEM_SUCCESS 0
+#define SHMEM_ERR_IN_STATUS 18
+
+/*
+ * SHMEM_Status
+ */
+struct oshmem_status_public_t {
+    int SHMEM_SOURCE;
+    /*int MPI_TAG;*/
+    int SHMEM_ERROR;
+    int _count;
+    int _cancelled;
+};
+typedef struct oshmem_status_public_t oshmem_status_public_t;
+
+typedef int (SHMEM_Grequest_query_function)(void *, SHMEM_Status *);
+typedef int (SHMEM_Grequest_free_function)(void *);
+typedef int (SHMEM_Grequest_cancel_function)(void *, int);
+
+#define SHMEM_STATUS_IGNORE ((SHMEM_Status *) 0)
+#define SHMEM_STATUSES_IGNORE ((SHMEM_Status *) 0)
+
+#define SHMEM_REQUEST_NULL OSHMEM_PREDEFINED_GLOBAL(SHMEM_Request, oshmem_request_null)
+
+/*
+ * Required function to free the request and any associated resources.
+ */
+typedef int (*oshmem_request_free_fn_t)(struct oshmem_request_t** rptr);
+
+/*
+ * Optional function to cancel a pending request.
+ */
+typedef int (*oshmem_request_cancel_fn_t)(struct oshmem_request_t* request,
+                                          int flag);
+
+/*
+ * Optional function called when the request is completed from the SHMEM
+ * library perspective. This function is not allowed to release any
+ * resources related to the request.
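+ * Any such release must instead happen via the request's req_free callback.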
+ */
+typedef int (*oshmem_request_complete_fn_t)(struct oshmem_request_t* request);
+
+/* TODO: decide whether to remove comm */
+/**
+ * Forward declaration
+ */
+struct oshmem_group_t;
+
+/**
+ * Forward declaration
+ */
+/*struct oshmem_file_t;*/
+
+/**
+ * Union for holding several different SHMEM pointer types on the request
+ */
+typedef union oshmem_shmem_object_t {
+    struct oshmem_group_t *comm;
+/*    struct oshmem_file_t *file;*/
+} oshmem_shmem_object_t;
+
+/**
+ * Main top-level request struct definition
+ */
+struct oshmem_request_t {
+    ompi_free_list_item_t super;       /**< Base type *//*TODO: Implement in shmem */
+    oshmem_request_type_t req_type;    /**< Enum indicating the type of the request */
+    oshmem_status_public_t req_status; /**< Completion status */
+    volatile bool req_complete;        /**< Flag indicating completion on a request */
+    volatile oshmem_request_state_t req_state; /**< enum indicating the state of the request */
+    bool req_persistent;               /* TODO: NOT Required */
+    /**< flag indicating if this is a persistent request */
+    int req_f_to_c_index;              /* TODO: NOT Required */
+    /**< Index in Fortran <-> C translation array */
+    oshmem_request_free_fn_t req_free; /**< Called by free */
+    oshmem_request_cancel_fn_t req_cancel; /* TODO: Not Required */
+    /**< Optional function to cancel the request */
+    oshmem_request_complete_fn_t req_complete_cb; /**< Called when the request is SHMEM completed */
+    void *req_complete_cb_data;
+    oshmem_shmem_object_t req_shmem_object; /**< Pointer to SHMEM object that created this request */
+};
+
+/**
+ * Convenience typedef
+ */
+typedef struct oshmem_request_t oshmem_request_t;
+
+/**
+ * Padded struct to maintain backward compatibility.
+ * See oshmem/communicator/communicator.h comments with struct oshmem_group_t
+ * for a full explanation of why we chose the following padding construct for predefines.
+ */
+#define PREDEFINED_REQUEST_PAD (sizeof(void*) * 32)
+
+struct oshmem_predefined_request_t {
+    struct oshmem_request_t request;
+    char padding[PREDEFINED_REQUEST_PAD - sizeof(oshmem_request_t)];
+};
+
+typedef struct oshmem_predefined_request_t oshmem_predefined_request_t;
+
+/**
+ * Initialize a request.  This is a macro to avoid function call
+ * overhead, since this is typically invoked in the critical
+ * performance path (since requests may be re-used, it is possible
+ * that we will have to initialize a request multiple times).
+ */
+#define OSHMEM_REQUEST_INIT(request, persistent)        \
+    do {                                                \
+        (request)->req_complete = false;                \
+        (request)->req_state = OSHMEM_REQUEST_INACTIVE; \
+        (request)->req_persistent = (persistent);       \
+    } while (0);
+
+/**
+ * Finalize a request.  This is a macro to avoid function call
+ * overhead, since this is typically invoked in the critical
+ * performance path (since requests may be re-used, it is possible
+ * that we will have to finalize a request multiple times).
+ *
+ * When finalizing a request, if MPI_Request_f2c() was previously
+ * invoked on that request, then this request was added to the f2c
+ * table, and we need to remove it.
+ *
+ * This function should be called only from the SHMEM layer. It should
+ * never be called from the SPML. It takes care of the upper-level clean-up.
+ * When the user calls MPI_Request_free we should release all SHMEM-level
+ * resources, so we have to call this function too.
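+ *
+ * Illustrative lifecycle sketch (editorial example, not part of the
+ * implementation; get_request() is a hypothetical helper, everything
+ * else is declared in this header):
+ *
+ *   oshmem_request_t *req = get_request();  // e.g., from a free list
+ *   OSHMEM_REQUEST_INIT(req, false);        // mark inactive, non-persistent
+ *   ...                                     // the SPML progresses the request
+ *   OSHMEM_REQUEST_FINI(req);               // drop any f2c table entry
+ *   req->req_free(&req);                    // release SHMEM-level resources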
+ */
+#define OSHMEM_REQUEST_FINI(request)                                      \
+do {                                                                      \
+    (request)->req_state = OSHMEM_REQUEST_INVALID;                        \
+    if (SHMEM_UNDEFINED != (request)->req_f_to_c_index) {                 \
+        opal_pointer_array_set_item(&oshmem_request_f_to_c_table,         \
+                                    (request)->req_f_to_c_index, NULL);   \
+        (request)->req_f_to_c_index = SHMEM_UNDEFINED;                    \
+    }                                                                     \
+} while (0);
+
+/**
+ * Non-blocking test for request completion.
+ *
+ * @param request (IN) Request to test
+ * @param complete (OUT) Flag indicating whether the request completed.
+ * @param status (OUT) Status of completed request.
+ * @return OSHMEM_SUCCESS or failure status.
+ *
+ * Note that upon completion, the request is freed, and the
+ * request handle is set to NULL.
+ */
+typedef int (*oshmem_request_test_fn_t)(oshmem_request_t ** rptr,
+                                        int *completed,
+                                        oshmem_status_public_t * status);
+/**
+ * Non-blocking test for request completion.
+ *
+ * @param count (IN) Number of requests
+ * @param request (IN) Array of requests
+ * @param index (OUT) Index of first completed request.
+ * @param complete (OUT) Flag indicating if index is valid (a request completed).
+ * @param status (OUT) Status of completed request.
+ * @return OSHMEM_SUCCESS or failure status.
+ *
+ * Note that upon completion, the request is freed, and the
+ * request handle at that index is set to NULL.
+ */
+typedef int (*oshmem_request_test_any_fn_t)(size_t count,
+                                            oshmem_request_t ** requests,
+                                            int *index,
+                                            int *completed,
+                                            oshmem_status_public_t * status);
+/**
+ * Non-blocking test for request completion.
+ *
+ * @param count (IN) Number of requests
+ * @param requests (IN) Array of requests
+ * @param completed (OUT) Flag indicating whether all requests completed.
+ * @param statuses (OUT) Array of completion statuses.
+ * @return OSHMEM_SUCCESS or failure status.
+ *
+ * This routine returns completed==true if all requests have completed.
+ * The statuses parameter is only updated if all requests completed. Likewise,
+ * the requests array is not modified (no requests freed), unless all requests
+ * have completed.
+ */
+typedef int (*oshmem_request_test_all_fn_t)(size_t count,
+                                            oshmem_request_t ** requests,
+                                            int *completed,
+                                            oshmem_status_public_t * statuses);
+/**
+ * Non-blocking test for some of N requests to complete.
+ *
+ * @param count (IN) Number of requests
+ * @param requests (INOUT) Array of requests
+ * @param outcount (OUT) Number of finished requests
+ * @param indices (OUT) Indices of the finished requests
+ * @param statuses (OUT) Array of completion statuses.
+ * @return OSHMEM_SUCCESS, OSHMEM_ERR_IN_STATUS or failure status.
+ *
+ */
+typedef int (*oshmem_request_test_some_fn_t)(size_t count,
+                                             oshmem_request_t ** requests,
+                                             int * outcount,
+                                             int * indices,
+                                             oshmem_status_public_t * statuses);
+/**
+ * Wait (blocking-mode) for one request to complete.
+ *
+ * @param request (IN) Pointer to request.
+ * @param status (OUT) Status of completed request.
+ * @return OSHMEM_SUCCESS or failure status.
+ *
+ */
+typedef int (*oshmem_request_wait_fn_t)(oshmem_request_t ** req_ptr,
+                                        oshmem_status_public_t * status);
+/**
+ * Wait (blocking-mode) for one of N requests to complete.
+ *
+ * @param count (IN) Number of requests
+ * @param requests (IN) Array of requests
+ * @param index (OUT) Index into request array of completed request.
+ * @param status (OUT) Status of completed request.
+ * @return OSHMEM_SUCCESS or failure status.
+ * + */ +typedef int (*oshmem_request_wait_any_fn_t)(size_t count, + oshmem_request_t ** requests, + int *index, + oshmem_status_public_t * status); +/** + * Wait (blocking-mode) for all of N requests to complete. + * + * @param count (IN) Number of requests + * @param requests (IN) Array of requests + * @param statuses (OUT) Array of completion statuses. + * @return OSHMEM_SUCCESS or failure status. + * + */ +typedef int (*oshmem_request_wait_all_fn_t)(size_t count, + oshmem_request_t ** requests, + oshmem_status_public_t * statuses); +/** + * Wait (blocking-mode) for some of N requests to complete. + * + * @param count (IN) Number of requests + * @param requests (INOUT) Array of requests + * @param outcount (OUT) Number of finished requests + * @param indices (OUT) Indices of the finished requests + * @param statuses (OUT) Array of completion statuses. + * @return OSHMEM_SUCCESS, OSHMEM_ERR_IN_STATUS or failure status. + * + */ +typedef int (*oshmem_request_wait_some_fn_t)(size_t count, + oshmem_request_t ** requests, + int * outcount, + int * indices, + oshmem_status_public_t * statuses); + +/** + * Replaceable request functions + */ +typedef struct oshmem_request_fns_t { + oshmem_request_test_fn_t req_test; + oshmem_request_test_any_fn_t req_test_any; + oshmem_request_test_all_fn_t req_test_all; + oshmem_request_test_some_fn_t req_test_some; + oshmem_request_wait_fn_t req_wait; + oshmem_request_wait_any_fn_t req_wait_any; + oshmem_request_wait_all_fn_t req_wait_all; + oshmem_request_wait_some_fn_t req_wait_some; +} oshmem_request_fns_t; + +/** + * Globals used for tracking requests and request completion. + */ +OSHMEM_DECLSPEC extern opal_pointer_array_t oshmem_request_f_to_c_table; +OSHMEM_DECLSPEC extern size_t oshmem_request_waiting; +OSHMEM_DECLSPEC extern size_t oshmem_request_completed; +OSHMEM_DECLSPEC extern int32_t oshmem_request_poll; +OSHMEM_DECLSPEC extern opal_mutex_t oshmem_request_lock; +OSHMEM_DECLSPEC extern opal_condition_t oshmem_request_cond; +OSHMEM_DECLSPEC extern oshmem_predefined_request_t oshmem_request_null; +OSHMEM_DECLSPEC extern oshmem_request_t oshmem_request_empty; +OSHMEM_DECLSPEC extern oshmem_status_public_t oshmem_status_empty; +OSHMEM_DECLSPEC extern oshmem_request_fns_t oshmem_request_functions; + +/** + * Initialize the OSHMEM_Request subsystem; invoked during SHMEM_INIT. + */ +int oshmem_request_init(void); + +/** + * Free a persistent request to a MPI_PROC_NULL peer (there's no + * freelist to put it back to, so we have to actually OBJ_RELEASE it). + */ +OSHMEM_DECLSPEC int oshmem_request_persistent_proc_null_free(oshmem_request_t **request); + +/** + * Shut down the SHMEM_Request subsystem; invoked during SHMEM_FINALIZE. + */ +int oshmem_request_finalize(void); + +/** + * Cancel a pending request. + */ +static inline int oshmem_request_cancel(oshmem_request_t* request) +{ + if (request->req_cancel != NULL ) { + return request->req_cancel(request, true); + } + return OSHMEM_SUCCESS; +} + +/** + * Free a request. + * + * @param request (INOUT) Pointer to request. 
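+ * @return Whatever the request's req_free callback returns.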
+ */
+static inline int oshmem_request_free(oshmem_request_t** request)
+{
+    return (*request)->req_free(request);
+}
+
+#define oshmem_request_test      (oshmem_request_functions.req_test)
+#define oshmem_request_test_any  (oshmem_request_functions.req_test_any)
+#define oshmem_request_test_all  (oshmem_request_functions.req_test_all)
+#define oshmem_request_test_some (oshmem_request_functions.req_test_some)
+#define oshmem_request_wait      (oshmem_request_functions.req_wait)
+#define oshmem_request_wait_any  (oshmem_request_functions.req_wait_any)
+#define oshmem_request_wait_all  (oshmem_request_functions.req_wait_all)
+#define oshmem_request_wait_some (oshmem_request_functions.req_wait_some)
+
+/**
+ * Wait for any completion. It is the caller's responsibility to check
+ * the condition and call us again if needed.
+ */
+static inline void oshmem_request_wait_any_completion(void)
+{
+    OPAL_THREAD_LOCK(&oshmem_request_lock);
+    oshmem_request_waiting++;
+    opal_condition_wait(&oshmem_request_cond, &oshmem_request_lock);
+    oshmem_request_waiting--;
+    OPAL_THREAD_UNLOCK(&oshmem_request_lock);
+}
+
+/**
+ * Wait for a particular request to complete
+ */
+static inline void oshmem_request_wait_completion(oshmem_request_t *req)
+{
+    if (false == req->req_complete) {
+#if OPAL_ENABLE_PROGRESS_THREADS
+        if(opal_progress_spin(&req->req_complete)) {
+            return;
+        }
+#endif
+        OPAL_THREAD_LOCK(&oshmem_request_lock);
+        oshmem_request_waiting++;
+        while (false == req->req_complete) {
+            opal_condition_wait(&oshmem_request_cond, &oshmem_request_lock);
+        }
+        oshmem_request_waiting--;
+        OPAL_THREAD_UNLOCK(&oshmem_request_lock);
+    }
+}
+
+/**
+ * Signal or mark a request as complete. If with_signal is true this will
+ * wake any thread pending on the request, and oshmem_request_lock should be
+ * held while calling this function. If with_signal is false, no signal is
+ * generated and no lock is required. This is a special case when
+ * the function is called from the critical path for small messages, where
+ * we know the current execution flow created the request, and it is still
+ * in the _START macro.
+ */
+static inline int oshmem_request_complete(oshmem_request_t* request,
+                                          bool with_signal)
+{
+    if (NULL != request->req_complete_cb) {
+        request->req_complete_cb(request);
+    }
+    oshmem_request_completed++;
+    request->req_complete = true;
+    if (with_signal && oshmem_request_waiting) {
+        /* Broadcast the condition, otherwise if there is already a thread
+         * waiting on another request it can consume all the signals.
+         */
+        opal_condition_broadcast(&oshmem_request_cond);
+    }
+    return OSHMEM_SUCCESS;
+}
+
+END_C_DECLS
+
+#endif
diff --git a/oshmem/request/request_dbg.h b/oshmem/request/request_dbg.h
new file mode 100644
index 0000000000..6800d37aa3
--- /dev/null
+++ b/oshmem/request/request_dbg.h
@@ -0,0 +1,49 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#ifndef OSHMEM_REQUEST_DBG_H
+#define OSHMEM_REQUEST_DBG_H
+
+/*
+ * This file contains definitions used by both OSHMEM and debugger plugins.
+ * For more information on why we do this see the Notice to developers
+ * comment at the top of the ompi_msgq_dll.c file.
+ */
+
+/**
+ * Enum indicating the type of the request
+ */
+typedef enum {
+    OSHMEM_REQUEST_SPML, /**< SPML point-to-point request */
+    OSHMEM_REQUEST_IO,   /**< MPI-2 IO request */
+    OSHMEM_REQUEST_GEN,  /**< MPI-2 generalized request */
+    OSHMEM_REQUEST_WIN,  /**< MPI-2 one-sided request */
+    OSHMEM_REQUEST_COLL, /**< MPI-3 non-blocking collectives request */
+    OSHMEM_REQUEST_NULL, /**< NULL request */
+    OSHMEM_REQUEST_NOOP, /**< A request that does nothing (e.g., to PROC_NULL) */
+    OSHMEM_REQUEST_MAX   /**< Maximum request type */
+} oshmem_request_type_t;
+
+/**
+ * Enum indicating the state of the request
+ */
+typedef enum {
+    /** Indicates that the request should not be progressed */
+    OSHMEM_REQUEST_INVALID,
+    /** A defined, but inactive request (i.e., it's valid, but should
+        not be progressed) */
+    OSHMEM_REQUEST_INACTIVE,
+    /** A valid and progressing request */
+    OSHMEM_REQUEST_ACTIVE,
+    /** The request has been cancelled */
+    OSHMEM_REQUEST_CANCELLED /* TODO: Not required */
+} oshmem_request_state_t;
+
+#endif
diff --git a/oshmem/runtime/Makefile.am b/oshmem/runtime/Makefile.am
new file mode 100644
index 0000000000..b32913515b
--- /dev/null
+++ b/oshmem/runtime/Makefile.am
@@ -0,0 +1,28 @@
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+# All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# This makefile.am does not stand on its own - it is included from oshmem/Makefile.am
+
+dist_pkgdata_DATA += runtime/help-shmem-runtime.txt
+
+
+headers += \
+    runtime/runtime.h \
+    runtime/params.h \
+    runtime/oshmem_shmem_preconnect.h
+
+libshmem_la_SOURCES += \
+    runtime/oshmem_shmem_init.c \
+    runtime/oshmem_shmem_finalize.c \
+    runtime/oshmem_shmem_abort.c \
+    runtime/oshmem_shmem_params.c \
+    runtime/oshmem_shmem_exchange.c
+
+
+
diff --git a/oshmem/runtime/help-shmem-runtime.txt b/oshmem/runtime/help-shmem-runtime.txt
new file mode 100644
index 0000000000..9dc1e30fad
--- /dev/null
+++ b/oshmem/runtime/help-shmem-runtime.txt
@@ -0,0 +1,64 @@
+# -*- text -*-
+#
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+# All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This is the US/English general help file for Open SHMEM.
+#
+[shmem_init:startup:internal-failure]
+It looks like %s failed for some reason; your parallel process is
+likely to abort. There are many reasons that a parallel process can
+fail during %s; some of which are due to configuration or environment
+problems. This failure appears to be an internal failure; here's some
+additional information (which may only be relevant to an Open SHMEM
+developer):
+
+  %s
+  --> Returned "%s" (%d) instead of "Success" (0)
+#
+[shmem_finalize:invoked_multiple_times]
+The function SHMEM_FINALIZE was invoked multiple times in a single
+process on host %s, PID %d.
+
+This indicates an erroneous SHMEM program; SHMEM_FINALIZE is only allowed
+to be invoked exactly once in a process.
+#
+[heterogeneous-support-unavailable]
+This installation of Open SHMEM was configured without support for
+heterogeneous architectures, but at least one node in the allocation
+was detected to have a different architecture. The detected node was:
+
+Node: %s
+
+In order to operate in a heterogeneous environment, please reconfigure
+Open SHMEM with --enable-heterogeneous.
+#
+[shmem_init:warn-fork]
+A SHMEM process has executed an operation involving a call to the
+"fork()" system call to create a child process. Open SHMEM is currently
+operating in a condition that could result in memory corruption or
+other system errors; your SHMEM job may hang, crash, or produce silent
+data corruption. The use of fork() (or system() or other calls that
+create child processes) is strongly discouraged.
+
+The process that invoked fork was:
+
+  Local host: %s (PID %d)
+  My PE:      %d
+
+If you are *absolutely sure* that your application will successfully
+and correctly survive a call to fork(), you may disable this warning
+by setting the mpi_warn_on_fork MCA parameter to 0.
+#
+[oshmem shmem abort:cannot guarantee all killed]
+A SHMEM process is aborting at a time when it cannot guarantee that all
+of its peer processes in the job will be killed properly. You should
+double check that everything has shut down cleanly.
+
+Local host: %s
+PID:        %d
diff --git a/oshmem/runtime/oshmem_shmem_abort.c b/oshmem/runtime/oshmem_shmem_abort.c
new file mode 100644
index 0000000000..ffe12a2115
--- /dev/null
+++ b/oshmem/runtime/oshmem_shmem_abort.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#ifdef HAVE_NETDB_H
+#include <netdb.h>
+#endif
+
+#include "opal/mca/backtrace/backtrace.h"
+
+#include "orte/util/proc_info.h"
+#include "orte/runtime/runtime.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/util/name_fns.h"
+#include "orte/util/show_help.h"
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "oshmem/runtime/params.h"
+#include "oshmem/runtime/runtime.h"
+#include "oshmem/constants.h"
+#include "oshmem/proc/proc.h"
+
+static bool have_been_invoked = false;
+
+int oshmem_shmem_abort(int errcode)
+{
+    char *host, hostname[MAXHOSTNAMELEN];
+    pid_t pid = 0;
+
+    /* Protection for recursive invocation */
+    if (have_been_invoked) {
+        return OSHMEM_SUCCESS;
+    }
+    have_been_invoked = true;
+
+    /* If ORTE is initialized, use its nodename. Otherwise, call
+       gethostname. */
+
+    if (orte_initialized) {
+        host = orte_process_info.nodename;
+    } else {
+        gethostname(hostname, sizeof(hostname));
+        host = hostname;
+    }
+    pid = getpid();
+
+    orte_show_help("help-shmem-api.txt",
+                   "shmem-abort",
+                   true,
+                   ORTE_PROC_MY_NAME->vpid,
+                   pid,
+                   host,
+                   errcode);
+
+    /* Should we print a stack trace? Not aggregated because they
+       might be different on all processes. */
+    if (ompi_mpi_abort_print_stack) {
+        char **messages;
+        int len, i;
+
+        if (OSHMEM_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
+            for (i = 0; i < len; ++i) {
+                fprintf(stderr,
+                        "[%s:%d] [%d] func:%s\n",
+                        host,
+                        (int) pid,
+                        i,
+                        messages[i]);
+                fflush(stderr);
+            }
+            free(messages);
+        } else {
+            /* This will print a message if it's unable to print the
+               backtrace, so we don't need an additional "else" clause
+               if opal_backtrace_print() is not supported. */
+            opal_backtrace_print(stderr);
+        }
+    }
+
+    if (!orte_initialized || !oshmem_shmem_initialized) {
+        if (orte_show_help_is_available()) {
+            /* TODO: a help message from SHMEM, not from MPI, is needed */
+            orte_show_help("help-shmem-runtime.txt",
+                           "oshmem shmem abort:cannot guarantee all killed",
+                           true,
+                           host,
+                           (int) pid);
+        } else {
+            fprintf(stderr,
+                    "[%s:%d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
+                    host,
+                    (int) pid);
+        }
+        oshmem_shmem_aborted = true;
+        exit(errcode);
+    }
+
+    /* abort local procs in the communicator. If the communicator is
+       an intercommunicator AND the abort has explicitly requested
+       that we abort the remote procs, then do that as well. */
+
+    oshmem_shmem_aborted = true;
+    /* now that we've aborted everyone else, gracefully die. */
+
+    orte_errmgr.abort(errcode, NULL );
+
+    return OSHMEM_SUCCESS;
+}
diff --git a/oshmem/runtime/oshmem_shmem_exchange.c b/oshmem/runtime/oshmem_shmem_exchange.c
new file mode 100644
index 0000000000..17ac70e8bf
--- /dev/null
+++ b/oshmem/runtime/oshmem_shmem_exchange.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "orte/runtime/orte_globals.h"
+
+#include "ompi/communicator/communicator.h" /*TODO: ompi_communicator_t */
+#include "ompi/patterns/comm/coll_ops.h" /*TODO: comm_bcast_pml */
+
+#include "oshmem/constants.h"
+#include "oshmem/runtime/runtime.h"
+#include "oshmem/runtime/params.h"
+
+OSHMEM_DECLSPEC int oshmem_shmem_exchange_allgather(void *buf,
+                                                    int buf_size)
+{
+    int rc = OSHMEM_SUCCESS;
+    int i = 0;
+    int *ranks_in_comm = NULL;
+
+    ranks_in_comm = (int *) malloc(orte_process_info.num_procs * sizeof(int));
+    if (NULL == ranks_in_comm) {
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+
+    for (i = 0; i < (int) orte_process_info.num_procs; ++i) {
+        ranks_in_comm[i] = i;
+    }
+    void* buf_temp = malloc(buf_size);
+    if (NULL == buf_temp) {
+        free(ranks_in_comm);
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+    memcpy(buf_temp, (char *) buf + buf_size * ORTE_PROC_MY_NAME->vpid, buf_size);
+
+    rc = comm_allgather_pml( buf_temp,
+                             buf,
+                             buf_size,
+                             MPI_BYTE,
+                             ORTE_PROC_MY_NAME->vpid,
+                             orte_process_info.num_procs,
+                             ranks_in_comm,
+                             (ompi_communicator_t *) &ompi_mpi_comm_world);
+
+    if (ranks_in_comm)
+        free(ranks_in_comm);
+    if (buf_temp)
+        free(buf_temp);
+    return rc;
+}
+
+OSHMEM_DECLSPEC int oshmem_shmem_exchange_bcast(void *buf,
+                                                int buf_size,
+                                                int peer)
+{
+    int rc = OSHMEM_SUCCESS;
+    int i = 0;
+    int *ranks_in_comm = NULL;
+
+    ranks_in_comm = (int *) malloc(orte_process_info.num_procs * sizeof(int));
+    if (NULL == ranks_in_comm) {
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+
+    for (i = 0; i < (int) orte_process_info.num_procs; ++i) {
+        ranks_in_comm[i] = i;
+    }
+    rc = comm_bcast_pml((void *) buf,
+                        peer,
+                        buf_size,
+                        MPI_BYTE,
+                        ORTE_PROC_MY_NAME->vpid,
+                        orte_process_info.num_procs,
+                        ranks_in_comm,
+                        (ompi_communicator_t *) &ompi_mpi_comm_world);
+    if (ranks_in_comm)
+        free(ranks_in_comm);
+
+    return rc;
+}
diff --git a/oshmem/runtime/oshmem_shmem_finalize.c b/oshmem/runtime/oshmem_shmem_finalize.c
new file mode 100644
index 0000000000..f6ddfb0c44
--- /dev/null
+++ b/oshmem/runtime/oshmem_shmem_finalize.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#ifdef HAVE_NETDB_H
+#include <netdb.h>
+#endif
+
+#include "opal/util/output.h"
+#include "opal/runtime/opal_progress.h"
+#include "opal/mca/base/base.h"
+#include "opal/sys/atomic.h"
+#include "opal/runtime/opal.h"
+
+#include "orte/util/show_help.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/runtime/runtime.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "ompi/mca/rcache/base/base.h"
+#include "ompi/mca/mpool/base/base.h"
+#include "ompi/mca/allocator/base/base.h"
+#include "ompi/runtime/mpiruntime.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/runtime/runtime.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/base.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+#include "oshmem/runtime/params.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/proc/proc_group_cache.h"
+#include "oshmem/op/op.h"
+#include "oshmem/request/request.h"
+#include "oshmem/shmem/shmem_lock.h"
+#include "oshmem/runtime/oshmem_shmem_preconnect.h"
+
+static int __shmem_finalize(void);
+
+int oshmem_shmem_finalize(void)
+{
+    int ret = OSHMEM_SUCCESS;
+    static int32_t finalize_has_already_started = 0;
+
+    if (opal_atomic_cmpset_32(&finalize_has_already_started, 0, 1)
+            && oshmem_shmem_initialized && !oshmem_shmem_aborted) {
+        /* Should be called first because ompi_mpi_finalize performs
+         * the orte and opal finalization */
+        ret = __shmem_finalize();
+
+        if ((OSHMEM_SUCCESS == ret) && ompi_mpi_initialized
+                && !ompi_mpi_finalized) {
+            ret = ompi_mpi_finalize();
+        }
+
+        if (OSHMEM_SUCCESS == ret) {
+            oshmem_shmem_initialized = false;
+        }
+    }
+
+    return ret;
+}
+
+static int __shmem_finalize(void)
+{
+    int ret = OSHMEM_SUCCESS;
+
+    shmem_barrier_all();
+
+    shmem_lock_finalize();
+
+    /* Finalize preconnect framework */
+    if (OSHMEM_SUCCESS != (ret = oshmem_shmem_preconnect_all_finalize())) {
+        return ret;
+    }
+
+    /* free requests */
+    if (OSHMEM_SUCCESS != (ret = oshmem_request_finalize())) {
+        return ret;
+    }
+    /* must free cached groups before we kill collectives */
+    if (OSHMEM_SUCCESS != (ret = oshmem_group_cache_list_free())) {
+        return ret;
+    }
+    /* this is a special group which is not cached.
+       We can only release its collectives at this point */
+    mca_scoll_base_group_unselect(oshmem_group_all);
+
+    /* Close down MCA modules */
+
+    if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_atomic_base_framework) ) ) {
+        return ret;
+    }
+
+    if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_scoll_base_framework) ) ) {
+        return ret;
+    }
+
+    if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_memheap_base_framework) ) ) {
+        return ret;
+    }
+
+    if (OSHMEM_SUCCESS
+            != (ret =
+                MCA_SPML_CALL(del_procs(oshmem_group_all->proc_array, oshmem_group_all->proc_count)))) {
+        return ret;
+    }
+
+    /* free spml resource */
+    if (OSHMEM_SUCCESS != (ret = mca_spml_base_finalize())) {
+        return ret;
+    }
+
+    if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_spml_base_framework) ) ) {
+        return ret;
+    }
+
+    /* free op resources */
+    if (OSHMEM_SUCCESS != (ret = oshmem_op_finalize())) {
+        return ret;
+    }
+
+    /* free proc resources */
+    if (OSHMEM_SUCCESS != (ret = oshmem_proc_finalize())) {
+        return ret;
+    }
+
+    return ret;
+}
+
diff --git a/oshmem/runtime/oshmem_shmem_init.c b/oshmem/runtime/oshmem_shmem_init.c
new file mode 100644
index 0000000000..5bdba61a3f
--- /dev/null
+++ b/oshmem/runtime/oshmem_shmem_init.c
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif /* HAVE_SYS_TIME_H */
+#ifdef HAVE_PTHREAD_H
+#include <pthread.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include
+
+#include "math.h"
+#include "opal/class/opal_list.h"
+#include "opal/mca/base/base.h"
+#include "opal/runtime/opal_progress.h"
+#include "opal/threads/threads.h"
+#include "opal/util/argv.h"
+#include "opal/util/output.h"
+#include "opal/util/error.h"
+#include "opal/util/stacktrace.h"
+#include "opal/util/show_help.h"
+#include "opal/runtime/opal.h"
+
+#include "orte/util/proc_info.h"
+#include "orte/runtime/runtime.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/util/show_help.h"
+#include "orte/mca/ess/ess.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/util/name_fns.h"
+
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/mca/rcache/base/base.h"
+#include "ompi/mca/mpool/base/base.h"
+#include "ompi/mca/allocator/base/base.h"
+#include "ompi/proc/proc.h"
+#include "ompi/runtime/mpiruntime.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/runtime/runtime.h"
+#include "oshmem/runtime/params.h"
+#include "oshmem/runtime/oshmem_shmem_preconnect.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/base.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "oshmem/mca/atomic/atomic.h"
+#include "oshmem/mca/atomic/base/base.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/mca/memheap/base/base.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/proc/proc_group_cache.h"
+#include "oshmem/op/op.h"
+#include "oshmem/request/request.h"
+#include "oshmem/shmem/shmem_api_logger.h"
+
+#include "oshmem/shmem/shmem_lock.h"
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#if OPAL_CC_USE_PRAGMA_IDENT
+#pragma ident OMPI_IDENT_STRING
+#elif OPAL_CC_USE_IDENT
+#ident OSHMEM_IDENT_STRING
+#endif
+
+/*
+ * WHAT: add thread for invoking opal_progress() function
+ * WHY: SHMEM based on current ompi/trunk (by the time of integrating into
Open MPI) + * has put/get implementation via send and needs opal_progress() invocation + * on the remote side (i.e. not true one-sided operations). + */ +#define OSHMEM_OPAL_THREAD_ENABLE 0 + +const char ompi_version_string[] = OSHMEM_IDENT_STRING; + +/* + * Global variables and symbols for the MPI layer + */ + +bool oshmem_shmem_initialized = false; +bool oshmem_shmem_aborted = false; +bool oshmem_mpi_thread_multiple = false; +int oshmem_mpi_thread_requested = SHMEM_THREAD_SINGLE; +int oshmem_mpi_thread_provided = SHMEM_THREAD_SINGLE; +long *preconnect_value = 0; + +opal_thread_t *oshmem_mpi_main_thread = NULL; + +/* + * These variables are here, rather than under ompi/mpi/c/foo.c + * because it is not sufficient to have a .c file that only contains + * variables -- you must have a function that is invoked from + * elsewhere in the code to guarantee that all linkers will pull in + * the .o file from the library. Hence, although these are MPI + * constants, we might as well just define them here (i.e., in a file + * that already has a function that is guaranteed to be linked in, + * rather than make a new .c file with the constants and a + * corresponding dummy function that is invoked from this function). + * + * Additionally, there can be/are strange linking paths such that + * ompi_info needs symbols such as ompi_fortran_status_ignore, + * which, if they weren't here with a collection of other global + * symbols that are initialized (which seems to force this .o file to + * be pulled into the resolution process, because ompi_info certainly + * does not call ompi_mpi_init()), would not be able to be found by + * the OSX linker. + * + * NOTE: See the big comment in ompi/mpi/f77/constants.h about why we + * have four symbols for each of the common blocks (e.g., the Fortran + * equivalent(s) of MPI_STATUS_IGNORE). Here, we can only have *one* + * value (not four). So the only thing we can do is make it equal to + * the fortran compiler convention that was selected at configure + * time. Note that this is also true for the value of .TRUE. from the + * Fortran compiler, so even though Open MPI supports all four Fortran + * symbol conventions, it can only support one convention for the two + * C constants (MPI_FORTRAN_STATUS[ES]_IGNORE) and only support one + * compiler for the value of .TRUE. Ugh!! + * + * Note that the casts here are ok -- we're *only* comparing pointer + * values (i.e., they'll never be de-referenced). The global symbols + * are actually of type (ompi_fortran_common_t) (for alignment + * issues), but MPI says that MPI_F_STATUS[ES]_IGNORE must be of type + * (MPI_Fint*). Hence, we have to cast to make compilers not + * complain. 
+ */
+#if OMPI_WANT_F77_BINDINGS
+# if OMPI_F77_CAPS
+MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &MPI_FORTRAN_STATUS_IGNORE;
+MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &MPI_FORTRAN_STATUSES_IGNORE;
+# elif OMPI_F77_PLAIN
+MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &mpi_fortran_status_ignore;
+MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &mpi_fortran_statuses_ignore;
+# elif OMPI_F77_SINGLE_UNDERSCORE
+MPI_Fint *MPI_F_STATUS_IGNORE;
+MPI_Fint *MPI_F_STATUSES_IGNORE;
+# elif OMPI_F77_DOUBLE_UNDERSCORE
+MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &mpi_fortran_status_ignore__;
+MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &mpi_fortran_statuses_ignore__;
+# else
+# error Unrecognized Fortran 77 name mangling scheme
+# endif
+#else
+MPI_Fint *MPI_F_STATUS_IGNORE = NULL;
+MPI_Fint *MPI_F_STATUSES_IGNORE = NULL;
+#endif /* OMPI_WANT_F77_BINDINGS */
+
+/* Constants for the Fortran layer.  These values are referred to via
+   common blocks in the Fortran equivalents.  See
+   ompi/mpi/f77/constants.h for a more detailed explanation.
+
+   The values are *NOT* initialized.  We do not use the values of
+   these constants; only their addresses (because they're always
+   passed by reference by Fortran).
+
+   Initializing these upon instantiation can reveal size and/or
+   alignment differences between Fortran and C (!) which can cause
+   warnings or errors upon linking (e.g., making static libraries with
+   the intel 9.0 compilers on 64 bit platforms shows alignment
+   differences between libmpi.a and the user's application, resulting
+   in a linker warning).  FWIW, if you initialize these variables in
+   functions (i.e., not at the instantiation in the global scope), the
+   linker somehow "figures it all out" (w.r.t. different alignments
+   between Fortran common blocks and the corresponding C variables) and
+   no linker warnings occur.
+
+   Note that the rationale for the types of each of these variables is
+   discussed in ompi/include/mpif-common.h.  Do not change the types
+   without also modifying ompi/mpi/f77/constants.h and
+   ompi/include/mpif-common.h.
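+
+   As an editorial illustration (not part of the original commit), the
+   INST macro defined right below this comment expands
+
+       INST(int, MPI_FORTRAN_BOTTOM, mpi_fortran_bottom,
+            mpi_fortran_bottom_, mpi_fortran_bottom__);
+
+   into the four name-mangling variants of the same common-block symbol:
+
+       int mpi_fortran_bottom;
+       int MPI_FORTRAN_BOTTOM;
+       int mpi_fortran_bottom_;
+       int mpi_fortran_bottom__;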
+ */
+
+#define INST(type, upper_case, lower_case, single_u, double_u) \
+    type lower_case;  \
+    type upper_case;  \
+    type single_u;    \
+    type double_u
+
+INST(int,
+     MPI_FORTRAN_BOTTOM,
+     mpi_fortran_bottom,
+     mpi_fortran_bottom_,
+     mpi_fortran_bottom__);
+INST(int,
+     MPI_FORTRAN_IN_PLACE,
+     mpi_fortran_in_place,
+     mpi_fortran_in_place_,
+     mpi_fortran_in_place__);
+INST(char *,
+     MPI_FORTRAN_ARGV_NULL,
+     mpi_fortran_argv_null,
+     mpi_fortran_argv_null_,
+     mpi_fortran_argv_null__);
+INST(double,
+     MPI_FORTRAN_ARGVS_NULL,
+     mpi_fortran_argvs_null,
+     mpi_fortran_argvs_null_,
+     mpi_fortran_argvs_null__);
+INST(int *,
+     MPI_FORTRAN_ERRCODES_IGNORE,
+     mpi_fortran_errcodes_ignore,
+     mpi_fortran_errcodes_ignore_,
+     mpi_fortran_errcodes_ignore__);
+INST(int *,
+     MPI_FORTRAN_STATUS_IGNORE,
+     mpi_fortran_status_ignore,
+     mpi_fortran_status_ignore_,
+     mpi_fortran_status_ignore__);
+INST(double,
+     MPI_FORTRAN_STATUSES_IGNORE,
+     mpi_fortran_statuses_ignore,
+     mpi_fortran_statuses_ignore_,
+     mpi_fortran_statuses_ignore__);
+
+/*
+ * Hash tables for MPI_Type_create_f90* functions
+ */
+opal_hash_table_t ompi_mpi_f90_integer_hashtable;
+opal_hash_table_t ompi_mpi_f90_real_hashtable;
+opal_hash_table_t ompi_mpi_f90_complex_hashtable;
+
+static int __shmem_init(int argc, char **argv, int requested, int *provided);
+
+#if OSHMEM_OPAL_THREAD_ENABLE
+static void* shmem_opal_thread(void* argc)
+{
+/*
+ * WHAT: sleep() invocation
+ * WHY: a segfault occurs here sometimes and sleep()
+ *      reduces its likelihood
+ */
+    sleep(1);
+    while(oshmem_shmem_initialized)
+        opal_progress();
+    return NULL;
+}
+#endif
+
+int oshmem_shmem_init(int argc, char **argv, int requested, int *provided)
+{
+    int ret = OSHMEM_SUCCESS;
+
+    if (!oshmem_shmem_initialized) {
+        if (!ompi_mpi_initialized && !ompi_mpi_finalized) {
+            ret = ompi_mpi_init(argc, argv, requested, provided);
+        }
+        if (OSHMEM_SUCCESS == ret) {
+            ret = __shmem_init(argc, argv, requested, provided);
+        }
+
+        if (OSHMEM_SUCCESS == ret) {
+            oshmem_shmem_initialized = true;
+
+            MCA_MEMHEAP_CALL(get_all_mkeys());
+            oshmem_shmem_preconnect_all();
+#if OSHMEM_OPAL_THREAD_ENABLE
+            pthread_t thread_id;
+            int perr;
+            perr = pthread_create(&thread_id, NULL, &shmem_opal_thread, NULL);
+            if (perr != 0)
+            {
+                SHMEM_API_ERROR("cannot create opal thread for SHMEM");
+                return OSHMEM_ERROR;
+            }
+#endif
+        }
+    }
+
+    return ret;
+}
+
+int oshmem_shmem_preconnect_all(void)
+{
+    int mca_value = 0;
+    int rc = OSHMEM_SUCCESS;
+
+    (void) mca_base_var_register("oshmem",
+                                 "runtime",
+                                 NULL,
+                                 "preconnect_all",
+                                 "Whether to force SHMEM processes to fully "
+                                 "wire-up the connections between SHMEM "
+                                 "processes during "
+                                 "initialization (vs. making connections lazily -- "
+                                 "upon the first SHMEM traffic between each "
+                                 "process peer pair)",
+                                 MCA_BASE_VAR_TYPE_INT,
+                                 NULL,
+                                 0,
+                                 0,
+                                 OPAL_INFO_LVL_9,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &mca_value);
+
+    /* force qp creation and rkey exchange for memheap.
+       Does not force exchange of static vars */
+    if (mca_value) {
+        long val;
+        int nproc = 0;
+        int i;
+
+        val = 0xdeadbeaf;
+
+        if (!preconnect_value) {
+            rc =
+                MCA_MEMHEAP_CALL(private_alloc(sizeof(long), (void **)&preconnect_value));
+        }
+        if (!preconnect_value || (rc != OSHMEM_SUCCESS)) {
+            SHMEM_API_ERROR("shmem_preconnect_all failed");
+            return OSHMEM_ERR_OUT_OF_RESOURCE;
+        }
+        nproc = _num_pes();
+        for (i = 0; i < nproc; i++) {
+            shmem_long_p(preconnect_value, val, i);
+        }
+        shmem_fence();
+        shmem_barrier_all();
+        SHMEM_API_VERBOSE(5, "Preconnected all PEs");
+    }
+
+    return OSHMEM_SUCCESS;
+}
+
+int oshmem_shmem_preconnect_all_finalize(void)
+{
+    if (preconnect_value) {
+        MCA_MEMHEAP_CALL(private_free(preconnect_value));
+        preconnect_value = 0;
+    }
+
+    return OSHMEM_SUCCESS;
+}
+
+static int __shmem_init(int argc, char **argv, int requested, int *provided)
+{
+    int ret = OSHMEM_SUCCESS;
+    char *error = NULL;
+
+    if (OSHMEM_SUCCESS != (ret = oshmem_proc_init())) {
+        error = "oshmem_proc_init() failed";
+        goto error;
+    }
+
+    /* We need to do this anyway.
+     * This place needs to be reviewed and a more elegant way is expected
+     */
+    ompi_proc_local_proc = (ompi_proc_t*) oshmem_proc_local_proc;
+
+    if (OSHMEM_SUCCESS != (ret = oshmem_group_cache_list_init())) {
+        error = "oshmem_group_cache_list_init() failed";
+        goto error;
+    }
+
+    if (OSHMEM_SUCCESS != (ret = oshmem_op_init())) {
+        error = "oshmem_op_init() failed";
+        goto error;
+    }
+
+    if (OSHMEM_SUCCESS != (ret = mca_base_framework_open(&oshmem_spml_base_framework, 0))) {
+        error = "mca_spml_base_open() failed";
+        goto error;
+    }
+
+    if (OSHMEM_SUCCESS != (ret = mca_base_framework_open(&oshmem_scoll_base_framework, 0))) {
+        error = "mca_scoll_base_open() failed";
+        goto error;
+    }
+
+    if (OSHMEM_SUCCESS
+            != (ret = mca_spml_base_select(OMPI_ENABLE_PROGRESS_THREADS,
+                                           OMPI_ENABLE_THREAD_MULTIPLE))) {
+        error = "mca_spml_base_select() failed";
+        goto error;
+    }
+
+    if (OSHMEM_SUCCESS
+            != (ret =
+                mca_scoll_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
+                                              OMPI_ENABLE_THREAD_MULTIPLE))) {
+        error = "mca_scoll_base_find_available() failed";
+        goto error;
+    }
+
+    /* Initialize each SHMEM handle subsystem */
+    /* Initialize requests */
+    if (OSHMEM_SUCCESS != (ret = oshmem_request_init())) {
+        error = "oshmem_request_init() failed";
+        goto error;
+    }
+
+    /* identify the architectures of remote procs and setup
+     * their datatype convertors, if required
+     */
+    if (OSHMEM_SUCCESS != (ret = oshmem_proc_set_arch())) {
+        error = "oshmem_proc_set_arch failed";
+        goto error;
+    }
+
+    /* start SPML/BTL's */
+    ret = MCA_SPML_CALL(enable(true));
+    if (OSHMEM_SUCCESS != ret) {
+        error = "SPML control failed";
+        goto error;
+    }
+
+    /* There is an issue with calling add_procs() twice, so we need to use
+     * the BTL info obtained from PML add_procs() before calling SPML add_procs()
+     */
+    {
+        ompi_proc_t** procs = NULL;
+        size_t nprocs = 0;
+        procs = ompi_proc_world(&nprocs);
+        while (nprocs--) {
+            oshmem_group_all->proc_array[nprocs]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] =
+                procs[nprocs]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
+        }
+        free(procs);
+    }
+
+    ret =
+        MCA_SPML_CALL(add_procs(oshmem_group_all->proc_array, oshmem_group_all->proc_count));
+    if (OSHMEM_SUCCESS != ret) {
+        error = "SPML add procs failed";
+        goto error;
+    }
+
+    if (OSHMEM_SUCCESS != (ret = mca_base_framework_open(&oshmem_memheap_base_framework, 0))) {
+        error = "mca_memheap_base_open() failed";
+        goto error;
+    }
+
+    if (OSHMEM_SUCCESS != (ret = mca_memheap_base_select())) {
+        error = "mca_memheap_base_select() failed";
"mca_select_base_select() failed"; + goto error; + } + + if (OSHMEM_SUCCESS != (ret = mca_base_framework_open(&oshmem_atomic_base_framework, 0))) { + error = "mca_atomic_base_open() failed"; + goto error; + } + + if (OSHMEM_SUCCESS + != (ret = + mca_atomic_base_find_available(OMPI_ENABLE_PROGRESS_THREADS, + OMPI_ENABLE_THREAD_MULTIPLE))) { + error = "mca_atomic_base_find_available() failed"; + goto error; + } + + /* This call should be done after memheap initialization */ + if (OSHMEM_SUCCESS != (ret = mca_scoll_enable())) { + error = "mca_scoll_enable() failed"; + goto error; + } + + if (OSHMEM_SUCCESS != shmem_lock_init()) { + error = "shmem_lock_init() failed"; + goto error; + } + + error: if (ret != OSHMEM_SUCCESS) { + const char *err_msg = opal_strerror(ret); + orte_show_help("help-shmem-runtime.txt", + "shmem_init:startup:internal-failure", + true, + "SHMEM_INIT", + "SHMEM_INIT", + error, + err_msg, + ret); + return ret; + } + + return ret; +} + diff --git a/oshmem/runtime/oshmem_shmem_params.c b/oshmem/runtime/oshmem_shmem_params.c new file mode 100644 index 0000000000..6baf7bb9b3 --- /dev/null +++ b/oshmem/runtime/oshmem_shmem_params.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "params.h" +#include "runtime.h" +#include "oshmem/constants.h" + + +int oshmem_shmem_lock_recursive = 0; +int oshmem_shmem_api_verbose = 0; + +int oshmem_shmem_register_params(void) +{ + (void) mca_base_var_register("oshmem", + "runtime", + NULL, + "lock_recursive", + "Whether or not distributed locking support recursive calls (default = no)", + MCA_BASE_VAR_TYPE_INT, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &oshmem_shmem_lock_recursive); + + (void) mca_base_var_register("oshmem", + "runtime", + NULL, + "api_verbose", + "Verbosity level of the shmem c functions (default = 0)", + MCA_BASE_VAR_TYPE_INT, + NULL, + 0, + 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &oshmem_shmem_api_verbose); + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/runtime/oshmem_shmem_preconnect.h b/oshmem/runtime/oshmem_shmem_preconnect.h new file mode 100644 index 0000000000..0c36a3aba2 --- /dev/null +++ b/oshmem/runtime/oshmem_shmem_preconnect.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSHMEM_SHMEM_PRECONNECT_H +#define OSHMEM_SHMEM_PRECONNECT_H + +BEGIN_C_DECLS + +/** Preconnect peers */ +int oshmem_shmem_preconnect_all(void); + +/** Finalize preconnection framework*/ +int oshmem_shmem_preconnect_all_finalize(void); + +END_C_DECLS + +#endif diff --git a/oshmem/runtime/params.h b/oshmem/runtime/params.h new file mode 100644 index 0000000000..ee2acba2c5 --- /dev/null +++ b/oshmem/runtime/params.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSHMEM_RUNTIME_PARAMS_H +#define OSHMEM_RUNTIME_PARAMS_H + +#include "oshmem_config.h" + +BEGIN_C_DECLS + +/* + * Global variables + */ + +/** + * Whether an MPI_ABORT should print out a stack trace or not. 
+ */
+OSHMEM_DECLSPEC extern bool ompi_mpi_abort_print_stack;
+
+/**
+ * Whether or not the lock routines are recursive
+ * (i.e., support nested calls)
+ */
+OSHMEM_DECLSPEC extern int oshmem_shmem_lock_recursive;
+
+/**
+ * Level of shmem API verbosity
+ */
+OSHMEM_DECLSPEC extern int oshmem_shmem_api_verbose;
+
+END_C_DECLS
+
+#endif /* OSHMEM_RUNTIME_PARAMS_H */
diff --git a/oshmem/runtime/runtime.h b/oshmem/runtime/runtime.h
new file mode 100644
index 0000000000..8fc114b069
--- /dev/null
+++ b/oshmem/runtime/runtime.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ *
+ * Interface into the SHMEM portion of the Open SHMEM Run Time Environment
+ */
+
+#ifndef OSHMEM_SHMEM_RUNTIME_H
+#define OSHMEM_SHMEM_RUNTIME_H
+
+#include "oshmem_config.h"
+
+#include "opal/class/opal_list.h"
+#include "opal/class/opal_hash_table.h"
+
+#include "orte/runtime/orte_globals.h"
+
+BEGIN_C_DECLS
+
+/* Global variables and symbols for the SHMEM layer */
+
+/** Is oshmem initialized? */
+OSHMEM_DECLSPEC extern bool oshmem_shmem_initialized;
+/** Has oshmem been aborted? */
+OSHMEM_DECLSPEC extern bool oshmem_shmem_aborted;
+
+/** Do we have multiple threads? */
+OSHMEM_DECLSPEC extern bool oshmem_mpi_thread_multiple;
+/** Thread level requested to \c MPI_Init_thread() */
+OSHMEM_DECLSPEC extern int oshmem_mpi_thread_requested;
+/** Thread level provided by Open MPI */
+OSHMEM_DECLSPEC extern int oshmem_mpi_thread_provided;
+/** Identifier of the main thread */
+OSHMEM_DECLSPEC extern struct opal_thread_t *oshmem_mpi_main_thread;
+
+/*
+ * SHMEM_Init_thread constants
+ */
+enum {
+    SHMEM_THREAD_SINGLE,
+    SHMEM_THREAD_FUNNELED,
+    SHMEM_THREAD_SERIALIZED,
+    SHMEM_THREAD_MULTIPLE
+};
+
+/** Bitflags to be used for the modex exchange for the various thread
+ *  levels. Required to support heterogeneous environments */
+#define OSHMEM_THREADLEVEL_SINGLE_BF     0x00000001
+#define OSHMEM_THREADLEVEL_FUNNELED_BF   0x00000002
+#define OSHMEM_THREADLEVEL_SERIALIZED_BF 0x00000004
+#define OSHMEM_THREADLEVEL_MULTIPLE_BF   0x00000008
+
+#define OSHMEM_THREADLEVEL_SET_BITFLAG(threadlevelin,threadlevelout) {    \
+    if ( SHMEM_THREAD_SINGLE == threadlevelin ) {                         \
+        threadlevelout |= OSHMEM_THREADLEVEL_SINGLE_BF;                   \
+    } else if ( SHMEM_THREAD_FUNNELED == threadlevelin ) {                \
+        threadlevelout |= OSHMEM_THREADLEVEL_FUNNELED_BF;                 \
+    } else if ( SHMEM_THREAD_SERIALIZED == threadlevelin ) {              \
+        threadlevelout |= OSHMEM_THREADLEVEL_SERIALIZED_BF;               \
+    } else if ( SHMEM_THREAD_MULTIPLE == threadlevelin ) {                \
+        threadlevelout |= OSHMEM_THREADLEVEL_MULTIPLE_BF;                 \
+    }}
+
+#define OSHMEM_THREADLEVEL_IS_MULTIPLE(threadlevel) (threadlevel & OSHMEM_THREADLEVEL_MULTIPLE_BF)
+
+/** In ompi_mpi_init: the lists of Fortran 90 matching datatypes.
+ *  We need these lists and hashtables in order to satisfy the new
+ *  requirements introduced in MPI 2-1 Sect. 10.2.5,
+ *  MPI_TYPE_CREATE_F90_xxxx, page 295, line 47.
+ */
+extern opal_hash_table_t ompi_mpi_f90_integer_hashtable;
+extern opal_hash_table_t ompi_mpi_f90_real_hashtable;
+extern opal_hash_table_t ompi_mpi_f90_complex_hashtable;
+
+/** version string of ompi */
+OSHMEM_DECLSPEC extern const char ompi_version_string[];
+
+/**
+ * Initialize the Open SHMEM environment
+ *
+ * @param argc argc, typically from main() (IN)
+ * @param argv argv, typically from main() (IN)
+ * @param requested Thread support that is requested (IN)
+ * @param provided Thread support that is provided (OUT)
+ *
+ * @returns OSHMEM_SUCCESS if successful
+ * @returns Error code if unsuccessful
+ *
+ * Initialize all support code needed for SHMEM applications.  This
+ * function should only be called by SHMEM applications (including
+ * singletons).  If this function is called, ompi_init() and
+ * ompi_rte_init() should *not* be called.
+ *
+ * It is permissible to pass in (0, NULL) for (argc, argv).
+ */
+int oshmem_shmem_init(int argc, char **argv, int requested, int *provided);
+
+/**
+ * Finalize the Open SHMEM environment
+ *
+ * @returns OSHMEM_SUCCESS if successful
+ * @returns Error code if unsuccessful
+ *
+ */
+int oshmem_shmem_finalize(void);
+
+/**
+ * Abort SHMEM processes
+ */
+OSHMEM_DECLSPEC int oshmem_shmem_abort(int errcode);
+
+/**
+ * Exchange initial info between processes
+ */
+OSHMEM_DECLSPEC int oshmem_shmem_exchange_allgather(void *buf, int buf_size);
+
+OSHMEM_DECLSPEC int oshmem_shmem_exchange_bcast(void *buf, int buf_size, int peer);
+
+/**
+ * Register OSHMEM specific runtime parameters
+ */
+OSHMEM_DECLSPEC int oshmem_shmem_register_params(void);
+
+#if OSHMEM_PARAM_CHECK == 1
+
+#define RUNTIME_CHECK_ERROR(format, ...)                                   \
+    do {                                                                   \
+        fprintf(stderr, "[%s]%s[%s:%d:%s] ",                               \
+                orte_process_info.nodename,                                \
+                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                        \
+                __FILE__, __LINE__, __func__);                             \
+        fprintf(stderr, format, ## __VA_ARGS__);                           \
+    } while(0);
+
+/**
+ * Check if the SHMEM API generated an internal error return code
+ * Note: most of the API does not return an error code
+ */
+#define RUNTIME_CHECK_RC(x)                                                \
+    if (OPAL_UNLIKELY(OSHMEM_SUCCESS != (x)))                              \
+    {                                                                      \
+        RUNTIME_CHECK_ERROR("Internal error occurred, rc = %d\n", (x));    \
+    }
+
+/**
+ * Check if we called start_pes() and passed the initialization phase
+ */
+#define RUNTIME_CHECK_INIT()                                               \
+    if (OPAL_UNLIKELY(!oshmem_shmem_initialized))                          \
+    {                                                                      \
+        RUNTIME_CHECK_ERROR("SHMEM is not initialized\n");                 \
+        oshmem_shmem_abort(-1);                                            \
+    }
+
+/**
+ * Check if the target PE is valid
+ */
+#define RUNTIME_CHECK_PE(x)                                                \
+    if (OPAL_UNLIKELY(((x) < 0) ||                                         \
+        ((int)(x) > (int)(orte_process_info.num_procs - 1))))              \
+    {                                                                      \
+        RUNTIME_CHECK_ERROR("Target PE #%d is not in valid range\n", (x)); \
+        oshmem_shmem_abort(-1);                                            \
+    }
+
+/**
+ * Check if the required address is in the symmetric address space
+ */
+#include "oshmem/mca/memheap/memheap.h"
+#define RUNTIME_CHECK_ADDR(x)                                                        \
+    if (OPAL_UNLIKELY(!MCA_MEMHEAP_CALL(is_symmetric_addr((unsigned long)(x)))))     \
+    {                                                                                \
+        RUNTIME_CHECK_ERROR("Required address %p is not in symmetric space\n", (x)); \
+        oshmem_shmem_abort(-1);                                                      \
+    }
+
+#define RUNTIME_CHECK_WITH_MEMHEAP_SIZE(x)                                           \
+    if (OPAL_UNLIKELY((long)(x) > MCA_MEMHEAP_CALL(size)))                           \
+    {                                                                                \
+        RUNTIME_CHECK_ERROR("Requested (%ld) bytes, which exceeds the symmetric space size of (%ld) bytes\n", (long)(x), MCA_MEMHEAP_CALL(size)); \
+    }
+
+#else
+
+#define RUNTIME_CHECK_RC(x) (x = x)
+#define RUNTIME_CHECK_INIT()
+#define RUNTIME_CHECK_PE(x)
+#define RUNTIME_CHECK_ADDR(x)
+
+#endif /* OSHMEM_PARAM_CHECK */
+
+END_C_DECLS
+
+#endif /* OSHMEM_SHMEM_RUNTIME_H */
diff --git a/oshmem/shmem/Makefile.am b/oshmem/shmem/Makefile.am
new file mode 100644
index 0000000000..460a96d42e
--- /dev/null
+++ b/oshmem/shmem/Makefile.am
@@ -0,0 +1,12 @@
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+# All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+headers += shmem/shmem_api_logger.h \
+    shmem/shmem_lock.h
+dist_pkgdata_DATA += shmem/c/help-shmem-api.txt
diff --git a/oshmem/shmem/c/Makefile.am b/oshmem/shmem/c/Makefile.am
new file mode 100644
index 0000000000..e8a1d005a4
--- /dev/null
+++ b/oshmem/shmem/c/Makefile.am
@@ -0,0 +1,75 @@
+#
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+# All rights reserved
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+
+if OSHMEM_PROFILING
+    SUBDIRS = profile
+endif
+
+
+SHMEM_API_SOURCES = \
+    shmem_init.c \
+    shmem_free.c \
+    shmem_alloc.c \
+    shmem_realloc.c \
+    shmem_align.c \
+    shmem_finalize.c \
+    shmem_query.c \
+    shmem_p.c \
+    shmem_put.c \
+    shmem_g.c \
+    shmem_get.c \
+    shmem_broadcast.c \
+    shmem_collect.c \
+    shmem_ptr.c \
+    shmem_pe_accessible.c \
+    shmem_addr_accessible.c \
+    shmem_barrier.c \
+    shmem_fence.c \
+    shmem_quiet.c \
+    shmem_wait.c \
+    shmem_iget.c \
+    shmem_iput.c \
+    shmem_udcflush.c \
+    shmem_udcflush_line.c \
+    shmem_set_cache_inv.c \
+    shmem_set_cache_line_inv.c \
+    shmem_clear_cache_inv.c \
+    shmem_clear_cache_line_inv.c \
+    shmem_reduce.c \
+    shmem_swap.c \
+    shmem_cswap.c \
+    shmem_fadd.c \
+    shmem_finc.c \
+    shmem_add.c \
+    shmem_inc.c \
+    shmem_clear_lock.c \
+    shmem_set_lock.c \
+    shmem_test_lock.c \
+    shmem_lock.c
+
+
+AM_CFLAGS = $(OSHMEM_CFLAGS)
+
+AM_CPPFLAGS = -DOSHMEM_PROFILING=0
+
+noinst_LTLIBRARIES = libshmem_c.la
+
+headers =
+
+libshmem_c_la_SOURCES = $(SHMEM_API_SOURCES)
+
+if WANT_INSTALL_HEADERS
+oshmemdir = $(includedir)/openshmem/oshmem/shmem/c
+oshmem_HEADERS = $(headers)
+else
+oshmemdir = $(includedir)
+endif
+
diff --git a/oshmem/shmem/c/help-shmem-api.txt b/oshmem/shmem/c/help-shmem-api.txt
new file mode 100644
index 0000000000..6baad8fe9a
--- /dev/null
+++ b/oshmem/shmem/c/help-shmem-api.txt
@@ -0,0 +1,21 @@
+# -*- text -*-
+#
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+# All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This is the US/English general help file for Open SHMEM.
+#
+[shmem-function-after-finalize]
+Calling any SHMEM function after calling shmem_finalize is erroneous.
+%s was called.
+#
+[shmem-initialize-twice]
+Calling shmem_init twice is erroneous.
+#
+[shmem-abort]
+SHMEM_ABORT was invoked on rank %d (pid %d, host=%s) with errorcode %d.
diff --git a/oshmem/shmem/c/profile/Makefile.am b/oshmem/shmem/c/profile/Makefile.am
new file mode 100644
index 0000000000..3781e4e2c3
--- /dev/null
+++ b/oshmem/shmem/c/profile/Makefile.am
@@ -0,0 +1,96 @@
+#
+# Copyright (c) 2013 Mellanox Technologies, Inc.
+# All rights reserved
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#
+# OSHMEM_PROFILING flag is enabled when we want our shmem_* symbols
+# to be replaced by pshmem_*. In other words, this flag decides
+# whether "profile/defines.h" is included or not. "profile/defines.h"
+# replaces all shmem_* symbols with pshmem_* symbols. In this directory
+# we definitely need it to be 1.
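+#
+# As an illustrative note (based on the mappings in profile/defines.h):
+# compiling shmem_put.c with -DOSHMEM_PROFILING=1 renames its definition
+# of shmem_int_put() to pshmem_int_put(), so the shmem_int_put symbol is
+# left free for a profiling library to intercept before calling the real
+# pshmem_int_put().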
+# +AM_CPPFLAGS = -DOSHMEM_PROFILING=1 + + +noinst_LTLIBRARIES = libshmem_c_pshmem.la + +headers = defines.h + + +SHMEM_API_SOURCES = \ + shmem_init.c \ + shmem_free.c \ + shmem_alloc.c \ + shmem_realloc.c \ + shmem_align.c \ + shmem_query.c \ + shmem_p.c \ + shmem_put.c \ + shmem_g.c \ + shmem_get.c \ + shmem_broadcast.c \ + shmem_collect.c \ + shmem_ptr.c \ + shmem_pe_accessible.c \ + shmem_addr_accessible.c \ + shmem_barrier.c \ + shmem_fence.c \ + shmem_quiet.c \ + shmem_wait.c \ + shmem_iget.c \ + shmem_iput.c \ + shmem_udcflush.c \ + shmem_udcflush_line.c \ + shmem_set_cache_inv.c \ + shmem_set_cache_line_inv.c \ + shmem_clear_cache_inv.c \ + shmem_clear_cache_line_inv.c \ + shmem_reduce.c \ + shmem_swap.c \ + shmem_cswap.c \ + shmem_fadd.c \ + shmem_finc.c \ + shmem_add.c \ + shmem_inc.c \ + shmem_clear_lock.c \ + shmem_set_lock.c \ + shmem_test_lock.c + +#nodist_libshmem_c_pshmem_la_SOURCES = $(addprefix p, $(SHMEM_API_SOURCES)) +nodist_libshmem_c_pshmem_la_SOURCES = \ + $(SHMEM_API_SOURCES) + + +# +# Sym link in the sources from the real MPI directory +# +$(nodist_libshmem_c_pshmem_la_SOURCES): + if test ! -r $@ ; then \ + pname=`echo $@ | cut -b '1-'` ; \ + $(LN_S) $(top_srcdir)/oshmem/shmem/c/$$pname $@ ; \ + fi + +if WANT_INSTALL_HEADERS +oshmemdir = $(includedir)/openshmem/oshmem/shmem/c/profile +oshmem_HEADERS = $(headers) +else +oshmemdir = $(includedir) +endif + +# These files were created by targets above + +MAINTAINERCLEANFILES = $(nodist_libshmem_c_pshmem_la_SOURCES) + +# Don't want these targets in here + +tags-recursive: +tags: +TAGS: +GTAGS: +ID: diff --git a/oshmem/shmem/c/profile/defines.h b/oshmem/shmem/c/profile/defines.h new file mode 100644 index 0000000000..41b16972aa --- /dev/null +++ b/oshmem/shmem/c/profile/defines.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2013 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSHMEM_C_PROFILE_DEFINES_H +#define OSHMEM_C_PROFILE_DEFINES_H +/* + * This file is included in the top directory only if + * profiling is required. 
diff --git a/oshmem/shmem/c/profile/defines.h b/oshmem/shmem/c/profile/defines.h
new file mode 100644
index 0000000000..41b16972aa
--- /dev/null
+++ b/oshmem/shmem/c/profile/defines.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */

+#ifndef OSHMEM_C_PROFILE_DEFINES_H
+#define OSHMEM_C_PROFILE_DEFINES_H
+/*
+ * This file is included in the top directory only if profiling is
+ * required. When profiling is required, it replaces all shmem_*
+ * symbols with pshmem_* symbols.
+ */
+
+#define start_pes pstart_pes
+
+
+/*
+ * Query routines
+ */
+#define _num_pes p_num_pes
+#define _my_pe p_my_pe
+
+
+/*
+ * Accessibility routines
+ */
+#define shmem_pe_accessible pshmem_pe_accessible
+#define shmem_addr_accessible pshmem_addr_accessible
+
+/*
+ * Symmetric heap routines
+ */
+#define shmalloc pshmalloc
+#define shmemalign pshmemalign
+#define shrealloc pshrealloc
+#define shfree pshfree
+
+/*
+ * Remote pointer operations
+ */
+#define shmem_ptr pshmem_ptr
+
+/*
+ * Elemental put routines
+ */
+#define shmem_short_p pshmem_short_p
+#define shmem_int_p pshmem_int_p
+#define shmem_long_p pshmem_long_p
+#define shmem_float_p pshmem_float_p
+#define shmem_double_p pshmem_double_p
+#define shmem_longlong_p pshmem_longlong_p
+#define shmem_longdouble_p pshmem_longdouble_p
+
+/*
+ * Block data put routines
+ */
+#define shmem_char_put pshmem_char_put
+#define shmem_short_put pshmem_short_put
+#define shmem_int_put pshmem_int_put
+#define shmem_long_put pshmem_long_put
+#define shmem_float_put pshmem_float_put
+#define shmem_double_put pshmem_double_put
+#define shmem_longlong_put pshmem_longlong_put
+#define shmem_longdouble_put pshmem_longdouble_put
+#define shmem_put32 pshmem_put32
+#define shmem_put64 pshmem_put64
+#define shmem_put128 pshmem_put128
+#define shmem_putmem pshmem_putmem
+
+/*
+ * Strided put routines
+ */
+#define shmem_int_iput pshmem_int_iput
+#define shmem_short_iput pshmem_short_iput
+#define shmem_float_iput pshmem_float_iput
+#define shmem_double_iput pshmem_double_iput
+#define shmem_longlong_iput pshmem_longlong_iput
+#define shmem_longdouble_iput pshmem_longdouble_iput
+#define shmem_long_iput pshmem_long_iput
+#define shmem_iput32 pshmem_iput32
+#define shmem_iput64 pshmem_iput64
+#define shmem_iput128 pshmem_iput128
+
+/*
+ * Elemental get routines
+ */
+#define shmem_short_g pshmem_short_g
+#define shmem_int_g pshmem_int_g
+#define shmem_long_g pshmem_long_g
+#define shmem_float_g pshmem_float_g
+#define shmem_double_g pshmem_double_g
+#define shmem_longlong_g pshmem_longlong_g
+#define shmem_longdouble_g pshmem_longdouble_g
+
+/*
+ * Block data get routines
+ */
+#define shmem_char_get pshmem_char_get
+#define shmem_short_get pshmem_short_get
+#define shmem_int_get pshmem_int_get
+#define shmem_long_get pshmem_long_get
+#define shmem_float_get pshmem_float_get
+#define shmem_double_get pshmem_double_get
+#define shmem_longlong_get pshmem_longlong_get
+#define shmem_longdouble_get pshmem_longdouble_get
+#define shmem_get32 pshmem_get32
+#define shmem_get64 pshmem_get64
+#define shmem_get128 pshmem_get128
+#define shmem_getmem pshmem_getmem
+
+/*
+ * Strided get routines
+ */
+#define shmem_int_iget pshmem_int_iget
+#define shmem_short_iget pshmem_short_iget
+#define shmem_float_iget pshmem_float_iget
+#define shmem_double_iget pshmem_double_iget
+#define shmem_longlong_iget pshmem_longlong_iget
+#define shmem_longdouble_iget pshmem_longdouble_iget
+#define shmem_long_iget pshmem_long_iget
+#define shmem_iget32 pshmem_iget32
+#define shmem_iget64 pshmem_iget64
+#define shmem_iget128 pshmem_iget128
+
+/*
+ * Atomic operations
+ */
+/* Atomic swap */
+#define shmem_swap pshmem_swap
+#define shmem_double_swap pshmem_double_swap
+#define shmem_float_swap pshmem_float_swap
+#define shmem_int_swap pshmem_int_swap
+#define shmem_long_swap pshmem_long_swap
+#define shmem_longlong_swap pshmem_longlong_swap
+
+/* Atomic conditional swap */
+#define shmem_int_cswap pshmem_int_cswap
+#def
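(To check that the renaming behaves as the header comment claims: with
defines.h in force, the preprocessor rewrites a definition's name, so
"nm" on the resulting object shows the pshmem_ symbol. The body below
is a placeholder; only the emitted symbol name matters here.)

    #include "oshmem/shmem/c/profile/defines.h"

    /* Emits the symbol pshmem_swap, not shmem_swap. */
    long shmem_swap(long *target, long value, int pe)
    {
        (void)pe;                 /* placeholder body for illustration */
        long prev = *target;
        *target = value;
        return prev;
    }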