Merge pull request #7807 from bwbarrett/backports/v4.1.x-ofi-updates

Backport OFI changes from master to v4.1.x
This commit is contained in:
Brian Barrett 2020-06-17 15:07:32 -07:00 committed by GitHub
parents f334a699b7 204922fff6
commit a4e8f2b2cb
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
45 changed files: 5671 additions, 323 deletions


@ -111,3 +111,5 @@ Geoffrey Paulsen <gpaulsen@us.ibm.com> <gpaulsen@users.noreply.github.com>
Anandhi S Jayakumar <anandhi.s.jayakumar@intel.com>
Mohan Gandhi <mohgan@amazon.com>
Harumi Kuno <harumi.kuno@hpe.com>

NEWS (4 changes)

@ -60,6 +60,10 @@ included in the vX.Y.Z section and be denoted as:
4.1.0 -- July, 2020
-------------------
- OFI/libfabric: Added support for multiple NICs
- OFI/libfabric: Added support for Scalable Endpoints
- OFI/libfabric: Added btl for one-sided support
4.0.4 -- June, 2020
-----------------------
- Fix a memory patcher issue intercepting shmat and shmdt. This was


@ -116,6 +116,7 @@ libmca_orte_common_alps_so_version=70:0:30
# OPAL layer
libmca_opal_common_cuda_so_version=70:0:30
libmca_opal_common_ofi_so_version=10:0:0
libmca_opal_common_sm_so_version=70:0:30
libmca_opal_common_ucx_so_version=70:0:30
libmca_opal_common_ugni_so_version=70:0:30


@ -1,6 +1,6 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2015-2019 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2015-2020 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights
dnl reserved.
dnl $COPYRIGHT$
@ -10,6 +10,45 @@ dnl
dnl $HEADER$
dnl
dnl
dnl OPAL_CHECK_OFI_VERSION_GE
dnl
dnl Check that the OFI API version number is >= a specific value.
dnl
dnl $1: version number to compare, in the form of "major,minor"
dnl (without quotes) -- i.e., a single token representing the
dnl arguments to FI_VERSION()
dnl $2: action if OFI API version is >= $1
dnl $3: action if OFI API version is < $1
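dnl
dnl Example usage (requiring OFI API >= 1.5, as the OFI MTL's configure.m4
dnl does):
dnl
dnl   OPAL_CHECK_OFI_VERSION_GE([1,5],
dnl                             [],
dnl                             [opal_ofi_happy=no])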
AC_DEFUN([OPAL_CHECK_OFI_VERSION_GE],[
OPAL_VAR_SCOPE_PUSH([opal_ofi_ver_ge_save_CPPFLAGS opal_ofi_ver_ge_happy])
AC_MSG_CHECKING([if OFI API version number is >= $1])
opal_ofi_ver_ge_save_CPPFLAGS=$CPPFLAGS
CPPFLAGS=$opal_ofi_CPPFLAGS
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <rdma/fabric.h>]],
[[
#if !defined(FI_MAJOR_VERSION)
#error "we cannot check the version -- sad panda"
#elif FI_VERSION_LT(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION($1))
#error "version is too low -- nopes"
#endif
]])],
[opal_ofi_ver_ge_happy=1],
[opal_ofi_ver_ge_happy=0])
AS_IF([test $opal_ofi_ver_ge_happy -eq 1],
[AC_MSG_RESULT([yes])
$2],
[AC_MSG_RESULT([no])
$3])
CPPFLAGS=$opal_ofi_ver_ge_save_CPPFLAGS
OPAL_VAR_SCOPE_POP
])dnl
dnl
dnl _OPAL_CHECK_OFI
dnl --------------------------------------------------------
@ -50,10 +89,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
OPAL_CHECK_WITHDIR([ofi-libdir], [$with_ofi_libdir],
[libfabric.*])
OPAL_VAR_SCOPE_PUSH([opal_check_ofi_save_CPPFLAGS opal_check_ofi_save_LDFLAGS opal_check_ofi_save_LIBS])
OPAL_VAR_SCOPE_PUSH([opal_check_ofi_save_CPPFLAGS opal_check_ofi_save_LDFLAGS opal_check_ofi_save_LIBS opal_check_fi_info_pci])
opal_check_ofi_save_CPPFLAGS=$CPPFLAGS
opal_check_ofi_save_LDFLAGS=$LDFLAGS
opal_check_ofi_save_LIBS=$LIBS
opal_check_fi_info_pci=0
opal_ofi_happy=yes
AS_IF([test "$with_ofi" = "no"],
@ -81,6 +121,16 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
[],
[opal_ofi_happy=no])])
AS_IF([test $opal_ofi_happy = yes],
[AC_CHECK_MEMBER([struct fi_info.nic],
[opal_check_fi_info_pci=1],
[opal_check_fi_info_pci=0],
[[#include "$with_ofi/include/rdma/fabric.h"]])])
AC_DEFINE_UNQUOTED([OPAL_OFI_PCI_DATA_AVAILABLE],
[$opal_check_fi_info_pci],
[check if pci data is available in ofi])
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
LDFLAGS=$opal_check_ofi_save_LDFLAGS
LIBS=$opal_check_ofi_save_LIBS


@ -154,6 +154,7 @@ AC_SUBST(libopen_pal_so_version)
# transparently by adding some intelligence in autogen.pl
# and/or opal_mca.m4, but I don't have the cycles to do this
# right now.
AC_SUBST(libmca_opal_common_ofi_so_version)
AC_SUBST(libmca_opal_common_cuda_so_version)
AC_SUBST(libmca_opal_common_sm_so_version)
AC_SUBST(libmca_opal_common_ugni_so_version)

ompi/mca/mtl/ofi/.gitignore (5 changes, vendored, new file)

@ -0,0 +1,5 @@
mtl_ofi_improbe_opt.c
mtl_ofi_iprobe_opt.c
mtl_ofi_irecv_opt.c
mtl_ofi_isend_opt.c
mtl_ofi_send_opt.c


@ -14,21 +14,50 @@
# $HEADER$
#
EXTRA_DIST = post_configure.sh
EXTRA_DIST = post_configure.sh \
$(generated_source_modules)
MAINTAINERCLEANFILES = \
$(generated_sources)
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
dist_ompidata_DATA = help-mtl-ofi.txt
generated_source_modules = \
mtl_ofi_send_opt.pm \
mtl_ofi_isend_opt.pm \
mtl_ofi_irecv_opt.pm \
mtl_ofi_iprobe_opt.pm \
mtl_ofi_improbe_opt.pm
generated_sources = \
mtl_ofi_send_opt.c \
mtl_ofi_isend_opt.c \
mtl_ofi_irecv_opt.c \
mtl_ofi_iprobe_opt.c \
mtl_ofi_improbe_opt.c
mtl_ofi_sources = \
mtl_ofi.h \
mtl_ofi.c \
mtl_ofi_compat.h \
mtl_ofi_component.c \
mtl_ofi_endpoint.h \
mtl_ofi_endpoint.c \
mtl_ofi_request.h \
mtl_ofi_types.h
mtl_ofi.h \
mtl_ofi.c \
mtl_ofi_compat.h \
mtl_ofi_component.c \
mtl_ofi_endpoint.h \
mtl_ofi_endpoint.c \
mtl_ofi_request.h \
mtl_ofi_types.h \
mtl_ofi_opt.h \
$(generated_sources)
# A number of files are generated from macro expansion to minimize
# branches in the critical path. These files have perl modules with the suffix
# .pm that generate the corresponding .c file with all possible branches as
# their own function and symbol. Additional input
# files should be added to generated_source_modules, as well as adding
# their .c variants to generated_sources.
%.c : %.pm;
$(PERL) -I$(top_srcdir)/ompi/mca/mtl/ofi $(top_srcdir)/ompi/mca/mtl/ofi/generate-opt-funcs.pl $@
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
@ -49,6 +78,7 @@ mca_mtl_ofi_la_LDFLAGS = \
$(opal_ofi_LDFLAGS) \
-module -avoid-version
mca_mtl_ofi_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
$(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la \
$(opal_ofi_LIBS)
noinst_LTLIBRARIES = $(component_noinst)


@ -1,5 +1,5 @@
OFI MTL
OFI MTL:
--------
The OFI MTL supports Libfabric (a.k.a. Open Fabrics Interfaces OFI,
https://ofiwg.github.io/libfabric/) tagged APIs (fi_tagged(3)). At
initialization time, the MTL queries libfabric for providers supporting tag matching
@ -9,6 +9,7 @@ The user may modify the OFI provider selection with mca parameters
mtl_ofi_provider_include or mtl_ofi_provider_exclude.
PROGRESS:
---------
The MTL registers a progress function to opal_progress. There is currently
no support for asynchronous progress. The progress function reads multiple events
from the OFI provider Completion Queue (CQ) per iteration (defaults to 100, can be
@ -16,12 +17,14 @@ modified with the mca mtl_ofi_progress_event_cnt) and iterates until the
completion queue is drained.
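As a minimal sketch of the drain loop described above (illustrative only, not
the MTL's actual progress code; the batch size of 100 mirrors the default
mtl_ofi_progress_event_cnt):

    #include <rdma/fabric.h>
    #include <rdma/fi_eq.h>

    /* Read batches of tagged completions until the CQ reports no more
     * entries (fi_cq_read() returns -FI_EAGAIN when the CQ is empty). */
    static void example_drain_cq(struct fid_cq *cq)
    {
        struct fi_cq_tagged_entry wc[100];
        ssize_t n;

        do {
            n = fi_cq_read(cq, wc, 100);
            for (ssize_t i = 0; i < n; i++) {
                /* map wc[i].op_context back to its request and invoke the
                 * request's completion callback (omitted here) */
            }
        } while (n > 0);
    }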
COMPLETIONS:
------------
Each operation uses a request type ompi_mtl_ofi_request_t which includes a reference
to an operation specific completion callback, an MPI request, and a context. The
to an operation specific completion callback, an MPI request, and a context. The
context (fi_context) is used to map completion events with MPI_requests when reading the
CQ.
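A hypothetical shape for such a request is sketched below (field names are
invented for illustration and are not the MTL's actual definitions); the key
point is that the embedded fi_context is what libfabric returns as op_context
in a CQ entry, which lets the MTL recover the request:

    #include <rdma/fabric.h>          /* struct fi_context */
    #include <rdma/fi_eq.h>           /* struct fi_cq_tagged_entry */
    #include "ompi/request/request.h" /* ompi_request_t */

    struct example_ofi_request {
        struct fi_context ctx;            /* handed to / returned by OFI calls */
        int (*event_callback)(struct fi_cq_tagged_entry *wc,
                              struct example_ofi_request *req);
        struct ompi_request_t *mpi_req;   /* the MPI-level request */
    };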
OFI TAG:
--------
MPI needs to send 96 bits of information per message (32-bit communicator id,
32-bit source rank, 32-bit MPI tag) but OFI only offers 64-bit tags. In
addition, the OFI MTL uses 2 bits of the OFI tag for the synchronous send protocol.
@ -67,3 +70,271 @@ This is signaled in mem_tag_format (see fi_endpoint(3)) by setting higher order
to zero. In such cases, the OFI MTL will reduce the number of communicator ids supported
by reducing the bits available for the communicator ID field in the OFI tag.
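To make the bit budget concrete, here is a purely illustrative packing for the
FI_REMOTE_CQ_DATA case, where the source rank travels in the completion data
rather than in the tag (the field widths are assumptions for this example, not
the MTL's actual MTL_OFI_* layout):

    #include <stdint.h>

    #define EX_PROTO_BITS 2                              /* sync-send protocol */
    #define EX_CID_BITS   24                             /* assumed CID width  */
    #define EX_TAG_BITS   (64 - EX_PROTO_BITS - EX_CID_BITS)

    /* Pack a communicator ID and MPI tag into a 64-bit OFI tag, leaving
     * the top EX_PROTO_BITS free for the protocol bits. */
    static inline uint64_t ex_pack_ofi_tag(uint32_t cid, uint32_t tag)
    {
        return ((uint64_t)cid << EX_TAG_BITS) |
               ((uint64_t)tag & ((1ULL << EX_TAG_BITS) - 1));
    }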
SCALABLE ENDPOINTS:
-------------------
The OFI MTL supports the OFI Scalable Endpoints (SEP) feature as a means to improve
multi-threaded application throughput and message rate. Currently the feature
is designed to utilize multiple TX/RX contexts exposed by the OFI provider in
conjunction with a multi-communicator MPI application model. Therefore, new OFI
contexts are created lazily, as and when communicators are duplicated, instead
of creating them all at once at init time; this approach also favours creating
only as many contexts as needed.
1. Multi-communicator model:
With this approach, the MPI application is required to first duplicate
the communicators it wants to use with MPI operations (ideally creating
as many communicators as the number of threads it wants to use to call
into MPI). The duplicated communicators are then used by the
corresponding threads to perform MPI operations. A possible usage
scenario could be in an MPI + OMP application as follows
(example limited to 2 ranks):
MPI_Comm dup_comm[n];
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
for (i = 0; i < n; i++) {
MPI_Comm_dup(MPI_COMM_WORLD, &dup_comm[i]);
}
if (rank == 0) {
#pragma omp parallel for private(host_sbuf, host_rbuf) num_threads(n)
for (i = 0; i < n ; i++) {
MPI_Send(host_sbuf, MYBUFSIZE, MPI_CHAR,
1, MSG_TAG, dup_comm[i]);
MPI_Recv(host_rbuf, MYBUFSIZE, MPI_CHAR,
1, MSG_TAG, dup_comm[i], &status);
}
} else if (rank == 1) {
#pragma omp parallel for private(status, host_sbuf, host_rbuf) num_threads(n)
for (i = 0; i < n ; i++) {
MPI_Recv(host_rbuf, MYBUFSIZE, MPI_CHAR,
0, MSG_TAG, dup_comm[i], &status);
MPI_Send(host_sbuf, MYBUFSIZE, MPI_CHAR,
0, MSG_TAG, dup_comm[i]);
}
}
2. MCA variables:
To utilize the feature, the following MCA variables need to be set:
mtl_ofi_enable_sep:
This MCA variable needs to be set to enable the use of the Scalable Endpoints
(SEP) feature in the OFI MTL. The underlying provider is also checked to ensure
the feature is supported. If the chosen provider does not support it, the user
needs to either set this variable to 0 or select a different provider that
supports the feature.
For single-threaded applications one OFI context is sufficient, so OFI SEPs
may not add benefit.
Note that mtl_ofi_thread_grouping (see below) needs to be enabled to use the
different OFI SEP contexts. Otherwise, only one context (ctxt 0) will be used.
Default: 0
Command-line syntax:
"-mca mtl_ofi_enable_sep 1"
mtl_ofi_thread_grouping:
Turn Thread Grouping feature on. This is needed to use the Multi-communicator
model explained above. This means that the OFI MTL will use the communicator
ID to decide the SEP contexts to be used by the thread. In this way, each
thread will have direct access to different OFI resources. If disabled,
only context 0 will be used.
Requires mtl_ofi_enable_sep to be set to 1.
Default: 0
Setting this MCA variable is not recommended for:
- Multi-threaded MPI applications that do not follow the multi-communicator approach.
- Applications that have multiple threads using a single communicator, as
this may degrade performance.
Command-line syntax:
"-mca mtl_ofi_thread_grouping 1"
mtl_ofi_num_ctxts:
This MCA variable allows the user to set the number of OFI SEP contexts the
application expects to use. For multi-threaded applications using Thread
Grouping feature, this number should be set to the number of user threads
that will call into MPI. This variable will only have effect if
mtl_ofi_enable_sep is set to 1.
Default: 1
Command-line syntax:
"-mca mtl_ofi_num_ctxts N" [ N: number of OFI contexts required by
application ]
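Putting the three variables together, a hypothetical launch of a multi-threaded
job that enables SEP, Thread Grouping, and 4 contexts could look like this (the
application name and process count are placeholders):
    mpirun -np 2 -mca pml cm -mca mtl ofi \
           -mca mtl_ofi_enable_sep 1 \
           -mca mtl_ofi_thread_grouping 1 \
           -mca mtl_ofi_num_ctxts 4 ./my_app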
3. Notes on performance:
- The OFI MTL will create as many TX/RX contexts as set by the MCA variable mtl_ofi_num_ctxts.
The number of contexts that can be created is also limited by the underlying
provider as each provider may have different thresholds. Once the threshold
is exceeded, contexts are used in a round-robin fashion which leads to
resource sharing among threads. Therefore locks are required to guard
against race conditions. For performance, it is recommended to have
Number of threads = Number of communicators = Number of contexts
For example, when using PSM2 provider, the number of contexts is dictated
by the Intel Omni-Path HFI1 driver module.
- The OPAL layer allows multiple threads to enter progress simultaneously. To
enable this feature, the user needs to set the MCA variable
"max_thread_in_progress". When using Thread Grouping feature, it is
recommended to set this MCA parameter to the number of threads expected to
call into MPI as it provides performance benefits.
Command-line syntax:
"-mca opal_max_thread_in_progress N" [ N: number of threads expected to
make MPI calls ]
Default: 1
- For applications using a single thread with multiple communicators and MCA
variable "mtl_ofi_thread_grouping" set to 1, the MTL will use multiple
contexts, but the benefits may be negligible as only one thread is driving
progress.
SPECIALIZED FUNCTIONS:
-------------------
To improve performance when calling message passing APIs in the OFI MTL,
specialized functions are generated at compile time that eliminate all of the
if conditionals that can be determined at init time and do not need to be
queried again in the critical path. These functions are generated during make
by perl scripts, which emit a function and symbol for every combination of
flags for each function.
1. ADDING NEW FLAGS FOR SPECIALIZATION OF EXISTING FUNCTION:
To add a new flag to an existing specialized function (for handling cases
where different OFI providers may or may not support a particular feature),
follow these steps:
1) Update the "_generic" function in mtl_ofi.h with the new flag and
the if conditionals to read the new value.
2) Update the *.pm file corresponding to the function with the new flag in:
gen_funcs(), gen_*_function(), & gen_*_sym_init()
3) Update mtl_ofi_opt.h with:
The new flag as #define NEW_FLAG_TYPES #NUMBER_OF_STATES
example: #define OFI_CQ_DATA 2 (only has TRUE/FALSE states)
Update the function's types with:
#define OMPI_MTL_OFI_FUNCTION_TYPES [NEW_FLAG_TYPES]
2. ADDING A NEW FUNCTION FOR SPECIALIZATION:
To add a new function to be specialized you must
follow these steps:
1) Create a new mtl_ofi_"function_name"_opt.pm based off opt_common/mtl_ofi_opt.pm.template
2) Add new .pm file to generated_source_modules in Makefile.am
3) Add .c file to generated_sources in Makefile.am named the same as the corresponding .pm file
4) Update existing or create function in mtl_ofi.h to _generic with new flags.
5) Update mtl_ofi_opt.h with:
a) New function types: #define OMPI_MTL_OFI_FUNCTION_TYPES [FLAG_TYPES]
b) Add new function to the struct ompi_mtl_ofi_symtable:
struct ompi_mtl_ofi_symtable {
...
int (*ompi_mtl_ofi_FUNCTION OMPI_MTL_OFI_FUNCTION_TYPES )
}
c) Add new symbol table init function definition:
void ompi_mtl_ofi_FUNCTION_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
6) Add calls to init the new function in the symbol table and assign the function
pointer to be used based off the flags in mtl_ofi_component.c:
ompi_mtl_ofi_FUNCTION_symtable_init(&ompi_mtl_ofi.sym_table);
ompi_mtl_ofi.base.mtl_FUNCTION =
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_FUNCTION[ompi_mtl_ofi.flag];
3. EXAMPLE SPECIALIZED FILE:
The code below is an example of what is generated by the specialization
scripts for use in the OFI mtl. This code specializes the blocking
send functionality based on FI_REMOTE_CQ_DATA & OFI Scalable Endpoint support
provided by an OFI Provider. Only one function and symbol is used during
runtime based on if FI_REMOTE_CQ_DATA is supported and/or if OFI Scalable
Endpoint support is enabled.
/*
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "mtl_ofi.h"
__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_send_false_false(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode)
{
const bool OFI_CQ_DATA = false;
const bool OFI_SCEP_EPS = false;
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
convertor, mode,
OFI_CQ_DATA, OFI_SCEP_EPS);
}
__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_send_false_true(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode)
{
const bool OFI_CQ_DATA = false;
const bool OFI_SCEP_EPS = true;
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
convertor, mode,
OFI_CQ_DATA, OFI_SCEP_EPS);
}
__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_send_true_false(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode)
{
const bool OFI_CQ_DATA = true;
const bool OFI_SCEP_EPS = false;
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
convertor, mode,
OFI_CQ_DATA, OFI_SCEP_EPS);
}
__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_send_true_true(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode)
{
const bool OFI_CQ_DATA = true;
const bool OFI_SCEP_EPS = true;
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
convertor, mode,
OFI_CQ_DATA, OFI_SCEP_EPS);
}
void ompi_mtl_ofi_send_symtable_init(struct ompi_mtl_ofi_symtable* sym_table)
{
sym_table->ompi_mtl_ofi_send[false][false]
= ompi_mtl_ofi_send_false_false;
sym_table->ompi_mtl_ofi_send[false][true]
= ompi_mtl_ofi_send_false_true;
sym_table->ompi_mtl_ofi_send[true][false]
= ompi_mtl_ofi_send_true_false;
sym_table->ompi_mtl_ofi_send[true][true]
= ompi_mtl_ofi_send_true_true;
}
###


@ -2,7 +2,7 @@
#
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved
#
# Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2014-2020 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
@ -28,6 +28,12 @@ AC_DEFUN([MCA_ompi_mtl_ofi_CONFIG],[
# Check for OFI
OPAL_CHECK_OFI
# The OFI MTL requires at least OFI libfabric v1.5.
AS_IF([test "$opal_ofi_happy" = "yes"],
[OPAL_CHECK_OFI_VERSION_GE([1,5],
[],
[opal_ofi_happy=no])])
AS_IF([test "$opal_ofi_happy" = "yes"],
[$1],
[$2])

ompi/mca/mtl/ofi/generate-opt-funcs.pl (62 changes, new file)

@ -0,0 +1,62 @@
#!/usr/bin/env perl
#
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
use strict;
use warnings;
use mtl_ofi_send_opt;
use mtl_ofi_isend_opt;
use mtl_ofi_irecv_opt;
use mtl_ofi_iprobe_opt;
use mtl_ofi_improbe_opt;
use opt_common::mtl_ofi_opt_common;
my $MTL_OFI_HEADER =
'/*
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "mtl_ofi.h"';
my $specialization_file = $ARGV[0];
my $specialization_type = $specialization_file;
$specialization_type =~ s{\.[^.]+$}{};
my $sym_table_type = $specialization_type;
$sym_table_type =~ s/_opt//g;
open my $gen_file, ">", $specialization_file;
#
# Generate the Specialized functions & symbol table for the specified file.
#
print $gen_file "$MTL_OFI_HEADER\n\n";
my $GEN_FUNC = $specialization_type . "::gen_funcs\(\$gen_file, \"FUNC\"\)";
my $GEN_SYM = $specialization_type . "::gen_funcs\(\$gen_file, \"SYM\"\)";
my $SYM_TABLE = "ompi_" . $sym_table_type . "_symtable";
eval $GEN_FUNC;
my $SYM_FUNC_HEADER = opt_common::mtl_ofi_opt_common::gen_sym_function_header($SYM_TABLE);
print $gen_file "$SYM_FUNC_HEADER\n";
eval $GEN_SYM;
my $SYM_FUNC_FOOTER = opt_common::mtl_ofi_opt_common::gen_sym_function_footer();
print $gen_file "$SYM_FUNC_FOOTER\n\n";
close($gen_file);
exit(0);
###


@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# Copyright (c) 2017 Cisco Systems, Inc. All rights reserved
# $COPYRIGHT$
@ -16,5 +16,64 @@ unusual; your job may behave unpredictably (and/or abort) after this.
Local host: %s
Location: %s:%d
Error: %s (%zd)
#
[Not enough bits for CID]
OFI provider "%s" does not have enough free bits in its tag to fit the MPI
Communicator ID. See the mem_tag_format of the provider by running:
fi_info -v -p %s
Local host: %s
Location: %s:%d
[SEP unavailable]
Scalable Endpoint feature is enabled by the user but it is not supported by
%s provider. Try disabling this feature or use a different provider that
supports it using mtl_ofi_provider_include.
Local host: %s
Location: %s:%d
[SEP required]
Scalable Endpoint feature is required for Thread Grouping feature to work.
Please try enabling Scalable Endpoints using mtl_ofi_enable_sep.
Local host: %s
Location: %s:%d
[SEP thread grouping ctxt limit]
Reached limit (%d) for number of OFI contexts set by mtl_ofi_num_ctxts.
Please set mtl_ofi_num_ctxts to a larger value if you need more contexts.
If an MPI application creates more communicators than mtl_ofi_num_ctxts,
OFI MTL will make the new communicators re-use existing contexts in
round-robin fashion which will impact performance.
Local host: %s
Location: %s:%d
[Local ranks exceed ofi contexts]
The number of local ranks exceeds the number of available OFI contexts in the %s
provider, so we cannot provision enough contexts for each rank. Try disabling
the Scalable Endpoint feature.
Local host: %s
Location: %s:%d
[Ctxts exceeded available]
The user requested more contexts than the provider makes available. Limiting
to the max allowed (%d). Contexts will be reused in a round-robin fashion if there
are more threads than available contexts.
Local host: %s
Location: %s:%d
[modex failed]
The OFI MTL was not able to find endpoint information for a remote
endpoint. Most likely, this means that the remote process was unable
to initialize the Libfabric NIC correctly. This error is not
recoverable and your application is likely to abort.
Local host: %s
Remote host: %s
Error: %s (%d)
[message too big]
Message size %llu bigger than supported by selected transport. Max = %llu


@ -23,12 +23,12 @@ mca_mtl_ofi_module_t ompi_mtl_ofi = {
ompi_mtl_ofi_del_procs,
ompi_mtl_ofi_finalize,
ompi_mtl_ofi_send,
ompi_mtl_ofi_isend,
ompi_mtl_ofi_irecv,
ompi_mtl_ofi_iprobe,
ompi_mtl_ofi_imrecv,
ompi_mtl_ofi_improbe,
NULL,
NULL,
NULL,
NULL,
ompi_mtl_ofi_imrecv,
NULL,
ompi_mtl_ofi_cancel,
ompi_mtl_ofi_add_comm,
@ -98,9 +98,10 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
(void**)&ep_name,
&size);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: modex_recv failed: %d\n",
__FILE__, __LINE__, ret);
opal_show_help("help-mtl-ofi.txt", "modex failed",
true, ompi_process_info.nodename,
procs[i]->super.proc_hostname,
opal_strerror(ret), ret);
goto bail;
}
memcpy(&ep_names[i*namelen], ep_name, namelen);

The diff for this file is not shown because it is too large.


@ -15,6 +15,7 @@
#include "mtl_ofi.h"
#include "opal/util/argv.h"
#include "opal/util/printf.h"
#include "opal/mca/common/ofi/common_ofi.h"
static int ompi_mtl_ofi_component_open(void);
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
@ -33,6 +34,11 @@ static int data_progress;
static int av_type;
static int ofi_tag_mode;
#if OPAL_HAVE_THREAD_LOCAL
opal_thread_local int per_thread_ctx;
opal_thread_local struct fi_cq_tagged_entry wc[MTL_OFI_MAX_PROG_EVENT_COUNT];
#endif
/*
* Enumerators
*/
@ -142,8 +148,8 @@ ompi_mtl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&prov_exclude);
ompi_mtl_ofi.ofi_progress_event_count = 100;
asprintf(&desc, "Max number of events to read each call to OFI progress (default: %d events will be read per OFI progress call)", ompi_mtl_ofi.ofi_progress_event_count);
ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
opal_asprintf(&desc, "Max number of events to read each call to OFI progress (default: %d events will be read per OFI progress call)", ompi_mtl_ofi.ofi_progress_event_count);
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
"progress_event_cnt",
desc,
@ -229,6 +235,38 @@ ompi_mtl_ofi_component_register(void)
&av_type);
OBJ_RELEASE(new_enum);
ompi_mtl_ofi.enable_sep = 0;
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
"enable_sep",
"Enable SEP feature",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_ofi.enable_sep);
ompi_mtl_ofi.thread_grouping = 0;
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
"thread_grouping",
"Enable/Disable Thread Grouping feature",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_ofi.thread_grouping);
/*
* Default Policy: Create 1 context and let user ask for more for
* multi-threaded workloads. User needs to ask for as many contexts as the
* number of threads that are anticipated to make MPI calls.
*/
ompi_mtl_ofi.num_ofi_contexts = 1;
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
"num_ctxts",
"Specify number of OFI contexts to create",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_ofi.num_ofi_contexts);
return OMPI_SUCCESS;
}
@ -242,8 +280,7 @@ ompi_mtl_ofi_component_open(void)
ompi_mtl_ofi.domain = NULL;
ompi_mtl_ofi.av = NULL;
ompi_mtl_ofi.cq = NULL;
ompi_mtl_ofi.ep = NULL;
ompi_mtl_ofi.sep = NULL;
/**
* Sanity check: provider_include and provider_exclude must be mutually
@ -304,21 +341,12 @@ is_in_list(char **list, char *item)
}
static struct fi_info*
select_ofi_provider(struct fi_info *providers)
select_ofi_provider(struct fi_info *providers,
char **include_list, char **exclude_list)
{
char **include_list = NULL;
char **exclude_list = NULL;
struct fi_info *prov = providers;
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
__FILE__, __LINE__, prov_include);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
__FILE__, __LINE__, prov_exclude);
if (NULL != prov_include) {
include_list = opal_argv_split(prov_include, ',');
if (NULL != include_list) {
while ((NULL != prov) &&
(!is_in_list(include_list, prov->fabric_attr->prov_name))) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
@ -327,8 +355,7 @@ select_ofi_provider(struct fi_info *providers)
prov->fabric_attr->prov_name);
prov = prov->next;
}
} else if (NULL != prov_exclude) {
exclude_list = opal_argv_split(prov_exclude, ',');
} else if (NULL != exclude_list) {
while ((NULL != prov) &&
(is_in_list(exclude_list, prov->fabric_attr->prov_name))) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
@ -339,14 +366,33 @@ select_ofi_provider(struct fi_info *providers)
}
}
opal_argv_free(include_list);
opal_argv_free(exclude_list);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:prov: %s\n",
__FILE__, __LINE__,
(prov ? prov->fabric_attr->prov_name : "none"));
/* The initial fi_getinfo() call will return a list of providers
* available for this process. once a provider is selected from the
* list, we will cycle through the remaining list to identify NICs
* serviced by this provider, and try to pick one on the same NUMA
* node as this process. If there are no NICs on the same NUMA node,
* we pick one in a manner which allows all ranks to make balanced
* use of available NICs on the system.
*
* Most providers give a separate fi_info object for each NIC,
* however some may have multiple info objects with different
* attributes for the same NIC. The initial provider attributes
* are used to ensure that all NICs we return provide the same
* capabilities as the initial one.
*/
if (NULL != prov) {
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider: %s\n",
__FILE__, __LINE__,
(prov ? prov->domain_attr->name : "none"));
}
return prov;
}
@ -387,10 +433,10 @@ ompi_mtl_ofi_check_fi_remote_cq_data(int fi_version,
}
static void
ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
switch (ofi_tag_mode) {
case MTL_OFI_TAG_1:
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_1 ) - 1);
*bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_1;
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_1 - 1)) - 1);
ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_1;
@ -405,7 +451,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_1;
break;
case MTL_OFI_TAG_2:
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_2 ) - 1);
*bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_2;
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_2 - 1)) - 1);
ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_2;
@ -420,7 +466,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_2;
break;
default: /* use FI_REMOTE_CQ_DATA */
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_DATA ) - 1);
*bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_DATA;
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_DATA - 1)) - 1);
ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_DATA;
@ -431,19 +477,182 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
}
}
#define MTL_OFI_ALLOC_COMM_TO_CONTEXT(arr_size) \
do { \
ompi_mtl_ofi.comm_to_context = calloc(arr_size, sizeof(int)); \
if (OPAL_UNLIKELY(!ompi_mtl_ofi.comm_to_context)) { \
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
"%s:%d: alloc of comm_to_context array failed: %s\n",\
__FILE__, __LINE__, strerror(errno)); \
return ret; \
} \
} while (0);
#define MTL_OFI_ALLOC_OFI_CTXTS() \
do { \
ompi_mtl_ofi.ofi_ctxt = (mca_mtl_ofi_context_t *) malloc(ompi_mtl_ofi.num_ofi_contexts * \
sizeof(mca_mtl_ofi_context_t)); \
if (OPAL_UNLIKELY(!ompi_mtl_ofi.ofi_ctxt)) { \
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
"%s:%d: alloc of ofi_ctxt array failed: %s\n", \
__FILE__, __LINE__, strerror(errno)); \
return ret; \
} \
} while(0);
static int ompi_mtl_ofi_init_sep(struct fi_info *prov, int universe_size)
{
int ret = OMPI_SUCCESS, num_ofi_ctxts;
struct fi_av_attr av_attr = {0};
prov->ep_attr->tx_ctx_cnt = prov->ep_attr->rx_ctx_cnt =
ompi_mtl_ofi.num_ofi_contexts;
ret = fi_scalable_ep(ompi_mtl_ofi.domain, prov, &ompi_mtl_ofi.sep, NULL);
if (0 != ret) {
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_scalable_ep",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
return ret;
}
ompi_mtl_ofi.rx_ctx_bits = 0;
while (ompi_mtl_ofi.num_ofi_contexts >> ++ompi_mtl_ofi.rx_ctx_bits);
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
av_attr.rx_ctx_bits = ompi_mtl_ofi.rx_ctx_bits;
av_attr.count = ompi_mtl_ofi.num_ofi_contexts * universe_size;
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
if (0 != ret) {
MTL_OFI_LOG_FI_ERR(ret, "fi_av_open failed");
return ret;
}
ret = fi_scalable_ep_bind(ompi_mtl_ofi.sep, (fid_t)ompi_mtl_ofi.av, 0);
if (0 != ret) {
MTL_OFI_LOG_FI_ERR(ret, "fi_bind AV-EP failed");
return ret;
}
/*
* If SEP supported and Thread Grouping feature enabled, use
* num_ofi_contexts + 2. The extra 2 items accommodate Open MPI contextid
* numbering: COMM_WORLD is 0, COMM_SELF is 1. Other user-created
* communicator contextid values are assigned sequentially starting with 3.
*/
num_ofi_ctxts = ompi_mtl_ofi.thread_grouping ?
ompi_mtl_ofi.num_ofi_contexts + 2 : 1;
MTL_OFI_ALLOC_COMM_TO_CONTEXT(num_ofi_ctxts);
ompi_mtl_ofi.total_ctxts_used = 0;
ompi_mtl_ofi.threshold_comm_context_id = 0;
/* Allocate memory for OFI contexts */
MTL_OFI_ALLOC_OFI_CTXTS();
return ret;
}
static int ompi_mtl_ofi_init_regular_ep(struct fi_info * prov, int universe_size)
{
int ret = OMPI_SUCCESS;
struct fi_av_attr av_attr = {0};
struct fi_cq_attr cq_attr = {0};
cq_attr.format = FI_CQ_FORMAT_TAGGED;
cq_attr.size = ompi_mtl_ofi.ofi_progress_event_count;
/* Override any user defined setting */
ompi_mtl_ofi.num_ofi_contexts = 1;
ret = fi_endpoint(ompi_mtl_ofi.domain, /* In: Domain object */
prov, /* In: Provider */
&ompi_mtl_ofi.sep, /* Out: Endpoint object */
NULL); /* Optional context */
if (0 != ret) {
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_endpoint",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
return ret;
}
/**
* Create the objects that will be bound to the endpoint.
* The objects include:
* - address vector and completion queues
*/
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
av_attr.count = universe_size;
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
if (ret) {
MTL_OFI_LOG_FI_ERR(ret, "fi_av_open failed");
return ret;
}
ret = fi_ep_bind(ompi_mtl_ofi.sep,
(fid_t)ompi_mtl_ofi.av,
0);
if (0 != ret) {
MTL_OFI_LOG_FI_ERR(ret, "fi_bind AV-EP failed");
return ret;
}
MTL_OFI_ALLOC_COMM_TO_CONTEXT(1);
/* Allocate memory for OFI contexts */
MTL_OFI_ALLOC_OFI_CTXTS();
ompi_mtl_ofi.ofi_ctxt[0].tx_ep = ompi_mtl_ofi.sep;
ompi_mtl_ofi.ofi_ctxt[0].rx_ep = ompi_mtl_ofi.sep;
ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.ofi_ctxt[0].cq, NULL);
if (ret) {
MTL_OFI_LOG_FI_ERR(ret, "fi_cq_open failed");
return ret;
}
/* Bind CQ to endpoint object */
ret = fi_ep_bind(ompi_mtl_ofi.sep, (fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq,
FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
if (0 != ret) {
MTL_OFI_LOG_FI_ERR(ret, "fi_bind CQ-EP failed");
return ret;
}
return ret;
}
static mca_mtl_base_module_t*
ompi_mtl_ofi_component_init(bool enable_progress_threads,
bool enable_mpi_threads)
{
int ret, fi_version;
struct fi_info *hints;
int num_local_ranks, sep_support_in_provider, max_ofi_ctxts;
int ofi_tag_leading_zeros, ofi_tag_bits_for_cid;
char **include_list = NULL;
char **exclude_list = NULL;
struct fi_info *hints, *hints_dup = NULL;
struct fi_info *providers = NULL;
struct fi_info *prov = NULL;
struct fi_info *prov_cq_data = NULL;
struct fi_cq_attr cq_attr = {0};
struct fi_av_attr av_attr = {0};
char ep_name[FI_NAME_MAX] = {0};
size_t namelen;
int universe_size;
char *univ_size_str;
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
__FILE__, __LINE__, prov_include);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
__FILE__, __LINE__, prov_exclude);
if (NULL != prov_include) {
include_list = opal_argv_split(prov_include, ',');
} else if (NULL != prov_exclude) {
exclude_list = opal_argv_split(prov_exclude, ',');
}
/**
* Hints to filter providers
@ -462,15 +671,23 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
__FILE__, __LINE__);
goto error;
}
/* Make sure to get a RDM provider that can do the tagged matching
interface and local communication and remote communication. */
hints->mode = FI_CONTEXT;
hints->ep_attr->type = FI_EP_RDM; /* Reliable datagram */
hints->caps = FI_TAGGED; /* Tag matching interface */
hints->ep_attr->type = FI_EP_RDM;
hints->caps = FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM;
hints->tx_attr->msg_order = FI_ORDER_SAS;
hints->rx_attr->msg_order = FI_ORDER_SAS;
hints->rx_attr->op_flags = FI_COMPLETION;
hints->tx_attr->op_flags = FI_COMPLETION;
hints->domain_attr->threading = FI_THREAD_UNSPEC;
if (enable_mpi_threads) {
ompi_mtl_ofi.mpi_thread_multiple = true;
hints->domain_attr->threading = FI_THREAD_SAFE;
} else {
ompi_mtl_ofi.mpi_thread_multiple = false;
hints->domain_attr->threading = FI_THREAD_DOMAIN;
}
switch (control_progress) {
case MTL_OFI_PROG_AUTO:
@ -506,8 +723,59 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
* FI_VERSION provides binary backward and forward compatibility support
* Specify the version of OFI is coded to, the provider will select struct
* layouts that are compatible with this version.
*
* Note: API version 1.5 is the first version that supports
* FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need
* that checking -- e.g., some providers are suitable for RXD or
* RXM, but can't provide local communication).
*/
fi_version = FI_VERSION(1, 0);
fi_version = FI_VERSION(1, 5);
/**
* The EFA provider in Libfabric versions prior to 1.10 contains a bug
* where the FI_LOCAL_COMM and FI_REMOTE_COMM capabilities are not
* advertised. However, we know that this provider supports both local and
* remote communication. We must exclude these capability bits in order to
* select EFA when we are using a version of Libfabric with this bug.
*
* Call fi_getinfo() without those capabilities and specifically ask for
* the EFA provider. This is safe to do as EFA is only supported on Amazon
* EC2 and EC2 only supports EFA and TCP-based networks. We'll also skip
* this logic if the user specifies an include list without EFA or adds EFA
* to the exclude list.
*/
if ((include_list && is_in_list(include_list, "efa")) ||
(exclude_list && !is_in_list(exclude_list, "efa"))) {
hints_dup = fi_dupinfo(hints);
hints_dup->caps &= ~(FI_LOCAL_COMM | FI_REMOTE_COMM);
hints_dup->fabric_attr->prov_name = strdup("efa");
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, &providers);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: EFA specific fi_getinfo(): %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
if (FI_ENODATA == -ret) {
/**
* EFA is not available so fall through to call fi_getinfo() again
* with the local/remote capabilities set.
*/
fi_freeinfo(hints_dup);
hints_dup = NULL;
} else if (0 != ret) {
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_getinfo",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
goto error;
} else {
fi_freeinfo(hints);
hints = hints_dup;
hints_dup = NULL;
goto select_prov;
}
}
/**
* fi_getinfo: returns information about fabric services for reaching a
@ -520,6 +788,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
0ULL, /* Optional flag */
hints, /* In: Hints to filter providers */
&providers); /* Out: List of matching providers */
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getinfo(): %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
if (FI_ENODATA == -ret) {
// It is not an error if no information is returned.
goto error;
@ -531,10 +804,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
goto error;
}
select_prov:
/**
* Select a provider from the list returned by fi_getinfo().
*/
prov = select_ofi_provider(providers);
prov = select_ofi_provider(providers, include_list, exclude_list);
if (!prov) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: select_ofi_provider: no provider found\n",
@ -542,6 +816,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
goto error;
}
opal_argv_free(include_list);
include_list = NULL;
opal_argv_free(exclude_list);
exclude_list = NULL;
/**
* Select the format of the OFI tag
*/
@ -558,7 +837,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
ompi_mtl_ofi.fi_cq_data = false;
if (MTL_OFI_TAG_AUTO == ofi_tag_mode) {
/* Fallback to MTL_OFI_TAG_1 */
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1);
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1, &ofi_tag_bits_for_cid);
} else { /* MTL_OFI_TAG_FULL */
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: OFI provider %s does not support FI_REMOTE_CQ_DATA\n",
@ -569,15 +848,92 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
/* Use FI_REMOTE_CQ_DATA */
ompi_mtl_ofi.fi_cq_data = true;
prov = prov_cq_data;
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_FULL);
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_FULL, &ofi_tag_bits_for_cid);
}
} else { /* MTL_OFI_TAG_1 or MTL_OFI_TAG_2 */
ompi_mtl_ofi.fi_cq_data = false;
ompi_mtl_ofi_define_tag_mode(ofi_tag_mode);
ompi_mtl_ofi_define_tag_mode(ofi_tag_mode, &ofi_tag_bits_for_cid);
}
/**
* Initialize the MTL OFI Symbol Tables & function pointers
* for specialized functions.
*/
ompi_mtl_ofi_send_symtable_init(&ompi_mtl_ofi.sym_table);
ompi_mtl_ofi.base.mtl_send =
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_send[ompi_mtl_ofi.fi_cq_data];
ompi_mtl_ofi_isend_symtable_init(&ompi_mtl_ofi.sym_table);
ompi_mtl_ofi.base.mtl_isend =
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_isend[ompi_mtl_ofi.fi_cq_data];
ompi_mtl_ofi_irecv_symtable_init(&ompi_mtl_ofi.sym_table);
ompi_mtl_ofi.base.mtl_irecv =
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_irecv[ompi_mtl_ofi.fi_cq_data];
ompi_mtl_ofi_iprobe_symtable_init(&ompi_mtl_ofi.sym_table);
ompi_mtl_ofi.base.mtl_iprobe =
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_iprobe[ompi_mtl_ofi.fi_cq_data];
ompi_mtl_ofi_improbe_symtable_init(&ompi_mtl_ofi.sym_table);
ompi_mtl_ofi.base.mtl_improbe =
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_improbe[ompi_mtl_ofi.fi_cq_data];
/**
* Check for potential bits in the OFI tag that providers may be reserving
* for internal usage (see mem_tag_format in fi_endpoint man page).
*/
ofi_tag_leading_zeros = 0;
while (!((prov->ep_attr->mem_tag_format << ofi_tag_leading_zeros++) &
(uint64_t) MTL_OFI_HIGHEST_TAG_BIT) &&
/* Do not keep looping if the provider does not support enough bits */
(ofi_tag_bits_for_cid >= MTL_OFI_MINIMUM_CID_BITS)){
ofi_tag_bits_for_cid--;
}
if (ofi_tag_bits_for_cid < MTL_OFI_MINIMUM_CID_BITS) {
opal_show_help("help-mtl-ofi.txt", "Not enough bits for CID", true,
prov->fabric_attr->prov_name,
prov->fabric_attr->prov_name,
ompi_process_info.nodename, __FILE__, __LINE__);
goto error;
}
/* Update the maximum supported Communicator ID */
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << ofi_tag_bits_for_cid) - 1);
ompi_mtl_ofi.num_peers = 0;
/* Check if Scalable Endpoints can be enabled for the provider */
sep_support_in_provider = 0;
if ((prov->domain_attr->max_ep_tx_ctx > 1) ||
(prov->domain_attr->max_ep_rx_ctx > 1)) {
sep_support_in_provider = 1;
}
if (1 == ompi_mtl_ofi.enable_sep) {
if (0 == sep_support_in_provider) {
opal_show_help("help-mtl-ofi.txt", "SEP unavailable", true,
prov->fabric_attr->prov_name,
ompi_process_info.nodename, __FILE__, __LINE__);
goto error;
} else if (1 == sep_support_in_provider) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: Scalable EP supported in %s provider. Enabling in MTL.\n",
__FILE__, __LINE__, prov->fabric_attr->prov_name);
}
} else {
/*
* Scalable Endpoints is required for Thread Grouping feature
*/
if (1 == ompi_mtl_ofi.thread_grouping) {
opal_show_help("help-mtl-ofi.txt", "SEP required", true,
ompi_process_info.nodename, __FILE__, __LINE__);
goto error;
}
}
/**
* Open fabric
* The getinfo struct returns a fabric attribute struct that can be used to
@ -612,25 +968,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
goto error;
}
/**
* Create a transport level communication endpoint. To use the endpoint,
* it must be bound to completion counters or event queues and enabled,
* and the resources consumed by it, such as address vectors, counters,
* completion queues, etc.
* see man fi_endpoint for more details.
*/
ret = fi_endpoint(ompi_mtl_ofi.domain, /* In: Domain object */
prov, /* In: Provider */
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
NULL); /* Optional context */
if (0 != ret) {
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_endpoint",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
goto error;
}
/**
* Save the maximum sizes.
*/
@ -638,76 +975,79 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size;
/**
* Create the objects that will be bound to the endpoint.
* The objects include:
* - completion queue for events
* - address vector of other endpoint addresses
* - dynamic memory-spanning memory region
* The user is not allowed to exceed MTL_OFI_MAX_PROG_EVENT_COUNT.
* The reason is that the progress entries array is now a TLS variable,
* rather than being allocated on the heap, for thread-safety purposes.
*/
cq_attr.format = FI_CQ_FORMAT_TAGGED;
if (ompi_mtl_ofi.ofi_progress_event_count > MTL_OFI_MAX_PROG_EVENT_COUNT) {
ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
}
/**
* If a user has set an ofi_progress_event_count > the default, then
* the CQ size hint is set to the user's desired value such that
* the CQ created will have enough slots to store up to
* ofi_progress_event_count events. If a user has not set the
* ofi_progress_event_count, then the provider is trusted to set a
* default high CQ size and the CQ size hint is left unspecified.
* Create a transport level communication endpoint. To use the endpoint,
* it must be bound to the resources consumed by it such as address
* vectors, completion counters or event queues etc, and enabled.
* See man fi_endpoint for more details.
*/
if (ompi_mtl_ofi.ofi_progress_event_count > 100) {
cq_attr.size = ompi_mtl_ofi.ofi_progress_event_count;
}
ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.cq, NULL);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_cq_open failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_av_open failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Bind the CQ and AV to the endpoint object.
/* use the universe size as a rough guess on the address vector
* size hint that should be passed to fi_av_open(). For regular
* endpoints, the count will be the universe size. For scalable
* endpoints, the count will be the universe size multiplied by
* the number of contexts. In either case, if the universe grows
* (via dynamic processes), the count is a hint, not a hard limit,
* so libfabric will just be slightly less efficient.
*/
ret = fi_ep_bind(ompi_mtl_ofi.ep,
(fid_t)ompi_mtl_ofi.cq,
FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_bind CQ-EP failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
univ_size_str = getenv("OMPI_UNIVERSE_SIZE");
if (NULL == univ_size_str ||
(universe_size = strtol(univ_size_str, NULL, 0)) <= 0) {
universe_size = ompi_proc_world_size();
}
if (1 == ompi_mtl_ofi.enable_sep) {
max_ofi_ctxts = (prov->domain_attr->max_ep_tx_ctx <
prov->domain_attr->max_ep_rx_ctx) ?
prov->domain_attr->max_ep_tx_ctx :
prov->domain_attr->max_ep_rx_ctx;
num_local_ranks = 1 + ompi_process_info.num_local_peers;
if (max_ofi_ctxts <= num_local_ranks) {
opal_show_help("help-mtl-ofi.txt", "Local ranks exceed ofi contexts",
true, prov->fabric_attr->prov_name,
ompi_process_info.nodename, __FILE__, __LINE__);
goto error;
}
/* Provision enough contexts to service all ranks in a node */
max_ofi_ctxts /= num_local_ranks;
/*
* If num ctxts user specified is more than max allowed, limit to max
* and start round-robining. Print warning to user.
*/
if (max_ofi_ctxts < ompi_mtl_ofi.num_ofi_contexts) {
opal_show_help("help-mtl-ofi.txt", "Ctxts exceeded available",
true, max_ofi_ctxts,
ompi_process_info.nodename, __FILE__, __LINE__);
ompi_mtl_ofi.num_ofi_contexts = max_ofi_ctxts;
}
ret = ompi_mtl_ofi_init_sep(prov, universe_size);
} else {
ret = ompi_mtl_ofi_init_regular_ep(prov, universe_size);
}
if (OMPI_SUCCESS != ret) {
goto error;
}
ret = fi_ep_bind(ompi_mtl_ofi.ep,
(fid_t)ompi_mtl_ofi.av,
0);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_bind AV-EP failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
ompi_mtl_ofi.total_ctxts_used = 0;
ompi_mtl_ofi.threshold_comm_context_id = 0;
/**
* Enable the endpoint for communication
* This commits the bind operations.
*/
ret = fi_enable(ompi_mtl_ofi.ep);
/* Enable Endpoint for communication */
ret = fi_enable(ompi_mtl_ofi.sep);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_enable failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
MTL_OFI_LOG_FI_ERR(ret, "fi_enable failed");
goto error;
}
@ -725,11 +1065,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
* Get our address and publish it with modex.
*/
namelen = sizeof(ep_name);
ret = fi_getname((fid_t)ompi_mtl_ofi.ep, &ep_name[0], &namelen);
ret = fi_getname((fid_t)ompi_mtl_ofi.sep,
&ep_name[0],
&namelen);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getname failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
MTL_OFI_LOG_FI_ERR(ret, "fi_getname failed");
goto error;
}
@ -751,20 +1091,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
*/
ompi_mtl_ofi.any_addr = FI_ADDR_UNSPEC;
/**
* Activate progress callback.
*/
ret = opal_progress_register(ompi_mtl_ofi_progress_no_inline);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: opal_progress_register failed: %d\n",
__FILE__, __LINE__, ret);
goto error;
}
return &ompi_mtl_ofi.base;
error:
if (include_list) {
opal_argv_free(include_list);
}
if (exclude_list) {
opal_argv_free(exclude_list);
}
if (providers) {
(void) fi_freeinfo(providers);
}
@ -774,14 +1109,20 @@ error:
if (hints) {
(void) fi_freeinfo(hints);
}
if (hints_dup) {
(void) fi_freeinfo(hints_dup);
}
if (ompi_mtl_ofi.sep) {
(void) fi_close((fid_t)ompi_mtl_ofi.sep);
}
if (ompi_mtl_ofi.av) {
(void) fi_close((fid_t)ompi_mtl_ofi.av);
}
if (ompi_mtl_ofi.cq) {
(void) fi_close((fid_t)ompi_mtl_ofi.cq);
}
if (ompi_mtl_ofi.ep) {
(void) fi_close((fid_t)ompi_mtl_ofi.ep);
if ((0 == ompi_mtl_ofi.enable_sep) &&
ompi_mtl_ofi.ofi_ctxt != NULL &&
ompi_mtl_ofi.ofi_ctxt[0].cq) {
/* Check if CQ[0] was created for non-SEP case and close if needed */
(void) fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq);
}
if (ompi_mtl_ofi.domain) {
(void) fi_close((fid_t)ompi_mtl_ofi.domain);
@ -789,6 +1130,12 @@ error:
if (ompi_mtl_ofi.fabric) {
(void) fi_close((fid_t)ompi_mtl_ofi.fabric);
}
if (ompi_mtl_ofi.comm_to_context) {
free(ompi_mtl_ofi.comm_to_context);
}
if (ompi_mtl_ofi.ofi_ctxt) {
free(ompi_mtl_ofi.ofi_ctxt);
}
return NULL;
}
@ -801,11 +1148,7 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
/* Close all the OFI objects */
if ((ret = fi_close((fid_t)ompi_mtl_ofi.ep))) {
goto finalize_err;
}
if ((ret = fi_close((fid_t)ompi_mtl_ofi.cq))) {
if ((ret = fi_close((fid_t)ompi_mtl_ofi.sep))) {
goto finalize_err;
}
@ -813,6 +1156,18 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
goto finalize_err;
}
if (0 == ompi_mtl_ofi.enable_sep) {
/*
* CQ[0] is bound to the SEP object when SEP is not supported by a
* provider. OFI spec requires that we close the Endpoint that is bound
* to the CQ before closing the CQ itself. So, for the non-SEP case, we
* handle the closing of CQ[0] here.
*/
if ((ret = fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq))) {
goto finalize_err;
}
}
if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) {
goto finalize_err;
}
@ -821,6 +1176,10 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
goto finalize_err;
}
/* Free memory allocated for TX/RX contexts */
free(ompi_mtl_ofi.comm_to_context);
free(ompi_mtl_ofi.ofi_ctxt);
return OMPI_SUCCESS;
finalize_err:


@ -1,5 +1,7 @@
/*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
@ -11,11 +13,9 @@
#ifndef OMPI_MTL_OFI_ENDPOINT_H
#define OMPI_MTL_OFI_ENDPOINT_H
BEGIN_C_DECLS
#include "ompi/mca/pml/pml.h"
extern int ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
size_t nprocs,
struct ompi_proc_t **procs);
BEGIN_C_DECLS
OBJ_CLASS_DECLARATION(mca_mtl_ofi_endpoint_t);
@ -38,10 +38,12 @@ struct mca_mtl_ofi_endpoint_t {
typedef struct mca_mtl_ofi_endpoint_t mca_mtl_ofi_endpoint_t;
static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc)
static inline mca_mtl_ofi_endpoint_t *
ompi_mtl_ofi_get_endpoint(struct mca_mtl_base_module_t* mtl,
ompi_proc_t *ompi_proc)
{
if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) {
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc))) {
if (OPAL_UNLIKELY(OMPI_SUCCESS != MCA_PML_CALL(add_procs(&ompi_proc, 1)))) {
/* Fatal error. exit() out */
opal_output(0, "%s:%d: *** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__);

ompi/mca/mtl/ofi/mtl_ofi_improbe_opt.pm (73 changes, new file)

@ -0,0 +1,73 @@
#!/usr/bin/env perl
#
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
use strict;
use warnings;
use opt_common::mtl_ofi_opt_common;
package mtl_ofi_improbe_opt;
my @true_false = ("false", "true");
sub gen_funcs {
my $gen_file = $_[0];
my $gen_type = $_[1];
my $OFI_CQ_DATA_EN = "false";
foreach $OFI_CQ_DATA_EN (@true_false) {
my @flags = ($OFI_CQ_DATA_EN);
if (($gen_type cmp "FUNC") == 0) {
my $FUNC = gen_improbe_function(\@flags);
print $gen_file "$FUNC\n\n";
}
if (($gen_type cmp "SYM") == 0) {
my $SYM = gen_improbe_sym_init(\@flags);
print $gen_file "$SYM\n";
}
}
}
sub gen_improbe_function {
my @op_flags = @{$_[0]};
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
my $OFI_CQ_DATA_EN = $op_flags[0];
my $IMPROBE_FUNCTION =
"__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_improbe_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
int *matched,
struct ompi_message_t **message,
struct ompi_status_public_t *status)
{
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
return ompi_mtl_ofi_improbe_generic(mtl, comm, src, tag,
matched, message, status,
OFI_CQ_DATA);
}";
return $IMPROBE_FUNCTION;
}
sub gen_improbe_sym_init {
my @op_flags = @{$_[0]};
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_improbe_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
my $OFI_CQ_DATA_EN = $op_flags[0];
my $symbol_init =
"
sym_table->ompi_mtl_ofi_improbe[".$OFI_CQ_DATA_EN."]
= ".$MTL_OFI_FUNC_NAME.";
";
return $symbol_init;
}
1;

ompi/mca/mtl/ofi/mtl_ofi_iprobe_opt.pm (72 changes, new file)

@ -0,0 +1,72 @@
#!/usr/bin/env perl
#
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
use strict;
use warnings;
use opt_common::mtl_ofi_opt_common;
package mtl_ofi_iprobe_opt;
my @true_false = ("false", "true");
sub gen_funcs {
my $gen_file = $_[0];
my $gen_type = $_[1];
my $OFI_CQ_DATA_EN = "false";
foreach $OFI_CQ_DATA_EN (@true_false) {
my @flags = ($OFI_CQ_DATA_EN);
if (($gen_type cmp "FUNC") == 0) {
my $FUNC = gen_iprobe_function(\@flags);
print $gen_file "$FUNC\n\n";
}
if (($gen_type cmp "SYM") == 0) {
my $SYM = gen_iprobe_sym_init(\@flags);
print $gen_file "$SYM\n";
}
}
}
sub gen_iprobe_function {
my @op_flags = @{$_[0]};
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
my $OFI_CQ_DATA_EN = $op_flags[0];
my $IPROBE_FUNCTION =
"__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_iprobe_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
int *flag,
struct ompi_status_public_t *status)
{
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
return ompi_mtl_ofi_iprobe_generic(mtl, comm, src, tag,
flag, status,
OFI_CQ_DATA);
}";
return $IPROBE_FUNCTION;
}
sub gen_iprobe_sym_init {
my @op_flags = @{$_[0]};
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_iprobe_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
my $OFI_CQ_DATA_EN = $op_flags[0];
my $symbol_init =
"
sym_table->ompi_mtl_ofi_iprobe[".$OFI_CQ_DATA_EN."]
= ".$MTL_OFI_FUNC_NAME.";
";
return $symbol_init;
}
1;

ompi/mca/mtl/ofi/mtl_ofi_irecv_opt.pm (72 changes, new file)

@ -0,0 +1,72 @@
#!/usr/bin/env perl
#
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
use strict;
use warnings;
use opt_common::mtl_ofi_opt_common;
package mtl_ofi_irecv_opt;
my @true_false = ("false", "true");
sub gen_funcs {
my $gen_file = $_[0];
my $gen_type = $_[1];
my $OFI_CQ_DATA_EN = "false";
foreach $OFI_CQ_DATA_EN (@true_false) {
my @flags = ($OFI_CQ_DATA_EN);
if (($gen_type cmp "FUNC") == 0) {
my $FUNC = gen_irecv_function(\@flags);
print $gen_file "$FUNC\n\n";
}
if (($gen_type cmp "SYM") == 0) {
my $SYM = gen_irecv_sym_init(\@flags);
print $gen_file "$SYM\n";
}
}
}
sub gen_irecv_function {
my @op_flags = @{$_[0]};
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
my $OFI_CQ_DATA_EN = $op_flags[0];
my $IRECV_FUNCTION =
"__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_irecv_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
struct opal_convertor_t *convertor,
mca_mtl_request_t *mtl_request)
{
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
return ompi_mtl_ofi_irecv_generic(mtl, comm, src, tag,
convertor, mtl_request,
OFI_CQ_DATA);
}";
return $IRECV_FUNCTION;
}
sub gen_irecv_sym_init {
my @op_flags = @{$_[0]};
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_irecv_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
my $OFI_CQ_DATA_EN = $op_flags[0];
my $symbol_init =
"
sym_table->ompi_mtl_ofi_irecv[".$OFI_CQ_DATA_EN."]
= ".$MTL_OFI_FUNC_NAME.";
";
return $symbol_init;
}
1;

74
ompi/mca/mtl/ofi/mtl_ofi_isend_opt.pm Normal file

@@ -0,0 +1,74 @@
#!/usr/bin/env perl
#
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
use strict;
use warnings;
use opt_common::mtl_ofi_opt_common;
package mtl_ofi_isend_opt;
my @true_false = ("false", "true");
sub gen_funcs {
my $gen_file = $_[0];
my $gen_type = $_[1];
my $OFI_CQ_DATA_EN = "false";
foreach $OFI_CQ_DATA_EN (@true_false) {
my @flags = ($OFI_CQ_DATA_EN);
if (($gen_type cmp "FUNC") == 0) {
my $FUNC = gen_isend_function(\@flags);
print $gen_file "$FUNC\n\n";
}
if (($gen_type cmp "SYM") == 0) {
my $SYM = gen_isend_sym_init(\@flags);
print $gen_file "$SYM\n";
}
}
}
sub gen_isend_function {
my @op_flags = @{$_[0]};
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
my $OFI_CQ_DATA_EN = $op_flags[0];
my $ISEND_FUNCTION =
"__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_isend_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode,
bool blocking,
mca_mtl_request_t *mtl_request)
{
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
return ompi_mtl_ofi_isend_generic(mtl, comm, dest, tag,
convertor, mode, blocking,
mtl_request, OFI_CQ_DATA);
}";
return $ISEND_FUNCTION;
}
sub gen_isend_sym_init {
my @op_flags = @{$_[0]};
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_isend_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
my $OFI_CQ_DATA_EN = $op_flags[0];
my $symbol_init =
"
sym_table->ompi_mtl_ofi_isend[".$OFI_CQ_DATA_EN."]
= ".$MTL_OFI_FUNC_NAME.";
";
return $symbol_init;
}
1;

77
ompi/mca/mtl/ofi/mtl_ofi_opt.h Normal file

@@ -0,0 +1,77 @@
/*
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MTL_OFI_OPT_H
#define MTL_OFI_OPT_H
#include "mtl_ofi.h"
BEGIN_C_DECLS
#define CQ_DATA_TYPES 2
#define OMPI_MTL_OFI_SEND_TYPES [CQ_DATA_TYPES]
#define OMPI_MTL_OFI_ISEND_TYPES [CQ_DATA_TYPES]
#define OMPI_MTL_OFI_IRECV_TYPES [CQ_DATA_TYPES]
#define OMPI_MTL_OFI_IPROBE_TYPES [CQ_DATA_TYPES]
#define OMPI_MTL_OFI_IMPROBE_TYPES [CQ_DATA_TYPES]
struct ompi_mtl_ofi_symtable {
int (*ompi_mtl_ofi_send OMPI_MTL_OFI_SEND_TYPES )
(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode);
int (*ompi_mtl_ofi_isend OMPI_MTL_OFI_ISEND_TYPES )
(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode,
bool blocking,
mca_mtl_request_t *mtl_request);
int (*ompi_mtl_ofi_irecv OMPI_MTL_OFI_IRECV_TYPES )
(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
struct opal_convertor_t *convertor,
mca_mtl_request_t *mtl_request);
int (*ompi_mtl_ofi_iprobe OMPI_MTL_OFI_IPROBE_TYPES )
(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
int *flag,
struct ompi_status_public_t *status);
int (*ompi_mtl_ofi_improbe OMPI_MTL_OFI_IMPROBE_TYPES )
(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
int *matched,
struct ompi_message_t **message,
struct ompi_status_public_t *status);
};
/**
* MTL OFI specialization function symbol table init
*/
void ompi_mtl_ofi_send_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
void ompi_mtl_ofi_isend_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
void ompi_mtl_ofi_irecv_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
void ompi_mtl_ofi_iprobe_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
void ompi_mtl_ofi_improbe_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
END_C_DECLS
#endif /* MTL_OFI_OPT_H */

71
ompi/mca/mtl/ofi/mtl_ofi_send_opt.pm Normal file

@@ -0,0 +1,71 @@
#!/usr/bin/env perl
#
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
use strict;
use warnings;
use opt_common::mtl_ofi_opt_common;
package mtl_ofi_send_opt;
my @true_false = ("false", "true");
sub gen_funcs {
my $gen_file = $_[0];
my $gen_type = $_[1];
my $OFI_CQ_DATA_EN = "false";
foreach $OFI_CQ_DATA_EN (@true_false) {
my @flags = ($OFI_CQ_DATA_EN);
if (($gen_type cmp "FUNC") == 0) {
my $FUNC = gen_send_function(\@flags);
print $gen_file "$FUNC\n\n";
}
if (($gen_type cmp "SYM") == 0) {
my $SYM = gen_send_sym_init(\@flags);
print $gen_file "$SYM\n";
}
}
}
sub gen_send_function {
my @op_flags = @{$_[0]};
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
my $OFI_CQ_DATA_EN = $op_flags[0];
my $SEND_FUNCTION =
"__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_send_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode)
{
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
convertor, mode,
OFI_CQ_DATA);
}";
return $SEND_FUNCTION;
}
sub gen_send_sym_init {
my @op_flags = @{$_[0]};
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_send_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
my $OFI_CQ_DATA_EN = $op_flags[0];
my $symbol_init =
"
sym_table->ompi_mtl_ofi_send[".$OFI_CQ_DATA_EN."]
= ".$MTL_OFI_FUNC_NAME.";
";
return $symbol_init;
}
1;


@@ -19,6 +19,19 @@ BEGIN_C_DECLS
/**
* MTL Module Interface
*/
typedef struct mca_mtl_ofi_context_t {
/* Transmit and receive contexts */
struct fid_ep *tx_ep;
struct fid_ep *rx_ep;
/* Completion queue */
struct fid_cq *cq;
/* Thread locking */
opal_mutex_t context_lock;
} mca_mtl_ofi_context_t;
typedef struct mca_mtl_ofi_module_t {
mca_mtl_base_module_t base;
@@ -31,11 +44,19 @@ typedef struct mca_mtl_ofi_module_t {
/** Address vector handle */
struct fid_av *av;
/** Completion queue handle */
struct fid_cq *cq;
/* Multi-threaded Application flag */
bool mpi_thread_multiple;
/** Endpoint to communicate on */
struct fid_ep *ep;
/* Scalable Endpoint attributes */
struct fid_ep *sep; /* Endpoint object */
mca_mtl_ofi_context_t *ofi_ctxt; /* OFI contexts */
int threshold_comm_context_id; /* Set threshold communicator ID */
int *comm_to_context; /* Map communicator ID to context */
int rx_ctx_bits; /* Bits used for RX context */
int total_ctxts_used; /* Total number of contexts used */
int enable_sep; /* MCA to enable/disable SEP feature */
int thread_grouping; /* MCA for thread grouping feature */
int num_ofi_contexts; /* MCA for number of contexts to use */
/** Endpoint name length */
size_t epnamelen;
@@ -71,6 +92,9 @@ typedef struct mca_mtl_ofi_module_t {
unsigned long long sync_send_ack;
unsigned long long sync_proto_mask;
/** Optimized function Symbol Tables **/
struct ompi_mtl_ofi_symtable sym_table;
} mca_mtl_ofi_module_t;
extern mca_mtl_ofi_module_t ompi_mtl_ofi;
@@ -80,6 +104,19 @@ typedef struct mca_mtl_ofi_component_t {
mca_mtl_base_component_2_0_0_t super;
} mca_mtl_ofi_component_t;
typedef enum {
OFI_REGULAR_EP = 0,
OFI_SCALABLE_EP,
} mca_mtl_ofi_ep_type;
/*
* Define the upper limit for the number of events read from a CQ.
* This is set to 100, which was deemed optimal from empirical data.
* If one wants to read fewer events from the CQ, the MCA
* variable can be used.
*/
#define MTL_OFI_MAX_PROG_EVENT_COUNT 100
/*OFI TAG:
* Define 3 different OFI tag distributions:
* 1) Support FI_REMOTE_CQ_DATA: No need for source rank in the tag
@@ -89,12 +126,15 @@ typedef struct mca_mtl_ofi_component_t {
* More details of the tags are in the README file (mtl_ofi_tag_mode).
*/
#define MTL_OFI_MINIMUM_CID_BITS (8)
/* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum)
* 01234567 01234567 01234567 012345  67  01234567 01234567 01234567 01234567
*                                   |    |
*          context_id              |prot|            message tag
*/
#define MTL_OFI_PROTO_BIT_COUNT (2)
#define MTL_OFI_HIGHEST_TAG_BIT (0x8000000000000000ULL)
#define MTL_OFI_CID_MASK_DATA (0xFFFFFFFC00000000ULL)
#define MTL_OFI_CID_BIT_COUNT_DATA (30)
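/* Illustrative only (not part of the original header): with the masks above,
 * the 64-bit match bits for the FI_REMOTE_CQ_DATA case would be packed
 * roughly as
 *
 *   match_bits = ((uint64_t) context_id << (64 - MTL_OFI_CID_BIT_COUNT_DATA))
 *              | ((uint64_t) protocol   << 32)
 *              | (uint32_t)  mpi_tag;
 *
 * i.e. 30 bits of communicator ID, 2 protocol bits, and a 32-bit message tag.
 */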


@@ -0,0 +1,66 @@
#!/usr/bin/env perl
#
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
use strict;
use warnings;
use opt_common::mtl_ofi_opt_common;
package mtl_ofi_#INSERT FUNCTION NAME HERE#_opt;
my @en_dis = (0, 1);
sub gen_funcs {
my $gen_file = $_[0];
my $gen_type = $_[1];
my $#INSERT FLAG NAME HERE# = 0;
foreach $#INSERT FLAG NAME HERE# (@en_dis) {
my @flags = ($#INSERT FLAG NAME HERE#);
if (($gen_type cmp "FUNC") == 0) {
my $FUNC = gen_#INSERT FUNCTION NAME HERE#_function(\@flags);
print $gen_file "$FUNC\n\n";
}
if (($gen_type cmp "SYM") == 0) {
my $SYM = gen_#INSERT FUNCTION NAME HERE#_sym_init(\@flags);
print $gen_file "$SYM\n";
}
}
}
sub gen_#INSERT FUNCTION NAME HERE#_function {
my @op_flags = @{$_[0]};
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
my $#INSERT FLAG NAME HERE#_EN = $op_flags[0];
my $FUNCTION =
"__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_#INSERT FUNCTION NAME HERE#_" . $MTL_OFI_NAME_EXT . "(#INSERT FUNCTION ARGS HERE#)
{
const int $#INSERT FLAG NAME HERE# = " . $#INSERT FLAG NAME HERE#_EN . ";
return ompi_mtl_ofi_#INSERT FUNCTION NAME HERE#_generic(#INSERT FUNCTION ARGS HERE#,
#INSERT FLAG NAME HERE#);
}";
return $FUNCTION;
}
sub gen_#INSERT FUNCTION NAME HERE#_sym_init {
my @op_flags = @{$_[0]};
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_#INSERT FUNCTION NAME HERE#_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
my $#INSERT FLAG NAME HERE#_EN = $op_flags[0];
my $symbol_init =
"
sym_table->ompi_mtl_ofi_#INSERT FUNCTION NAME HERE#[".$#INSERT FLAG NAME HERE#_EN."]
= ".$MTL_OFI_FUNC_NAME.";
";
return $symbol_init;
}
1;


@@ -0,0 +1,54 @@
#!/usr/bin/env perl
#
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
use strict;
use warnings;
package opt_common::mtl_ofi_opt_common;
#
# Generate the extension for functions and symbols based off the flags.
#
sub gen_flags_ext {
my $OP_FLAGS = "";
my @name_flags = @{$_[0]};
my $num_flags = $#name_flags;
for my $flag (@name_flags) {
$OP_FLAGS = $OP_FLAGS . $flag;
if ($num_flags--) {
$OP_FLAGS = $OP_FLAGS . '_';
}
}
return $OP_FLAGS;
}
#
# Generate the header for the specialized symbol table init function.
#
sub gen_sym_function_header {
my $MTL_OFI_SYM_TYPE = $_[0];
my $header =
"void ".$MTL_OFI_SYM_TYPE."_init(struct ompi_mtl_ofi_symtable *sym_table)
{";
return $header;
}
###
#
# Generate the footer for the specialized symbol table init function.
#
sub gen_sym_function_footer {
my $footer =
"}";
return $footer;
}
###
1;

67
opal/mca/btl/ofi/Makefile.am Normal file

@@ -0,0 +1,67 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2018 Intel, inc. All rights reserved
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#dist_opaldata_DATA = help-mpi-btl-ofi.txt
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
sources = \
btl_ofi.h \
btl_ofi_component.c \
btl_ofi_endpoint.h \
btl_ofi_endpoint.c \
btl_ofi_module.c \
btl_ofi_rdma.h \
btl_ofi_rdma.c \
btl_ofi_atomics.c \
btl_ofi_frag.c \
btl_ofi_frag.h \
btl_ofi_context.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_opal_btl_ofi_DSO
lib =
lib_sources =
component = mca_btl_ofi.la
component_sources = $(sources)
else
lib = libmca_btl_ofi.la
lib_sources = $(sources)
component =
component_sources =
endif
mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component)
mca_btl_ofi_la_SOURCES = $(component_sources)
mca_btl_ofi_la_LDFLAGS = -module -avoid-version \
$(opal_ofi_LDFLAGS)
mca_btl_ofi_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
$(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la \
$(opal_ofi_LIBS)
noinst_LTLIBRARIES = $(lib)
libmca_btl_ofi_la_SOURCES = $(lib_sources)
libmca_btl_ofi_la_LDFLAGS = -module -avoid-version $(opal_ofi_LDFLAGS)
libmca_btl_ofi_la_LIBS = $(opal_ofi_LIBS)

110
opal/mca/btl/ofi/README Normal file

@@ -0,0 +1,110 @@
========================================
Design notes on BTL/OFI
========================================
This is an RDMA-only BTL based on OFI Libfabric. The goal is to enable RDMA
with hardware from multiple vendors through one interface. Most of the operations
are managed by the upper layer (osc/rdma); this BTL mostly does the low-level work.
Tested providers: sockets, psm2, ugni
========================================
Component
This BTL requests the libfabric 1.5 API and does not support older versions.
The required capabilities of this BTL are FI_ATOMIC and FI_RMA, with the endpoint
type FI_EP_RDM only. This BTL does NOT support libfabric providers that require
local memory registration (FI_MR_LOCAL).
BTL/OFI will initialize a module with ONLY the first compatible info returned from OFI.
This means it relies on the OFI provider to do load balancing. Support for multiple
devices might be added later.
The BTL creates only one endpoint and one CQ.
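As a rough illustration (this is a sketch, not the actual component code in
btl_ofi_component.c), the requirements above correspond to a fi_getinfo() query
of roughly this shape:

    struct fi_info *hints = fi_allocinfo();
    struct fi_info *info_list = NULL;

    hints->caps                 = FI_RMA | FI_ATOMIC;  /* one-sided required caps */
    hints->ep_attr->type        = FI_EP_RDM;           /* reliable, unconnected EPs only */
    hints->mode                 = FI_CONTEXT;          /* we attach an fi_context per op */
    hints->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR;
                                                       /* FI_MR_LOCAL deliberately absent */

    if (0 == fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, hints, &info_list)) {
        /* take the first compatible entry, then fi_freeinfo() both lists */
    }
    fi_freeinfo(hints);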
========================================
Memory Registration
Open MPI has a system in place to exchange remote addresses and always uses the
remote virtual address to refer to a piece of memory. However, some libfabric
providers might not support the use of virtual addresses and instead use zero-based
offset addressing. FI_MR_VIRT_ADDR is the mode bit that determines this behavior.
mca_btl_ofi_reg_mem() handles this by storing the base address in the registration
handle in case the provider does not support FI_MR_VIRT_ADDR. This base address is
used later to calculate the offset in RDMA/atomic operations.
The BTL will try to use the address of the registration handle as the key. However,
if the provider supports FI_MR_PROV_KEY, it will use the provider-supplied key;
either way works.
The BTL does not register the local operand or compare buffers. This is why this BTL
does not support FI_MR_LOCAL and allocates every buffer before registering it, which
means FI_MR_ALLOCATED is supported. To be explicit:
Supported MR mode bits (will work with or without):
enum:
- FI_MR_BASIC
- FI_MR_SCALABLE
mode bits:
- FI_MR_VIRT_ADDR
- FI_MR_ALLOCATED
- FI_MR_PROV_KEY
The BTL does NOT support (will not work with):
- FI_MR_LOCAL
- FI_MR_MMU_NOTIFY
- FI_MR_RMA_EVENT
- FI_MR_ENDPOINT
Just a reminder, in libfabric API 1.5...
FI_MR_BASIC == (FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_VIRT_ADDR)
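As a sketch of the offset handling described above (handle fields are taken from
btl_ofi.h; local variable names are illustrative, and the assumption that base_addr
is stored as 0 when the provider honors FI_MR_VIRT_ADDR follows from the description
of mca_btl_ofi_reg_mem() above):

    /* base_addr is the registered region's base when the provider uses
     * zero-based offsets, and 0 when it uses virtual addresses, so a plain
     * subtraction yields the right target address either way. */
    remote_address = remote_address - (uint64_t) remote_handle->base_addr;

    rc = fi_write(context->tx_ctx, local_address, size, local_handle->desc,
                  btl_endpoint->peer_addr, remote_address, remote_handle->rkey,
                  &comp->comp_ctx);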
========================================
Completions
Every operation in this BTL is asynchronous. Completion handling occurs in
mca_btl_ofi_component_progress(), where we read the CQ with the completion context
and execute the callback functions. The completions are local; no remote completion
event is generated, as local completion already guarantees global completion.
The BTL keeps track of the number of outstanding operations and provides a flush
interface.
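Schematically (a simplified sketch of the idea, not the actual progress code),
reading completions and firing their callbacks looks like the following; the
freelist return at the end assumes the completion object came from the per-context
freelists declared in btl_ofi.h:

    struct fi_cq_entry cq_entries[MCA_BTL_OFI_NUM_CQE_READ];
    ssize_t n = fi_cq_read(context->cq, cq_entries, MCA_BTL_OFI_NUM_CQE_READ);

    for (ssize_t i = 0; i < n; i++) {
        /* op_context is the fi_context we attached when posting the operation */
        mca_btl_ofi_completion_context_t *c =
            (mca_btl_ofi_completion_context_t *) cq_entries[i].op_context;
        mca_btl_ofi_rdma_completion_t *comp = (mca_btl_ofi_rdma_completion_t *) c->comp;

        if (NULL != comp->cbfunc) {
            comp->cbfunc(comp->base.btl, comp->base.endpoint,
                         comp->local_address, comp->local_handle,
                         comp->cbcontext, comp->cbdata, OPAL_SUCCESS);
        }
        opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
    }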
========================================
Sockets Provider
The sockets provider is the proof-of-concept provider for libfabric. It is supposed
to support the entire OFI API through emulation. This provider is considered very
slow and bound to raise problems that we might not see with other, faster providers.
Known problems:
- The sockets provider uses a progress thread and can cause a segfault in finalize
  when we free resources while the progress thread is still using them. sleep(1)
  was put in mca_btl_ofi_component_close() for this reason.
- The sockets provider deadlocks in two-sided mode. Might be something about
  buffered recv. (August 2018)
========================================
Scalable Endpoint
This BTL will try to use a scalable endpoint to create communication contexts. This
can increase multithreaded performance for some applications. The default number of
contexts created is 1 and can be tuned via the MCA parameter
"btl_ofi_num_contexts_per_module". For optimal performance, it is advised that the
number of contexts equal the number of physical cores.
Users can disable scalable endpoints with the MCA parameter "btl_ofi_disable_sep".
With scalable endpoints disabled, the BTL will alias the OFI endpoint to both the tx
and rx context.
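For example (hypothetical command lines; the application name is a placeholder):

    # use 8 communication contexts per module
    mpirun --mca btl_ofi_num_contexts_per_module 8 ./my_app

    # force regular (non-scalable) endpoints
    mpirun --mca btl_ofi_disable_sep 1 ./my_app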
========================================
Two sided communication
Two-sided communication was added to BTL OFI later on so that non-tag-matching
providers can be used in Open MPI through this BTL. However, the support is only
functional and has not been optimized for performance at this point. (August 2018)

378
opal/mca/btl/ofi/btl_ofi.h Normal file

@@ -0,0 +1,378 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2018 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_BTL_OFI_H
#define MCA_BTL_OFI_H
#include "opal_config.h"
#include <sys/types.h>
#include <string.h>
/* Open MPI includes */
#include "opal/mca/event/event.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/class/opal_hash_table.h"
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_cm.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_rma.h>
BEGIN_C_DECLS
#define MCA_BTL_OFI_MAX_MODULES 16
#define MCA_BTL_OFI_NUM_CQE_READ 64
#define MCA_BTL_OFI_DEFAULT_RD_NUM 10
#define MCA_BTL_OFI_DEFAULT_MAX_CQE 128
#define MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD 64
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
#define TWO_SIDED_ENABLED mca_btl_ofi_component.two_sided_enabled
enum mca_btl_ofi_mode {
MCA_BTL_OFI_MODE_ONE_SIDED = 0,
MCA_BTL_OFI_MODE_TWO_SIDED,
MCA_BTL_OFI_MODE_FULL_SUPPORT,
MCA_BTL_OFI_MODE_TOTAL
};
enum mca_btl_ofi_hdr_type {
MCA_BTL_OFI_TYPE_PUT = 0,
MCA_BTL_OFI_TYPE_GET,
MCA_BTL_OFI_TYPE_AOP,
MCA_BTL_OFI_TYPE_AFOP,
MCA_BTL_OFI_TYPE_CSWAP,
MCA_BTL_OFI_TYPE_SEND,
MCA_BTL_OFI_TYPE_RECV,
MCA_BTL_OFI_TYPE_TOTAL
};
struct mca_btl_ofi_context_t {
int32_t context_id;
/* transmit context */
struct fid_ep *tx_ctx;
struct fid_ep *rx_ctx;
/* completion queue */
struct fid_cq *cq;
/* completion info freelist */
/* We have it per context to reduce the thread contention
* on the freelist. Things can get really slow. */
opal_free_list_t rdma_comp_list;
opal_free_list_t frag_comp_list;
opal_free_list_t frag_list;
/* for thread locking */
volatile int32_t lock;
};
typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t;
/**
* @brief OFI BTL module
*/
struct mca_btl_ofi_module_t {
/** base BTL interface */
mca_btl_base_module_t super;
/* libfabric components */
struct fi_info *fabric_info;
struct fid_fabric *fabric;
struct fid_domain *domain;
struct fid_ep *ofi_endpoint;
struct fid_av *av;
int num_contexts;
mca_btl_ofi_context_t *contexts;
char *linux_device_name;
/** whether the module has been fully initialized or not */
bool initialized;
bool use_virt_addr;
bool is_scalable_ep;
int64_t outstanding_rdma;
int64_t outstanding_send;
/** linked list of BTL endpoints. this list is never searched so
* there is no need for a complicated structure here at this time*/
opal_list_t endpoints;
opal_mutex_t module_lock;
opal_hash_table_t id_to_endpoint;
/** registration cache */
mca_rcache_base_module_t *rcache;
};
typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t;
extern mca_btl_ofi_module_t mca_btl_ofi_module_template;
/**
* @brief OFI BTL component
*/
struct mca_btl_ofi_component_t {
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
/** number of TL modules */
int module_count;
int num_contexts_per_module;
int num_cqe_read;
int progress_threshold;
int mode;
int rd_num;
bool two_sided_enabled;
size_t namelen;
/** All BTL OFI modules (1 per tl) */
mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
};
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
OPAL_MODULE_DECLSPEC extern mca_btl_ofi_component_t mca_btl_ofi_component;
struct mca_btl_base_registration_handle_t {
uint64_t rkey;
void *desc;
void *base_addr;
};
struct mca_btl_ofi_reg_t {
mca_rcache_base_registration_t base;
struct fid_mr *ur_mr;
/* remote handle */
mca_btl_base_registration_handle_t handle;
};
typedef struct mca_btl_ofi_reg_t mca_btl_ofi_reg_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_reg_t);
struct mca_btl_ofi_header_t {
mca_btl_base_tag_t tag;
size_t len;
};
typedef struct mca_btl_ofi_header_t mca_btl_ofi_header_t;
struct mca_btl_ofi_base_frag_t {
mca_btl_base_descriptor_t base;
mca_btl_base_segment_t segments[2];
int context_id;
struct mca_btl_ofi_module_t *btl;
struct mca_btl_base_endpoint_t *endpoint;
opal_free_list_t *free_list;
mca_btl_ofi_header_t hdr;
};
typedef struct mca_btl_ofi_base_frag_t mca_btl_ofi_base_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_base_frag_t);
struct mca_btl_ofi_completion_context_t {
struct fi_context ctx;
void *comp;
};
typedef struct mca_btl_ofi_completion_context_t mca_btl_ofi_completion_context_t;
/* completion structure storing information needed
* for RDMA callbacks */
struct mca_btl_ofi_base_completion_t {
opal_free_list_item_t comp_list;
opal_free_list_t *my_list;
struct mca_btl_base_module_t *btl;
struct mca_btl_base_endpoint_t *endpoint;
struct mca_btl_ofi_context_t *my_context;
int type;
};
typedef struct mca_btl_ofi_base_completion_t mca_btl_ofi_base_completion_t;
struct mca_btl_ofi_rdma_completion_t {
mca_btl_ofi_base_completion_t base;
mca_btl_ofi_completion_context_t comp_ctx;
void *local_address;
mca_btl_base_registration_handle_t *local_handle;
uint64_t operand;
uint64_t compare;
mca_btl_base_rdma_completion_fn_t cbfunc;
void *cbcontext;
void *cbdata;
};
typedef struct mca_btl_ofi_rdma_completion_t mca_btl_ofi_rdma_completion_t;
struct mca_btl_ofi_frag_completion_t {
mca_btl_ofi_base_completion_t base;
mca_btl_ofi_completion_context_t comp_ctx;
mca_btl_ofi_base_frag_t *frag;
};
typedef struct mca_btl_ofi_frag_completion_t mca_btl_ofi_frag_completion_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_rdma_completion_t);
OBJ_CLASS_DECLARATION(mca_btl_ofi_frag_completion_t);
/**
* Initiate an asynchronous put.
* Completion Semantics: if this function returns 1 then the operation
* is complete. A return of OPAL_SUCCESS indicates
* the put operation has been queued with the
* network. The local_handle cannot be deregistered
* until all outstanding operations on that handle
* have been completed.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (IN) Local address to put from (registered)
* @param remote_address (IN) Remote address to put to (registered remotely)
* @param local_handle (IN) Registration handle for region containing
* (local_address, local_address + size)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + size)
* @param size (IN) Number of bytes to put
* @param flags (IN) Flags for this put operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
* alignment restrictions.
*/
int mca_btl_ofi_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
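/* Illustrative only (not part of the original header): a hypothetical caller
 * would typically retry on OPAL_ERR_OUT_OF_RESOURCE and keep progressing the
 * BTL until its completion callback fires, e.g.:
 *
 *     int rc;
 *     do {
 *         rc = mca_btl_ofi_put(btl, ep, local_buf, remote_addr,
 *                              local_handle, remote_handle, size, 0,
 *                              MCA_BTL_NO_ORDER, put_complete_cb, NULL, NULL);
 *     } while (OPAL_ERR_OUT_OF_RESOURCE == rc);
 */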
/**
* Initiate an asynchronous get.
* Completion Semantics: if this function returns 1 then the operation
* is complete. A return of OPAL_SUCCESS indicates
* the get operation has been queued with the
* network. The local_handle cannot be deregistered
* until all outstanding operations on that handle
* have been completed.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (IN) Local address to get into (registered)
* @param remote_address (IN) Remote address to get from (registered remotely)
* @param local_handle (IN) Registration handle for region containing
* (local_address, local_address + size)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + size)
* @param size (IN) Number of bytes to get
* @param flags (IN) Flags for this get operation
* @param order (IN) Ordering
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The descriptor was successfully queued for a get
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a get
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the get
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Get cannot be performed due to size or
* alignment restrictions.
*/
int mca_btl_ofi_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
int mca_btl_ofi_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
int mca_btl_ofi_finalize (mca_btl_base_module_t *btl);
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module);
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
mca_rcache_base_registration_t *reg);
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode);
int mca_btl_ofi_post_recvs(mca_btl_base_module_t* module, mca_btl_ofi_context_t *context, int count);
void mca_btl_ofi_exit(void);
/* thread atomics */
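/* Note: trylock returns *true* when the lock could NOT be taken (it was either
 * already held, or the atomic swap observed a non-zero value). Callers should
 * progress a context only when trylock returns false, and unlock it afterwards. */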
static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context)
{
return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1));
}
static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context)
{
while (mca_btl_ofi_context_trylock(context));
}
static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context)
{
opal_atomic_mb();
context->lock = 0;
}
END_C_DECLS
#endif

193
opal/mca/btl/ofi/btl_ofi_atomics.c Normal file

@@ -0,0 +1,193 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <rdma/fi_atomic.h>
#include "btl_ofi_rdma.h"
static inline int to_fi_op(mca_btl_base_atomic_op_t op)
{
switch (op) {
case MCA_BTL_ATOMIC_ADD:
return FI_SUM;
case MCA_BTL_ATOMIC_SWAP:
return FI_ATOMIC_WRITE;
default:
BTL_ERROR(("Unknown or unsupported atomic op."));
MCA_BTL_OFI_ABORT();
/* just to squash the warning */
return OPAL_ERROR;
}
}
int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
int rc;
int fi_datatype = FI_UINT64;
int fi_op;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_rdma_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32;
}
fi_op = to_fi_op(op);
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_AFOP);
/* copy the operand because it might get freed from upper layer */
comp->operand = (uint64_t) operand;
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
rc = fi_fetch_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL, /* operand */
local_address, local_handle->desc, /* results */
btl_endpoint->peer_addr, /* remote addr */
remote_address, remote_handle->rkey, /* remote buffer */
fi_datatype, fi_op, &comp->comp_ctx);
if (rc == -FI_EAGAIN) {
return OPAL_ERR_OUT_OF_RESOURCE;
} else if (rc < 0) {
BTL_ERROR(("fi_fetch_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
MCA_BTL_OFI_ABORT();
}
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
return OPAL_SUCCESS;
}
int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
int rc;
int fi_datatype = FI_UINT64;
int fi_op;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_rdma_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32;
}
fi_op = to_fi_op(op);
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
NULL,
NULL,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_AOP);
/* copy the operand because it might get freed from upper layer */
comp->operand = (uint64_t) operand;
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
rc = fi_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL, /* operand */
btl_endpoint->peer_addr, /* remote addr */
remote_address, remote_handle->rkey, /* remote buffer */
fi_datatype, fi_op, &comp->comp_ctx);
if (rc == -FI_EAGAIN) {
return OPAL_ERR_OUT_OF_RESOURCE;
} else if (rc < 0) {
BTL_ERROR(("fi_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
MCA_BTL_OFI_ABORT();
}
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
return OPAL_SUCCESS;
}
int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
int rc;
int fi_datatype = FI_UINT64;
mca_btl_ofi_rdma_completion_t *comp = NULL;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32;
}
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_CSWAP);
/* copy the operand because it might get freed from upper layer */
comp->operand = (uint64_t) value;
comp->compare = (uint64_t) compare;
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* perform atomic */
rc = fi_compare_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL,
(void*) &comp->compare, NULL,
local_address, local_handle->desc,
btl_endpoint->peer_addr,
remote_address, remote_handle->rkey,
fi_datatype,
FI_CSWAP,
&comp->comp_ctx);
if (rc == -FI_EAGAIN) {
return OPAL_ERR_OUT_OF_RESOURCE;
} else if (rc < 0) {
BTL_ERROR(("fi_compare_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
MCA_BTL_OFI_ABORT();
}
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
return OPAL_SUCCESS;
}

699
opal/mca/btl/ofi/btl_ofi_component.c Normal file

@@ -0,0 +1,699 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/common/ofi/common_ofi.h"
#include <string.h>
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#include "btl_ofi_rdma.h"
#include "btl_ofi_frag.h"
#define MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS (FI_RMA | FI_ATOMIC)
#define MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS (FI_MSG)
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR)
static char *prov_include;
static char *ofi_progress_mode;
static bool disable_sep;
static int mca_btl_ofi_init_device(struct fi_info *info);
/* validate information returned from fi_getinfo().
* return OPAL_ERROR if we don't have what we need. */
static int validate_info(struct fi_info *info, uint64_t required_caps)
{
int mr_mode;
BTL_VERBOSE(("validating device: %s", info->domain_attr->name));
/* we need exactly all the required bits */
if ((info->caps & required_caps) != required_caps) {
BTL_VERBOSE(("unsupported caps"));
return OPAL_ERROR;
}
/* we need FI_EP_RDM */
if (info->ep_attr->type != FI_EP_RDM) {
BTL_VERBOSE(("unsupported EP type"));
return OPAL_ERROR;
}
mr_mode = info->domain_attr->mr_mode;
if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE ||
(mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) {
BTL_VERBOSE(("unsupported MR mode"));
return OPAL_ERROR;
}
if (!(info->tx_attr->op_flags & FI_DELIVERY_COMPLETE)) {
BTL_VERBOSE(("the endpoint tx_ctx does not support FI_DELIVERY_COMPLETE"));
return OPAL_ERROR;
}
BTL_VERBOSE(("device: %s is good to go.", info->domain_attr->name));
return OPAL_SUCCESS;
}
/* Register the MCA parameters */
static int mca_btl_ofi_component_register(void)
{
char *msg;
mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template;
asprintf(&msg, "BTL OFI mode of operation. Valid values are: %d = One-Sided only, %d=Two-Sided only, "
"%d = Both one and two sided. BTL OFI is only optimized for one-sided communication",
MCA_BTL_OFI_MODE_ONE_SIDED,
MCA_BTL_OFI_MODE_TWO_SIDED,
MCA_BTL_OFI_MODE_FULL_SUPPORT);
if (NULL == msg) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
mca_btl_ofi_component.mode = MCA_BTL_OFI_MODE_ONE_SIDED;
(void)mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"mode",
msg,
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.mode);
/* fi_getinfo with prov_name == NULL means ALL providers.
* Since we currently use the first valid info returned, it is not clear
* whether we need to support a comma-delimited provider list. */
prov_include = NULL;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"provider_include",
"OFI provider that ofi btl will query for. This parameter only "
"accept ONE provider name. "
"(e.g., \"psm2\"; an empty value means that all providers will "
"be considered.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&prov_include);
mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"num_cq_read",
"Number of completion entries to read from a single cq_read. ",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.num_cqe_read);
ofi_progress_mode = "unspec";
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"progress_mode",
"requested provider progress mode. [unspec, auto, manual]"
"(default: unspec)",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&ofi_progress_mode);
mca_btl_ofi_component.num_contexts_per_module = 1;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"num_contexts_per_module",
"number of communication context per module to create. "
"This should increase multithreaded performance but it is "
"advised that this number should be lower than total cores.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.num_contexts_per_module);
disable_sep = false;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"disable_sep",
"force btl/ofi to never use scalable endpoint.",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&disable_sep);
mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"progress_threshold",
"number of outstanding operation before btl will progress "
"automatically. Tuning this might improve performance on "
"certain type of application.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.progress_threshold);
mca_btl_ofi_component.rd_num = MCA_BTL_OFI_DEFAULT_RD_NUM;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"rd_num",
"Number of receive descriptor posted per context.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.rd_num);
/* for now we want this component to lose to the MTL. */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version,
&module->super);
}
static int mca_btl_ofi_component_open(void)
{
mca_btl_ofi_component.module_count = 0;
return OPAL_SUCCESS;
}
/*
* component cleanup - sanity checking of queue lengths
*/
static int mca_btl_ofi_component_close(void)
{
/* If we don't sleep, sockets provider freaks out. */
sleep(1);
return OPAL_SUCCESS;
}
void mca_btl_ofi_exit(void)
{
BTL_ERROR(("BTL OFI will now abort."));
exit(1);
}
/*
* OFI component initialization:
* read interface list from kernel and compare against component parameters
* then create a BTL instance for selected interfaces
*/
static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, bool enable_progress_threads,
bool enable_mpi_threads)
{
/* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */
int rc;
uint64_t progress_mode;
unsigned resource_count = 0;
struct mca_btl_base_module_t **base_modules;
BTL_VERBOSE(("initializing ofi btl"));
/* Set up libfabric hints. */
uint32_t libfabric_api;
libfabric_api = fi_version();
/* bail if OFI version is less than 1.5. */
if (libfabric_api < FI_VERSION(1, 5)) {
BTL_VERBOSE(("ofi btl disqualified because OFI version < 1.5."));
return NULL;
}
struct fi_info *info, *info_list, *selected_info;
struct fi_info hints = {0};
struct fi_ep_attr ep_attr = {0};
struct fi_rx_attr rx_attr = {0};
struct fi_tx_attr tx_attr = {0};
struct fi_fabric_attr fabric_attr = {0};
struct fi_domain_attr domain_attr = {0};
uint64_t required_caps;
switch (mca_btl_ofi_component.mode) {
case MCA_BTL_OFI_MODE_TWO_SIDED:
mca_btl_ofi_component.two_sided_enabled = true;
required_caps = MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS;
break;
case MCA_BTL_OFI_MODE_FULL_SUPPORT:
mca_btl_ofi_component.two_sided_enabled = true;
required_caps = MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS |
MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS;
break;
default:
/* default to only one sided. */
required_caps = MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS;
break;
}
/* Select the provider */
fabric_attr.prov_name = prov_include;
domain_attr.mr_mode = MCA_BTL_OFI_REQUESTED_MR_MODE;
/* message progression mode. */
if (!strcmp(ofi_progress_mode, "auto")) {
progress_mode = FI_PROGRESS_AUTO;
} else if (!strcmp(ofi_progress_mode, "manual")) {
progress_mode = FI_PROGRESS_MANUAL;
} else {
progress_mode = FI_PROGRESS_UNSPEC;
}
domain_attr.control_progress = progress_mode;
domain_attr.data_progress = progress_mode;
/* select endpoint type */
ep_attr.type = FI_EP_RDM;
/* ask for capabilities */
/* TODO: catch the caps here. */
hints.caps = required_caps;
hints.mode = FI_CONTEXT;
/* Ask for completion context */
hints.mode = FI_CONTEXT;
hints.fabric_attr = &fabric_attr;
hints.domain_attr = &domain_attr;
hints.ep_attr = &ep_attr;
hints.tx_attr = &tx_attr;
hints.rx_attr = &rx_attr;
/* for now */
tx_attr.iov_limit = 1;
rx_attr.iov_limit = 1;
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
mca_btl_ofi_component.module_count = 0;
/* do the query. */
rc = fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, &hints, &info_list);
if (0 != rc) {
BTL_VERBOSE(("fi_getinfo failed with code %d: %s",rc, fi_strerror(-rc)));
return NULL;
}
/* count the number of resources. */
info = info_list;
while(info) {
resource_count++;
info = info->next;
}
BTL_VERBOSE(("ofi btl found %d possible resources.", resource_count));
info = info_list;
while(info) {
rc = validate_info(info, required_caps);
if (OPAL_SUCCESS == rc) {
/* Device passed sanity check, let's make a module.
*
* The initial fi_getinfo() call will return a list of providers
* available for this process. Once a provider is selected from the
* list, we will cycle through the remaining list to identify NICs
* serviced by this provider, and try to pick one on the same NUMA
* node as this process. If there are no NICs on the same NUMA node,
* we pick one in a manner which allows all ranks to make balanced
* use of available NICs on the system.
*
* Most providers give a separate fi_info object for each NIC,
* however some may have multiple info objects with different
* attributes for the same NIC. The initial provider attributes
* are used to ensure that all NICs we return provide the same
* capabilities as the initial one.
*/
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
rc = mca_btl_ofi_init_device(selected_info);
if (OPAL_SUCCESS == rc) {
info = selected_info;
break;
}
}
info = info->next;
}
/* We are done with the returned info. */
fi_freeinfo(info_list);
/* pass module array back to caller */
base_modules = calloc (mca_btl_ofi_component.module_count, sizeof (*base_modules));
if (NULL == base_modules) {
return NULL;
}
memcpy(base_modules, mca_btl_ofi_component.modules,
mca_btl_ofi_component.module_count *sizeof (mca_btl_ofi_component.modules[0]));
BTL_VERBOSE(("ofi btl initialization complete. found %d suitable transports",
mca_btl_ofi_component.module_count));
*num_btl_modules = mca_btl_ofi_component.module_count;
return base_modules;
}
static int mca_btl_ofi_init_device(struct fi_info *info)
{
int rc;
int *module_count = &mca_btl_ofi_component.module_count;
size_t namelen;
size_t num_contexts_to_create;
char *linux_device_name;
char ep_name[FI_NAME_MAX];
struct fi_info *ofi_info;
struct fi_ep_attr *ep_attr;
struct fi_domain_attr *domain_attr;
struct fi_av_attr av_attr = {0};
struct fid_fabric *fabric = NULL;
struct fid_domain *domain = NULL;
struct fid_ep *ep = NULL;
struct fid_av *av = NULL;
mca_btl_ofi_module_t *module;
module = mca_btl_ofi_module_alloc(mca_btl_ofi_component.mode);
if (NULL == module) {
BTL_VERBOSE(("failed allocating ofi module"));
goto fail;
}
/* If the user asks for two-sided support, something bad is happening
* to the MTL, so we will take maximum priority to supersede the MTL. */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
/* make a copy of the given info to store on the module */
ofi_info = fi_dupinfo(info);
ep_attr = ofi_info->ep_attr;
domain_attr = ofi_info->domain_attr;
/* mca_btl_ofi_rcache_init() initializes the patcher, which should only
* happen while things are still single-threaded. OFI providers may
* spawn threads, so initialize the rcache before creating OFI objects
* to prevent races. */
mca_btl_ofi_rcache_init(module);
linux_device_name = info->domain_attr->name;
BTL_VERBOSE(("initializing dev:%s provider:%s",
linux_device_name,
info->fabric_attr->prov_name));
/* fabric */
rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_fabric with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
/* domain */
rc = fi_domain(fabric, ofi_info, &domain, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_domain with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
/* AV */
av_attr.type = FI_AV_MAP;
rc = fi_av_open(domain, &av_attr, &av, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_av_open with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
num_contexts_to_create = mca_btl_ofi_component.num_contexts_per_module;
/* If the domain supports scalable endpoints. */
if (domain_attr->max_ep_tx_ctx > 1 && !disable_sep) {
BTL_VERBOSE(("btl/ofi using scalable endpoint."));
if (num_contexts_to_create > domain_attr->max_ep_tx_ctx) {
BTL_VERBOSE(("cannot create requested %u contexts. (node max=%zu)",
module->num_contexts,
domain_attr->max_ep_tx_ctx));
goto fail;
}
/* modify the info to let the provider know we are creating x contexts */
ep_attr->tx_ctx_cnt = num_contexts_to_create;
ep_attr->rx_ctx_cnt = num_contexts_to_create;
/* create scalable endpoint */
rc = fi_scalable_ep(domain, ofi_info, &ep, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_scalable_ep with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
module->num_contexts = num_contexts_to_create;
module->is_scalable_ep = true;
/* create contexts */
module->contexts = mca_btl_ofi_context_alloc_scalable(ofi_info,
domain, ep, av,
num_contexts_to_create);
} else {
/* warn the user if they want more than 1 context */
if (num_contexts_to_create > 1) {
BTL_ERROR(("cannot create %zu contexts as the provider does not support "
"scalable endpoint. Falling back to single context endpoint.",
num_contexts_to_create));
}
BTL_VERBOSE(("btl/ofi using normal endpoint."));
rc = fi_endpoint(domain, ofi_info, &ep, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
module->num_contexts = 1;
module->is_scalable_ep = false;
/* create contexts */
module->contexts = mca_btl_ofi_context_alloc_normal(ofi_info,
domain, ep, av);
}
if (NULL == module->contexts) {
/* error message is already printed */
goto fail;
}
/* enable the endpoint for use */
rc = fi_enable(ep);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_enable with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
/* Everything succeeded, lets create a module for this device. */
/* store the information. */
module->fabric_info = ofi_info;
module->fabric = fabric;
module->domain = domain;
module->av = av;
module->ofi_endpoint = ep;
module->linux_device_name = linux_device_name;
module->outstanding_rdma = 0;
module->use_virt_addr = false;
if (ofi_info->domain_attr->mr_mode == FI_MR_BASIC ||
ofi_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) {
module->use_virt_addr = true;
}
/* create endpoint list */
OBJ_CONSTRUCT(&module->endpoints, opal_list_t);
OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t);
rc = opal_hash_table_init (&module->id_to_endpoint, 512);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error initializing hash table."));
goto fail;
}
/* create and send the modex for this device */
namelen = sizeof(ep_name);
rc = fi_getname((fid_t)ep, &ep_name[0], &namelen);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_getname with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
/* If we have two-sided support. */
if (TWO_SIDED_ENABLED) {
/* post wildcard recvs */
for (int i=0; i < module->num_contexts; i++) {
rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t*) module,
&module->contexts[i],
mca_btl_ofi_component.rd_num);
if (OPAL_SUCCESS != rc) {
goto fail;
}
}
}
/* post our endpoint name so peer can use it to connect to us */
OPAL_MODEX_SEND(rc,
OPAL_PMIX_GLOBAL,
&mca_btl_ofi_component.super.btl_version,
&ep_name,
namelen);
mca_btl_ofi_component.namelen = namelen;
/* add this module to the list */
mca_btl_ofi_component.modules[(*module_count)++] = module;
return OPAL_SUCCESS;
fail:
/* clean up */
/* close basic ep before closing av */
if (NULL != ep && !module->is_scalable_ep) {
fi_close(&ep->fid);
ep = NULL;
}
/* if the contexts have not been initialized, num_contexts should
* be zero and we skip this. */
for (int i=0; i < module->num_contexts; i++) {
mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep);
}
free(module->contexts);
/* check for NULL ep to avoid double-close */
if (NULL != ep) {
fi_close(&ep->fid);
}
/* close av after closing basic ep */
if (NULL != av) {
fi_close(&av->fid);
}
if (NULL != domain) {
fi_close(&domain->fid);
}
if (NULL != fabric) {
fi_close(&fabric->fid);
}
free(module);
/* not really a failure. just skip this device. */
return OPAL_ERR_OUT_OF_RESOURCE;
}
/**
* @brief OFI BTL progress function
*
* This function explicitly progresses all workers.
*/
static int mca_btl_ofi_component_progress (void)
{
int events = 0;
mca_btl_ofi_context_t *context;
for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) {
mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];
/* progress context we own first. */
context = get_ofi_context(module);
if (!mca_btl_ofi_context_trylock(context)) {
events += mca_btl_ofi_context_progress(context);
mca_btl_ofi_context_unlock(context);
}
/* if there is nothing to do, try to progress the others. */
if (events == 0) {
for (int j = 0 ; j < module->num_contexts ; j++ ) {
context = get_ofi_context_rr(module);
if (!mca_btl_ofi_context_trylock(context)) {
events += mca_btl_ofi_context_progress(context);
mca_btl_ofi_context_unlock(context);
}
/* If we did something, good enough. return now.
* This is crucial for performance/latency. */
if (events > 0) {
break;
}
}
}
}
return events;
}
/** OFI btl component */
mca_btl_ofi_component_t mca_btl_ofi_component = {
.super = {
.btl_version = {
MCA_BTL_DEFAULT_VERSION("ofi"),
.mca_open_component = mca_btl_ofi_component_open,
.mca_close_component = mca_btl_ofi_component_close,
.mca_register_component_params = mca_btl_ofi_component_register,
},
.btl_data = {
/* The component is not checkpoint ready */
.param_field = MCA_BASE_METADATA_PARAM_NONE
},
.btl_init = mca_btl_ofi_component_init,
.btl_progress = mca_btl_ofi_component_progress,
},
};

463
opal/mca/btl/ofi/btl_ofi_context.c Normal file

@@ -0,0 +1,463 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* $COPYRIGHT$
* Copyright (c) 2018 Intel Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ofi.h"
#include "btl_ofi_frag.h"
#include "btl_ofi_rdma.h"
#if OPAL_HAVE_THREAD_LOCAL
opal_thread_local mca_btl_ofi_context_t *my_context = NULL;
#endif /* OPAL_HAVE_THREAD_LOCAL */
int init_context_freelists(mca_btl_ofi_context_t *context)
{
int rc;
OBJ_CONSTRUCT(&context->rdma_comp_list, opal_free_list_t);
rc = opal_free_list_init(&context->rdma_comp_list,
sizeof(mca_btl_ofi_rdma_completion_t),
opal_cache_line_size,
OBJ_CLASS(mca_btl_ofi_rdma_completion_t),
0,
0,
512,
-1,
512,
NULL,
0,
NULL,
NULL,
NULL);
if (rc != OPAL_SUCCESS) {
BTL_VERBOSE(("cannot allocate completion freelist"));
return rc;
}
if (TWO_SIDED_ENABLED) {
OBJ_CONSTRUCT(&context->frag_comp_list, opal_free_list_t);
rc = opal_free_list_init(&context->frag_comp_list,
sizeof(mca_btl_ofi_frag_completion_t),
opal_cache_line_size,
OBJ_CLASS(mca_btl_ofi_frag_completion_t),
0,
0,
512,
-1,
512,
NULL,
0,
NULL,
NULL,
NULL);
if (rc != OPAL_SUCCESS) {
BTL_VERBOSE(("cannot allocate completion freelist"));
return rc;
}
/* Initialize frag pool */
OBJ_CONSTRUCT(&context->frag_list, opal_free_list_t);
rc = opal_free_list_init(&context->frag_list,
sizeof(mca_btl_ofi_base_frag_t) +
MCA_BTL_OFI_FRAG_SIZE,
opal_cache_line_size,
OBJ_CLASS(mca_btl_ofi_base_frag_t),
0,
0,
1024,
-1,
1024,
NULL,
0,
NULL,
NULL,
NULL);
if (OPAL_SUCCESS != rc) {
BTL_VERBOSE(("failed to init frag pool (free_list)"));
}
}
return rc;
}
/* mca_btl_ofi_context_alloc_normal()
*
* This function allocates an ofi_context, maps the endpoint to the tx/rx context,
* binds the CQ and AV to the endpoint, and initializes all the structures.
* USE WITH NORMAL ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
struct fid_domain *domain,
struct fid_ep *ep,
struct fid_av *av)
{
int rc;
uint32_t cq_flags = FI_TRANSMIT | FI_SEND | FI_RECV;
char *linux_device_name = info->domain_attr->name;
struct fi_cq_attr cq_attr = {0};
mca_btl_ofi_context_t *context;
context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context));
if (NULL == context) {
BTL_VERBOSE(("cannot allocate context"));
return NULL;
}
/* Don't really need to check; we are just avoiding a compiler warning because
* BTL_VERBOSE is a no-op in performance builds and the compiler would
* complain about the unused variable. */
if (NULL == linux_device_name) {
BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
goto single_fail;
}
cq_attr.format = FI_CQ_FORMAT_CONTEXT;
cq_attr.wait_obj = FI_WAIT_NONE;
rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto single_fail;
}
rc = fi_ep_bind(ep, (fid_t)av, 0);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto single_fail;
}
rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto single_fail;
}
rc = init_context_freelists(context);
if (rc != OPAL_SUCCESS) {
goto single_fail;
}
context->tx_ctx = ep;
context->rx_ctx = ep;
context->context_id = 0;
return context;
single_fail:
mca_btl_ofi_context_finalize(context, false);
return NULL;
}
/* mca_btl_ofi_context_alloc_scalable()
*
* This function allocates the communication contexts and returns a pointer
* to the first btl context. It also takes care of all the bindings needed.
* USE WITH SCALABLE ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
struct fid_domain *domain,
struct fid_ep *sep,
struct fid_av *av,
size_t num_contexts)
{
BTL_VERBOSE(("creating %zu contexts", num_contexts));
int rc;
size_t i;
char *linux_device_name = info->domain_attr->name;
struct fi_cq_attr cq_attr = {0};
struct fi_tx_attr tx_attr = {0};
struct fi_rx_attr rx_attr = {0};
mca_btl_ofi_context_t *contexts;
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
if (NULL == contexts) {
BTL_VERBOSE(("cannot allocate communication contexts."));
return NULL;
}
/* Don't really need to check; we are just avoiding a compiler warning because
* BTL_VERBOSE is a no-op in performance builds and the compiler would
* complain about the unused variable. */
if (NULL == linux_device_name) {
BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
goto scalable_fail;
}
/* bind AV to endpoint */
rc = fi_scalable_ep_bind(sep, (fid_t)av, 0);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto scalable_fail;
}
for (i=0; i < num_contexts; i++) {
rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_tx_context with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto scalable_fail;
}
/* We don't actually need a receiving context as we only do one-sided.
* However, the sockets provider will hang if we don't have one. It is
* also nice to have an equal number of tx/rx contexts. */
rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_rx_context with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto scalable_fail;
}
/* create CQ */
cq_attr.format = FI_CQ_FORMAT_CONTEXT;
cq_attr.wait_obj = FI_WAIT_NONE;
rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto scalable_fail;
}
/* bind cq to transmit context */
rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, FI_TRANSMIT);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto scalable_fail;
}
/* bind cq to receiving context */
if (TWO_SIDED_ENABLED) {
rc = fi_ep_bind(contexts[i].rx_ctx, (fid_t)contexts[i].cq, FI_RECV);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto scalable_fail;
}
}
/* enable the context. */
rc = fi_enable(contexts[i].tx_ctx);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_enable with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto scalable_fail;
}
rc = fi_enable(contexts[i].rx_ctx);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_enable with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto scalable_fail;
}
/* initialize freelists. */
rc = init_context_freelists(&contexts[i]);
if (rc != OPAL_SUCCESS) {
goto scalable_fail;
}
/* assign the id */
contexts[i].context_id = i;
}
return contexts;
scalable_fail:
/* close and free */
for(i=0; i < num_contexts; i++) {
mca_btl_ofi_context_finalize(&contexts[i], true);
}
free(contexts);
return NULL;
}
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) {
/* if it is a scalable ep, we have to close all contexts. */
if (scalable_ep) {
if (NULL != context->tx_ctx) {
fi_close(&context->tx_ctx->fid);
}
if (NULL != context->rx_ctx) {
fi_close(&context->rx_ctx->fid);
}
}
if( NULL != context->cq) {
fi_close(&context->cq->fid);
}
/* Can we destruct the object that hasn't been constructed? */
OBJ_DESTRUCT(&context->rdma_comp_list);
if (TWO_SIDED_ENABLED) {
OBJ_DESTRUCT(&context->frag_comp_list);
OBJ_DESTRUCT(&context->frag_list);
}
}
/* Get a context to use for communication.
* If TLS is supported, it will use the cached context.
* If not, it will invoke the normal round-robin assignment. */
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
{
#if OPAL_HAVE_THREAD_LOCAL
/* With TLS, we cache the context we use. */
static volatile int64_t cur_num = 0;
if (OPAL_UNLIKELY(my_context == NULL)) {
OPAL_THREAD_LOCK(&btl->module_lock);
my_context = &btl->contexts[cur_num];
cur_num = (cur_num + 1) %btl->num_contexts;
OPAL_THREAD_UNLOCK(&btl->module_lock);
}
assert (my_context);
return my_context;
#else
return get_ofi_context_rr(btl);
#endif
}
/* Return a context in round-robin fashion. */
/* No atomics here on purpose: they might hurt performance, and strict fairness is not required. */
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
{
static volatile uint64_t rr_num = 0;
return &btl->contexts[rr_num++%btl->num_contexts];
}
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) {
int ret = 0;
int events_read;
int events = 0;
struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE];
struct fi_cq_err_entry cqerr = {0};
mca_btl_ofi_completion_context_t *c_ctx;
mca_btl_ofi_base_completion_t *comp;
mca_btl_ofi_rdma_completion_t *rdma_comp;
mca_btl_ofi_frag_completion_t *frag_comp;
ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
if (0 < ret) {
events_read = ret;
for (int i = 0; i < events_read; i++) {
if (NULL != cq_entry[i].op_context) {
++events;
c_ctx = (mca_btl_ofi_completion_context_t*) cq_entry[i].op_context;
/* We are casting to every type here just for simplicity. */
comp = (mca_btl_ofi_base_completion_t*) c_ctx->comp;
frag_comp = (mca_btl_ofi_frag_completion_t*) c_ctx->comp;
rdma_comp = (mca_btl_ofi_rdma_completion_t*) c_ctx->comp;
switch (comp->type) {
case MCA_BTL_OFI_TYPE_GET:
case MCA_BTL_OFI_TYPE_PUT:
case MCA_BTL_OFI_TYPE_AOP:
case MCA_BTL_OFI_TYPE_AFOP:
case MCA_BTL_OFI_TYPE_CSWAP:
/* call the callback */
if (rdma_comp->cbfunc) {
rdma_comp->cbfunc (comp->btl, comp->endpoint,
rdma_comp->local_address, rdma_comp->local_handle,
rdma_comp->cbcontext, rdma_comp->cbdata, OPAL_SUCCESS);
}
MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t*) comp->btl);
break;
case MCA_BTL_OFI_TYPE_RECV:
mca_btl_ofi_recv_frag((mca_btl_ofi_module_t*) comp->btl,
(mca_btl_ofi_endpoint_t*) comp->endpoint,
context, frag_comp->frag);
break;
case MCA_BTL_OFI_TYPE_SEND:
MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t*) comp->btl);
mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS);
break;
default:
/* catastrophic */
BTL_ERROR(("unknown completion type"));
MCA_BTL_OFI_ABORT();
}
/* return the completion handler */
opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
}
}
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
ret = fi_cq_readerr(context->cq, &cqerr, 0);
/* cq readerr failed!? */
if (0 > ret) {
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
__FILE__, __LINE__, fi_strerror(-ret), ret));
} else {
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
cqerr.prov_errno));
}
MCA_BTL_OFI_ABORT();
}
#ifdef FI_EINTR
/* Sometimes the sockets provider complains about an interrupt. We do nothing. */
else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {
}
#endif
/* If the error is not FI_EAGAIN, report the error and abort. */
else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
MCA_BTL_OFI_ABORT();
}
return events;
}
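
The get_ofi_context()/get_ofi_context_rr() pair above boils down to a simple policy: when thread-local storage is available, a thread keeps reusing the first context it was handed; otherwise every call walks the context array round-robin, deliberately without atomics. Below is a minimal standalone sketch of that pattern, not part of this commit and with purely illustrative names (the real code also takes the module lock around the first assignment):

/* toy_context_select.c -- standalone model of thread-local context caching
 * with a round-robin fallback; builds with any C11 compiler. */
#include <stdio.h>
#include <stdint.h>

#define NUM_CONTEXTS 4

typedef struct { int id; } toy_context_t;

static toy_context_t contexts[NUM_CONTEXTS];
static _Thread_local toy_context_t *cached_ctx = NULL;  /* per-thread cache */
static uint64_t rr_cursor = 0;                          /* shared, unsynchronized */

static toy_context_t *get_context_rr(void)
{
    /* deliberately non-atomic: an occasional duplicate pick is harmless */
    return &contexts[rr_cursor++ % NUM_CONTEXTS];
}

static toy_context_t *get_context(void)
{
    if (NULL == cached_ctx) {
        cached_ctx = get_context_rr();   /* first call on this thread */
    }
    return cached_ctx;                   /* later calls reuse the same context */
}

int main(void)
{
    for (int i = 0; i < NUM_CONTEXTS; i++) {
        contexts[i].id = i;
    }
    printf("first call : context %d\n", get_context()->id);
    printf("second call: context %d\n", get_context()->id);
    return 0;
}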

50
opal/mca/btl/ofi/btl_ofi_endpoint.c Normal file

@ -0,0 +1,50 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#include "opal/util/proc.h"
static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint)
{
endpoint->peer_addr = 0;
OBJ_CONSTRUCT(&endpoint->ep_lock, opal_mutex_t);
}
static void mca_btl_ofi_endpoint_destruct (mca_btl_ofi_endpoint_t *endpoint)
{
endpoint->peer_addr = 0;
/* set to null, we will free ofi endpoint in module */
endpoint->ofi_endpoint = NULL;
OBJ_DESTRUCT(&endpoint->ep_lock);
}
OBJ_CLASS_INSTANCE(mca_btl_ofi_endpoint_t, opal_list_item_t,
mca_btl_ofi_endpoint_construct,
mca_btl_ofi_endpoint_destruct);
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep)
{
mca_btl_ofi_endpoint_t *endpoint = OBJ_NEW(mca_btl_ofi_endpoint_t);
if (OPAL_UNLIKELY(NULL == endpoint)) {
return NULL;
}
endpoint->ep_proc = proc;
endpoint->ofi_endpoint = ep;
return (mca_btl_base_endpoint_t *) endpoint;
}

75
opal/mca/btl/ofi/btl_ofi_endpoint.h Normal file

@ -0,0 +1,75 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_OFI_ENDPOINT_H
#define MCA_BTL_OFI_ENDPOINT_H
#include "opal/class/opal_list.h"
#include "opal/mca/event/event.h"
#include "btl_ofi.h"
BEGIN_C_DECLS
#if OPAL_HAVE_THREAD_LOCAL
extern opal_thread_local mca_btl_ofi_context_t *my_context;
#endif /* OPAL_HAVE_THREAD_LOCAL */
struct mca_btl_base_endpoint_t {
opal_list_item_t super;
struct fid_ep *ofi_endpoint;
fi_addr_t peer_addr;
/** endpoint proc */
opal_proc_t *ep_proc;
/** mutex to protect this structure */
opal_mutex_t ep_lock;
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t);
int init_context_freelists(mca_btl_ofi_context_t *context);
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep);
/* contexts */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
struct fid_domain *domain,
struct fid_ep *sep,
struct fid_av *av,
size_t num_contexts);
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
struct fid_domain *domain,
struct fid_ep *ep,
struct fid_av *av);
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep);
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl);
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl);
END_C_DECLS
#endif

198
opal/mca/btl/ofi/btl_ofi_frag.c Normal file

@ -0,0 +1,198 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* $COPYRIGHT$
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ofi.h"
#include "btl_ofi_frag.h"
#include "btl_ofi_rdma.h"
#include "btl_ofi_endpoint.h"
static void mca_btl_ofi_base_frag_constructor (mca_btl_ofi_base_frag_t *frag)
{
/* zero everything out */
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->base.des_segments = frag->segments;
frag->base.des_segment_count = 1;
}
static void mca_btl_ofi_base_frag_destructor (mca_btl_ofi_base_frag_t *frag)
{
}
OBJ_CLASS_INSTANCE(mca_btl_ofi_base_frag_t,
mca_btl_base_descriptor_t,
mca_btl_ofi_base_frag_constructor,
mca_btl_ofi_base_frag_destructor);
OBJ_CLASS_INSTANCE(mca_btl_ofi_frag_completion_t,
opal_free_list_item_t,
NULL,
NULL);
mca_btl_ofi_frag_completion_t *mca_btl_ofi_frag_completion_alloc
(mca_btl_base_module_t *btl,
mca_btl_ofi_context_t *context,
mca_btl_ofi_base_frag_t *frag,
int type)
{
mca_btl_ofi_frag_completion_t *comp;
comp = (mca_btl_ofi_frag_completion_t*) opal_free_list_get(&context->frag_comp_list);
comp->base.btl = btl;
comp->base.my_context = context;
comp->base.my_list = &context->frag_comp_list;
comp->base.type = type;
comp->frag = frag;
comp->comp_ctx.comp = comp;
return comp;
}
mca_btl_base_descriptor_t *mca_btl_ofi_alloc(
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
uint64_t order, size_t size, uint32_t flags)
{
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*) btl;
mca_btl_ofi_base_frag_t *frag = NULL;
mca_btl_ofi_context_t *context = get_ofi_context(ofi_btl);
frag = mca_btl_ofi_frag_alloc(ofi_btl, &context->frag_list, endpoint);
if (OPAL_LIKELY(frag)) {
frag->segments[0].seg_addr.pval = frag + 1;
frag->segments[0].seg_len = size;
frag->base.des_segment_count = 1;
frag->base.des_segments = &frag->segments[0];
frag->base.des_flags = flags;
frag->base.order = order;
frag->hdr.len = size;
}
return (mca_btl_base_descriptor_t*) frag;
}
int mca_btl_ofi_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
{
/* return the frag to the free list. */
mca_btl_ofi_frag_return ((mca_btl_ofi_base_frag_t*) des);
return OPAL_SUCCESS;
}
int mca_btl_ofi_send (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *descriptor,
mca_btl_base_tag_t tag)
{
int rc = 0;
mca_btl_ofi_context_t *context;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*) btl;
mca_btl_ofi_endpoint_t *ofi_ep = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t*) descriptor;
mca_btl_ofi_frag_completion_t *comp;
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
/* This tag is the active message tag for the remote side */
frag->hdr.tag = tag;
/* create completion context */
context = get_ofi_context(ofi_btl);
comp = mca_btl_ofi_frag_completion_alloc(btl, context, frag,
MCA_BTL_OFI_TYPE_SEND);
/* Send the frag. Note that the transfer starts at the BTL header (header +
* payload) because the other side needs this header information. */
rc = fi_send(context->tx_ctx,
&frag->hdr,
sizeof(mca_btl_ofi_header_t) + frag->hdr.len,
NULL,
ofi_ep->peer_addr,
&comp->comp_ctx);
if (OPAL_UNLIKELY(FI_SUCCESS != rc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
MCA_BTL_OFI_NUM_SEND_INC(ofi_btl);
return OPAL_SUCCESS;
}
int mca_btl_ofi_recv_frag (mca_btl_ofi_module_t *ofi_btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *context,
mca_btl_ofi_base_frag_t *frag)
{
int rc;
mca_btl_active_message_callback_t *reg;
/* Tell PML where the payload is */
frag->base.des_segments = frag->segments;
frag->segments[0].seg_addr.pval = frag+1;
frag->segments[0].seg_len = frag->hdr.len;
frag->base.des_segment_count = 1;
/* call the callback */
reg = mca_btl_base_active_message_trigger + frag->hdr.tag;
reg->cbfunc (&ofi_btl->super, frag->hdr.tag, &frag->base, reg->cbdata);
mca_btl_ofi_frag_complete(frag, OPAL_SUCCESS);
/* repost the recv */
rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t*) ofi_btl, context, 1);
if (OPAL_SUCCESS != rc) {
/* might not be that bad but let's just fail here. */
BTL_ERROR(("failed reposting receive."));
MCA_BTL_OFI_ABORT();
}
return OPAL_SUCCESS;
}
struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
opal_convertor_t *convertor,
uint8_t order, size_t reserve,
size_t *size, uint32_t flags)
{
struct iovec iov;
size_t length;
uint32_t iov_count = 1;
mca_btl_ofi_base_frag_t *frag;
/* allocate the frag with reserve. */
frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(btl, endpoint,
order, reserve, flags);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
/* pack the data after the reserve */
iov.iov_len = *size;
iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)(frag->segments[0].seg_addr.pval)) + reserve);
opal_convertor_pack(convertor, &iov, &iov_count, &length);
/* pass on frag information */
frag->base.des_segments = frag->segments;
frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER;
frag->segments[0].seg_len += length;
frag->hdr.len += length;
*size = length;
return &frag->base;
}
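
mca_btl_ofi_alloc() above places the payload directly behind the fragment structure (frag + 1), and mca_btl_ofi_send() posts a single fi_send() that starts at &frag->hdr and covers sizeof(header) + payload, so header and payload must be contiguous in memory. A small standalone sketch of that layout idea follows; it is not part of this commit, and the types and sizes below are made up for illustration:

/* toy_frag_layout.c -- model of the contiguous header+payload trick. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

typedef struct { uint8_t tag; uint64_t len; } toy_hdr_t;

typedef struct {
    /* descriptor bookkeeping would live here in a real fragment */
    toy_hdr_t hdr;            /* kept as the last member on purpose */
} toy_frag_t;

#define PAYLOAD_MAX 4096

int main(void)
{
    /* one allocation: fragment struct followed immediately by the payload */
    toy_frag_t *frag = calloc(1, sizeof(*frag) + PAYLOAD_MAX);
    if (NULL == frag) {
        return 1;
    }
    char *payload = (char *) (frag + 1);

    strcpy(payload, "hello");
    frag->hdr.tag = 42;
    frag->hdr.len = strlen(payload) + 1;

    /* a single send posted at &frag->hdr covers both header and payload */
    size_t wire_len = sizeof(toy_hdr_t) + frag->hdr.len;
    printf("would send %zu bytes starting at the header\n", wire_len);

    free(frag);
    return 0;
}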

95
opal/mca/btl/ofi/btl_ofi_frag.h Normal file

@ -0,0 +1,95 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(MCA_BTL_OFI_FRAG_H)
#define MCA_BTL_OFI_FRAG_H
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#define MCA_BTL_OFI_HDR_SIZE sizeof(mca_btl_ofi_header_t)
#define MCA_BTL_OFI_FRAG_SIZE 4096
#define MCA_BTL_OFI_RECV_SIZE MCA_BTL_OFI_FRAG_SIZE + MCA_BTL_OFI_HDR_SIZE
#define MCA_BTL_OFI_NUM_SEND_INC(module) \
OPAL_ATOMIC_ADD_FETCH64(&(module)->outstanding_send, 1); \
if (module->outstanding_send > mca_btl_ofi_component.progress_threshold) { \
mca_btl_ofi_component.super.btl_progress(); \
}
#define MCA_BTL_OFI_NUM_SEND_DEC(module) \
OPAL_ATOMIC_ADD_FETCH64(&(module)->outstanding_send, -1);
mca_btl_base_descriptor_t *mca_btl_ofi_alloc(
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
uint64_t order, size_t size, uint32_t flags);
int mca_btl_ofi_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des);
int mca_btl_ofi_send (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *descriptor,
mca_btl_base_tag_t tag);
int mca_btl_ofi_recv_frag (mca_btl_ofi_module_t *ofi_btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *context,
mca_btl_ofi_base_frag_t *frag);
struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
opal_convertor_t *convertor,
uint8_t order, size_t reserve,
size_t *size, uint32_t flags);
mca_btl_ofi_frag_completion_t *mca_btl_ofi_frag_completion_alloc
(mca_btl_base_module_t *btl,
mca_btl_ofi_context_t *context,
mca_btl_ofi_base_frag_t *frag,
int type);
static inline mca_btl_ofi_base_frag_t *mca_btl_ofi_frag_alloc (mca_btl_ofi_module_t *ofi_btl, opal_free_list_t *fl,
mca_btl_base_endpoint_t *endpoint)
{
mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t *) opal_free_list_get (fl);
if (OPAL_LIKELY(NULL != frag)) {
frag->free_list = fl;
frag->endpoint = endpoint;
frag->btl = ofi_btl;
}
return frag;
}
static inline void mca_btl_ofi_frag_return (mca_btl_ofi_base_frag_t *frag)
{
opal_free_list_return (frag->free_list, &frag->base.super);
}
static inline void mca_btl_ofi_frag_complete (mca_btl_ofi_base_frag_t *frag, int rc) {
mca_btl_ofi_module_t *ofi_btl = frag->btl;
/* call the local callback if specified */
if (frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
frag->base.des_cbfunc(&ofi_btl->super, frag->endpoint, &frag->base, rc);
}
/* If the BTL has ownership, return the fragment to the free list. */
if (OPAL_LIKELY(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
mca_btl_ofi_frag_return (frag);
}
}
#endif /* !defined(MCA_BTL_OFI_FRAG_H) */
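
The MCA_BTL_OFI_NUM_SEND_INC/DEC macros above implement a simple back-pressure scheme: every posted send bumps a shared counter, and once the counter crosses the component's progress threshold the sender pokes the progress function so completions get drained before more work is queued. A standalone sketch of that idea, not part of this commit; the names and the threshold value are illustrative only:

/* toy_backpressure.c -- model of threshold-triggered progress. */
#include <stdatomic.h>
#include <stdio.h>

#define PROGRESS_THRESHOLD 64

static atomic_long outstanding_send;

/* Stand-in for the component progress loop: the real BTL drains completion
 * queues there, and each completion decrements the counter. */
static void toy_progress(void)
{
    if (atomic_load(&outstanding_send) > 0) {
        atomic_fetch_sub(&outstanding_send, 1);
    }
}

static void toy_post_send(void)
{
    /* ... a real implementation would call fi_send() here ... */
    if (atomic_fetch_add(&outstanding_send, 1) + 1 > PROGRESS_THRESHOLD) {
        toy_progress();   /* keep the number of in-flight sends bounded */
    }
}

int main(void)
{
    for (int i = 0; i < 1000; i++) {
        toy_post_send();
    }
    printf("outstanding after the burst: %ld\n",
           (long) atomic_load(&outstanding_send));
    return 0;
}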

447
opal/mca/btl/ofi/btl_ofi_module.c Normal file

@ -0,0 +1,447 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include <string.h>
#include "opal/class/opal_bitmap.h"
#include "opal/mca/btl/btl.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#include "btl_ofi_frag.h"
static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
size_t nprocs, opal_proc_t **opal_procs,
mca_btl_base_endpoint_t **peers,
opal_bitmap_t *reachable)
{
int rc;
int count;
char *ep_name = NULL;
size_t namelen = mca_btl_ofi_component.namelen;
opal_proc_t *proc;
mca_btl_base_endpoint_t *ep;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
for (size_t i = 0 ; i < nprocs ; ++i) {
proc = opal_procs[i];
/* See if we already have an endpoint for this proc. */
rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void **) &ep);
if (OPAL_SUCCESS == rc) {
BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name)));
peers[i] = ep;
} else {
/* We don't have this endpoint yet, create one */
peers[i] = mca_btl_ofi_endpoint_create (proc, ofi_btl->ofi_endpoint);
BTL_VERBOSE(("creating peer %p", peers[i]));
if (OPAL_UNLIKELY(NULL == peers[i])) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
/* Add this endpoint to the lookup table */
(void) opal_hash_table_set_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, peers[i]);
}
OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version,
&peers[i]->ep_proc->proc_name, (void **)&ep_name, &namelen);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error receiving modex"));
MCA_BTL_OFI_ABORT();
}
/* get peer fi_addr */
count = fi_av_insert(ofi_btl->av, /* Address vector to insert */
ep_name, /* peer name */
1, /* amount to insert */
&peers[i]->peer_addr, /* return peer address here */
0, /* flags */
NULL); /* context */
/* if it succeeded, add this proc and mark it reachable */
if (count == 1) { /* we inserted 1 address. */
opal_list_append (&ofi_btl->endpoints, &peers[i]->super);
opal_bitmap_set_bit(reachable, i);
} else {
BTL_VERBOSE(("fi_av_insert failed with rc = %d", count));
MCA_BTL_OFI_ABORT();
}
}
return OPAL_SUCCESS;
}
static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs,
opal_proc_t **procs, mca_btl_base_endpoint_t **peers)
{
int rc;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_base_endpoint_t *ep;
for (size_t i = 0 ; i < nprocs ; ++i) {
if (peers[i]) {
rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i], (void **) &ep);
if (OPAL_SUCCESS == rc) {
/* remove the address from AV. */
rc = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
if (rc < 0) {
/* The remove failed; this should not happen. */
/* Let's not crash just because we failed to remove an address. */
BTL_ERROR(("fi_av_remove failed with error %d:%s",
rc, fi_strerror(-rc)));
}
/* remove and free MPI endpoint from the list. */
opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
(void) opal_hash_table_remove_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i]);
OBJ_RELEASE(peers[i]);
}
}
}
return OPAL_SUCCESS;
}
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module)
{
if (!module->initialized) {
mca_rcache_base_resources_t rcache_resources;
char *tmp;
(void) asprintf (&tmp, "ofi.%s", module->linux_device_name);
rcache_resources.cache_name = tmp;
rcache_resources.reg_data = (void *) module;
rcache_resources.sizeof_reg = sizeof (mca_btl_ofi_reg_t);
rcache_resources.register_mem = mca_btl_ofi_reg_mem;
rcache_resources.deregister_mem = mca_btl_ofi_dereg_mem;
module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources);
free (tmp);
if (NULL == module->rcache) {
/* something went horribly wrong */
BTL_ERROR(("cannot create rcache"));
MCA_BTL_OFI_ABORT();
}
module->initialized = true;
}
}
/**
* @brief Register a memory region for put/get/atomic operations.
*
* @param btl (IN) BTL module
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
* @param base (IN) Pointer to start of region
* @param size (IN) Size of region
* @param flags (IN) Flags indicating what operation will be performed. Valid
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
* and MCA_BTL_DES_FLAGS_ATOMIC
*
* @returns a memory registration handle valid for both local and remote operations
* @returns NULL if the region could not be registered
*
* This function registers the specified region with the hardware for use with
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
* functions. Care should be taken to not hold an excessive number of registrations
* as they may use limited system/NIC resources.
*/
static struct mca_btl_base_registration_handle_t *
mca_btl_ofi_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
size_t size, uint32_t flags)
{
mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_reg_t *reg;
int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
int rc;
rc = ofi_module->rcache->rcache_register (ofi_module->rcache, base, size, 0, access_flags,
(mca_rcache_base_registration_t **) &reg);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return NULL;
}
return &reg->handle;
}
/**
* @brief Deregister a memory region
*
* @param btl (IN) BTL module region was registered with
* @param handle (IN) BTL registration handle to deregister
*
* This function deregisters the memory region associated with the specified handle. Care
* should be taken to not perform any RDMA or atomic operation on this memory region
* after it is deregistered. It is erroneous to specify a memory handle associated with
* a remote node.
*/
static int mca_btl_ofi_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_reg_t *reg =
(mca_btl_ofi_reg_t *)((intptr_t) handle - offsetof (mca_btl_ofi_reg_t, handle));
(void) ofi_module->rcache->rcache_deregister (ofi_module->rcache, &reg->base);
return OPAL_SUCCESS;
}
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg)
{
int rc;
static uint64_t access_flags = FI_REMOTE_WRITE | FI_REMOTE_READ | FI_READ | FI_WRITE;
mca_btl_ofi_module_t *btl = (mca_btl_ofi_module_t*) reg_data;
mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*) reg;
rc = fi_mr_reg(btl->domain, base, size, access_flags, 0,
(uint64_t) reg, 0, &ur->ur_mr, NULL);
if (0 != rc) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
ur->handle.rkey = fi_mr_key(ur->ur_mr);
ur->handle.desc = fi_mr_desc(ur->ur_mr);
/* In case the provider doesn't support FI_MR_VIRT_ADDR,
* we need to reference the remote address by the distance from base registered
* address. We keep this information to use in rdma/atomic operations. */
if (btl->use_virt_addr) {
ur->handle.base_addr = 0;
} else {
ur->handle.base_addr = base;
}
return OPAL_SUCCESS;
}
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
{
mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*)reg;
if (ur->ur_mr != NULL) {
if (0 != fi_close(&ur->ur_mr->fid)) {
BTL_ERROR(("%s: error unpinning memory mr=%p: %s",
__func__, (void*) ur->ur_mr, strerror(errno)));
return OPAL_ERROR;
}
}
return OPAL_SUCCESS;
}
/*
* Cleanup/release module resources.
*/
int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
{
int i;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *endpoint, *next;
assert(btl);
/* clear the rcache */
if (ofi_btl->rcache) {
mca_rcache_base_module_destroy (ofi_btl->rcache);
ofi_btl->rcache = NULL;
}
/* Close basic ep before closing its attached resources. */
if (NULL != ofi_btl->ofi_endpoint && !ofi_btl->is_scalable_ep) {
fi_close(&ofi_btl->ofi_endpoint->fid);
ofi_btl->ofi_endpoint = NULL;
}
/* loop over all the contexts */
for (i=0; i < ofi_btl->num_contexts; i++) {
mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep);
}
free(ofi_btl->contexts);
if (NULL != ofi_btl->ofi_endpoint) {
fi_close(&ofi_btl->ofi_endpoint->fid);
}
/* close ep before closing av */
if (NULL != ofi_btl->av) {
fi_close(&ofi_btl->av->fid);
}
if (NULL != ofi_btl->domain) {
fi_close(&ofi_btl->domain->fid);
}
if (NULL != ofi_btl->fabric) {
fi_close(&ofi_btl->fabric->fid);
}
if (NULL != ofi_btl->fabric_info) {
fi_freeinfo(ofi_btl->fabric_info);
}
/* clean up any leftover endpoints */
OPAL_LIST_FOREACH_SAFE(endpoint, next, &ofi_btl->endpoints, mca_btl_ofi_endpoint_t) {
opal_list_remove_item (&ofi_btl->endpoints, &endpoint->super);
OBJ_RELEASE(endpoint);
}
OBJ_DESTRUCT(&ofi_btl->endpoints);
OBJ_DESTRUCT(&ofi_btl->id_to_endpoint);
OBJ_DESTRUCT(&ofi_btl->module_lock);
free (btl);
return OPAL_SUCCESS;
}
/* Post wildcard recvs on the rx context. */
int mca_btl_ofi_post_recvs (mca_btl_base_module_t *module,
mca_btl_ofi_context_t *context,
int count)
{
int i;
int rc;
mca_btl_ofi_base_frag_t *frag;
mca_btl_ofi_frag_completion_t *comp;
for (i=0; i < count; i++) {
frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(module,
NULL,
0,
MCA_BTL_OFI_FRAG_SIZE,
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (NULL == frag) {
BTL_ERROR(("cannot allocate recv frag."));
return OPAL_ERROR;
}
comp = mca_btl_ofi_frag_completion_alloc (module,
context,
frag,
MCA_BTL_OFI_TYPE_RECV);
rc = fi_recv (context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE,
NULL, FI_ADDR_UNSPEC, &comp->comp_ctx);
if (FI_SUCCESS != rc) {
BTL_ERROR(("cannot post recvs"));
return OPAL_ERROR;
}
}
return OPAL_SUCCESS;
}
/* Allocate and fill out the module capabilities according to operation mode. */
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode)
{
mca_btl_ofi_module_t *module;
/* allocate module */
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
if (NULL == module) {
return NULL;
}
/* fill in the defaults */
*module = mca_btl_ofi_module_template;
if (mode == MCA_BTL_OFI_MODE_ONE_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
module->super.btl_put = mca_btl_ofi_put;
module->super.btl_get = mca_btl_ofi_get;
module->super.btl_atomic_op = mca_btl_ofi_aop;
module->super.btl_atomic_fop = mca_btl_ofi_afop;
module->super.btl_atomic_cswap = mca_btl_ofi_acswap;
module->super.btl_flush = mca_btl_ofi_flush;
module->super.btl_register_mem = mca_btl_ofi_register_mem;
module->super.btl_deregister_mem = mca_btl_ofi_deregister_mem;
module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS |
MCA_BTL_FLAGS_ATOMIC_OPS |
MCA_BTL_FLAGS_RDMA;
module->super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
MCA_BTL_ATOMIC_SUPPORTS_SWAP |
MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
MCA_BTL_ATOMIC_SUPPORTS_32BIT ;
module->super.btl_put_limit = 1 << 23;
module->super.btl_put_alignment = 0;
module->super.btl_get_limit = 1 << 23;
module->super.btl_get_alignment = 0;
module->super.btl_registration_handle_size =
sizeof(mca_btl_base_registration_handle_t);
}
if (mode == MCA_BTL_OFI_MODE_TWO_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
module->super.btl_alloc = mca_btl_ofi_alloc;
module->super.btl_free = mca_btl_ofi_free;
module->super.btl_prepare_src = mca_btl_ofi_prepare_src;
module->super.btl_send = mca_btl_ofi_send;
module->super.btl_flags |= MCA_BTL_FLAGS_SEND;
module->super.btl_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
module->super.btl_max_send_size = MCA_BTL_OFI_FRAG_SIZE;
module->super.btl_rndv_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
/* If two-sided is enabled, we expect that the user knows exactly what
* they want. We bump the priority to maximum, making this BTL the default. */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
}
if (mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
module->super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
module->super.btl_rdma_pipeline_send_length = 8 * 1024;
}
return module;
}
mca_btl_ofi_module_t mca_btl_ofi_module_template = {
.super = {
.btl_component = &mca_btl_ofi_component.super,
.btl_add_procs = mca_btl_ofi_add_procs,
.btl_del_procs = mca_btl_ofi_del_procs,
.btl_finalize = mca_btl_ofi_finalize,
}
};

159
opal/mca/btl/ofi/btl_ofi_rdma.c Normal file

@ -0,0 +1,159 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ofi_rdma.h"
OBJ_CLASS_INSTANCE(mca_btl_ofi_rdma_completion_t,
opal_free_list_item_t,
NULL,
NULL);
mca_btl_ofi_rdma_completion_t *mca_btl_ofi_rdma_completion_alloc (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *ofi_context,
void *local_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata,
int type)
{
assert(btl);
assert(endpoint);
assert(ofi_context);
mca_btl_ofi_rdma_completion_t *comp;
comp = (mca_btl_ofi_rdma_completion_t*) opal_free_list_get(&ofi_context->rdma_comp_list);
assert(comp);
comp->base.btl = btl;
comp->base.endpoint = endpoint;
comp->base.my_context = ofi_context;
comp->base.my_list = &ofi_context->rdma_comp_list;
comp->base.type = type;
comp->local_address = local_address;
comp->local_handle = local_handle;
comp->cbfunc = cbfunc;
comp->cbcontext = cbcontext;
comp->cbdata = cbdata;
comp->comp_ctx.comp = comp;
return comp;
}
int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
int rc;
mca_btl_ofi_rdma_completion_t *comp;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
/* create completion context */
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_GET);
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* Remote read data across the wire */
rc = fi_read(ofi_context->tx_ctx,
local_address, size, /* payload */
local_handle->desc,
btl_endpoint->peer_addr,
remote_address, remote_handle->rkey,
&comp->comp_ctx); /* completion context */
if (-FI_EAGAIN == rc) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
if (0 != rc) {
BTL_ERROR(("fi_read failed with %d:%s", rc, fi_strerror(-rc)));
MCA_BTL_OFI_ABORT();
}
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
return OPAL_SUCCESS;
}
int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
int rc;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
/* create completion context */
mca_btl_ofi_rdma_completion_t *comp;
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_PUT);
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* Remote write data across the wire */
rc = fi_write(ofi_context->tx_ctx,
local_address, size, /* payload */
local_handle->desc,
btl_endpoint->peer_addr,
remote_address, remote_handle->rkey,
&comp->comp_ctx); /* completion context */
if (-FI_EAGAIN == rc) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
if (0 != rc) {
BTL_ERROR(("fi_write failed with %d:%s", rc, fi_strerror(-rc)));
MCA_BTL_OFI_ABORT();
}
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
return OPAL_SUCCESS;
}
int mca_btl_ofi_flush (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint)
{
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
while(ofi_btl->outstanding_rdma > 0) {
(void) mca_btl_ofi_component.super.btl_progress();
}
return OPAL_SUCCESS;
}
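
Both mca_btl_ofi_put() and mca_btl_ofi_get() above subtract remote_handle->base_addr from the target address before calling fi_write()/fi_read(). The registration code sets base_addr to 0 when the provider takes virtual addresses (FI_MR_VIRT_ADDR) and to the registered base otherwise, so the subtraction yields either the peer's virtual address or an offset into the registered region. A tiny standalone sketch of that arithmetic, not part of this commit; the handle layout and addresses are made up for illustration:

/* toy_rma_target.c -- model of the RMA target-address computation. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

typedef struct {
    uint64_t rkey;
    uint64_t base_addr;    /* 0 when the provider takes virtual addresses */
} toy_reg_handle_t;

static uint64_t target_address(uint64_t remote_vaddr, const toy_reg_handle_t *h)
{
    /* mirrors: remote_address - remote_handle->base_addr */
    return remote_vaddr - h->base_addr;
}

int main(void)
{
    uint64_t remote_buf = UINT64_C(0x7f0000001000);    /* peer's virtual address */

    toy_reg_handle_t virt_mr   = { .rkey = 1, .base_addr = 0 };
    toy_reg_handle_t offset_mr = { .rkey = 2, .base_addr = UINT64_C(0x7f0000000000) };

    printf("virtual addressing: target = 0x%" PRIx64 "\n",
           target_address(remote_buf, &virt_mr));
    printf("offset addressing : target = 0x%" PRIx64 "\n",
           target_address(remote_buf, &offset_mr));
    return 0;
}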

42
opal/mca/btl/ofi/btl_ofi_rdma.h Normal file

@ -0,0 +1,42 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel, Inc, All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef BTL_OFI_RDMA_H
#define BTL_OFI_RDMA_H
#include "opal/threads/thread_usage.h"
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
mca_btl_ofi_rdma_completion_t *mca_btl_ofi_rdma_completion_alloc (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *ofi_context,
void *local_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata,
int type);
#define MCA_BTL_OFI_NUM_RDMA_INC(module) \
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1); \
if (module->outstanding_rdma > mca_btl_ofi_component.progress_threshold){ \
mca_btl_ofi_component.super.btl_progress(); \
}
#define MCA_BTL_OFI_NUM_RDMA_DEC(module) \
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, -1);
#endif /* !defined(BTL_OFI_RDMA_H) */

52
opal/mca/btl/ofi/configure.m4 Normal file

@ -0,0 +1,52 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 QLogic Corp. All rights reserved.
# Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2011-2018 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2018 Intel, inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_opal_btl_ofi_CONFIG([action-if-can-compile], [action-if-cant-compile])
# --------------------------------------------------------
# Check whether the OFI BTL can be built: OFI/libfabric support must be
# found and FI_MR_VIRT_ADDR must be declared.  Runs action-if-can-compile
# if so, otherwise runs action-if-cant-compile.
AC_DEFUN([MCA_opal_btl_ofi_CONFIG],[
OPAL_VAR_SCOPE_PUSH([opal_btl_ofi_happy CPPFLAGS_save])
AC_CONFIG_FILES([opal/mca/btl/ofi/Makefile])
# Check for OFI
OPAL_CHECK_OFI
opal_btl_ofi_happy=0
AS_IF([test "$opal_ofi_happy" = "yes"],
[CPPFLAGS_save=$CPPFLAGS
CPPFLAGS="$opal_ofi_CPPFLAGS $CPPFLAGS"
AC_CHECK_DECL([FI_MR_VIRT_ADDR], [opal_btl_ofi_happy=1], [],
[#include <rdma/fabric.h>])
CPPFLAGS=$CPPFLAGS_save])
AS_IF([test $opal_btl_ofi_happy -eq 1],
[$1],
[$2])
OPAL_VAR_SCOPE_POP
])dnl

7
opal/mca/btl/ofi/owner.txt Normal file

@ -0,0 +1,7 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner:Intel
status:active

opal/mca/btl/usnic/configure.m4

@ -12,7 +12,7 @@
# All rights reserved.
# Copyright (c) 2006 Sandia National Laboratories. All rights
# reserved.
# Copyright (c) 2010-2019 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
@ -100,25 +100,11 @@ AC_DEFUN([_OPAL_BTL_USNIC_DO_CONFIG],[
OPAL_CHECK_OFI
opal_btl_usnic_happy=$opal_ofi_happy])
# The usnic BTL requires at least OFI libfabric v1.1 (there was a
# critical bug in libfabric v1.0).
# The usnic BTL requires at least OFI libfabric v1.3.
AS_IF([test "$opal_btl_usnic_happy" = "yes"],
[AC_MSG_CHECKING([whether OFI libfabric is >= v1.1])
opal_btl_usnic_CPPFLAGS_save=$CPPFLAGS
CPPFLAGS="$opal_ofi_CPPFLAGS $CPPFLAGS"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <rdma/fabric.h>]],
[[
#if !defined(FI_MAJOR_VERSION)
#error your version of OFI libfabric is too old
#elif FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION) < FI_VERSION(1, 1)
#error your version of OFI libfabric is too old
#endif
]])],
[opal_btl_usnic_happy=yes],
[opal_btl_usnic_happy=no])
AC_MSG_RESULT([$opal_btl_usnic_happy])
CPPFLAGS=$opal_btl_usnic_CPPFLAGS_save
])
[OPAL_CHECK_OFI_VERSION_GE([1,3],
[],
[opal_btl_usnic_happy=no])])
# Make sure we can find the OFI libfabric usnic extensions header
AS_IF([test "$opal_btl_usnic_happy" = "yes" ],

106
opal/mca/common/ofi/Makefile.am Normal file

@ -0,0 +1,106 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2013 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2019 Hewlett Packard Enterprise. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# A word of explanation...
#
# This library is linked against various MCA components because the
# support for OFI is needed in various places.
#
# Note that building this common component statically and linking
# against other dynamic components is *not* supported!
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
# Header files
headers = \
common_ofi.h
# Source files
sources = \
common_ofi.c
# As per above, we'll either have an installable or noinst result.
# The installable one should follow the same MCA prefix naming rules
# (i.e., libmca_<type>_<name>.la). The noinst one can be named
# whatever it wants, although libmca_<type>_<name>_noinst.la is
# recommended.
# To simplify components that link to this library, we will *always*
# have an output libtool library named libmca_<type>_<name>.la -- even
# for case 2) described above (i.e., so there's no conditional logic
# necessary in component Makefile.am's that link to this library).
# Hence, if we're creating a noinst version of this library (i.e.,
# case 2), we sym link it to the libmca_<type>_<name>.la name
# (libtool will do the Right Things under the covers). See the
# all-local and clean-local rules, below, for how this is effected.
lib_LTLIBRARIES =
noinst_LTLIBRARIES =
comp_inst = lib@OPAL_LIB_PREFIX@mca_common_ofi.la
comp_noinst = lib@OPAL_LIB_PREFIX@mca_common_ofi_noinst.la
if MCA_BUILD_opal_common_ofi_DSO
lib_LTLIBRARIES += $(comp_inst)
else
noinst_LTLIBRARIES += $(comp_noinst)
endif
lib@OPAL_LIB_PREFIX@mca_common_ofi_la_SOURCES = $(headers) $(sources)
lib@OPAL_LIB_PREFIX@mca_common_ofi_la_LDFLAGS = \
$(opal_ofi_LDFLAGS) \
-version-info $(libmca_opal_common_ofi_so_version)
lib@OPAL_LIB_PREFIX@mca_common_ofi_la_LIBADD = $(opal_ofi_LIBS)
lib@OPAL_LIB_PREFIX@mca_common_ofi_noinst_la_SOURCES = $(headers) $(sources)
lib@OPAL_LIB_PREFIX@mca_common_ofi_noinst_la_LDFLAGS = $(opal_ofi_LDFLAGS)
lib@OPAL_LIB_PREFIX@mca_common_ofi_noinst_la_LIBADD = $(opal_ofi_LIBS)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
opaldir = $(opalincludedir)/$(subdir)
opal_HEADERS = $(headers)
endif
# These two rules will sym link the "noinst" libtool library filename
# to the installable libtool library filename in the case where we are
# compiling this component statically (case 2), described above).
V=0
OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V)
ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY)
ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`;
all-local:
$(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \
rm -f "$(comp_inst)"; \
$(LN_S) "$(comp_noinst)" "$(comp_inst)"; \
fi
clean-local:
if test -z "$(lib_LTLIBRARIES)"; then \
rm -f "$(comp_inst)"; \
fi

311
opal/mca/common/ofi/common_ofi.c Normal file

@ -0,0 +1,311 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/mca/hwloc/base/base.h"
#include <errno.h>
#include <unistd.h>
#include "common_ofi.h"
int mca_common_ofi_register_mca_variables(void)
{
if (fi_version() >= FI_VERSION(1,0)) {
return OPAL_SUCCESS;
} else {
return OPAL_ERROR;
}
}
/* check that the tx attributes match */
static int
check_tx_attr(struct fi_tx_attr *provider_info,
struct fi_tx_attr *provider)
{
if (!(provider->msg_order & ~(provider_info->msg_order)) &&
!(provider->op_flags & ~(provider_info->op_flags)) &&
(provider->inject_size == provider_info->inject_size)) {
return 0;
} else {
return OPAL_ERROR;
}
}
/* check that the rx attributes match */
static int
check_rx_attr(struct fi_rx_attr *provider_info,
struct fi_rx_attr *provider)
{
if (!(provider->msg_order & ~(provider_info->msg_order)) &&
!(provider->op_flags & ~(provider_info->op_flags))) {
return 0;
} else {
return OPAL_ERROR;
}
}
/* check that the ep attributes match */
static int
check_ep_attr(struct fi_ep_attr *provider_info,
struct fi_ep_attr *provider)
{
if (!(provider->type & ~(provider_info->type)) &&
!(provider->mem_tag_format & ~(provider_info->mem_tag_format)) &&
(provider->max_msg_size == provider_info->max_msg_size) &&
(provider->tx_ctx_cnt == provider_info->tx_ctx_cnt) &&
(provider->rx_ctx_cnt == provider_info->rx_ctx_cnt)) {
return 0;
} else {
return OPAL_ERROR;
}
}
/* check that the provider attributes match */
static int
check_provider_attr(struct fi_info *provider_info,
struct fi_info *provider)
{
/* make sure both info objects are from the same provider and provide the same attributes */
if (0 == strcmp(provider_info->fabric_attr->prov_name, provider->fabric_attr->prov_name) &&
!check_tx_attr(provider_info->tx_attr, provider->tx_attr) &&
!check_rx_attr(provider_info->rx_attr, provider->rx_attr) &&
!check_ep_attr(provider_info->ep_attr, provider->ep_attr) &&
!(provider_info->caps & ~(provider->caps)) &&
!(provider_info->mode & ~(provider->mode))) {
return 0;
} else {
return OPAL_ERROR;
}
}
#if OPAL_OFI_PCI_DATA_AVAILABLE
/* Check if a process and a pci device share the same cpuset
* @param (IN) pci struct fi_pci_attr pci device attributes,
* used to find hwloc object for device.
*
* @param (IN) topology hwloc_topology_t topology to get the cpusets
* from
*
* @param (OUT) returns true if cpusets match and false if
* cpusets do not match or an error prevents comparison
*
* Uses a pci device to find an ancestor that contains a cpuset, and
* determines if it intersects with the cpuset that the process is bound to.
* If the process is not bound, or if a cpuset is unavailable for whatever
* reason, this returns false. Otherwise, it returns the result of
* hwloc_bitmap_intersects().
*/
static bool
compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
{
bool result = false;
int ret;
hwloc_bitmap_t proc_cpuset;
hwloc_obj_t obj = NULL;
/* Cannot find topology info if no topology is found */
if (NULL == topology) {
return false;
}
/* Allocate memory for proc_cpuset */
proc_cpuset = hwloc_bitmap_alloc();
if (NULL == proc_cpuset) {
return false;
}
/* Fill cpuset with the collection of cpu cores that the process runs on */
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
if (0 > ret) {
goto error;
}
/* Get the pci device from bdf */
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id,
pci.device_id, pci.function_id);
if (NULL == obj) {
goto error;
}
/* pcidev objects don't have cpusets so find the first non-io object above */
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
if (NULL != obj) {
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
}
error:
hwloc_bitmap_free(proc_cpuset);
return result;
}
#endif
/* Count providers returns the number of providers present in an fi_info list
* @param (IN) provider_list struct fi_info* list of providers available
*
* @param (OUT) int number of providers present in the list
*
* returns 0 if the list is NULL
*/
static int
count_providers(struct fi_info* provider_list)
{
struct fi_info* dev = provider_list;
int num_provider = 0;
while (NULL != dev) {
num_provider++;
dev = dev->next;
}
return num_provider;
}
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
*
* Initializes opal_hwloc_topology to access hardware topology if not previously
* initialized
*
* There are 3 main cases that this covers:
*
* 1. If the first provider passed into this function is the only valid
* provider, this provider is returned.
*
* 2. If there is more than one provider that matches the type of the first
* provider in the list, and the BDF data is available, then a provider is
* selected based on the locality of the device cpuset and the process cpuset,
* trying to ensure that processes are distributed evenly across NICs.
* This has two separate cases:
*
* i. There is one or more provider local to the process:
*
* (local rank % number of providers of the same type that share the process cpuset)
* is used to select one of these providers.
*
* ii. There is no provider that is local to the process:
*
* (local rank % number of providers of the same type)
* is used to select one of these providers
*
* 3. If there is more than one provider of the same type in the list, and the BDF data
* is not available (the OFI version does not support fi_info.nic or the
* provider does not support BDF), then (local rank % number of providers of the same type)
* is used to select one of these providers
*
* @param provider_list (IN) struct fi_info* An initially selected
* provider NIC. The provider name and
* attributes are used to restrict NIC
* selection. This provider is returned if the
* NIC selection fails.
*
* @param local_index (IN) int The local rank of the process. Used to
* select one valid NIC if there is a case
* where more than one can be selected. This
* could occur when more than one provider
* shares the same cpuset as the process.
*
* @param provider (OUT) struct fi_info* object with the selected
* provider if the selection succeeds;
* if the selection fails, the fi_info object that was
* initially provided is returned.
*
* All errors should be recoverable and will return the initially provided
* provider. However, if an error occurs we can no longer guarantee
* that the provider returned is local to the process or that the processes will
* balance across available NICs.
*/
struct fi_info*
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
{
struct fi_info *provider = provider_list, *current_provider = provider_list;
struct fi_info **provider_table;
#if OPAL_OFI_PCI_DATA_AVAILABLE
struct fi_pci_attr pci;
#endif
int ret;
unsigned int num_provider = 0, provider_limit = 0;
bool provider_found = false, cpusets_match = false;
/* Initialize opal_hwloc_topology if it is not already */
ret = opal_hwloc_base_get_topology();
if (0 > ret) {
/* Provider selection can continue but there is no guarantee of locality */
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d:Failed to initialize topology\n",
__FILE__, __LINE__);
}
provider_limit = count_providers(provider_list);
/* Allocate memory for provider table */
provider_table = calloc(provider_limit, sizeof(struct fi_info*));
if (NULL == provider_table) {
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d:Failed to allocate memory for provider table\n",
__FILE__, __LINE__);
return provider_list;
}
current_provider = provider;
/* Cycle through the remaining fi_info objects, looking for matching providers */
while (NULL != current_provider) {
if (!check_provider_attr(provider, current_provider)) {
cpusets_match = false;
#if OPAL_OFI_PCI_DATA_AVAILABLE
if (NULL != current_provider->nic) {
pci = current_provider->nic->bus_attr->attr.pci;
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
}
#endif
/* Reset the list if the cpusets match and no other provider was
* found on the same cpuset as the process.
*/
if (cpusets_match && !provider_found) {
provider_found = true;
num_provider = 0;
}
/* Add the provider to the provider list if the cpusets match or if
* no other provider was found on the same cpuset as the process.
*/
if (cpusets_match || !provider_found) {
provider_table[num_provider] = current_provider;
num_provider++;
}
}
current_provider = current_provider->next;
}
/* Select provider from local rank % number of providers */
if (num_provider > 0) {
provider = provider_table[local_index % num_provider];
}
#if OPAL_OFI_PCI_DATA_AVAILABLE
if (NULL != provider->nic) {
pci = provider->nic->bus_attr->attr.pci;
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
}
#endif
#if OPAL_ENABLE_DEBUG
opal_output_verbose(1, opal_common_ofi.output,
"local rank: %d device: %s cpusets match: %s\n",
local_index, provider->domain_attr->name,
cpusets_match ? "true" : "false");
#endif
free(provider_table);
return provider;
}
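
A minimal usage sketch for the selection helper above, not part of this commit: fetch a provider list with fi_getinfo(), then let the helper pick the NIC closest to this process. The hints, API version, and local_rank value below are placeholders chosen for illustration, not requirements of the interface:

/* toy_select_nic.c -- sketch of calling opal_mca_common_ofi_select_provider(). */
#include <stdio.h>
#include <rdma/fabric.h>
#include "common_ofi.h"   /* declares opal_mca_common_ofi_select_provider() */

int main(void)
{
    struct fi_info *hints, *providers = NULL, *selected;
    int local_rank = 0;               /* placeholder: normally comes from the runtime */

    hints = fi_allocinfo();
    if (NULL == hints) {
        return 1;
    }
    hints->caps = FI_RMA | FI_ATOMIC; /* example capability filter */

    if (0 != fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, hints, &providers)) {
        fi_freeinfo(hints);
        return 1;
    }
    fi_freeinfo(hints);

    /* falls back to the head of the list if locality data is unavailable */
    selected = opal_mca_common_ofi_select_provider(providers, local_rank);
    printf("selected domain: %s\n", selected->domain_attr->name);

    fi_freeinfo(providers);           /* frees the whole list, 'selected' included */
    return 0;
}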

21
opal/mca/common/ofi/common_ofi.h Normal file

@ -0,0 +1,21 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_MCA_COMMON_OFI_H
#define OPAL_MCA_COMMON_OFI_H
#include <rdma/fabric.h>
OPAL_DECLSPEC int mca_common_ofi_register_mca_variables(void);
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
#endif /* OPAL_MCA_COMMON_OFI_H */

35
opal/mca/common/ofi/configure.m4 Normal file

@ -0,0 +1,35 @@
# -*- shell-script -*-
#
# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
# Copyright (c) 2013 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2019 Hewlett Packard Enterprise. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AC_DEFUN([MCA_opal_common_ofi_CONFIG],[
AC_CONFIG_FILES([opal/mca/common/ofi/Makefile])
OPAL_CHECK_OFI
# Note that $opal_common_ofi_happy is
# used in other configure.m4's to know if ofi configured
# successfully.
AS_IF([test "$opal_ofi_happy" = "yes"],
[opal_common_ofi_happy=yes
common_ofi_WRAPPER_EXTRA_LDFLAGS=$opal_ofi_LDFLAGS
common_ofi_WRAPPER_EXTRA_LIBS=$opal_ofi_LIBS
$1],
[opal_common_ofi_happy=no
$2])
])dnl

7
opal/mca/common/ofi/owner.txt Normal file

@ -0,0 +1,7 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: Hewlett Packard Enterprise
status:active