Merge pull request #7807 from bwbarrett/backports/v4.1.x-ofi-updates
Backport OFI changes from master to v4.1.x
Этот коммит содержится в:
Коммит
a4e8f2b2cb
2
.mailmap
2
.mailmap
@ -111,3 +111,5 @@ Geoffrey Paulsen <gpaulsen@us.ibm.com> <gpaulsen@users.noreply.github.com>
|
||||
Anandhi S Jayakumar <anandhi.s.jayakumar@intel.com>
|
||||
|
||||
Mohan Gandhi <mohgan@amazon.com>
|
||||
|
||||
Harumi Kuno <harumi.kuno@hpe.com>
|
||||
|
4
NEWS
4
NEWS
@ -60,6 +60,10 @@ included in the vX.Y.Z section and be denoted as:
|
||||
4.1.0 -- July, 2020
|
||||
-------------------
|
||||
|
||||
- OFI/libfabric: Added support for multiple NICs
|
||||
- OFI/libfabric: Added support for Scalable Endpoints
|
||||
- OFI/libfabric: Added btl for one-sided support
|
||||
|
||||
4.0.4 -- June, 2020
|
||||
-----------------------
|
||||
- Fix a memory patcher issue intercepting shmat and shmdt. This was
|
||||
|
1
VERSION
1
VERSION
@ -116,6 +116,7 @@ libmca_orte_common_alps_so_version=70:0:30
|
||||
|
||||
# OPAL layer
|
||||
libmca_opal_common_cuda_so_version=70:0:30
|
||||
libmca_opal_common_ofi_so_version=10:0:0
|
||||
libmca_opal_common_sm_so_version=70:0:30
|
||||
libmca_opal_common_ucx_so_version=70:0:30
|
||||
libmca_opal_common_ugni_so_version=70:0:30
|
||||
|
@ -1,6 +1,6 @@
|
||||
dnl -*- shell-script -*-
|
||||
dnl
|
||||
dnl Copyright (c) 2015-2019 Cisco Systems, Inc. All rights reserved.
|
||||
dnl Copyright (c) 2015-2020 Cisco Systems, Inc. All rights reserved.
|
||||
dnl Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights
|
||||
dnl reserved.
|
||||
dnl $COPYRIGHT$
|
||||
@ -10,6 +10,45 @@ dnl
|
||||
dnl $HEADER$
|
||||
dnl
|
||||
|
||||
dnl
|
||||
dnl OPAL_CHECK_OFI_VERSION_GE
|
||||
dnl
|
||||
dnl Check that the OFI API version number is >= a specific value.
|
||||
dnl
|
||||
dnl $1: version number to compare, in the form of "major,minor"
|
||||
dnl (without quotes) -- i.e., a single token representing the
|
||||
dnl arguments to FI_VERSION()
|
||||
dnl $2: action if OFI API version is >= $1
|
||||
dnl $3: action if OFI API version is < $1
|
||||
AC_DEFUN([OPAL_CHECK_OFI_VERSION_GE],[
|
||||
OPAL_VAR_SCOPE_PUSH([opal_ofi_ver_ge_save_CPPFLAGS opal_ofi_ver_ge_happy])
|
||||
|
||||
AC_MSG_CHECKING([if OFI API version number is >= $1])
|
||||
opal_ofi_ver_ge_save_CPPFLAGS=$CPPFLAGS
|
||||
CPPFLAGS=$opal_ofi_CPPFLAGS
|
||||
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <rdma/fabric.h>]],
|
||||
[[
|
||||
#if !defined(FI_MAJOR_VERSION)
|
||||
#error "we cannot check the version -- sad panda"
|
||||
#elif FI_VERSION_LT(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION($1))
|
||||
#error "version is too low -- nopes"
|
||||
#endif
|
||||
]])],
|
||||
[opal_ofi_ver_ge_happy=1],
|
||||
[opal_ofi_ver_ge_happy=0])
|
||||
|
||||
AS_IF([test $opal_ofi_ver_ge_happy -eq 1],
|
||||
[AC_MSG_RESULT([yes])
|
||||
$2],
|
||||
[AC_MSG_RESULT([no])
|
||||
$3])
|
||||
|
||||
CPPFLAGS=$opal_ofi_ver_ge_save_CPPFLAGS
|
||||
|
||||
OPAL_VAR_SCOPE_POP
|
||||
])dnl
|
||||
|
||||
dnl
|
||||
dnl _OPAL_CHECK_OFI
|
||||
dnl --------------------------------------------------------
|
||||
@ -50,10 +89,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
|
||||
OPAL_CHECK_WITHDIR([ofi-libdir], [$with_ofi_libdir],
|
||||
[libfabric.*])
|
||||
|
||||
OPAL_VAR_SCOPE_PUSH([opal_check_ofi_save_CPPFLAGS opal_check_ofi_save_LDFLAGS opal_check_ofi_save_LIBS])
|
||||
OPAL_VAR_SCOPE_PUSH([opal_check_ofi_save_CPPFLAGS opal_check_ofi_save_LDFLAGS opal_check_ofi_save_LIBS opal_check_fi_info_pci])
|
||||
opal_check_ofi_save_CPPFLAGS=$CPPFLAGS
|
||||
opal_check_ofi_save_LDFLAGS=$LDFLAGS
|
||||
opal_check_ofi_save_LIBS=$LIBS
|
||||
opal_check_fi_info_pci=0
|
||||
|
||||
opal_ofi_happy=yes
|
||||
AS_IF([test "$with_ofi" = "no"],
|
||||
@ -81,6 +121,16 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
|
||||
[],
|
||||
[opal_ofi_happy=no])])
|
||||
|
||||
AS_IF([test $opal_ofi_happy = yes],
|
||||
[AC_CHECK_MEMBER([struct fi_info.nic],
|
||||
[opal_check_fi_info_pci=1],
|
||||
[opal_check_fi_info_pci=0],
|
||||
[[#include "$with_ofi/include/rdma/fabric.h"]])])
|
||||
|
||||
AC_DEFINE_UNQUOTED([OPAL_OFI_PCI_DATA_AVAILABLE],
|
||||
[$opal_check_fi_info_pci],
|
||||
[check if pci data is available in ofi])
|
||||
|
||||
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
|
||||
LDFLAGS=$opal_check_ofi_save_LDFLAGS
|
||||
LIBS=$opal_check_ofi_save_LIBS
|
||||
|
@ -154,6 +154,7 @@ AC_SUBST(libopen_pal_so_version)
|
||||
# transparently by adding some intelligence in autogen.pl
|
||||
# and/or opal_mca.m4, but I don't have the cycles to do this
|
||||
# right now.
|
||||
AC_SUBST(libmca_opal_common_ofi_so_version)
|
||||
AC_SUBST(libmca_opal_common_cuda_so_version)
|
||||
AC_SUBST(libmca_opal_common_sm_so_version)
|
||||
AC_SUBST(libmca_opal_common_ugni_so_version)
|
||||
|
5
ompi/mca/mtl/ofi/.gitignore
поставляемый
Обычный файл
5
ompi/mca/mtl/ofi/.gitignore
поставляемый
Обычный файл
@ -0,0 +1,5 @@
|
||||
mtl_ofi_improbe_opt.c
|
||||
mtl_ofi_iprobe_opt.c
|
||||
mtl_ofi_irecv_opt.c
|
||||
mtl_ofi_isend_opt.c
|
||||
mtl_ofi_send_opt.c
|
@ -14,21 +14,50 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
EXTRA_DIST = post_configure.sh
|
||||
EXTRA_DIST = post_configure.sh \
|
||||
$(generated_source_modules)
|
||||
|
||||
MAINTAINERCLEANFILES = \
|
||||
$(generated_sources)
|
||||
|
||||
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
|
||||
|
||||
dist_ompidata_DATA = help-mtl-ofi.txt
|
||||
|
||||
generated_source_modules = \
|
||||
mtl_ofi_send_opt.pm \
|
||||
mtl_ofi_isend_opt.pm \
|
||||
mtl_ofi_irecv_opt.pm \
|
||||
mtl_ofi_iprobe_opt.pm \
|
||||
mtl_ofi_improbe_opt.pm
|
||||
|
||||
generated_sources = \
|
||||
mtl_ofi_send_opt.c \
|
||||
mtl_ofi_isend_opt.c \
|
||||
mtl_ofi_irecv_opt.c \
|
||||
mtl_ofi_iprobe_opt.c \
|
||||
mtl_ofi_improbe_opt.c
|
||||
|
||||
mtl_ofi_sources = \
|
||||
mtl_ofi.h \
|
||||
mtl_ofi.c \
|
||||
mtl_ofi_compat.h \
|
||||
mtl_ofi_component.c \
|
||||
mtl_ofi_endpoint.h \
|
||||
mtl_ofi_endpoint.c \
|
||||
mtl_ofi_request.h \
|
||||
mtl_ofi_types.h
|
||||
mtl_ofi.h \
|
||||
mtl_ofi.c \
|
||||
mtl_ofi_compat.h \
|
||||
mtl_ofi_component.c \
|
||||
mtl_ofi_endpoint.h \
|
||||
mtl_ofi_endpoint.c \
|
||||
mtl_ofi_request.h \
|
||||
mtl_ofi_types.h \
|
||||
mtl_ofi_opt.h \
|
||||
$(generated_sources)
|
||||
|
||||
# A number of files are generated from macro expansion to minimize
|
||||
# branches in the critical path. These files have perl modules with the suffix
|
||||
# .pm that generate the corresponding .c file with all possible branches as
|
||||
# their own function and symbol. Additional input
|
||||
# files should be added to generated_source_modules, as well as adding
|
||||
# their .c variants to generated_sources.
|
||||
%.c : %.pm;
|
||||
$(PERL) -I$(top_srcdir)/ompi/mca/mtl/ofi $(top_srcdir)/ompi/mca/mtl/ofi/generate-opt-funcs.pl $@
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
@ -49,6 +78,7 @@ mca_mtl_ofi_la_LDFLAGS = \
|
||||
$(opal_ofi_LDFLAGS) \
|
||||
-module -avoid-version
|
||||
mca_mtl_ofi_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
|
||||
$(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la \
|
||||
$(opal_ofi_LIBS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
|
@ -1,5 +1,5 @@
|
||||
OFI MTL
|
||||
|
||||
OFI MTL:
|
||||
--------
|
||||
The OFI MTL supports Libfabric (a.k.a. Open Fabrics Interfaces OFI,
|
||||
https://ofiwg.github.io/libfabric/) tagged APIs (fi_tagged(3)). At
|
||||
initialization time, the MTL queries libfabric for providers supporting tag matching
|
||||
@ -9,6 +9,7 @@ The user may modify the OFI provider selection with mca parameters
|
||||
mtl_ofi_provider_include or mtl_ofi_provider_exclude.
|
||||
|
||||
PROGRESS:
|
||||
---------
|
||||
The MTL registers a progress function to opal_progress. There is currently
|
||||
no support for asynchronous progress. The progress function reads multiple events
|
||||
from the OFI provider Completion Queue (CQ) per iteration (defaults to 100, can be
|
||||
@ -16,12 +17,14 @@ modified with the mca mtl_ofi_progress_event_cnt) and iterates until the
|
||||
completion queue is drained.
|
||||
|
||||
COMPLETIONS:
|
||||
------------
|
||||
Each operation uses a request type ompi_mtl_ofi_request_t which includes a reference
|
||||
to an operation specific completion callback, an MPI request, and a context. The
|
||||
to an operation specific completion callback, an MPI request, and a context. The
|
||||
context (fi_context) is used to map completion events with MPI_requests when reading the
|
||||
CQ.
|
||||
|
||||
OFI TAG:
|
||||
--------
|
||||
MPI needs to send 96 bits of information per message (32 bits communicator id,
|
||||
32 bits source rank, 32 bits MPI tag) but OFI only offers 64 bits tags. In
|
||||
addition, the OFI MTL uses 2 bits of the OFI tag for the synchronous send protocol.
|
||||
@ -67,3 +70,271 @@ This is signaled in mem_tag_format (see fi_endpoint(3)) by setting higher order
|
||||
to zero. In such cases, the OFI MTL will reduce the number of communicator ids supported
|
||||
by reducing the bits available for the communicator ID field in the OFI tag.
|
||||
|
||||
SCALABLE ENDPOINTS:
|
||||
-------------------
|
||||
OFI MTL supports OFI Scalable Endpoints (SEP) feature as a means to improve
|
||||
multi-threaded application throughput and message rate. Currently the feature
|
||||
is designed to utilize multiple TX/RX contexts exposed by the OFI provider in
|
||||
conjunction with a multi-communicator MPI application model. Therefore, new OFI
|
||||
contexts are created as and when communicators are duplicated in a lazy fashion
|
||||
instead of creating them all at once during init time and this approach also
|
||||
favours only creating as many contexts as needed.
|
||||
|
||||
1. Multi-communicator model:
|
||||
With this approach, the MPI application is requried to first duplicate
|
||||
the communicators it wants to use with MPI operations (ideally creating
|
||||
as many communicators as the number of threads it wants to use to call
|
||||
into MPI). The duplicated communicators are then used by the
|
||||
corresponding threads to perform MPI operations. A possible usage
|
||||
scenario could be in an MPI + OMP application as follows
|
||||
(example limited to 2 ranks):
|
||||
|
||||
MPI_Comm dup_comm[n];
|
||||
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
|
||||
for (i = 0; i < n; i++) {
|
||||
MPI_Comm_dup(MPI_COMM_WORLD, &dup_comm[i]);
|
||||
}
|
||||
if (rank == 0) {
|
||||
#pragma omp parallel for private(host_sbuf, host_rbuf) num_threads(n)
|
||||
for (i = 0; i < n ; i++) {
|
||||
MPI_Send(host_sbuf, MYBUFSIZE, MPI_CHAR,
|
||||
1, MSG_TAG, dup_comm[i]);
|
||||
MPI_Recv(host_rbuf, MYBUFSIZE, MPI_CHAR,
|
||||
1, MSG_TAG, dup_comm[i], &status);
|
||||
}
|
||||
} else if (rank == 1) {
|
||||
#pragma omp parallel for private(status, host_sbuf, host_rbuf) num_threads(n)
|
||||
for (i = 0; i < n ; i++) {
|
||||
MPI_Recv(host_rbuf, MYBUFSIZE, MPI_CHAR,
|
||||
0, MSG_TAG, dup_comm[i], &status);
|
||||
MPI_Send(host_sbuf, MYBUFSIZE, MPI_CHAR,
|
||||
0, MSG_TAG, dup_comm[i]);
|
||||
}
|
||||
}
|
||||
|
||||
2. MCA variables:
|
||||
To utilize the feature, the following MCA variables need to be set:
|
||||
mtl_ofi_enable_sep:
|
||||
This MCA variable needs to be set to enable the use of Scalable Endpoints (SEP)
|
||||
feature in the OFI MTL. The underlying provider is also checked to ensure the
|
||||
feature is supported. If the provider chosen does not support it, user needs
|
||||
to either set this variable to 0 or select a different provider which supports
|
||||
the feature.
|
||||
For single-threaded applications one OFI context is sufficient, so OFI SEPs
|
||||
may not add benefit.
|
||||
Note that mtl_ofi_thread_grouping (see below) needs to be enabled to use the
|
||||
different OFI SEP contexts. Otherwise, only one context (ctxt 0) will be used.
|
||||
|
||||
Default: 0
|
||||
|
||||
Command-line syntax:
|
||||
"-mca mtl_ofi_enable_sep 1"
|
||||
|
||||
mtl_ofi_thread_grouping:
|
||||
Turn Thread Grouping feature on. This is needed to use the Multi-communicator
|
||||
model explained above. This means that the OFI MTL will use the communicator
|
||||
ID to decide the SEP contexts to be used by the thread. In this way, each
|
||||
thread will have direct access to different OFI resources. If disabled,
|
||||
only context 0 will be used.
|
||||
Requires mtl_ofi_enable_sep to be set to 1.
|
||||
|
||||
Default: 0
|
||||
|
||||
It is not recommended to set the MCA variable for:
|
||||
- Multi-threaded MPI applications not following multi-communicator approach.
|
||||
- Applications that have multiple threads using a single communicator as
|
||||
it may degrade performance.
|
||||
|
||||
Command-line syntax:
|
||||
"-mca mtl_ofi_thread_grouping 1"
|
||||
|
||||
mtl_ofi_num_ctxts:
|
||||
This MCA variable allows user to set the number of OFI SEP contexts the
|
||||
application expects to use. For multi-threaded applications using Thread
|
||||
Grouping feature, this number should be set to the number of user threads
|
||||
that will call into MPI. This variable will only have effect if
|
||||
mtl_ofi_enable_sep is set to 1.
|
||||
|
||||
Default: 1
|
||||
|
||||
Command-line syntax:
|
||||
"-mca mtl_ofi_num_ctxts N" [ N: number of OFI contexts required by
|
||||
application ]
|
||||
|
||||
3. Notes on performance:
|
||||
- OFI MTL will create as many TX/RX contexts as set by MCA mtl_ofi_num_ctxts.
|
||||
The number of contexts that can be created is also limited by the underlying
|
||||
provider as each provider may have different thresholds. Once the threshold
|
||||
is exceeded, contexts are used in a round-robin fashion which leads to
|
||||
resource sharing among threads. Therefore locks are required to guard
|
||||
against race conditions. For performance, it is recommended to have
|
||||
|
||||
Number of threads = Number of communicators = Number of contexts
|
||||
|
||||
For example, when using PSM2 provider, the number of contexts is dictated
|
||||
by the Intel Omni-Path HFI1 driver module.
|
||||
|
||||
- OPAL layer allows for multiple threads to enter progress simultaneously. To
|
||||
enable this feature, user needs to set MCA variable
|
||||
"max_thread_in_progress". When using Thread Grouping feature, it is
|
||||
recommended to set this MCA parameter to the number of threads expected to
|
||||
call into MPI as it provides performance benefits.
|
||||
|
||||
Command-line syntax:
|
||||
"-mca opal_max_thread_in_progress N" [ N: number of threads expected to
|
||||
make MPI calls ]
|
||||
Default: 1
|
||||
|
||||
- For applications using a single thread with multiple communicators and MCA
|
||||
variable "mtl_ofi_thread_grouping" set to 1, the MTL will use multiple
|
||||
contexts, but the benefits may be negligible as only one thread is driving
|
||||
progress.
|
||||
|
||||
SPECIALIZED FUNCTIONS:
|
||||
-------------------
|
||||
To improve performance when calling message passing APIs in the OFI mtl
|
||||
specialized functions are generated at compile time that eliminate all the
|
||||
if conditionals that can be determined at init and don't need to be
|
||||
queried again during the critical path. These functions are generated by
|
||||
perl scripts during make which generate functions and symbols for every
|
||||
combination of flags for each function.
|
||||
|
||||
1. ADDING NEW FLAGS FOR SPECIALIZATION OF EXISTING FUNCTION:
|
||||
To add a new flag to an existing specialized function for handling cases
|
||||
where different OFI providers may or may not support a particular feature,
|
||||
then you must follow these steps:
|
||||
1) Update the "_generic" function in mtl_ofi.h with the new flag and
|
||||
the if conditionals to read the new value.
|
||||
2) Update the *.pm file corresponding to the function with the new flag in:
|
||||
gen_funcs(), gen_*_function(), & gen_*_sym_init()
|
||||
3) Update mtl_ofi_opt.h with:
|
||||
The new flag as #define NEW_FLAG_TYPES #NUMBER_OF_STATES
|
||||
example: #define OFI_CQ_DATA 2 (only has TRUE/FALSE states)
|
||||
Update the function's types with:
|
||||
#define OMPI_MTL_OFI_FUNCTION_TYPES [NEW_FLAG_TYPES]
|
||||
|
||||
2. ADDING A NEW FUNCTION FOR SPECIALIZATION:
|
||||
To add a new function to be specialized you must
|
||||
follow these steps:
|
||||
1) Create a new mtl_ofi_"function_name"_opt.pm based off opt_common/mtl_ofi_opt.pm.template
|
||||
2) Add new .pm file to generated_source_modules in Makefile.am
|
||||
3) Add .c file to generated_sources in Makefile.am named the same as the corresponding .pm file
|
||||
4) Update existing or create function in mtl_ofi.h to _generic with new flags.
|
||||
5) Update mtl_ofi_opt.h with:
|
||||
a) New function types: #define OMPI_MTL_OFI_FUNCTION_TYPES [FLAG_TYPES]
|
||||
b) Add new function to the struct ompi_mtl_ofi_symtable:
|
||||
struct ompi_mtl_ofi_symtable {
|
||||
...
|
||||
int (*ompi_mtl_ofi_FUNCTION OMPI_MTL_OFI_FUNCTION_TYPES )
|
||||
}
|
||||
c) Add new symbol table init function definition:
|
||||
void ompi_mtl_ofi_FUNCTION_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
|
||||
6) Add calls to init the new function in the symbol table and assign the function
|
||||
pointer to be used based off the flags in mtl_ofi_component.c:
|
||||
ompi_mtl_ofi_FUNCTION_symtable_init(&ompi_mtl_ofi.sym_table);
|
||||
ompi_mtl_ofi.base.mtl_FUNCTION =
|
||||
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_FUNCTION[ompi_mtl_ofi.flag];
|
||||
|
||||
3. EXAMPLE SPECIALIZED FILE:
|
||||
The code below is an example of what is generated by the specialization
|
||||
scripts for use in the OFI mtl. This code specializes the blocking
|
||||
send functionality based on FI_REMOTE_CQ_DATA & OFI Scalable Endpoint support
|
||||
provided by an OFI Provider. Only one function and symbol is used during
|
||||
runtime based on if FI_REMOTE_CQ_DATA is supported and/or if OFI Scalable
|
||||
Endpoint support is enabled.
|
||||
/*
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "mtl_ofi.h"
|
||||
|
||||
__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_send_false_false(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int dest,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_pml_base_send_mode_t mode)
|
||||
{
|
||||
const bool OFI_CQ_DATA = false;
|
||||
const bool OFI_SCEP_EPS = false;
|
||||
|
||||
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
||||
convertor, mode,
|
||||
OFI_CQ_DATA, OFI_SCEP_EPS);
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_send_false_true(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int dest,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_pml_base_send_mode_t mode)
|
||||
{
|
||||
const bool OFI_CQ_DATA = false;
|
||||
const bool OFI_SCEP_EPS = true;
|
||||
|
||||
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
||||
convertor, mode,
|
||||
OFI_CQ_DATA, OFI_SCEP_EPS);
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_send_true_false(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int dest,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_pml_base_send_mode_t mode)
|
||||
{
|
||||
const bool OFI_CQ_DATA = true;
|
||||
const bool OFI_SCEP_EPS = false;
|
||||
|
||||
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
||||
convertor, mode,
|
||||
OFI_CQ_DATA, OFI_SCEP_EPS);
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_send_true_true(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int dest,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_pml_base_send_mode_t mode)
|
||||
{
|
||||
const bool OFI_CQ_DATA = true;
|
||||
const bool OFI_SCEP_EPS = true;
|
||||
|
||||
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
||||
convertor, mode,
|
||||
OFI_CQ_DATA, OFI_SCEP_EPS);
|
||||
}
|
||||
|
||||
void ompi_mtl_ofi_send_symtable_init(struct ompi_mtl_ofi_symtable* sym_table)
|
||||
{
|
||||
|
||||
sym_table->ompi_mtl_ofi_send[false][false]
|
||||
= ompi_mtl_ofi_send_false_false;
|
||||
|
||||
|
||||
sym_table->ompi_mtl_ofi_send[false][true]
|
||||
= ompi_mtl_ofi_send_false_true;
|
||||
|
||||
|
||||
sym_table->ompi_mtl_ofi_send[true][false]
|
||||
= ompi_mtl_ofi_send_true_false;
|
||||
|
||||
|
||||
sym_table->ompi_mtl_ofi_send[true][true]
|
||||
= ompi_mtl_ofi_send_true_true;
|
||||
|
||||
}
|
||||
###
|
||||
|
@ -2,7 +2,7 @@
|
||||
#
|
||||
# Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
#
|
||||
# Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved
|
||||
# Copyright (c) 2014-2020 Cisco Systems, Inc. All rights reserved
|
||||
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
@ -28,6 +28,12 @@ AC_DEFUN([MCA_ompi_mtl_ofi_CONFIG],[
|
||||
# Check for OFI
|
||||
OPAL_CHECK_OFI
|
||||
|
||||
# The OFI MTL requires at least OFI libfabric v1.5.
|
||||
AS_IF([test "$opal_ofi_happy" = "yes"],
|
||||
[OPAL_CHECK_OFI_VERSION_GE([1,5],
|
||||
[],
|
||||
[opal_ofi_happy=no])])
|
||||
|
||||
AS_IF([test "$opal_ofi_happy" = "yes"],
|
||||
[$1],
|
||||
[$2])
|
||||
|
62
ompi/mca/mtl/ofi/generate-opt-funcs.pl
Обычный файл
62
ompi/mca/mtl/ofi/generate-opt-funcs.pl
Обычный файл
@ -0,0 +1,62 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use mtl_ofi_send_opt;
|
||||
use mtl_ofi_isend_opt;
|
||||
use mtl_ofi_irecv_opt;
|
||||
use mtl_ofi_iprobe_opt;
|
||||
use mtl_ofi_improbe_opt;
|
||||
use opt_common::mtl_ofi_opt_common;
|
||||
|
||||
my $MTL_OFI_HEADER =
|
||||
'/*
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "mtl_ofi.h"';
|
||||
|
||||
my $specialization_file = $ARGV[0];
|
||||
my $specialization_type = $specialization_file;
|
||||
$specialization_type =~ s{\.[^.]+$}{};
|
||||
my $sym_table_type = $specialization_type;
|
||||
$sym_table_type =~ s/_opt//g;
|
||||
|
||||
open my $gen_file, ">", $specialization_file;
|
||||
|
||||
#
|
||||
# Generate the Specialized functions & symbol table for the specified file.
|
||||
#
|
||||
print $gen_file "$MTL_OFI_HEADER\n\n";
|
||||
|
||||
my $GEN_FUNC = $specialization_type . "::gen_funcs\(\$gen_file, \"FUNC\"\)";
|
||||
my $GEN_SYM = $specialization_type . "::gen_funcs\(\$gen_file, \"SYM\"\)";
|
||||
my $SYM_TABLE = "ompi_" . $sym_table_type . "_symtable";
|
||||
|
||||
eval $GEN_FUNC;
|
||||
|
||||
my $SYM_FUNC_HEADER = opt_common::mtl_ofi_opt_common::gen_sym_function_header($SYM_TABLE);
|
||||
print $gen_file "$SYM_FUNC_HEADER\n";
|
||||
|
||||
eval $GEN_SYM;
|
||||
|
||||
my $SYM_FUNC_FOOTER = opt_common::mtl_ofi_opt_common::gen_sym_function_footer();
|
||||
print $gen_file "$SYM_FUNC_FOOTER\n\n";
|
||||
close($gen_file);
|
||||
exit(0);
|
||||
###
|
@ -1,6 +1,6 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# Copyright (c) 2017 Cisco Systems, Inc. All rights reserved
|
||||
# $COPYRIGHT$
|
||||
@ -16,5 +16,64 @@ unusual; your job may behave unpredictably (and/or abort) after this.
|
||||
Local host: %s
|
||||
Location: %s:%d
|
||||
Error: %s (%zd)
|
||||
#
|
||||
[Not enough bits for CID]
|
||||
OFI provider "%s" does not have enough free bits in its tag to fit the MPI
|
||||
Communicator ID. See the mem_tag_format of the provider by running:
|
||||
fi_info -v -p %s
|
||||
|
||||
Local host: %s
|
||||
Location: %s:%d
|
||||
|
||||
[SEP unavailable]
|
||||
Scalable Endpoint feature is enabled by the user but it is not supported by
|
||||
%s provider. Try disabling this feature or use a different provider that
|
||||
supports it using mtl_ofi_provider_include.
|
||||
|
||||
Local host: %s
|
||||
Location: %s:%d
|
||||
|
||||
[SEP required]
|
||||
Scalable Endpoint feature is required for Thread Grouping feature to work.
|
||||
Please try enabling Scalable Endpoints using mtl_ofi_enable_sep.
|
||||
|
||||
Local host: %s
|
||||
Location: %s:%d
|
||||
|
||||
[SEP thread grouping ctxt limit]
|
||||
Reached limit (%d) for number of OFI contexts set by mtl_ofi_num_ctxts.
|
||||
Please set mtl_ofi_num_ctxts to a larger value if you need more contexts.
|
||||
If an MPI application creates more communicators than mtl_ofi_num_ctxts,
|
||||
OFI MTL will make the new communicators re-use existing contexts in
|
||||
round-robin fashion which will impact performance.
|
||||
|
||||
Local host: %s
|
||||
Location: %s:%d
|
||||
|
||||
[Local ranks exceed ofi contexts]
|
||||
Number of local ranks exceed the number of available OFI contexts in %s
|
||||
provider and we cannot provision enough contexts for each rank. Try disabling
|
||||
Scalable Endpoint feature.
|
||||
|
||||
Local host: %s
|
||||
Location: %s:%d
|
||||
|
||||
[Ctxts exceeded available]
|
||||
User requested for more than available contexts from provider. Limiting
|
||||
to max allowed (%d). Contexts will be re used in round-robin fashion if there
|
||||
are more threads than the available contexts.
|
||||
|
||||
Local host: %s
|
||||
Location: %s:%d
|
||||
|
||||
[modex failed]
|
||||
The OFI MTL was not able to find endpoint information for a remote
|
||||
endpoint. Most likely, this means that the remote process was unable
|
||||
to initialize the Libfabric NIC correctly. This error is not
|
||||
recoverable and your application is likely to abort.
|
||||
|
||||
Local host: %s
|
||||
Remote host: %s
|
||||
Error: %s (%d)
|
||||
[message too big]
|
||||
Message size %llu bigger than supported by selected transport. Max = %llu
|
||||
|
@ -23,12 +23,12 @@ mca_mtl_ofi_module_t ompi_mtl_ofi = {
|
||||
ompi_mtl_ofi_del_procs,
|
||||
ompi_mtl_ofi_finalize,
|
||||
|
||||
ompi_mtl_ofi_send,
|
||||
ompi_mtl_ofi_isend,
|
||||
ompi_mtl_ofi_irecv,
|
||||
ompi_mtl_ofi_iprobe,
|
||||
ompi_mtl_ofi_imrecv,
|
||||
ompi_mtl_ofi_improbe,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
ompi_mtl_ofi_imrecv,
|
||||
NULL,
|
||||
|
||||
ompi_mtl_ofi_cancel,
|
||||
ompi_mtl_ofi_add_comm,
|
||||
@ -98,9 +98,10 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
(void**)&ep_name,
|
||||
&size);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: modex_recv failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
opal_show_help("help-mtl-ofi.txt", "modex failed",
|
||||
true, ompi_process_info.nodename,
|
||||
procs[i]->super.proc_hostname,
|
||||
opal_strerror(ret), ret);
|
||||
goto bail;
|
||||
}
|
||||
memcpy(&ep_names[i*namelen], ep_name, namelen);
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -15,6 +15,7 @@
|
||||
#include "mtl_ofi.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/printf.h"
|
||||
#include "opal/mca/common/ofi/common_ofi.h"
|
||||
|
||||
static int ompi_mtl_ofi_component_open(void);
|
||||
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
|
||||
@ -33,6 +34,11 @@ static int data_progress;
|
||||
static int av_type;
|
||||
static int ofi_tag_mode;
|
||||
|
||||
#if OPAL_HAVE_THREAD_LOCAL
|
||||
opal_thread_local int per_thread_ctx;
|
||||
opal_thread_local struct fi_cq_tagged_entry wc[MTL_OFI_MAX_PROG_EVENT_COUNT];
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Enumerators
|
||||
*/
|
||||
@ -142,8 +148,8 @@ ompi_mtl_ofi_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&prov_exclude);
|
||||
|
||||
ompi_mtl_ofi.ofi_progress_event_count = 100;
|
||||
asprintf(&desc, "Max number of events to read each call to OFI progress (default: %d events will be read per OFI progress call)", ompi_mtl_ofi.ofi_progress_event_count);
|
||||
ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
|
||||
opal_asprintf(&desc, "Max number of events to read each call to OFI progress (default: %d events will be read per OFI progress call)", ompi_mtl_ofi.ofi_progress_event_count);
|
||||
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
|
||||
"progress_event_cnt",
|
||||
desc,
|
||||
@ -229,6 +235,38 @@ ompi_mtl_ofi_component_register(void)
|
||||
&av_type);
|
||||
OBJ_RELEASE(new_enum);
|
||||
|
||||
ompi_mtl_ofi.enable_sep = 0;
|
||||
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
|
||||
"enable_sep",
|
||||
"Enable SEP feature",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_3,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_mtl_ofi.enable_sep);
|
||||
|
||||
ompi_mtl_ofi.thread_grouping = 0;
|
||||
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
|
||||
"thread_grouping",
|
||||
"Enable/Disable Thread Grouping feature",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_3,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_mtl_ofi.thread_grouping);
|
||||
|
||||
/*
|
||||
* Default Policy: Create 1 context and let user ask for more for
|
||||
* multi-threaded workloads. User needs to ask for as many contexts as the
|
||||
* number of threads that are anticipated to make MPI calls.
|
||||
*/
|
||||
ompi_mtl_ofi.num_ofi_contexts = 1;
|
||||
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
|
||||
"num_ctxts",
|
||||
"Specify number of OFI contexts to create",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_mtl_ofi.num_ofi_contexts);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -242,8 +280,7 @@ ompi_mtl_ofi_component_open(void)
|
||||
|
||||
ompi_mtl_ofi.domain = NULL;
|
||||
ompi_mtl_ofi.av = NULL;
|
||||
ompi_mtl_ofi.cq = NULL;
|
||||
ompi_mtl_ofi.ep = NULL;
|
||||
ompi_mtl_ofi.sep = NULL;
|
||||
|
||||
/**
|
||||
* Sanity check: provider_include and provider_exclude must be mutually
|
||||
@ -304,21 +341,12 @@ is_in_list(char **list, char *item)
|
||||
}
|
||||
|
||||
static struct fi_info*
|
||||
select_ofi_provider(struct fi_info *providers)
|
||||
select_ofi_provider(struct fi_info *providers,
|
||||
char **include_list, char **exclude_list)
|
||||
{
|
||||
char **include_list = NULL;
|
||||
char **exclude_list = NULL;
|
||||
struct fi_info *prov = providers;
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
|
||||
__FILE__, __LINE__, prov_include);
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
|
||||
__FILE__, __LINE__, prov_exclude);
|
||||
|
||||
if (NULL != prov_include) {
|
||||
include_list = opal_argv_split(prov_include, ',');
|
||||
if (NULL != include_list) {
|
||||
while ((NULL != prov) &&
|
||||
(!is_in_list(include_list, prov->fabric_attr->prov_name))) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
@ -327,8 +355,7 @@ select_ofi_provider(struct fi_info *providers)
|
||||
prov->fabric_attr->prov_name);
|
||||
prov = prov->next;
|
||||
}
|
||||
} else if (NULL != prov_exclude) {
|
||||
exclude_list = opal_argv_split(prov_exclude, ',');
|
||||
} else if (NULL != exclude_list) {
|
||||
while ((NULL != prov) &&
|
||||
(is_in_list(exclude_list, prov->fabric_attr->prov_name))) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
@ -339,14 +366,33 @@ select_ofi_provider(struct fi_info *providers)
|
||||
}
|
||||
}
|
||||
|
||||
opal_argv_free(include_list);
|
||||
opal_argv_free(exclude_list);
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:prov: %s\n",
|
||||
__FILE__, __LINE__,
|
||||
(prov ? prov->fabric_attr->prov_name : "none"));
|
||||
|
||||
/* The initial fi_getinfo() call will return a list of providers
|
||||
* available for this process. once a provider is selected from the
|
||||
* list, we will cycle through the remaining list to identify NICs
|
||||
* serviced by this provider, and try to pick one on the same NUMA
|
||||
* node as this process. If there are no NICs on the same NUMA node,
|
||||
* we pick one in a manner which allows all ranks to make balanced
|
||||
* use of available NICs on the system.
|
||||
*
|
||||
* Most providers give a separate fi_info object for each NIC,
|
||||
* however some may have multiple info objects with different
|
||||
* attributes for the same NIC. The initial provider attributes
|
||||
* are used to ensure that all NICs we return provide the same
|
||||
* capabilities as the inital one.
|
||||
*/
|
||||
if (NULL != prov) {
|
||||
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:provider: %s\n",
|
||||
__FILE__, __LINE__,
|
||||
(prov ? prov->domain_attr->name : "none"));
|
||||
}
|
||||
|
||||
return prov;
|
||||
}
|
||||
|
||||
@ -387,10 +433,10 @@ ompi_mtl_ofi_check_fi_remote_cq_data(int fi_version,
|
||||
}
|
||||
|
||||
static void
|
||||
ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
|
||||
ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
|
||||
switch (ofi_tag_mode) {
|
||||
case MTL_OFI_TAG_1:
|
||||
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_1 ) - 1);
|
||||
*bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_1;
|
||||
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_1 - 1)) - 1);
|
||||
|
||||
ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_1;
|
||||
@ -405,7 +451,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
|
||||
ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_1;
|
||||
break;
|
||||
case MTL_OFI_TAG_2:
|
||||
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_2 ) - 1);
|
||||
*bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_2;
|
||||
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_2 - 1)) - 1);
|
||||
|
||||
ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_2;
|
||||
@ -420,7 +466,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
|
||||
ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_2;
|
||||
break;
|
||||
default: /* use FI_REMOTE_CQ_DATA */
|
||||
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_DATA ) - 1);
|
||||
*bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_DATA;
|
||||
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_DATA - 1)) - 1);
|
||||
|
||||
ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_DATA;
|
||||
@ -431,19 +477,182 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
|
||||
}
|
||||
}
|
||||
|
||||
#define MTL_OFI_ALLOC_COMM_TO_CONTEXT(arr_size) \
|
||||
do { \
|
||||
ompi_mtl_ofi.comm_to_context = calloc(arr_size, sizeof(int)); \
|
||||
if (OPAL_UNLIKELY(!ompi_mtl_ofi.comm_to_context)) { \
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
|
||||
"%s:%d: alloc of comm_to_context array failed: %s\n",\
|
||||
__FILE__, __LINE__, strerror(errno)); \
|
||||
return ret; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define MTL_OFI_ALLOC_OFI_CTXTS() \
|
||||
do { \
|
||||
ompi_mtl_ofi.ofi_ctxt = (mca_mtl_ofi_context_t *) malloc(ompi_mtl_ofi.num_ofi_contexts * \
|
||||
sizeof(mca_mtl_ofi_context_t)); \
|
||||
if (OPAL_UNLIKELY(!ompi_mtl_ofi.ofi_ctxt)) { \
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
|
||||
"%s:%d: alloc of ofi_ctxt array failed: %s\n", \
|
||||
__FILE__, __LINE__, strerror(errno)); \
|
||||
return ret; \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
static int ompi_mtl_ofi_init_sep(struct fi_info *prov, int universe_size)
|
||||
{
|
||||
int ret = OMPI_SUCCESS, num_ofi_ctxts;
|
||||
struct fi_av_attr av_attr = {0};
|
||||
|
||||
prov->ep_attr->tx_ctx_cnt = prov->ep_attr->rx_ctx_cnt =
|
||||
ompi_mtl_ofi.num_ofi_contexts;
|
||||
|
||||
ret = fi_scalable_ep(ompi_mtl_ofi.domain, prov, &ompi_mtl_ofi.sep, NULL);
|
||||
if (0 != ret) {
|
||||
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||
"fi_scalable_ep",
|
||||
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||
fi_strerror(-ret), -ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ompi_mtl_ofi.rx_ctx_bits = 0;
|
||||
while (ompi_mtl_ofi.num_ofi_contexts >> ++ompi_mtl_ofi.rx_ctx_bits);
|
||||
|
||||
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
|
||||
av_attr.rx_ctx_bits = ompi_mtl_ofi.rx_ctx_bits;
|
||||
av_attr.count = ompi_mtl_ofi.num_ofi_contexts * universe_size;
|
||||
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
|
||||
|
||||
if (0 != ret) {
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_av_open failed");
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = fi_scalable_ep_bind(ompi_mtl_ofi.sep, (fid_t)ompi_mtl_ofi.av, 0);
|
||||
if (0 != ret) {
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_bind AV-EP failed");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* If SEP supported and Thread Grouping feature enabled, use
|
||||
* num_ofi_contexts + 2. Extra 2 items is to accomodate Open MPI contextid
|
||||
* numbering- COMM_WORLD is 0, COMM_SELF is 1. Other user created
|
||||
* Comm contextid values are assigned sequentially starting with 3.
|
||||
*/
|
||||
num_ofi_ctxts = ompi_mtl_ofi.thread_grouping ?
|
||||
ompi_mtl_ofi.num_ofi_contexts + 2 : 1;
|
||||
MTL_OFI_ALLOC_COMM_TO_CONTEXT(num_ofi_ctxts);
|
||||
|
||||
ompi_mtl_ofi.total_ctxts_used = 0;
|
||||
ompi_mtl_ofi.threshold_comm_context_id = 0;
|
||||
|
||||
/* Allocate memory for OFI contexts */
|
||||
MTL_OFI_ALLOC_OFI_CTXTS();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ompi_mtl_ofi_init_regular_ep(struct fi_info * prov, int universe_size)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
struct fi_av_attr av_attr = {0};
|
||||
struct fi_cq_attr cq_attr = {0};
|
||||
cq_attr.format = FI_CQ_FORMAT_TAGGED;
|
||||
cq_attr.size = ompi_mtl_ofi.ofi_progress_event_count;
|
||||
|
||||
/* Override any user defined setting */
|
||||
ompi_mtl_ofi.num_ofi_contexts = 1;
|
||||
ret = fi_endpoint(ompi_mtl_ofi.domain, /* In: Domain object */
|
||||
prov, /* In: Provider */
|
||||
&ompi_mtl_ofi.sep, /* Out: Endpoint object */
|
||||
NULL); /* Optional context */
|
||||
if (0 != ret) {
|
||||
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||
"fi_endpoint",
|
||||
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||
fi_strerror(-ret), -ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the objects that will be bound to the endpoint.
|
||||
* The objects include:
|
||||
* - address vector and completion queues
|
||||
*/
|
||||
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
|
||||
av_attr.count = universe_size;
|
||||
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
|
||||
if (ret) {
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_av_open failed");
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = fi_ep_bind(ompi_mtl_ofi.sep,
|
||||
(fid_t)ompi_mtl_ofi.av,
|
||||
0);
|
||||
if (0 != ret) {
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_bind AV-EP failed");
|
||||
return ret;
|
||||
}
|
||||
|
||||
MTL_OFI_ALLOC_COMM_TO_CONTEXT(1);
|
||||
|
||||
/* Allocate memory for OFI contexts */
|
||||
MTL_OFI_ALLOC_OFI_CTXTS();
|
||||
|
||||
ompi_mtl_ofi.ofi_ctxt[0].tx_ep = ompi_mtl_ofi.sep;
|
||||
ompi_mtl_ofi.ofi_ctxt[0].rx_ep = ompi_mtl_ofi.sep;
|
||||
|
||||
ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.ofi_ctxt[0].cq, NULL);
|
||||
if (ret) {
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_cq_open failed");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Bind CQ to endpoint object */
|
||||
ret = fi_ep_bind(ompi_mtl_ofi.sep, (fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq,
|
||||
FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
|
||||
if (0 != ret) {
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_bind CQ-EP failed");
|
||||
return ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static mca_mtl_base_module_t*
|
||||
ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
int ret, fi_version;
|
||||
struct fi_info *hints;
|
||||
int num_local_ranks, sep_support_in_provider, max_ofi_ctxts;
|
||||
int ofi_tag_leading_zeros, ofi_tag_bits_for_cid;
|
||||
char **include_list = NULL;
|
||||
char **exclude_list = NULL;
|
||||
struct fi_info *hints, *hints_dup = NULL;
|
||||
struct fi_info *providers = NULL;
|
||||
struct fi_info *prov = NULL;
|
||||
struct fi_info *prov_cq_data = NULL;
|
||||
struct fi_cq_attr cq_attr = {0};
|
||||
struct fi_av_attr av_attr = {0};
|
||||
char ep_name[FI_NAME_MAX] = {0};
|
||||
size_t namelen;
|
||||
int universe_size;
|
||||
char *univ_size_str;
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
|
||||
__FILE__, __LINE__, prov_include);
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
|
||||
__FILE__, __LINE__, prov_exclude);
|
||||
|
||||
if (NULL != prov_include) {
|
||||
include_list = opal_argv_split(prov_include, ',');
|
||||
} else if (NULL != prov_exclude) {
|
||||
exclude_list = opal_argv_split(prov_exclude, ',');
|
||||
}
|
||||
|
||||
/**
|
||||
* Hints to filter providers
|
||||
@ -462,15 +671,23 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
__FILE__, __LINE__);
|
||||
goto error;
|
||||
}
|
||||
/* Make sure to get a RDM provider that can do the tagged matching
|
||||
interface and local communication and remote communication. */
|
||||
hints->mode = FI_CONTEXT;
|
||||
hints->ep_attr->type = FI_EP_RDM; /* Reliable datagram */
|
||||
hints->caps = FI_TAGGED; /* Tag matching interface */
|
||||
hints->ep_attr->type = FI_EP_RDM;
|
||||
hints->caps = FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM;
|
||||
hints->tx_attr->msg_order = FI_ORDER_SAS;
|
||||
hints->rx_attr->msg_order = FI_ORDER_SAS;
|
||||
hints->rx_attr->op_flags = FI_COMPLETION;
|
||||
hints->tx_attr->op_flags = FI_COMPLETION;
|
||||
|
||||
hints->domain_attr->threading = FI_THREAD_UNSPEC;
|
||||
if (enable_mpi_threads) {
|
||||
ompi_mtl_ofi.mpi_thread_multiple = true;
|
||||
hints->domain_attr->threading = FI_THREAD_SAFE;
|
||||
} else {
|
||||
ompi_mtl_ofi.mpi_thread_multiple = false;
|
||||
hints->domain_attr->threading = FI_THREAD_DOMAIN;
|
||||
}
|
||||
|
||||
switch (control_progress) {
|
||||
case MTL_OFI_PROG_AUTO:
|
||||
@ -506,8 +723,59 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
* FI_VERSION provides binary backward and forward compatibility support
|
||||
* Specify the version of OFI is coded to, the provider will select struct
|
||||
* layouts that are compatible with this version.
|
||||
*
|
||||
* Note: API version 1.5 is the first version that supports
|
||||
* FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need
|
||||
* that checking -- e.g., some providers are suitable for RXD or
|
||||
* RXM, but can't provide local communication).
|
||||
*/
|
||||
fi_version = FI_VERSION(1, 0);
|
||||
fi_version = FI_VERSION(1, 5);
|
||||
|
||||
/**
|
||||
* The EFA provider in Libfabric versions prior to 1.10 contains a bug
|
||||
* where the FI_LOCAL_COMM and FI_REMOTE_COMM capabilities are not
|
||||
* advertised. However, we know that this provider supports both local and
|
||||
* remote communication. We must exclude these capability bits in order to
|
||||
* select EFA when we are using a version of Libfabric with this bug.
|
||||
*
|
||||
* Call fi_getinfo() without those capabilities and specifically ask for
|
||||
* the EFA provider. This is safe to do as EFA is only supported on Amazon
|
||||
* EC2 and EC2 only supports EFA and TCP-based networks. We'll also skip
|
||||
* this logic if the user specifies an include list without EFA or adds EFA
|
||||
* to the exclude list.
|
||||
*/
|
||||
if ((include_list && is_in_list(include_list, "efa")) ||
|
||||
(exclude_list && !is_in_list(exclude_list, "efa"))) {
|
||||
hints_dup = fi_dupinfo(hints);
|
||||
hints_dup->caps &= ~(FI_LOCAL_COMM | FI_REMOTE_COMM);
|
||||
hints_dup->fabric_attr->prov_name = strdup("efa");
|
||||
|
||||
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, &providers);
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: EFA specific fi_getinfo(): %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
|
||||
if (FI_ENODATA == -ret) {
|
||||
/**
|
||||
* EFA is not available so fall through to call fi_getinfo() again
|
||||
* with the local/remote capabilities set.
|
||||
*/
|
||||
fi_freeinfo(hints_dup);
|
||||
hints_dup = NULL;
|
||||
} else if (0 != ret) {
|
||||
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||
"fi_getinfo",
|
||||
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||
fi_strerror(-ret), -ret);
|
||||
goto error;
|
||||
} else {
|
||||
fi_freeinfo(hints);
|
||||
hints = hints_dup;
|
||||
hints_dup = NULL;
|
||||
goto select_prov;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* fi_getinfo: returns information about fabric services for reaching a
|
||||
@ -520,6 +788,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
0ULL, /* Optional flag */
|
||||
hints, /* In: Hints to filter providers */
|
||||
&providers); /* Out: List of matching providers */
|
||||
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: fi_getinfo(): %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
|
||||
if (FI_ENODATA == -ret) {
|
||||
// It is not an error if no information is returned.
|
||||
goto error;
|
||||
@ -531,10 +804,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
goto error;
|
||||
}
|
||||
|
||||
select_prov:
|
||||
/**
|
||||
* Select a provider from the list returned by fi_getinfo().
|
||||
*/
|
||||
prov = select_ofi_provider(providers);
|
||||
prov = select_ofi_provider(providers, include_list, exclude_list);
|
||||
if (!prov) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: select_ofi_provider: no provider found\n",
|
||||
@ -542,6 +816,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
goto error;
|
||||
}
|
||||
|
||||
opal_argv_free(include_list);
|
||||
include_list = NULL;
|
||||
opal_argv_free(exclude_list);
|
||||
exclude_list = NULL;
|
||||
|
||||
/**
|
||||
* Select the format of the OFI tag
|
||||
*/
|
||||
@ -558,7 +837,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
ompi_mtl_ofi.fi_cq_data = false;
|
||||
if (MTL_OFI_TAG_AUTO == ofi_tag_mode) {
|
||||
/* Fallback to MTL_OFI_TAG_1 */
|
||||
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1);
|
||||
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1, &ofi_tag_bits_for_cid);
|
||||
} else { /* MTL_OFI_TAG_FULL */
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: OFI provider %s does not support FI_REMOTE_CQ_DATA\n",
|
||||
@ -569,15 +848,92 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
/* Use FI_REMTOTE_CQ_DATA */
|
||||
ompi_mtl_ofi.fi_cq_data = true;
|
||||
prov = prov_cq_data;
|
||||
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_FULL);
|
||||
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_FULL, &ofi_tag_bits_for_cid);
|
||||
}
|
||||
} else { /* MTL_OFI_TAG_1 or MTL_OFI_TAG_2 */
|
||||
ompi_mtl_ofi.fi_cq_data = false;
|
||||
ompi_mtl_ofi_define_tag_mode(ofi_tag_mode);
|
||||
ompi_mtl_ofi_define_tag_mode(ofi_tag_mode, &ofi_tag_bits_for_cid);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the MTL OFI Symbol Tables & function pointers
|
||||
* for specialized functions.
|
||||
*/
|
||||
|
||||
ompi_mtl_ofi_send_symtable_init(&ompi_mtl_ofi.sym_table);
|
||||
ompi_mtl_ofi.base.mtl_send =
|
||||
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_send[ompi_mtl_ofi.fi_cq_data];
|
||||
|
||||
ompi_mtl_ofi_isend_symtable_init(&ompi_mtl_ofi.sym_table);
|
||||
ompi_mtl_ofi.base.mtl_isend =
|
||||
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_isend[ompi_mtl_ofi.fi_cq_data];
|
||||
|
||||
ompi_mtl_ofi_irecv_symtable_init(&ompi_mtl_ofi.sym_table);
|
||||
ompi_mtl_ofi.base.mtl_irecv =
|
||||
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_irecv[ompi_mtl_ofi.fi_cq_data];
|
||||
|
||||
ompi_mtl_ofi_iprobe_symtable_init(&ompi_mtl_ofi.sym_table);
|
||||
ompi_mtl_ofi.base.mtl_iprobe =
|
||||
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_iprobe[ompi_mtl_ofi.fi_cq_data];
|
||||
|
||||
ompi_mtl_ofi_improbe_symtable_init(&ompi_mtl_ofi.sym_table);
|
||||
ompi_mtl_ofi.base.mtl_improbe =
|
||||
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_improbe[ompi_mtl_ofi.fi_cq_data];
|
||||
|
||||
/**
|
||||
* Check for potential bits in the OFI tag that providers may be reserving
|
||||
* for internal usage (see mem_tag_format in fi_endpoint man page).
|
||||
*/
|
||||
|
||||
ofi_tag_leading_zeros = 0;
|
||||
while (!((prov->ep_attr->mem_tag_format << ofi_tag_leading_zeros++) &
|
||||
(uint64_t) MTL_OFI_HIGHEST_TAG_BIT) &&
|
||||
/* Do not keep looping if the provider does not support enough bits */
|
||||
(ofi_tag_bits_for_cid >= MTL_OFI_MINIMUM_CID_BITS)){
|
||||
ofi_tag_bits_for_cid--;
|
||||
}
|
||||
|
||||
if (ofi_tag_bits_for_cid < MTL_OFI_MINIMUM_CID_BITS) {
|
||||
opal_show_help("help-mtl-ofi.txt", "Not enough bits for CID", true,
|
||||
prov->fabric_attr->prov_name,
|
||||
prov->fabric_attr->prov_name,
|
||||
ompi_process_info.nodename, __FILE__, __LINE__);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Update the maximum supported Communicator ID */
|
||||
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << ofi_tag_bits_for_cid) - 1);
|
||||
ompi_mtl_ofi.num_peers = 0;
|
||||
|
||||
/* Check if Scalable Endpoints can be enabled for the provider */
|
||||
sep_support_in_provider = 0;
|
||||
if ((prov->domain_attr->max_ep_tx_ctx > 1) ||
|
||||
(prov->domain_attr->max_ep_rx_ctx > 1)) {
|
||||
sep_support_in_provider = 1;
|
||||
}
|
||||
|
||||
if (1 == ompi_mtl_ofi.enable_sep) {
|
||||
if (0 == sep_support_in_provider) {
|
||||
opal_show_help("help-mtl-ofi.txt", "SEP unavailable", true,
|
||||
prov->fabric_attr->prov_name,
|
||||
ompi_process_info.nodename, __FILE__, __LINE__);
|
||||
goto error;
|
||||
} else if (1 == sep_support_in_provider) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: Scalable EP supported in %s provider. Enabling in MTL.\n",
|
||||
__FILE__, __LINE__, prov->fabric_attr->prov_name);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Scalable Endpoints is required for Thread Grouping feature
|
||||
*/
|
||||
if (1 == ompi_mtl_ofi.thread_grouping) {
|
||||
opal_show_help("help-mtl-ofi.txt", "SEP required", true,
|
||||
ompi_process_info.nodename, __FILE__, __LINE__);
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Open fabric
|
||||
* The getinfo struct returns a fabric attribute struct that can be used to
|
||||
@ -612,25 +968,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
goto error;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a transport level communication endpoint. To use the endpoint,
|
||||
* it must be bound to completion counters or event queues and enabled,
|
||||
* and the resources consumed by it, such as address vectors, counters,
|
||||
* completion queues, etc.
|
||||
* see man fi_endpoint for more details.
|
||||
*/
|
||||
ret = fi_endpoint(ompi_mtl_ofi.domain, /* In: Domain object */
|
||||
prov, /* In: Provider */
|
||||
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
|
||||
NULL); /* Optional context */
|
||||
if (0 != ret) {
|
||||
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||
"fi_endpoint",
|
||||
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||
fi_strerror(-ret), -ret);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save the maximum sizes.
|
||||
*/
|
||||
@ -638,76 +975,79 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size;
|
||||
|
||||
/**
|
||||
* Create the objects that will be bound to the endpoint.
|
||||
* The objects include:
|
||||
* - completion queue for events
|
||||
* - address vector of other endpoint addresses
|
||||
* - dynamic memory-spanning memory region
|
||||
* The user is not allowed to exceed MTL_OFI_MAX_PROG_EVENT_COUNT.
|
||||
* The reason is because progress entries array is now a TLS variable
|
||||
* as opposed to being allocated on the heap for thread-safety purposes.
|
||||
*/
|
||||
cq_attr.format = FI_CQ_FORMAT_TAGGED;
|
||||
if (ompi_mtl_ofi.ofi_progress_event_count > MTL_OFI_MAX_PROG_EVENT_COUNT) {
|
||||
ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
|
||||
}
|
||||
|
||||
/**
|
||||
* If a user has set an ofi_progress_event_count > the default, then
|
||||
* the CQ size hint is set to the user's desired value such that
|
||||
* the CQ created will have enough slots to store up to
|
||||
* ofi_progress_event_count events. If a user has not set the
|
||||
* ofi_progress_event_count, then the provider is trusted to set a
|
||||
* default high CQ size and the CQ size hint is left unspecified.
|
||||
* Create a transport level communication endpoint. To use the endpoint,
|
||||
* it must be bound to the resources consumed by it such as address
|
||||
* vectors, completion counters or event queues etc, and enabled.
|
||||
* See man fi_endpoint for more details.
|
||||
*/
|
||||
if (ompi_mtl_ofi.ofi_progress_event_count > 100) {
|
||||
cq_attr.size = ompi_mtl_ofi.ofi_progress_event_count;
|
||||
}
|
||||
|
||||
ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.cq, NULL);
|
||||
if (ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: fi_cq_open failed: %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
goto error;
|
||||
}
|
||||
|
||||
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
|
||||
|
||||
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
|
||||
if (ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: fi_av_open failed: %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
goto error;
|
||||
}
|
||||
|
||||
/**
|
||||
* Bind the CQ and AV to the endpoint object.
|
||||
/* use the universe size as a rough guess on the address vector
|
||||
* size hint that should be passed to fi_av_open(). For regular
|
||||
* endpoints, the count will be the universe size. For scalable
|
||||
* endpoints, the count will be the universe size multiplied by
|
||||
* the number of contexts. In either case, if the universe grows
|
||||
* (via dynamic processes), the count is a hint, not a hard limit,
|
||||
* so libfabric will just be slightly less efficient.
|
||||
*/
|
||||
ret = fi_ep_bind(ompi_mtl_ofi.ep,
|
||||
(fid_t)ompi_mtl_ofi.cq,
|
||||
FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
|
||||
if (0 != ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: fi_bind CQ-EP failed: %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
univ_size_str = getenv("OMPI_UNIVERSE_SIZE");
|
||||
if (NULL == univ_size_str ||
|
||||
(universe_size = strtol(univ_size_str, NULL, 0)) <= 0) {
|
||||
universe_size = ompi_proc_world_size();
|
||||
}
|
||||
|
||||
if (1 == ompi_mtl_ofi.enable_sep) {
|
||||
max_ofi_ctxts = (prov->domain_attr->max_ep_tx_ctx <
|
||||
prov->domain_attr->max_ep_rx_ctx) ?
|
||||
prov->domain_attr->max_ep_tx_ctx :
|
||||
prov->domain_attr->max_ep_rx_ctx;
|
||||
|
||||
num_local_ranks = 1 + ompi_process_info.num_local_peers;
|
||||
if (max_ofi_ctxts <= num_local_ranks) {
|
||||
opal_show_help("help-mtl-ofi.txt", "Local ranks exceed ofi contexts",
|
||||
true, prov->fabric_attr->prov_name,
|
||||
ompi_process_info.nodename, __FILE__, __LINE__);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Provision enough contexts to service all ranks in a node */
|
||||
max_ofi_ctxts /= num_local_ranks;
|
||||
|
||||
/*
|
||||
* If num ctxts user specified is more than max allowed, limit to max
|
||||
* and start round-robining. Print warning to user.
|
||||
*/
|
||||
if (max_ofi_ctxts < ompi_mtl_ofi.num_ofi_contexts) {
|
||||
opal_show_help("help-mtl-ofi.txt", "Ctxts exceeded available",
|
||||
true, max_ofi_ctxts,
|
||||
ompi_process_info.nodename, __FILE__, __LINE__);
|
||||
ompi_mtl_ofi.num_ofi_contexts = max_ofi_ctxts;
|
||||
}
|
||||
|
||||
ret = ompi_mtl_ofi_init_sep(prov, universe_size);
|
||||
} else {
|
||||
ret = ompi_mtl_ofi_init_regular_ep(prov, universe_size);
|
||||
}
|
||||
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
ret = fi_ep_bind(ompi_mtl_ofi.ep,
|
||||
(fid_t)ompi_mtl_ofi.av,
|
||||
0);
|
||||
if (0 != ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: fi_bind AV-EP failed: %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
goto error;
|
||||
}
|
||||
ompi_mtl_ofi.total_ctxts_used = 0;
|
||||
ompi_mtl_ofi.threshold_comm_context_id = 0;
|
||||
|
||||
/**
|
||||
* Enable the endpoint for communication
|
||||
* This commits the bind operations.
|
||||
*/
|
||||
ret = fi_enable(ompi_mtl_ofi.ep);
|
||||
/* Enable Endpoint for communication */
|
||||
ret = fi_enable(ompi_mtl_ofi.sep);
|
||||
if (0 != ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: fi_enable failed: %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_enable failed");
|
||||
goto error;
|
||||
}
|
||||
|
||||
@ -725,11 +1065,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
* Get our address and publish it with modex.
|
||||
*/
|
||||
namelen = sizeof(ep_name);
|
||||
ret = fi_getname((fid_t)ompi_mtl_ofi.ep, &ep_name[0], &namelen);
|
||||
ret = fi_getname((fid_t)ompi_mtl_ofi.sep,
|
||||
&ep_name[0],
|
||||
&namelen);
|
||||
if (ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: fi_getname failed: %s\n",
|
||||
__FILE__, __LINE__, fi_strerror(-ret));
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_getname failed");
|
||||
goto error;
|
||||
}
|
||||
|
||||
@ -751,20 +1091,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
*/
|
||||
ompi_mtl_ofi.any_addr = FI_ADDR_UNSPEC;
|
||||
|
||||
/**
|
||||
* Activate progress callback.
|
||||
*/
|
||||
ret = opal_progress_register(ompi_mtl_ofi_progress_no_inline);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: opal_progress_register failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
goto error;
|
||||
}
|
||||
|
||||
return &ompi_mtl_ofi.base;
|
||||
|
||||
error:
|
||||
if (include_list) {
|
||||
opal_argv_free(include_list);
|
||||
}
|
||||
if (exclude_list) {
|
||||
opal_argv_free(exclude_list);
|
||||
}
|
||||
if (providers) {
|
||||
(void) fi_freeinfo(providers);
|
||||
}
|
||||
@ -774,14 +1109,20 @@ error:
|
||||
if (hints) {
|
||||
(void) fi_freeinfo(hints);
|
||||
}
|
||||
if (hints_dup) {
|
||||
(void) fi_freeinfo(hints_dup);
|
||||
}
|
||||
if (ompi_mtl_ofi.sep) {
|
||||
(void) fi_close((fid_t)ompi_mtl_ofi.sep);
|
||||
}
|
||||
if (ompi_mtl_ofi.av) {
|
||||
(void) fi_close((fid_t)ompi_mtl_ofi.av);
|
||||
}
|
||||
if (ompi_mtl_ofi.cq) {
|
||||
(void) fi_close((fid_t)ompi_mtl_ofi.cq);
|
||||
}
|
||||
if (ompi_mtl_ofi.ep) {
|
||||
(void) fi_close((fid_t)ompi_mtl_ofi.ep);
|
||||
if ((0 == ompi_mtl_ofi.enable_sep) &&
|
||||
ompi_mtl_ofi.ofi_ctxt != NULL &&
|
||||
ompi_mtl_ofi.ofi_ctxt[0].cq) {
|
||||
/* Check if CQ[0] was created for non-SEP case and close if needed */
|
||||
(void) fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq);
|
||||
}
|
||||
if (ompi_mtl_ofi.domain) {
|
||||
(void) fi_close((fid_t)ompi_mtl_ofi.domain);
|
||||
@ -789,6 +1130,12 @@ error:
|
||||
if (ompi_mtl_ofi.fabric) {
|
||||
(void) fi_close((fid_t)ompi_mtl_ofi.fabric);
|
||||
}
|
||||
if (ompi_mtl_ofi.comm_to_context) {
|
||||
free(ompi_mtl_ofi.comm_to_context);
|
||||
}
|
||||
if (ompi_mtl_ofi.ofi_ctxt) {
|
||||
free(ompi_mtl_ofi.ofi_ctxt);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@ -801,11 +1148,7 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
|
||||
opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
|
||||
|
||||
/* Close all the OFI objects */
|
||||
if ((ret = fi_close((fid_t)ompi_mtl_ofi.ep))) {
|
||||
goto finalize_err;
|
||||
}
|
||||
|
||||
if ((ret = fi_close((fid_t)ompi_mtl_ofi.cq))) {
|
||||
if ((ret = fi_close((fid_t)ompi_mtl_ofi.sep))) {
|
||||
goto finalize_err;
|
||||
}
|
||||
|
||||
@ -813,6 +1156,18 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
|
||||
goto finalize_err;
|
||||
}
|
||||
|
||||
if (0 == ompi_mtl_ofi.enable_sep) {
|
||||
/*
|
||||
* CQ[0] is bound to SEP object Nwhen SEP is not supported by a
|
||||
* provider. OFI spec requires that we close the Endpoint that is bound
|
||||
* to the CQ before closing the CQ itself. So, for the non-SEP case, we
|
||||
* handle the closing of CQ[0] here.
|
||||
*/
|
||||
if ((ret = fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq))) {
|
||||
goto finalize_err;
|
||||
}
|
||||
}
|
||||
|
||||
if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) {
|
||||
goto finalize_err;
|
||||
}
|
||||
@ -821,6 +1176,10 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
|
||||
goto finalize_err;
|
||||
}
|
||||
|
||||
/* Free memory allocated for TX/RX contexts */
|
||||
free(ompi_mtl_ofi.comm_to_context);
|
||||
free(ompi_mtl_ofi.ofi_ctxt);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
finalize_err:
|
||||
|
@ -1,5 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -11,11 +13,9 @@
|
||||
#ifndef OMPI_MTL_OFI_ENDPOINT_H
|
||||
#define OMPI_MTL_OFI_ENDPOINT_H
|
||||
|
||||
BEGIN_C_DECLS
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
|
||||
extern int ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **procs);
|
||||
BEGIN_C_DECLS
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_mtl_ofi_endpoint_t);
|
||||
|
||||
@ -38,10 +38,12 @@ struct mca_mtl_ofi_endpoint_t {
|
||||
|
||||
typedef struct mca_mtl_ofi_endpoint_t mca_mtl_ofi_endpoint_t;
|
||||
|
||||
static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc)
|
||||
static inline mca_mtl_ofi_endpoint_t *
|
||||
ompi_mtl_ofi_get_endpoint(struct mca_mtl_base_module_t* mtl,
|
||||
ompi_proc_t *ompi_proc)
|
||||
{
|
||||
if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) {
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc))) {
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != MCA_PML_CALL(add_procs(&ompi_proc, 1)))) {
|
||||
/* Fatal error. exit() out */
|
||||
opal_output(0, "%s:%d: *** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
|
||||
__FILE__, __LINE__);
|
||||
|
73
ompi/mca/mtl/ofi/mtl_ofi_improbe_opt.pm
Обычный файл
73
ompi/mca/mtl/ofi/mtl_ofi_improbe_opt.pm
Обычный файл
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use opt_common::mtl_ofi_opt_common;
|
||||
package mtl_ofi_improbe_opt;
|
||||
|
||||
my @true_false = ("false", "true");
|
||||
|
||||
sub gen_funcs {
|
||||
my $gen_file = $_[0];
|
||||
my $gen_type = $_[1];
|
||||
my $OFI_CQ_DATA_EN = "false";
|
||||
|
||||
foreach $OFI_CQ_DATA_EN (@true_false) {
|
||||
my @flags = ($OFI_CQ_DATA_EN);
|
||||
if (($gen_type cmp "FUNC") == 0) {
|
||||
my $FUNC = gen_improbe_function(\@flags);
|
||||
print $gen_file "$FUNC\n\n";
|
||||
}
|
||||
if (($gen_type cmp "SYM") == 0) {
|
||||
my $SYM = gen_improbe_sym_init(\@flags);
|
||||
print $gen_file "$SYM\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub gen_improbe_function {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
|
||||
my $IMPROBE_FUNCTION =
|
||||
"__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_improbe_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int src,
|
||||
int tag,
|
||||
int *matched,
|
||||
struct ompi_message_t **message,
|
||||
struct ompi_status_public_t *status)
|
||||
{
|
||||
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
|
||||
|
||||
return ompi_mtl_ofi_improbe_generic(mtl, comm, src, tag,
|
||||
matched, message, status,
|
||||
OFI_CQ_DATA);
|
||||
}";
|
||||
return $IMPROBE_FUNCTION;
|
||||
}
|
||||
|
||||
sub gen_improbe_sym_init {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_improbe_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
my $symbol_init =
|
||||
"
|
||||
sym_table->ompi_mtl_ofi_improbe[".$OFI_CQ_DATA_EN."]
|
||||
= ".$MTL_OFI_FUNC_NAME.";
|
||||
";
|
||||
return $symbol_init;
|
||||
}
|
||||
|
||||
1;
|
72
ompi/mca/mtl/ofi/mtl_ofi_iprobe_opt.pm
Обычный файл
72
ompi/mca/mtl/ofi/mtl_ofi_iprobe_opt.pm
Обычный файл
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use opt_common::mtl_ofi_opt_common;
|
||||
package mtl_ofi_iprobe_opt;
|
||||
|
||||
my @true_false = ("false", "true");
|
||||
|
||||
sub gen_funcs {
|
||||
my $gen_file = $_[0];
|
||||
my $gen_type = $_[1];
|
||||
my $OFI_CQ_DATA_EN = "false";
|
||||
|
||||
foreach $OFI_CQ_DATA_EN (@true_false) {
|
||||
my @flags = ($OFI_CQ_DATA_EN);
|
||||
if (($gen_type cmp "FUNC") == 0) {
|
||||
my $FUNC = gen_iprobe_function(\@flags);
|
||||
print $gen_file "$FUNC\n\n";
|
||||
}
|
||||
if (($gen_type cmp "SYM") == 0) {
|
||||
my $SYM = gen_iprobe_sym_init(\@flags);
|
||||
print $gen_file "$SYM\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub gen_iprobe_function {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
|
||||
my $IPROBE_FUNCTION =
|
||||
"__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_iprobe_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int src,
|
||||
int tag,
|
||||
int *flag,
|
||||
struct ompi_status_public_t *status)
|
||||
{
|
||||
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
|
||||
|
||||
return ompi_mtl_ofi_iprobe_generic(mtl, comm, src, tag,
|
||||
flag, status,
|
||||
OFI_CQ_DATA);
|
||||
}";
|
||||
return $IPROBE_FUNCTION;
|
||||
}
|
||||
|
||||
sub gen_iprobe_sym_init {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_iprobe_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
my $symbol_init =
|
||||
"
|
||||
sym_table->ompi_mtl_ofi_iprobe[".$OFI_CQ_DATA_EN."]
|
||||
= ".$MTL_OFI_FUNC_NAME.";
|
||||
";
|
||||
return $symbol_init;
|
||||
}
|
||||
|
||||
1;
|
72
ompi/mca/mtl/ofi/mtl_ofi_irecv_opt.pm
Обычный файл
72
ompi/mca/mtl/ofi/mtl_ofi_irecv_opt.pm
Обычный файл
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use opt_common::mtl_ofi_opt_common;
|
||||
package mtl_ofi_irecv_opt;
|
||||
|
||||
my @true_false = ("false", "true");
|
||||
|
||||
sub gen_funcs {
|
||||
my $gen_file = $_[0];
|
||||
my $gen_type = $_[1];
|
||||
my $OFI_CQ_DATA_EN = "false";
|
||||
|
||||
foreach $OFI_CQ_DATA_EN (@true_false) {
|
||||
my @flags = ($OFI_CQ_DATA_EN);
|
||||
if (($gen_type cmp "FUNC") == 0) {
|
||||
my $FUNC = gen_irecv_function(\@flags);
|
||||
print $gen_file "$FUNC\n\n";
|
||||
}
|
||||
if (($gen_type cmp "SYM") == 0) {
|
||||
my $SYM = gen_irecv_sym_init(\@flags);
|
||||
print $gen_file "$SYM\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub gen_irecv_function {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
|
||||
my $IRECV_FUNCTION =
|
||||
"__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_irecv_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int src,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_mtl_request_t *mtl_request)
|
||||
{
|
||||
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
|
||||
|
||||
return ompi_mtl_ofi_irecv_generic(mtl, comm, src, tag,
|
||||
convertor, mtl_request,
|
||||
OFI_CQ_DATA);
|
||||
}";
|
||||
return $IRECV_FUNCTION;
|
||||
}
|
||||
|
||||
sub gen_irecv_sym_init {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_irecv_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
my $symbol_init =
|
||||
"
|
||||
sym_table->ompi_mtl_ofi_irecv[".$OFI_CQ_DATA_EN."]
|
||||
= ".$MTL_OFI_FUNC_NAME.";
|
||||
";
|
||||
return $symbol_init;
|
||||
}
|
||||
|
||||
1;
|
74
ompi/mca/mtl/ofi/mtl_ofi_isend_opt.pm
Обычный файл
74
ompi/mca/mtl/ofi/mtl_ofi_isend_opt.pm
Обычный файл
@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use opt_common::mtl_ofi_opt_common;
|
||||
package mtl_ofi_isend_opt;
|
||||
|
||||
my @true_false = ("false", "true");
|
||||
|
||||
sub gen_funcs {
|
||||
my $gen_file = $_[0];
|
||||
my $gen_type = $_[1];
|
||||
my $OFI_CQ_DATA_EN = "false";
|
||||
|
||||
foreach $OFI_CQ_DATA_EN (@true_false) {
|
||||
my @flags = ($OFI_CQ_DATA_EN);
|
||||
if (($gen_type cmp "FUNC") == 0) {
|
||||
my $FUNC = gen_isend_function(\@flags);
|
||||
print $gen_file "$FUNC\n\n";
|
||||
}
|
||||
if (($gen_type cmp "SYM") == 0) {
|
||||
my $SYM = gen_isend_sym_init(\@flags);
|
||||
print $gen_file "$SYM\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub gen_isend_function {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
|
||||
my $ISEND_FUNCTION =
|
||||
"__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_isend_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int dest,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_pml_base_send_mode_t mode,
|
||||
bool blocking,
|
||||
mca_mtl_request_t *mtl_request)
|
||||
{
|
||||
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
|
||||
|
||||
return ompi_mtl_ofi_isend_generic(mtl, comm, dest, tag,
|
||||
convertor, mode, blocking,
|
||||
mtl_request, OFI_CQ_DATA);
|
||||
}";
|
||||
return $ISEND_FUNCTION;
|
||||
}
|
||||
|
||||
sub gen_isend_sym_init {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_isend_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
my $symbol_init =
|
||||
"
|
||||
sym_table->ompi_mtl_ofi_isend[".$OFI_CQ_DATA_EN."]
|
||||
= ".$MTL_OFI_FUNC_NAME.";
|
||||
";
|
||||
return $symbol_init;
|
||||
}
|
||||
|
||||
1;
|
77
ompi/mca/mtl/ofi/mtl_ofi_opt.h
Обычный файл
77
ompi/mca/mtl/ofi/mtl_ofi_opt.h
Обычный файл
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MTL_OFI_OPT_H
|
||||
#define MTL_OFI_OPT_H
|
||||
|
||||
#include "mtl_ofi.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
#define CQ_DATA_TYPES 2
|
||||
#define OMPI_MTL_OFI_SEND_TYPES [CQ_DATA_TYPES]
|
||||
#define OMPI_MTL_OFI_ISEND_TYPES [CQ_DATA_TYPES]
|
||||
#define OMPI_MTL_OFI_IRECV_TYPES [CQ_DATA_TYPES]
|
||||
#define OMPI_MTL_OFI_IPROBE_TYPES [CQ_DATA_TYPES]
|
||||
#define OMPI_MTL_OFI_IMPROBE_TYPES [CQ_DATA_TYPES]
|
||||
|
||||
struct ompi_mtl_ofi_symtable {
|
||||
int (*ompi_mtl_ofi_send OMPI_MTL_OFI_SEND_TYPES )
|
||||
(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int dest,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_pml_base_send_mode_t mode);
|
||||
int (*ompi_mtl_ofi_isend OMPI_MTL_OFI_ISEND_TYPES )
|
||||
(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int dest,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_pml_base_send_mode_t mode,
|
||||
bool blocking,
|
||||
mca_mtl_request_t *mtl_request);
|
||||
int (*ompi_mtl_ofi_irecv OMPI_MTL_OFI_IRECV_TYPES )
|
||||
(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int src,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_mtl_request_t *mtl_request);
|
||||
int (*ompi_mtl_ofi_iprobe OMPI_MTL_OFI_IPROBE_TYPES )
|
||||
(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int src,
|
||||
int tag,
|
||||
int *flag,
|
||||
struct ompi_status_public_t *status);
|
||||
int (*ompi_mtl_ofi_improbe OMPI_MTL_OFI_IMPROBE_TYPES )
|
||||
(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int src,
|
||||
int tag,
|
||||
int *matched,
|
||||
struct ompi_message_t **message,
|
||||
struct ompi_status_public_t *status);
|
||||
};
|
||||
|
||||
/**
|
||||
* MTL OFI specialization function symbol table init
|
||||
*/
|
||||
void ompi_mtl_ofi_send_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
|
||||
void ompi_mtl_ofi_isend_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
|
||||
void ompi_mtl_ofi_irecv_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
|
||||
void ompi_mtl_ofi_iprobe_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
|
||||
void ompi_mtl_ofi_improbe_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MTL_OFI_OPT_H */
|
71
ompi/mca/mtl/ofi/mtl_ofi_send_opt.pm
Обычный файл
71
ompi/mca/mtl/ofi/mtl_ofi_send_opt.pm
Обычный файл
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use opt_common::mtl_ofi_opt_common;
|
||||
package mtl_ofi_send_opt;
|
||||
|
||||
my @true_false = ("false", "true");
|
||||
|
||||
sub gen_funcs {
|
||||
my $gen_file = $_[0];
|
||||
my $gen_type = $_[1];
|
||||
my $OFI_CQ_DATA_EN = "false";
|
||||
|
||||
foreach $OFI_CQ_DATA_EN (@true_false) {
|
||||
my @flags = ($OFI_CQ_DATA_EN);
|
||||
if (($gen_type cmp "FUNC") == 0) {
|
||||
my $FUNC = gen_send_function(\@flags);
|
||||
print $gen_file "$FUNC\n\n";
|
||||
}
|
||||
if (($gen_type cmp "SYM") == 0) {
|
||||
my $SYM = gen_send_sym_init(\@flags);
|
||||
print $gen_file "$SYM\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub gen_send_function {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
my $SEND_FUNCTION =
|
||||
"__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_send_" . $MTL_OFI_NAME_EXT . "(struct mca_mtl_base_module_t *mtl,
|
||||
struct ompi_communicator_t *comm,
|
||||
int dest,
|
||||
int tag,
|
||||
struct opal_convertor_t *convertor,
|
||||
mca_pml_base_send_mode_t mode)
|
||||
{
|
||||
const bool OFI_CQ_DATA = " . $OFI_CQ_DATA_EN . ";
|
||||
|
||||
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
||||
convertor, mode,
|
||||
OFI_CQ_DATA);
|
||||
}";
|
||||
return $SEND_FUNCTION;
|
||||
}
|
||||
|
||||
sub gen_send_sym_init {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_send_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
|
||||
my $OFI_CQ_DATA_EN = $op_flags[0];
|
||||
my $symbol_init =
|
||||
"
|
||||
sym_table->ompi_mtl_ofi_send[".$OFI_CQ_DATA_EN."]
|
||||
= ".$MTL_OFI_FUNC_NAME.";
|
||||
";
|
||||
return $symbol_init;
|
||||
}
|
||||
|
||||
1;
|
@ -19,6 +19,19 @@ BEGIN_C_DECLS
|
||||
/**
|
||||
* MTL Module Interface
|
||||
*/
|
||||
|
||||
typedef struct mca_mtl_ofi_context_t {
|
||||
/* Transmit and receive contexts */
|
||||
struct fid_ep *tx_ep;
|
||||
struct fid_ep *rx_ep;
|
||||
|
||||
/* Completion queue */
|
||||
struct fid_cq *cq;
|
||||
|
||||
/* Thread locking */
|
||||
opal_mutex_t context_lock;
|
||||
} mca_mtl_ofi_context_t;
|
||||
|
||||
typedef struct mca_mtl_ofi_module_t {
|
||||
mca_mtl_base_module_t base;
|
||||
|
||||
@ -31,11 +44,19 @@ typedef struct mca_mtl_ofi_module_t {
|
||||
/** Address vector handle */
|
||||
struct fid_av *av;
|
||||
|
||||
/** Completion queue handle */
|
||||
struct fid_cq *cq;
|
||||
/* Multi-threaded Application flag */
|
||||
bool mpi_thread_multiple;
|
||||
|
||||
/** Endpoint to communicate on */
|
||||
struct fid_ep *ep;
|
||||
/* Scalable Endpoint attributes */
|
||||
struct fid_ep *sep; /* Endpoint object */
|
||||
mca_mtl_ofi_context_t *ofi_ctxt; /* OFI contexts */
|
||||
int threshold_comm_context_id; /* Set threshold communicator ID */
|
||||
int *comm_to_context; /* Map communicator ID to context */
|
||||
int rx_ctx_bits; /* Bits used for RX context */
|
||||
int total_ctxts_used; /* Total number of contexts used */
|
||||
int enable_sep; /* MCA to enable/disable SEP feature */
|
||||
int thread_grouping; /* MCA for thread grouping feature */
|
||||
int num_ofi_contexts; /* MCA for number of contexts to use */
|
||||
|
||||
/** Endpoint name length */
|
||||
size_t epnamelen;
|
||||
@ -71,6 +92,9 @@ typedef struct mca_mtl_ofi_module_t {
|
||||
unsigned long long sync_send_ack;
|
||||
unsigned long long sync_proto_mask;
|
||||
|
||||
/** Optimized function Symbol Tables **/
|
||||
struct ompi_mtl_ofi_symtable sym_table;
|
||||
|
||||
} mca_mtl_ofi_module_t;
|
||||
|
||||
extern mca_mtl_ofi_module_t ompi_mtl_ofi;
|
||||
@ -80,6 +104,19 @@ typedef struct mca_mtl_ofi_component_t {
|
||||
mca_mtl_base_component_2_0_0_t super;
|
||||
} mca_mtl_ofi_component_t;
|
||||
|
||||
typedef enum {
|
||||
OFI_REGULAR_EP = 0,
|
||||
OFI_SCALABLE_EP,
|
||||
} mca_mtl_ofi_ep_type;
|
||||
|
||||
/*
|
||||
* Define upper limit for number of events read from a CQ.
|
||||
* Setting this to 100 as this was deemed optimal from empirical data.
|
||||
* If one wants to read lesser number of events from the CQ, the MCA
|
||||
* variable can be used.
|
||||
*/
|
||||
#define MTL_OFI_MAX_PROG_EVENT_COUNT 100
|
||||
|
||||
/*OFI TAG:
|
||||
* Define 3 different OFI tag distributions:
|
||||
* 1) Support FI_REMOTE_CQ_DATA: No need for source rank in the tag
|
||||
@ -89,12 +126,15 @@ typedef struct mca_mtl_ofi_component_t {
|
||||
* More details of the tags are in the README file (mtl_ofi_tag_mode).
|
||||
*/
|
||||
|
||||
#define MTL_OFI_MINIMUM_CID_BITS (8)
|
||||
|
||||
/* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum)
|
||||
* 01234567 01234567 01234567 012345 67 01234567 01234567 01234567 01234567
|
||||
* | |
|
||||
* context_id |prot| message tag
|
||||
*/
|
||||
#define MTL_OFI_PROTO_BIT_COUNT (2)
|
||||
#define MTL_OFI_HIGHEST_TAG_BIT (0x8000000000000000ULL)
|
||||
|
||||
#define MTL_OFI_CID_MASK_DATA (0xFFFFFFFC00000000ULL)
|
||||
#define MTL_OFI_CID_BIT_COUNT_DATA (30)
|
||||
|
66
ompi/mca/mtl/ofi/opt_common/mtl_ofi_opt.pm.template
Обычный файл
66
ompi/mca/mtl/ofi/opt_common/mtl_ofi_opt.pm.template
Обычный файл
@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use opt_common::mtl_ofi_opt_common;
|
||||
package mtl_ofi_#INSERT FUNCTION NAME HERE#_opt;
|
||||
|
||||
my @en_dis = (0, 1);
|
||||
|
||||
sub gen_funcs {
|
||||
my $gen_file = $_[0];
|
||||
my $gen_type = $_[1];
|
||||
my $#INSERT FLAG NAME HERE# = 0;
|
||||
|
||||
foreach $#INSERT FLAG NAME HERE# (@en_dis) {
|
||||
my @flags = ($#INSERT FLAG NAME HERE#);
|
||||
if (($gen_type cmp "FUNC") == 0) {
|
||||
my $FUNC = gen_#INSERT FUNCTION NAME HERE#_function(\@flags);
|
||||
print $gen_file "$FUNC\n\n";
|
||||
}
|
||||
if (($gen_type cmp "SYM") == 0) {
|
||||
my $SYM = gen_#INSERT FUNCTION NAME HERE#_sym_init(\@flags);
|
||||
print $gen_file "$SYM\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub gen_#INSERT FUNCTION NAME HERE#_function {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_NAME_EXT = opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags);
|
||||
my $#INSERT FLAG NAME HERE#_EN = $op_flags[0];
|
||||
|
||||
my $FUNCTION =
|
||||
"__opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_#INSERT FUNCTION NAME HERE#_" . $MTL_OFI_NAME_EXT . "(#INSERT FUNCTION ARGS HERE#)
|
||||
{
|
||||
const int $#INSERT FLAG NAME HERE# = " . $#INSERT FLAG NAME HERE#_EN . ";
|
||||
|
||||
return ompi_mtl_ofi_#INSERT FUNCTION NAME HERE#_generic(#INSERT FUNCTION ARGS HERE#,
|
||||
#INSERT FLAG NAME HERE#);
|
||||
}";
|
||||
return $FUNCTION;
|
||||
}
|
||||
|
||||
sub gen_#INSERT FUNCTION NAME HERE#_sym_init {
|
||||
my @op_flags = @{$_[0]};
|
||||
my $MTL_OFI_FUNC_NAME = "ompi_mtl_ofi_#INSERT FUNCTION NAME HERE#_" . opt_common::mtl_ofi_opt_common::gen_flags_ext(\@op_flags) . "";
|
||||
my $#INSERT FLAG NAME HERE#_EN = $op_flags[0];
|
||||
my $symbol_init =
|
||||
"
|
||||
sym_table->ompi_mtl_ofi_#INSERT FUNCTION NAME HERE#[".$#INSERT FLAG NAME HERE#_EN."]
|
||||
= ".$MTL_OFI_FUNC_NAME.";
|
||||
";
|
||||
return $symbol_init;
|
||||
}
|
||||
|
||||
1;
|
54
ompi/mca/mtl/ofi/opt_common/mtl_ofi_opt_common.pm
Обычный файл
54
ompi/mca/mtl/ofi/opt_common/mtl_ofi_opt_common.pm
Обычный файл
@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
package opt_common::mtl_ofi_opt_common;
|
||||
|
||||
#
|
||||
# Generate the extension for functions and symbols based off the flags.
|
||||
#
|
||||
sub gen_flags_ext {
|
||||
my $OP_FLAGS = "";
|
||||
my @name_flags = @{$_[0]};
|
||||
my $num_flags = $#name_flags;
|
||||
for my $flag (@name_flags) {
|
||||
$OP_FLAGS = $OP_FLAGS . $flag;
|
||||
if ($num_flags--) {
|
||||
$OP_FLAGS = $OP_FLAGS . '_';
|
||||
}
|
||||
}
|
||||
return $OP_FLAGS;
|
||||
}
|
||||
|
||||
#
|
||||
# Generate the header for the specialized symbol table init function.
|
||||
#
|
||||
sub gen_sym_function_header {
|
||||
my $MTL_OFI_SYM_TYPE = $_[0];
|
||||
my $header =
|
||||
"void ".$MTL_OFI_SYM_TYPE."_init(struct ompi_mtl_ofi_symtable *sym_table)
|
||||
{";
|
||||
return $header;
|
||||
}
|
||||
###
|
||||
|
||||
#
|
||||
# Generate the footer for the specialized symbol table init function.
|
||||
#
|
||||
sub gen_sym_function_footer {
|
||||
my $footer =
|
||||
"}";
|
||||
return $footer;
|
||||
}
|
||||
###
|
||||
|
||||
1;
|
67
opal/mca/btl/ofi/Makefile.am
Обычный файл
67
opal/mca/btl/ofi/Makefile.am
Обычный файл
@ -0,0 +1,67 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved
|
||||
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2018 Intel, inc. All rights reserved
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
#dist_opaldata_DATA = help-mpi-btl-ofi.txt
|
||||
|
||||
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
|
||||
sources = \
|
||||
btl_ofi.h \
|
||||
btl_ofi_component.c \
|
||||
btl_ofi_endpoint.h \
|
||||
btl_ofi_endpoint.c \
|
||||
btl_ofi_module.c \
|
||||
btl_ofi_rdma.h \
|
||||
btl_ofi_rdma.c \
|
||||
btl_ofi_atomics.c \
|
||||
btl_ofi_frag.c \
|
||||
btl_ofi_frag.h \
|
||||
btl_ofi_context.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_opal_btl_ofi_DSO
|
||||
lib =
|
||||
lib_sources =
|
||||
component = mca_btl_ofi.la
|
||||
component_sources = $(sources)
|
||||
else
|
||||
lib = libmca_btl_ofi.la
|
||||
lib_sources = $(sources)
|
||||
component =
|
||||
component_sources =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(opallibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component)
|
||||
mca_btl_ofi_la_SOURCES = $(component_sources)
|
||||
mca_btl_ofi_la_LDFLAGS = -module -avoid-version \
|
||||
$(opal_ofi_LDFLAGS)
|
||||
mca_btl_ofi_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
|
||||
$(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la \
|
||||
$(opal_ofi_LIBS)
|
||||
|
||||
noinst_LTLIBRARIES = $(lib)
|
||||
libmca_btl_ofi_la_SOURCES = $(lib_sources)
|
||||
libmca_btl_ofi_la_LDFLAGS = -module -avoid-version $(opal_ofi_LDFLAGS)
|
||||
libmca_btl_ofi_la_LIBS = $(opal_ofi_LIBS)
|
110
opal/mca/btl/ofi/README
Обычный файл
110
opal/mca/btl/ofi/README
Обычный файл
@ -0,0 +1,110 @@
|
||||
========================================
|
||||
Design notes on BTL/OFI
|
||||
========================================
|
||||
|
||||
This is the RDMA only btl based on OFI Libfabric. The goal is to enable RDMA
|
||||
with multiple vendor hardware through one interface. Most of the operations are
|
||||
managed by upper layer (osc/rdma). This BTL is mostly doing the low level work.
|
||||
|
||||
Tested providers: sockets,psm2,ugni
|
||||
|
||||
========================================
|
||||
|
||||
Component
|
||||
|
||||
This BTL is requesting libfabric version 1.5 API and will not support older versions.
|
||||
|
||||
The required capabilities of this BTL is FI_ATOMIC and FI_RMA with the endpoint type
|
||||
of FI_EP_RDM only. This BTL does NOT support libfabric provider that requires local
|
||||
memory registration (FI_MR_LOCAL).
|
||||
|
||||
BTL/OFI will initialize a module with ONLY the first compatible info returned from OFI.
|
||||
This means it will rely on OFI provider to do load balancing. The support for multiple
|
||||
device might be added later.
|
||||
|
||||
The BTL creates only one endpoint and one CQ.
|
||||
|
||||
========================================
|
||||
|
||||
Memory Registration
|
||||
|
||||
Open MPI has a system in place to exchange remote address and always use the remote
|
||||
virtual address to refer to a piece of memory. However, some libfabric providers might
|
||||
not support the use of virtual address and instead will use zero-based offset addressing.
|
||||
|
||||
FI_MR_VIRT_ADDR is the flag that determine this behavior. mca_btl_ofi_reg_mem() handles
|
||||
this by storing the base address in registration handle in case of the provider does not
|
||||
support FI_MR_VIRT_ADDR. This base address will be used to calculate the offset later in
|
||||
RDMA/Atomic operations.
|
||||
|
||||
The BTL will try to use the address of registration handle as the key. However, if the
|
||||
provider supports FI_MR_PROV_KEY, it will use provider provided key. Simply does not care.
|
||||
|
||||
The BTL does not register local operand or compare. This is why this BTL does not support
|
||||
FI_MR_LOCAL and will allocate every buffer before registering. This means FI_MR_ALLOCATED
|
||||
is supported. So to be explicit.
|
||||
|
||||
Supported MR mode bits (will work with or without):
|
||||
enum:
|
||||
- FI_MR_BASIC
|
||||
- FI_MR_SCALABLE
|
||||
|
||||
mode bits:
|
||||
- FI_MR_VIRT_ADDR
|
||||
- FI_MR_ALLOCATED
|
||||
- FI_MR_PROV_KEY
|
||||
|
||||
The BTL does NOT support (will not work with):
|
||||
- FI_MR_LOCAL
|
||||
- FI_MR_MMU_NOTIFY
|
||||
- FI_MR_RMA_EVENT
|
||||
- FI_MR_ENDPOINT
|
||||
|
||||
Just a reminder, in libfabric API 1.5...
|
||||
FI_MR_BASIC == (FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_VIRT_ADDR)
|
||||
|
||||
========================================
|
||||
|
||||
Completions
|
||||
|
||||
Every operation in this BTL is asynchronous. The completion handling will occur in
|
||||
mca_btl_ofi_component_progress() where we read the CQ with the completion context and
|
||||
execute the callback functions. The completions are local. No remote completion event is
|
||||
generated as local completion already guarantee global completion.
|
||||
|
||||
The BTL keep tracks of number of outstanding operations and provide flush interface.
|
||||
|
||||
========================================
|
||||
|
||||
Sockets Provider
|
||||
|
||||
Sockets provider is the proof of concept provider for libfabric. It is supposed to support
|
||||
all the OFI API with emulations. This provider is considered very slow and bound to raise
|
||||
problems that we might not see from other faster providers.
|
||||
|
||||
Known Problems:
|
||||
- sockets provider uses progress thread and can cause segfault in finalize as we free
|
||||
the resources while progress thread is still using it. sleep(1) was put in
|
||||
mca_btl_ofi_componenet_close() for this reason.
|
||||
- sockets provider deadlock in two-sided mode. Might be something about buffered recv.
|
||||
(August 2018).
|
||||
|
||||
========================================
|
||||
|
||||
Scalable Endpoint
|
||||
|
||||
This BTL will try to use scalable endpoint to create communication context. This will increase
|
||||
multithreaded performance for some application. The default number of context created is 1 and
|
||||
can be tuned VIA MCA parameter "btl_ofi_num_contexts_per_module". It is advised that the number
|
||||
of context should be equal to number of physical core for optimal performance.
|
||||
|
||||
User can disable scalable endpoint by MCA parameter "btl_ofi_disable_sep".
|
||||
With scalable endpoint disbled, the BTL will alias OFI endpoint to both tx and rx context.
|
||||
|
||||
========================================
|
||||
|
||||
Two sided communication
|
||||
|
||||
Two sided communication is added later on to BTL OFI to enable non tag-matching provider
|
||||
to be able to use in Open MPI with this BTL. However, the support is only for "functional"
|
||||
and has not been optimized for performance at this point. (August 2018)
|
378
opal/mca/btl/ofi/btl_ofi.h
Обычный файл
378
opal/mca/btl/ofi/btl_ofi.h
Обычный файл
@ -0,0 +1,378 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2018 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_BTL_OFI_H
|
||||
#define MCA_BTL_OFI_H
|
||||
|
||||
#include "opal_config.h"
|
||||
#include <sys/types.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Open MPI includes */
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/mpool/mpool.h"
|
||||
#include "opal/mca/btl/base/btl_base_error.h"
|
||||
#include "opal/mca/rcache/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
|
||||
#include <rdma/fabric.h>
|
||||
#include <rdma/fi_domain.h>
|
||||
#include <rdma/fi_errno.h>
|
||||
#include <rdma/fi_cm.h>
|
||||
#include <rdma/fi_endpoint.h>
|
||||
#include <rdma/fi_rma.h>
|
||||
|
||||
BEGIN_C_DECLS
|
||||
#define MCA_BTL_OFI_MAX_MODULES 16
|
||||
#define MCA_BTL_OFI_NUM_CQE_READ 64
|
||||
|
||||
#define MCA_BTL_OFI_DEFAULT_RD_NUM 10
|
||||
#define MCA_BTL_OFI_DEFAULT_MAX_CQE 128
|
||||
#define MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD 64
|
||||
|
||||
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
|
||||
|
||||
#define TWO_SIDED_ENABLED mca_btl_ofi_component.two_sided_enabled
|
||||
|
||||
enum mca_btl_ofi_mode {
|
||||
MCA_BTL_OFI_MODE_ONE_SIDED = 0,
|
||||
MCA_BTL_OFI_MODE_TWO_SIDED,
|
||||
MCA_BTL_OFI_MODE_FULL_SUPPORT,
|
||||
MCA_BTL_OFI_MODE_TOTAL
|
||||
};
|
||||
|
||||
enum mca_btl_ofi_hdr_type {
|
||||
MCA_BTL_OFI_TYPE_PUT = 0,
|
||||
MCA_BTL_OFI_TYPE_GET,
|
||||
MCA_BTL_OFI_TYPE_AOP,
|
||||
MCA_BTL_OFI_TYPE_AFOP,
|
||||
MCA_BTL_OFI_TYPE_CSWAP,
|
||||
MCA_BTL_OFI_TYPE_SEND,
|
||||
MCA_BTL_OFI_TYPE_RECV,
|
||||
MCA_BTL_OFI_TYPE_TOTAL
|
||||
};
|
||||
|
||||
struct mca_btl_ofi_context_t {
|
||||
int32_t context_id;
|
||||
|
||||
/* transmit context */
|
||||
struct fid_ep *tx_ctx;
|
||||
struct fid_ep *rx_ctx;
|
||||
|
||||
/* completion queue */
|
||||
struct fid_cq *cq;
|
||||
|
||||
/* completion info freelist */
|
||||
/* We have it per context to reduce the thread contention
|
||||
* on the freelist. Things can get really slow. */
|
||||
opal_free_list_t rdma_comp_list;
|
||||
opal_free_list_t frag_comp_list;
|
||||
opal_free_list_t frag_list;
|
||||
|
||||
/* for thread locking */
|
||||
volatile int32_t lock;
|
||||
};
|
||||
typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t;
|
||||
|
||||
/**
|
||||
* @brief OFI BTL module
|
||||
*/
|
||||
struct mca_btl_ofi_module_t {
|
||||
/** base BTL interface */
|
||||
mca_btl_base_module_t super;
|
||||
|
||||
/* libfabric components */
|
||||
struct fi_info *fabric_info;
|
||||
struct fid_fabric *fabric;
|
||||
struct fid_domain *domain;
|
||||
struct fid_ep *ofi_endpoint;
|
||||
struct fid_av *av;
|
||||
|
||||
int num_contexts;
|
||||
mca_btl_ofi_context_t *contexts;
|
||||
|
||||
char *linux_device_name;
|
||||
|
||||
/** whether the module has been fully initialized or not */
|
||||
bool initialized;
|
||||
bool use_virt_addr;
|
||||
bool is_scalable_ep;
|
||||
|
||||
int64_t outstanding_rdma;
|
||||
int64_t outstanding_send;
|
||||
|
||||
/** linked list of BTL endpoints. this list is never searched so
|
||||
* there is no need for a complicated structure here at this time*/
|
||||
opal_list_t endpoints;
|
||||
|
||||
opal_mutex_t module_lock;
|
||||
opal_hash_table_t id_to_endpoint;
|
||||
|
||||
/** registration cache */
|
||||
mca_rcache_base_module_t *rcache;
|
||||
};
|
||||
typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t;
|
||||
|
||||
extern mca_btl_ofi_module_t mca_btl_ofi_module_template;
|
||||
|
||||
/**
|
||||
* @brief OFI BTL component
|
||||
*/
|
||||
struct mca_btl_ofi_component_t {
|
||||
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
||||
|
||||
/** number of TL modules */
|
||||
int module_count;
|
||||
int num_contexts_per_module;
|
||||
int num_cqe_read;
|
||||
int progress_threshold;
|
||||
int mode;
|
||||
int rd_num;
|
||||
bool two_sided_enabled;
|
||||
|
||||
size_t namelen;
|
||||
|
||||
/** All BTL OFI modules (1 per tl) */
|
||||
mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
|
||||
|
||||
};
|
||||
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
|
||||
|
||||
OPAL_MODULE_DECLSPEC extern mca_btl_ofi_component_t mca_btl_ofi_component;
|
||||
|
||||
struct mca_btl_base_registration_handle_t {
|
||||
uint64_t rkey;
|
||||
void *desc;
|
||||
void *base_addr;
|
||||
};
|
||||
|
||||
struct mca_btl_ofi_reg_t {
|
||||
mca_rcache_base_registration_t base;
|
||||
struct fid_mr *ur_mr;
|
||||
|
||||
/* remote handle */
|
||||
mca_btl_base_registration_handle_t handle;
|
||||
};
|
||||
typedef struct mca_btl_ofi_reg_t mca_btl_ofi_reg_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ofi_reg_t);
|
||||
|
||||
struct mca_btl_ofi_header_t {
|
||||
mca_btl_base_tag_t tag;
|
||||
size_t len;
|
||||
};
|
||||
typedef struct mca_btl_ofi_header_t mca_btl_ofi_header_t;
|
||||
|
||||
struct mca_btl_ofi_base_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
mca_btl_base_segment_t segments[2];
|
||||
|
||||
int context_id;
|
||||
struct mca_btl_ofi_module_t *btl;
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
opal_free_list_t *free_list;
|
||||
mca_btl_ofi_header_t hdr;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_ofi_base_frag_t mca_btl_ofi_base_frag_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ofi_base_frag_t);
|
||||
|
||||
|
||||
struct mca_btl_ofi_completion_context_t {
|
||||
struct fi_context ctx;
|
||||
void *comp;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_ofi_completion_context_t mca_btl_ofi_completion_context_t;
|
||||
|
||||
/* completion structure store information needed
|
||||
* for RDMA callbacks */
|
||||
struct mca_btl_ofi_base_completion_t {
|
||||
opal_free_list_item_t comp_list;
|
||||
|
||||
opal_free_list_t *my_list;
|
||||
|
||||
struct mca_btl_base_module_t *btl;
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
struct mca_btl_ofi_context_t *my_context;
|
||||
int type;
|
||||
};
|
||||
typedef struct mca_btl_ofi_base_completion_t mca_btl_ofi_base_completion_t;
|
||||
|
||||
struct mca_btl_ofi_rdma_completion_t {
|
||||
mca_btl_ofi_base_completion_t base;
|
||||
mca_btl_ofi_completion_context_t comp_ctx;
|
||||
void *local_address;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
|
||||
uint64_t operand;
|
||||
uint64_t compare;
|
||||
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc;
|
||||
void *cbcontext;
|
||||
void *cbdata;
|
||||
};
|
||||
typedef struct mca_btl_ofi_rdma_completion_t mca_btl_ofi_rdma_completion_t;
|
||||
|
||||
struct mca_btl_ofi_frag_completion_t {
|
||||
mca_btl_ofi_base_completion_t base;
|
||||
mca_btl_ofi_completion_context_t comp_ctx;
|
||||
mca_btl_ofi_base_frag_t *frag;
|
||||
};
|
||||
typedef struct mca_btl_ofi_frag_completion_t mca_btl_ofi_frag_completion_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ofi_rdma_completion_t);
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ofi_frag_completion_t);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the put operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to put from (registered)
|
||||
* @param remote_address (IN) Remote address to put to (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to put
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
int mca_btl_ofi_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the get operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to put from (registered)
|
||||
* @param remote_address (IN) Remote address to put to (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to put
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
int mca_btl_ofi_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
|
||||
int mca_btl_ofi_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
|
||||
|
||||
int mca_btl_ofi_finalize (mca_btl_base_module_t *btl);
|
||||
|
||||
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module);
|
||||
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
|
||||
mca_rcache_base_registration_t *reg);
|
||||
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
|
||||
|
||||
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);
|
||||
|
||||
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode);
|
||||
|
||||
int mca_btl_ofi_post_recvs(mca_btl_base_module_t* module, mca_btl_ofi_context_t *context, int count);
|
||||
void mca_btl_ofi_exit(void);
|
||||
|
||||
/* thread atomics */
|
||||
static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context)
|
||||
{
|
||||
return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1));
|
||||
}
|
||||
|
||||
static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context)
|
||||
{
|
||||
while (mca_btl_ofi_context_trylock(context));
|
||||
}
|
||||
|
||||
static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context)
|
||||
{
|
||||
opal_atomic_mb();
|
||||
context->lock = 0;
|
||||
}
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
193
opal/mca/btl/ofi/btl_ofi_atomics.c
Обычный файл
193
opal/mca/btl/ofi/btl_ofi_atomics.c
Обычный файл
@ -0,0 +1,193 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <rdma/fi_atomic.h>
|
||||
#include "btl_ofi_rdma.h"
|
||||
|
||||
static inline int to_fi_op(mca_btl_base_atomic_op_t op)
|
||||
{
|
||||
switch (op) {
|
||||
case MCA_BTL_ATOMIC_ADD:
|
||||
return FI_SUM;
|
||||
case MCA_BTL_ATOMIC_SWAP:
|
||||
return FI_ATOMIC_WRITE;
|
||||
default:
|
||||
BTL_ERROR(("Unknown or unsupported atomic op."));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
|
||||
/* just to squash the warning */
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc;
|
||||
int fi_datatype = FI_UINT64;
|
||||
int fi_op;
|
||||
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
|
||||
mca_btl_ofi_rdma_completion_t *comp = NULL;
|
||||
mca_btl_ofi_context_t *ofi_context;
|
||||
|
||||
ofi_context = get_ofi_context(ofi_btl);
|
||||
|
||||
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
|
||||
fi_datatype = FI_UINT32;
|
||||
}
|
||||
|
||||
fi_op = to_fi_op(op);
|
||||
|
||||
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
|
||||
ofi_context,
|
||||
local_address,
|
||||
local_handle,
|
||||
cbfunc, cbcontext, cbdata,
|
||||
MCA_BTL_OFI_TYPE_AFOP);
|
||||
|
||||
/* copy the operand because it might get freed from upper layer */
|
||||
comp->operand = (uint64_t) operand;
|
||||
|
||||
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
|
||||
|
||||
rc = fi_fetch_atomic(ofi_context->tx_ctx,
|
||||
(void*) &comp->operand, 1, NULL, /* operand */
|
||||
local_address, local_handle->desc, /* results */
|
||||
btl_endpoint->peer_addr, /* remote addr */
|
||||
remote_address, remote_handle->rkey, /* remote buffer */
|
||||
fi_datatype, fi_op, &comp->comp_ctx);
|
||||
|
||||
if (rc == -FI_EAGAIN) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
} else if (rc < 0) {
|
||||
BTL_ERROR(("fi_fetch_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc;
|
||||
int fi_datatype = FI_UINT64;
|
||||
int fi_op;
|
||||
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
|
||||
mca_btl_ofi_rdma_completion_t *comp = NULL;
|
||||
mca_btl_ofi_context_t *ofi_context;
|
||||
|
||||
ofi_context = get_ofi_context(ofi_btl);
|
||||
|
||||
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
|
||||
fi_datatype = FI_UINT32;
|
||||
}
|
||||
|
||||
fi_op = to_fi_op(op);
|
||||
|
||||
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
|
||||
ofi_context,
|
||||
NULL,
|
||||
NULL,
|
||||
cbfunc, cbcontext, cbdata,
|
||||
MCA_BTL_OFI_TYPE_AOP);
|
||||
|
||||
/* copy the operand because it might get freed from upper layer */
|
||||
comp->operand = (uint64_t) operand;
|
||||
|
||||
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
|
||||
|
||||
rc = fi_atomic(ofi_context->tx_ctx,
|
||||
(void*) &comp->operand, 1, NULL, /* operand */
|
||||
btl_endpoint->peer_addr, /* remote addr */
|
||||
remote_address, remote_handle->rkey, /* remote buffer */
|
||||
fi_datatype, fi_op, &comp->comp_ctx);
|
||||
|
||||
if (rc == -FI_EAGAIN) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
} else if (rc < 0) {
|
||||
BTL_ERROR(("fi_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc;
|
||||
int fi_datatype = FI_UINT64;
|
||||
|
||||
mca_btl_ofi_rdma_completion_t *comp = NULL;
|
||||
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
|
||||
mca_btl_ofi_context_t *ofi_context;
|
||||
|
||||
ofi_context = get_ofi_context(ofi_btl);
|
||||
|
||||
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
|
||||
fi_datatype = FI_UINT32;
|
||||
}
|
||||
|
||||
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
|
||||
ofi_context,
|
||||
local_address,
|
||||
local_handle,
|
||||
cbfunc, cbcontext, cbdata,
|
||||
MCA_BTL_OFI_TYPE_CSWAP);
|
||||
|
||||
/* copy the operand because it might get freed from upper layer */
|
||||
comp->operand = (uint64_t) value;
|
||||
comp->compare = (uint64_t) compare;
|
||||
|
||||
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
|
||||
|
||||
/* perform atomic */
|
||||
rc = fi_compare_atomic(ofi_context->tx_ctx,
|
||||
(void*) &comp->operand, 1, NULL,
|
||||
(void*) &comp->compare, NULL,
|
||||
local_address, local_handle->desc,
|
||||
btl_endpoint->peer_addr,
|
||||
remote_address, remote_handle->rkey,
|
||||
fi_datatype,
|
||||
FI_CSWAP,
|
||||
&comp->comp_ctx);
|
||||
|
||||
if (rc == -FI_EAGAIN) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
} else if (rc < 0) {
|
||||
BTL_ERROR(("fi_compare_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
699
opal/mca/btl/ofi/btl_ofi_component.c
Обычный файл
699
opal/mca/btl/ofi/btl_ofi_component.c
Обычный файл
@ -0,0 +1,699 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/mca/common/ofi/common_ofi.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
#include "btl_ofi_rdma.h"
|
||||
#include "btl_ofi_frag.h"
|
||||
|
||||
#define MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS (FI_RMA | FI_ATOMIC)
|
||||
#define MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS (FI_MSG)
|
||||
|
||||
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR)
|
||||
|
||||
static char *prov_include;
|
||||
static char *ofi_progress_mode;
|
||||
static bool disable_sep;
|
||||
static int mca_btl_ofi_init_device(struct fi_info *info);
|
||||
|
||||
/* validate information returned from fi_getinfo().
|
||||
* return OPAL_ERROR if we dont have what we need. */
|
||||
static int validate_info(struct fi_info *info, uint64_t required_caps)
|
||||
{
|
||||
int mr_mode;
|
||||
|
||||
BTL_VERBOSE(("validating device: %s", info->domain_attr->name));
|
||||
|
||||
/* we need exactly all the required bits */
|
||||
if ((info->caps & required_caps) != required_caps) {
|
||||
BTL_VERBOSE(("unsupported caps"));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
/* we need FI_EP_RDM */
|
||||
if (info->ep_attr->type != FI_EP_RDM) {
|
||||
BTL_VERBOSE(("unsupported EP type"));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
mr_mode = info->domain_attr->mr_mode;
|
||||
|
||||
if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE ||
|
||||
(mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) {
|
||||
BTL_VERBOSE(("unsupported MR mode"));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
if (!(info->tx_attr->op_flags | FI_DELIVERY_COMPLETE)) {
|
||||
BTL_VERBOSE(("the endpoint tx_ctx does not support FI_DELIVERY_COMPLETE"));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("device: %s is good to go.", info->domain_attr->name));
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Register the MCA parameters */
|
||||
static int mca_btl_ofi_component_register(void)
|
||||
{
|
||||
char *msg;
|
||||
mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template;
|
||||
|
||||
asprintf(&msg, "BTL OFI mode of operation. Valid values are: %d = One-Sided only, %d=Two-Sided only, "
|
||||
"%d = Both one and two sided. BTL OFI is only optimized for one-sided communication",
|
||||
MCA_BTL_OFI_MODE_ONE_SIDED,
|
||||
MCA_BTL_OFI_MODE_TWO_SIDED,
|
||||
MCA_BTL_OFI_MODE_FULL_SUPPORT);
|
||||
if (NULL == msg) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
mca_btl_ofi_component.mode = MCA_BTL_OFI_MODE_ONE_SIDED;
|
||||
(void)mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"mode",
|
||||
msg,
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_btl_ofi_component.mode);
|
||||
|
||||
/* fi_getinfo with prov_name == NULL means ALL provider.
|
||||
* Since now we are using the first valid info returned, I'm not sure
|
||||
* if we need to provide the support for comma limited provider list. */
|
||||
prov_include = NULL;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"provider_include",
|
||||
"OFI provider that ofi btl will query for. This parameter only "
|
||||
"accept ONE provider name. "
|
||||
"(e.g., \"psm2\"; an empty value means that all providers will "
|
||||
"be considered.",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&prov_include);
|
||||
|
||||
mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"num_cq_read",
|
||||
"Number of completion entries to read from a single cq_read. ",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_btl_ofi_component.num_cqe_read);
|
||||
|
||||
ofi_progress_mode = "unspec";
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"progress_mode",
|
||||
"requested provider progress mode. [unspec, auto, manual]"
|
||||
"(default: unspec)",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ofi_progress_mode);
|
||||
|
||||
mca_btl_ofi_component.num_contexts_per_module = 1;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"num_contexts_per_module",
|
||||
"number of communication context per module to create. "
|
||||
"This should increase multithreaded performance but it is "
|
||||
"advised that this number should be lower than total cores.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_btl_ofi_component.num_contexts_per_module);
|
||||
|
||||
disable_sep = false;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"disable_sep",
|
||||
"force btl/ofi to never use scalable endpoint.",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&disable_sep);
|
||||
|
||||
mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"progress_threshold",
|
||||
"number of outstanding operation before btl will progress "
|
||||
"automatically. Tuning this might improve performance on "
|
||||
"certain type of application.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_btl_ofi_component.progress_threshold);
|
||||
|
||||
mca_btl_ofi_component.rd_num = MCA_BTL_OFI_DEFAULT_RD_NUM;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"rd_num",
|
||||
"Number of receive descriptor posted per context.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_btl_ofi_component.rd_num);
|
||||
|
||||
|
||||
/* for now we want this component to lose to the MTL. */
|
||||
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
|
||||
|
||||
return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version,
|
||||
&module->super);
|
||||
}
|
||||
|
||||
static int mca_btl_ofi_component_open(void)
|
||||
{
|
||||
mca_btl_ofi_component.module_count = 0;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* component cleanup - sanity checking of queue lengths
|
||||
*/
|
||||
static int mca_btl_ofi_component_close(void)
|
||||
{
|
||||
/* If we don't sleep, sockets provider freaks out. */
|
||||
sleep(1);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
void mca_btl_ofi_exit(void)
|
||||
{
|
||||
BTL_ERROR(("BTL OFI will now abort."));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* OFI component initialization:
|
||||
* read interface list from kernel and compare against component parameters
|
||||
* then create a BTL instance for selected interfaces
|
||||
*/
|
||||
|
||||
static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
/* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */
|
||||
int rc;
|
||||
uint64_t progress_mode;
|
||||
unsigned resource_count = 0;
|
||||
struct mca_btl_base_module_t **base_modules;
|
||||
|
||||
BTL_VERBOSE(("initializing ofi btl"));
|
||||
|
||||
/* Set up libfabric hints. */
|
||||
uint32_t libfabric_api;
|
||||
libfabric_api = fi_version();
|
||||
|
||||
/* bail if OFI version is less than 1.5. */
|
||||
if (libfabric_api < FI_VERSION(1, 5)) {
|
||||
BTL_VERBOSE(("ofi btl disqualified because OFI version < 1.5."));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct fi_info *info, *info_list, *selected_info;
|
||||
struct fi_info hints = {0};
|
||||
struct fi_ep_attr ep_attr = {0};
|
||||
struct fi_rx_attr rx_attr = {0};
|
||||
struct fi_tx_attr tx_attr = {0};
|
||||
struct fi_fabric_attr fabric_attr = {0};
|
||||
struct fi_domain_attr domain_attr = {0};
|
||||
uint64_t required_caps;
|
||||
|
||||
switch (mca_btl_ofi_component.mode) {
|
||||
|
||||
case MCA_BTL_OFI_MODE_TWO_SIDED:
|
||||
mca_btl_ofi_component.two_sided_enabled = true;
|
||||
required_caps = MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS;
|
||||
break;
|
||||
|
||||
case MCA_BTL_OFI_MODE_FULL_SUPPORT:
|
||||
mca_btl_ofi_component.two_sided_enabled = true;
|
||||
required_caps = MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS |
|
||||
MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* default to only one sided. */
|
||||
required_caps = MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS;
|
||||
break;
|
||||
}
|
||||
|
||||
/* Select the provider */
|
||||
fabric_attr.prov_name = prov_include;
|
||||
|
||||
domain_attr.mr_mode = MCA_BTL_OFI_REQUESTED_MR_MODE;
|
||||
|
||||
/* message progression mode. */
|
||||
if (!strcmp(ofi_progress_mode, "auto")) {
|
||||
progress_mode = FI_PROGRESS_AUTO;
|
||||
} else if (!strcmp(ofi_progress_mode, "manual")) {
|
||||
progress_mode = FI_PROGRESS_MANUAL;
|
||||
} else {
|
||||
progress_mode = FI_PROGRESS_UNSPEC;
|
||||
}
|
||||
|
||||
domain_attr.control_progress = progress_mode;
|
||||
domain_attr.data_progress = progress_mode;
|
||||
|
||||
/* select endpoint type */
|
||||
ep_attr.type = FI_EP_RDM;
|
||||
|
||||
/* ask for capabilities */
|
||||
/* TODO: catch the caps here. */
|
||||
hints.caps = required_caps;
|
||||
hints.mode = FI_CONTEXT;
|
||||
|
||||
/* Ask for completion context */
|
||||
hints.mode = FI_CONTEXT;
|
||||
|
||||
hints.fabric_attr = &fabric_attr;
|
||||
hints.domain_attr = &domain_attr;
|
||||
hints.ep_attr = &ep_attr;
|
||||
hints.tx_attr = &tx_attr;
|
||||
hints.rx_attr = &rx_attr;
|
||||
|
||||
/* for now */
|
||||
tx_attr.iov_limit = 1;
|
||||
rx_attr.iov_limit = 1;
|
||||
|
||||
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
|
||||
|
||||
mca_btl_ofi_component.module_count = 0;
|
||||
|
||||
/* do the query. */
|
||||
rc = fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, &hints, &info_list);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("fi_getinfo failed with code %d: %s",rc, fi_strerror(-rc)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* count the number of resources/ */
|
||||
info = info_list;
|
||||
while(info) {
|
||||
resource_count++;
|
||||
info = info->next;
|
||||
}
|
||||
BTL_VERBOSE(("ofi btl found %d possible resources.", resource_count));
|
||||
|
||||
info = info_list;
|
||||
|
||||
while(info) {
|
||||
rc = validate_info(info, required_caps);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
/* Device passed sanity check, let's make a module.
|
||||
*
|
||||
* The initial fi_getinfo() call will return a list of providers
|
||||
* available for this process. once a provider is selected from the
|
||||
* list, we will cycle through the remaining list to identify NICs
|
||||
* serviced by this provider, and try to pick one on the same NUMA
|
||||
* node as this process. If there are no NICs on the same NUMA node,
|
||||
* we pick one in a manner which allows all ranks to make balanced
|
||||
* use of available NICs on the system.
|
||||
*
|
||||
* Most providers give a separate fi_info object for each NIC,
|
||||
* however some may have multiple info objects with different
|
||||
* attributes for the same NIC. The initial provider attributes
|
||||
* are used to ensure that all NICs we return provide the same
|
||||
* capabilities as the inital one.
|
||||
*/
|
||||
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
|
||||
rc = mca_btl_ofi_init_device(selected_info);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
info = selected_info;
|
||||
break;
|
||||
}
|
||||
}
|
||||
info = info->next;
|
||||
}
|
||||
|
||||
/* We are done with the returned info. */
|
||||
fi_freeinfo(info_list);
|
||||
|
||||
/* pass module array back to caller */
|
||||
base_modules = calloc (mca_btl_ofi_component.module_count, sizeof (*base_modules));
|
||||
if (NULL == base_modules) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
memcpy(base_modules, mca_btl_ofi_component.modules,
|
||||
mca_btl_ofi_component.module_count *sizeof (mca_btl_ofi_component.modules[0]));
|
||||
|
||||
BTL_VERBOSE(("ofi btl initialization complete. found %d suitable transports",
|
||||
mca_btl_ofi_component.module_count));
|
||||
|
||||
*num_btl_modules = mca_btl_ofi_component.module_count;
|
||||
|
||||
return base_modules;
|
||||
}
|
||||
|
||||
static int mca_btl_ofi_init_device(struct fi_info *info)
|
||||
{
|
||||
int rc;
|
||||
int *module_count = &mca_btl_ofi_component.module_count;
|
||||
size_t namelen;
|
||||
size_t num_contexts_to_create;
|
||||
|
||||
char *linux_device_name;
|
||||
char ep_name[FI_NAME_MAX];
|
||||
|
||||
struct fi_info *ofi_info;
|
||||
struct fi_ep_attr *ep_attr;
|
||||
struct fi_domain_attr *domain_attr;
|
||||
struct fi_av_attr av_attr = {0};
|
||||
struct fid_fabric *fabric = NULL;
|
||||
struct fid_domain *domain = NULL;
|
||||
struct fid_ep *ep = NULL;
|
||||
struct fid_av *av = NULL;
|
||||
|
||||
mca_btl_ofi_module_t *module;
|
||||
|
||||
module = mca_btl_ofi_module_alloc(mca_btl_ofi_component.mode);
|
||||
if (NULL == module) {
|
||||
BTL_VERBOSE(("failed allocating ofi module"));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* If the user ask for two sided support, something bad is happening
|
||||
* to the MTL, so we will take maximum priority to supersede the MTL. */
|
||||
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
|
||||
|
||||
/* make a copy of the given info to store on the module */
|
||||
ofi_info = fi_dupinfo(info);
|
||||
ep_attr = ofi_info->ep_attr;
|
||||
domain_attr = ofi_info->domain_attr;
|
||||
|
||||
/* mtl_btl_ofi_rcache_init() initializes patcher which should only
|
||||
* take place things are single threaded. OFI providers may start
|
||||
* spawn threads, so initialize the rcache before creating OFI objects
|
||||
* to prevent races. */
|
||||
mca_btl_ofi_rcache_init(module);
|
||||
|
||||
linux_device_name = info->domain_attr->name;
|
||||
BTL_VERBOSE(("initializing dev:%s provider:%s",
|
||||
linux_device_name,
|
||||
info->fabric_attr->prov_name));
|
||||
|
||||
/* fabric */
|
||||
rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_fabric with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* domain */
|
||||
rc = fi_domain(fabric, ofi_info, &domain, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_domain with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* AV */
|
||||
av_attr.type = FI_AV_MAP;
|
||||
rc = fi_av_open(domain, &av_attr, &av, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_av_open with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
num_contexts_to_create = mca_btl_ofi_component.num_contexts_per_module;
|
||||
|
||||
/* If the domain support scalable endpoint. */
|
||||
if (domain_attr->max_ep_tx_ctx > 1 && !disable_sep) {
|
||||
|
||||
BTL_VERBOSE(("btl/ofi using scalable endpoint."));
|
||||
|
||||
if (num_contexts_to_create > domain_attr->max_ep_tx_ctx) {
|
||||
BTL_VERBOSE(("cannot create requested %u contexts. (node max=%zu)",
|
||||
module->num_contexts,
|
||||
domain_attr->max_ep_tx_ctx));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* modify the info to let the provider know we are creating x contexts */
|
||||
ep_attr->tx_ctx_cnt = num_contexts_to_create;
|
||||
ep_attr->rx_ctx_cnt = num_contexts_to_create;
|
||||
|
||||
/* create scalable endpoint */
|
||||
rc = fi_scalable_ep(domain, ofi_info, &ep, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_scalable_ep with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
module->num_contexts = num_contexts_to_create;
|
||||
module->is_scalable_ep = true;
|
||||
|
||||
/* create contexts */
|
||||
module->contexts = mca_btl_ofi_context_alloc_scalable(ofi_info,
|
||||
domain, ep, av,
|
||||
num_contexts_to_create);
|
||||
|
||||
} else {
|
||||
/* warn the user if they want more than 1 context */
|
||||
if (num_contexts_to_create > 1) {
|
||||
BTL_ERROR(("cannot create %zu contexts as the provider does not support "
|
||||
"scalable endpoint. Falling back to single context endpoint.",
|
||||
num_contexts_to_create));
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("btl/ofi using normal endpoint."));
|
||||
|
||||
rc = fi_endpoint(domain, ofi_info, &ep, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
module->num_contexts = 1;
|
||||
module->is_scalable_ep = false;
|
||||
|
||||
/* create contexts */
|
||||
module->contexts = mca_btl_ofi_context_alloc_normal(ofi_info,
|
||||
domain, ep, av);
|
||||
}
|
||||
|
||||
if (NULL == module->contexts) {
|
||||
/* error message is already printed */
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* enable the endpoint for using */
|
||||
rc = fi_enable(ep);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_enable with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* Everything succeeded, lets create a module for this device. */
|
||||
/* store the information. */
|
||||
module->fabric_info = ofi_info;
|
||||
module->fabric = fabric;
|
||||
module->domain = domain;
|
||||
module->av = av;
|
||||
module->ofi_endpoint = ep;
|
||||
module->linux_device_name = linux_device_name;
|
||||
module->outstanding_rdma = 0;
|
||||
module->use_virt_addr = false;
|
||||
|
||||
if (ofi_info->domain_attr->mr_mode == FI_MR_BASIC ||
|
||||
ofi_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) {
|
||||
module->use_virt_addr = true;
|
||||
}
|
||||
|
||||
/* create endpoint list */
|
||||
OBJ_CONSTRUCT(&module->endpoints, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t);
|
||||
|
||||
rc = opal_hash_table_init (&module->id_to_endpoint, 512);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_ERROR(("error initializing hash table."));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* create and send the modex for this device */
|
||||
namelen = sizeof(ep_name);
|
||||
rc = fi_getname((fid_t)ep, &ep_name[0], &namelen);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_getname with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
|
||||
/* If we have two-sided support. */
|
||||
if (TWO_SIDED_ENABLED) {
|
||||
|
||||
/* post wildcard recvs */
|
||||
for (int i=0; i < module->num_contexts; i++) {
|
||||
rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t*) module,
|
||||
&module->contexts[i],
|
||||
mca_btl_ofi_component.rd_num);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* post our endpoint name so peer can use it to connect to us */
|
||||
OPAL_MODEX_SEND(rc,
|
||||
OPAL_PMIX_GLOBAL,
|
||||
&mca_btl_ofi_component.super.btl_version,
|
||||
&ep_name,
|
||||
namelen);
|
||||
mca_btl_ofi_component.namelen = namelen;
|
||||
|
||||
/* add this module to the list */
|
||||
mca_btl_ofi_component.modules[(*module_count)++] = module;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
|
||||
fail:
|
||||
/* clean up */
|
||||
|
||||
/* close basic ep before closing av */
|
||||
if (NULL != ep && !module->is_scalable_ep) {
|
||||
fi_close(&ep->fid);
|
||||
ep = NULL;
|
||||
}
|
||||
|
||||
/* if the contexts have not been initiated, num_contexts should
|
||||
* be zero and we skip this. */
|
||||
for (int i=0; i < module->num_contexts; i++) {
|
||||
mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep);
|
||||
}
|
||||
free(module->contexts);
|
||||
|
||||
/* check for NULL ep to avoid double-close */
|
||||
if (NULL != ep) {
|
||||
fi_close(&ep->fid);
|
||||
}
|
||||
|
||||
/* close av after closing basic ep */
|
||||
if (NULL != av) {
|
||||
fi_close(&av->fid);
|
||||
}
|
||||
|
||||
if (NULL != domain) {
|
||||
fi_close(&domain->fid);
|
||||
}
|
||||
|
||||
if (NULL != fabric) {
|
||||
fi_close(&fabric->fid);
|
||||
}
|
||||
free(module);
|
||||
|
||||
/* not really a failure. just skip this device. */
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief OFI BTL progress function
|
||||
*
|
||||
* This function explictly progresses all workers.
|
||||
*/
|
||||
static int mca_btl_ofi_component_progress (void)
|
||||
{
|
||||
int events = 0;
|
||||
mca_btl_ofi_context_t *context;
|
||||
|
||||
for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) {
|
||||
mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];
|
||||
|
||||
/* progress context we own first. */
|
||||
context = get_ofi_context(module);
|
||||
|
||||
if (mca_btl_ofi_context_trylock(context)) {
|
||||
events += mca_btl_ofi_context_progress(context);
|
||||
mca_btl_ofi_context_unlock(context);
|
||||
}
|
||||
|
||||
/* if there is nothing to do, try progress other's. */
|
||||
if (events == 0) {
|
||||
for (int j = 0 ; j < module->num_contexts ; j++ ) {
|
||||
|
||||
context = get_ofi_context_rr(module);
|
||||
|
||||
if (mca_btl_ofi_context_trylock(context)) {
|
||||
events += mca_btl_ofi_context_progress(context);
|
||||
mca_btl_ofi_context_unlock(context);
|
||||
}
|
||||
|
||||
/* If we did something, good enough. return now.
|
||||
* This is crucial for performance/latency. */
|
||||
if (events > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return events;
|
||||
}
|
||||
|
||||
/** OFI btl component */
|
||||
mca_btl_ofi_component_t mca_btl_ofi_component = {
|
||||
.super = {
|
||||
.btl_version = {
|
||||
MCA_BTL_DEFAULT_VERSION("ofi"),
|
||||
.mca_open_component = mca_btl_ofi_component_open,
|
||||
.mca_close_component = mca_btl_ofi_component_close,
|
||||
.mca_register_component_params = mca_btl_ofi_component_register,
|
||||
},
|
||||
.btl_data = {
|
||||
/* The component is not checkpoint ready */
|
||||
.param_field = MCA_BASE_METADATA_PARAM_NONE
|
||||
},
|
||||
|
||||
.btl_init = mca_btl_ofi_component_init,
|
||||
.btl_progress = mca_btl_ofi_component_progress,
|
||||
},
|
||||
};
|
463
opal/mca/btl/ofi/btl_ofi_context.c
Обычный файл
463
opal/mca/btl/ofi/btl_ofi_context.c
Обычный файл
@ -0,0 +1,463 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* $COPYRIGHT$
|
||||
* Copyright (c) 2018 Intel Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_frag.h"
|
||||
#include "btl_ofi_rdma.h"
|
||||
|
||||
#if OPAL_HAVE_THREAD_LOCAL
|
||||
opal_thread_local mca_btl_ofi_context_t *my_context = NULL;
|
||||
#endif /* OPAL_HAVE_THREAD_LOCAL */
|
||||
|
||||
int init_context_freelists(mca_btl_ofi_context_t *context)
|
||||
{
|
||||
int rc;
|
||||
OBJ_CONSTRUCT(&context->rdma_comp_list, opal_free_list_t);
|
||||
rc = opal_free_list_init(&context->rdma_comp_list,
|
||||
sizeof(mca_btl_ofi_rdma_completion_t),
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_btl_ofi_rdma_completion_t),
|
||||
0,
|
||||
0,
|
||||
512,
|
||||
-1,
|
||||
512,
|
||||
NULL,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
BTL_VERBOSE(("cannot allocate completion freelist"));
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (TWO_SIDED_ENABLED) {
|
||||
OBJ_CONSTRUCT(&context->frag_comp_list, opal_free_list_t);
|
||||
rc = opal_free_list_init(&context->frag_comp_list,
|
||||
sizeof(mca_btl_ofi_frag_completion_t),
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_btl_ofi_frag_completion_t),
|
||||
0,
|
||||
0,
|
||||
512,
|
||||
-1,
|
||||
512,
|
||||
NULL,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
BTL_VERBOSE(("cannot allocate completion freelist"));
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Initialize frag pool */
|
||||
OBJ_CONSTRUCT(&context->frag_list, opal_free_list_t);
|
||||
rc = opal_free_list_init(&context->frag_list,
|
||||
sizeof(mca_btl_ofi_base_frag_t) +
|
||||
MCA_BTL_OFI_FRAG_SIZE,
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_btl_ofi_base_frag_t),
|
||||
0,
|
||||
0,
|
||||
1024,
|
||||
-1,
|
||||
1024,
|
||||
NULL,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("failed to init frag pool (free_list)"));
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* mca_btl_ofi_context_alloc_normal()
|
||||
*
|
||||
* This function will allocate an ofi_context, map the endpoint to tx/rx context,
|
||||
* bind CQ,AV to the endpoint and initialize all the structure.
|
||||
* USE WITH NORMAL ENDPOINT ONLY */
|
||||
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
|
||||
struct fid_domain *domain,
|
||||
struct fid_ep *ep,
|
||||
struct fid_av *av)
|
||||
{
|
||||
int rc;
|
||||
uint32_t cq_flags = FI_TRANSMIT | FI_SEND | FI_RECV;
|
||||
char *linux_device_name = info->domain_attr->name;
|
||||
|
||||
struct fi_cq_attr cq_attr = {0};
|
||||
|
||||
mca_btl_ofi_context_t *context;
|
||||
|
||||
context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context));
|
||||
if (NULL == context) {
|
||||
BTL_VERBOSE(("cannot allocate context"));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Don't really need to check, just avoiding compiler warning because
|
||||
* BTL_VERBOSE is a no op in performance build and the compiler will
|
||||
* complain about unused variable. */
|
||||
if (NULL == linux_device_name) {
|
||||
BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
|
||||
goto single_fail;
|
||||
}
|
||||
|
||||
cq_attr.format = FI_CQ_FORMAT_CONTEXT;
|
||||
cq_attr.wait_obj = FI_WAIT_NONE;
|
||||
rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto single_fail;
|
||||
}
|
||||
|
||||
rc = fi_ep_bind(ep, (fid_t)av, 0);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto single_fail;
|
||||
}
|
||||
|
||||
rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto single_fail;
|
||||
}
|
||||
|
||||
rc = init_context_freelists(context);
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
goto single_fail;
|
||||
}
|
||||
|
||||
context->tx_ctx = ep;
|
||||
context->rx_ctx = ep;
|
||||
context->context_id = 0;
|
||||
|
||||
return context;
|
||||
|
||||
single_fail:
|
||||
mca_btl_ofi_context_finalize(context, false);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* mca_btl_ofi_context_alloc_scalable()
|
||||
*
|
||||
* This function allocate communication contexts and return the pointer
|
||||
* to the first btl context. It also take care of all the bindings needed.
|
||||
* USE WITH SCALABLE ENDPOINT ONLY */
|
||||
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
|
||||
struct fid_domain *domain,
|
||||
struct fid_ep *sep,
|
||||
struct fid_av *av,
|
||||
size_t num_contexts)
|
||||
{
|
||||
BTL_VERBOSE(("creating %zu contexts", num_contexts));
|
||||
|
||||
int rc;
|
||||
size_t i;
|
||||
char *linux_device_name = info->domain_attr->name;
|
||||
|
||||
struct fi_cq_attr cq_attr = {0};
|
||||
struct fi_tx_attr tx_attr = {0};
|
||||
struct fi_rx_attr rx_attr = {0};
|
||||
|
||||
mca_btl_ofi_context_t *contexts;
|
||||
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
|
||||
|
||||
contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
|
||||
if (NULL == contexts) {
|
||||
BTL_VERBOSE(("cannot allocate communication contexts."));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Don't really need to check, just avoiding compiler warning because
|
||||
* BTL_VERBOSE is a no op in performance build and the compiler will
|
||||
* complain about unused variable. */
|
||||
if (NULL == linux_device_name) {
|
||||
BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
/* bind AV to endpoint */
|
||||
rc = fi_scalable_ep_bind(sep, (fid_t)av, 0);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
for (i=0; i < num_contexts; i++) {
|
||||
rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_tx_context with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
/* We don't actually need a receiving context as we only do one-sided.
|
||||
* However, sockets provider will hang if we dont have one. It is
|
||||
* also nice to have equal number of tx/rx context. */
|
||||
rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_rx_context with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
/* create CQ */
|
||||
cq_attr.format = FI_CQ_FORMAT_CONTEXT;
|
||||
cq_attr.wait_obj = FI_WAIT_NONE;
|
||||
rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
/* bind cq to transmit context */
|
||||
rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, FI_TRANSMIT);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
/* bind cq to receiving context */
|
||||
if (TWO_SIDED_ENABLED) {
|
||||
rc = fi_ep_bind(contexts[i].rx_ctx, (fid_t)contexts[i].cq, FI_RECV);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto scalable_fail;
|
||||
}
|
||||
}
|
||||
|
||||
/* enable the context. */
|
||||
rc = fi_enable(contexts[i].tx_ctx);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_enable with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
rc = fi_enable(contexts[i].rx_ctx);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_enable with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
/* initialize freelists. */
|
||||
rc = init_context_freelists(&contexts[i]);
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
goto scalable_fail;
|
||||
}
|
||||
|
||||
/* assign the id */
|
||||
contexts[i].context_id = i;
|
||||
}
|
||||
|
||||
return contexts;
|
||||
|
||||
scalable_fail:
|
||||
/* close and free */
|
||||
for(i=0; i < num_contexts; i++) {
|
||||
mca_btl_ofi_context_finalize(&contexts[i], true);
|
||||
}
|
||||
free(contexts);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) {
|
||||
|
||||
/* if it is a scalable ep, we have to close all contexts. */
|
||||
if (scalable_ep) {
|
||||
if (NULL != context->tx_ctx) {
|
||||
fi_close(&context->tx_ctx->fid);
|
||||
}
|
||||
|
||||
if (NULL != context->rx_ctx) {
|
||||
fi_close(&context->rx_ctx->fid);
|
||||
}
|
||||
}
|
||||
|
||||
if( NULL != context->cq) {
|
||||
fi_close(&context->cq->fid);
|
||||
}
|
||||
|
||||
/* Can we destruct the object that hasn't been constructed? */
|
||||
OBJ_DESTRUCT(&context->rdma_comp_list);
|
||||
|
||||
if (TWO_SIDED_ENABLED) {
|
||||
OBJ_DESTRUCT(&context->frag_comp_list);
|
||||
OBJ_DESTRUCT(&context->frag_list);
|
||||
}
|
||||
}
|
||||
|
||||
/* Get a context to use for communication.
|
||||
* If TLS is supported, it will use the cached endpoint.
|
||||
* If not, it will invoke the normal round-robin assignment. */
|
||||
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
|
||||
{
|
||||
#if OPAL_HAVE_THREAD_LOCAL
|
||||
/* With TLS, we cache the context we use. */
|
||||
static volatile int64_t cur_num = 0;
|
||||
|
||||
if (OPAL_UNLIKELY(my_context == NULL)) {
|
||||
OPAL_THREAD_LOCK(&btl->module_lock);
|
||||
|
||||
my_context = &btl->contexts[cur_num];
|
||||
cur_num = (cur_num + 1) %btl->num_contexts;
|
||||
|
||||
OPAL_THREAD_UNLOCK(&btl->module_lock);
|
||||
}
|
||||
|
||||
assert (my_context);
|
||||
return my_context;
|
||||
#else
|
||||
return get_ofi_context_rr(btl);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* return the context in a round-robin. */
|
||||
/* There is no need for atomics here as it might hurt the performance. */
|
||||
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
|
||||
{
|
||||
static volatile uint64_t rr_num = 0;
|
||||
return &btl->contexts[rr_num++%btl->num_contexts];
|
||||
}
|
||||
|
||||
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) {
|
||||
|
||||
int ret = 0;
|
||||
int events_read;
|
||||
int events = 0;
|
||||
struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE];
|
||||
struct fi_cq_err_entry cqerr = {0};
|
||||
|
||||
mca_btl_ofi_completion_context_t *c_ctx;
|
||||
mca_btl_ofi_base_completion_t *comp;
|
||||
mca_btl_ofi_rdma_completion_t *rdma_comp;
|
||||
mca_btl_ofi_frag_completion_t *frag_comp;
|
||||
|
||||
ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
|
||||
|
||||
if (0 < ret) {
|
||||
events_read = ret;
|
||||
for (int i = 0; i < events_read; i++) {
|
||||
if (NULL != cq_entry[i].op_context) {
|
||||
++events;
|
||||
|
||||
c_ctx = (mca_btl_ofi_completion_context_t*) cq_entry[i].op_context;
|
||||
|
||||
/* We are casting to every type here just for simplicity. */
|
||||
comp = (mca_btl_ofi_base_completion_t*) c_ctx->comp;
|
||||
frag_comp = (mca_btl_ofi_frag_completion_t*) c_ctx->comp;
|
||||
rdma_comp = (mca_btl_ofi_rdma_completion_t*) c_ctx->comp;
|
||||
|
||||
switch (comp->type) {
|
||||
case MCA_BTL_OFI_TYPE_GET:
|
||||
case MCA_BTL_OFI_TYPE_PUT:
|
||||
case MCA_BTL_OFI_TYPE_AOP:
|
||||
case MCA_BTL_OFI_TYPE_AFOP:
|
||||
case MCA_BTL_OFI_TYPE_CSWAP:
|
||||
/* call the callback */
|
||||
if (rdma_comp->cbfunc) {
|
||||
rdma_comp->cbfunc (comp->btl, comp->endpoint,
|
||||
rdma_comp->local_address, rdma_comp->local_handle,
|
||||
rdma_comp->cbcontext, rdma_comp->cbdata, OPAL_SUCCESS);
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t*) comp->btl);
|
||||
break;
|
||||
|
||||
case MCA_BTL_OFI_TYPE_RECV:
|
||||
mca_btl_ofi_recv_frag((mca_btl_ofi_module_t*) comp->btl,
|
||||
(mca_btl_ofi_endpoint_t*) comp->endpoint,
|
||||
context, frag_comp->frag);
|
||||
break;
|
||||
|
||||
case MCA_BTL_OFI_TYPE_SEND:
|
||||
MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t*) comp->btl);
|
||||
mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS);
|
||||
break;
|
||||
|
||||
default:
|
||||
/* catasthrophic */
|
||||
BTL_ERROR(("unknown completion type"));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
/* return the completion handler */
|
||||
opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
|
||||
}
|
||||
}
|
||||
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
|
||||
ret = fi_cq_readerr(context->cq, &cqerr, 0);
|
||||
|
||||
/* cq readerr failed!? */
|
||||
if (0 > ret) {
|
||||
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
|
||||
__FILE__, __LINE__, fi_strerror(-ret), ret));
|
||||
} else {
|
||||
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
|
||||
cqerr.prov_errno));
|
||||
}
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
#ifdef FI_EINTR
|
||||
/* sometimes, sockets provider complain about interupt. We do nothing. */
|
||||
else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {
|
||||
|
||||
}
|
||||
#endif
|
||||
/* If the error is not FI_EAGAIN, report the error and abort. */
|
||||
else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
|
||||
BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
return events;
|
||||
}
|
||||
|
||||
|
50
opal/mca/btl/ofi/btl_ofi_endpoint.c
Обычный файл
50
opal/mca/btl/ofi/btl_ofi_endpoint.c
Обычный файл
@ -0,0 +1,50 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint)
|
||||
{
|
||||
endpoint->peer_addr = 0;
|
||||
OBJ_CONSTRUCT(&endpoint->ep_lock, opal_mutex_t);
|
||||
}
|
||||
|
||||
static void mca_btl_ofi_endpoint_destruct (mca_btl_ofi_endpoint_t *endpoint)
|
||||
{
|
||||
endpoint->peer_addr = 0;
|
||||
|
||||
/* set to null, we will free ofi endpoint in module */
|
||||
endpoint->ofi_endpoint = NULL;
|
||||
|
||||
OBJ_DESTRUCT(&endpoint->ep_lock);
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ofi_endpoint_t, opal_list_item_t,
|
||||
mca_btl_ofi_endpoint_construct,
|
||||
mca_btl_ofi_endpoint_destruct);
|
||||
|
||||
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep)
|
||||
{
|
||||
mca_btl_ofi_endpoint_t *endpoint = OBJ_NEW(mca_btl_ofi_endpoint_t);
|
||||
|
||||
if (OPAL_UNLIKELY(NULL == endpoint)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
endpoint->ep_proc = proc;
|
||||
endpoint->ofi_endpoint = ep;
|
||||
|
||||
return (mca_btl_base_endpoint_t *) endpoint;
|
||||
}
|
75
opal/mca/btl/ofi/btl_ofi_endpoint.h
Обычный файл
75
opal/mca/btl/ofi/btl_ofi_endpoint.h
Обычный файл
@ -0,0 +1,75 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BTL_OFI_ENDPOINT_H
|
||||
#define MCA_BTL_OFI_ENDPOINT_H
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "btl_ofi.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
#if OPAL_HAVE_THREAD_LOCAL
|
||||
extern opal_thread_local mca_btl_ofi_context_t *my_context;
|
||||
#endif /* OPAL_HAVE_THREAD_LOCAL */
|
||||
|
||||
struct mca_btl_base_endpoint_t {
|
||||
opal_list_item_t super;
|
||||
|
||||
struct fid_ep *ofi_endpoint;
|
||||
fi_addr_t peer_addr;
|
||||
|
||||
/** endpoint proc */
|
||||
opal_proc_t *ep_proc;
|
||||
|
||||
/** mutex to protect this structure */
|
||||
opal_mutex_t ep_lock;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||
typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t);
|
||||
|
||||
int init_context_freelists(mca_btl_ofi_context_t *context);
|
||||
|
||||
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep);
|
||||
|
||||
/* contexts */
|
||||
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
|
||||
struct fid_domain *domain,
|
||||
struct fid_ep *sep,
|
||||
struct fid_av *av,
|
||||
size_t num_contexts);
|
||||
|
||||
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
|
||||
struct fid_domain *domain,
|
||||
struct fid_ep *ep,
|
||||
struct fid_av *av);
|
||||
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep);
|
||||
|
||||
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl);
|
||||
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
198
opal/mca/btl/ofi/btl_ofi_frag.c
Обычный файл
198
opal/mca/btl/ofi/btl_ofi_frag.c
Обычный файл
@ -0,0 +1,198 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* $COPYRIGHT$
|
||||
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_frag.h"
|
||||
#include "btl_ofi_rdma.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
|
||||
static void mca_btl_ofi_base_frag_constructor (mca_btl_ofi_base_frag_t *frag)
|
||||
{
|
||||
/* zero everything out */
|
||||
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
|
||||
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
}
|
||||
|
||||
static void mca_btl_ofi_base_frag_destructor (mca_btl_ofi_base_frag_t *frag)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ofi_base_frag_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_ofi_base_frag_constructor,
|
||||
mca_btl_ofi_base_frag_destructor);
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ofi_frag_completion_t,
|
||||
opal_free_list_item_t,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
mca_btl_ofi_frag_completion_t *mca_btl_ofi_frag_completion_alloc
|
||||
(mca_btl_base_module_t *btl,
|
||||
mca_btl_ofi_context_t *context,
|
||||
mca_btl_ofi_base_frag_t *frag,
|
||||
int type)
|
||||
{
|
||||
mca_btl_ofi_frag_completion_t *comp;
|
||||
|
||||
comp = (mca_btl_ofi_frag_completion_t*) opal_free_list_get(&context->frag_comp_list);
|
||||
comp->base.btl = btl;
|
||||
comp->base.my_context = context;
|
||||
comp->base.my_list = &context->frag_comp_list;
|
||||
comp->base.type = type;
|
||||
|
||||
comp->frag = frag;
|
||||
comp->comp_ctx.comp = comp;
|
||||
|
||||
return comp;
|
||||
}
|
||||
|
||||
|
||||
mca_btl_base_descriptor_t *mca_btl_ofi_alloc(
|
||||
mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t order, size_t size, uint32_t flags)
|
||||
{
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*) btl;
|
||||
mca_btl_ofi_base_frag_t *frag = NULL;
|
||||
mca_btl_ofi_context_t *context = get_ofi_context(ofi_btl);
|
||||
|
||||
frag = mca_btl_ofi_frag_alloc(ofi_btl, &context->frag_list, endpoint);
|
||||
|
||||
if (OPAL_LIKELY(frag)) {
|
||||
frag->segments[0].seg_addr.pval = frag + 1;
|
||||
frag->segments[0].seg_len = size;
|
||||
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_segments = &frag->segments[0];
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = order;
|
||||
frag->hdr.len = size;
|
||||
}
|
||||
|
||||
return (mca_btl_base_descriptor_t*) frag;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
|
||||
{
|
||||
/* return the frag to the free list. */
|
||||
mca_btl_ofi_frag_return ((mca_btl_ofi_base_frag_t*) des);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_send (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_base_descriptor_t *descriptor,
|
||||
mca_btl_base_tag_t tag)
|
||||
{
|
||||
int rc = 0;
|
||||
mca_btl_ofi_context_t *context;
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*) btl;
|
||||
mca_btl_ofi_endpoint_t *ofi_ep = (mca_btl_ofi_endpoint_t*) endpoint;
|
||||
mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t*) descriptor;
|
||||
mca_btl_ofi_frag_completion_t *comp;
|
||||
|
||||
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
/* This tag is the active message tag for the remote side */
|
||||
frag->hdr.tag = tag;
|
||||
|
||||
/* create completion context */
|
||||
context = get_ofi_context(ofi_btl);
|
||||
comp = mca_btl_ofi_frag_completion_alloc(btl, context, frag,
|
||||
MCA_BTL_OFI_TYPE_SEND);
|
||||
|
||||
/* send the frag. Note that we start sending from BTL header + payload
|
||||
* because we need the other side to have this header information. */
|
||||
rc = fi_send(context->tx_ctx,
|
||||
&frag->hdr,
|
||||
sizeof(mca_btl_ofi_header_t) + frag->hdr.len,
|
||||
NULL,
|
||||
ofi_ep->peer_addr,
|
||||
&comp->comp_ctx);
|
||||
|
||||
if (OPAL_UNLIKELY(FI_SUCCESS != rc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_NUM_SEND_INC(ofi_btl);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_recv_frag (mca_btl_ofi_module_t *ofi_btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_ofi_context_t *context,
|
||||
mca_btl_ofi_base_frag_t *frag)
|
||||
{
|
||||
int rc;
|
||||
mca_btl_active_message_callback_t *reg;
|
||||
|
||||
/* Tell PML where the payload is */
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->segments[0].seg_addr.pval = frag+1;
|
||||
frag->segments[0].seg_len = frag->hdr.len;
|
||||
frag->base.des_segment_count = 1;
|
||||
|
||||
/* call the callback */
|
||||
reg = mca_btl_base_active_message_trigger + frag->hdr.tag;
|
||||
reg->cbfunc (&ofi_btl->super, frag->hdr.tag, &frag->base, reg->cbdata);
|
||||
mca_btl_ofi_frag_complete(frag, OPAL_SUCCESS);
|
||||
|
||||
/* repost the recv */
|
||||
rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t*) ofi_btl, context, 1);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
/* might not be that bad but let's just fail here. */
|
||||
BTL_ERROR(("failed reposting receive."));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src (
|
||||
mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
opal_convertor_t *convertor,
|
||||
uint8_t order, size_t reserve,
|
||||
size_t *size, uint32_t flags)
|
||||
{
|
||||
struct iovec iov;
|
||||
size_t length;
|
||||
uint32_t iov_count = 1;
|
||||
mca_btl_ofi_base_frag_t *frag;
|
||||
|
||||
/* allocate the frag with reserve. */
|
||||
frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(btl, endpoint,
|
||||
order, reserve, flags);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* pack the data after the reserve */
|
||||
iov.iov_len = *size;
|
||||
iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)(frag->segments[0].seg_addr.pval)) + reserve);
|
||||
opal_convertor_pack(convertor, &iov, &iov_count, &length);
|
||||
|
||||
/* pass on frag information */
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->segments[0].seg_len += length;
|
||||
frag->hdr.len += length;
|
||||
*size = length;
|
||||
|
||||
return &frag->base;
|
||||
}
|
95
opal/mca/btl/ofi/btl_ofi_frag.h
Обычный файл
95
opal/mca/btl/ofi/btl_ofi_frag.h
Обычный файл
@ -0,0 +1,95 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#if !defined(MCA_BTL_OFI_FRAG_H)
|
||||
#define MCA_BTL_OFI_FRAG_H
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
|
||||
|
||||
#define MCA_BTL_OFI_HDR_SIZE sizeof(mca_btl_ofi_header_t)
|
||||
#define MCA_BTL_OFI_FRAG_SIZE 4096
|
||||
#define MCA_BTL_OFI_RECV_SIZE MCA_BTL_OFI_FRAG_SIZE + MCA_BTL_OFI_HDR_SIZE
|
||||
|
||||
#define MCA_BTL_OFI_NUM_SEND_INC(module) \
|
||||
OPAL_ATOMIC_ADD_FETCH64(&(module)->outstanding_send, 1); \
|
||||
if (module->outstanding_send > mca_btl_ofi_component.progress_threshold) { \
|
||||
mca_btl_ofi_component.super.btl_progress(); \
|
||||
}
|
||||
|
||||
#define MCA_BTL_OFI_NUM_SEND_DEC(module) \
|
||||
OPAL_ATOMIC_ADD_FETCH64(&(module)->outstanding_send, -1);
|
||||
|
||||
mca_btl_base_descriptor_t *mca_btl_ofi_alloc(
|
||||
mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t order, size_t size, uint32_t flags);
|
||||
|
||||
int mca_btl_ofi_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des);
|
||||
|
||||
int mca_btl_ofi_send (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_base_descriptor_t *descriptor,
|
||||
mca_btl_base_tag_t tag);
|
||||
|
||||
int mca_btl_ofi_recv_frag (mca_btl_ofi_module_t *ofi_btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_ofi_context_t *context,
|
||||
mca_btl_ofi_base_frag_t *frag);
|
||||
|
||||
struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src (
|
||||
mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
opal_convertor_t *convertor,
|
||||
uint8_t order, size_t reserve,
|
||||
size_t *size, uint32_t flags);
|
||||
|
||||
mca_btl_ofi_frag_completion_t *mca_btl_ofi_frag_completion_alloc
|
||||
(mca_btl_base_module_t *btl,
|
||||
mca_btl_ofi_context_t *context,
|
||||
mca_btl_ofi_base_frag_t *frag,
|
||||
int type);
|
||||
|
||||
static inline mca_btl_ofi_base_frag_t *mca_btl_ofi_frag_alloc (mca_btl_ofi_module_t *ofi_btl, opal_free_list_t *fl,
|
||||
mca_btl_base_endpoint_t *endpoint)
|
||||
{
|
||||
mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t *) opal_free_list_get (fl);
|
||||
|
||||
if (OPAL_LIKELY(NULL != frag)) {
|
||||
frag->free_list = fl;
|
||||
frag->endpoint = endpoint;
|
||||
frag->btl = ofi_btl;
|
||||
}
|
||||
|
||||
return frag;
|
||||
}
|
||||
|
||||
static inline void mca_btl_ofi_frag_return (mca_btl_ofi_base_frag_t *frag)
|
||||
{
|
||||
opal_free_list_return (frag->free_list, &frag->base.super);
|
||||
}
|
||||
|
||||
static inline void mca_btl_ofi_frag_complete (mca_btl_ofi_base_frag_t *frag, int rc) {
|
||||
mca_btl_ofi_module_t *ofi_btl = frag->btl;
|
||||
|
||||
/* call the local callback if specified */
|
||||
if (frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
||||
frag->base.des_cbfunc(&ofi_btl->super, frag->endpoint, &frag->base, rc);
|
||||
}
|
||||
|
||||
/* If the BTL has ownership, return it to the free list, */
|
||||
if (OPAL_LIKELY(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
|
||||
mca_btl_ofi_frag_return (frag);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* !defined(MCA_BTL_OFI_FRAG_H) */
|
447
opal/mca/btl/ofi/btl_ofi_module.c
Обычный файл
447
opal/mca/btl/ofi/btl_ofi_module.c
Обычный файл
@ -0,0 +1,447 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "opal_config.h"
|
||||
#include <string.h>
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "opal/mca/mpool/mpool.h"
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
#include "btl_ofi_frag.h"
|
||||
|
||||
static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
|
||||
size_t nprocs, opal_proc_t **opal_procs,
|
||||
mca_btl_base_endpoint_t **peers,
|
||||
opal_bitmap_t *reachable)
|
||||
{
|
||||
int rc;
|
||||
int count;
|
||||
char *ep_name = NULL;
|
||||
size_t namelen = mca_btl_ofi_component.namelen;
|
||||
|
||||
opal_proc_t *proc;
|
||||
mca_btl_base_endpoint_t *ep;
|
||||
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
|
||||
for (size_t i = 0 ; i < nprocs ; ++i) {
|
||||
|
||||
proc = opal_procs[i];
|
||||
|
||||
/* See if we already have an endpoint for this proc. */
|
||||
rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void **) &ep);
|
||||
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name)));
|
||||
peers[i] = ep;
|
||||
|
||||
} else {
|
||||
/* We don't have this endpoint yet, create one */
|
||||
peers[i] = mca_btl_ofi_endpoint_create (proc, ofi_btl->ofi_endpoint);
|
||||
BTL_VERBOSE(("creating peer %p", peers[i]));
|
||||
|
||||
if (OPAL_UNLIKELY(NULL == peers[i])) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* Add this endpoint to the lookup table */
|
||||
(void) opal_hash_table_set_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void**) &ep);
|
||||
}
|
||||
|
||||
OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version,
|
||||
&peers[i]->ep_proc->proc_name, (void **)&ep_name, &namelen);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_ERROR(("error receiving modex"));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
/* get peer fi_addr */
|
||||
count = fi_av_insert(ofi_btl->av, /* Address vector to insert */
|
||||
ep_name, /* peer name */
|
||||
1, /* amount to insert */
|
||||
&peers[i]->peer_addr, /* return peer address here */
|
||||
0, /* flags */
|
||||
NULL); /* context */
|
||||
|
||||
/* if succeed, add this proc and mark reachable */
|
||||
if (count == 1) { /* we inserted 1 address. */
|
||||
opal_list_append (&ofi_btl->endpoints, &peers[i]->super);
|
||||
opal_bitmap_set_bit(reachable, i);
|
||||
} else {
|
||||
BTL_VERBOSE(("fi_av_insert failed with rc = %d", count));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs,
|
||||
opal_proc_t **procs, mca_btl_base_endpoint_t **peers)
|
||||
{
|
||||
int rc;
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_base_endpoint_t *ep;
|
||||
|
||||
for (size_t i = 0 ; i < nprocs ; ++i) {
|
||||
if (peers[i]) {
|
||||
rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i], (void **) &ep);
|
||||
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
/* remove the address from AV. */
|
||||
rc = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
|
||||
if (rc < 0) {
|
||||
/* remove failed. this should not happen. */
|
||||
/* Lets not crash because we failed to remove an address. */
|
||||
BTL_ERROR(("fi_av_remove failed with error %d:%s",
|
||||
rc, fi_strerror(-rc)));
|
||||
}
|
||||
|
||||
/* remove and free MPI endpoint from the list. */
|
||||
opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
|
||||
(void) opal_hash_table_remove_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i]);
|
||||
OBJ_RELEASE(peers[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module)
|
||||
{
|
||||
if (!module->initialized) {
|
||||
mca_rcache_base_resources_t rcache_resources;
|
||||
char *tmp;
|
||||
|
||||
(void) asprintf (&tmp, "ofi.%s", module->linux_device_name);
|
||||
|
||||
rcache_resources.cache_name = tmp;
|
||||
rcache_resources.reg_data = (void *) module;
|
||||
rcache_resources.sizeof_reg = sizeof (mca_btl_ofi_reg_t);
|
||||
rcache_resources.register_mem = mca_btl_ofi_reg_mem;
|
||||
rcache_resources.deregister_mem = mca_btl_ofi_dereg_mem;
|
||||
|
||||
module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources);
|
||||
free (tmp);
|
||||
|
||||
if (NULL == module->rcache) {
|
||||
/* something when horribly wrong */
|
||||
BTL_ERROR(("cannot create rcache"));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
module->initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Register a memory region for put/get/atomic operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
|
||||
* @param base (IN) Pointer to start of region
|
||||
* @param size (IN) Size of region
|
||||
* @param flags (IN) Flags indicating what operation will be performed. Valid
|
||||
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
|
||||
* and MCA_BTL_DES_FLAGS_ATOMIC
|
||||
*
|
||||
* @returns a memory registration handle valid for both local and remote operations
|
||||
* @returns NULL if the region could not be registered
|
||||
*
|
||||
* This function registers the specified region with the hardware for use with
|
||||
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
|
||||
* functions. Care should be taken to not hold an excessive number of registrations
|
||||
* as they may use limited system/NIC resources.
|
||||
*/
|
||||
static struct mca_btl_base_registration_handle_t *
|
||||
mca_btl_ofi_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags)
|
||||
{
|
||||
mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_reg_t *reg;
|
||||
int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
|
||||
int rc;
|
||||
|
||||
rc = ofi_module->rcache->rcache_register (ofi_module->rcache, base, size, 0, access_flags,
|
||||
(mca_rcache_base_registration_t **) ®);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ®->handle;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Deregister a memory region
|
||||
*
|
||||
* @param btl (IN) BTL module region was registered with
|
||||
* @param handle (IN) BTL registration handle to deregister
|
||||
*
|
||||
* This function deregisters the memory region associated with the specified handle. Care
|
||||
* should be taken to not perform any RDMA or atomic operation on this memory region
|
||||
* after it is deregistered. It is erroneous to specify a memory handle associated with
|
||||
* a remote node.
|
||||
*/
|
||||
static int mca_btl_ofi_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_reg_t *reg =
|
||||
(mca_btl_ofi_reg_t *)((intptr_t) handle - offsetof (mca_btl_ofi_reg_t, handle));
|
||||
|
||||
(void) ofi_module->rcache->rcache_deregister (ofi_module->rcache, ®->base);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg)
|
||||
{
|
||||
int rc;
|
||||
static uint64_t access_flags = FI_REMOTE_WRITE | FI_REMOTE_READ | FI_READ | FI_WRITE;
|
||||
|
||||
mca_btl_ofi_module_t *btl = (mca_btl_ofi_module_t*) reg_data;
|
||||
mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*) reg;
|
||||
|
||||
rc = fi_mr_reg(btl->domain, base, size, access_flags, 0,
|
||||
(uint64_t) reg, 0, &ur->ur_mr, NULL);
|
||||
if (0 != rc) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ur->handle.rkey = fi_mr_key(ur->ur_mr);
|
||||
ur->handle.desc = fi_mr_desc(ur->ur_mr);
|
||||
|
||||
/* In case the provider doesn't support FI_MR_VIRT_ADDR,
|
||||
* we need to reference the remote address by the distance from base registered
|
||||
* address. We keep this information to use in rdma/atomic operations. */
|
||||
if (btl->use_virt_addr) {
|
||||
ur->handle.base_addr = 0;
|
||||
} else {
|
||||
ur->handle.base_addr = base;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
|
||||
{
|
||||
mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*)reg;
|
||||
|
||||
if (ur->ur_mr != NULL) {
|
||||
if (0 != fi_close(&ur->ur_mr->fid)) {
|
||||
BTL_ERROR(("%s: error unpinning memory mr=%p: %s",
|
||||
__func__, (void*) ur->ur_mr, strerror(errno)));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Cleanup/release module resources.
|
||||
*/
|
||||
|
||||
int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
|
||||
{
|
||||
int i;
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_endpoint_t *endpoint, *next;
|
||||
|
||||
assert(btl);
|
||||
|
||||
/* clear the rcache */
|
||||
if (ofi_btl->rcache) {
|
||||
mca_rcache_base_module_destroy (ofi_btl->rcache);
|
||||
ofi_btl->rcache = NULL;
|
||||
}
|
||||
|
||||
/* Close basic ep before closing its attached resources. */
|
||||
if (NULL != ofi_btl->ofi_endpoint && !ofi_btl->is_scalable_ep) {
|
||||
fi_close(&ofi_btl->ofi_endpoint->fid);
|
||||
ofi_btl->ofi_endpoint = NULL;
|
||||
}
|
||||
|
||||
/* loop over all the contexts */
|
||||
for (i=0; i < ofi_btl->num_contexts; i++) {
|
||||
mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep);
|
||||
}
|
||||
free(ofi_btl->contexts);
|
||||
|
||||
if (NULL != ofi_btl->ofi_endpoint) {
|
||||
fi_close(&ofi_btl->ofi_endpoint->fid);
|
||||
}
|
||||
|
||||
/* close ep before closing av */
|
||||
if (NULL != ofi_btl->av) {
|
||||
fi_close(&ofi_btl->av->fid);
|
||||
}
|
||||
|
||||
if (NULL != ofi_btl->domain) {
|
||||
fi_close(&ofi_btl->domain->fid);
|
||||
}
|
||||
|
||||
if (NULL != ofi_btl->fabric) {
|
||||
fi_close(&ofi_btl->fabric->fid);
|
||||
}
|
||||
|
||||
if (NULL != ofi_btl->fabric_info) {
|
||||
fi_freeinfo(ofi_btl->fabric_info);
|
||||
}
|
||||
|
||||
/* clean up any leftover endpoints */
|
||||
OPAL_LIST_FOREACH_SAFE(endpoint, next, &ofi_btl->endpoints, mca_btl_ofi_endpoint_t) {
|
||||
opal_list_remove_item (&ofi_btl->endpoints, &endpoint->super);
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&ofi_btl->endpoints);
|
||||
OBJ_DESTRUCT(&ofi_btl->id_to_endpoint);
|
||||
OBJ_DESTRUCT(&ofi_btl->module_lock);
|
||||
|
||||
free (btl);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Post wildcard recvs on the rx context. */
|
||||
int mca_btl_ofi_post_recvs (mca_btl_base_module_t *module,
|
||||
mca_btl_ofi_context_t *context,
|
||||
int count)
|
||||
{
|
||||
int i;
|
||||
int rc;
|
||||
mca_btl_ofi_base_frag_t *frag;
|
||||
mca_btl_ofi_frag_completion_t *comp;
|
||||
|
||||
for (i=0; i < count; i++) {
|
||||
frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(module,
|
||||
NULL,
|
||||
0,
|
||||
MCA_BTL_OFI_FRAG_SIZE,
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (NULL == frag) {
|
||||
BTL_ERROR(("cannot allocate recv frag."));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
comp = mca_btl_ofi_frag_completion_alloc (module,
|
||||
context,
|
||||
frag,
|
||||
MCA_BTL_OFI_TYPE_RECV);
|
||||
|
||||
rc = fi_recv (context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE,
|
||||
NULL, FI_ADDR_UNSPEC, &comp->comp_ctx);
|
||||
|
||||
if (FI_SUCCESS != rc) {
|
||||
BTL_ERROR(("cannot post recvs"));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Allocate and fill out the module capabilities according to operation mode. */
|
||||
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode)
|
||||
{
|
||||
mca_btl_ofi_module_t *module;
|
||||
|
||||
/* allocate module */
|
||||
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
|
||||
if (NULL == module) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* fill in the defaults */
|
||||
*module = mca_btl_ofi_module_template;
|
||||
|
||||
if (mode == MCA_BTL_OFI_MODE_ONE_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
|
||||
|
||||
module->super.btl_put = mca_btl_ofi_put;
|
||||
module->super.btl_get = mca_btl_ofi_get;
|
||||
module->super.btl_atomic_op = mca_btl_ofi_aop;
|
||||
module->super.btl_atomic_fop = mca_btl_ofi_afop;
|
||||
module->super.btl_atomic_cswap = mca_btl_ofi_acswap;
|
||||
module->super.btl_flush = mca_btl_ofi_flush;
|
||||
|
||||
module->super.btl_register_mem = mca_btl_ofi_register_mem;
|
||||
module->super.btl_deregister_mem = mca_btl_ofi_deregister_mem;
|
||||
|
||||
module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS |
|
||||
MCA_BTL_FLAGS_ATOMIC_OPS |
|
||||
MCA_BTL_FLAGS_RDMA;
|
||||
|
||||
module->super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_SWAP |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_32BIT ;
|
||||
|
||||
module->super.btl_put_limit = 1 << 23;
|
||||
module->super.btl_put_alignment = 0;
|
||||
|
||||
module->super.btl_get_limit = 1 << 23;
|
||||
module->super.btl_get_alignment = 0;
|
||||
|
||||
module->super.btl_registration_handle_size =
|
||||
sizeof(mca_btl_base_registration_handle_t);
|
||||
}
|
||||
|
||||
if (mode == MCA_BTL_OFI_MODE_TWO_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
|
||||
|
||||
module->super.btl_alloc = mca_btl_ofi_alloc;
|
||||
module->super.btl_free = mca_btl_ofi_free;
|
||||
module->super.btl_prepare_src = mca_btl_ofi_prepare_src;
|
||||
|
||||
module->super.btl_send = mca_btl_ofi_send;
|
||||
|
||||
module->super.btl_flags |= MCA_BTL_FLAGS_SEND;
|
||||
module->super.btl_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
|
||||
module->super.btl_max_send_size = MCA_BTL_OFI_FRAG_SIZE;
|
||||
module->super.btl_rndv_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
|
||||
|
||||
/* If two sided is enabled, we expected that the user knows exactly what
|
||||
* they want. We bump the priority to maximum, making this BTL the default. */
|
||||
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
|
||||
}
|
||||
|
||||
if (mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
|
||||
module->super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
|
||||
module->super.btl_rdma_pipeline_send_length = 8 * 1024;
|
||||
}
|
||||
|
||||
return module;
|
||||
}
|
||||
|
||||
mca_btl_ofi_module_t mca_btl_ofi_module_template = {
|
||||
.super = {
|
||||
.btl_component = &mca_btl_ofi_component.super,
|
||||
.btl_add_procs = mca_btl_ofi_add_procs,
|
||||
.btl_del_procs = mca_btl_ofi_del_procs,
|
||||
.btl_finalize = mca_btl_ofi_finalize,
|
||||
}
|
||||
};
|
159
opal/mca/btl/ofi/btl_ofi_rdma.c
Обычный файл
159
opal/mca/btl/ofi/btl_ofi_rdma.c
Обычный файл
@ -0,0 +1,159 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ofi_rdma.h"
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ofi_rdma_completion_t,
|
||||
opal_free_list_item_t,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
mca_btl_ofi_rdma_completion_t *mca_btl_ofi_rdma_completion_alloc (
|
||||
mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_ofi_context_t *ofi_context,
|
||||
void *local_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata,
|
||||
int type)
|
||||
{
|
||||
assert(btl);
|
||||
assert(endpoint);
|
||||
assert(ofi_context);
|
||||
|
||||
mca_btl_ofi_rdma_completion_t *comp;
|
||||
|
||||
comp = (mca_btl_ofi_rdma_completion_t*) opal_free_list_get(&ofi_context->rdma_comp_list);
|
||||
assert(comp);
|
||||
|
||||
comp->base.btl = btl;
|
||||
comp->base.endpoint = endpoint;
|
||||
comp->base.my_context = ofi_context;
|
||||
comp->base.my_list = &ofi_context->rdma_comp_list;
|
||||
comp->base.type = type;
|
||||
|
||||
comp->local_address = local_address;
|
||||
comp->local_handle = local_handle;
|
||||
comp->cbfunc = cbfunc;
|
||||
comp->cbcontext = cbcontext;
|
||||
comp->cbdata = cbdata;
|
||||
|
||||
comp->comp_ctx.comp = comp;
|
||||
|
||||
return comp;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
|
||||
int rc;
|
||||
mca_btl_ofi_rdma_completion_t *comp;
|
||||
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
|
||||
mca_btl_ofi_context_t *ofi_context;
|
||||
|
||||
ofi_context = get_ofi_context(ofi_btl);
|
||||
|
||||
/* create completion context */
|
||||
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
|
||||
ofi_context,
|
||||
local_address,
|
||||
local_handle,
|
||||
cbfunc, cbcontext, cbdata,
|
||||
MCA_BTL_OFI_TYPE_GET);
|
||||
|
||||
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
|
||||
|
||||
/* Remote write data across the wire */
|
||||
rc = fi_read(ofi_context->tx_ctx,
|
||||
local_address, size, /* payload */
|
||||
local_handle->desc,
|
||||
btl_endpoint->peer_addr,
|
||||
remote_address, remote_handle->rkey,
|
||||
&comp->comp_ctx); /* completion context */
|
||||
|
||||
if (-FI_EAGAIN == rc) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("fi_read failed with %d:%s", rc, fi_strerror(-rc)));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc;
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
|
||||
mca_btl_ofi_context_t *ofi_context;
|
||||
|
||||
ofi_context = get_ofi_context(ofi_btl);
|
||||
|
||||
/* create completion context */
|
||||
mca_btl_ofi_rdma_completion_t *comp;
|
||||
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
|
||||
ofi_context,
|
||||
local_address,
|
||||
local_handle,
|
||||
cbfunc, cbcontext, cbdata,
|
||||
MCA_BTL_OFI_TYPE_PUT);
|
||||
|
||||
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
|
||||
|
||||
/* Remote write data across the wire */
|
||||
rc = fi_write(ofi_context->tx_ctx,
|
||||
local_address, size, /* payload */
|
||||
local_handle->desc,
|
||||
btl_endpoint->peer_addr,
|
||||
remote_address, remote_handle->rkey,
|
||||
&comp->comp_ctx); /* completion context */
|
||||
|
||||
if (-FI_EAGAIN == rc) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("fi_write failed with %d:%s", rc, fi_strerror(-rc)));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
int mca_btl_ofi_flush (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint)
|
||||
{
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
|
||||
while(ofi_btl->outstanding_rdma > 0) {
|
||||
(void) mca_btl_ofi_component.super.btl_progress();
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
42
opal/mca/btl/ofi/btl_ofi_rdma.h
Обычный файл
42
opal/mca/btl/ofi/btl_ofi_rdma.h
Обычный файл
@ -0,0 +1,42 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef BTL_OFI_RDMA_H
|
||||
#define BTL_OFI_RDMA_H
|
||||
|
||||
#include "opal/threads/thread_usage.h"
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
|
||||
mca_btl_ofi_rdma_completion_t *mca_btl_ofi_rdma_completion_alloc (
|
||||
mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_ofi_context_t *ofi_context,
|
||||
void *local_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata,
|
||||
int type);
|
||||
|
||||
#define MCA_BTL_OFI_NUM_RDMA_INC(module) \
|
||||
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1); \
|
||||
if (module->outstanding_rdma > mca_btl_ofi_component.progress_threshold){ \
|
||||
mca_btl_ofi_component.super.btl_progress(); \
|
||||
}
|
||||
|
||||
#define MCA_BTL_OFI_NUM_RDMA_DEC(module) \
|
||||
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, -1);
|
||||
|
||||
#endif /* !defined(BTL_OFI_RDMA_H) */
|
||||
|
52
opal/mca/btl/ofi/configure.m4
Обычный файл
52
opal/mca/btl/ofi/configure.m4
Обычный файл
@ -0,0 +1,52 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 QLogic Corp. All rights reserved.
|
||||
# Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved
|
||||
# Copyright (c) 2011-2018 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2018 Intel, inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# OPAL_CHECK_OFI(prefix, [action-if-found], [action-if-not-found])
|
||||
# --------------------------------------------------------
|
||||
# check if OFI support can be found. sets prefix_{CPPFLAGS,
|
||||
# LDFLAGS, LIBS} as needed and runs action-if-found if there is
|
||||
# support, otherwise executes action-if-not-found
|
||||
|
||||
AC_DEFUN([MCA_opal_btl_ofi_CONFIG],[
|
||||
OPAL_VAR_SCOPE_PUSH([opal_btl_ofi_happy CPPFLAGS_save])
|
||||
|
||||
AC_CONFIG_FILES([opal/mca/btl/ofi/Makefile])
|
||||
|
||||
# Check for OFI
|
||||
OPAL_CHECK_OFI
|
||||
|
||||
opal_btl_ofi_happy=0
|
||||
AS_IF([test "$opal_ofi_happy" = "yes"],
|
||||
[CPPFLAGS_save=$CPPFLAGS
|
||||
CPPFLAGS="$opal_ofi_CPPFLAGS $CPPFLAGS"
|
||||
AC_CHECK_DECL([FI_MR_VIRT_ADDR], [opal_btl_ofi_happy=1], [],
|
||||
[#include <rdma/fabric.h>])
|
||||
CPPFLAGS=$CPPFLAGS_save])
|
||||
AS_IF([test $opal_btl_ofi_happy -eq 1],
|
||||
[$1],
|
||||
[$2])
|
||||
|
||||
OPAL_VAR_SCOPE_POP
|
||||
])dnl
|
7
opal/mca/btl/ofi/owner.txt
Обычный файл
7
opal/mca/btl/ofi/owner.txt
Обычный файл
@ -0,0 +1,7 @@
|
||||
#
|
||||
# owner/status file
|
||||
# owner: institution that is responsible for this package
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner:Intel
|
||||
status:active
|
@ -12,7 +12,7 @@
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2010-2019 Cisco Systems, Inc. All rights reserved
|
||||
# Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved
|
||||
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
@ -100,25 +100,11 @@ AC_DEFUN([_OPAL_BTL_USNIC_DO_CONFIG],[
|
||||
OPAL_CHECK_OFI
|
||||
opal_btl_usnic_happy=$opal_ofi_happy])
|
||||
|
||||
# The usnic BTL requires at least OFI libfabric v1.1 (there was a
|
||||
# critical bug in libfabric v1.0).
|
||||
# The usnic BTL requires at least OFI libfabric v1.3.
|
||||
AS_IF([test "$opal_btl_usnic_happy" = "yes"],
|
||||
[AC_MSG_CHECKING([whether OFI libfabric is >= v1.1])
|
||||
opal_btl_usnic_CPPFLAGS_save=$CPPFLAGS
|
||||
CPPFLAGS="$opal_ofi_CPPFLAGS $CPPFLAGS"
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <rdma/fabric.h>]],
|
||||
[[
|
||||
#if !defined(FI_MAJOR_VERSION)
|
||||
#error your version of OFI libfabric is too old
|
||||
#elif FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION) < FI_VERSION(1, 1)
|
||||
#error your version of OFI libfabric is too old
|
||||
#endif
|
||||
]])],
|
||||
[opal_btl_usnic_happy=yes],
|
||||
[opal_btl_usnic_happy=no])
|
||||
AC_MSG_RESULT([$opal_btl_usnic_happy])
|
||||
CPPFLAGS=$opal_btl_usnic_CPPFLAGS_save
|
||||
])
|
||||
[OPAL_CHECK_OFI_VERSION_GE([1,3],
|
||||
[],
|
||||
[opal_btl_usnic_happy=no])])
|
||||
|
||||
# Make sure we can find the OFI libfabric usnic extensions header
|
||||
AS_IF([test "$opal_btl_usnic_happy" = "yes" ],
|
||||
|
106
opal/mca/common/ofi/Makefile.am
Обычный файл
106
opal/mca/common/ofi/Makefile.am
Обычный файл
@ -0,0 +1,106 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
|
||||
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2019 Hewlett Packard Enterprise. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# A word of explanation...
|
||||
#
|
||||
# This library is linked against various MCA components because the
|
||||
# support for ofis is needed in various places.
|
||||
#
|
||||
# Note that building this common component statically and linking
|
||||
# against other dynamic components is *not* supported!
|
||||
|
||||
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
|
||||
|
||||
# Header files
|
||||
|
||||
headers = \
|
||||
common_ofi.h
|
||||
|
||||
# Source files
|
||||
|
||||
sources = \
|
||||
common_ofi.c
|
||||
|
||||
# As per above, we'll either have an installable or noinst result.
|
||||
# The installable one should follow the same MCA prefix naming rules
|
||||
# (i.e., libmca_<type>_<name>.la). The noinst one can be named
|
||||
# whatever it wants, although libmca_<type>_<name>_noinst.la is
|
||||
# recommended.
|
||||
|
||||
# To simplify components that link to this library, we will *always*
|
||||
# have an output libtool library named libmca_<type>_<name>.la -- even
|
||||
# for case 2) described above (i.e., so there's no conditional logic
|
||||
# necessary in component Makefile.am's that link to this library).
|
||||
# Hence, if we're creating a noinst version of this library (i.e.,
|
||||
# case 2), we sym link it to the libmca_<type>_<name>.la name
|
||||
# (libtool will do the Right Things under the covers). See the
|
||||
# all-local and clean-local rules, below, for how this is effected.
|
||||
|
||||
lib_LTLIBRARIES =
|
||||
noinst_LTLIBRARIES =
|
||||
comp_inst = lib@OPAL_LIB_PREFIX@mca_common_ofi.la
|
||||
comp_noinst = lib@OPAL_LIB_PREFIX@mca_common_ofi_noinst.la
|
||||
|
||||
|
||||
if MCA_BUILD_opal_common_ofi_DSO
|
||||
lib_LTLIBRARIES += $(comp_inst)
|
||||
else
|
||||
noinst_LTLIBRARIES += $(comp_noinst)
|
||||
endif
|
||||
|
||||
lib@OPAL_LIB_PREFIX@mca_common_ofi_la_SOURCES = $(headers) $(sources)
|
||||
lib@OPAL_LIB_PREFIX@mca_common_ofi_la_LDFLAGS = \
|
||||
$(opal_ofi_LDFLAGS) \
|
||||
-version-info $(libmca_opal_common_ofi_so_version)
|
||||
lib@OPAL_LIB_PREFIX@mca_common_ofi_la_LIBADD = $(opal_ofi_LIBS)
|
||||
|
||||
lib@OPAL_LIB_PREFIX@mca_common_ofi_noinst_la_SOURCES = $(headers) $(sources)
|
||||
lib@OPAL_LIB_PREFIX@mca_common_ofi_noinst_la_LDFLAGS = $(opal_ofi_LDFLAGS)
|
||||
lib@OPAL_LIB_PREFIX@mca_common_ofi_noinst_la_LIBADD = $(opal_ofi_LIBS)
|
||||
|
||||
# Conditionally install the header files
|
||||
|
||||
if WANT_INSTALL_HEADERS
|
||||
opaldir = $(opalincludedir)/$(subdir)
|
||||
opal_HEADERS = $(headers)
|
||||
endif
|
||||
|
||||
# These two rules will sym link the "noinst" libtool library filename
|
||||
# to the installable libtool library filename in the case where we are
|
||||
# compiling this component statically (case 2), described above).
|
||||
|
||||
V=0
|
||||
OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V)
|
||||
ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY)
|
||||
ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`;
|
||||
|
||||
all-local:
|
||||
$(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \
|
||||
rm -f "$(comp_inst)"; \
|
||||
$(LN_S) "$(comp_noinst)" "$(comp_inst)"; \
|
||||
fi
|
||||
|
||||
clean-local:
|
||||
if test -z "$(lib_LTLIBRARIES)"; then \
|
||||
rm -f "$(comp_inst)"; \
|
||||
fi
|
311
opal/mca/common/ofi/common_ofi.c
Обычный файл
311
opal/mca/common/ofi/common_ofi.c
Обычный файл
@ -0,0 +1,311 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "opal_config.h"
|
||||
#include "opal/constants.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common_ofi.h"
|
||||
|
||||
int mca_common_ofi_register_mca_variables(void)
|
||||
{
|
||||
if (fi_version() >= FI_VERSION(1,0)) {
|
||||
return OPAL_SUCCESS;
|
||||
} else {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* check that the tx attributes match */
|
||||
static int
|
||||
check_tx_attr(struct fi_tx_attr *provider_info,
|
||||
struct fi_tx_attr *provider)
|
||||
{
|
||||
if (!(provider->msg_order & ~(provider_info->msg_order)) &&
|
||||
!(provider->op_flags & ~(provider_info->op_flags)) &&
|
||||
(provider->inject_size == provider_info->inject_size)) {
|
||||
return 0;
|
||||
} else {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* check that the rx attributes match */
|
||||
static int
|
||||
check_rx_attr(struct fi_rx_attr *provider_info,
|
||||
struct fi_rx_attr *provider)
|
||||
{
|
||||
if (!(provider->msg_order & ~(provider_info->msg_order)) &&
|
||||
!(provider->op_flags & ~(provider_info->op_flags))) {
|
||||
return 0;
|
||||
} else {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* check that the ep attributes match */
|
||||
static int
|
||||
check_ep_attr(struct fi_ep_attr *provider_info,
|
||||
struct fi_ep_attr *provider)
|
||||
{
|
||||
if (!(provider->type & ~(provider_info->type)) &&
|
||||
!(provider->mem_tag_format & ~(provider_info->mem_tag_format)) &&
|
||||
(provider->max_msg_size == provider_info->max_msg_size) &&
|
||||
(provider->tx_ctx_cnt == provider_info->tx_ctx_cnt) &&
|
||||
(provider->rx_ctx_cnt == provider_info->rx_ctx_cnt)) {
|
||||
return 0;
|
||||
} else {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* check that the provider attributes match */
|
||||
static int
|
||||
check_provider_attr(struct fi_info *provider_info,
|
||||
struct fi_info *provider)
|
||||
{
|
||||
/* make sure both info are the same provider and provide the same attributes */
|
||||
if (0 == strcmp(provider_info->fabric_attr->prov_name, provider->fabric_attr->prov_name) &&
|
||||
!check_tx_attr(provider_info->tx_attr, provider->tx_attr) &&
|
||||
!check_rx_attr(provider_info->rx_attr, provider->rx_attr) &&
|
||||
!check_ep_attr(provider_info->ep_attr, provider->ep_attr) &&
|
||||
!(provider_info->caps & ~(provider->caps)) &&
|
||||
!(provider_info->mode & ~(provider->mode))) {
|
||||
return 0;
|
||||
} else {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
#if OPAL_OFI_PCI_DATA_AVAILABLE
|
||||
/* Check if a process and a pci device share the same cpuset
|
||||
* @param (IN) pci struct fi_pci_attr pci device attributes,
|
||||
* used to find hwloc object for device.
|
||||
*
|
||||
* @param (IN) topology hwloc_topology_t topology to get the cpusets
|
||||
* from
|
||||
*
|
||||
* @param (OUT) returns true if cpusets match and false if
|
||||
* cpusets do not match or an error prevents comparison
|
||||
*
|
||||
* Uses a pci device to find an ancestor that contains a cpuset, and
|
||||
* determines if it intersects with the cpuset that the process is bound to.
|
||||
* if the process is not bound, or if a cpuset is unavailable for whatever
|
||||
* reason, returns false. Otherwise, returns the result of
|
||||
* hwloc_cpuset_intersects()
|
||||
*/
|
||||
static bool
|
||||
compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
|
||||
{
|
||||
bool result = false;
|
||||
int ret;
|
||||
hwloc_bitmap_t proc_cpuset;
|
||||
hwloc_obj_t obj = NULL;
|
||||
|
||||
/* Cannot find topology info if no topology is found */
|
||||
if (NULL == topology) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Allocate memory for proc_cpuset */
|
||||
proc_cpuset = hwloc_bitmap_alloc();
|
||||
if (NULL == proc_cpuset) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Fill cpuset with the collection of cpu cores that the process runs on */
|
||||
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
|
||||
if (0 > ret) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Get the pci device from bdf */
|
||||
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id,
|
||||
pci.device_id, pci.function_id);
|
||||
if (NULL == obj) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* pcidev objects don't have cpusets so find the first non-io object above */
|
||||
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
|
||||
if (NULL != obj) {
|
||||
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
|
||||
}
|
||||
|
||||
error:
|
||||
hwloc_bitmap_free(proc_cpuset);
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Count providers returns the number of providers present in an fi_info list
|
||||
* @param (IN) provider_list struct fi_info* list of providers available
|
||||
*
|
||||
* @param (OUT) int number of providers present in the list
|
||||
*
|
||||
* returns 0 if the list is NULL
|
||||
*/
|
||||
static int
|
||||
count_providers(struct fi_info* provider_list)
|
||||
{
|
||||
struct fi_info* dev = provider_list;
|
||||
int num_provider = 0;
|
||||
|
||||
while (NULL != dev) {
|
||||
num_provider++;
|
||||
dev = dev->next;
|
||||
}
|
||||
|
||||
return num_provider;
|
||||
}
|
||||
|
||||
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
|
||||
*
|
||||
* Initializes opal_hwloc_topology to access hardware topology if not previously
|
||||
* initialized
|
||||
*
|
||||
* There are 3 main cases that this covers:
|
||||
*
|
||||
* 1. If the first provider passed into this function is the only valid
|
||||
* provider, this provider is returned.
|
||||
*
|
||||
* 2. If there is more than 1 provider that matches the type of the first
|
||||
* provider in the list, and the BDF data
|
||||
* is available then a provider is selected based on locality of device
|
||||
* cpuset and process cpuset and tries to ensure that processes are distributed
|
||||
* evenly across NICs. This has two separate cases:
|
||||
*
|
||||
* i. There is one or more provider local to the process:
|
||||
*
|
||||
* (local rank % number of providers of the same type that share the process cpuset)
|
||||
* is used to select one of these providers.
|
||||
*
|
||||
* ii. There is no provider that is local to the process:
|
||||
*
|
||||
* (local rank % number of providers of the same type)
|
||||
* is used to select one of these providers
|
||||
*
|
||||
* 3. If there is more than 1 providers of the same type in the list, and the BDF data
|
||||
* is not available (the ofi version does not support fi_info.nic or the
|
||||
* provider does not support BDF) then (local rank % number of providers of the same type)
|
||||
* is used to select one of these providers
|
||||
*
|
||||
* @param provider_list (IN) struct fi_info* An initially selected
|
||||
* provider NIC. The provider name and
|
||||
* attributes are used to restrict NIC
|
||||
* selection. This provider is returned if the
|
||||
* NIC selection fails.
|
||||
*
|
||||
* @param local_index (IN) int The local rank of the process. Used to
|
||||
* select one valid NIC if there is a case
|
||||
* where more than one can be selected. This
|
||||
* could occur when more than one provider
|
||||
* shares the same cpuset as the process.
|
||||
*
|
||||
* @param provider (OUT) struct fi_info* object with the selected
|
||||
* provider if the selection succeeds
|
||||
* if the selection fails, returns the fi_info
|
||||
* object that was initially provided.
|
||||
*
|
||||
* All errors should be recoverable and will return the initially provided
|
||||
* provider. However, if an error occurs we can no longer guarantee
|
||||
* that the provider returned is local to the process or that the processes will
|
||||
* balance across available NICs.
|
||||
*/
|
||||
struct fi_info*
|
||||
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
|
||||
{
|
||||
struct fi_info *provider = provider_list, *current_provider = provider_list;
|
||||
struct fi_info **provider_table;
|
||||
#if OPAL_OFI_PCI_DATA_AVAILABLE
|
||||
struct fi_pci_attr pci;
|
||||
#endif
|
||||
int ret;
|
||||
unsigned int num_provider = 0, provider_limit = 0;
|
||||
bool provider_found = false, cpusets_match = false;
|
||||
|
||||
/* Initialize opal_hwloc_topology if it is not already */
|
||||
ret = opal_hwloc_base_get_topology();
|
||||
if (0 > ret) {
|
||||
/* Provider selection can continue but there is no guarantee of locality */
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d:Failed to initialize topology\n",
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
|
||||
provider_limit = count_providers(provider_list);
|
||||
|
||||
/* Allocate memory for provider table */
|
||||
provider_table = calloc(provider_limit, sizeof(struct fi_info*));
|
||||
if (NULL == provider_table) {
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"%s:%d:Failed to allocate memory for provider table\n",
|
||||
__FILE__, __LINE__);
|
||||
return provider_list;
|
||||
}
|
||||
|
||||
current_provider = provider;
|
||||
|
||||
/* Cycle through remaining fi_info objects, looking for alike providers */
|
||||
while (NULL != current_provider) {
|
||||
if (!check_provider_attr(provider, current_provider)) {
|
||||
cpusets_match = false;
|
||||
#if OPAL_OFI_PCI_DATA_AVAILABLE
|
||||
if (NULL != current_provider->nic) {
|
||||
pci = current_provider->nic->bus_attr->attr.pci;
|
||||
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Reset the list if the cpusets match and no other provider was
|
||||
* found on the same cpuset as the process.
|
||||
*/
|
||||
if (cpusets_match && !provider_found) {
|
||||
provider_found = true;
|
||||
num_provider = 0;
|
||||
}
|
||||
|
||||
/* Add the provider to the provider list if the cpusets match or if
|
||||
* no other provider was found on the same cpuset as the process.
|
||||
*/
|
||||
if (cpusets_match || !provider_found) {
|
||||
provider_table[num_provider] = current_provider;
|
||||
num_provider++;
|
||||
}
|
||||
}
|
||||
current_provider = current_provider->next;
|
||||
}
|
||||
|
||||
/* Select provider from local rank % number of providers */
|
||||
if (num_provider > 0) {
|
||||
provider = provider_table[local_index % num_provider];
|
||||
}
|
||||
|
||||
#if OPAL_OFI_PCI_DATA_AVAILABLE
|
||||
if (NULL != provider->nic) {
|
||||
pci = provider->nic->bus_attr->attr.pci;
|
||||
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if OPAL_DEBUG_ENABLE
|
||||
opal_output_verbose(1, opal_common_ofi.output,
|
||||
"local rank: %d device: %s cpusets match: %s\n",
|
||||
local_index, provider->domain_attr->name,
|
||||
cpusets_match ? "true" : "false");
|
||||
#endif
|
||||
|
||||
free(provider_table);
|
||||
return provider;
|
||||
}
|
21
opal/mca/common/ofi/common_ofi.h
Обычный файл
21
opal/mca/common/ofi/common_ofi.h
Обычный файл
@ -0,0 +1,21 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OPAL_MCA_COMMON_OFI_H
|
||||
#define OPAL_MCA_COMMON_OFI_H
|
||||
#include <rdma/fabric.h>
|
||||
|
||||
OPAL_DECLSPEC int mca_common_ofi_register_mca_variables(void);
|
||||
|
||||
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
|
||||
|
||||
#endif /* OPAL_MCA_COMMON_OFI_H */
|
35
opal/mca/common/ofi/configure.m4
Обычный файл
35
opal/mca/common/ofi/configure.m4
Обычный файл
@ -0,0 +1,35 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
|
||||
# Copyright (c) 2013 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2019 Hewlett Packard Enterprise. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AC_DEFUN([MCA_opal_common_ofi_CONFIG],[
|
||||
AC_CONFIG_FILES([opal/mca/common/ofi/Makefile])
|
||||
|
||||
OPAL_CHECK_OFI
|
||||
|
||||
# Note that $opal_common_ofi_happy is
|
||||
# used in other configure.m4's to know if ofi configured
|
||||
# successfully.
|
||||
AS_IF([test "$opal_ofi_happy" = "yes"],
|
||||
[opal_common_ofi_happy=yes
|
||||
common_ofi_WRAPPER_EXTRA_LDFLAGS=$opal_ofi_LDFLAGS
|
||||
common_ofi_WRAPPER_EXTRA_LIBS=$opal_ofi_LIBS
|
||||
$1],
|
||||
[opal_common_ofi_happy=no
|
||||
$2])
|
||||
|
||||
])dnl
|
7
opal/mca/common/ofi/owner.txt
Обычный файл
7
opal/mca/common/ofi/owner.txt
Обычный файл
@ -0,0 +1,7 @@
|
||||
#
|
||||
# owner/status file
|
||||
# owner: institution that is responsible for this package
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner: Hewlett Packard Enterprise
|
||||
status:active
|
Загрузка…
x
Ссылка в новой задаче
Block a user