
-> Added new targets in Makefile.am to call a new build script generate-opt-funcs.pl to generate specialized functions for each *.pm file. -> Added new perl module *.pm files for send,isend,irecv,iprobe,improbe which are loaded by generate-opt-funcs.pl to create new source files that correspond to the name of the .pm file to be used as part of MTL OFI. -> Added mtl_ofi_opt.pm.template and updated README with details on the specialization features and how to add additional specialization support. -> Added new opt_common/mtl_ofi_opt_common.pm containing common functions for generating the specialized functions used by all other *.pm modules. -> Added new mtl_ofi.h which includes the definitions for the function symbol table for storing the specialized functions along with the definitions for the initialization functions for the corresponding function pointers. -> Based off the OFI provider capabilities the specialized function pointers are assigned at mtl_ofi_component_init to the corresponding MTL OFI function. -> mca_mtl_ofi_module_t has been updated with the symbol table struct which is assigned at component init. Signed-off-by: Spruit, Neil R <neil.r.spruit@intel.com>
295 строки
13 KiB
Plaintext
295 строки
13 KiB
Plaintext
OFI MTL:
|
|
--------
|
|
The OFI MTL supports Libfabric (a.k.a. Open Fabrics Interfaces OFI,
|
|
https://ofiwg.github.io/libfabric/) tagged APIs (fi_tagged(3)). At
|
|
initialization time, the MTL queries libfabric for providers supporting tag matching
|
|
(fi_getinfo(3)). Libfabric will return a list of providers that satisfy the requested
|
|
capabilities, having the most performant one at the top of the list.
|
|
The user may modify the OFI provider selection with mca parameters
|
|
mtl_ofi_provider_include or mtl_ofi_provider_exclude.
|
|
|
|
PROGRESS:
|
|
---------
|
|
The MTL registers a progress function to opal_progress. There is currently
|
|
no support for asynchronous progress. The progress function reads multiple events
|
|
from the OFI provider Completion Queue (CQ) per iteration (defaults to 100, can be
|
|
modified with the mca parameter mtl_ofi_progress_event_cnt) and iterates until the
|
|
completion queue is drained.
|
|
|
|
COMPLETIONS:
|
|
------------
|
|
Each operation uses a request type ompi_mtl_ofi_request_t which includes a reference
|
|
to an operation specific completion callback, an MPI request, and a context. The
|
|
context (fi_context) is used to map completion events to MPI requests when reading the
|
|
CQ.
|
|
|
|
OFI TAG:
|
|
--------
|
|
MPI needs to send 96 bits of information per message (32 bits communicator id,
|
|
32 bits source rank, 32 bits MPI tag) but OFI only offers 64 bits tags. In
|
|
addition, the OFI MTL uses 2 bits of the OFI tag for the synchronous send protocol.
|
|
Therefore, there are only 62 bits available in the OFI tag for message usage. The
|
|
OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to address this:
|
|
|
|
"auto" (Default):
|
|
After the OFI provider is selected, a runtime check is performed to assess
|
|
FI_REMOTE_CQ_DATA and FI_DIRECTED_RECV support (see fi_tagged(3), fi_msg(3)
|
|
and fi_getinfo(3)). If supported, "ofi_tag_full" is used. If not supported,
|
|
fall back to "ofi_tag_1".
|
|
|
|
"ofi_tag_1":
|
|
For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will
|
|
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 62
|
|
bits available in the OFI tag. There are two options available with different
|
|
number of bits for the Communicator ID and MPI tag fields. This tag distribution
|
|
offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to
|
|
provider reserved bits (see mem_tag_format below), 18 bits for Source Rank (max
|
|
Source Rank 262,143), 32 bits for MPI tag (max MPI tag is INT_MAX).
|
|
|
|
"ofi_tag_2":
|
|
Same as "ofi_tag_1" but offering a different OFI tag distribution for
|
|
applications that may require a greater number of supported Communicators at the
|
|
expense of fewer MPI tag bits. This tag distribution offers: 24 bits for
|
|
Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 18
|
|
bits for Source Rank (max Source Rank 262,143), 20 bits for MPI tag (max MPI tag
|
|
524,287).
|
|
|
|
"ofi_tag_full":
|
|
For executions that cannot accept trimming source rank or MPI tag, this mode sends
|
|
source rank for each message in the CQ DATA. The Source Rank is made available at
|
|
the remote process CQ (FI_CQ_FORMAT_TAGGED is used, see fi_cq(3)) at the completion
|
|
of the matching receive operation. Since the minimum size for FI_REMOTE_CQ_DATA
|
|
is 32 bits, the Source Rank fits with no limitations. The OFI tag is used for the
|
|
Communicator id (28 bits, max Communicator ID 268,435,455. See mem_tag_format below),
|
|
and the MPI tag (max MPI tag is INT_MAX). If this mode is selected by the user
|
|
and FI_REMOTE_CQ_DATA or FI_DIRECTED_RECV are not supported, the execution will abort.
|
|
|
|
mem_tag_format (fi_endpoint(3))
|
|
Some providers can reserve the higher order bits from the OFI tag for internal purposes.
|
|
This is signaled in mem_tag_format (see fi_endpoint(3)) by setting higher order bits
|
|
to zero. In such cases, the OFI MTL will reduce the number of communicator ids supported
|
|
by reducing the bits available for the communicator ID field in the OFI tag.
|
|
|
|
SCALABLE ENDPOINTS:
|
|
-------------------
|
|
OFI MTL supports OFI Scalable Endpoints feature as a means to improve
|
|
multi-threaded application throughput and message rate. Currently the feature
|
|
is designed to utilize multiple TX/RX contexts exposed by the OFI provider in
|
|
conjunction with a multi-communicator MPI application model. Therefore, new OFI
|
|
contexts are created as and when communicators are duplicated in a lazy fashion
|
|
instead of creating them all at once during init time and this approach also
|
|
favours only creating as many contexts as needed.
|
|
|
|
1. Multi-communicator model:
|
|
With this approach, the application first duplicates the communicators it
|
|
wants to use with MPI operations (ideally creating as many communicators as
|
|
the number of threads it wants to use to call into MPI). The duplicated
|
|
communicators are then used by the corresponding threads to perform MPI
|
|
operations. A possible usage scenario could be in an MPI + OMP
|
|
application as follows (example limited to 2 ranks):
|
|
|
|
MPI_Comm dup_comm[n];
|
|
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
|
|
for (i = 0; i < n; i++) {
|
|
MPI_Comm_dup(MPI_COMM_WORLD, &dup_comm[i]);
|
|
}
|
|
if (rank == 0) {
|
|
#pragma omp parallel for private(status, host_sbuf, host_rbuf) num_threads(n)
|
|
for (i = 0; i < n ; i++) {
|
|
MPI_Send(host_sbuf, MYBUFSIZE, MPI_CHAR,
|
|
1, MSG_TAG, dup_comm[i]);
|
|
MPI_Recv(host_rbuf, MYBUFSIZE, MPI_CHAR,
|
|
1, MSG_TAG, dup_comm[i], &status);
|
|
}
|
|
} else if (rank == 1) {
|
|
#pragma omp parallel for private(status, host_sbuf, host_rbuf) num_threads(n)
|
|
for (i = 0; i < n ; i++) {
|
|
MPI_Recv(host_rbuf, MYBUFSIZE, MPI_CHAR,
|
|
0, MSG_TAG, dup_comm[i], &status);
|
|
MPI_Send(host_sbuf, MYBUFSIZE, MPI_CHAR,
|
|
0, MSG_TAG, dup_comm[i]);
|
|
}
|
|
}
|
|
|
|
2. MCA variable:
|
|
To utilize the feature, the following MCA variable needs to be set:
|
|
mtl_ofi_thread_grouping:
|
|
This MCA variable is at the OFI MTL level and needs to be set to switch
|
|
the feature on.
|
|
|
|
Default: 0
|
|
|
|
It is not recommended to set the MCA variable for:
|
|
- Multi-threaded MPI applications not following multi-communicator approach.
|
|
- Applications that have multiple threads using a single communicator as
|
|
it may degrade performance.
|
|
|
|
Command-line syntax to set the MCA variable:
|
|
"-mca mtl_ofi_thread_grouping 1"
|
|
|
|
3. Notes on performance:
|
|
- OFI MTL will create as many TX/RX contexts as allowed by an underlying
|
|
provider (each provider may have different thresholds). Once the threshold
|
|
is exceeded, contexts are used in a round-robin fashion which leads to
|
|
resource sharing among threads. Therefore locks are required to guard
|
|
against race conditions. For performance, it is recommended to have
|
|
|
|
Number of communicators = Number of contexts
|
|
|
|
For example, when using PSM2 provider, the number of contexts is dictated
|
|
by the Intel Omni-Path HFI1 driver module.
|
|
|
|
- For applications using a single thread with multiple communicators and MCA
|
|
variable "mtl_ofi_thread_grouping" set to 1, the MTL will use multiple
|
|
contexts, but the benefits may be negligible as only one thread is driving
|
|
progress.
|
|
|
|
SPECIALIZED FUNCTIONS:
|
|
----------------------
|
|
To improve performance when calling message passing APIs in the OFI MTL,
|
|
specialized functions are generated at compile time that eliminate all the
|
|
if conditionals that can be determined at init and don't need to be
|
|
queried again during the critical path. These functions are generated by
|
|
perl scripts during make which generate functions and symbols for every
|
|
combination of flags for each function.
|
|
|
|
1. ADDING NEW FLAGS FOR SPECIALIZATION OF EXISTING FUNCTION:
|
|
To add a new flag to an existing specialized function for handling cases
|
|
where different OFI providers may or may not support a particular feature,
|
|
you must follow these steps:
|
|
1) Update the "_generic" function in mtl_ofi.h with the new flag and
|
|
the if conditionals to read the new value.
|
|
2) Update the *.pm file corresponding to the function with the new flag in:
|
|
gen_funcs(), gen_*_function(), & gen_*_sym_init()
|
|
3) Update mtl_ofi_opt.h with:
|
|
The new flag as #define NEW_FLAG_TYPES #NUMBER_OF_STATES
|
|
example: #define OFI_CQ_DATA 2 (only has TRUE/FALSE states)
|
|
Update the function's types with:
|
|
#define OMPI_MTL_OFI_FUNCTION_TYPES [NEW_FLAG_TYPES]
|
|
|
|
2. ADDING A NEW FUNCTION FOR SPECIALIZATION:
|
|
To add a new function to be specialized you must
|
|
follow these steps:
|
|
1) Create a new mtl_ofi_"function_name"_opt.pm based off opt_common/mtl_ofi_opt.pm.template
|
|
2) Add new .pm file to generated_source_modules in Makefile.am
|
|
3) Add .c file to generated_sources in Makefile.am named the same as the corresponding .pm file
|
|
4) In mtl_ofi.h, convert the existing function to a "_generic" variant (or create a new one) that takes the new flags.
|
|
5) Update mtl_ofi_opt.h with:
|
|
a) New function types: #define OMPI_MTL_OFI_FUNCTION_TYPES [FLAG_TYPES]
|
|
b) Add new function to the struct ompi_mtl_ofi_symtable:
|
|
struct ompi_mtl_ofi_symtable {
|
|
...
|
|
int (*ompi_mtl_ofi_FUNCTION OMPI_MTL_OFI_FUNCTION_TYPES )
|
|
}
|
|
c) Add new symbol table init function definition:
|
|
void ompi_mtl_ofi_FUNCTION_symtable_init(struct ompi_mtl_ofi_symtable* sym_table);
|
|
6) Add calls to init the new function in the symbol table and assign the function
|
|
pointer to be used based off the flags in mtl_ofi_component.c:
|
|
ompi_mtl_ofi_FUNCTION_symtable_init(&ompi_mtl_ofi.sym_table);
|
|
ompi_mtl_ofi.base.mtl_FUNCTION =
|
|
ompi_mtl_ofi.sym_table.ompi_mtl_ofi_FUNCTION[ompi_mtl_ofi.flag];
|
|
|
|
3. EXAMPLE SPECIALIZED FILE:
|
|
The code below is an example of what is generated by the specialization
|
|
scripts for use in the OFI mtl. This code specializes the blocking
|
|
send functionality based on FI_REMOTE_CQ_DATA & OFI Scalable Endpoint support
|
|
provided by an OFI Provider. Only one function and symbol is used during
|
|
runtime based on if FI_REMOTE_CQ_DATA is supported and/or if OFI Scalable
|
|
Endpoint support is enabled.
|
|
/*
|
|
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
|
*
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "mtl_ofi.h"
|
|
|
|
__opal_attribute_always_inline__ static inline int
|
|
ompi_mtl_ofi_send_false_false(struct mca_mtl_base_module_t *mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int dest,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
mca_pml_base_send_mode_t mode)
|
|
{
|
|
const bool OFI_CQ_DATA = false;
|
|
const bool OFI_SCEP_EPS = false;
|
|
|
|
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
|
convertor, mode,
|
|
OFI_CQ_DATA, OFI_SCEP_EPS);
|
|
}
|
|
|
|
__opal_attribute_always_inline__ static inline int
|
|
ompi_mtl_ofi_send_false_true(struct mca_mtl_base_module_t *mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int dest,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
mca_pml_base_send_mode_t mode)
|
|
{
|
|
const bool OFI_CQ_DATA = false;
|
|
const bool OFI_SCEP_EPS = true;
|
|
|
|
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
|
convertor, mode,
|
|
OFI_CQ_DATA, OFI_SCEP_EPS);
|
|
}
|
|
|
|
__opal_attribute_always_inline__ static inline int
|
|
ompi_mtl_ofi_send_true_false(struct mca_mtl_base_module_t *mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int dest,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
mca_pml_base_send_mode_t mode)
|
|
{
|
|
const bool OFI_CQ_DATA = true;
|
|
const bool OFI_SCEP_EPS = false;
|
|
|
|
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
|
convertor, mode,
|
|
OFI_CQ_DATA, OFI_SCEP_EPS);
|
|
}
|
|
|
|
__opal_attribute_always_inline__ static inline int
|
|
ompi_mtl_ofi_send_true_true(struct mca_mtl_base_module_t *mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int dest,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
mca_pml_base_send_mode_t mode)
|
|
{
|
|
const bool OFI_CQ_DATA = true;
|
|
const bool OFI_SCEP_EPS = true;
|
|
|
|
return ompi_mtl_ofi_send_generic(mtl, comm, dest, tag,
|
|
convertor, mode,
|
|
OFI_CQ_DATA, OFI_SCEP_EPS);
|
|
}
|
|
|
|
void ompi_mtl_ofi_send_symtable_init(struct ompi_mtl_ofi_symtable* sym_table)
|
|
{
|
|
|
|
sym_table->ompi_mtl_ofi_send[false][false]
|
|
= ompi_mtl_ofi_send_false_false;
|
|
|
|
|
|
sym_table->ompi_mtl_ofi_send[false][true]
|
|
= ompi_mtl_ofi_send_false_true;
|
|
|
|
|
|
sym_table->ompi_mtl_ofi_send[true][false]
|
|
= ompi_mtl_ofi_send_true_false;
|
|
|
|
|
|
sym_table->ompi_mtl_ofi_send[true][true]
|
|
= ompi_mtl_ofi_send_true_true;
|
|
|
|
}
|
|
###
|