
Merge pull request #5004 from matcabral/mtl_ofi_remote_cq_data

MTL OFI: add support for FI_REMOTE_CQ_DATA.
Matias Cabral 2018-06-15 16:24:54 -07:00, committed by GitHub
parents 56c35d25c8 e6674556aa
commit 10516c1fb8
5 changed files with 503 additions and 111 deletions

ompi/mca/mtl/ofi/README (new file, 69 additions)

@@ -0,0 +1,69 @@
OFI MTL
The OFI MTL supports Libfabric (a.k.a. Open Fabrics Interfaces, OFI,
https://ofiwg.github.io/libfabric/) tagged APIs (fi_tagged(3)). At
initialization time, the MTL queries libfabric for providers supporting tag
matching (fi_getinfo(3)). Libfabric returns a list of providers that satisfy
the requested capabilities, with the most performant one at the top of the
list. The user may modify the OFI provider selection with the mca parameters
mtl_ofi_provider_include and mtl_ofi_provider_exclude.
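
As a rough illustration (this is not the MTL's initialization code; the
endpoint type and API version shown are assumptions), querying libfabric for
tag-matching providers looks roughly like this:

    #include <rdma/fabric.h>

    static struct fi_info *query_tagged_providers(void)
    {
        struct fi_info *hints = fi_allocinfo();
        struct fi_info *providers = NULL;

        hints->caps = FI_TAGGED;            /* tag-matching send/recv support */
        hints->ep_attr->type = FI_EP_RDM;   /* reliable, unconnected endpoints */

        /* Providers come back best-first; the include/exclude mca parameters
         * are then applied to this list. */
        if (0 != fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, hints, &providers)) {
            providers = NULL;
        }
        fi_freeinfo(hints);
        return providers;
    }
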
PROGRESS:
The MTL registers a progress function to opal_progress. There is currently
no support for asynchronous progress. The progress function reads multiple
events from the OFI provider Completion Queue (CQ) per iteration (the default
is 100 and can be changed with the mca parameter mtl_ofi_progress_event_cnt)
and iterates until the completion queue is drained.
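
A minimal sketch of that polling pattern (cq, events and handle_completion are
placeholder names, not the MTL's own symbols):

    #include <rdma/fabric.h>
    #include <rdma/fi_eq.h>

    /* Placeholder for the per-request completion dispatch. */
    static void handle_completion(struct fi_cq_tagged_entry *wc) { (void) wc; }

    /* Read up to 'count' tagged completions at a time until the CQ is drained. */
    static int poll_cq(struct fid_cq *cq,
                       struct fi_cq_tagged_entry *events, size_t count)
    {
        int handled = 0;
        ssize_t n;

        while ((n = fi_cq_read(cq, events, count)) > 0) {
            for (ssize_t i = 0; i < n; i++) {
                /* events[i].op_context points at the fi_context posted with
                 * the operation, which maps back to the owning request. */
                handle_completion(&events[i]);
                handled++;
            }
        }
        /* A return of -FI_EAGAIN simply means the queue is empty. */
        return handled;
    }
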
COMPLETIONS:
Each operation uses a request type, ompi_mtl_ofi_request_t, which includes a
reference to an operation-specific completion callback, an MPI request, and a
context. The context (fi_context) is used to map completion events to MPI
requests when reading the CQ.
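
A simplified illustration of that layout (this is not the actual
ompi_mtl_ofi_request_t definition):

    #include <rdma/fabric.h>
    #include <rdma/fi_eq.h>

    struct example_ofi_request {
        struct fi_context ctx;            /* handed to libfabric when posting      */
        int (*event_callback)(struct fi_cq_tagged_entry *wc,
                              struct example_ofi_request *req);
        void *mpi_request;                /* the MPI-level request being completed */
    };

    /* Recover the request from a CQ event and run its completion callback.
     * Because ctx is the first member, op_context can be cast directly. */
    static int example_dispatch(struct fi_cq_tagged_entry *wc)
    {
        struct example_ofi_request *req =
            (struct example_ofi_request *) wc->op_context;
        return req->event_callback(wc, req);
    }
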
OFI TAG:
MPI needs to send 96 bits of information per message (32 bits communicator id,
32 bits source rank, 32 bits MPI tag) but OFI only offers 64-bit tags. In
addition, the OFI MTL uses 4 bits of the OFI tag for the synchronous send
protocol. Therefore, only 60 bits are available in the OFI tag for message
usage. The OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to
address this:

"auto" (Default):
After the OFI provider is selected, a runtime check is performed to assess
FI_REMOTE_CQ_DATA and FI_DIRECTED_RECV support (see fi_tagged(3), fi_msg(3)
and fi_getinfo(3)). If supported, "ofi_tag_full" is used. If not supported,
the MTL falls back to "ofi_tag_1".
"ofi_tag_1":
For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 60
bits available bit in the OFI tag. There are two options available with different
number of bits for the Communicator ID and MPI tag fields. This tag distribution
offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to
provider reserved bits (see mem_tag_format below), 16 bits for Source Rank (max
Source Rank 65,535), 32 bits for MPI tag (max MPI tag is INT_MAX).
"ofi_tag_2":
Same as 2 "ofi_tag_1" but offering a different OFI tag distribution for
applications that may require a greater number of supported Communicators at the
expense of fewer MPI tag bits. This tag distribution offers: 24 bits for
Communicator ID (max Communicator ED 16,777,215. See mem_tag_format below), 16
bits for Source Rank (max Source Rank 65,535), 20 bits for MPI tag (max MPI tag
524,287).
"ofi_tag_full":
For executions that cannot accept trimming source rank or MPI tag, this mode sends
source rank for each message in the CQ DATA. The Source Rank is made available at
the remote process CQ (FI_CQ_FORMAT_TAGGED is used, see fi_cq(3)) at the completion
of the matching receive operation. Since the minimum size for FI_REMOTE_CQ_DATA
is 32 bits, the Source Rank fits with no limitations. The OFI tag is used for the
Communicator id (28 bits, max Communicator ID 268,435,455. See mem_tag_format below),
and the MPI tag (max MPI tag is INT_MAX). If this mode is selected by the user
and FI_REMOTE_CQ_DATA or FI_DIRECTED_RECV are not supported, the execution will abort.
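
A rough sketch of the "ofi_tag_full" data path (endpoint, peer address and
context setup are omitted; the helper names are placeholders):

    #include <rdma/fabric.h>
    #include <rdma/fi_endpoint.h>
    #include <rdma/fi_tagged.h>
    #include <rdma/fi_eq.h>

    /* Sender: fi_tsenddata() carries the sender's rank as FI_REMOTE_CQ_DATA. */
    static ssize_t send_with_rank(struct fid_ep *ep, const void *buf, size_t len,
                                  uint64_t my_rank, fi_addr_t peer,
                                  uint64_t tag, struct fi_context *ctx)
    {
        return fi_tsenddata(ep, buf, len, NULL /* desc */, my_rank, peer, tag, ctx);
    }

    /* Receiver: with FI_CQ_FORMAT_TAGGED completions, the rank is in wc->data. */
    static int source_rank_of(const struct fi_cq_tagged_entry *wc)
    {
        return (int) wc->data;
    }
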
mem_tag_format (fi_endpoint(3)):
Some providers reserve the higher-order bits of the OFI tag for internal
purposes. This is signaled in mem_tag_format (see fi_endpoint(3)) by setting
those higher-order bits to zero. In such cases, the OFI MTL reduces the number
of communicator ids supported by shrinking the bits available for the
communicator ID field in the OFI tag.
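
An illustrative sketch (not the MTL's exact accounting) of how the
provider-reserved bits can be derived from mem_tag_format:

    #include <stdint.h>

    /* Count the reserved high-order tag bits, signaled as leading zero bits
     * in the provider's mem_tag_format. */
    static int provider_reserved_high_bits(uint64_t mem_tag_format)
    {
        int reserved = 0;

        for (int bit = 63; bit >= 0; bit--) {
            if ((mem_tag_format >> bit) & 1ULL) {
                break;                  /* first bit the provider lets us use */
            }
            reserved++;
        }
        /* e.g. 4 reserved bits would shrink the 28-bit "ofi_tag_full"
         * communicator ID field to 24 usable bits. */
        return reserved;
    }
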


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
@@ -14,8 +14,8 @@ OMPI_DECLSPEC extern mca_mtl_ofi_component_t mca_mtl_ofi_component;
mca_mtl_ofi_module_t ompi_mtl_ofi = {
{
8191, /* max cid - 2^13 - 1 */
(1UL << 30), /* max tag value - must allow negatives */
(int)((1ULL << MTL_OFI_CID_BIT_COUNT_1) - 1), /* max cid */
(int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_1 - 1)) - 1) ,/* max tag value */
0, /* request reserve space */
0, /* flags */


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
*
@@ -244,6 +244,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
ompi_proc_t *ompi_proc = NULL;
mca_mtl_ofi_endpoint_t *endpoint = NULL;
ompi_mtl_ofi_request_t *ack_req = NULL; /* For synchronous send */
fi_addr_t src_addr = 0;
ompi_proc = ompi_comm_peer_lookup(comm, dest);
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
@@ -255,6 +256,15 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
ofi_req->length = length;
ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
if (ompi_mtl_ofi.fi_cq_data) {
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
src_addr = endpoint->peer_fiaddr;
} else {
match_bits = mtl_ofi_create_send_tag(comm->c_contextid,
comm->c_my_rank, tag);
/* src_addr is ignored when FI_DIRECTED_RECV is not supported */
}
if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_SYNCHRONOUS == mode)) {
ack_req = malloc(sizeof(ompi_mtl_ofi_request_t));
assert(ack_req);
@@ -263,14 +273,15 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
ack_req->error_callback = ompi_mtl_ofi_send_ack_error_callback;
ofi_req->completion_count = 2;
MTL_OFI_SET_SEND_BITS(match_bits, comm->c_contextid,
comm->c_my_rank, tag, MTL_OFI_SYNC_SEND);
MTL_OFI_SET_SYNC_SEND(match_bits);
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep,
NULL,
0,
NULL,
endpoint->peer_fiaddr,
match_bits | MTL_OFI_SYNC_SEND_ACK,
src_addr,
match_bits | ompi_mtl_ofi.sync_send_ack,
0, /* Exact match, no ignore bits */
(void *) &ack_req->ctx));
if (OPAL_UNLIKELY(0 > ret)) {
@@ -282,20 +293,30 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
}
} else {
ofi_req->completion_count = 1;
MTL_OFI_SET_SEND_BITS(match_bits, comm->c_contextid,
comm->c_my_rank, tag, 0);
}
if (ompi_mtl_ofi.max_inject_size >= length) {
MTL_OFI_RETRY_UNTIL_DONE(fi_tinject(ompi_mtl_ofi.ep,
if (ompi_mtl_ofi.fi_cq_data) {
MTL_OFI_RETRY_UNTIL_DONE(fi_tinjectdata(ompi_mtl_ofi.ep,
start,
length,
comm->c_my_rank,
endpoint->peer_fiaddr,
match_bits));
} else {
MTL_OFI_RETRY_UNTIL_DONE(fi_tinject(ompi_mtl_ofi.ep,
start,
length,
endpoint->peer_fiaddr,
match_bits));
}
if (OPAL_UNLIKELY(0 > ret)) {
char *fi_api = ompi_mtl_ofi.fi_cq_data ? "fi_tinjectddata" : "fi_tinject";
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_tinject failed: %s(%zd)",
__FILE__, __LINE__, fi_strerror(-ret), ret);
"%s:%d: %s failed: %s(%zd)",
__FILE__, __LINE__, fi_api, fi_strerror(-ret), ret);
if (ack_req) {
fi_cancel((fid_t)ompi_mtl_ofi.ep, &ack_req->ctx);
free(ack_req);
@@ -305,17 +326,29 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
ofi_req->event_callback(NULL,ofi_req);
} else {
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
if (ompi_mtl_ofi.fi_cq_data) {
MTL_OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ep,
start,
length,
NULL,
comm->c_my_rank,
endpoint->peer_fiaddr,
match_bits,
(void *) &ofi_req->ctx));
} else {
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
start,
length,
NULL,
endpoint->peer_fiaddr,
match_bits,
(void *) &ofi_req->ctx));
}
if (OPAL_UNLIKELY(0 > ret)) {
char *fi_api = ompi_mtl_ofi.fi_cq_data ? "fi_tsendddata" : "fi_send";
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_tsend failed: %s(%zd)",
__FILE__, __LINE__, fi_strerror(-ret), ret);
"%s:%d: %s failed: %s(%zd)",
__FILE__, __LINE__, fi_api, fi_strerror(-ret), ret);
return ompi_mtl_ofi_get_error(ret);
}
}
@@ -415,7 +448,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
ssize_t ret;
ompi_proc_t *ompi_proc = NULL;
mca_mtl_ofi_endpoint_t *endpoint = NULL;
int src;
int src = mtl_ofi_get_source(wc);
ompi_status_public_t *status = NULL;
assert(ofi_req->super.ompi_req);
@@ -427,7 +460,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
*/
ofi_req->req_started = true;
status->MPI_SOURCE = MTL_OFI_GET_SOURCE(wc->tag);
status->MPI_SOURCE = src;
status->MPI_TAG = MTL_OFI_GET_TAG(wc->tag);
status->_ucount = wc->len;
@@ -474,7 +507,6 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
* we need to extract the source's actual address.
*/
if (ompi_mtl_ofi.any_addr == ofi_req->remote_addr) {
src = MTL_OFI_GET_SOURCE(wc->tag);
ompi_proc = ompi_comm_peer_lookup(ofi_req->comm, src);
endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc);
ofi_req->remote_addr = endpoint->peer_fiaddr;
@@ -484,7 +516,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
0,
NULL,
ofi_req->remote_addr,
wc->tag | MTL_OFI_SYNC_SEND_ACK,
wc->tag | ompi_mtl_ofi.sync_send_ack,
(void *) &ofi_req->ctx));
if (OPAL_UNLIKELY(0 > ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
@@ -510,7 +542,7 @@ ompi_mtl_ofi_recv_error_callback(struct fi_cq_err_entry *error,
assert(ofi_req->super.ompi_req);
status = &ofi_req->super.ompi_req->req_status;
status->MPI_TAG = MTL_OFI_GET_TAG(ofi_req->match_bits);
status->MPI_SOURCE = MTL_OFI_GET_SOURCE(ofi_req->match_bits);
status->MPI_SOURCE = mtl_ofi_get_source((struct fi_cq_tagged_entry *) error);
switch (error->err) {
case FI_ETRUNC:
@@ -538,7 +570,7 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
int ompi_ret = OMPI_SUCCESS;
ssize_t ret;
uint64_t match_bits, mask_bits;
fi_addr_t remote_addr;
fi_addr_t remote_addr = ompi_mtl_ofi.any_addr;
ompi_proc_t *ompi_proc = NULL;
mca_mtl_ofi_endpoint_t *endpoint = NULL;
ompi_mtl_ofi_request_t *ofi_req = (ompi_mtl_ofi_request_t*) mtl_request;
@@ -546,15 +578,21 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
size_t length;
bool free_after;
if (MPI_ANY_SOURCE != src) {
ompi_proc = ompi_comm_peer_lookup(comm, src);
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
remote_addr = endpoint->peer_fiaddr;
} else {
remote_addr = ompi_mtl_ofi.any_addr;
}
MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, comm->c_contextid, src, tag);
if (ompi_mtl_ofi.fi_cq_data) {
if (MPI_ANY_SOURCE != src) {
ompi_proc = ompi_comm_peer_lookup(comm, src);
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
remote_addr = endpoint->peer_fiaddr;
}
mtl_ofi_create_recv_tag_CQD(&match_bits, &mask_bits, comm->c_contextid,
tag);
} else {
mtl_ofi_create_recv_tag(&match_bits, &mask_bits, comm->c_contextid, src,
tag);
/* src_addr is ignored when FI_DIRECTED_RECV is not used */
}
ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
&start,
@@ -606,7 +644,7 @@ ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc,
{
struct mca_mtl_request_t *mrecv_req = ofi_req->mrecv_req;
ompi_status_public_t *status = &mrecv_req->ompi_req->req_status;
status->MPI_SOURCE = MTL_OFI_GET_SOURCE(wc->tag);
status->MPI_SOURCE = mtl_ofi_get_source(wc);
status->MPI_TAG = MTL_OFI_GET_TAG(wc->tag);
status->MPI_ERROR = MPI_SUCCESS;
status->_ucount = wc->len;
@@ -628,7 +666,7 @@ ompi_mtl_ofi_mrecv_error_callback(struct fi_cq_err_entry *error,
struct mca_mtl_request_t *mrecv_req = ofi_req->mrecv_req;
ompi_status_public_t *status = &mrecv_req->ompi_req->req_status;
status->MPI_TAG = MTL_OFI_GET_TAG(ofi_req->match_bits);
status->MPI_SOURCE = MTL_OFI_GET_SOURCE(ofi_req->match_bits);
status->MPI_SOURCE = mtl_ofi_get_source((struct fi_cq_tagged_entry *) error);
switch (error->err) {
case FI_ETRUNC:
@@ -716,7 +754,7 @@ ompi_mtl_ofi_probe_callback(struct fi_cq_tagged_entry *wc,
{
ofi_req->match_state = 1;
ofi_req->match_bits = wc->tag;
ofi_req->status.MPI_SOURCE = MTL_OFI_GET_SOURCE(wc->tag);
ofi_req->status.MPI_SOURCE = mtl_ofi_get_source(wc);
ofi_req->status.MPI_TAG = MTL_OFI_GET_TAG(wc->tag);
ofi_req->status.MPI_ERROR = MPI_SUCCESS;
ofi_req->status._ucount = wc->len;
@@ -749,22 +787,28 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
struct ompi_mtl_ofi_request_t ofi_req;
ompi_proc_t *ompi_proc = NULL;
mca_mtl_ofi_endpoint_t *endpoint = NULL;
fi_addr_t remote_proc = 0;
fi_addr_t remote_proc = ompi_mtl_ofi.any_addr;
uint64_t match_bits, mask_bits;
ssize_t ret;
struct fi_msg_tagged msg;
uint64_t msgflags = FI_PEEK;
/**
* If the source is known, use its peer_fiaddr.
*/
if (MPI_ANY_SOURCE != src) {
ompi_proc = ompi_comm_peer_lookup( comm, src );
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
remote_proc = endpoint->peer_fiaddr;
}
if (ompi_mtl_ofi.fi_cq_data) {
/* If the source is known, use its peer_fiaddr. */
if (MPI_ANY_SOURCE != src) {
ompi_proc = ompi_comm_peer_lookup( comm, src );
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
remote_proc = endpoint->peer_fiaddr;
}
MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, comm->c_contextid, src, tag);
mtl_ofi_create_recv_tag_CQD(&match_bits, &mask_bits, comm->c_contextid,
tag);
}
else {
mtl_ofi_create_recv_tag(&match_bits, &mask_bits, comm->c_contextid, src,
tag);
/* src_addr is ignored when FI_DIRECTED_RECV is not used */
}
/**
* fi_trecvmsg with FI_PEEK:
@@ -829,7 +873,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
struct ompi_mtl_ofi_request_t *ofi_req;
ompi_proc_t *ompi_proc = NULL;
mca_mtl_ofi_endpoint_t *endpoint = NULL;
fi_addr_t remote_proc = 0;
fi_addr_t remote_proc = ompi_mtl_ofi.any_addr;
uint64_t match_bits, mask_bits;
ssize_t ret;
struct fi_msg_tagged msg;
@@ -843,13 +887,22 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
/**
* If the source is known, use its peer_fiaddr.
*/
if (MPI_ANY_SOURCE != src) {
ompi_proc = ompi_comm_peer_lookup( comm, src );
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
remote_proc = endpoint->peer_fiaddr;
}
MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, comm->c_contextid, src, tag);
if (ompi_mtl_ofi.fi_cq_data) {
if (MPI_ANY_SOURCE != src) {
ompi_proc = ompi_comm_peer_lookup( comm, src );
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
remote_proc = endpoint->peer_fiaddr;
}
mtl_ofi_create_recv_tag_CQD(&match_bits, &mask_bits, comm->c_contextid,
tag);
}
else {
/* src_addr is ignored when FI_DIRECTED_RECV is not used */
mtl_ofi_create_recv_tag(&match_bits, &mask_bits, comm->c_contextid, src,
tag);
}
/**
* fi_trecvmsg with FI_PEEK and FI_CLAIM:


@@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
*
* Copyright (c) 2014-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
@@ -31,6 +31,7 @@ static char *prov_exclude;
static int control_progress;
static int data_progress;
static int av_type;
static int ofi_tag_mode;
/*
* Enumerators
@@ -68,6 +69,21 @@ mca_base_var_enum_value_t av_table_type[] = {
{0, NULL}
};
enum {
MTL_OFI_TAG_AUTO=1,
MTL_OFI_TAG_1,
MTL_OFI_TAG_2,
MTL_OFI_TAG_FULL,
};
mca_base_var_enum_value_t ofi_tag_mode_type[] = {
{MTL_OFI_TAG_AUTO, "auto"},
{MTL_OFI_TAG_1, "ofi_tag_1"},
{MTL_OFI_TAG_2, "ofi_tag_2"},
{MTL_OFI_TAG_FULL, "ofi_tag_full"},
{0, NULL}
};
mca_mtl_ofi_component_t mca_mtl_ofi_component = {
{
@@ -136,7 +152,37 @@ ompi_mtl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_ofi.ofi_progress_event_count);
free(desc);
ret = mca_base_var_enum_create ("ofi_tag_mode_type", ofi_tag_mode_type , &new_enum);
if (OPAL_SUCCESS != ret) {
return ret;
}
ofi_tag_mode = MTL_OFI_TAG_AUTO;
asprintf(&desc, "Mode specifying how many bits to use for various MPI values in OFI/Libfabric"
" communications. Some Libfabric provider network types can support most of Open MPI"
" needs; others can only supply a limited number of bits, which then must be split"
" across the MPI communicator ID, MPI source rank, and MPI tag. Three different"
" splitting schemes are available: ofi_tag_full (%d bits for the communicator, %d bits"
" for the source rank, and %d bits for the tag), ofi_tag_1 (%d bits for the communicator"
", %d bits source rank, %d bits tag), ofi_tag_2 (%d bits for the communicator"
", %d bits source rank, %d bits tag). By default, this MCA variable is set to \"auto\","
" which will first try to use ofi_tag_full, and if that fails, fall back to ofi_tag_1.",
MTL_OFI_CID_BIT_COUNT_DATA, 32, MTL_OFI_TAG_BIT_COUNT_DATA,
MTL_OFI_CID_BIT_COUNT_1, MTL_OFI_SOURCE_BIT_COUNT_1, MTL_OFI_TAG_BIT_COUNT_1,
MTL_OFI_CID_BIT_COUNT_2, MTL_OFI_SOURCE_BIT_COUNT_2, MTL_OFI_TAG_BIT_COUNT_2);
mca_base_component_var_register (&mca_mtl_ofi_component.super.mtl_version,
"tag_mode",
desc,
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&ofi_tag_mode);
free(desc);
OBJ_RELEASE(new_enum);
ret = mca_base_var_enum_create ("control_prog_type", control_prog_type, &new_enum);
if (OPAL_SUCCESS != ret) {
@@ -304,13 +350,96 @@ select_ofi_provider(struct fi_info *providers)
return prov;
}
/* Check if FI_REMOTE_CQ_DATA is supported; if so, send the source rank there.
 * FI_DIRECTED_RECV is also needed so receives can discriminate the source.
 */
static int
ompi_mtl_ofi_check_fi_remote_cq_data(int fi_version,
struct fi_info *hints,
struct fi_info *provider,
struct fi_info **prov_cq_data)
{
int ret;
char *provider_name;
struct fi_info *hints_dup;
hints_dup = fi_dupinfo(hints);
provider_name = strdup(provider->fabric_attr->prov_name);
hints_dup->fabric_attr->prov_name = provider_name;
hints_dup->caps |= FI_TAGGED | FI_DIRECTED_RECV;
/* Ask for the size that OMPI uses for the source rank number */
hints_dup->domain_attr->cq_data_size = sizeof(int);
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, prov_cq_data);
if ((0 != ret) && (-FI_ENODATA != ret)) {
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_getinfo",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
return ret;
} else if (-FI_ENODATA == ret) {
/* The provider does not support FI_REMOTE_CQ_DATA */
*prov_cq_data = NULL;
}
fi_freeinfo(hints_dup);
return OMPI_SUCCESS;
}
static void
ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) {
switch (ofi_tag_mode) {
case MTL_OFI_TAG_1:
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_1 ) - 1);
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_1 - 1)) - 1);
ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_1;
ompi_mtl_ofi.num_bits_source_rank = MTL_OFI_SOURCE_BIT_COUNT_1;
ompi_mtl_ofi.source_rank_mask = MTL_OFI_SOURCE_MASK_1;
ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_1;
ompi_mtl_ofi.num_bits_mpi_tag = MTL_OFI_TAG_BIT_COUNT_1;
ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_1;
ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_1;
ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_1;
break;
case MTL_OFI_TAG_2:
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_2 ) - 1);
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_2 - 1)) - 1);
ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_2;
ompi_mtl_ofi.num_bits_source_rank = MTL_OFI_SOURCE_BIT_COUNT_2;
ompi_mtl_ofi.source_rank_mask = MTL_OFI_SOURCE_MASK_2;
ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_2;
ompi_mtl_ofi.num_bits_mpi_tag = MTL_OFI_TAG_BIT_COUNT_2;
ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_2;
ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_2;
ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_2;
break;
default: /* use FI_REMOTE_CQ_DATA */
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_DATA ) - 1);
ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_DATA - 1)) - 1);
ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_DATA;
ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_DATA;
ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_DATA;
ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_DATA;
}
}
static mca_mtl_base_module_t*
ompi_mtl_ofi_component_init(bool enable_progress_threads,
bool enable_mpi_threads)
{
int ret, fi_version;
struct fi_info *hints;
struct fi_info *providers = NULL, *prov = NULL;
struct fi_info *providers = NULL;
struct fi_info *prov = NULL;
struct fi_info *prov_cq_data = NULL;
struct fi_cq_attr cq_attr = {0};
struct fi_av_attr av_attr = {0};
char ep_name[FI_NAME_MAX] = {0};
@@ -411,6 +540,39 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
goto error;
}
/**
* Select the format of the OFI tag
*/
if ((MTL_OFI_TAG_AUTO == ofi_tag_mode) ||
(MTL_OFI_TAG_FULL == ofi_tag_mode)) {
ret = ompi_mtl_ofi_check_fi_remote_cq_data(fi_version,
hints, prov,
&prov_cq_data);
if (OMPI_SUCCESS != ret) {
goto error;
} else if (NULL == prov_cq_data) {
/* No support for FI_REMOTE_CQ_DATA */
fi_freeinfo(prov_cq_data);
ompi_mtl_ofi.fi_cq_data = false;
if (MTL_OFI_TAG_AUTO == ofi_tag_mode) {
/* Fallback to MTL_OFI_TAG_1 */
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1);
} else { /* MTL_OFI_TAG_FULL */
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: OFI provider %s does not support FI_REMOTE_CQ_DATA\n",
__FILE__, __LINE__, prov->fabric_attr->prov_name);
goto error;
}
} else {
/* Use FI_REMOTE_CQ_DATA */
ompi_mtl_ofi.fi_cq_data = true;
prov = prov_cq_data;
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_FULL);
}
} else { /* MTL_OFI_TAG_1 or MTL_OFI_TAG_2 */
ompi_mtl_ofi.fi_cq_data = false;
ompi_mtl_ofi_define_tag_mode(ofi_tag_mode);
}
/**
* Open fabric
@@ -503,7 +665,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
* Allocate memory for storing the CQ events read in OFI progress.
*/
ompi_mtl_ofi.progress_entries = calloc(ompi_mtl_ofi.ofi_progress_event_count, sizeof(struct fi_cq_tagged_entry));
if (OPAL_UNLIKELY(!ompi_mtl_ofi.progress_entries)) {
if (NULL == ompi_mtl_ofi.progress_entries) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: alloc of CQ event storage failed: %s\n",
__FILE__, __LINE__, strerror(errno));
@@ -614,6 +776,9 @@ error:
if (providers) {
(void) fi_freeinfo(providers);
}
if (prov_cq_data) {
(void) fi_freeinfo(prov_cq_data);
}
if (hints) {
(void) fi_freeinfo(hints);
}


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
*
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
@@ -55,6 +55,21 @@ typedef struct mca_mtl_ofi_module_t {
/** CQ event storage */
struct fi_cq_tagged_entry *progress_entries;
/** Use FI_REMOTE_CQ_DATA*/
bool fi_cq_data;
/** Info used to create the OFI tag **/
unsigned long long source_rank_tag_mask;
int num_bits_source_rank;
unsigned long long source_rank_mask;
unsigned long long mpi_tag_mask;
int num_bits_mpi_tag;
/** Synchronous protocol tag bits */
unsigned long long sync_send;
unsigned long long sync_send_ack;
unsigned long long sync_proto_mask;
} mca_mtl_ofi_module_t;
extern mca_mtl_ofi_module_t ompi_mtl_ofi;
@@ -64,75 +79,165 @@ typedef struct mca_mtl_ofi_component_t {
mca_mtl_base_component_2_0_0_t super;
} mca_mtl_ofi_component_t;
/*OFI TAG:
* Define 3 different OFI tag distributions:
* 1) Support FI_REMOTE_CQ_DATA: No need for source rank in the tag
* 2) ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported
* 3) ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
* with more bits for the communicator ID.
* More details of the tags are in the README file (mtl_ofi_tag_mode).
*/
/* match/ignore bit manipulation
 *
 * 0 123 4567 01234567 01234567 01234567 01234567 01234567 01234567 01234567
 * | |   |                      |                 |
 * | |   |      context id      |      source     |        message tag
 * ^ ^
 * | |
 * | +- protocol
 * +---- ACK flag
 */
/* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum)
 * 01234567 01234567 01234567 0123 4567 01234567 01234567 01234567 01234567
 *  |                             |    |
 *  |         context_id          |prot|           message tag
 */
#define MTL_OFI_PROTO_BIT_COUNT (4)
#define MTL_OFI_PROTOCOL_HEADER_MASK (0xF000000000000000ULL)
#define MTL_OFI_PROTOCOL_MASK (0x7000000000000000ULL)
#define MTL_OFI_CONTEXT_MASK (0x0FFF000000000000ULL)
#define MTL_OFI_SOURCE_MASK (0x0000FFFF00000000ULL)
#define MTL_OFI_TAG_MASK (0x00000000FFFFFFFFULL)
#define MTL_OFI_CID_BIT_COUNT_DATA (28)
#define MTL_OFI_TAG_MASK_DATA (0x00000000FFFFFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_DATA (32)
#define MTL_OFI_PROTO_MASK_DATA (0x0000000F00000000ULL)
#define MTL_OFI_SYNC_SEND_DATA (0x0000000100000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000900000000ULL)
#define MTL_OFI_SYNC_SEND (0x1000000000000000ULL)
#define MTL_OFI_SYNC_SEND_ACK (0x9000000000000000ULL)
/* Send tag with CQ_DATA */
__opal_attribute_always_inline__ static inline uint64_t
mtl_ofi_create_send_tag_CQD(int comm_id, int tag)
{
uint64_t match_bits = comm_id;
match_bits = (match_bits << (MTL_OFI_TAG_BIT_COUNT_DATA
+ MTL_OFI_PROTO_BIT_COUNT));
match_bits |= (tag & MTL_OFI_TAG_MASK_DATA);
return match_bits;
}
/* send posting */
#define MTL_OFI_SET_SEND_BITS(match_bits, contextid, source, tag, type) \
{ \
match_bits = contextid; \
match_bits = (match_bits << 16); \
match_bits |= (uint64_t)source; \
match_bits = (match_bits << 32); \
match_bits |= (MTL_OFI_TAG_MASK & tag) | type; \
/* Receive tag with CQ_DATA */
__opal_attribute_always_inline__ static inline void
mtl_ofi_create_recv_tag_CQD(uint64_t *match_bits, uint64_t *mask_bits,
int comm_id, int tag)
{
*mask_bits = ompi_mtl_ofi.sync_proto_mask;
*match_bits = (uint64_t) comm_id;
*match_bits = (*match_bits << (MTL_OFI_PROTO_BIT_COUNT
+ MTL_OFI_TAG_BIT_COUNT_DATA));
if (MPI_ANY_TAG == tag) {
/* Special negative tags are used for collective operations.
* MPI_ANY_TAG should not match these special tags.
* See ompi/mca/coll/base/coll_tags.h
*/
*mask_bits |= (ompi_mtl_ofi.mpi_tag_mask>>1);
} else {
*match_bits |= (ompi_mtl_ofi.mpi_tag_mask & tag);
}
}
/*
* ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported
*
* 01234567 0123 4567 01234567 0123 4567 01234567 01234567 01234567 01234567
 *             |                   |    |
 *   Comm id   |      source       |prot|          message tag
*/
#define MTL_OFI_CID_BIT_COUNT_1 (12)
#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFF000000000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_1 (16)
#define MTL_OFI_SOURCE_MASK_1 (0x000000000000FFFFULL)
#define MTL_OFI_TAG_MASK_1 (0x00000000FFFFFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_1 (32)
#define MTL_OFI_PROTO_MASK_1 (0x0000000F00000000ULL)
#define MTL_OFI_SYNC_SEND_1 (0x0000000100000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000900000000ULL)
/*
* ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
*
* 01234567 01234567 01234567 01234567 01234567 0123 4567 01234567 01234567
 *                          |                 |    |
 *         Comm id          |      source     |prot|      message tag
*/
#define MTL_OFI_CID_BIT_COUNT_2 (24)
#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFF000000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_2 (16)
#define MTL_OFI_SOURCE_MASK_2 (0x000000000000FFFFULL)
#define MTL_OFI_TAG_MASK_2 (0x00000000000FFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_2 (20)
#define MTL_OFI_PROTO_MASK_2 (0x0000000000F00000ULL)
#define MTL_OFI_SYNC_SEND_2 (0x0000000000100000ULL)
#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000900000ULL)
/* Send tag */
__opal_attribute_always_inline__ static inline uint64_t
mtl_ofi_create_send_tag(int comm_id, int source, int tag)
{
uint64_t match_bits = comm_id;
match_bits = (match_bits << ompi_mtl_ofi.num_bits_source_rank);
match_bits |= (uint64_t)(source & ompi_mtl_ofi.source_rank_mask);
match_bits = (match_bits << (ompi_mtl_ofi.num_bits_mpi_tag
+ MTL_OFI_PROTO_BIT_COUNT));
match_bits |= (tag & ompi_mtl_ofi.mpi_tag_mask);
return match_bits;
}
/* Receive tag*/
__opal_attribute_always_inline__ static inline void
mtl_ofi_create_recv_tag(uint64_t *match_bits, uint64_t *mask_bits,
int comm_id, int source, int tag)
{
*mask_bits = ompi_mtl_ofi.sync_proto_mask;
*match_bits = comm_id;
*match_bits = (*match_bits << ompi_mtl_ofi.num_bits_source_rank);
if (MPI_ANY_SOURCE == source) {
*match_bits = (*match_bits << (ompi_mtl_ofi.num_bits_mpi_tag
+ MTL_OFI_PROTO_BIT_COUNT));
*mask_bits |= ompi_mtl_ofi.source_rank_tag_mask;
} else {
*match_bits |= (uint64_t)(source & ompi_mtl_ofi.source_rank_mask);
*match_bits = (*match_bits << (ompi_mtl_ofi.num_bits_mpi_tag
+ MTL_OFI_PROTO_BIT_COUNT));
}
/* receive posting */
/* Special tags are used for collective operations.
* MPI_ANY_TAG should not match these special tags.
* See ompi/mca/coll/base/coll_tags.h
*/
#define MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, contextid, source, tag) \
{ \
match_bits = 0; \
mask_bits = MTL_OFI_PROTOCOL_MASK; \
\
match_bits = contextid; \
match_bits = (match_bits << 16); \
\
if (MPI_ANY_SOURCE == source) { \
match_bits = (match_bits << 32); \
mask_bits |= MTL_OFI_SOURCE_MASK; \
} else { \
match_bits |= (uint64_t)source; \
match_bits = (match_bits << 32); \
} \
\
if (MPI_ANY_TAG == tag) { \
mask_bits |= 0x000000007FFFFFFFULL; \
} else { \
match_bits |= (MTL_OFI_TAG_MASK & tag); \
} \
if (MPI_ANY_TAG == tag) {
/* Special negative tags are used for collective operations.
* MPI_ANY_TAG should not match these special tags.
* See ompi/mca/coll/base/coll_tags.h
*/
*mask_bits |= (ompi_mtl_ofi.mpi_tag_mask>>1);
} else {
*match_bits |= (ompi_mtl_ofi.mpi_tag_mask & tag);
}
}
#define MTL_OFI_SET_SYNC_SEND(match_bits) \
match_bits |= ompi_mtl_ofi.sync_send
#define MTL_OFI_IS_SYNC_SEND(match_bits) \
(MTL_OFI_SYNC_SEND == (MTL_OFI_PROTOCOL_HEADER_MASK & match_bits))
#define MTL_OFI_IS_SYNC_SEND_ACK(match_bits) \
(MTL_OFI_SYNC_SEND_ACK == (MTL_OFI_PROTOCOL_HEADER_MASK & match_bits))
(ompi_mtl_ofi.sync_send == (ompi_mtl_ofi.sync_proto_mask & match_bits))
#define MTL_OFI_IS_SYNC_SEND_ACK(match_bits) \
(ompi_mtl_ofi.sync_send_ack == (ompi_mtl_ofi.sync_proto_mask & match_bits))
#define MTL_OFI_GET_TAG(match_bits) \
((int)(match_bits & MTL_OFI_TAG_MASK))
#define MTL_OFI_GET_SOURCE(match_bits) \
((int)((match_bits & MTL_OFI_SOURCE_MASK) >> 32))
((int)(match_bits & ompi_mtl_ofi.mpi_tag_mask))
__opal_attribute_always_inline__ static inline int
mtl_ofi_get_source(struct fi_cq_tagged_entry *wc)
{
int src;
if (ompi_mtl_ofi.fi_cq_data) {
src = (int) wc->data;
}
else {
src = (int)((wc->tag >> (MTL_OFI_PROTO_BIT_COUNT +
ompi_mtl_ofi.num_bits_mpi_tag)) & ompi_mtl_ofi.source_rank_mask);
}
return src;
}
END_C_DECLS
#endif /* MTL_OFI_TYPES_H_HAS_BEEN_INCLUDED */