1
1

Merge pull request #5397 from nrspruit/ns_ofi_mtl_ssend

MTL OFI: Redesign sync send with reduced tag bits and quick ack
Этот коммит содержится в:
Matias Cabral 2018-07-17 10:14:33 -07:00 коммит произвёл GitHub
родитель 9aa5168795 9a17864278
Коммит be3cb01cb4
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 72 добавлений и 69 удалений

Просмотреть файл

@ -24,8 +24,8 @@ CQ.
OFI TAG:
MPI needs to send 96 bits of information per message (32 bits communicator id,
32 bits source rank, 32 bits MPI tag) but OFI only offers 64 bits tags. In
addition, the OFI MTL uses 4 bits of the OFI tag for the synchronous send protocol.
Therefore, there are only 60 bits available in the OFI tag for message usage. The
addition, the OFI MTL uses 2 bits of the OFI tag for the synchronous send protocol.
Therefore, there are only 62 bits available in the OFI tag for message usage. The
OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to address this:
"auto" (Default):
@ -36,19 +36,19 @@ fall back to "ofi_tag_1".
"ofi_tag_1":
For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 60
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 62
bits available in the OFI tag. There are two options available with different
number of bits for the Communicator ID and MPI tag fields. This tag distribution
offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to
provider reserved bits (see mem_tag_format below), 16 bits for Source Rank (max
Source Rank 65,535), 32 bits for MPI tag (max MPI tag is INT_MAX).
provider reserved bits (see mem_tag_format below), 18 bits for Source Rank (max
Source Rank 262,143), 32 bits for MPI tag (max MPI tag is INT_MAX).
"ofi_tag_2":
Same as "ofi_tag_1" but offering a different OFI tag distribution for
applications that may require a greater number of supported Communicators at the
expense of fewer MPI tag bits. This tag distribution offers: 24 bits for
Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 16
bits for Source Rank (max Source Rank 65,535), 20 bits for MPI tag (max MPI tag
Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 18
bits for Source Rank (max Source Rank 262,143), 20 bits for MPI tag (max MPI tag
524,287).
"ofi_tag_full":

Просмотреть файл

@ -274,8 +274,6 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
ofi_req->completion_count = 2;
MTL_OFI_SET_SYNC_SEND(match_bits);
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep,
NULL,
0,
@ -291,6 +289,8 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
free(ack_req);
return ompi_mtl_ofi_get_error(ret);
}
/* The SYNC_SEND tag bit is set for the send operation only.*/
MTL_OFI_SET_SYNC_SEND(match_bits);
} else {
ofi_req->completion_count = 1;
}
@ -423,20 +423,6 @@ ompi_mtl_ofi_isend(struct mca_mtl_base_module_t *mtl,
return ret;
}
/**
* Called when a completion for SYNC ACK send is received.
* This completes the synchronous recv operation. Thus, we
* call the upper layer's completion function.
*
* @param wc      Completion queue entry for the event; not read by this
*                function.
* @param ofi_req Request whose super.completion_callback is invoked with
*                &ofi_req->super as its argument.
*
* @return OMPI_SUCCESS unconditionally.
*/
__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_sync_recv_callback(struct fi_cq_tagged_entry *wc,
ompi_mtl_ofi_request_t *ofi_req)
{
/* Hand the finished request back to the upper layer. */
ofi_req->super.completion_callback(&ofi_req->super);
return OMPI_SUCCESS;
}
/**
* Called when a completion for a posted recv is received.
*/
@ -450,6 +436,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
mca_mtl_ofi_endpoint_t *endpoint = NULL;
int src = mtl_ofi_get_source(wc);
ompi_status_public_t *status = NULL;
struct fi_msg_tagged tagged_msg;
assert(ofi_req->super.ompi_req);
status = &ofi_req->super.ompi_req->req_status;
@ -487,21 +474,25 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
}
/**
* We do not want any SYNC_SEND_ACK here!
* See mtl_ofi_send.c for details.
* We can only accept MTL_OFI_SYNC_SEND in the standard recv callback.
* MTL_OFI_SYNC_SEND_ACK should only be received in the send_ack
* callback.
*/
assert(!MTL_OFI_IS_SYNC_SEND_ACK(wc->tag));
/**
* If this recv is part of an MPI_Ssend operation, then we send an
* acknowledgment back to the sender. The fi_context can be
* re-used safely because the previous operation has completed.
* This recv request will complete once we get a completion for
* this send. See ompi_mtl_ofi_sync_recv_callback().
* Otherwise, this request is now complete.
* acknowledgment back to the sender.
* The ack message is sent without generating a completion event in
* the completion queue by not setting FI_COMPLETION in the flags to
* fi_tsendmsg(FI_SELECTIVE_COMPLETION).
* This is done since the 0 byte message requires no
* notification on the send side for a successful completion.
* If a failure occurs the provider will notify the error
* in the cq_readerr during OFI progress. Once the message has been
* successfully processed the request is marked as completed.
*/
if (OPAL_UNLIKELY(MTL_OFI_IS_SYNC_SEND(wc->tag))) {
ofi_req->event_callback = ompi_mtl_ofi_sync_recv_callback;
/**
* If the recv request was posted for any source,
* we need to extract the source's actual address.
@ -511,23 +502,32 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc);
ofi_req->remote_addr = endpoint->peer_fiaddr;
}
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
NULL,
0,
NULL,
ofi_req->remote_addr,
wc->tag | ompi_mtl_ofi.sync_send_ack,
(void *) &ofi_req->ctx));
tagged_msg.msg_iov = NULL;
tagged_msg.desc = NULL;
tagged_msg.iov_count = 0;
tagged_msg.addr = ofi_req->remote_addr;
/**
* We must continue to use the user's original tag but remove the
* sync_send protocol tag bit and instead apply the sync_send_ack
* tag bit to complete the initiator's sync send receive.
*/
tagged_msg.tag = (wc->tag | ompi_mtl_ofi.sync_send_ack) & ~ompi_mtl_ofi.sync_send;
tagged_msg.context = NULL;
tagged_msg.data = 0;
MTL_OFI_RETRY_UNTIL_DONE(fi_tsendmsg(ompi_mtl_ofi.ep,
&tagged_msg, 0));
if (OPAL_UNLIKELY(0 > ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_tsend failed: %s(%zd)",
"%s:%d: fi_tsendmsg failed: %s(%zd)",
__FILE__, __LINE__, fi_strerror(-ret), ret);
status->MPI_ERROR = OMPI_ERROR;
}
} else {
ofi_req->super.completion_callback(&ofi_req->super);
}
ofi_req->super.completion_callback(&ofi_req->super);
return OMPI_SUCCESS;
}
@ -701,7 +701,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
struct fi_msg_tagged msg;
int ompi_ret;
ssize_t ret;
uint64_t msgflags = FI_CLAIM;
uint64_t msgflags = FI_CLAIM | FI_COMPLETION;
ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
&start,
@ -791,7 +791,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
uint64_t match_bits, mask_bits;
ssize_t ret;
struct fi_msg_tagged msg;
uint64_t msgflags = FI_PEEK;
uint64_t msgflags = FI_PEEK | FI_COMPLETION;
if (ompi_mtl_ofi.fi_cq_data) {
/* If the source is known, use its peer_fiaddr. */
@ -877,7 +877,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
uint64_t match_bits, mask_bits;
ssize_t ret;
struct fi_msg_tagged msg;
uint64_t msgflags = FI_PEEK | FI_CLAIM;
uint64_t msgflags = FI_PEEK | FI_CLAIM | FI_COMPLETION;
ofi_req = malloc(sizeof *ofi_req);
if (NULL == ofi_req) {

Просмотреть файл

@ -467,6 +467,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
hints->caps = FI_TAGGED; /* Tag matching interface */
hints->tx_attr->msg_order = FI_ORDER_SAS;
hints->rx_attr->msg_order = FI_ORDER_SAS;
hints->rx_attr->op_flags = FI_COMPLETION;
hints->tx_attr->op_flags = FI_COMPLETION;
hints->domain_attr->threading = FI_THREAD_UNSPEC;
@ -691,7 +693,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
*/
ret = fi_ep_bind(ompi_mtl_ofi.ep,
(fid_t)ompi_mtl_ofi.cq,
FI_SEND | FI_RECV);
FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_bind CQ-EP failed: %s\n",

Просмотреть файл

@ -89,18 +89,19 @@ typedef struct mca_mtl_ofi_component_t {
*/
/* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum)
* 01234567 01234567 01234567 0123 4567 01234567 01234567 01234567 01234567
* 01234567 01234567 01234567 012345 67 01234567 01234567 01234567 01234567
* | |
* context_id |prot| message tag
*/
#define MTL_OFI_PROTO_BIT_COUNT (4)
#define MTL_OFI_PROTO_BIT_COUNT (2)
#define MTL_OFI_CID_BIT_COUNT_DATA (28)
#define MTL_OFI_CID_MASK_DATA (0xFFFFFFFC00000000ULL)
#define MTL_OFI_CID_BIT_COUNT_DATA (30)
#define MTL_OFI_TAG_MASK_DATA (0x00000000FFFFFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_DATA (32)
#define MTL_OFI_PROTO_MASK_DATA (0x0000000F00000000ULL)
#define MTL_OFI_PROTO_MASK_DATA (0x0000000300000000ULL)
#define MTL_OFI_SYNC_SEND_DATA (0x0000000100000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000900000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000200000000ULL)
/* Send tag with CQ_DATA */
__opal_attribute_always_inline__ static inline uint64_t
@ -136,38 +137,38 @@ mtl_ofi_create_recv_tag_CQD(uint64_t *match_bits, uint64_t *mask_bits,
/*
* ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported
*
* 01234567 0123 4567 01234567 0123 4567 01234567 01234567 01234567 01234567
* 01234567 0123 4567 01234567 012345 67 01234567 01234567 01234567 01234567
* | | |
* Comm id | source |prot| message tag
*/
#define MTL_OFI_CID_BIT_COUNT_1 (12)
#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFF000000000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_1 (16)
#define MTL_OFI_SOURCE_MASK_1 (0x000000000000FFFFULL)
#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFFC00000000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_1 (18)
#define MTL_OFI_SOURCE_MASK_1 (0x000000000003FFFFULL)
#define MTL_OFI_TAG_MASK_1 (0x00000000FFFFFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_1 (32)
#define MTL_OFI_PROTO_MASK_1 (0x0000000F00000000ULL)
#define MTL_OFI_PROTO_MASK_1 (0x0000000300000000ULL)
#define MTL_OFI_SYNC_SEND_1 (0x0000000100000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000900000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000200000000ULL)
/*
* ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
*
* 01234567 01234567 01234567 01234567 01234567 0123 4567 01234567 01234567
* 01234567 01234567 01234567 01234567 01234567 01 23 4567 01234567 01234567
* | | |
* Comm id | source |prot| message tag
*/
#define MTL_OFI_CID_BIT_COUNT_2 (24)
#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFF000000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_2 (16)
#define MTL_OFI_SOURCE_MASK_2 (0x000000000000FFFFULL)
#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFFC00000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_2 (18)
#define MTL_OFI_SOURCE_MASK_2 (0x000000000003FFFFULL)
#define MTL_OFI_TAG_MASK_2 (0x00000000000FFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_2 (20)
#define MTL_OFI_PROTO_MASK_2 (0x0000000000F00000ULL)
#define MTL_OFI_PROTO_MASK_2 (0x0000000000300000ULL)
#define MTL_OFI_SYNC_SEND_2 (0x0000000000100000ULL)
#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000900000ULL)
#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000200000ULL)
/* Send tag */
__opal_attribute_always_inline__ static inline uint64_t