Merge pull request #5397 from nrspruit/ns_ofi_mtl_ssend
MTL OFI: Redesign sync send with reduced tag bits and quick ack
Этот коммит содержится в:
Коммит
be3cb01cb4
@ -24,8 +24,8 @@ CQ.
|
|||||||
OFI TAG:
|
OFI TAG:
|
||||||
MPI needs to send 96 bits of information per message (32 bits communicator id,
|
MPI needs to send 96 bits of information per message (32 bits communicator id,
|
||||||
32 bits source rank, 32 bits MPI tag) but OFI only offers 64 bits tags. In
|
32 bits source rank, 32 bits MPI tag) but OFI only offers 64 bits tags. In
|
||||||
addition, the OFI MTL uses 4 bits of the OFI tag for the synchronous send protocol.
|
addition, the OFI MTL uses 2 bits of the OFI tag for the synchronous send protocol.
|
||||||
Therefore, there are only 60 bits available in the OFI tag for message usage. The
|
Therefore, there are only 62 bits available in the OFI tag for message usage. The
|
||||||
OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to address this:
|
OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to address this:
|
||||||
|
|
||||||
"auto" (Default):
|
"auto" (Default):
|
||||||
@ -36,19 +36,19 @@ fall back to "ofi_tag_1".
|
|||||||
|
|
||||||
"ofi_tag_1":
|
"ofi_tag_1":
|
||||||
For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will
|
For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will
|
||||||
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 60
|
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 62
|
||||||
bits available in the OFI tag. There are two options available with different
|
bits available in the OFI tag. There are two options available with different
|
||||||
number of bits for the Communicator ID and MPI tag fields. This tag distribution
|
number of bits for the Communicator ID and MPI tag fields. This tag distribution
|
||||||
offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to
|
offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to
|
||||||
provider reserved bits (see mem_tag_format below), 16 bits for Source Rank (max
|
provider reserved bits (see mem_tag_format below), 18 bits for Source Rank (max
|
||||||
Source Rank 65,535), 32 bits for MPI tag (max MPI tag is INT_MAX).
|
Source Rank 262,143), 32 bits for MPI tag (max MPI tag is INT_MAX).
|
||||||
|
|
||||||
"ofi_tag_2":
|
"ofi_tag_2":
|
||||||
Same as "ofi_tag_1" but offering a different OFI tag distribution for
|
Same as "ofi_tag_1" but offering a different OFI tag distribution for
|
||||||
applications that may require a greater number of supported Communicators at the
|
applications that may require a greater number of supported Communicators at the
|
||||||
expense of fewer MPI tag bits. This tag distribution offers: 24 bits for
|
expense of fewer MPI tag bits. This tag distribution offers: 24 bits for
|
||||||
Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 16
|
Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 18
|
||||||
bits for Source Rank (max Source Rank 65,535), 20 bits for MPI tag (max MPI tag
|
bits for Source Rank (max Source Rank 262,143), 20 bits for MPI tag (max MPI tag
|
||||||
524,287).
|
524,287).
|
||||||
|
|
||||||
"ofi_tag_full":
|
"ofi_tag_full":
|
||||||
|
@ -274,8 +274,6 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
|
|||||||
|
|
||||||
ofi_req->completion_count = 2;
|
ofi_req->completion_count = 2;
|
||||||
|
|
||||||
MTL_OFI_SET_SYNC_SEND(match_bits);
|
|
||||||
|
|
||||||
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep,
|
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep,
|
||||||
NULL,
|
NULL,
|
||||||
0,
|
0,
|
||||||
@ -291,6 +289,8 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
|
|||||||
free(ack_req);
|
free(ack_req);
|
||||||
return ompi_mtl_ofi_get_error(ret);
|
return ompi_mtl_ofi_get_error(ret);
|
||||||
}
|
}
|
||||||
|
/* The SYNC_SEND tag bit is set for the send operation only.*/
|
||||||
|
MTL_OFI_SET_SYNC_SEND(match_bits);
|
||||||
} else {
|
} else {
|
||||||
ofi_req->completion_count = 1;
|
ofi_req->completion_count = 1;
|
||||||
}
|
}
|
||||||
@ -423,20 +423,6 @@ ompi_mtl_ofi_isend(struct mca_mtl_base_module_t *mtl,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Called when a completion for SYNC ACK send is received.
|
|
||||||
* This completes the synchronous recv operation. Thus, we
|
|
||||||
* call the upper layer's completion function.
|
|
||||||
*/
|
|
||||||
__opal_attribute_always_inline__ static inline int
|
|
||||||
ompi_mtl_ofi_sync_recv_callback(struct fi_cq_tagged_entry *wc,
|
|
||||||
ompi_mtl_ofi_request_t *ofi_req)
|
|
||||||
{
|
|
||||||
ofi_req->super.completion_callback(&ofi_req->super);
|
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Called when a completion for a posted recv is received.
|
* Called when a completion for a posted recv is received.
|
||||||
*/
|
*/
|
||||||
@ -450,6 +436,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
|
|||||||
mca_mtl_ofi_endpoint_t *endpoint = NULL;
|
mca_mtl_ofi_endpoint_t *endpoint = NULL;
|
||||||
int src = mtl_ofi_get_source(wc);
|
int src = mtl_ofi_get_source(wc);
|
||||||
ompi_status_public_t *status = NULL;
|
ompi_status_public_t *status = NULL;
|
||||||
|
struct fi_msg_tagged tagged_msg;
|
||||||
|
|
||||||
assert(ofi_req->super.ompi_req);
|
assert(ofi_req->super.ompi_req);
|
||||||
status = &ofi_req->super.ompi_req->req_status;
|
status = &ofi_req->super.ompi_req->req_status;
|
||||||
@ -487,21 +474,25 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* We do not want any SYNC_SEND_ACK here!
|
* We can only accept MTL_OFI_SYNC_SEND in the standard recv callback.
|
||||||
* See mtl_ofi_send.c for details.
|
* MTL_OFI_SYNC_SEND_ACK should only be received in the send_ack
|
||||||
|
* callback.
|
||||||
*/
|
*/
|
||||||
assert(!MTL_OFI_IS_SYNC_SEND_ACK(wc->tag));
|
assert(!MTL_OFI_IS_SYNC_SEND_ACK(wc->tag));
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If this recv is part of an MPI_Ssend operation, then we send an
|
* If this recv is part of an MPI_Ssend operation, then we send an
|
||||||
* acknowledgment back to the sender. The fi_context can be
|
* acknowledgment back to the sender.
|
||||||
* re-used safely because the previous operation has completed.
|
* The ack message is sent without generating a completion event in
|
||||||
* This recv request will complete once we get a completion for
|
* the completion queue by not setting FI_COMPLETION in the flags to
|
||||||
* this send. See ompi_mtl_ofi_sync_recv_callback().
|
* fi_tsendmsg(FI_SELECTIVE_COMPLETION).
|
||||||
* Otherwise, this request is now complete.
|
* This is done since the 0 byte message requires no
|
||||||
|
* notification on the send side for a successful completion.
|
||||||
|
* If a failure occurs the provider will notify the error
|
||||||
|
* in the cq_readerr during OFI progress. Once the message has been
|
||||||
|
* successfully processed the request is marked as completed.
|
||||||
*/
|
*/
|
||||||
if (OPAL_UNLIKELY(MTL_OFI_IS_SYNC_SEND(wc->tag))) {
|
if (OPAL_UNLIKELY(MTL_OFI_IS_SYNC_SEND(wc->tag))) {
|
||||||
ofi_req->event_callback = ompi_mtl_ofi_sync_recv_callback;
|
|
||||||
/**
|
/**
|
||||||
* If the recv request was posted for any source,
|
* If the recv request was posted for any source,
|
||||||
* we need to extract the source's actual address.
|
* we need to extract the source's actual address.
|
||||||
@ -511,23 +502,32 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
|
|||||||
endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc);
|
endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc);
|
||||||
ofi_req->remote_addr = endpoint->peer_fiaddr;
|
ofi_req->remote_addr = endpoint->peer_fiaddr;
|
||||||
}
|
}
|
||||||
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
|
|
||||||
NULL,
|
tagged_msg.msg_iov = NULL;
|
||||||
0,
|
tagged_msg.desc = NULL;
|
||||||
NULL,
|
tagged_msg.iov_count = 0;
|
||||||
ofi_req->remote_addr,
|
tagged_msg.addr = ofi_req->remote_addr;
|
||||||
wc->tag | ompi_mtl_ofi.sync_send_ack,
|
/**
|
||||||
(void *) &ofi_req->ctx));
|
* We must continue to use the user's original tag but remove the
|
||||||
|
* sync_send protocol tag bit and instead apply the sync_send_ack
|
||||||
|
* tag bit to complete the initiator's sync send receive.
|
||||||
|
*/
|
||||||
|
tagged_msg.tag = (wc->tag | ompi_mtl_ofi.sync_send_ack) & ~ompi_mtl_ofi.sync_send;
|
||||||
|
tagged_msg.context = NULL;
|
||||||
|
tagged_msg.data = 0;
|
||||||
|
|
||||||
|
MTL_OFI_RETRY_UNTIL_DONE(fi_tsendmsg(ompi_mtl_ofi.ep,
|
||||||
|
&tagged_msg, 0));
|
||||||
if (OPAL_UNLIKELY(0 > ret)) {
|
if (OPAL_UNLIKELY(0 > ret)) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
"%s:%d: fi_tsend failed: %s(%zd)",
|
"%s:%d: fi_tsendmsg failed: %s(%zd)",
|
||||||
__FILE__, __LINE__, fi_strerror(-ret), ret);
|
__FILE__, __LINE__, fi_strerror(-ret), ret);
|
||||||
status->MPI_ERROR = OMPI_ERROR;
|
status->MPI_ERROR = OMPI_ERROR;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
ofi_req->super.completion_callback(&ofi_req->super);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ofi_req->super.completion_callback(&ofi_req->super);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -701,7 +701,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
|
|||||||
struct fi_msg_tagged msg;
|
struct fi_msg_tagged msg;
|
||||||
int ompi_ret;
|
int ompi_ret;
|
||||||
ssize_t ret;
|
ssize_t ret;
|
||||||
uint64_t msgflags = FI_CLAIM;
|
uint64_t msgflags = FI_CLAIM | FI_COMPLETION;
|
||||||
|
|
||||||
ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
|
ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
|
||||||
&start,
|
&start,
|
||||||
@ -791,7 +791,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
|
|||||||
uint64_t match_bits, mask_bits;
|
uint64_t match_bits, mask_bits;
|
||||||
ssize_t ret;
|
ssize_t ret;
|
||||||
struct fi_msg_tagged msg;
|
struct fi_msg_tagged msg;
|
||||||
uint64_t msgflags = FI_PEEK;
|
uint64_t msgflags = FI_PEEK | FI_COMPLETION;
|
||||||
|
|
||||||
if (ompi_mtl_ofi.fi_cq_data) {
|
if (ompi_mtl_ofi.fi_cq_data) {
|
||||||
/* If the source is known, use its peer_fiaddr. */
|
/* If the source is known, use its peer_fiaddr. */
|
||||||
@ -877,7 +877,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
|
|||||||
uint64_t match_bits, mask_bits;
|
uint64_t match_bits, mask_bits;
|
||||||
ssize_t ret;
|
ssize_t ret;
|
||||||
struct fi_msg_tagged msg;
|
struct fi_msg_tagged msg;
|
||||||
uint64_t msgflags = FI_PEEK | FI_CLAIM;
|
uint64_t msgflags = FI_PEEK | FI_CLAIM | FI_COMPLETION;
|
||||||
|
|
||||||
ofi_req = malloc(sizeof *ofi_req);
|
ofi_req = malloc(sizeof *ofi_req);
|
||||||
if (NULL == ofi_req) {
|
if (NULL == ofi_req) {
|
||||||
|
@ -467,6 +467,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
hints->caps = FI_TAGGED; /* Tag matching interface */
|
hints->caps = FI_TAGGED; /* Tag matching interface */
|
||||||
hints->tx_attr->msg_order = FI_ORDER_SAS;
|
hints->tx_attr->msg_order = FI_ORDER_SAS;
|
||||||
hints->rx_attr->msg_order = FI_ORDER_SAS;
|
hints->rx_attr->msg_order = FI_ORDER_SAS;
|
||||||
|
hints->rx_attr->op_flags = FI_COMPLETION;
|
||||||
|
hints->tx_attr->op_flags = FI_COMPLETION;
|
||||||
|
|
||||||
hints->domain_attr->threading = FI_THREAD_UNSPEC;
|
hints->domain_attr->threading = FI_THREAD_UNSPEC;
|
||||||
|
|
||||||
@ -691,7 +693,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
*/
|
*/
|
||||||
ret = fi_ep_bind(ompi_mtl_ofi.ep,
|
ret = fi_ep_bind(ompi_mtl_ofi.ep,
|
||||||
(fid_t)ompi_mtl_ofi.cq,
|
(fid_t)ompi_mtl_ofi.cq,
|
||||||
FI_SEND | FI_RECV);
|
FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
|
||||||
if (0 != ret) {
|
if (0 != ret) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||||
"%s:%d: fi_bind CQ-EP failed: %s\n",
|
"%s:%d: fi_bind CQ-EP failed: %s\n",
|
||||||
|
@ -93,14 +93,15 @@ typedef struct mca_mtl_ofi_component_t {
|
|||||||
* | |
|
* | |
|
||||||
* context_id |prot| message tag
|
* context_id |prot| message tag
|
||||||
*/
|
*/
|
||||||
#define MTL_OFI_PROTO_BIT_COUNT (4)
|
#define MTL_OFI_PROTO_BIT_COUNT (2)
|
||||||
|
|
||||||
#define MTL_OFI_CID_BIT_COUNT_DATA (28)
|
#define MTL_OFI_CID_MASK_DATA (0xFFFFFFFC00000000ULL)
|
||||||
|
#define MTL_OFI_CID_BIT_COUNT_DATA (30)
|
||||||
#define MTL_OFI_TAG_MASK_DATA (0x00000000FFFFFFFFULL)
|
#define MTL_OFI_TAG_MASK_DATA (0x00000000FFFFFFFFULL)
|
||||||
#define MTL_OFI_TAG_BIT_COUNT_DATA (32)
|
#define MTL_OFI_TAG_BIT_COUNT_DATA (32)
|
||||||
#define MTL_OFI_PROTO_MASK_DATA (0x0000000F00000000ULL)
|
#define MTL_OFI_PROTO_MASK_DATA (0x0000000300000000ULL)
|
||||||
#define MTL_OFI_SYNC_SEND_DATA (0x0000000100000000ULL)
|
#define MTL_OFI_SYNC_SEND_DATA (0x0000000100000000ULL)
|
||||||
#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000900000000ULL)
|
#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000200000000ULL)
|
||||||
|
|
||||||
/* Send tag with CQ_DATA */
|
/* Send tag with CQ_DATA */
|
||||||
__opal_attribute_always_inline__ static inline uint64_t
|
__opal_attribute_always_inline__ static inline uint64_t
|
||||||
@ -142,14 +143,14 @@ mtl_ofi_create_recv_tag_CQD(uint64_t *match_bits, uint64_t *mask_bits,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#define MTL_OFI_CID_BIT_COUNT_1 (12)
|
#define MTL_OFI_CID_BIT_COUNT_1 (12)
|
||||||
#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFF000000000ULL)
|
#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFFC00000000ULL)
|
||||||
#define MTL_OFI_SOURCE_BIT_COUNT_1 (16)
|
#define MTL_OFI_SOURCE_BIT_COUNT_1 (18)
|
||||||
#define MTL_OFI_SOURCE_MASK_1 (0x000000000000FFFFULL)
|
#define MTL_OFI_SOURCE_MASK_1 (0x000000000003FFFFULL)
|
||||||
#define MTL_OFI_TAG_MASK_1 (0x00000000FFFFFFFFULL)
|
#define MTL_OFI_TAG_MASK_1 (0x00000000FFFFFFFFULL)
|
||||||
#define MTL_OFI_TAG_BIT_COUNT_1 (32)
|
#define MTL_OFI_TAG_BIT_COUNT_1 (32)
|
||||||
#define MTL_OFI_PROTO_MASK_1 (0x0000000F00000000ULL)
|
#define MTL_OFI_PROTO_MASK_1 (0x0000000300000000ULL)
|
||||||
#define MTL_OFI_SYNC_SEND_1 (0x0000000100000000ULL)
|
#define MTL_OFI_SYNC_SEND_1 (0x0000000100000000ULL)
|
||||||
#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000900000000ULL)
|
#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000200000000ULL)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
|
* ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
|
||||||
@ -160,14 +161,14 @@ mtl_ofi_create_recv_tag_CQD(uint64_t *match_bits, uint64_t *mask_bits,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#define MTL_OFI_CID_BIT_COUNT_2 (24)
|
#define MTL_OFI_CID_BIT_COUNT_2 (24)
|
||||||
#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFF000000ULL)
|
#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFFC00000ULL)
|
||||||
#define MTL_OFI_SOURCE_BIT_COUNT_2 (16)
|
#define MTL_OFI_SOURCE_BIT_COUNT_2 (18)
|
||||||
#define MTL_OFI_SOURCE_MASK_2 (0x000000000000FFFFULL)
|
#define MTL_OFI_SOURCE_MASK_2 (0x000000000003FFFFULL)
|
||||||
#define MTL_OFI_TAG_MASK_2 (0x00000000000FFFFFULL)
|
#define MTL_OFI_TAG_MASK_2 (0x00000000000FFFFFULL)
|
||||||
#define MTL_OFI_TAG_BIT_COUNT_2 (20)
|
#define MTL_OFI_TAG_BIT_COUNT_2 (20)
|
||||||
#define MTL_OFI_PROTO_MASK_2 (0x0000000000F00000ULL)
|
#define MTL_OFI_PROTO_MASK_2 (0x0000000000300000ULL)
|
||||||
#define MTL_OFI_SYNC_SEND_2 (0x0000000000100000ULL)
|
#define MTL_OFI_SYNC_SEND_2 (0x0000000000100000ULL)
|
||||||
#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000900000ULL)
|
#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000200000ULL)
|
||||||
|
|
||||||
/* Send tag */
|
/* Send tag */
|
||||||
__opal_attribute_always_inline__ static inline uint64_t
|
__opal_attribute_always_inline__ static inline uint64_t
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user