1
1

Merge pull request #5397 from nrspruit/ns_ofi_mtl_ssend

MTL OFI: Redesign sync send with reduced tag bits and quick ack
Этот коммит содержится в:
Matias Cabral 2018-07-17 10:14:33 -07:00 коммит произвёл GitHub
родитель 9aa5168795 9a17864278
Коммит be3cb01cb4
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 72 добавлений и 69 удалений

Просмотреть файл

@ -24,8 +24,8 @@ CQ.
OFI TAG: OFI TAG:
MPI needs to send 96 bits of information per message (32 bits communicator id, MPI needs to send 96 bits of information per message (32 bits communicator id,
32 bits source rank, 32 bits MPI tag) but OFI only offers 64 bits tags. In 32 bits source rank, 32 bits MPI tag) but OFI only offers 64 bits tags. In
addition, the OFI MTL uses 4 bits of the OFI tag for the synchronous send protocol. addition, the OFI MTL uses 2 bits of the OFI tag for the synchronous send protocol.
Therefore, there are only 60 bits available in the OFI tag for message usage. The Therefore, there are only 62 bits available in the OFI tag for message usage. The
OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to address this: OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to address this:
"auto" (Default): "auto" (Default):
@ -36,19 +36,19 @@ fall back to "ofi_tag_1".
"ofi_tag_1": "ofi_tag_1":
For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 60 trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 62
bits available in the OFI tag. There are two options available with different bits available in the OFI tag. There are two options available with different
number of bits for the Communicator ID and MPI tag fields. This tag distribution number of bits for the Communicator ID and MPI tag fields. This tag distribution
offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to
provider reserved bits (see mem_tag_format below), 16 bits for Source Rank (max provider reserved bits (see mem_tag_format below), 18 bits for Source Rank (max
Source Rank 65,535), 32 bits for MPI tag (max MPI tag is INT_MAX). Source Rank 262,143), 32 bits for MPI tag (max MPI tag is INT_MAX).
"ofi_tag_2": "ofi_tag_2":
Same as "ofi_tag_1" but offering a different OFI tag distribution for Same as "ofi_tag_1" but offering a different OFI tag distribution for
applications that may require a greater number of supported Communicators at the applications that may require a greater number of supported Communicators at the
expense of fewer MPI tag bits. This tag distribution offers: 24 bits for expense of fewer MPI tag bits. This tag distribution offers: 24 bits for
Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 16 Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 18
bits for Source Rank (max Source Rank 65,535), 20 bits for MPI tag (max MPI tag bits for Source Rank (max Source Rank 262,143), 20 bits for MPI tag (max MPI tag
524,287). 524,287).
"ofi_tag_full": "ofi_tag_full":

Просмотреть файл

@ -274,8 +274,6 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
ofi_req->completion_count = 2; ofi_req->completion_count = 2;
MTL_OFI_SET_SYNC_SEND(match_bits);
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep, MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep,
NULL, NULL,
0, 0,
@ -291,6 +289,8 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
free(ack_req); free(ack_req);
return ompi_mtl_ofi_get_error(ret); return ompi_mtl_ofi_get_error(ret);
} }
/* The SYNC_SEND tag bit is set for the send operation only.*/
MTL_OFI_SET_SYNC_SEND(match_bits);
} else { } else {
ofi_req->completion_count = 1; ofi_req->completion_count = 1;
} }
@ -423,20 +423,6 @@ ompi_mtl_ofi_isend(struct mca_mtl_base_module_t *mtl,
return ret; return ret;
} }
/**
* Called when a completion for SYNC ACK send is received.
* This completes the synchronous recv operation. Thus, we
* call the upper layer's completion function.
*/
__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_sync_recv_callback(struct fi_cq_tagged_entry *wc,
ompi_mtl_ofi_request_t *ofi_req)
{
ofi_req->super.completion_callback(&ofi_req->super);
return OMPI_SUCCESS;
}
/** /**
* Called when a completion for a posted recv is received. * Called when a completion for a posted recv is received.
*/ */
@ -450,6 +436,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
mca_mtl_ofi_endpoint_t *endpoint = NULL; mca_mtl_ofi_endpoint_t *endpoint = NULL;
int src = mtl_ofi_get_source(wc); int src = mtl_ofi_get_source(wc);
ompi_status_public_t *status = NULL; ompi_status_public_t *status = NULL;
struct fi_msg_tagged tagged_msg;
assert(ofi_req->super.ompi_req); assert(ofi_req->super.ompi_req);
status = &ofi_req->super.ompi_req->req_status; status = &ofi_req->super.ompi_req->req_status;
@ -487,21 +474,25 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
} }
/** /**
* We do not want any SYNC_SEND_ACK here! * We can only accept MTL_OFI_SYNC_SEND in the standard recv callback.
* See mtl_ofi_send.c for details. * MTL_OFI_SYNC_SEND_ACK should only be received in the send_ack
* callback.
*/ */
assert(!MTL_OFI_IS_SYNC_SEND_ACK(wc->tag)); assert(!MTL_OFI_IS_SYNC_SEND_ACK(wc->tag));
/** /**
* If this recv is part of an MPI_Ssend operation, then we send an * If this recv is part of an MPI_Ssend operation, then we send an
* acknowledgment back to the sender. The fi_context can be * acknowledgment back to the sender.
* re-used safely because the previous operation has completed. * The ack message is sent without generating a completion event in
* This recv request will complete once we get a completion for * the completion queue by not setting FI_COMPLETION in the flags to
* this send. See ompi_mtl_ofi_sync_recv_callback(). * fi_tsendmsg(FI_SELECTIVE_COMPLETION).
* Otherwise, this request is now complete. * This is done since the 0 byte message requires no
* notification on the send side for a successful completion.
* If a failure occurs the provider will notify the error
* in the cq_readerr during OFI progress. Once the message has been
* successfully processed the request is marked as completed.
*/ */
if (OPAL_UNLIKELY(MTL_OFI_IS_SYNC_SEND(wc->tag))) { if (OPAL_UNLIKELY(MTL_OFI_IS_SYNC_SEND(wc->tag))) {
ofi_req->event_callback = ompi_mtl_ofi_sync_recv_callback;
/** /**
* If the recv request was posted for any source, * If the recv request was posted for any source,
* we need to extract the source's actual address. * we need to extract the source's actual address.
@ -511,23 +502,32 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc); endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc);
ofi_req->remote_addr = endpoint->peer_fiaddr; ofi_req->remote_addr = endpoint->peer_fiaddr;
} }
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
NULL, tagged_msg.msg_iov = NULL;
0, tagged_msg.desc = NULL;
NULL, tagged_msg.iov_count = 0;
ofi_req->remote_addr, tagged_msg.addr = ofi_req->remote_addr;
wc->tag | ompi_mtl_ofi.sync_send_ack, /**
(void *) &ofi_req->ctx)); * We must continue to use the user's original tag but remove the
* sync_send protocol tag bit and instead apply the sync_send_ack
* tag bit to complete the initiator's sync send receive.
*/
tagged_msg.tag = (wc->tag | ompi_mtl_ofi.sync_send_ack) & ~ompi_mtl_ofi.sync_send;
tagged_msg.context = NULL;
tagged_msg.data = 0;
MTL_OFI_RETRY_UNTIL_DONE(fi_tsendmsg(ompi_mtl_ofi.ep,
&tagged_msg, 0));
if (OPAL_UNLIKELY(0 > ret)) { if (OPAL_UNLIKELY(0 > ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_tsend failed: %s(%zd)", "%s:%d: fi_tsendmsg failed: %s(%zd)",
__FILE__, __LINE__, fi_strerror(-ret), ret); __FILE__, __LINE__, fi_strerror(-ret), ret);
status->MPI_ERROR = OMPI_ERROR; status->MPI_ERROR = OMPI_ERROR;
} }
} else {
ofi_req->super.completion_callback(&ofi_req->super);
} }
ofi_req->super.completion_callback(&ofi_req->super);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -701,7 +701,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
struct fi_msg_tagged msg; struct fi_msg_tagged msg;
int ompi_ret; int ompi_ret;
ssize_t ret; ssize_t ret;
uint64_t msgflags = FI_CLAIM; uint64_t msgflags = FI_CLAIM | FI_COMPLETION;
ompi_ret = ompi_mtl_datatype_recv_buf(convertor, ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
&start, &start,
@ -791,7 +791,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
uint64_t match_bits, mask_bits; uint64_t match_bits, mask_bits;
ssize_t ret; ssize_t ret;
struct fi_msg_tagged msg; struct fi_msg_tagged msg;
uint64_t msgflags = FI_PEEK; uint64_t msgflags = FI_PEEK | FI_COMPLETION;
if (ompi_mtl_ofi.fi_cq_data) { if (ompi_mtl_ofi.fi_cq_data) {
/* If the source is known, use its peer_fiaddr. */ /* If the source is known, use its peer_fiaddr. */
@ -877,7 +877,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
uint64_t match_bits, mask_bits; uint64_t match_bits, mask_bits;
ssize_t ret; ssize_t ret;
struct fi_msg_tagged msg; struct fi_msg_tagged msg;
uint64_t msgflags = FI_PEEK | FI_CLAIM; uint64_t msgflags = FI_PEEK | FI_CLAIM | FI_COMPLETION;
ofi_req = malloc(sizeof *ofi_req); ofi_req = malloc(sizeof *ofi_req);
if (NULL == ofi_req) { if (NULL == ofi_req) {

Просмотреть файл

@ -467,6 +467,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
hints->caps = FI_TAGGED; /* Tag matching interface */ hints->caps = FI_TAGGED; /* Tag matching interface */
hints->tx_attr->msg_order = FI_ORDER_SAS; hints->tx_attr->msg_order = FI_ORDER_SAS;
hints->rx_attr->msg_order = FI_ORDER_SAS; hints->rx_attr->msg_order = FI_ORDER_SAS;
hints->rx_attr->op_flags = FI_COMPLETION;
hints->tx_attr->op_flags = FI_COMPLETION;
hints->domain_attr->threading = FI_THREAD_UNSPEC; hints->domain_attr->threading = FI_THREAD_UNSPEC;
@ -691,7 +693,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
*/ */
ret = fi_ep_bind(ompi_mtl_ofi.ep, ret = fi_ep_bind(ompi_mtl_ofi.ep,
(fid_t)ompi_mtl_ofi.cq, (fid_t)ompi_mtl_ofi.cq,
FI_SEND | FI_RECV); FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
if (0 != ret) { if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_bind CQ-EP failed: %s\n", "%s:%d: fi_bind CQ-EP failed: %s\n",

Просмотреть файл

@ -89,18 +89,19 @@ typedef struct mca_mtl_ofi_component_t {
*/ */
/* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum) /* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum)
* 01234567 01234567 01234567 0123 4567 01234567 01234567 01234567 01234567 * 01234567 01234567 01234567 012345 67 01234567 01234567 01234567 01234567
* | | * | |
* context_id |prot| message tag * context_id |prot| message tag
*/ */
#define MTL_OFI_PROTO_BIT_COUNT (4) #define MTL_OFI_PROTO_BIT_COUNT (2)
#define MTL_OFI_CID_BIT_COUNT_DATA (28) #define MTL_OFI_CID_MASK_DATA (0xFFFFFFFC00000000ULL)
#define MTL_OFI_CID_BIT_COUNT_DATA (30)
#define MTL_OFI_TAG_MASK_DATA (0x00000000FFFFFFFFULL) #define MTL_OFI_TAG_MASK_DATA (0x00000000FFFFFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_DATA (32) #define MTL_OFI_TAG_BIT_COUNT_DATA (32)
#define MTL_OFI_PROTO_MASK_DATA (0x0000000F00000000ULL) #define MTL_OFI_PROTO_MASK_DATA (0x0000000300000000ULL)
#define MTL_OFI_SYNC_SEND_DATA (0x0000000100000000ULL) #define MTL_OFI_SYNC_SEND_DATA (0x0000000100000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000900000000ULL) #define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000200000000ULL)
/* Send tag with CQ_DATA */ /* Send tag with CQ_DATA */
__opal_attribute_always_inline__ static inline uint64_t __opal_attribute_always_inline__ static inline uint64_t
@ -136,38 +137,38 @@ mtl_ofi_create_recv_tag_CQD(uint64_t *match_bits, uint64_t *mask_bits,
/* /*
* ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported * ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported
* *
* 01234567 0123 4567 01234567 0123 4567 01234567 01234567 01234567 01234567 * 01234567 0123 4567 01234567 012345 67 01234567 01234567 01234567 01234567
* | | | * | | |
* Comm id | source |prot| message tag * Comm id | source |prot| message tag
*/ */
#define MTL_OFI_CID_BIT_COUNT_1 (12) #define MTL_OFI_CID_BIT_COUNT_1 (12)
#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFF000000000ULL) #define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFFC00000000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_1 (16) #define MTL_OFI_SOURCE_BIT_COUNT_1 (18)
#define MTL_OFI_SOURCE_MASK_1 (0x000000000000FFFFULL) #define MTL_OFI_SOURCE_MASK_1 (0x000000000003FFFFULL)
#define MTL_OFI_TAG_MASK_1 (0x00000000FFFFFFFFULL) #define MTL_OFI_TAG_MASK_1 (0x00000000FFFFFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_1 (32) #define MTL_OFI_TAG_BIT_COUNT_1 (32)
#define MTL_OFI_PROTO_MASK_1 (0x0000000F00000000ULL) #define MTL_OFI_PROTO_MASK_1 (0x0000000300000000ULL)
#define MTL_OFI_SYNC_SEND_1 (0x0000000100000000ULL) #define MTL_OFI_SYNC_SEND_1 (0x0000000100000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000900000000ULL) #define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000200000000ULL)
/* /*
* ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported * ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
* *
* 01234567 01234567 01234567 01234567 01234567 0123 4567 01234567 01234567 * 01234567 01234567 01234567 01234567 01234567 01 23 4567 01234567 01234567
* | | | * | | |
* Comm id | source |prot| message tag * Comm id | source |prot| message tag
*/ */
#define MTL_OFI_CID_BIT_COUNT_2 (24) #define MTL_OFI_CID_BIT_COUNT_2 (24)
#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFF000000ULL) #define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFFC00000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_2 (16) #define MTL_OFI_SOURCE_BIT_COUNT_2 (18)
#define MTL_OFI_SOURCE_MASK_2 (0x000000000000FFFFULL) #define MTL_OFI_SOURCE_MASK_2 (0x000000000003FFFFULL)
#define MTL_OFI_TAG_MASK_2 (0x00000000000FFFFFULL) #define MTL_OFI_TAG_MASK_2 (0x00000000000FFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_2 (20) #define MTL_OFI_TAG_BIT_COUNT_2 (20)
#define MTL_OFI_PROTO_MASK_2 (0x0000000000F00000ULL) #define MTL_OFI_PROTO_MASK_2 (0x0000000000300000ULL)
#define MTL_OFI_SYNC_SEND_2 (0x0000000000100000ULL) #define MTL_OFI_SYNC_SEND_2 (0x0000000000100000ULL)
#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000900000ULL) #define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000200000ULL)
/* Send tag */ /* Send tag */
__opal_attribute_always_inline__ static inline uint64_t __opal_attribute_always_inline__ static inline uint64_t