From a7dcfd98742d7f44d96d74a60a38529894d4099c Mon Sep 17 00:00:00 2001 From: William Zhang Date: Tue, 28 Jul 2020 09:24:36 -0700 Subject: [PATCH 1/2] btl/ofi: Disable EFA provider in versions earlier than libfabric 1.12.0 EFA incorrectly implements FI_DELIVERY_COMPLETE in earlier libfabric versions. While FI_DELIVERY_COMPLETE would be advertised by the provider, completions would return too early by not accounting for bounce buffers on the receive side. This would cause the BTL to receive early completions that lead to correctness issues. This is not an issue in the mtl/ofi as it does not require FI_DELIVERY_COMPLETE. Signed-off-by: William Zhang --- opal/mca/btl/ofi/btl_ofi_component.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index c4efa5e167..ee5809f17a 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -59,6 +59,17 @@ static int validate_info(struct fi_info *info, uint64_t required_caps) BTL_VERBOSE(("validating device: %s", info->domain_attr->name)); + /* EFA does not fulfill FI_DELIVERY_COMPLETE requirements in prior libfabric + * versions. The prov version is set as: + * FI_VERSION(FI_MAJOR_VERSION * 100 + FI_MINOR_VERSION, FI_REVISION_VERSION * 10) + * Thus, FI_VERSION(112,0) corresponds to libfabric 1.12.0 + */ + if (!strncasecmp(info->fabric_attr->prov_name, "efa", 3) + && FI_VERSION_LT(info->fabric_attr->prov_version, FI_VERSION(112,0))) { + BTL_VERBOSE(("unsupported libfabric efa version")); + return OPAL_ERROR; + } + /* we need exactly all the required bits */ if ((info->caps & required_caps) != required_caps) { BTL_VERBOSE(("unsupported caps")); From 41acfee2bbfc5495aeeeae4b72f385ca8d1d8cee Mon Sep 17 00:00:00 2001 From: William Zhang Date: Tue, 11 Aug 2020 13:59:26 -0700 Subject: [PATCH 2/2] btl/ofi: Disable ofi_rxm provider The ofi_rxm provider is dependent upon the underlying hardware for its implementation of FI_DELIVERY_COMPLETE. Since this can lead to early completions, we disable the provider to avoid correctness issues. This is not an issue in the mtl/ofi as it does not require FI_DELIVERY_COMPLETE. Signed-off-by: William Zhang --- opal/mca/btl/ofi/btl_ofi_component.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index ee5809f17a..37263b35cb 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -70,6 +70,14 @@ static int validate_info(struct fi_info *info, uint64_t required_caps) return OPAL_ERROR; } + /* ofi_rxm does not fulfill FI_DELIVERY_COMPLETE requirements. Thus we + * exclude it if it's detected. + */ + if (strstr(info->fabric_attr->prov_name, "ofi_rxm")) { + BTL_VERBOSE(("ofi_rxm does not support FI_DELIVERY_COMPLETE")); + return OPAL_ERROR; + } + /* we need exactly all the required bits */ if ((info->caps & required_caps) != required_caps) { BTL_VERBOSE(("unsupported caps"));