From a7dcfd98742d7f44d96d74a60a38529894d4099c Mon Sep 17 00:00:00 2001 From: William Zhang Date: Tue, 28 Jul 2020 09:24:36 -0700 Subject: [PATCH] btl/ofi: Disable EFA provider in libfabric versions earlier than 1.12.0 The EFA provider incorrectly implements FI_DELIVERY_COMPLETE in libfabric versions earlier than 1.12.0. Although the provider advertises FI_DELIVERY_COMPLETE, completions are returned too early because bounce buffers on the receive side are not accounted for. This causes the BTL to receive early completions, which leads to correctness issues. This is not an issue in the mtl/ofi as it does not require FI_DELIVERY_COMPLETE. Signed-off-by: William Zhang --- opal/mca/btl/ofi/btl_ofi_component.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index c4efa5e167..ee5809f17a 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -59,6 +59,17 @@ static int validate_info(struct fi_info *info, uint64_t required_caps) BTL_VERBOSE(("validating device: %s", info->domain_attr->name)); + /* EFA does not fulfill FI_DELIVERY_COMPLETE requirements in prior libfabric + * versions. The prov version is set as: + * FI_VERSION(FI_MAJOR_VERSION * 100 + FI_MINOR_VERSION, FI_REVISION_VERSION * 10) + * Thus, FI_VERSION(112,0) corresponds to libfabric 1.12.0 + */ + if (!strncasecmp(info->fabric_attr->prov_name, "efa", 3) + && FI_VERSION_LT(info->fabric_attr->prov_version, FI_VERSION(112,0))) { + BTL_VERBOSE(("unsupported libfabric efa version")); + return OPAL_ERROR; + } + /* we need exactly all the required bits */ if ((info->caps & required_caps) != required_caps) { BTL_VERBOSE(("unsupported caps"));