From 285fc42b4ea9c16c8e7b9caa1c5def453f80e557 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 30 Oct 2017 12:13:44 -0700 Subject: [PATCH] Fix OFI MTL to recognize correct CQ empty scenario Currently, the progress function is incorrectly interpreting any error value other than a positive value or -FI_EAVAIL to mean CQ is empty. CQ is empty only if fi_cq_read() call returned -EAGAIN error code. Fix that here. While at it, fix help text output for calls made to OFI API. Signed-off-by: Aravind Gopalakrishnan --- ompi/mca/mtl/ofi/help-mtl-ofi.txt | 8 ++- ompi/mca/mtl/ofi/mtl_ofi.h | 44 ++++++++++------ ompi/mca/mtl/ofi/mtl_ofi_component.c | 79 +++++++++++++++------------- 3 files changed, 77 insertions(+), 54 deletions(-) diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt index 2338d548f0..97c355123c 100644 --- a/ompi/mca/mtl/ofi/help-mtl-ofi.txt +++ b/ompi/mca/mtl/ofi/help-mtl-ofi.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2013-2015 Intel, Inc. All rights reserved +# Copyright (c) 2013-2017 Intel, Inc. All rights reserved # # $COPYRIGHT$ # @@ -8,3 +8,9 @@ # # $HEADER$ # +[OFI call fail] +Open MPI failed an OFI Libfabric library call (%s).This is highly unusual; +your job may behave unpredictably (and/or abort) after this. + Local host: %s + Location: %s:%d + Error: %s (%zd) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 1128aca3d2..0ee125c796 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved * * $COPYRIGHT$ * @@ -14,6 +14,7 @@ #include "ompi/mca/mtl/mtl.h" #include "ompi/mca/mtl/base/base.h" #include "opal/datatype/opal_convertor.h" +#include "opal/util/show_help.h" #include #include @@ -79,13 +80,14 @@ ompi_mtl_ofi_progress(void) assert(ofi_req); ret = ofi_req->event_callback(&wc, ofi_req); if (OMPI_SUCCESS != ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned by request event callback: %zd", - ret); - abort(); + opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, ret); + fflush(stderr); + exit(1); } } - } else if (ret == -FI_EAVAIL) { + } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { /** * An error occured and is being reported via the CQ. * Read the error and forward it to the upper layer. @@ -94,9 +96,11 @@ ompi_mtl_ofi_progress(void) &error, 0); if (0 > ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned from fi_cq_readerr: %zd", ret); - abort(); + opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, fi_strerror(-ret), ret); + fflush(stderr); + exit(1); } assert(error.op_context); @@ -104,16 +108,22 @@ ompi_mtl_ofi_progress(void) assert(ofi_req); ret = ofi_req->error_callback(&error, ofi_req); if (OMPI_SUCCESS != ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned by request error callback: %zd", - ret); - abort(); + opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, ret); + fflush(stderr); + exit(1); } } else { - /** - * The CQ is empty. Return. - */ - break; + if (ret == -FI_EAGAIN) { + break; + } else { + opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, fi_strerror(-ret), ret); + fflush(stderr); + exit(1); + } } } return count; diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 6b7058434d..2dd0090ad1 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved * * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights @@ -14,6 +14,7 @@ #include "mtl_ofi.h" #include "opal/util/argv.h" +#include "opal/util/show_help.h" static int ompi_mtl_ofi_component_open(void); static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority); @@ -364,9 +365,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, hints, /* In: Hints to filter providers */ &providers); /* Out: List of matching providers */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_getinfo failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_getinfo", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); goto error; } @@ -392,9 +394,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.fabric, /* Out: Fabric handle */ NULL); /* Optional context for fabric events */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_fabric failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_fabric", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); goto error; } @@ -408,9 +411,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.domain, /* Out: Domain oject */ NULL); /* Optional context for domain events */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_domain failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_domain", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); goto error; } @@ -426,9 +430,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.ep, /* Out: Endpoint object */ NULL); /* Optional context */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_endpoint failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_endpoint", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); goto error; } @@ -581,38 +586,40 @@ error: int ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) { + ssize_t ret; + opal_progress_unregister(ompi_mtl_ofi_progress_no_inline); - /** - * * Close all the OFI objects - * */ - if (fi_close((fid_t)ompi_mtl_ofi.ep)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); + /* Close all the OFI objects */ + if (ret = fi_close((fid_t)ompi_mtl_ofi.ep)) { + goto finalize_err; } - if (fi_close((fid_t)ompi_mtl_ofi.cq)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); + + if (ret = fi_close((fid_t)ompi_mtl_ofi.cq)) { + goto finalize_err; } - if (fi_close((fid_t)ompi_mtl_ofi.av)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); + + if (ret = fi_close((fid_t)ompi_mtl_ofi.av)) { + goto finalize_err; } - if (fi_close((fid_t)ompi_mtl_ofi.domain)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); + + if (ret = fi_close((fid_t)ompi_mtl_ofi.domain)) { + goto finalize_err; } - if (fi_close((fid_t)ompi_mtl_ofi.fabric)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); + + if (ret = fi_close((fid_t)ompi_mtl_ofi.fabric)) { + goto finalize_err; } return OMPI_SUCCESS; + +finalize_err: + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_close", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); + + return OMPI_ERROR; }