1
1

Merge pull request #4304 from aravindksg/master

Fix OFI MTL to recognize correct CQ empty scenario and improve error reporting
Этот коммит содержится в:
Matias Cabral 2017-11-02 11:56:57 -07:00 коммит произвёл GitHub
родитель e7990e7e75 285fc42b4e
Коммит c8aa22ee22
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 77 добавлений и 54 удалений

Просмотреть файл

@ -1,6 +1,6 @@
# -*- text -*- # -*- text -*-
# #
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved # Copyright (c) 2013-2017 Intel, Inc. All rights reserved
# #
# $COPYRIGHT$ # $COPYRIGHT$
# #
@ -8,3 +8,9 @@
# #
# $HEADER$ # $HEADER$
# #
[OFI call fail]
Open MPI failed an OFI Libfabric library call (%s). This is highly unusual;
your job may behave unpredictably (and/or abort) after this.
Local host: %s
Location: %s:%d
Error: %s (%zd)

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved * Copyright (c) 2013-2017 Intel, Inc. All rights reserved
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -14,6 +14,7 @@
#include "ompi/mca/mtl/mtl.h" #include "ompi/mca/mtl/mtl.h"
#include "ompi/mca/mtl/base/base.h" #include "ompi/mca/mtl/base/base.h"
#include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_convertor.h"
#include "opal/util/show_help.h"
#include <rdma/fabric.h> #include <rdma/fabric.h>
#include <rdma/fi_cm.h> #include <rdma/fi_cm.h>
@ -79,13 +80,14 @@ ompi_mtl_ofi_progress(void)
assert(ofi_req); assert(ofi_req);
ret = ofi_req->event_callback(&wc, ofi_req); ret = ofi_req->event_callback(&wc, ofi_req);
if (OMPI_SUCCESS != ret) { if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output, opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
"Error returned by request event callback: %zd", "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
ret); __FILE__, __LINE__, ret);
abort(); fflush(stderr);
exit(1);
} }
} }
} else if (ret == -FI_EAVAIL) { } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
/** /**
* An error occurred and is being reported via the CQ. * An error occurred and is being reported via the CQ.
* Read the error and forward it to the upper layer. * Read the error and forward it to the upper layer.
@ -94,9 +96,11 @@ ompi_mtl_ofi_progress(void)
&error, &error,
0); 0);
if (0 > ret) { if (0 > ret) {
opal_output(ompi_mtl_base_framework.framework_output, opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n"
"Error returned from fi_cq_readerr: %zd", ret); "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
abort(); __FILE__, __LINE__, fi_strerror(-ret), ret);
fflush(stderr);
exit(1);
} }
assert(error.op_context); assert(error.op_context);
@ -104,16 +108,22 @@ ompi_mtl_ofi_progress(void)
assert(ofi_req); assert(ofi_req);
ret = ofi_req->error_callback(&error, ofi_req); ret = ofi_req->error_callback(&error, ofi_req);
if (OMPI_SUCCESS != ret) { if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output, opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
"Error returned by request error callback: %zd", "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
ret); __FILE__, __LINE__, ret);
abort(); fflush(stderr);
exit(1);
} }
} else { } else {
/** if (ret == -FI_EAGAIN) {
* The CQ is empty. Return. break;
*/ } else {
break; opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, fi_strerror(-ret), ret);
fflush(stderr);
exit(1);
}
} }
} }
return count; return count;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved * Copyright (c) 2013-2017 Intel, Inc. All rights reserved
* *
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
@ -14,6 +14,7 @@
#include "mtl_ofi.h" #include "mtl_ofi.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/show_help.h"
static int ompi_mtl_ofi_component_open(void); static int ompi_mtl_ofi_component_open(void);
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority); static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
@ -364,9 +365,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
hints, /* In: Hints to filter providers */ hints, /* In: Hints to filter providers */
&providers); /* Out: List of matching providers */ &providers); /* Out: List of matching providers */
if (0 != ret) { if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"%s:%d: fi_getinfo failed: %s\n", "fi_getinfo",
__FILE__, __LINE__, fi_strerror(-ret)); ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error; goto error;
} }
@ -392,9 +394,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */ &ompi_mtl_ofi.fabric, /* Out: Fabric handle */
NULL); /* Optional context for fabric events */ NULL); /* Optional context for fabric events */
if (0 != ret) { if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"%s:%d: fi_fabric failed: %s\n", "fi_fabric",
__FILE__, __LINE__, fi_strerror(-ret)); ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error; goto error;
} }
@ -408,9 +411,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.domain, /* Out: Domain object */ &ompi_mtl_ofi.domain, /* Out: Domain object */
NULL); /* Optional context for domain events */ NULL); /* Optional context for domain events */
if (0 != ret) { if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"%s:%d: fi_domain failed: %s\n", "fi_domain",
__FILE__, __LINE__, fi_strerror(-ret)); ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error; goto error;
} }
@ -426,9 +430,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.ep, /* Out: Endpoint object */ &ompi_mtl_ofi.ep, /* Out: Endpoint object */
NULL); /* Optional context */ NULL); /* Optional context */
if (0 != ret) { if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"%s:%d: fi_endpoint failed: %s\n", "fi_endpoint",
__FILE__, __LINE__, fi_strerror(-ret)); ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error; goto error;
} }
@ -581,38 +586,40 @@ error:
int int
ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
{ {
ssize_t ret;
opal_progress_unregister(ompi_mtl_ofi_progress_no_inline); opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
/** /* Close all the OFI objects */
* * Close all the OFI objects if (ret = fi_close((fid_t)ompi_mtl_ofi.ep)) {
* */ goto finalize_err;
if (fi_close((fid_t)ompi_mtl_ofi.ep)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
} }
if (fi_close((fid_t)ompi_mtl_ofi.cq)) {
opal_output(ompi_mtl_base_framework.framework_output, if (ret = fi_close((fid_t)ompi_mtl_ofi.cq)) {
"fi_close failed: %s", strerror(errno)); goto finalize_err;
abort();
} }
if (fi_close((fid_t)ompi_mtl_ofi.av)) {
opal_output(ompi_mtl_base_framework.framework_output, if (ret = fi_close((fid_t)ompi_mtl_ofi.av)) {
"fi_close failed: %s", strerror(errno)); goto finalize_err;
abort();
} }
if (fi_close((fid_t)ompi_mtl_ofi.domain)) {
opal_output(ompi_mtl_base_framework.framework_output, if (ret = fi_close((fid_t)ompi_mtl_ofi.domain)) {
"fi_close failed: %s", strerror(errno)); goto finalize_err;
abort();
} }
if (fi_close((fid_t)ompi_mtl_ofi.fabric)) {
opal_output(ompi_mtl_base_framework.framework_output, if (ret = fi_close((fid_t)ompi_mtl_ofi.fabric)) {
"fi_close failed: %s", strerror(errno)); goto finalize_err;
abort();
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
finalize_err:
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_close",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
return OMPI_ERROR;
} }