1
1

Merge pull request #4304 from aravindksg/master

Fix OFI MTL to recognize correct CQ empty scenario and improve error reporting
Этот коммит содержится в:
Matias Cabral 2017-11-02 11:56:57 -07:00 коммит произвёл GitHub
родитель e7990e7e75 285fc42b4e
Коммит c8aa22ee22
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 77 добавлений и 54 удалений

Просмотреть файл

@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
@ -8,3 +8,9 @@
#
# $HEADER$
#
[OFI call fail]
Open MPI failed an OFI Libfabric library call (%s). This is highly unusual;
your job may behave unpredictably (and/or abort) after this.
Local host: %s
Location: %s:%d
Error: %s (%zd)

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
@ -14,6 +14,7 @@
#include "ompi/mca/mtl/mtl.h"
#include "ompi/mca/mtl/base/base.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/util/show_help.h"
#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
@ -79,13 +80,14 @@ ompi_mtl_ofi_progress(void)
assert(ofi_req);
ret = ofi_req->event_callback(&wc, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request event callback: %zd",
ret);
abort();
opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, ret);
fflush(stderr);
exit(1);
}
}
} else if (ret == -FI_EAVAIL) {
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
/**
* An error occurred and is being reported via the CQ.
* Read the error and forward it to the upper layer.
@ -94,9 +96,11 @@ ompi_mtl_ofi_progress(void)
&error,
0);
if (0 > ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned from fi_cq_readerr: %zd", ret);
abort();
opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, fi_strerror(-ret), ret);
fflush(stderr);
exit(1);
}
assert(error.op_context);
@ -104,16 +108,22 @@ ompi_mtl_ofi_progress(void)
assert(ofi_req);
ret = ofi_req->error_callback(&error, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request error callback: %zd",
ret);
abort();
opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, ret);
fflush(stderr);
exit(1);
}
} else {
/**
* The CQ is empty. Return.
*/
break;
if (ret == -FI_EAGAIN) {
break;
} else {
opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, fi_strerror(-ret), ret);
fflush(stderr);
exit(1);
}
}
}
return count;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
*
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
@ -14,6 +14,7 @@
#include "mtl_ofi.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
static int ompi_mtl_ofi_component_open(void);
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
@ -364,9 +365,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
hints, /* In: Hints to filter providers */
&providers); /* Out: List of matching providers */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getinfo failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_getinfo",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error;
}
@ -392,9 +394,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */
NULL); /* Optional context for fabric events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_fabric failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_fabric",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error;
}
@ -408,9 +411,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.domain, /* Out: Domain object */
NULL); /* Optional context for domain events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_domain failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_domain",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error;
}
@ -426,9 +430,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
NULL); /* Optional context */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_endpoint failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_endpoint",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error;
}
@ -581,38 +586,40 @@ error:
/*
 * Finalize the OFI MTL: unregister the progress function, then close the
 * ep, cq, av, domain and fabric handles held in ompi_mtl_ofi, in that order.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers, so the
 * pre-change lines (opal_output followed by abort() on each fi_close
 * failure) and the post-change lines (ret = fi_close(...) followed by
 * goto finalize_err) appear interleaved. Read each pair accordingly.
 *
 * Post-change behavior as visible here: the first fi_close that returns
 * non-zero jumps to finalize_err, which reports the failure through
 * opal_show_help ("OFI call fail" topic) and returns OMPI_ERROR. The
 * handles after the failing one are not closed on that path. When every
 * close returns zero the function returns OMPI_SUCCESS.
 *
 * The mtl parameter is not referenced in the visible body.
 */
int
ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
{
/* Holds the fi_close return code so finalize_err can report it. */
ssize_t ret;
opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
/**
* * Close all the OFI objects
* */
if (fi_close((fid_t)ompi_mtl_ofi.ep)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
/* Close all the OFI objects */
/* The assignment inside each condition is intentional: ret keeps the code for the error report. */
if (ret = fi_close((fid_t)ompi_mtl_ofi.ep)) {
goto finalize_err;
}
if (fi_close((fid_t)ompi_mtl_ofi.cq)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
if (ret = fi_close((fid_t)ompi_mtl_ofi.cq)) {
goto finalize_err;
}
if (fi_close((fid_t)ompi_mtl_ofi.av)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
if (ret = fi_close((fid_t)ompi_mtl_ofi.av)) {
goto finalize_err;
}
if (fi_close((fid_t)ompi_mtl_ofi.domain)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
if (ret = fi_close((fid_t)ompi_mtl_ofi.domain)) {
goto finalize_err;
}
if (fi_close((fid_t)ompi_mtl_ofi.fabric)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
if (ret = fi_close((fid_t)ompi_mtl_ofi.fabric)) {
goto finalize_err;
}
return OMPI_SUCCESS;
finalize_err:
/*
 * Shared error exit for every fi_close failure. __LINE__ here expands to
 * the line of this call, not the line of the fi_close that failed.
 * NOTE(review): fi_strerror(-ret) presumes fi_close returns a negative
 * error code; confirm against the libfabric documentation.
 */
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_close",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
return OMPI_ERROR;
}