Merge pull request #4304 from aravindksg/master
Fix OFI MTL to recognize correct CQ empty scenario and improve error reporting
Этот коммит содержится в:
Коммит
c8aa22ee22
@ -1,6 +1,6 @@
|
|||||||
# -*- text -*-
|
# -*- text -*-
|
||||||
#
|
#
|
||||||
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved
|
||||||
#
|
#
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
@ -8,3 +8,9 @@
|
|||||||
#
|
#
|
||||||
# $HEADER$
|
# $HEADER$
|
||||||
#
|
#
|
||||||
|
[OFI call fail]
|
||||||
|
Open MPI failed an OFI Libfabric library call (%s). This is highly unusual;
|
||||||
|
your job may behave unpredictably (and/or abort) after this.
|
||||||
|
Local host: %s
|
||||||
|
Location: %s:%d
|
||||||
|
Error: %s (%zd)
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -14,6 +14,7 @@
|
|||||||
#include "ompi/mca/mtl/mtl.h"
|
#include "ompi/mca/mtl/mtl.h"
|
||||||
#include "ompi/mca/mtl/base/base.h"
|
#include "ompi/mca/mtl/base/base.h"
|
||||||
#include "opal/datatype/opal_convertor.h"
|
#include "opal/datatype/opal_convertor.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
|
||||||
#include <rdma/fabric.h>
|
#include <rdma/fabric.h>
|
||||||
#include <rdma/fi_cm.h>
|
#include <rdma/fi_cm.h>
|
||||||
@ -79,13 +80,14 @@ ompi_mtl_ofi_progress(void)
|
|||||||
assert(ofi_req);
|
assert(ofi_req);
|
||||||
ret = ofi_req->event_callback(&wc, ofi_req);
|
ret = ofi_req->event_callback(&wc, ofi_req);
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
opal_output(ompi_mtl_base_framework.framework_output,
|
opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
|
||||||
"Error returned by request event callback: %zd",
|
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
|
||||||
ret);
|
__FILE__, __LINE__, ret);
|
||||||
abort();
|
fflush(stderr);
|
||||||
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (ret == -FI_EAVAIL) {
|
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
|
||||||
/**
|
/**
|
||||||
* An error occurred and is being reported via the CQ.
|
* An error occurred and is being reported via the CQ.
|
||||||
* Read the error and forward it to the upper layer.
|
* Read the error and forward it to the upper layer.
|
||||||
@ -94,9 +96,11 @@ ompi_mtl_ofi_progress(void)
|
|||||||
&error,
|
&error,
|
||||||
0);
|
0);
|
||||||
if (0 > ret) {
|
if (0 > ret) {
|
||||||
opal_output(ompi_mtl_base_framework.framework_output,
|
opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n"
|
||||||
"Error returned from fi_cq_readerr: %zd", ret);
|
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
|
||||||
abort();
|
__FILE__, __LINE__, fi_strerror(-ret), ret);
|
||||||
|
fflush(stderr);
|
||||||
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(error.op_context);
|
assert(error.op_context);
|
||||||
@ -104,16 +108,22 @@ ompi_mtl_ofi_progress(void)
|
|||||||
assert(ofi_req);
|
assert(ofi_req);
|
||||||
ret = ofi_req->error_callback(&error, ofi_req);
|
ret = ofi_req->error_callback(&error, ofi_req);
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
opal_output(ompi_mtl_base_framework.framework_output,
|
opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
|
||||||
"Error returned by request error callback: %zd",
|
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
|
||||||
ret);
|
__FILE__, __LINE__, ret);
|
||||||
abort();
|
fflush(stderr);
|
||||||
|
exit(1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/**
|
if (ret == -FI_EAGAIN) {
|
||||||
* The CQ is empty. Return.
|
break;
|
||||||
*/
|
} else {
|
||||||
break;
|
opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n"
|
||||||
|
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
|
||||||
|
__FILE__, __LINE__, fi_strerror(-ret), ret);
|
||||||
|
fflush(stderr);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return count;
|
return count;
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
|
||||||
*
|
*
|
||||||
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
||||||
@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
#include "mtl_ofi.h"
|
#include "mtl_ofi.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
|
||||||
static int ompi_mtl_ofi_component_open(void);
|
static int ompi_mtl_ofi_component_open(void);
|
||||||
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
|
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
|
||||||
@ -364,9 +365,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
hints, /* In: Hints to filter providers */
|
hints, /* In: Hints to filter providers */
|
||||||
&providers); /* Out: List of matching providers */
|
&providers); /* Out: List of matching providers */
|
||||||
if (0 != ret) {
|
if (0 != ret) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||||
"%s:%d: fi_getinfo failed: %s\n",
|
"fi_getinfo",
|
||||||
__FILE__, __LINE__, fi_strerror(-ret));
|
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||||
|
fi_strerror(-ret), ret);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -392,9 +394,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */
|
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */
|
||||||
NULL); /* Optional context for fabric events */
|
NULL); /* Optional context for fabric events */
|
||||||
if (0 != ret) {
|
if (0 != ret) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||||
"%s:%d: fi_fabric failed: %s\n",
|
"fi_fabric",
|
||||||
__FILE__, __LINE__, fi_strerror(-ret));
|
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||||
|
fi_strerror(-ret), ret);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -408,9 +411,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
&ompi_mtl_ofi.domain, /* Out: Domain oject */
|
&ompi_mtl_ofi.domain, /* Out: Domain oject */
|
||||||
NULL); /* Optional context for domain events */
|
NULL); /* Optional context for domain events */
|
||||||
if (0 != ret) {
|
if (0 != ret) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||||
"%s:%d: fi_domain failed: %s\n",
|
"fi_domain",
|
||||||
__FILE__, __LINE__, fi_strerror(-ret));
|
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||||
|
fi_strerror(-ret), ret);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -426,9 +430,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
|
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
|
||||||
NULL); /* Optional context */
|
NULL); /* Optional context */
|
||||||
if (0 != ret) {
|
if (0 != ret) {
|
||||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||||
"%s:%d: fi_endpoint failed: %s\n",
|
"fi_endpoint",
|
||||||
__FILE__, __LINE__, fi_strerror(-ret));
|
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||||
|
fi_strerror(-ret), ret);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -581,38 +586,40 @@ error:
|
|||||||
int
|
int
|
||||||
ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
|
ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
|
||||||
{
|
{
|
||||||
|
ssize_t ret;
|
||||||
|
|
||||||
opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
|
opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
|
||||||
|
|
||||||
/**
|
/* Close all the OFI objects */
|
||||||
* * Close all the OFI objects
|
if (ret = fi_close((fid_t)ompi_mtl_ofi.ep)) {
|
||||||
* */
|
goto finalize_err;
|
||||||
if (fi_close((fid_t)ompi_mtl_ofi.ep)) {
|
|
||||||
opal_output(ompi_mtl_base_framework.framework_output,
|
|
||||||
"fi_close failed: %s", strerror(errno));
|
|
||||||
abort();
|
|
||||||
}
|
}
|
||||||
if (fi_close((fid_t)ompi_mtl_ofi.cq)) {
|
|
||||||
opal_output(ompi_mtl_base_framework.framework_output,
|
if (ret = fi_close((fid_t)ompi_mtl_ofi.cq)) {
|
||||||
"fi_close failed: %s", strerror(errno));
|
goto finalize_err;
|
||||||
abort();
|
|
||||||
}
|
}
|
||||||
if (fi_close((fid_t)ompi_mtl_ofi.av)) {
|
|
||||||
opal_output(ompi_mtl_base_framework.framework_output,
|
if (ret = fi_close((fid_t)ompi_mtl_ofi.av)) {
|
||||||
"fi_close failed: %s", strerror(errno));
|
goto finalize_err;
|
||||||
abort();
|
|
||||||
}
|
}
|
||||||
if (fi_close((fid_t)ompi_mtl_ofi.domain)) {
|
|
||||||
opal_output(ompi_mtl_base_framework.framework_output,
|
if (ret = fi_close((fid_t)ompi_mtl_ofi.domain)) {
|
||||||
"fi_close failed: %s", strerror(errno));
|
goto finalize_err;
|
||||||
abort();
|
|
||||||
}
|
}
|
||||||
if (fi_close((fid_t)ompi_mtl_ofi.fabric)) {
|
|
||||||
opal_output(ompi_mtl_base_framework.framework_output,
|
if (ret = fi_close((fid_t)ompi_mtl_ofi.fabric)) {
|
||||||
"fi_close failed: %s", strerror(errno));
|
goto finalize_err;
|
||||||
abort();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
|
|
||||||
|
finalize_err:
|
||||||
|
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
|
||||||
|
"fi_close",
|
||||||
|
ompi_process_info.nodename, __FILE__, __LINE__,
|
||||||
|
fi_strerror(-ret), ret);
|
||||||
|
|
||||||
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user