Merge pull request #2050 from jsquyres/pr/btl-tcp-help-messages
Add a show_help message to TCP BTL when peer unexpectedly disconnects
Этот коммит содержится в:
Коммит
527efec4fb
@ -14,7 +14,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -44,8 +44,12 @@
|
|||||||
|
|
||||||
#include "opal/opal_socket_errno.h"
|
#include "opal/opal_socket_errno.h"
|
||||||
#include "opal/mca/btl/base/btl_base_error.h"
|
#include "opal/mca/btl/base/btl_base_error.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
|
||||||
#include "btl_tcp_frag.h"
|
#include "btl_tcp_frag.h"
|
||||||
#include "btl_tcp_endpoint.h"
|
#include "btl_tcp_endpoint.h"
|
||||||
|
#include "btl_tcp_proc.h"
|
||||||
|
|
||||||
|
|
||||||
static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag)
|
static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag)
|
||||||
{
|
{
|
||||||
@ -225,6 +229,16 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
|
|||||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
case ECONNRESET:
|
||||||
|
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
|
||||||
|
true, opal_process_info.nodename,
|
||||||
|
getpid(),
|
||||||
|
btl_endpoint->endpoint_proc->proc_opal->proc_hostname);
|
||||||
|
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||||
|
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||||
|
return false;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)",
|
BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)",
|
||||||
strerror(opal_socket_errno),
|
strerror(opal_socket_errno),
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# -*- text -*-
|
# -*- text -*-
|
||||||
#
|
#
|
||||||
# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
# Copyright (c) 2015-2016 The University of Tennessee and The University
|
# Copyright (c) 2015-2016 The University of Tennessee and The University
|
||||||
# of Tennessee Research Foundation. All rights
|
# of Tennessee Research Foundation. All rights
|
||||||
# reserved.
|
# reserved.
|
||||||
@ -59,6 +59,14 @@ most common causes when it does occur are:
|
|||||||
* The operating system ran out of file descriptors
|
* The operating system ran out of file descriptors
|
||||||
* The operating system ran out of memory
|
* The operating system ran out of memory
|
||||||
|
|
||||||
|
Your Open MPI job will likely hang (or crash) until the failure
|
||||||
|
reason is fixed (e.g., more file descriptors and/or memory becomes
|
||||||
|
available), and may eventually time out / abort.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
PID: %d
|
||||||
|
Errno: %d (%s)
|
||||||
|
#
|
||||||
[unsuported progress thread]
|
[unsuported progress thread]
|
||||||
WARNING: Support for the TCP progress thread has not been compiled in.
|
WARNING: Support for the TCP progress thread has not been compiled in.
|
||||||
Fall back to the normal progress.
|
Fall back to the normal progress.
|
||||||
@ -66,14 +74,20 @@ Fall back to the normal progress.
|
|||||||
Local host: %s
|
Local host: %s
|
||||||
Value: %s
|
Value: %s
|
||||||
Message: %s
|
Message: %s
|
||||||
|
|
||||||
#
|
#
|
||||||
|
[peer hung up]
|
||||||
|
An MPI communication peer process has unexpectedly disconnected. This
|
||||||
|
usually indicates a failure in the peer process (e.g., a crash or
|
||||||
|
otherwise exiting without calling MPI_FINALIZE first).
|
||||||
|
|
||||||
Your Open MPI job will likely hang until the failure reason is fixed
|
Although this local MPI process will likely now behave unpredictably
|
||||||
(e.g., more file descriptors and/or memory becomes available), and may
|
(it may even hang or crash), the root cause of this problem is the
|
||||||
eventually time out / abort.
|
failure of the peer -- that is what you need to investigate. For
|
||||||
|
example, there may be a core file that you can examine. More
|
||||||
|
generally: such peer hangups are frequently caused by application bugs
|
||||||
|
or other external events.
|
||||||
|
|
||||||
Local host: %s
|
Local host: %s
|
||||||
PID: %d
|
Local PID: %d
|
||||||
Errno: %d (%s)
|
Peer host: %s
|
||||||
#
|
#
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user