diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index d67bebe603..bb61c76491 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -14,7 +14,7 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,8 +44,12 @@ #include "opal/opal_socket_errno.h" #include "opal/mca/btl/base/btl_base_error.h" +#include "opal/util/show_help.h" + #include "btl_tcp_frag.h" #include "btl_tcp_endpoint.h" +#include "btl_tcp_proc.h" + static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag) { @@ -225,6 +229,16 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; mca_btl_tcp_endpoint_close(btl_endpoint); return false; + + case ECONNRESET: + opal_show_help("help-mpi-btl-tcp.txt", "peer hung up", + true, opal_process_info.nodename, + getpid(), + btl_endpoint->endpoint_proc->proc_opal->proc_hostname); + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; + mca_btl_tcp_endpoint_close(btl_endpoint); + return false; + default: BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", strerror(opal_socket_errno), diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt index b3d19cea28..4b3f4544a3 100644 --- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2015-2016 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. 
@@ -59,6 +59,14 @@ most common causes when it does occur are: * The operating system ran out of file descriptors * The operating system ran out of memory +Your Open MPI job will likely hang (or crash) until the failure +reason is fixed (e.g., more file descriptors and/or memory becomes +available), and may eventually timeout / abort. + + Local host: %s + PID: %d + Errno: %d (%s) +# [unsuported progress thread] WARNING: Support for the TCP progress thread has not been compiled in. Fall back to the normal progress. @@ -66,14 +74,20 @@ Fall back to the normal progress. Local host: %s Value: %s Message: %s - # +[peer hung up] +An MPI communication peer process has unexpectedly disconnected. This +usually indicates a failure in the peer process (e.g., a crash or +otherwise exiting without calling MPI_FINALIZE first). -Your Open MPI job will likely hang until the failure resason is fixed -(e.g., more file descriptors and/or memory becomes available), and may -eventually timeout / abort. +Although this local MPI process will likely now behave unpredictably +(it may even hang or crash), the root cause of this problem is the +failure of the peer -- that is what you need to investigate. For +example, there may be a core file that you can examine. More +generally: such peer hangups are frequently caused by application bugs +or other external events. Local host: %s - PID: %d - Errno: %d (%s) + Local PID: %d + Peer host: %s #