From 95c6f6cfc0c5161657a5c9e9949cd259b9a5ad91 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 2 Sep 2016 17:14:22 -0400 Subject: [PATCH 1/2] btl/tcp: fix help message It looks like one help message was accidentally pasted in the middle of another. Disentangle the two messages from each other, and slightly tweak the one message to say that the job may also crash (in addition to hanging). Signed-off-by: Jeff Squyres --- opal/mca/btl/tcp/help-mpi-btl-tcp.txt | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt index b3d19cea28..71a93157f2 100644 --- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2015-2016 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. @@ -59,6 +59,14 @@ most common causes when it does occur are: * The operating system ran out of file descriptors * The operating system ran out of memory +Your Open MPI job will likely hang (or crash) until the failure +reason is fixed (e.g., more file descriptors and/or memory becomes +available), and may eventually timeout / abort. + + Local host: %s + PID: %d + Errno: %d (%s) +# [unsuported progress thread] WARNING: Support for the TCP progress thread has not been compiled in. Fall back to the normal progress. @@ -66,14 +74,3 @@ Fall back to the normal progress. Local host: %s Value: %s Message: %s - -# - -Your Open MPI job will likely hang until the failure resason is fixed -(e.g., more file descriptors and/or memory becomes available), and may -eventually timeout / abort. 
- - Local host: %s - PID: %d - Errno: %d (%s) -# From 1953e3406f5273f4d54eed2a8c7f1e110c139e09 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 2 Sep 2016 17:44:07 -0400 Subject: [PATCH 2/2] btl/tcp: add show_help message when peer hangs up We commonly see messages on the users list where a peer has hung up because it has crashed. Instead of having just a BTL_ERROR message, make this a real opal_show_help() message that tells the user that the peer unexpectedly hung up, and they should look into *why* that peer hung up. Signed-off-by: Jeff Squyres --- opal/mca/btl/tcp/btl_tcp_frag.c | 16 +++++++++++++++- opal/mca/btl/tcp/help-mpi-btl-tcp.txt | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index d67bebe603..bb61c76491 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -14,7 +14,7 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -44,8 +44,12 @@ #include "opal/opal_socket_errno.h" #include "opal/mca/btl/base/btl_base_error.h" +#include "opal/util/show_help.h" + #include "btl_tcp_frag.h" #include "btl_tcp_endpoint.h" +#include "btl_tcp_proc.h" + static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag) { @@ -225,6 +229,16 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; mca_btl_tcp_endpoint_close(btl_endpoint); return false; + + case ECONNRESET: + opal_show_help("help-mpi-btl-tcp.txt", "peer hung up", + true, opal_process_info.nodename, + getpid(), + btl_endpoint->endpoint_proc->proc_opal->proc_hostname); + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; + mca_btl_tcp_endpoint_close(btl_endpoint); + return false; + default: BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", strerror(opal_socket_errno), diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt index 71a93157f2..4b3f4544a3 100644 --- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -74,3 +74,20 @@ Fall back to the normal progress. Local host: %s Value: %s Message: %s +# +[peer hung up] +An MPI communication peer process has unexpectedly disconnected. This +usually indicates a failure in the peer process (e.g., a crash or +otherwise exiting without calling MPI_FINALIZE first). + +Although this local MPI process will likely now behave unpredictably +(it may even hang or crash), the root cause of this problem is the +failure of the peer -- that is what you need to investigate. For +example, there may be a core file that you can examine. More +generally: such peer hangups are frequently caused by application bugs +or other external events. + + Local host: %s + Local PID: %d + Peer host: %s +#