From 56ecb92b1094c86ccbc444d8b92f7a266495aa5a Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 1 May 2014 20:15:33 +0000 Subject: [PATCH] Per discussion with George and Ralph, change this BTL_ERROR message to an opal_show_help() so that its output is deduplicated. This commit was SVN r31590. --- ompi/mca/btl/tcp/btl_tcp_component.c | 13 +++++++++---- ompi/mca/btl/tcp/help-mpi-btl-tcp.txt | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/ompi/mca/btl/tcp/btl_tcp_component.c b/ompi/mca/btl/tcp/btl_tcp_component.c index 9c224b4bab..17b271f89b 100644 --- a/ompi/mca/btl/tcp/btl_tcp_component.c +++ b/ompi/mca/btl/tcp/btl_tcp_component.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Oak Ridge National Laboratory * Copyright (c) 2012-2013 Los Alamos National Security, LLC. 
All rights @@ -1126,9 +1126,14 @@ static void mca_btl_tcp_component_accept_handler( int incoming_sd, if(sd < 0) { if(opal_socket_errno == EINTR) continue; - if(opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) - BTL_ERROR(("accept() failed: %s (%d).", - strerror(opal_socket_errno), opal_socket_errno)); + if (opal_socket_errno != EAGAIN && + opal_socket_errno != EWOULDBLOCK) { + opal_show_help("help-mpi-btl-tcp.txt", "accept failed", + true, "v4", ompi_process_info.nodename, + getpid(), + opal_socket_errno, + strerror(opal_socket_errno)); + } return; } mca_btl_tcp_set_socket_options(sd); diff --git a/ompi/mca/btl/tcp/help-mpi-btl-tcp.txt b/ompi/mca/btl/tcp/help-mpi-btl-tcp.txt index d54e8e24de..b53e6cc511 100644 --- a/ompi/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/ompi/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -47,3 +47,20 @@ Your Open MPI job may now fail. PID: %d Message: %s # +[accept failed] +WARNING: The accept(3) system call failed on a TCP socket. While this +should generally never happen on a well-configured HPC system, the +most common causes when it does occur are: + + * The process ran out of file descriptors + * The operating system ran out of file descriptors + * The operating system ran out of memory + +Your Open MPI job will likely hang until the failure reason is fixed +(e.g., more file descriptors and/or memory becomes available), and may +eventually time out / abort. + + Local host: %s + PID: %d + Errno: %d (%s) +#