1
1

oob_tcp_listener: slightly refactor EAGAIN/EWOULDBLOCK

Use only a single level of "if" conditionals.  Also, slightly change
the logic so that we only die/break out of the loop if we get EMFILE
-- for all other errors it is OK to continue on to the next fd.

Finally, use a real show_help() message to warn when other errors occur.
Этот коммит содержится в:
Jeff Squyres 2015-05-20 17:24:34 -04:00
родитель e43c8dc291
Коммит 3069daa015
2 изменённых файлов: 86 добавлений и 26 удалений

Просмотреть файл

@ -11,6 +11,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -77,3 +78,21 @@ the security domains of the two hosts - e.g., one might be
using Munge while the other is not. This can typically be
resolved by specifying the desired security method. For
example, adding "--mca sec basic" to your command line.
#
[accept failed]
WARNING: The accept(3) system call failed on a TCP socket. While this
should generally never happen on a well-configured HPC system, the
most common causes when it does occur are:
* The process ran out of file descriptors
* The operating system ran out of file descriptors
* The operating system ran out of memory
Your Open MPI job will likely hang until the failure reason is fixed
(e.g., more file descriptors and/or memory become available), and may
eventually time out / abort.
Local host: %s
Errno: %d (%s)
Probable cause: %s
#

Просмотреть файл

@ -741,18 +741,42 @@ static void* listen_thread(opal_object_t *obj)
&addrlen);
if (pending_connection->fd < 0) {
OBJ_RELEASE(pending_connection);
if (opal_socket_errno != EAGAIN ||
opal_socket_errno != EWOULDBLOCK) {
if (EMFILE == opal_socket_errno) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
orte_show_help("help-orterun.txt", "orterun:sys-limit-sockets", true);
} else {
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
strerror(opal_socket_errno), opal_socket_errno);
}
/* Non-fatal errors */
if (EAGAIN == opal_socket_errno ||
EWOULDBLOCK == opal_socket_errno) {
continue;
}
/* If we run out of file descriptors, log an extra
warning (so that the user can know to fix this
problem) and abandon all hope. */
else if (EMFILE == opal_socket_errno) {
CLOSE_THE_SOCKET(sd);
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
orte_show_help("help-oob-tcp.txt",
"accept failed",
true,
opal_process_info.nodename,
opal_socket_errno,
strerror(opal_socket_errno),
"Out of file descriptors");
goto done;
}
continue;
/* For all other cases, close the socket, print a
warning but try to continue */
else {
CLOSE_THE_SOCKET(sd);
orte_show_help("help-oob-tcp.txt",
"accept failed",
true,
opal_process_info.nodename,
opal_socket_errno,
strerror(opal_socket_errno),
"Unknown cause; job will try to continue");
continue;
}
}
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
@ -838,26 +862,43 @@ static void connection_event_handler(int incoming_sd, short flags, void* cbdata)
opal_net_get_hostname((struct sockaddr*) &addr),
opal_net_get_port((struct sockaddr*) &addr));
if (sd < 0) {
if (EINTR == opal_socket_errno) {
/* Non-fatal errors */
if (EINTR == opal_socket_errno ||
EAGAIN == opal_socket_errno ||
EWOULDBLOCK == opal_socket_errno) {
return;
}
if (opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
if (EMFILE == opal_socket_errno) {
/*
* Close incoming_sd so that orte_show_help will have a file
* descriptor with which to open the help file. We will be
* exiting anyway, so we don't need to keep it open.
*/
CLOSE_THE_SOCKET(incoming_sd);
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
orte_show_help("help-orterun.txt", "orterun:sys-limit-sockets", true);
} else {
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
strerror(opal_socket_errno), opal_socket_errno);
}
/* If we run out of file descriptors, log an extra warning (so
that the user can know to fix this problem) and abandon all
hope. */
else if (EMFILE == opal_socket_errno) {
CLOSE_THE_SOCKET(incoming_sd);
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
orte_show_help("help-oob-tcp.txt",
"accept failed",
true,
opal_process_info.nodename,
opal_socket_errno,
strerror(opal_socket_errno),
"Out of file descriptors");
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
return;
}
/* For all other cases, close the socket, print a warning but
try to continue */
else {
CLOSE_THE_SOCKET(incoming_sd);
orte_show_help("help-oob-tcp.txt",
"accept failed",
true,
opal_process_info.nodename,
opal_socket_errno,
strerror(opal_socket_errno),
"Unknown cause; job will try to continue");
return;
}
return;
}
/* process the connection */