oob_tcp_listener: slightly refactor EAGAIN/EWOULDBLOCK
Have only a single level of "if" conditionals. Also, slightly change the logic such that we only die/break out of the loop if we get EMFILE -- all other errors are ok to go on to the next fd. Finally, use a real show_help() message to warn when other errors occur.
Этот коммит содержится в:
родитель
e43c8dc291
Коммит
3069daa015
@ -11,6 +11,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -77,3 +78,21 @@ the security domains of the two hosts - e.g., one might be
|
||||
using Munge while the other is not. This can typically be
|
||||
resolved by specifying the desired security method. For
|
||||
example, adding "--mca sec basic" to your command line.
|
||||
#
|
||||
[accept failed]
|
||||
WARNING: The accept(3) system call failed on a TCP socket. While this
|
||||
should generally never happen on a well-configured HPC system, the
|
||||
most common causes when it does occur are:
|
||||
|
||||
* The process ran out of file descriptors
|
||||
* The operating system ran out of file descriptors
|
||||
* The operating system ran out of memory
|
||||
|
||||
Your Open MPI job will likely hang until the failure reason is fixed
|
||||
(e.g., more file descriptors and/or memory becomes available), and may
|
||||
eventually time out / abort.
|
||||
|
||||
Local host: %s
|
||||
Errno: %d (%s)
|
||||
Probable cause: %s
|
||||
#
|
||||
|
@ -741,18 +741,42 @@ static void* listen_thread(opal_object_t *obj)
|
||||
&addrlen);
|
||||
if (pending_connection->fd < 0) {
|
||||
OBJ_RELEASE(pending_connection);
|
||||
if (opal_socket_errno != EAGAIN ||
|
||||
opal_socket_errno != EWOULDBLOCK) {
|
||||
if (EMFILE == opal_socket_errno) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-sockets", true);
|
||||
} else {
|
||||
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
}
|
||||
|
||||
/* Non-fatal errors */
|
||||
if (EAGAIN == opal_socket_errno ||
|
||||
EWOULDBLOCK == opal_socket_errno) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* If we run out of file descriptors, log an extra
|
||||
warning (so that the user can know to fix this
|
||||
problem) and abandon all hope. */
|
||||
else if (EMFILE == opal_socket_errno) {
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
|
||||
orte_show_help("help-oob-tcp.txt",
|
||||
"accept failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
opal_socket_errno,
|
||||
strerror(opal_socket_errno),
|
||||
"Out of file descriptors");
|
||||
goto done;
|
||||
}
|
||||
continue;
|
||||
|
||||
/* For all other cases, close the socket, print a
|
||||
warning but try to continue */
|
||||
else {
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
orte_show_help("help-oob-tcp.txt",
|
||||
"accept failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
opal_socket_errno,
|
||||
strerror(opal_socket_errno),
|
||||
"Unknown cause; job will try to continue");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
@ -838,26 +862,43 @@ static void connection_event_handler(int incoming_sd, short flags, void* cbdata)
|
||||
opal_net_get_hostname((struct sockaddr*) &addr),
|
||||
opal_net_get_port((struct sockaddr*) &addr));
|
||||
if (sd < 0) {
|
||||
if (EINTR == opal_socket_errno) {
|
||||
/* Non-fatal errors */
|
||||
if (EINTR == opal_socket_errno ||
|
||||
EAGAIN == opal_socket_errno ||
|
||||
EWOULDBLOCK == opal_socket_errno) {
|
||||
return;
|
||||
}
|
||||
if (opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
|
||||
if (EMFILE == opal_socket_errno) {
|
||||
/*
|
||||
* Close incoming_sd so that orte_show_help will have a file
|
||||
* descriptor with which to open the help file. We will be
|
||||
* exiting anyway, so we don't need to keep it open.
|
||||
*/
|
||||
CLOSE_THE_SOCKET(incoming_sd);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-sockets", true);
|
||||
} else {
|
||||
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
}
|
||||
|
||||
/* If we run out of file descriptors, log an extra warning (so
|
||||
that the user can know to fix this problem) and abandon all
|
||||
hope. */
|
||||
else if (EMFILE == opal_socket_errno) {
|
||||
CLOSE_THE_SOCKET(incoming_sd);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
|
||||
orte_show_help("help-oob-tcp.txt",
|
||||
"accept failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
opal_socket_errno,
|
||||
strerror(opal_socket_errno),
|
||||
"Out of file descriptors");
|
||||
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
/* For all other cases, close the socket, print a warning but
|
||||
try to continue */
|
||||
else {
|
||||
CLOSE_THE_SOCKET(incoming_sd);
|
||||
orte_show_help("help-oob-tcp.txt",
|
||||
"accept failed",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
opal_socket_errno,
|
||||
strerror(opal_socket_errno),
|
||||
"Unknown cause; job will try to continue");
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* process the connection */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user