This fixes trac:1930.
Emit a more informative error message when the file descriptor limit is reached during an accept() call. Also, abort when the accept() call fails, to avoid an infinite loop. Emit a more informative error message when the help file can't be opened.

This commit was SVN r21271.

The following Trac tickets were found above:
  Ticket 1930 --> https://svn.open-mpi.org/trac/ompi/ticket/1930
Этот коммит содержится в:
родитель
52cb752367
Коммит
e7ff2368d6
@ -22,6 +22,7 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <locale.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
#include "opal/util/show_help.h"
|
||||
@ -123,6 +124,8 @@ static int open_file(const char *base, const char *topic)
|
||||
const char *lang;
|
||||
#endif
|
||||
char *filename;
|
||||
char *err_msg = 0;
|
||||
size_t base_len;
|
||||
|
||||
/* If no filename was supplied, use the default */
|
||||
|
||||
@ -137,12 +140,17 @@ static int open_file(const char *base, const char *topic)
|
||||
|
||||
filename = opal_os_path( false, opal_install_dirs.pkgdatadir, base, NULL );
|
||||
opal_show_help_yyin = fopen(filename, "r");
|
||||
free(filename);
|
||||
if (NULL == opal_show_help_yyin) {
|
||||
asprintf(&filename, "%s/%s.txt", opal_install_dirs.pkgdatadir, base);
|
||||
opal_show_help_yyin = fopen(filename, "r");
|
||||
free(filename);
|
||||
asprintf(&err_msg, "%s: %s", filename, strerror(errno));
|
||||
base_len = strlen(base);
|
||||
if (4 > base_len || 0 != strcmp(base + base_len - 4, ".txt")) {
|
||||
free(filename);
|
||||
asprintf(&filename, "%s%s%s.txt", opal_install_dirs.pkgdatadir,
|
||||
OPAL_PATH_SEP, base);
|
||||
opal_show_help_yyin = fopen(filename, "r");
|
||||
}
|
||||
}
|
||||
free(filename);
|
||||
#else
|
||||
/* What's our locale? */
|
||||
|
||||
@ -154,15 +162,17 @@ static int open_file(const char *base, const char *topic)
|
||||
/* Do we have a file matching that locale? If not, open the
|
||||
default language (because we know that we have that one) */
|
||||
|
||||
asprintf(&filename, "%s/%s.%s", opal_install_dirs.pkgdatadir, base, lang);
|
||||
asprintf(&filename, "%s%s%s.%s", opal_install_dirs.pkgdatadir,
|
||||
OPAL_PATH_SEP, base, lang);
|
||||
opal_show_help_yyin = fopen(filename, "r");
|
||||
free(filename);
|
||||
if (NULL == opal_show_help_yyin) {
|
||||
asprintf(&filename, "%s/%s.%s", opal_install_dirs.pkgdatadir,
|
||||
base, default_language);
|
||||
opal_show_help_yyin = fopen(filename, "r");
|
||||
asprintf(&err_msg, "%s: %s", filename, strerror(errno));
|
||||
free(filename);
|
||||
asprintf(&filename, "%s%s%s.%s", opal_install_dirs.pkgdatadir,
|
||||
OPAL_PATH_SEP, base, default_language);
|
||||
opal_show_help_yyin = fopen(filename, "r");
|
||||
}
|
||||
free(filename);
|
||||
|
||||
/* If we still couldn't find it, try with no extension */
|
||||
|
||||
@ -176,10 +186,15 @@ static int open_file(const char *base, const char *topic)
|
||||
/* If we still couldn't open it, then something is wrong */
|
||||
|
||||
if (NULL == opal_show_help_yyin) {
|
||||
opal_output(output_stream, "%sSorry! You were supposed to get help about:\n %s\nfrom the file:\n %s\nBut I couldn't find any file matching that name. Sorry!\n%s", dash_line, topic, base, dash_line);
|
||||
opal_output(output_stream, "%sSorry! You were supposed to get help about:\n %s\nBut I couldn't open the help file:\n %s. Sorry!\n%s", dash_line, topic, err_msg, dash_line);
|
||||
free(err_msg);
|
||||
return OPAL_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
if (NULL != err_msg) {
|
||||
free(err_msg);
|
||||
}
|
||||
|
||||
/* Set the buffer */
|
||||
|
||||
opal_show_help_init_buffer(opal_show_help_yyin);
|
||||
|
@ -97,7 +97,8 @@ enum {
|
||||
ORTE_ERR_EXE_NOT_ACCESSIBLE = (ORTE_ERR_BASE - 25),
|
||||
ORTE_ERR_FAILED_TO_START = (ORTE_ERR_BASE - 26),
|
||||
ORTE_ERR_FILE_NOT_EXECUTABLE = (ORTE_ERR_BASE - 27),
|
||||
ORTE_ERR_HNP_COULD_NOT_START = (ORTE_ERR_BASE - 28)
|
||||
ORTE_ERR_HNP_COULD_NOT_START = (ORTE_ERR_BASE - 28),
|
||||
ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29)
|
||||
};
|
||||
|
||||
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)
|
||||
|
@ -500,12 +500,24 @@ static void mca_oob_tcp_accept(int incoming_sd)
|
||||
|
||||
sd = accept(incoming_sd, (struct sockaddr*)&addr, &addrlen);
|
||||
if(sd < 0) {
|
||||
if(opal_socket_errno == EINTR) {
|
||||
if(EINTR == opal_socket_errno) {
|
||||
continue;
|
||||
}
|
||||
if(opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
|
||||
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
if(EMFILE == opal_socket_errno) {
|
||||
/*
|
||||
* Close incoming_sd so that orte_show_help will have a file
|
||||
* descriptor with which to open the help file. We will be
|
||||
* exiting anyway, so we don't need to keep it open.
|
||||
*/
|
||||
CLOSE_THE_SOCKET(incoming_sd);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-sockets", true);
|
||||
} else {
|
||||
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
}
|
||||
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, "");
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -910,9 +922,14 @@ mca_oob_tcp_listen_thread(opal_object_t *obj)
|
||||
|
||||
if (opal_socket_errno != EAGAIN ||
|
||||
opal_socket_errno != EWOULDBLOCK) {
|
||||
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(pending_connection->fd);
|
||||
if(EMFILE == opal_socket_errno) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-sockets", true);
|
||||
} else {
|
||||
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
}
|
||||
goto done;
|
||||
}
|
||||
|
||||
|
@ -177,6 +177,13 @@ increasing your limit descriptor setting (using limit or ulimit commands),
|
||||
asking the system administrator for that node to increase the system limit, or
|
||||
by rearranging your processes to place fewer of them on that node.
|
||||
#
|
||||
[orterun:sys-limit-sockets]
|
||||
Error: system limit exceeded on number of network connections that can be open
|
||||
|
||||
This can be resolved by setting the mca parameter opal_set_max_sys_limits to 1,
|
||||
increasing your limit descriptor setting (using limit or ulimit commands),
|
||||
or asking the system administrator to increase the system limit.
|
||||
#
|
||||
[orterun:pipe-setup-failure]
|
||||
%s was unable to launch the specified application as it encountered an error:
|
||||
|
||||
|
@ -115,6 +115,9 @@ orte_err2str(int errnum)
|
||||
case ORTE_ERR_HNP_COULD_NOT_START:
|
||||
retval = "Unable to start a daemon on the local node";
|
||||
break;
|
||||
case ORTE_ERR_SYS_LIMITS_SOCKETS:
|
||||
retval = "The system limit on number of network connections a process can open was reached";
|
||||
break;
|
||||
default:
|
||||
retval = NULL;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user