1
1
Emit a more informative error message when the file descriptor limit is
reached during an accept() call.  Also, abort when accept() fails, so
that we avoid an infinite loop.

Emit a more informative error message when the help file can't be opened.

This commit was SVN r21271.

The following Trac tickets were found above:
  Ticket 1930 --> https://svn.open-mpi.org/trac/ompi/ticket/1930
Этот коммит содержится в:
Iain Bason 2009-05-26 20:03:21 +00:00
родитель 52cb752367
Коммит e7ff2368d6
5 изменённых файлов: 59 добавлений и 16 удалений

Просмотреть файл

@ -22,6 +22,7 @@
#include <stdio.h>
#include <string.h>
#include <locale.h>
#include <errno.h>
#include "opal/mca/installdirs/installdirs.h"
#include "opal/util/show_help.h"
@ -123,6 +124,8 @@ static int open_file(const char *base, const char *topic)
const char *lang;
#endif
char *filename;
char *err_msg = 0;
size_t base_len;
/* If no filename was supplied, use the default */
@ -137,12 +140,17 @@ static int open_file(const char *base, const char *topic)
filename = opal_os_path( false, opal_install_dirs.pkgdatadir, base, NULL );
opal_show_help_yyin = fopen(filename, "r");
free(filename);
if (NULL == opal_show_help_yyin) {
asprintf(&filename, "%s/%s.txt", opal_install_dirs.pkgdatadir, base);
opal_show_help_yyin = fopen(filename, "r");
free(filename);
asprintf(&err_msg, "%s: %s", filename, strerror(errno));
base_len = strlen(base);
if (4 > base_len || 0 != strcmp(base + base_len - 4, ".txt")) {
free(filename);
asprintf(&filename, "%s%s%s.txt", opal_install_dirs.pkgdatadir,
OPAL_PATH_SEP, base);
opal_show_help_yyin = fopen(filename, "r");
}
}
free(filename);
#else
/* What's our locale? */
@ -154,15 +162,17 @@ static int open_file(const char *base, const char *topic)
/* Do we have a file matching that locale? If not, open the
default language (because we know that we have that one) */
asprintf(&filename, "%s/%s.%s", opal_install_dirs.pkgdatadir, base, lang);
asprintf(&filename, "%s%s%s.%s", opal_install_dirs.pkgdatadir,
OPAL_PATH_SEP, base, lang);
opal_show_help_yyin = fopen(filename, "r");
free(filename);
if (NULL == opal_show_help_yyin) {
asprintf(&filename, "%s/%s.%s", opal_install_dirs.pkgdatadir,
base, default_language);
opal_show_help_yyin = fopen(filename, "r");
asprintf(&err_msg, "%s: %s", filename, strerror(errno));
free(filename);
asprintf(&filename, "%s%s%s.%s", opal_install_dirs.pkgdatadir,
OPAL_PATH_SEP, base, default_language);
opal_show_help_yyin = fopen(filename, "r");
}
free(filename);
/* If we still couldn't find it, try with no extension */
@ -176,10 +186,15 @@ static int open_file(const char *base, const char *topic)
/* If we still couldn't open it, then something is wrong */
if (NULL == opal_show_help_yyin) {
opal_output(output_stream, "%sSorry! You were supposed to get help about:\n %s\nfrom the file:\n %s\nBut I couldn't find any file matching that name. Sorry!\n%s", dash_line, topic, base, dash_line);
opal_output(output_stream, "%sSorry! You were supposed to get help about:\n %s\nBut I couldn't open the help file:\n %s. Sorry!\n%s", dash_line, topic, err_msg, dash_line);
free(err_msg);
return OPAL_ERR_NOT_FOUND;
}
if (NULL != err_msg) {
free(err_msg);
}
/* Set the buffer */
opal_show_help_init_buffer(opal_show_help_yyin);

Просмотреть файл

@ -97,7 +97,8 @@ enum {
ORTE_ERR_EXE_NOT_ACCESSIBLE = (ORTE_ERR_BASE - 25),
ORTE_ERR_FAILED_TO_START = (ORTE_ERR_BASE - 26),
ORTE_ERR_FILE_NOT_EXECUTABLE = (ORTE_ERR_BASE - 27),
ORTE_ERR_HNP_COULD_NOT_START = (ORTE_ERR_BASE - 28)
ORTE_ERR_HNP_COULD_NOT_START = (ORTE_ERR_BASE - 28),
ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29)
};
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)

Просмотреть файл

@ -500,12 +500,24 @@ static void mca_oob_tcp_accept(int incoming_sd)
sd = accept(incoming_sd, (struct sockaddr*)&addr, &addrlen);
if(sd < 0) {
if(opal_socket_errno == EINTR) {
if(EINTR == opal_socket_errno) {
continue;
}
if(opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
strerror(opal_socket_errno), opal_socket_errno);
if(EMFILE == opal_socket_errno) {
/*
* Close incoming_sd so that orte_show_help will have a file
* descriptor with which to open the help file. We will be
* exiting anyway, so we don't need to keep it open.
*/
CLOSE_THE_SOCKET(incoming_sd);
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
orte_show_help("help-orterun.txt", "orterun:sys-limit-sockets", true);
} else {
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
strerror(opal_socket_errno), opal_socket_errno);
}
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, "");
}
return;
}
@ -910,9 +922,14 @@ mca_oob_tcp_listen_thread(opal_object_t *obj)
if (opal_socket_errno != EAGAIN ||
opal_socket_errno != EWOULDBLOCK) {
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(pending_connection->fd);
if(EMFILE == opal_socket_errno) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_SOCKETS);
orte_show_help("help-orterun.txt", "orterun:sys-limit-sockets", true);
} else {
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
strerror(opal_socket_errno), opal_socket_errno);
}
goto done;
}

Просмотреть файл

@ -177,6 +177,13 @@ increasing your limit descriptor setting (using limit or ulimit commands),
asking the system administrator for that node to increase the system limit, or
by rearranging your processes to place fewer of them on that node.
#
[orterun:sys-limit-sockets]
Error: system limit exceeded on number of network connections that can be open
This can be resolved by setting the mca parameter opal_set_max_sys_limits to 1,
increasing your limit descriptor setting (using limit or ulimit commands),
or asking the system administrator to increase the system limit.
#
[orterun:pipe-setup-failure]
%s was unable to launch the specified application as it encountered an error:

Просмотреть файл

@ -115,6 +115,9 @@ orte_err2str(int errnum)
case ORTE_ERR_HNP_COULD_NOT_START:
retval = "Unable to start a daemon on the local node";
break;
case ORTE_ERR_SYS_LIMITS_SOCKETS:
retval = "The system limit on number of network connections a process can open was reached";
break;
default:
retval = NULL;
}