1
1

btl/tcp: make error messages more specific

Convert some verbose messages to opal_show_help() messages.

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
Этот коммит содержится в:
Jeff Squyres 2017-09-12 07:55:50 -07:00
родитель e0d86b1c72
Коммит 40afd525f8
2 изменённых файлов: 87 добавлений и 50 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Oak Ridge National Laboratory
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
@ -1363,7 +1363,6 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
mca_btl_tcp_endpoint_hs_msg_t hs_msg;
struct timeval save, tv;
socklen_t rcvtimeo_save_len = sizeof(save);
char str[128];
/* Note, Socket will be in blocking mode during intial handshake
* hence setting SO_RCVTIMEO to say 2 seconds here to avoid waiting
@ -1376,20 +1375,22 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
if (ENOPROTOOPT == errno) {
sockopt = false;
} else {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot get current recv timeout value of the socket"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
} else {
tv.tv_sec = 2;
tv.tv_usec = 0;
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot set new recv timeout value of the socket"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
}
@ -1409,13 +1410,15 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
* continue properly.
*/
if (sizeof(hs_msg) != retval) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"process did not receive full connect ACK "
"Local_host:%s PID:%d String_received:%s Test_fail:%s",
const char *peer = opal_fd_get_peer_name(sd);
opal_show_help("help-mpi-btl-tcp.txt",
"did not receive full magic id string",
true,
opal_process_info.nodename,
getpid(),
(retval > 0) ? hs_msg.magic_id : "<nothing>",
"handshake message length");
opal_version_string,
peer);
free((char*) peer);
/* The other side probably isn't OMPI, so just hang up */
CLOSE_THE_SOCKET(sd);
@ -1424,12 +1427,18 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
guid = hs_msg.guid;
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"process did not receive right magic string. "
"Local_host:%s PID:%d String_received:%s Test_fail:%s",
const char *peer = opal_fd_get_peer_name(sd);
opal_show_help("help-mpi-btl-tcp.txt",
"received incorrect magic id string",
true,
opal_process_info.nodename,
getpid(), hs_msg.magic_id,
"string value");
getpid(),
opal_version_string,
peer,
hs_msg.magic_id,
mca_btl_tcp_magic_id_string);
free((char*) peer);
/* The other side probably isn't OMPI, so just hang up */
CLOSE_THE_SOCKET(sd);
return;
@ -1438,10 +1447,11 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
if (sockopt) {
/* reset RECVTIMEO option to its original state */
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save))) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot reset recv timeout value"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
}
@ -1492,24 +1502,9 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
/* are there any existing peer instances willing to accept this connection */
(void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd);
switch (addr.ss_family) {
case AF_INET:
inet_ntop(AF_INET, &(((struct sockaddr_in*) &addr)->sin_addr), str, sizeof(str));
break;
#if OPAL_ENABLE_IPV6
case AF_INET6:
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &addr)->sin6_addr), str, sizeof(str));
break;
#endif
default:
BTL_ERROR(("Got an accept() from an unknown address family -- this shouldn't happen"));
CLOSE_THE_SOCKET(sd);
return;
}
const char *str = opal_fd_get_peer_name(sd);
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: now connected to %s, process %s", str,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
free((char*) str);
}

Просмотреть файл

@ -35,7 +35,7 @@ values are in the range [1 .. 2^16-1]. This value will be ignored
WARNING: Open MPI failed to TCP connect to a peer MPI process. This
should not happen.
Your Open MPI job may now fail.
Your Open MPI job may now hang or fail.
Local host: %s
PID: %d
@ -46,7 +46,7 @@ Your Open MPI job may now fail.
WARNING: Open MPI failed to handshake with a connecting peer MPI
process over TCP. This should not happen.
Your Open MPI job may now fail.
Your Open MPI job may now hang or fail.
Local host: %s
PID: %d
@ -102,8 +102,11 @@ hopefully be able to continue).
Known IPs of peer: %s
#
[socket flag fail]
WARNING: Open MPI failed to set flags on a TCP socket. This should
not happen. It is likely that your MPI job will now fail.
WARNING: Open MPI failed to get or set flags on a TCP socket. This
should not happen.
This may cause unpredictable behavior, and may end up hanging or
aborting your job.
Local host: %s
PID: %d
@ -165,3 +168,42 @@ Your Open MPI job may now fail.
Message: %s
Error: %s (%d)
#
[did not receive full magic id string]
The TCP BTL received an inbound socket connection from an unidentified
peer. This typically means one of two things:
1. A non-Open MPI process tried to connect to this Open MPI process.
2. An Open MPI process compiled against a different version of Open
MPI tried to connect to this Open MPI process.
Open MPI only supports running exactly the same version between all
processes in a single job.
This may cause unpredictable behavior, and may end up aborting your
job.
Local host: %s
Local PID: %d
Local Open MPI version: %s
Peer IP address: %s
#
[received incorrect magic id string]
The TCP BTL received an inbound socket connection from a peer that did
not identify itself correctly as an Open MPI process. This typically
means one of two things:
1. A non-Open MPI process tried to connect to this Open MPI process.
2. An Open MPI process compiled against a different version of Open
MPI tried to connect to this Open MPI process.
Open MPI only supports running exactly the same version between all
processes in a single job.
This may cause unpredictable behavior, and may end up hanging or
aborting your job.
Local host: %s
Local PID: %d
Local Open MPI version: %s
Peer IP address: %s
Peer identifier: %s (expected %s)