
btl/tcp: make error messages more specific

Convert some verbose messages to opal_show_help() messages.

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
This commit is contained in:
Jeff Squyres 2017-09-12 07:55:50 -07:00
parent e0d86b1c72
commit 40afd525f8
2 changed files with 87 additions and 50 deletions
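
For context, opal_show_help() pairs a topic in an ini-style help text file with printf-style arguments supplied at the call site, so the full message text lives in help-mpi-btl-tcp.txt rather than in the C code. A minimal sketch of the pattern, using the "socket flag fail" topic touched by this commit (argument order follows the placeholders in that topic's text):

    /* help-mpi-btl-tcp.txt contains:
     *   [socket flag fail]
     *   WARNING: Open MPI failed to get or set flags on a TCP socket. ...
     *     Local host: %s
     *     PID:        %d
     *     Message:    %s
     *     Error:      %s (%d)
     */
    opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
                   true,                         /* prepend the standard error header */
                   opal_process_info.nodename,   /* Local host */
                   getpid(),                     /* PID        */
                   "setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",   /* Message */
                   strerror(opal_socket_errno), opal_socket_errno);  /* Error   */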

View file

@@ -10,7 +10,7 @@
  * University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
- * Copyright (c) 2007-2015 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007-2018 Cisco Systems, Inc.  All rights reserved
  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2009      Oak Ridge National Laboratory
  * Copyright (c) 2012-2015 Los Alamos National Security, LLC.  All rights
@@ -1363,7 +1363,6 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
     mca_btl_tcp_endpoint_hs_msg_t hs_msg;
     struct timeval save, tv;
     socklen_t rcvtimeo_save_len = sizeof(save);
-    char str[128];
 
     /* Note, Socket will be in blocking mode during intial handshake
      * hence setting SO_RCVTIMEO to say 2 seconds here to avoid waiting
@@ -1376,20 +1375,22 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
         if (ENOPROTOOPT == errno) {
             sockopt = false;
         } else {
-            opal_output_verbose(20, opal_btl_base_framework.framework_output,
-                                "Cannot get current recv timeout value of the socket"
-                                "Local_host:%s PID:%d",
-                                opal_process_info.nodename, getpid());
+            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                           true, opal_process_info.nodename,
+                           getpid(),
+                           "getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
+                           strerror(opal_socket_errno), opal_socket_errno);
             return;
         }
     } else {
         tv.tv_sec = 2;
         tv.tv_usec = 0;
         if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
-            opal_output_verbose(20, opal_btl_base_framework.framework_output,
-                                "Cannot set new recv timeout value of the socket"
-                                "Local_host:%s PID:%d",
-                                opal_process_info.nodename, getpid());
+            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                           true, opal_process_info.nodename,
+                           getpid(),
+                           "setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
+                           strerror(opal_socket_errno), opal_socket_errno);
             return;
         }
     }
@@ -1408,14 +1409,16 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
      * This attempted connection will be ignored; your MPI job may or may not
      * continue properly.
      */
     if (sizeof(hs_msg) != retval) {
-        opal_output_verbose(20, opal_btl_base_framework.framework_output,
-                            "process did not receive full connect ACK "
-                            "Local_host:%s PID:%d String_received:%s Test_fail:%s",
-                            opal_process_info.nodename,
-                            getpid(),
-                            (retval > 0) ? hs_msg.magic_id : "<nothing>",
-                            "handshake message length");
+        const char *peer = opal_fd_get_peer_name(sd);
+        opal_show_help("help-mpi-btl-tcp.txt",
+                       "did not receive full magic id string",
+                       true,
+                       opal_process_info.nodename,
+                       getpid(),
+                       opal_version_string,
+                       peer);
+        free((char*) peer);
 
         /* The other side probably isn't OMPI, so just hang up */
         CLOSE_THE_SOCKET(sd);
@@ -1424,12 +1427,18 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
     guid = hs_msg.guid;
     if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
-        opal_output_verbose(20, opal_btl_base_framework.framework_output,
-                            "process did not receive right magic string. "
-                            "Local_host:%s PID:%d String_received:%s Test_fail:%s",
-                            opal_process_info.nodename,
-                            getpid(), hs_msg.magic_id,
-                            "string value");
+        const char *peer = opal_fd_get_peer_name(sd);
+        opal_show_help("help-mpi-btl-tcp.txt",
+                       "received incorrect magic id string",
+                       true,
+                       opal_process_info.nodename,
+                       getpid(),
+                       opal_version_string,
+                       peer,
+                       hs_msg.magic_id,
+                       mca_btl_tcp_magic_id_string);
+        free((char*) peer);
 
         /* The other side probably isn't OMPI, so just hang up */
         CLOSE_THE_SOCKET(sd);
         return;
@@ -1438,10 +1447,11 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
     if (sockopt) {
         /* reset RECVTIMEO option to its original state */
         if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save))) {
-            opal_output_verbose(20, opal_btl_base_framework.framework_output,
-                                "Cannot reset recv timeout value"
-                                "Local_host:%s PID:%d",
-                                opal_process_info.nodename, getpid());
+            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                           true, opal_process_info.nodename,
+                           getpid(),
+                           "setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
+                           strerror(opal_socket_errno), opal_socket_errno);
             return;
         }
     }
@@ -1492,24 +1502,9 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
     /* are there any existing peer instances willing to accept this connection */
     (void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd);
 
-    switch (addr.ss_family) {
-    case AF_INET:
-        inet_ntop(AF_INET, &(((struct sockaddr_in*) &addr)->sin_addr), str, sizeof(str));
-        break;
-#if OPAL_ENABLE_IPV6
-    case AF_INET6:
-        inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &addr)->sin6_addr), str, sizeof(str));
-        break;
-#endif
-    default:
-        BTL_ERROR(("Got an accept() from an unknown address family -- this shouldn't happen"));
-        CLOSE_THE_SOCKET(sd);
-        return;
-    }
-
+    const char *str = opal_fd_get_peer_name(sd);
     opal_output_verbose(10, opal_btl_base_framework.framework_output,
                         "btl:tcp: now connected to %s, process %s", str,
                         OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
+    free((char*) str);
 }
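
A note on the helper introduced above: opal_fd_get_peer_name(sd) replaces the open-coded inet_ntop() switch over the address family and, as the free((char*) peer) calls indicate, returns an allocated string that the caller owns. A minimal usage sketch (assuming the prototype is available via opal/util/fd.h):

    #include "opal/util/fd.h"

    const char *peer = opal_fd_get_peer_name(sd);   /* peer address rendered as a string */
    opal_output_verbose(10, opal_btl_base_framework.framework_output,
                        "btl:tcp: accepted connection from %s", peer);
    free((char*) peer);                             /* caller owns the returned string */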

View file

@@ -35,7 +35,7 @@ values are in the range [1 .. 2^16-1]. This value will be ignored
 WARNING: Open MPI failed to TCP connect to a peer MPI process.  This
 should not happen.
 
-Your Open MPI job may now fail.
+Your Open MPI job may now hang or fail.
 
   Local host: %s
   PID:        %d
@@ -46,7 +46,7 @@ Your Open MPI job may now fail.
 WARNING: Open MPI failed to handshake with a connecting peer MPI
 process over TCP.  This should not happen.
 
-Your Open MPI job may now fail.
+Your Open MPI job may now hang or fail.
 
   Local host: %s
   PID:        %d
@@ -102,8 +102,11 @@ hopefully be able to continue).
   Known IPs of peer: %s
 #
 [socket flag fail]
-WARNING: Open MPI failed to set flags on a TCP socket.  This should
-not happen.  It is likely that your MPI job will now fail.
+WARNING: Open MPI failed to get or set flags on a TCP socket.  This
+should not happen.
+
+This may cause unpredictable behavior, and may end up hanging or
+aborting your job.
 
   Local host: %s
   PID:        %d
@@ -164,4 +167,43 @@ Your Open MPI job may now fail.
   PID:     %d
   Message: %s
   Error:   %s (%d)
 #
+[did not receive full magic id string]
+The TCP BTL received an inbound socket connection from an unidentified
+peer.  This typically means one of two things:
+
+1. A non-Open MPI process tried to connect to this Open MPI process.
+2. An Open MPI process compiled against a different version of Open
+   MPI tried to connect to this Open MPI process.
+
+Open MPI only supports running exactly the same version between all
+processes in a single job.
+
+This may cause unpredictable behavior, and may end up aborting your
+job.
+
+  Local host:             %s
+  Local PID:              %d
+  Local Open MPI version: %s
+  Peer IP address:        %s
+#
+[received incorrect magic id string]
+The TCP BTL received an inbound socket connection from a peer that did
+not identify itself correctly as an Open MPI process.  This typically
+means one of two things:
+
+1. A non-Open MPI process tried to connect to this Open MPI process.
+2. An Open MPI process compiled against a different version of Open
+   MPI tried to connect to this Open MPI process.
+
+Open MPI only supports running exactly the same version between all
+processes in a single job.
+
+This may cause unpredictable behavior, and may end up hanging or
+aborting your job.
+
+  Local host:             %s
+  Local PID:              %d
+  Local Open MPI version: %s
+  Peer IP address:        %s
+  Peer identifier:        %s (expected %s)
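
For cross-reference, the %s/%d placeholders in these two new topics are filled, in order, by the opal_show_help() varargs in the btl_tcp_component.c hunks above; an illustrative excerpt for the second topic, matching the call in this commit:

    opal_show_help("help-mpi-btl-tcp.txt",
                   "received incorrect magic id string", true,
                   opal_process_info.nodename,    /* Local host              */
                   getpid(),                      /* Local PID               */
                   opal_version_string,           /* Local Open MPI version  */
                   peer,                          /* Peer IP address         */
                   hs_msg.magic_id,               /* Peer identifier         */
                   mca_btl_tcp_magic_id_string);  /* expected identifier     */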