diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 225e8a63c2..3738c394b3 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Oak Ridge National Laboratory * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights @@ -1363,7 +1363,6 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user) mca_btl_tcp_endpoint_hs_msg_t hs_msg; struct timeval save, tv; socklen_t rcvtimeo_save_len = sizeof(save); - char str[128]; /* Note, Socket will be in blocking mode during intial handshake * hence setting SO_RCVTIMEO to say 2 seconds here to avoid waiting @@ -1376,20 +1375,22 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user) if (ENOPROTOOPT == errno) { sockopt = false; } else { - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "Cannot get current recv timeout value of the socket" - "Local_host:%s PID:%d", - opal_process_info.nodename, getpid()); + opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail", + true, opal_process_info.nodename, + getpid(), + "getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)", + strerror(opal_socket_errno), opal_socket_errno); return; } } else { tv.tv_sec = 2; tv.tv_usec = 0; if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) { - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "Cannot set new recv timeout value of the socket" - "Local_host:%s PID:%d", - opal_process_info.nodename, getpid()); + opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail", + true, 
opal_process_info.nodename, + getpid(), + "setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)", + strerror(opal_socket_errno), opal_socket_errno); return; } } @@ -1408,14 +1409,16 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user) * This attempted connection will be ignored; your MPI job may or may not * continue properly. */ - if (sizeof(hs_msg) != retval) { - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "process did not receive full connect ACK " - "Local_host:%s PID:%d String_received:%s Test_fail:%s", - opal_process_info.nodename, - getpid(), - (retval > 0) ? hs_msg.magic_id : "", - "handshake message length"); + if (sizeof(hs_msg) != retval) { + const char *peer = opal_fd_get_peer_name(sd); + opal_show_help("help-mpi-btl-tcp.txt", + "did not receive full magic id string", + true, + opal_process_info.nodename, + getpid(), + opal_version_string, + peer); + free((char*) peer); /* The other side probably isn't OMPI, so just hang up */ CLOSE_THE_SOCKET(sd); @@ -1424,12 +1427,18 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user) guid = hs_msg.guid; if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) { - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "process did not receive right magic string. 
" - "Local_host:%s PID:%d String_received:%s Test_fail:%s", - opal_process_info.nodename, - getpid(), hs_msg.magic_id, - "string value"); + const char *peer = opal_fd_get_peer_name(sd); + opal_show_help("help-mpi-btl-tcp.txt", + "received incorrect magic id string", + true, + opal_process_info.nodename, + getpid(), + opal_version_string, + peer, + hs_msg.magic_id, + mca_btl_tcp_magic_id_string); + free((char*) peer); + /* The other side probably isn't OMPI, so just hang up */ CLOSE_THE_SOCKET(sd); return; @@ -1438,10 +1447,11 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user) if (sockopt) { /* reset RECVTIMEO option to its original state */ if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save))) { - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "Cannot reset recv timeout value" - "Local_host:%s PID:%d", - opal_process_info.nodename, getpid()); + opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail", + true, opal_process_info.nodename, + getpid(), + "setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)", + strerror(opal_socket_errno), opal_socket_errno); return; } } @@ -1492,24 +1502,9 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user) /* are there any existing peer instances willing to accept this connection */ (void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd); - switch (addr.ss_family) { - case AF_INET: - inet_ntop(AF_INET, &(((struct sockaddr_in*) &addr)->sin_addr), str, sizeof(str)); - break; - - #if OPAL_ENABLE_IPV6 - case AF_INET6: - inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &addr)->sin6_addr), str, sizeof(str)); - break; - #endif - - default: - BTL_ERROR(("Got an accept() from an unknown address family -- this shouldn't happen")); - CLOSE_THE_SOCKET(sd); - return; - - } + const char *str = opal_fd_get_peer_name(sd); opal_output_verbose(10, opal_btl_base_framework.framework_output, "btl:tcp: now connected to %s, process %s", str, 
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name)); + free((char*) str); } diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt index 320ee110ff..c5adbd7f32 100644 --- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -35,7 +35,7 @@ values are in the range [1 .. 2^16-1]. This value will be ignored WARNING: Open MPI failed to TCP connect to a peer MPI process. This should not happen. -Your Open MPI job may now fail. +Your Open MPI job may now hang or fail. Local host: %s PID: %d @@ -46,7 +46,7 @@ Your Open MPI job may now fail. WARNING: Open MPI failed to handshake with a connecting peer MPI process over TCP. This should not happen. -Your Open MPI job may now fail. +Your Open MPI job may now hang or fail. Local host: %s PID: %d @@ -102,8 +102,11 @@ hopefully be able to continue). Known IPs of peer: %s # [socket flag fail] -WARNING: Open MPI failed to set flags on a TCP socket. This should -not happen. It is likely that your MPI job will now fail. +WARNING: Open MPI failed to get or set flags on a TCP socket. This +should not happen. + +This may cause unpredictable behavior, and may end up hanging or +aborting your job. Local host: %s PID: %d @@ -164,4 +167,43 @@ Your Open MPI job may now fail. PID: %d Message: %s Error: %s (%d) -# \ No newline at end of file +# +[did not receive full magic id string] +The TCP BTL received an inbound socket connection from an unidentified +peer. This typically means one of two things: + +1. A non-Open MPI process tried to connect to this Open MPI process. +2. An Open MPI process compiled against a different version of Open + MPI tried to connect to this Open MPI process. + +Open MPI only supports running exactly the same version between all +processes in a single job. + +This may cause unpredictable behavior, and may end up aborting your +job. 
+ + Local host: %s + Local PID: %d + Local Open MPI version: %s + Peer IP address: %s +# +[received incorrect magic id string] +The TCP BTL received an inbound socket connection from a peer that did +not identify itself correctly as an Open MPI process. This typically +means one of two things: + +1. A non-Open MPI process tried to connect to this Open MPI process. +2. An Open MPI process compiled against a different version of Open + MPI tried to connect to this Open MPI process. + +Open MPI only supports running exactly the same version between all +processes in a single job. + +This may cause unpredictable behavior, and may end up hanging or +aborting your job. + + Local host: %s + Local PID: %d + Local Open MPI version: %s + Peer IP address: %s + Peer identifier: %s (expected %s)