1
1

Merge pull request #4942 from jsquyres/pr/tcp-btl-help-message-updates

TCP help message updates
This commit is contained in:
Jeff Squyres 2018-03-22 08:53:04 -05:00 committed by GitHub
commit 023a4a82d3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 188 additions and 62 deletions

View File

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Oak Ridge National Laboratory
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
@ -1363,7 +1363,6 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
mca_btl_tcp_endpoint_hs_msg_t hs_msg;
struct timeval save, tv;
socklen_t rcvtimeo_save_len = sizeof(save);
char str[128];
/* Note, Socket will be in blocking mode during initial handshake
* hence setting SO_RCVTIMEO to say 2 seconds here to avoid waiting
@ -1376,20 +1375,22 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
if (ENOPROTOOPT == errno) {
sockopt = false;
} else {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot get current recv timeout value of the socket"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
} else {
tv.tv_sec = 2;
tv.tv_usec = 0;
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot set new recv timeout value of the socket"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
}
@ -1408,14 +1409,16 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
* This attempted connection will be ignored; your MPI job may or may not
* continue properly.
*/
if (sizeof(hs_msg) != retval) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"process did not receive full connect ACK "
"Local_host:%s PID:%d String_received:%s Test_fail:%s",
opal_process_info.nodename,
getpid(),
(retval > 0) ? hs_msg.magic_id : "<nothing>",
"handshake message length");
if (sizeof(hs_msg) != retval) {
const char *peer = opal_fd_get_peer_name(sd);
opal_show_help("help-mpi-btl-tcp.txt",
"did not receive full magic id string",
true,
opal_process_info.nodename,
getpid(),
opal_version_string,
peer);
free((char*) peer);
/* The other side probably isn't OMPI, so just hang up */
CLOSE_THE_SOCKET(sd);
@ -1424,12 +1427,18 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
guid = hs_msg.guid;
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"process did not receive right magic string. "
"Local_host:%s PID:%d String_received:%s Test_fail:%s",
opal_process_info.nodename,
getpid(), hs_msg.magic_id,
"string value");
const char *peer = opal_fd_get_peer_name(sd);
opal_show_help("help-mpi-btl-tcp.txt",
"received incorrect magic id string",
true,
opal_process_info.nodename,
getpid(),
opal_version_string,
peer,
hs_msg.magic_id,
mca_btl_tcp_magic_id_string);
free((char*) peer);
/* The other side probably isn't OMPI, so just hang up */
CLOSE_THE_SOCKET(sd);
return;
@ -1438,10 +1447,11 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
if (sockopt) {
/* reset RECVTIMEO option to its original state */
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save))) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot reset recv timeout value"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
}
@ -1492,24 +1502,9 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
/* are there any existing peer instances willing to accept this connection */
(void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd);
switch (addr.ss_family) {
case AF_INET:
inet_ntop(AF_INET, &(((struct sockaddr_in*) &addr)->sin_addr), str, sizeof(str));
break;
#if OPAL_ENABLE_IPV6
case AF_INET6:
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &addr)->sin6_addr), str, sizeof(str));
break;
#endif
default:
BTL_ERROR(("Got an accept() from an unknown address family -- this shouldn't happen"));
CLOSE_THE_SOCKET(sd);
return;
}
const char *str = opal_fd_get_peer_name(sd);
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: now connected to %s, process %s", str,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
free((char*) str);
}

View File

@ -35,7 +35,7 @@ values are in the range [1 .. 2^16-1]. This value will be ignored
WARNING: Open MPI failed to TCP connect to a peer MPI process. This
should not happen.
Your Open MPI job may now fail.
Your Open MPI job may now hang or fail.
Local host: %s
PID: %d
@ -46,7 +46,7 @@ Your Open MPI job may now fail.
WARNING: Open MPI failed to handshake with a connecting peer MPI
process over TCP. This should not happen.
Your Open MPI job may now fail.
Your Open MPI job may now hang or fail.
Local host: %s
PID: %d
@ -102,8 +102,11 @@ hopefully be able to continue).
Known IPs of peer: %s
#
[socket flag fail]
WARNING: Open MPI failed to set flags on a TCP socket. This should
not happen. It is likely that your MPI job will now fail.
WARNING: Open MPI failed to get or set flags on a TCP socket. This
should not happen.
This may cause unpredictable behavior, and may end up hanging or
aborting your job.
Local host: %s
PID: %d
@ -164,4 +167,43 @@ Your Open MPI job may now fail.
PID: %d
Message: %s
Error: %s (%d)
#
#
[did not receive full magic id string]
The TCP BTL received an inbound socket connection from an unidentified
peer. This typically means one of two things:
1. A non-Open MPI process tried to connect to this Open MPI process.
2. An Open MPI process compiled against a different version of Open
MPI tried to connect to this Open MPI process.
Open MPI only supports running exactly the same version between all
processes in a single job.
This may cause unpredictable behavior, and may end up aborting your
job.
Local host: %s
Local PID: %d
Local Open MPI version: %s
Peer IP address: %s
#
[received incorrect magic id string]
The TCP BTL received an inbound socket connection from a peer that did
not identify itself correctly as an Open MPI process. This typically
means one of two things:
1. A non-Open MPI process tried to connect to this Open MPI process.
2. An Open MPI process compiled against a different version of Open
MPI tried to connect to this Open MPI process.
Open MPI only supports running exactly the same version between all
processes in a single job.
This may cause unpredictable behavior, and may end up hanging or
aborting your job.
Local host: %s
Local PID: %d
Local Open MPI version: %s
Peer IP address: %s
Peer identifier: %s (expected %s)

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
*
@ -18,13 +18,22 @@
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include "opal/util/fd.h"
#include "opal/constants.h"
@ -126,3 +135,49 @@ bool opal_fd_is_blkdev(int fd)
return S_ISBLK(buf.st_mode);
}
/*
 * Return a printable name for the peer connected to socket fd.
 *
 * The result is always a single heap allocation that the caller must
 * free().  It holds the peer's numeric address as formatted by
 * inet_ntop() for AF_INET (and AF_INET6 when OPAL_ENABLE_IPV6 is set),
 * or the literal string "Unknown" when getpeername() fails, the address
 * family is not handled, or inet_ntop() fails.  NULL is returned only
 * when the allocation itself fails.
 */
const char *opal_fd_get_peer_name(int fd)
{
    static const char unknown[] = "Unknown";
    const char *ret = NULL;

    /* Use sockaddr_storage rather than a plain sockaddr: the latter is
     * too small to hold a sockaddr_in6, so getpeername() would hand
     * back a truncated IPv6 address. */
    struct sockaddr_storage ss;
    socklen_t ss_len = (socklen_t) sizeof(ss);

    size_t len = INET_ADDRSTRLEN;
#if OPAL_ENABLE_IPV6
    len = INET6_ADDRSTRLEN;
#endif
    /* The same buffer doubles as storage for the fallback string. */
    if (len < sizeof(unknown)) {
        len = sizeof(unknown);
    }

    char *str = malloc(len);
    if (NULL == str) {
        return NULL;
    }

    memset(&ss, 0, sizeof(ss));
    if (0 == getpeername(fd, (struct sockaddr *) &ss, &ss_len)) {
        if (AF_INET == ss.ss_family) {
            struct sockaddr_in *si = (struct sockaddr_in *) &ss;
            ret = inet_ntop(AF_INET, &(si->sin_addr), str, (socklen_t) len);
        }
#if OPAL_ENABLE_IPV6
        else if (AF_INET6 == ss.ss_family) {
            struct sockaddr_in6 *si6 = (struct sockaddr_in6 *) &ss;
            ret = inet_ntop(AF_INET6, &(si6->sin6_addr), str, (socklen_t) len);
        }
#endif
    }

    /* Any failure above (no peer, unhandled family, formatting error)
     * leaves ret NULL; report "Unknown" from the buffer we already own
     * instead of leaking it or returning NULL to a "%s" consumer. */
    if (NULL == ret) {
        memcpy(str, unknown, sizeof(unknown));
    }

    return str;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
*
@ -94,6 +94,15 @@ OPAL_DECLSPEC bool opal_fd_is_chardev(int fd);
*/
OPAL_DECLSPEC bool opal_fd_is_blkdev(int fd);
/**
* Convenience function to get a printable name for the peer on the
* other end of this internet socket.
*
* No name resolution is performed: the peer address is obtained with
* getpeername() and formatted numerically with inet_ntop().
*
* @param fd File descriptor of an AF_INET/AF_INET6 socket
*
* @returns A newly allocated string holding the peer's numeric IP
* address (e.g., "a.b.c.d"), or the string "Unknown" if the peer
* address cannot be obtained or its address family is not handled.
* May be NULL if the string could not be produced. This string must
* be freed by the caller.
*/
OPAL_DECLSPEC const char *opal_fd_get_peer_name(int fd);
END_C_DECLS

View File

@ -11,7 +11,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -106,10 +106,29 @@ levels.
Remote host: %s
Remote port: %d
The connection was rejected.
#
[static-fwd]
Static ports were requested while orte_fwd_mpirun_port was set.
Both options cannot be simultaneously set. Please either set
orte_fwd_mpirun_port=false or remove any static port directives.
#
[version mismatch]
Open MPI detected a mismatch in versions between two processes. This
typically means that you executed "mpirun" (or "mpiexec") from one
version of Open MPI on one node, but your default path on one of the
other nodes upon which you launched found a different version of Open
MPI.
Open MPI only supports running exactly the same version between all
processes in a single job.
This will almost certainly cause unpredictable behavior, and may end
up aborting your job.
Local host: %s
Local process name: %s
Local Open MPI version: %s
Peer host: %s
Peer process name: %s
Peer Open MPI version: %s

View File

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
@ -58,6 +58,7 @@
#include "opal/util/net.h"
#include "opal/util/fd.h"
#include "opal/util/error.h"
#include "opal/util/show_help.h"
#include "opal/class/opal_hash_table.h"
#include "opal/mca/event/event.h"
@ -701,6 +702,7 @@ static bool retry(mca_oob_tcp_peer_t* peer, int sd, bool fatal)
}
}
int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr,
int sd, mca_oob_tcp_hdr_t *dhdr)
{
@ -890,11 +892,15 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr,
version = (char*)((char*)msg + offset);
offset += strlen(version) + 1;
if (0 != strcmp(version, orte_version_string)) {
opal_output(0, "%s tcp_peer_recv_connect_ack: "
"received different version from %s: %s instead of %s\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
version, orte_version_string);
opal_show_help("help-oob-tcp.txt", "version mismatch",
true,
opal_process_info.nodename,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_version_string,
opal_fd_get_peer_name(peer->sd),
ORTE_NAME_PRINT(&(peer->name)),
version);
peer->state = MCA_OOB_TCP_FAILED;
mca_oob_tcp_peer_close(peer);
free(msg);