1
1

Merge pull request #3955 from mohanasudhan/master

Btl tcp: Improved diagnostic output and failure mode
This commit is contained in:
Brian Barrett 2017-08-18 11:42:27 -07:00 committed by GitHub
parent b67b1e88a5 fc32ae401e
commit c667719a3f
7 changed files: 491 additions and 100 deletions

View file

@ -31,12 +31,14 @@
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/opal_socket_errno.h"
#include "btl_tcp.h"
#include "btl_tcp_frag.h"
#include "btl_tcp_proc.h"
#include "btl_tcp_endpoint.h"
mca_btl_tcp_module_t mca_btl_tcp_module = {
.super = {
.btl_component = &mca_btl_tcp_component.super,
@ -531,3 +533,68 @@ void mca_btl_tcp_dump(struct mca_btl_base_module_t* base_btl,
}
#endif /* OPAL_ENABLE_DEBUG && WANT_PEER_DUMP */
}
/*
* A blocking recv for both blocking and non-blocking socket.
* Used to receive the small amount of connection information
* that identifies the endpoints
*
* when the socket is blocking (the caller introduces timeout)
* which happens during initial handshake otherwise socket is
* non-blocking most of the time.
*/
/*
 * A blocking receive usable on both blocking and non-blocking
 * sockets.  Used to receive the small amount of connection
 * information that identifies the endpoints during the initial
 * handshake.
 *
 * When the socket is blocking the caller is expected to have armed a
 * receive timeout (SO_RCVTIMEO); in that mode EAGAIN/EWOULDBLOCK from
 * recv() means the timeout expired, so we must fail rather than
 * retry — retrying would silently defeat the caller's timeout and
 * spin forever against a peer that never sends.  On a non-blocking
 * socket (the common case after the handshake) EAGAIN/EWOULDBLOCK
 * merely means "no data yet" and we retry.
 *
 * Returns the number of bytes received (== size) on success, or -1
 * on error, timeout, or unexpected peer close.
 */
int mca_btl_tcp_recv_blocking(int sd, void* data, size_t size)
{
    unsigned char* ptr = (unsigned char*)data;
    size_t cnt = 0;
    while (cnt < size) {
        int retval = recv(sd, ((char *)ptr) + cnt, size - cnt, 0);
        /* remote closed connection */
        if (0 == retval) {
            BTL_ERROR(("remote peer unexpectedly closed connection while I was waiting for blocking message"));
            return -1;
        }
        if (retval < 0) {
            if (EINTR == opal_socket_errno) {
                /* interrupted by a signal: always safe to retry */
                continue;
            }
            if (EAGAIN == opal_socket_errno ||
                EWOULDBLOCK == opal_socket_errno) {
                /* On a non-blocking socket this means no data is
                 * available yet; spin.  On a blocking socket it means
                 * the SO_RCVTIMEO timeout set by the caller expired,
                 * so give up instead of looping forever. */
                int flags = fcntl(sd, F_GETFL, 0);
                if (flags >= 0 && !(flags & O_NONBLOCK)) {
                    BTL_ERROR(("recv(%d) timed out waiting for blocking message", sd));
                    return -1;
                }
                continue;
            }
            BTL_ERROR(("recv(%d) failed: %s (%d)", sd, strerror(opal_socket_errno), opal_socket_errno));
            return -1;
        }
        cnt += retval;
    }
    return cnt;
}
/*
* A blocking send on a non-blocking socket. Used to send the small
* amount of connection information that identifies the endpoints
* endpoint.
*/
/*
 * A blocking send on a non-blocking socket.  Used to send the small
 * amount of connection information that identifies the endpoint
 * during the initial handshake.
 *
 * Returns the number of bytes sent (== size) on success, or -1 on a
 * hard error.
 */
int mca_btl_tcp_send_blocking(int sd, const void* data, size_t size)
{
    const unsigned char* buf = (const unsigned char*) data;
    size_t sent = 0;
    while (sent < size) {
        int rc = send(sd, ((const char*) buf) + sent, size - sent, 0);
        if (rc >= 0) {
            sent += rc;
            continue;
        }
        /* Transient conditions on a non-blocking socket: retry. */
        if (EINTR == opal_socket_errno ||
            EAGAIN == opal_socket_errno ||
            EWOULDBLOCK == opal_socket_errno) {
            continue;
        }
        BTL_ERROR(("send() failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno));
        return -1;
    }
    return sent;
}

View file

@ -351,5 +351,23 @@ mca_btl_tcp_dump(struct mca_btl_base_module_t* btl,
*/
int mca_btl_tcp_ft_event(int state);
/*
* A blocking send on a non-blocking socket. Used to send the small
* amount of connection information that identifies the endpoints
* endpoint.
*/
int mca_btl_tcp_send_blocking(int sd, const void* data, size_t size);
/*
* A blocking recv for both blocking and non-blocking socket.
* Used to receive the small amount of connection information
* that identifies the endpoints
*
* when the socket is blocking (the caller introduces timeout)
* which happens during initial handshake otherwise socket is
* non-blocking most of the time.
*/
int mca_btl_tcp_recv_blocking(int sd, void* data, size_t size);
END_C_DECLS
#endif

View file

@ -54,6 +54,9 @@
#endif
#include <ctype.h>
#include <limits.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/mca/event/event.h"
#include "opal/util/ethtool.h"
@ -729,7 +732,9 @@ static int mca_btl_tcp_component_create_instances(void)
char* if_name = *argv;
int if_index = opal_ifnametokindex(if_name);
if(if_index < 0) {
BTL_ERROR(("invalid interface \"%s\"", if_name));
opal_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude",
true, "include", opal_process_info.nodename,
if_name, "Unknown interface name");
ret = OPAL_ERR_NOT_FOUND;
goto cleanup;
}
@ -856,13 +861,18 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
freeaddrinfo (res);
#ifdef IPV6_V6ONLY
/* in case of AF_INET6, disable v4-mapped addresses */
/* If this OS supports the "IPV6_V6ONLY" constant, then set it
on this socket. It specifies that *only* V6 connections
should be accepted on this socket (vs. allowing incoming
both V4 and V6 connections -- which is actually defined
behavior for V6<-->V4 interop stuff). See
https://github.com/open-mpi/ompi/commit/95d7e08a6617530d57b6700c57738b351bfccbf8 for some
more details. */
if (AF_INET6 == af_family) {
int flg = 1;
if (setsockopt (sd, IPPROTO_IPV6, IPV6_V6ONLY,
(char *) &flg, sizeof (flg)) < 0) {
opal_output(0,
"mca_btl_tcp_create_listen: unable to disable v4-mapped addresses\n");
BTL_ERROR((0, "mca_btl_tcp_create_listen: unable to set IPV6_V6ONLY\n"));
}
}
#endif /* IPV6_V6ONLY */
@ -904,6 +914,10 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
#else
((struct sockaddr_in*) &inaddr)->sin_port = htons(port + index);
#endif /* OPAL_ENABLE_IPV6 */
opal_output_verbose(30, opal_btl_base_framework.framework_output,
"btl:tcp: Attempting to bind to %s port %d",
(AF_INET == af_family) ? "AF_INET" : "AF_INET6",
port + index);
if(bind(sd, (struct sockaddr*)&inaddr, addrlen) < 0) {
if( (EADDRINUSE == opal_socket_errno) || (EADDRNOTAVAIL == opal_socket_errno) ) {
continue;
@ -913,6 +927,10 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
CLOSE_THE_SOCKET(sd);
return OPAL_ERROR;
}
opal_output_verbose(30, opal_btl_base_framework.framework_output,
"btl:tcp: Successfully bound to %s port %d",
(AF_INET == af_family) ? "AF_INET" : "AF_INET6",
port + index);
goto socket_binded;
}
#if OPAL_ENABLE_IPV6
@ -943,11 +961,19 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
if (AF_INET6 == af_family) {
mca_btl_tcp_component.tcp6_listen_port = ((struct sockaddr_in6*) &inaddr)->sin6_port;
mca_btl_tcp_component.tcp6_listen_sd = sd;
opal_output_verbose(30, opal_btl_base_framework.framework_output,
"btl:tcp: my listening v6 socket port is %d",
ntohs(mca_btl_tcp_component.tcp6_listen_port));
} else
#endif
{
char str[16];
mca_btl_tcp_component.tcp_listen_port = ((struct sockaddr_in*) &inaddr)->sin_port;
mca_btl_tcp_component.tcp_listen_sd = sd;
inet_ntop(AF_INET, &(((struct sockaddr_in*)&inaddr)->sin_addr), str, sizeof(str));
opal_output_verbose(30, opal_btl_base_framework.framework_output,
"btl:tcp: my listening v4 socket is %s:%u",
str, ntohs(mca_btl_tcp_component.tcp_listen_port));
}
/* setup listen backlog to maximum allowed by kernel */
@ -960,15 +986,20 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
/* set socket up to be non-blocking, otherwise accept could block */
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno));
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(), "fcntl(sd, F_GETFL, 0)",
strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd);
return OPAL_ERROR;
} else {
flags |= O_NONBLOCK;
if(fcntl(sd, F_SETFL, flags) < 0) {
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno));
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd);
return OPAL_ERROR;
}
@ -1081,6 +1112,7 @@ static int mca_btl_tcp_component_exchange(void)
size_t current_addr = 0;
if(mca_btl_tcp_component.tcp_num_btls != 0) {
char ifn[32];
mca_btl_tcp_addr_t *addrs = (mca_btl_tcp_addr_t *)malloc(size);
memset(addrs, 0, size);
@ -1098,6 +1130,9 @@ static int mca_btl_tcp_component_exchange(void)
continue;
}
opal_ifindextoname(index, ifn, sizeof(ifn));
opal_output_verbose(30, opal_btl_base_framework.framework_output,
"btl:tcp: examining interface %s", ifn);
if (OPAL_SUCCESS !=
opal_ifindextoaddr(index, (struct sockaddr*) &my_ss,
sizeof (my_ss))) {
@ -1121,6 +1156,8 @@ static int mca_btl_tcp_component_exchange(void)
addrs[current_addr].addr_ifkindex =
opal_ifindextokindex (index);
current_addr++;
opal_output_verbose(30, opal_btl_base_framework.framework_output,
"btl:tcp: using ipv4 interface %s", ifn);
} else
#endif
if ((AF_INET == my_ss.ss_family) &&
@ -1136,6 +1173,8 @@ static int mca_btl_tcp_component_exchange(void)
addrs[current_addr].addr_ifkindex =
opal_ifindextokindex (index);
current_addr++;
opal_output_verbose(30, opal_btl_base_framework.framework_output,
"btl:tcp: using ipv6 interface %s", ifn);
}
} /* end of for opal_ifbegin() */
} /* end of for tcp_num_btls */
@ -1299,45 +1338,159 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
struct sockaddr_storage addr;
opal_socklen_t addr_len = sizeof(addr);
mca_btl_tcp_proc_t* btl_proc;
int retval;
bool sockopt = true;
size_t retval, len = strlen(mca_btl_tcp_magic_id_string);
mca_btl_tcp_endpoint_hs_msg_t hs_msg;
struct timeval save, tv;
socklen_t rcvtimeo_save_len = sizeof(save);
char str[128];
/* Note, Socket will be in blocking mode during intial handshake
* hence setting SO_RCVTIMEO to say 2 seconds here to avoid chance
* of spin forever if it tries to connect to old version
* as older version will send just process id which won't be long enough
* to cross sizeof(str) length + process id struct
* or when the remote side isn't OMPI where it's not going to send
* any data*/
/* get the current timeout value so we can reset to it */
if (0 != getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, (void*)&save, &rcvtimeo_save_len)) {
if (ENOPROTOOPT == errno) {
sockopt = false;
} else {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot get current recv timeout value of the socket"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
return;
}
} else {
tv.tv_sec = 2;
tv.tv_usec = 0;
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot set new recv timeout value of the socket"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
return;
}
}
OBJ_RELEASE(event);
retval = mca_btl_tcp_recv_blocking(sd, (void *)&hs_msg, sizeof(hs_msg));
guid = hs_msg.guid;
/* recv the process identifier */
retval = recv(sd, (char *)&guid, sizeof(guid), 0);
if(retval != sizeof(guid)) {
/* An unknown process attempted to connect to Open MPI via TCP.
* Open MPI uses a "magic" string to trivially verify that the connecting
* process is a fellow Open MPI process. An MPI process accepted a TCP
* connection but did not receive the correct magic string. This might
* indicate an Open MPI version mismatch between different MPI processes
* in the same job, or it may indicate that some other agent is
* mistakenly attempting to connect to Open MPI's TCP listening sockets.
* This attempted connection will be ignored; your MPI job may or may not
* continue properly.
*/
if (sizeof(hs_msg) != retval) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"server did not receive entire connect ACK "
"Local_host:%s PID:%d Role:%s String_received:%s Test_fail:%s",
opal_process_info.nodename,
getpid(), "server",
(retval > 0) ? hs_msg.magic_id : "<nothing>",
"handshake message length");
/* The other side probably isn't OMPI, so just hang up */
CLOSE_THE_SOCKET(sd);
return;
}
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"server did not receive right magic string. "
"Local_host:%s PID:%d Role:%s String_received:%s Test_fail:%s",
opal_process_info.nodename,
getpid(), "server", hs_msg.magic_id,
"string value");
/* The other side probably isn't OMPI, so just hang up */
CLOSE_THE_SOCKET(sd);
return;
}
if (sockopt) {
/* reset RECVTIMEO option to its original state */
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save))) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot reset recv timeout value"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
return;
}
}
OPAL_PROCESS_NAME_NTOH(guid);
/* now set socket up to be non-blocking */
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno));
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(), "fcntl(sd, F_GETFL, 0)",
strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd);
} else {
flags |= O_NONBLOCK;
if(fcntl(sd, F_SETFL, flags) < 0) {
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno));
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd);
}
}
/* lookup the corresponding process */
btl_proc = mca_btl_tcp_proc_lookup(&guid);
if(NULL == btl_proc) {
opal_show_help("help-mpi-btl-tcp.txt",
"server accept cannot find guid",
true, opal_process_info.nodename,
getpid());
CLOSE_THE_SOCKET(sd);
return;
}
/* lookup peer address */
if(getpeername(sd, (struct sockaddr*)&addr, &addr_len) != 0) {
BTL_ERROR(("getpeername() failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno));
opal_show_help("help-mpi-btl-tcp.txt",
"server getpeername failed",
true, opal_process_info.nodename,
getpid(),
strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd);
return;
}
/* are there any existing peer instances willing to accept this connection */
(void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd);
switch (addr.ss_family) {
case AF_INET:
inet_ntop(AF_INET, &(((struct sockaddr_in*) &addr)->sin_addr), str, sizeof(str));
break;
#if OPAL_ENABLE_IPV6
case AF_INET6:
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &addr)->sin6_addr), str, sizeof(str));
break;
#endif
default:
BTL_ERROR(("Got an accept() from an unknown address family -- this shouldn't happen"));
CLOSE_THE_SOCKET(sd);
return;
}
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: now connected to %s, process %s", str,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
}

View file

@ -62,6 +62,11 @@
#include "btl_tcp_frag.h"
#include "btl_tcp_addr.h"
/*
* Magic ID string send during connect/accept handshake
*/
const char mca_btl_tcp_magic_id_string[MCA_BTL_TCP_MAGIC_STRING_LENGTH] = "OPAL-TCP-BTL";
/*
* Initialize state of the endpoint instance.
@ -371,48 +376,42 @@ int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tcp
/*
* A blocking send on a non-blocking socket. Used to send the small amount of connection
* information that identifies the endpoints endpoint.
* A blocking send on a non-blocking socket. Used to send the small
* amount of connection information that identifies the endpoints endpoint.
*/
static int
mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
void* data, size_t size)
const void* data, size_t size)
{
unsigned char* ptr = (unsigned char*)data;
size_t cnt = 0;
while(cnt < size) {
int retval = send(btl_endpoint->endpoint_sd, (const char *)ptr+cnt, size-cnt, 0);
if(retval < 0) {
if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
BTL_ERROR(("send(%d, %p, %lu/%lu) failed: %s (%d)",
btl_endpoint->endpoint_sd, data, cnt, size,
strerror(opal_socket_errno), opal_socket_errno));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return -1;
}
continue;
}
cnt += retval;
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
if (ret < 0) {
mca_btl_tcp_endpoint_close(btl_endpoint);
}
return cnt;
return ret;
}
/*
* Send the globally unique identifier for this process to a endpoint on
* a newly connected socket.
*/
static int mca_btl_tcp_endpoint_send_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
static int
mca_btl_tcp_endpoint_send_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
{
/* send process identifier to remote endpoint */
opal_process_name_t guid = opal_proc_local_get()->proc_name;
OPAL_PROCESS_NAME_HTON(guid);
if(mca_btl_tcp_endpoint_send_blocking(btl_endpoint, &guid, sizeof(guid)) !=
sizeof(guid)) {
return OPAL_ERR_UNREACH;
mca_btl_tcp_endpoint_hs_msg_t hs_msg;
strcpy(hs_msg.magic_id, mca_btl_tcp_magic_id_string);
hs_msg.guid = guid;
if(sizeof(hs_msg) !=
mca_btl_tcp_endpoint_send_blocking(btl_endpoint,
&hs_msg, sizeof(hs_msg))) {
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
true, opal_process_info.nodename,
sizeof(hs_msg),
"connect ACK failed to send magic-id and guid");
return OPAL_ERR_UNREACH;
}
return OPAL_SUCCESS;
}
@ -573,31 +572,11 @@ static void mca_btl_tcp_endpoint_connected(mca_btl_base_endpoint_t* btl_endpoint
*/
static int mca_btl_tcp_endpoint_recv_blocking(mca_btl_base_endpoint_t* btl_endpoint, void* data, size_t size)
{
unsigned char* ptr = (unsigned char*)data;
size_t cnt = 0;
while(cnt < size) {
int retval = recv(btl_endpoint->endpoint_sd, (char *)ptr+cnt, size-cnt, 0);
/* remote closed connection */
if(retval == 0) {
mca_btl_tcp_endpoint_close(btl_endpoint);
return cnt;
}
/* socket is non-blocking so handle errors */
if(retval < 0) {
if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
BTL_ERROR(("recv(%d, %lu/%lu) failed: %s (%d)",
btl_endpoint->endpoint_sd, cnt, size, strerror(opal_socket_errno), opal_socket_errno));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return -1;
}
continue;
}
cnt += retval;
int ret = mca_btl_tcp_recv_blocking(btl_endpoint->endpoint_sd, data, size);
if (ret <= 0) {
mca_btl_tcp_endpoint_close(btl_endpoint);
}
return cnt;
return ret;
}
@ -605,17 +584,22 @@ static int mca_btl_tcp_endpoint_recv_blocking(mca_btl_base_endpoint_t* btl_endpo
* Receive the endpoints globally unique process identification from a newly
* connected socket and verify the expected response. If so, move the
* socket to a connected state.
*
* NOTE: The return codes from this function are checked in
* mca_btl_tcp_endpoint_recv_handler(). Don't change them here
* without also changing the handling in _recv_handler()!
*/
static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
{
size_t s;
opal_process_name_t guid;
size_t retval, len = strlen(mca_btl_tcp_magic_id_string);;
mca_btl_tcp_proc_t* btl_proc = btl_endpoint->endpoint_proc;
opal_process_name_t guid;
s = mca_btl_tcp_endpoint_recv_blocking(btl_endpoint,
&guid, sizeof(opal_process_name_t));
if (s != sizeof(opal_process_name_t)) {
if (0 == s) {
mca_btl_tcp_endpoint_hs_msg_t hs_msg;
retval = mca_btl_tcp_endpoint_recv_blocking(btl_endpoint, &hs_msg, sizeof(hs_msg));
if (sizeof(hs_msg) != retval) {
if (0 == retval) {
/* If we get zero bytes, the peer closed the socket. This
can happen when the two peers started the connection
protocol simultaneously. Just report the problem
@ -624,10 +608,19 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
}
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
true, opal_process_info.nodename,
getpid(),
"did not receive entire connect ACK from peer");
return OPAL_ERR_UNREACH;
getpid(), "did not receive entire connect ACK from peer");
return OPAL_ERR_BAD_PARAM;
}
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
opal_show_help("help-mpi-btl-tcp.txt", "server did not receive magic string",
true, opal_process_info.nodename,
getpid(), "client", hs_msg.magic_id,
"string value");
return OPAL_ERR_BAD_PARAM;
}
guid = hs_msg.guid;
OPAL_PROCESS_NAME_NTOH(guid);
/* compare this to the expected values */
/* TODO: this deserve a little bit more thinking as we are not supposed
@ -708,25 +701,39 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
/* setup the socket as non-blocking */
if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno));
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(), "fcntl(sd, F_GETFL, 0)",
strerror(opal_socket_errno), opal_socket_errno);
/* Upper layer will handle the error */
return OPAL_ERR_UNREACH;
} else {
flags |= O_NONBLOCK;
if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno));
if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) {
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
strerror(opal_socket_errno), opal_socket_errno);
/* Upper layer will handle the error */
return OPAL_ERR_UNREACH;
}
}
/* start the connect - will likely fail with EINPROGRESS */
mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);
opal_output_verbose(20, opal_btl_base_framework.framework_output,
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl: tcp: attempting to connect() to %s address %s on port %d",
OPAL_NAME_PRINT(btl_endpoint->endpoint_proc->proc_opal->proc_name),
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
ntohs(btl_endpoint->endpoint_addr->addr_port));
if(0 == connect(btl_endpoint->endpoint_sd, (struct sockaddr*)&endpoint_addr, addrlen)) {
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: connect() to %s:%d completed",
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
/* send our globally unique process identifier to the endpoint */
if((rc = mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) == OPAL_SUCCESS) {
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
@ -742,6 +749,8 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "event_add(send) [start_connect]");
MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0);
opal_output_verbose(30, opal_btl_base_framework.framework_output,
"btl:tcp: would block, so allowing background progress");
return OPAL_SUCCESS;
}
}
@ -765,7 +774,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
* later. Otherwise, send this processes identifier to the endpoint on the
* newly connected socket.
*/
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
int so_error = 0;
opal_socklen_t so_length = sizeof(so_error);
@ -781,32 +790,49 @@ static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_e
/* check connect completion status */
if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
BTL_ERROR(("getsockopt() to %s failed: %s (%d)",
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(), "fcntl(sd, F_GETFL, 0)",
strerror(opal_socket_errno), opal_socket_errno);
BTL_ERROR(("getsockopt() to %s:%d failed: %s (%d)",
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
((struct sockaddr_in*) &endpoint_addr)->sin_port,
strerror(opal_socket_errno), opal_socket_errno));
mca_btl_tcp_endpoint_close(btl_endpoint);
return;
return OPAL_ERROR;
}
if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
return;
return OPAL_SUCCESS;
}
if(so_error != 0) {
BTL_ERROR(("connect() to %s failed: %s (%d)",
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
strerror(so_error), so_error));
char *msg;
asprintf(&msg, "connect() to %s:%d failed",
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
opal_show_help("help-mpi-btl-tcp.txt", "client connect fail",
true, opal_process_info.nodename,
getpid(), msg,
strerror(opal_socket_errno), opal_socket_errno);
free(msg);
mca_btl_tcp_endpoint_close(btl_endpoint);
return;
return OPAL_ERROR;
}
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: connect() to %s:%d completed (complete_connect), sending connect ACK",
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) {
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]");
return;
return OPAL_SUCCESS;
}
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]");
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERROR;
}
@ -855,6 +881,26 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
}
else if (OPAL_ERR_BAD_PARAM == rc) {
/* If we get a BAD_PARAM, it means that it probably wasn't
an OMPI process on the other end of the socket (e.g.,
the magic string ID failed). So we can probably just
close the socket and ignore this connection. */
CLOSE_THE_SOCKET(sd);
}
else {
/* Otherwise, it probably *was* an OMPI peer process on
the other end, and something bad has probably
happened. */
mca_btl_tcp_module_t *m = btl_endpoint->endpoint_btl;
/* Fail up to the PML */
if (NULL != m->tcp_error_cb) {
m->tcp_error_cb((mca_btl_base_module_t*) m, MCA_BTL_ERROR_FLAGS_FATAL,
btl_endpoint->endpoint_proc->proc_opal,
"TCP ACK is neither SUCCESS nor ERR (something bad has probably happened)");
}
}
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
return;
}

View file

@ -26,7 +26,7 @@
BEGIN_C_DECLS
#define MCA_BTL_TCP_ENDPOINT_CACHE 1
#define MCA_BTL_TCP_MAGIC_STRING_LENGTH 16
/**
* State of TCP endpoint connection.
*/
@ -75,6 +75,14 @@ typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_tcp_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_tcp_endpoint_t);
/* Magic socket handshake string */
extern const char mca_btl_tcp_magic_id_string[MCA_BTL_TCP_MAGIC_STRING_LENGTH];
typedef struct {
opal_process_name_t guid;
char magic_id[MCA_BTL_TCP_MAGIC_STRING_LENGTH];
} mca_btl_tcp_endpoint_hs_msg_t;
void mca_btl_tcp_set_socket_options(int sd);
void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t*);
int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t*, struct mca_btl_tcp_frag_t*);

View file

@ -421,6 +421,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
mca_btl_tcp_proc_data_t _proc_data, *proc_data=&_proc_data;
size_t max_peer_interfaces;
memset(proc_data, 0, sizeof(mca_btl_tcp_proc_data_t));
char str_local[128], str_remote[128];
if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
return OPAL_ERR_UNREACH;
@ -508,10 +509,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
default:
opal_output(0, "unknown address family for tcp: %d\n",
endpoint_addr_ss.ss_family);
/*
* return OPAL_UNREACH or some error, as this is not
* good
*/
return OPAL_ERR_UNREACH;
}
}
@ -553,14 +551,26 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
if(NULL != proc_data->local_interfaces[i]->ipv4_address &&
NULL != peer_interfaces[j]->ipv4_address) {
/* Convert the IPv4 addresses into nicely-printable strings for verbose debugging output */
inet_ntop(AF_INET, &(((struct sockaddr_in*) proc_data->local_interfaces[i]->ipv4_address))->sin_addr,
str_local, sizeof(str_local));
inet_ntop(AF_INET, &(((struct sockaddr_in*) peer_interfaces[j]->ipv4_address))->sin_addr,
str_remote, sizeof(str_remote));
if(opal_net_addr_isipv4public((struct sockaddr*) local_interface->ipv4_address) &&
opal_net_addr_isipv4public((struct sockaddr*) peer_interfaces[j]->ipv4_address)) {
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
(struct sockaddr*) peer_interfaces[j]->ipv4_address,
local_interface->ipv4_netmask)) {
proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV4 PUBLIC SAME NETWORK",
str_local, str_remote);
} else {
proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV4 PUBLIC DIFFERENT NETWORK",
str_local, str_remote);
}
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
continue;
@ -569,8 +579,14 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
(struct sockaddr*) peer_interfaces[j]->ipv4_address,
local_interface->ipv4_netmask)) {
proc_data->weights[i][j] = CQ_PRIVATE_SAME_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV4 PRIVATE SAME NETWORK",
str_local, str_remote);
} else {
proc_data->weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV4 PRIVATE DIFFERENT NETWORK",
str_local, str_remote);
}
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
continue;
@ -582,12 +598,24 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
if(NULL != local_interface->ipv6_address &&
NULL != peer_interfaces[j]->ipv6_address) {
/* Convert the IPv6 addresses into nicely-printable strings for verbose debugging output */
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) local_interface->ipv6_address))->sin6_addr,
str_local, sizeof(str_local));
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) peer_interfaces[j]->ipv6_address))->sin6_addr,
str_remote, sizeof(str_remote));
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv6_address,
(struct sockaddr*) peer_interfaces[j]->ipv6_address,
local_interface->ipv6_netmask)) {
proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV6 PUBLIC SAME NETWORK",
str_local, str_remote);
} else {
proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"btl:tcp: path from %s to %s: IPV6 PUBLIC DIFFERENT NETWORK",
str_local, str_remote);
}
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv6_endpoint_addr;
continue;
@ -660,6 +688,12 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
rc = OPAL_SUCCESS;
}
}
if (OPAL_ERR_UNREACH == rc) {
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: host %s, process %s UNREACHABLE",
proc_hostname,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
}
for(i = 0; i < perm_size; ++i) {
free(proc_data->weights[i]);
@ -716,7 +750,7 @@ int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_
OBJ_RELEASE(btl_proc);
return OPAL_SUCCESS;
}
/* The endpoint_addr may still be NULL if this enpoint is
/* The endpoint_addr may still be NULL if this endpoint is
being removed early in the wireup sequence (e.g., if it
is unreachable by all other procs) */
if (NULL != btl_endpoint->endpoint_addr) {

View file

@ -100,3 +100,68 @@ hopefully be able to continue).
Peer hostname: %s (%s)
Source IP of socket: %s
Known IPs of peer: %s
#
[socket flag fail]
WARNING: Open MPI failed to set flags on a TCP socket. This should
not happen. It is likely that your MPI job will now fail.
Local host: %s
PID: %d
Flag: %s
Error: %s (%d)
#
[server did not get guid]
WARNING: Open MPI accepted a TCP connection from what appears to be
another Open MPI process but the peer process did not complete the
initial handshake properly. This should not happen.
This attempted connection will be ignored; your MPI job may or may not
continue properly.
Local host: %s
PID: %d
#
[server accept cannot find guid]
WARNING: Open MPI accepted a TCP connection from what appears to be
another Open MPI process but cannot find a corresponding process
entry for that peer.
This attempted connection will be ignored; your MPI job may or may not
continue properly.
Local host: %s
PID: %d
#
[server getpeername failed]
WARNING: Open MPI failed to look up the peer IP address information of
a TCP connection that it just accepted. This should not happen.
This attempted connection will be ignored; your MPI job may or may not
continue properly.
Local host: %s
PID: %d
Error: %s (%d)
#
[server cannot find endpoint]
WARNING: Open MPI accepted a TCP connection from what appears to be
valid peer Open MPI process but cannot find a corresponding endpoint
entry for that peer. This should not happen.
This attempted connection will be ignored; your MPI job may or may not
continue properly.
Local host: %s
PID: %d
#
[client connect fail]
WARNING: Open MPI failed to TCP connect to a peer MPI process via
TCP. This should not happen.
Your Open MPI job may now fail.
Local host: %s
PID: %d
Message: %s
Error: %s (%d)
#