Merge pull request #3955 from mohanasudhan/master
Btl tcp: Improved diagnostic output and failure mode
This commit is contained in:
Commit c667719a3f
@ -31,12 +31,14 @@
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/opal_socket_errno.h"

#include "btl_tcp.h"
#include "btl_tcp_frag.h"
#include "btl_tcp_proc.h"
#include "btl_tcp_endpoint.h"


mca_btl_tcp_module_t mca_btl_tcp_module = {
    .super = {
        .btl_component = &mca_btl_tcp_component.super,
@ -531,3 +533,68 @@ void mca_btl_tcp_dump(struct mca_btl_base_module_t* base_btl,
    }
#endif /* OPAL_ENABLE_DEBUG && WANT_PEER_DUMP */
}


/*
 * A blocking recv for both blocking and non-blocking sockets.
 * Used to receive the small amount of connection information
 * that identifies the endpoints.
 *
 * The socket is only blocking during the initial handshake
 * (where the caller enforces a timeout); otherwise it is
 * non-blocking most of the time.
 */

int mca_btl_tcp_recv_blocking(int sd, void* data, size_t size)
{
    unsigned char* ptr = (unsigned char*)data;
    size_t cnt = 0;
    while (cnt < size) {
        int retval = recv(sd, ((char *)ptr) + cnt, size - cnt, 0);

        /* remote closed connection */
        if (0 == retval) {
            BTL_ERROR(("remote peer unexpectedly closed connection while I was waiting for a blocking message"));
            return -1;
        }

        /* socket is non-blocking so handle errors */
        if (retval < 0) {
            if (opal_socket_errno != EINTR &&
                opal_socket_errno != EAGAIN &&
                opal_socket_errno != EWOULDBLOCK) {
                BTL_ERROR(("recv(%d) failed: %s (%d)", sd, strerror(opal_socket_errno), opal_socket_errno));
                return -1;
            }
            continue;
        }
        cnt += retval;
    }
    return cnt;
}


/*
 * A blocking send on a non-blocking socket. Used to send the small
 * amount of connection information that identifies the endpoints.
 */

int mca_btl_tcp_send_blocking(int sd, const void* data, size_t size)
{
    unsigned char* ptr = (unsigned char*)data;
    size_t cnt = 0;
    while(cnt < size) {
        int retval = send(sd, ((const char *)ptr) + cnt, size - cnt, 0);
        if (retval < 0) {
            if (opal_socket_errno != EINTR &&
                opal_socket_errno != EAGAIN &&
                opal_socket_errno != EWOULDBLOCK) {
                BTL_ERROR(("send() failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno));
                return -1;
            }
            continue;
        }
        cnt += retval;
    }
    return cnt;
}
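Both helpers loop until the entire buffer has been transferred, retrying on EINTR/EAGAIN/EWOULDBLOCK. A minimal, hypothetical usage sketch (not part of this commit; the record layout and function name are invented for illustration) showing how a fixed-size handshake record could be exchanged over an already-connected socket with these helpers:

#include <stdint.h>
#include <string.h>

struct example_hello {
    char     tag[16];   /* placeholder identifier string */
    uint64_t id;        /* placeholder per-process id */
};

static int example_exchange_hello(int sd, uint64_t my_id)
{
    struct example_hello out, in;

    memset(&out, 0, sizeof(out));
    strncpy(out.tag, "EXAMPLE-TAG", sizeof(out.tag) - 1);
    out.id = my_id;

    /* both helpers return the number of bytes transferred, or -1 on error */
    if (sizeof(out) != (size_t)mca_btl_tcp_send_blocking(sd, &out, sizeof(out))) {
        return -1;
    }
    if (sizeof(in) != (size_t)mca_btl_tcp_recv_blocking(sd, &in, sizeof(in))) {
        return -1;
    }
    return 0;
}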
@ -351,5 +351,23 @@ mca_btl_tcp_dump(struct mca_btl_base_module_t* btl,
 */
int mca_btl_tcp_ft_event(int state);

/*
 * A blocking send on a non-blocking socket. Used to send the small
 * amount of connection information that identifies the endpoints.
 */
int mca_btl_tcp_send_blocking(int sd, const void* data, size_t size);

/*
 * A blocking recv for both blocking and non-blocking sockets.
 * Used to receive the small amount of connection information
 * that identifies the endpoints.
 *
 * The socket is only blocking during the initial handshake
 * (where the caller enforces a timeout); otherwise it is
 * non-blocking most of the time.
 */
int mca_btl_tcp_recv_blocking(int sd, void* data, size_t size);

END_C_DECLS
#endif
@ -54,6 +54,9 @@
#endif
#include <ctype.h>
#include <limits.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif

#include "opal/mca/event/event.h"
#include "opal/util/ethtool.h"
@ -729,7 +732,9 @@ static int mca_btl_tcp_component_create_instances(void)
        char* if_name = *argv;
        int if_index = opal_ifnametokindex(if_name);
        if(if_index < 0) {
            BTL_ERROR(("invalid interface \"%s\"", if_name));
            opal_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude",
                           true, "include", opal_process_info.nodename,
                           if_name, "Unknown interface name");
            ret = OPAL_ERR_NOT_FOUND;
            goto cleanup;
        }
@ -856,13 +861,18 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
    freeaddrinfo (res);

#ifdef IPV6_V6ONLY
    /* in case of AF_INET6, disable v4-mapped addresses */
    /* If this OS supports the "IPV6_V6ONLY" constant, then set it
       on this socket.  It specifies that *only* V6 connections
       should be accepted on this socket (vs. allowing incoming
       both V4 and V6 connections -- which is actually defined
       behavior for V6<-->V4 interop stuff).  See
       https://github.com/open-mpi/ompi/commit/95d7e08a6617530d57b6700c57738b351bfccbf8 for some
       more details. */
    if (AF_INET6 == af_family) {
        int flg = 1;
        if (setsockopt (sd, IPPROTO_IPV6, IPV6_V6ONLY,
                        (char *) &flg, sizeof (flg)) < 0) {
            opal_output(0,
                        "mca_btl_tcp_create_listen: unable to disable v4-mapped addresses\n");
            BTL_ERROR(("mca_btl_tcp_create_listen: unable to set IPV6_V6ONLY\n"));
        }
    }
#endif /* IPV6_V6ONLY */
@ -904,6 +914,10 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
#else
        ((struct sockaddr_in*) &inaddr)->sin_port = htons(port + index);
#endif /* OPAL_ENABLE_IPV6 */
        opal_output_verbose(30, opal_btl_base_framework.framework_output,
                            "btl:tcp: Attempting to bind to %s port %d",
                            (AF_INET == af_family) ? "AF_INET" : "AF_INET6",
                            port + index);
        if(bind(sd, (struct sockaddr*)&inaddr, addrlen) < 0) {
            if( (EADDRINUSE == opal_socket_errno) || (EADDRNOTAVAIL == opal_socket_errno) ) {
                continue;
@ -913,6 +927,10 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
            CLOSE_THE_SOCKET(sd);
            return OPAL_ERROR;
        }
        opal_output_verbose(30, opal_btl_base_framework.framework_output,
                            "btl:tcp: Successfully bound to %s port %d",
                            (AF_INET == af_family) ? "AF_INET" : "AF_INET6",
                            port + index);
        goto socket_binded;
    }
#if OPAL_ENABLE_IPV6
@ -943,11 +961,19 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
    if (AF_INET6 == af_family) {
        mca_btl_tcp_component.tcp6_listen_port = ((struct sockaddr_in6*) &inaddr)->sin6_port;
        mca_btl_tcp_component.tcp6_listen_sd = sd;
        opal_output_verbose(30, opal_btl_base_framework.framework_output,
                            "btl:tcp: my listening v6 socket port is %d",
                            ntohs(mca_btl_tcp_component.tcp6_listen_port));
    } else
#endif
    {
        char str[16];
        mca_btl_tcp_component.tcp_listen_port = ((struct sockaddr_in*) &inaddr)->sin_port;
        mca_btl_tcp_component.tcp_listen_sd = sd;
        inet_ntop(AF_INET, &(((struct sockaddr_in*)&inaddr)->sin_addr), str, sizeof(str));
        opal_output_verbose(30, opal_btl_base_framework.framework_output,
                            "btl:tcp: my listening v4 socket is %s:%u",
                            str, ntohs(mca_btl_tcp_component.tcp_listen_port));
    }

    /* setup listen backlog to maximum allowed by kernel */
@ -960,15 +986,20 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)

    /* set socket up to be non-blocking, otherwise accept could block */
    if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
                       true, opal_process_info.nodename,
                       getpid(), "fcntl(sd, F_GETFL, 0)",
                       strerror(opal_socket_errno), opal_socket_errno);
        CLOSE_THE_SOCKET(sd);
        return OPAL_ERROR;
    } else {
        flags |= O_NONBLOCK;
        if(fcntl(sd, F_SETFL, flags) < 0) {
            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
                       strerror(opal_socket_errno), opal_socket_errno));
            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
                           true, opal_process_info.nodename,
                           getpid(),
                           "fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
                           strerror(opal_socket_errno), opal_socket_errno);
            CLOSE_THE_SOCKET(sd);
            return OPAL_ERROR;
        }
@ -1081,6 +1112,7 @@ static int mca_btl_tcp_component_exchange(void)
    size_t current_addr = 0;

    if(mca_btl_tcp_component.tcp_num_btls != 0) {
        char ifn[32];
        mca_btl_tcp_addr_t *addrs = (mca_btl_tcp_addr_t *)malloc(size);
        memset(addrs, 0, size);

@ -1098,6 +1130,9 @@ static int mca_btl_tcp_component_exchange(void)
                continue;
            }

            opal_ifindextoname(index, ifn, sizeof(ifn));
            opal_output_verbose(30, opal_btl_base_framework.framework_output,
                                "btl:tcp: examining interface %s", ifn);
            if (OPAL_SUCCESS !=
                opal_ifindextoaddr(index, (struct sockaddr*) &my_ss,
                                   sizeof (my_ss))) {
@ -1121,6 +1156,8 @@ static int mca_btl_tcp_component_exchange(void)
                addrs[current_addr].addr_ifkindex =
                    opal_ifindextokindex (index);
                current_addr++;
                opal_output_verbose(30, opal_btl_base_framework.framework_output,
                                    "btl:tcp: using ipv4 interface %s", ifn);
            } else
#endif
            if ((AF_INET == my_ss.ss_family) &&
@ -1136,6 +1173,8 @@ static int mca_btl_tcp_component_exchange(void)
                addrs[current_addr].addr_ifkindex =
                    opal_ifindextokindex (index);
                current_addr++;
                opal_output_verbose(30, opal_btl_base_framework.framework_output,
                                    "btl:tcp: using ipv6 interface %s", ifn);
            }
        } /* end of for opal_ifbegin() */
    } /* end of for tcp_num_btls */
@ -1299,45 +1338,159 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
    struct sockaddr_storage addr;
    opal_socklen_t addr_len = sizeof(addr);
    mca_btl_tcp_proc_t* btl_proc;
    int retval;
    bool sockopt = true;
    size_t retval, len = strlen(mca_btl_tcp_magic_id_string);
    mca_btl_tcp_endpoint_hs_msg_t hs_msg;
    struct timeval save, tv;
    socklen_t rcvtimeo_save_len = sizeof(save);
    char str[128];

    /* Note: the socket is in blocking mode during the initial handshake,
     * so set SO_RCVTIMEO (to roughly 2 seconds here) to avoid spinning
     * forever if the peer is an older Open MPI version -- an older peer
     * sends only its process id, which is shorter than the full handshake
     * message -- or if the remote side is not OMPI at all and never sends
     * any data. */

    /* get the current timeout value so we can reset to it */
    if (0 != getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, (void*)&save, &rcvtimeo_save_len)) {
        if (ENOPROTOOPT == errno) {
            sockopt = false;
        } else {
            opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                "Cannot get current recv timeout value of the socket. "
                                "Local_host:%s PID:%d",
                                opal_process_info.nodename, getpid());
            return;
        }
    } else {
        tv.tv_sec = 2;
        tv.tv_usec = 0;
        if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
            opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                "Cannot set new recv timeout value of the socket. "
                                "Local_host:%s PID:%d",
                                opal_process_info.nodename, getpid());
            return;
        }
    }

    OBJ_RELEASE(event);
    retval = mca_btl_tcp_recv_blocking(sd, (void *)&hs_msg, sizeof(hs_msg));
    guid = hs_msg.guid;

    /* recv the process identifier */
    retval = recv(sd, (char *)&guid, sizeof(guid), 0);
    if(retval != sizeof(guid)) {
    /* An unknown process attempted to connect to Open MPI via TCP.
     * Open MPI uses a "magic" string to trivially verify that the connecting
     * process is a fellow Open MPI process.  An MPI process accepted a TCP
     * connection but did not receive the correct magic string.  This might
     * indicate an Open MPI version mismatch between different MPI processes
     * in the same job, or it may indicate that some other agent is
     * mistakenly attempting to connect to Open MPI's TCP listening sockets.
     *
     * This attempted connection will be ignored; your MPI job may or may not
     * continue properly.
     */
    if (sizeof(hs_msg) != retval) {
        opal_output_verbose(20, opal_btl_base_framework.framework_output,
                            "server did not receive entire connect ACK "
                            "Local_host:%s PID:%d Role:%s String_received:%s Test_fail:%s",
                            opal_process_info.nodename,
                            getpid(), "server",
                            (retval > 0) ? hs_msg.magic_id : "<nothing>",
                            "handshake message length");

        /* The other side probably isn't OMPI, so just hang up */
        CLOSE_THE_SOCKET(sd);
        return;
    }
    if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
        opal_output_verbose(20, opal_btl_base_framework.framework_output,
                            "server did not receive right magic string. "
                            "Local_host:%s PID:%d Role:%s String_received:%s Test_fail:%s",
                            opal_process_info.nodename,
                            getpid(), "server", hs_msg.magic_id,
                            "string value");
        /* The other side probably isn't OMPI, so just hang up */
        CLOSE_THE_SOCKET(sd);
        return;
    }

    if (sockopt) {
        /* reset RECVTIMEO option to its original state */
        if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save))) {
            opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                "Cannot reset recv timeout value. "
                                "Local_host:%s PID:%d",
                                opal_process_info.nodename, getpid());
            return;
        }
    }

    OPAL_PROCESS_NAME_NTOH(guid);

    /* now set socket up to be non-blocking */
    if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
                       true, opal_process_info.nodename,
                       getpid(), "fcntl(sd, F_GETFL, 0)",
                       strerror(opal_socket_errno), opal_socket_errno);
        CLOSE_THE_SOCKET(sd);
    } else {
        flags |= O_NONBLOCK;
        if(fcntl(sd, F_SETFL, flags) < 0) {
            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
                       strerror(opal_socket_errno), opal_socket_errno));
            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
                           true, opal_process_info.nodename,
                           getpid(),
                           "fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
                           strerror(opal_socket_errno), opal_socket_errno);
            CLOSE_THE_SOCKET(sd);
        }
    }

    /* lookup the corresponding process */
    btl_proc = mca_btl_tcp_proc_lookup(&guid);
    if(NULL == btl_proc) {
        opal_show_help("help-mpi-btl-tcp.txt",
                       "server accept cannot find guid",
                       true, opal_process_info.nodename,
                       getpid());
        CLOSE_THE_SOCKET(sd);
        return;
    }

    /* lookup peer address */
    if(getpeername(sd, (struct sockaddr*)&addr, &addr_len) != 0) {
        BTL_ERROR(("getpeername() failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
        opal_show_help("help-mpi-btl-tcp.txt",
                       "server getpeername failed",
                       true, opal_process_info.nodename,
                       getpid(),
                       strerror(opal_socket_errno), opal_socket_errno);
        CLOSE_THE_SOCKET(sd);
        return;
    }

    /* are there any existing peer instances willing to accept this connection */
    (void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd);

    switch (addr.ss_family) {
    case AF_INET:
        inet_ntop(AF_INET, &(((struct sockaddr_in*) &addr)->sin_addr), str, sizeof(str));
        break;

#if OPAL_ENABLE_IPV6
    case AF_INET6:
        inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &addr)->sin6_addr), str, sizeof(str));
        break;
#endif

    default:
        BTL_ERROR(("Got an accept() from an unknown address family -- this shouldn't happen"));
        CLOSE_THE_SOCKET(sd);
        return;
    }
    opal_output_verbose(10, opal_btl_base_framework.framework_output,
                        "btl:tcp: now connected to %s, process %s", str,
                        OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
}
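The receive handler above temporarily bounds the blocking handshake recv with SO_RCVTIMEO and later restores the original value. The following is a minimal, self-contained sketch of that save/set/restore pattern with plain POSIX calls; it is an illustration only, the 2-second value mirrors the code above, and the function name is invented:

#include <sys/socket.h>
#include <sys/time.h>

/* Illustrative helper, not part of the commit: bound a blocking recv on
 * 'sd' to roughly two seconds, then restore the previous timeout. */
static int example_with_recv_timeout(int sd)
{
    struct timeval save, tv = { .tv_sec = 2, .tv_usec = 0 };
    socklen_t save_len = sizeof(save);

    /* remember the current timeout so it can be restored afterwards */
    if (getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, &save_len) != 0) {
        return -1;   /* e.g. ENOPROTOOPT: option not supported on this OS */
    }
    if (setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) != 0) {
        return -1;
    }

    /* ... perform the blocking handshake recv here ... */

    /* put the socket back the way it was */
    return setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save));
}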
@ -62,6 +62,11 @@
#include "btl_tcp_frag.h"
#include "btl_tcp_addr.h"

/*
 * Magic ID string sent during the connect/accept handshake
 */

const char mca_btl_tcp_magic_id_string[MCA_BTL_TCP_MAGIC_STRING_LENGTH] = "OPAL-TCP-BTL";

/*
 * Initialize state of the endpoint instance.
@ -371,48 +376,42 @@ int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tcp


/*
 * A blocking send on a non-blocking socket. Used to send the small amount of connection
 * information that identifies the endpoints.
 * A blocking send on a non-blocking socket. Used to send the small
 * amount of connection information that identifies the endpoints.
 */
static int
mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
                                   void* data, size_t size)
                                   const void* data, size_t size)
{
    unsigned char* ptr = (unsigned char*)data;
    size_t cnt = 0;
    while(cnt < size) {
        int retval = send(btl_endpoint->endpoint_sd, (const char *)ptr+cnt, size-cnt, 0);
        if(retval < 0) {
            if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
                BTL_ERROR(("send(%d, %p, %lu/%lu) failed: %s (%d)",
                           btl_endpoint->endpoint_sd, data, cnt, size,
                           strerror(opal_socket_errno), opal_socket_errno));
                btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
                mca_btl_tcp_endpoint_close(btl_endpoint);
                return -1;
            }
            continue;
        }
        cnt += retval;
    int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
    if (ret < 0) {
        mca_btl_tcp_endpoint_close(btl_endpoint);
    }
    return cnt;
    return ret;
}


/*
 * Send the globally unique identifier for this process to an endpoint on
 * a newly connected socket.
 */

static int mca_btl_tcp_endpoint_send_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
static int
mca_btl_tcp_endpoint_send_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
{
    /* send process identifier to remote endpoint */
    opal_process_name_t guid = opal_proc_local_get()->proc_name;

    OPAL_PROCESS_NAME_HTON(guid);
    if(mca_btl_tcp_endpoint_send_blocking(btl_endpoint, &guid, sizeof(guid)) !=
       sizeof(guid)) {
        return OPAL_ERR_UNREACH;

    mca_btl_tcp_endpoint_hs_msg_t hs_msg;
    strcpy(hs_msg.magic_id, mca_btl_tcp_magic_id_string);
    hs_msg.guid = guid;

    if(sizeof(hs_msg) !=
       mca_btl_tcp_endpoint_send_blocking(btl_endpoint,
                                          &hs_msg, sizeof(hs_msg))) {
        opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
                       true, opal_process_info.nodename,
                       sizeof(hs_msg),
                       "connect ACK failed to send magic-id and guid");
        return OPAL_ERR_UNREACH;
    }
    return OPAL_SUCCESS;
}
|
||||
*/
|
||||
static int mca_btl_tcp_endpoint_recv_blocking(mca_btl_base_endpoint_t* btl_endpoint, void* data, size_t size)
|
||||
{
|
||||
unsigned char* ptr = (unsigned char*)data;
|
||||
size_t cnt = 0;
|
||||
while(cnt < size) {
|
||||
int retval = recv(btl_endpoint->endpoint_sd, (char *)ptr+cnt, size-cnt, 0);
|
||||
|
||||
/* remote closed connection */
|
||||
if(retval == 0) {
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/* socket is non-blocking so handle errors */
|
||||
if(retval < 0) {
|
||||
if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
|
||||
BTL_ERROR(("recv(%d, %lu/%lu) failed: %s (%d)",
|
||||
btl_endpoint->endpoint_sd, cnt, size, strerror(opal_socket_errno), opal_socket_errno));
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return -1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
cnt += retval;
|
||||
int ret = mca_btl_tcp_recv_blocking(btl_endpoint->endpoint_sd, data, size);
|
||||
if (ret <= 0) {
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
}
|
||||
return cnt;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -605,17 +584,22 @@ static int mca_btl_tcp_endpoint_recv_blocking(mca_btl_base_endpo
 * Receive the endpoints globally unique process identification from a newly
 * connected socket and verify the expected response. If so, move the
 * socket to a connected state.
 *
 * NOTE: The return codes from this function are checked in
 * mca_btl_tcp_endpoint_recv_handler().  Don't change them here
 * without also changing the handling in _recv_handler()!
 */
static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
{
    size_t s;
    opal_process_name_t guid;
    size_t retval, len = strlen(mca_btl_tcp_magic_id_string);
    mca_btl_tcp_proc_t* btl_proc = btl_endpoint->endpoint_proc;
    opal_process_name_t guid;

    s = mca_btl_tcp_endpoint_recv_blocking(btl_endpoint,
                                           &guid, sizeof(opal_process_name_t));
    if (s != sizeof(opal_process_name_t)) {
        if (0 == s) {
    mca_btl_tcp_endpoint_hs_msg_t hs_msg;
    retval = mca_btl_tcp_endpoint_recv_blocking(btl_endpoint, &hs_msg, sizeof(hs_msg));

    if (sizeof(hs_msg) != retval) {
        if (0 == retval) {
            /* If we get zero bytes, the peer closed the socket.  This
               can happen when the two peers started the connection
               protocol simultaneously.  Just report the problem
@ -624,10 +608,19 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
        }
        opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
                       true, opal_process_info.nodename,
                       getpid(),
                       "did not receive entire connect ACK from peer");
        return OPAL_ERR_UNREACH;
                       getpid(), "did not receive entire connect ACK from peer");

        return OPAL_ERR_BAD_PARAM;
    }
    if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
        opal_show_help("help-mpi-btl-tcp.txt", "server did not receive magic string",
                       true, opal_process_info.nodename,
                       getpid(), "client", hs_msg.magic_id,
                       "string value");
        return OPAL_ERR_BAD_PARAM;
    }

    guid = hs_msg.guid;
    OPAL_PROCESS_NAME_NTOH(guid);
    /* compare this to the expected values */
    /* TODO: this deserves a little bit more thinking as we are not supposed
@ -708,25 +701,39 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpo

    /* setup the socket as non-blocking */
    if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
                       true, opal_process_info.nodename,
                       getpid(), "fcntl(sd, F_GETFL, 0)",
                       strerror(opal_socket_errno), opal_socket_errno);
        /* Upper layer will handle the error */
        return OPAL_ERR_UNREACH;
    } else {
        flags |= O_NONBLOCK;
        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
                       strerror(opal_socket_errno), opal_socket_errno));
        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) {
            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
                           true, opal_process_info.nodename,
                           getpid(),
                           "fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
                           strerror(opal_socket_errno), opal_socket_errno);
            /* Upper layer will handle the error */
            return OPAL_ERR_UNREACH;
        }
    }

    /* start the connect - will likely fail with EINPROGRESS */
    mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);

    opal_output_verbose(20, opal_btl_base_framework.framework_output,
    opal_output_verbose(10, opal_btl_base_framework.framework_output,
                        "btl: tcp: attempting to connect() to %s address %s on port %d",
                        OPAL_NAME_PRINT(btl_endpoint->endpoint_proc->proc_opal->proc_name),
                        opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                        ntohs(btl_endpoint->endpoint_addr->addr_port));

    if(0 == connect(btl_endpoint->endpoint_sd, (struct sockaddr*)&endpoint_addr, addrlen)) {
        opal_output_verbose(10, opal_btl_base_framework.framework_output,
                            "btl:tcp: connect() to %s:%d completed",
                            opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                            ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
        /* send our globally unique process identifier to the endpoint */
        if((rc = mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) == OPAL_SUCCESS) {
            btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
@ -742,6 +749,8 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpo
        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
        MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "event_add(send) [start_connect]");
        MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0);
        opal_output_verbose(30, opal_btl_base_framework.framework_output,
                            "btl:tcp: would block, so allowing background progress");
        return OPAL_SUCCESS;
    }
}
@ -765,7 +774,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpo
 * later.  Otherwise, send this process's identifier to the endpoint on the
 * newly connected socket.
 */
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
    int so_error = 0;
    opal_socklen_t so_length = sizeof(so_error);
@ -781,32 +790,49 @@ static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_e

    /* check connect completion status */
    if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
        BTL_ERROR(("getsockopt() to %s failed: %s (%d)",
        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
                       true, opal_process_info.nodename,
                       getpid(), "fcntl(sd, F_GETFL, 0)",
                       strerror(opal_socket_errno), opal_socket_errno);
        BTL_ERROR(("getsockopt() to %s:%d failed: %s (%d)",
                   opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                   ((struct sockaddr_in*) &endpoint_addr)->sin_port,
                   strerror(opal_socket_errno), opal_socket_errno));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return;
        return OPAL_ERROR;
    }
    if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
        return;
        return OPAL_SUCCESS;
    }
    if(so_error != 0) {
        BTL_ERROR(("connect() to %s failed: %s (%d)",
                   opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                   strerror(so_error), so_error));
        char *msg;
        asprintf(&msg, "connect() to %s:%d failed",
                 opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                 ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
        opal_show_help("help-mpi-btl-tcp.txt", "client connect fail",
                       true, opal_process_info.nodename,
                       getpid(), msg,
                       strerror(opal_socket_errno), opal_socket_errno);
        free(msg);
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return;
        return OPAL_ERROR;
    }

    opal_output_verbose(10, opal_btl_base_framework.framework_output,
                        "btl:tcp: connect() to %s:%d completed (complete_connect), sending connect ACK",
                        opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                        ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));

    if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) {
        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
        MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]");
        return;
        return OPAL_SUCCESS;
    }
    MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]");
    btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
    mca_btl_tcp_endpoint_close(btl_endpoint);
    return OPAL_ERROR;
}

@ -855,6 +881,26 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
            MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
        }
        else if (OPAL_ERR_BAD_PARAM == rc) {
            /* If we get a BAD_PARAM, it means that it probably wasn't
               an OMPI process on the other end of the socket (e.g.,
               the magic string ID failed).  So we can probably just
               close the socket and ignore this connection. */
            CLOSE_THE_SOCKET(sd);
        }
        else {
            /* Otherwise, it probably *was* an OMPI peer process on
               the other end, and something bad has probably
               happened. */
            mca_btl_tcp_module_t *m = btl_endpoint->endpoint_btl;

            /* Fail up to the PML */
            if (NULL != m->tcp_error_cb) {
                m->tcp_error_cb((mca_btl_base_module_t*) m, MCA_BTL_ERROR_FLAGS_FATAL,
                                btl_endpoint->endpoint_proc->proc_opal,
                                "TCP ACK is neither SUCCESS nor ERR (something bad has probably happened)");
            }
        }
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        return;
    }
@ -26,7 +26,7 @@
BEGIN_C_DECLS

#define MCA_BTL_TCP_ENDPOINT_CACHE 1

#define MCA_BTL_TCP_MAGIC_STRING_LENGTH 16
/**
 * State of TCP endpoint connection.
 */
@ -75,6 +75,14 @@ typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_tcp_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_tcp_endpoint_t);

/* Magic socket handshake string */
extern const char mca_btl_tcp_magic_id_string[MCA_BTL_TCP_MAGIC_STRING_LENGTH];

typedef struct {
    opal_process_name_t guid;
    char magic_id[MCA_BTL_TCP_MAGIC_STRING_LENGTH];
} mca_btl_tcp_endpoint_hs_msg_t;

void mca_btl_tcp_set_socket_options(int sd);
void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t*);
int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t*, struct mca_btl_tcp_frag_t*);
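The handshake message declared above is a fixed-size record: a process name followed by a 16-byte magic-id field, which comfortably holds the 12-character "OPAL-TCP-BTL" string plus its terminating NUL. As a purely illustrative guard (not part of this commit), a C11 compile-time check could document that invariant:

#include <assert.h>

/* Hypothetical compile-time guard (not in the commit): the magic id,
 * including its terminating NUL, must fit in the fixed-size field that
 * both sides copy with strcpy() and compare with strncmp(). */
static_assert(sizeof("OPAL-TCP-BTL") <= MCA_BTL_TCP_MAGIC_STRING_LENGTH,
              "magic id string must fit in mca_btl_tcp_endpoint_hs_msg_t.magic_id");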
@ -421,6 +421,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
    mca_btl_tcp_proc_data_t _proc_data, *proc_data=&_proc_data;
    size_t max_peer_interfaces;
    memset(proc_data, 0, sizeof(mca_btl_tcp_proc_data_t));
    char str_local[128], str_remote[128];

    if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
        return OPAL_ERR_UNREACH;
@ -508,10 +509,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
        default:
            opal_output(0, "unknown address family for tcp: %d\n",
                        endpoint_addr_ss.ss_family);
            /*
             * return OPAL_UNREACH or some error, as this is not
             * good
             */
            return OPAL_ERR_UNREACH;
        }
    }

@ -553,14 +551,26 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
            if(NULL != proc_data->local_interfaces[i]->ipv4_address &&
               NULL != peer_interfaces[j]->ipv4_address) {

                /* Convert the IPv4 addresses into nicely-printable strings for verbose debugging output */
                inet_ntop(AF_INET, &(((struct sockaddr_in*) proc_data->local_interfaces[i]->ipv4_address))->sin_addr,
                          str_local, sizeof(str_local));
                inet_ntop(AF_INET, &(((struct sockaddr_in*) peer_interfaces[j]->ipv4_address))->sin_addr,
                          str_remote, sizeof(str_remote));

                if(opal_net_addr_isipv4public((struct sockaddr*) local_interface->ipv4_address) &&
                   opal_net_addr_isipv4public((struct sockaddr*) peer_interfaces[j]->ipv4_address)) {
                    if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
                                            (struct sockaddr*) peer_interfaces[j]->ipv4_address,
                                            local_interface->ipv4_netmask)) {
                        proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
                        opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                            "btl:tcp: path from %s to %s: IPV4 PUBLIC SAME NETWORK",
                                            str_local, str_remote);
                    } else {
                        proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
                        opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                            "btl:tcp: path from %s to %s: IPV4 PUBLIC DIFFERENT NETWORK",
                                            str_local, str_remote);
                    }
                    proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
                    continue;
@ -569,8 +579,14 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
                                        (struct sockaddr*) peer_interfaces[j]->ipv4_address,
                                        local_interface->ipv4_netmask)) {
                    proc_data->weights[i][j] = CQ_PRIVATE_SAME_NETWORK;
                    opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                        "btl:tcp: path from %s to %s: IPV4 PRIVATE SAME NETWORK",
                                        str_local, str_remote);
                } else {
                    proc_data->weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK;
                    opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                        "btl:tcp: path from %s to %s: IPV4 PRIVATE DIFFERENT NETWORK",
                                        str_local, str_remote);
                }
                proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
                continue;
@ -582,12 +598,24 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
            if(NULL != local_interface->ipv6_address &&
               NULL != peer_interfaces[j]->ipv6_address) {

                /* Convert the IPv6 addresses into nicely-printable strings for verbose debugging output */
                inet_ntop(AF_INET6, &(((struct sockaddr_in6*) local_interface->ipv6_address))->sin6_addr,
                          str_local, sizeof(str_local));
                inet_ntop(AF_INET6, &(((struct sockaddr_in6*) peer_interfaces[j]->ipv6_address))->sin6_addr,
                          str_remote, sizeof(str_remote));

                if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv6_address,
                                        (struct sockaddr*) peer_interfaces[j]->ipv6_address,
                                        local_interface->ipv6_netmask)) {
                    proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
                    opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                        "btl:tcp: path from %s to %s: IPV6 PUBLIC SAME NETWORK",
                                        str_local, str_remote);
                } else {
                    proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
                    opal_output_verbose(20, opal_btl_base_framework.framework_output,
                                        "btl:tcp: path from %s to %s: IPV6 PUBLIC DIFFERENT NETWORK",
                                        str_local, str_remote);
                }
                proc_data->best_addr[i][j] = peer_interfaces[j]->ipv6_endpoint_addr;
                continue;
@ -660,6 +688,12 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
            rc = OPAL_SUCCESS;
        }
    }
    if (OPAL_ERR_UNREACH == rc) {
        opal_output_verbose(10, opal_btl_base_framework.framework_output,
                            "btl:tcp: host %s, process %s UNREACHABLE",
                            proc_hostname,
                            OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
    }

    for(i = 0; i < perm_size; ++i) {
        free(proc_data->weights[i]);
@ -716,7 +750,7 @@ int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_
            OBJ_RELEASE(btl_proc);
            return OPAL_SUCCESS;
        }
        /* The endpoint_addr may still be NULL if this enpoint is
        /* The endpoint_addr may still be NULL if this endpoint is
           being removed early in the wireup sequence (e.g., if it
           is unreachable by all other procs) */
        if (NULL != btl_endpoint->endpoint_addr) {
@ -100,3 +100,68 @@ hopefully be able to continue).
  Peer hostname: %s (%s)
  Source IP of socket: %s
  Known IPs of peer: %s
#
[socket flag fail]
WARNING: Open MPI failed to set flags on a TCP socket.  This should
not happen.  It is likely that your MPI job will now fail.

  Local host: %s
  PID:        %d
  Flag:       %s
  Error:      %s (%d)
#
[server did not get guid]
WARNING: Open MPI accepted a TCP connection from what appears to be
another Open MPI process but the peer process did not complete the
initial handshake properly.  This should not happen.

This attempted connection will be ignored; your MPI job may or may not
continue properly.

  Local host: %s
  PID:        %d
#
[server accept cannot find guid]
WARNING: Open MPI accepted a TCP connection from what appears to be
another Open MPI process but cannot find a corresponding process
entry for that peer.

This attempted connection will be ignored; your MPI job may or may not
continue properly.

  Local host: %s
  PID:        %d
#
[server getpeername failed]
WARNING: Open MPI failed to look up the peer IP address information of
a TCP connection that it just accepted.  This should not happen.

This attempted connection will be ignored; your MPI job may or may not
continue properly.

  Local host: %s
  PID:        %d
  Error:      %s (%d)
#
[server cannot find endpoint]
WARNING: Open MPI accepted a TCP connection from what appears to be a
valid peer Open MPI process but cannot find a corresponding endpoint
entry for that peer.  This should not happen.

This attempted connection will be ignored; your MPI job may or may not
continue properly.

  Local host: %s
  PID:        %d
#
[client connect fail]
WARNING: Open MPI failed to connect to a peer MPI process via
TCP.  This should not happen.

Your Open MPI job may now fail.

  Local host: %s
  PID:        %d
  Message:    %s
  Error:      %s (%d)
#