1
1

Btl tcp: BTL_ERROR to show_help & update func behaviour

As part of improving TCP debugging, we are moving
a few BTL_ERROR calls to show_help and also
updating the behaviour of
mca_btl_tcp_endpoint_complete_connect so that it
returns a status code for its SUCCESS and ERROR cases.

Signed-off-by: Mohan Gandhi <mohgan@amazon.com>
Этот коммит содержится в:
Mohan 2017-07-19 15:58:32 -07:00 коммит произвёл Mohan Gandhi
родитель 368f9f0dfc
Коммит 0741fad479
3 изменённых файлов: 118 добавлений и 19 удалений

Просмотреть файл

@ -729,7 +729,9 @@ static int mca_btl_tcp_component_create_instances(void)
char* if_name = *argv; char* if_name = *argv;
int if_index = opal_ifnametokindex(if_name); int if_index = opal_ifnametokindex(if_name);
if(if_index < 0) { if(if_index < 0) {
BTL_ERROR(("invalid interface \"%s\"", if_name)); opal_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude",
true, "include", opal_process_info.nodename,
if_name, "Unknown interface name");
ret = OPAL_ERR_NOT_FOUND; ret = OPAL_ERR_NOT_FOUND;
goto cleanup; goto cleanup;
} }
@ -960,15 +962,20 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
/* set socket up to be non-blocking, otherwise accept could block */ /* set socket up to be non-blocking, otherwise accept could block */
if((flags = fcntl(sd, F_GETFL, 0)) < 0) { if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
strerror(opal_socket_errno), opal_socket_errno)); true, opal_process_info.nodename,
getpid(), "fcntl(sd, F_GETFL, 0)",
strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd); CLOSE_THE_SOCKET(sd);
return OPAL_ERROR; return OPAL_ERROR;
} else { } else {
flags |= O_NONBLOCK; flags |= O_NONBLOCK;
if(fcntl(sd, F_SETFL, flags) < 0) { if(fcntl(sd, F_SETFL, flags) < 0) {
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
strerror(opal_socket_errno), opal_socket_errno)); true, opal_process_info.nodename,
getpid(),
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd); CLOSE_THE_SOCKET(sd);
return OPAL_ERROR; return OPAL_ERROR;
} }

Просмотреть файл

@ -721,13 +721,23 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
/* setup the socket as non-blocking */ /* setup the socket as non-blocking */
if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) { if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
strerror(opal_socket_errno), opal_socket_errno)); true, opal_process_info.nodename,
getpid(), "fcntl(sd, F_GETFL, 0)",
strerror(opal_socket_errno), opal_socket_errno);
/* Upper layer will handle the error */
return OPAL_ERR_UNREACH;
} else { } else {
flags |= O_NONBLOCK; flags |= O_NONBLOCK;
if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) {
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
strerror(opal_socket_errno), opal_socket_errno)); true, opal_process_info.nodename,
getpid(),
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
strerror(opal_socket_errno), opal_socket_errno);
/* Upper layer will handle the error */
return OPAL_ERR_UNREACH;
}
} }
/* start the connect - will likely fail with EINPROGRESS */ /* start the connect - will likely fail with EINPROGRESS */
@ -778,7 +788,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
* later. Otherwise, send this processes identifier to the endpoint on the * later. Otherwise, send this processes identifier to the endpoint on the
* newly connected socket. * newly connected socket.
*/ */
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint) static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
{ {
int so_error = 0; int so_error = 0;
opal_socklen_t so_length = sizeof(so_error); opal_socklen_t so_length = sizeof(so_error);
@ -794,32 +804,49 @@ static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_e
/* check connect completion status */ /* check connect completion status */
if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) { if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
BTL_ERROR(("getsockopt() to %s failed: %s (%d)", opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(), "fcntl(sd, F_GETFL, 0)",
strerror(opal_socket_errno), opal_socket_errno);
BTL_ERROR(("getsockopt() to %s:%d failed: %s (%d)",
opal_net_get_hostname((struct sockaddr*) &endpoint_addr), opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
((struct sockaddr_in*) &endpoint_addr)->sin_port,
strerror(opal_socket_errno), opal_socket_errno)); strerror(opal_socket_errno), opal_socket_errno));
mca_btl_tcp_endpoint_close(btl_endpoint); mca_btl_tcp_endpoint_close(btl_endpoint);
return; return OPAL_ERROR;
} }
if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) { if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
return; return OPAL_SUCCESS;
} }
if(so_error != 0) { if(so_error != 0) {
BTL_ERROR(("connect() to %s failed: %s (%d)", char *msg;
asprintf(&msg, "connect() to %s:%d failed",
opal_net_get_hostname((struct sockaddr*) &endpoint_addr), opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
strerror(so_error), so_error)); ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
opal_show_help("help-mpi-btl-tcp.txt", "client connect fail",
true, opal_process_info.nodename,
getpid(), msg,
strerror(opal_socket_errno), opal_socket_errno);
free(msg);
mca_btl_tcp_endpoint_close(btl_endpoint); mca_btl_tcp_endpoint_close(btl_endpoint);
return; return OPAL_ERROR;
} }
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: connect() to %s:%d completed (complete_connect), sending connect ACK",
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) { if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) {
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
opal_event_add(&btl_endpoint->endpoint_recv_event, 0); opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]"); MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]");
return; return OPAL_SUCCESS;
} }
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]"); MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]");
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint); mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERROR;
} }

Просмотреть файл

@ -100,3 +100,68 @@ hopefully be able to continue).
Peer hostname: %s (%s) Peer hostname: %s (%s)
Source IP of socket: %s Source IP of socket: %s
Known IPs of peer: %s Known IPs of peer: %s
#
[socket flag fail]
WARNING: Open MPI failed to set flags on a TCP socket. This should
not happen. It is likely that your MPI job will now fail.
Local host: %s
PID: %d
Flag: %s
Error: %s (%d)
#
[server did not get guid]
WARNING: Open MPI accepted a TCP connection from what appears to be
another Open MPI process but the peer process did not complete the
initial handshake properly. This should not happen.
This attempted connection will be ignored; your MPI job may or may not
continue properly.
Local host: %s
PID: %d
#
[server accept cannot find guid]
WARNING: Open MPI accepted a TCP connection from what appears to be
another Open MPI process but cannot find a corresponding process
entry for that peer.
This attempted connection will be ignored; your MPI job may or may not
continue properly.
Local host: %s
PID: %d
#
[server getpeername failed]
WARNING: Open MPI failed to look up the peer IP address information of
a TCP connection that it just accepted. This should not happen.
This attempted connection will be ignored; your MPI job may or may not
continue properly.
Local host: %s
PID: %d
Error: %s (%d)
#
[server cannot find endpoint]
WARNING: Open MPI accepted a TCP connection from what appears to be a
valid peer Open MPI process but cannot find a corresponding endpoint
entry for that peer. This should not happen.
This attempted connection will be ignored; your MPI job may or may not
continue properly.
Local host: %s
PID: %d
#
[client connect fail]
WARNING: Open MPI failed to connect to a peer MPI process via
TCP. This should not happen.
Your Open MPI job may now fail.
Local host: %s
PID: %d
Message: %s
Error: %s (%d)
#