Btl tcp: BTL_ERROR to show_help & update func behaviour
As part of improving TCP debugging, we are moving a few BTL_ERROR calls to show_help and also changing mca_btl_tcp_endpoint_complete_connect to return an int status (OPAL_SUCCESS or OPAL_ERROR) instead of void. Signed-off-by: Mohan Gandhi <mohgan@amazon.com>
Этот коммит содержится в:
родитель
368f9f0dfc
Коммит
0741fad479
@ -729,7 +729,9 @@ static int mca_btl_tcp_component_create_instances(void)
|
||||
char* if_name = *argv;
|
||||
int if_index = opal_ifnametokindex(if_name);
|
||||
if(if_index < 0) {
|
||||
BTL_ERROR(("invalid interface \"%s\"", if_name));
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude",
|
||||
true, "include", opal_process_info.nodename,
|
||||
if_name, "Unknown interface name");
|
||||
ret = OPAL_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -960,15 +962,20 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
|
||||
|
||||
/* set socket up to be non-blocking, otherwise accept could block */
|
||||
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
|
||||
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
|
||||
strerror(opal_socket_errno), opal_socket_errno));
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(), "fcntl(sd, F_GETFL, 0)",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return OPAL_ERROR;
|
||||
} else {
|
||||
flags |= O_NONBLOCK;
|
||||
if(fcntl(sd, F_SETFL, flags) < 0) {
|
||||
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
|
||||
strerror(opal_socket_errno), opal_socket_errno));
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(),
|
||||
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
@ -721,13 +721,23 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
|
||||
|
||||
/* setup the socket as non-blocking */
|
||||
if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
|
||||
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
|
||||
strerror(opal_socket_errno), opal_socket_errno));
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(), "fcntl(sd, F_GETFL, 0)",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
/* Upper layer will handle the error */
|
||||
return OPAL_ERR_UNREACH;
|
||||
} else {
|
||||
flags |= O_NONBLOCK;
|
||||
if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
|
||||
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
|
||||
strerror(opal_socket_errno), opal_socket_errno));
|
||||
if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) {
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(),
|
||||
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
/* Upper layer will handle the error */
|
||||
return OPAL_ERR_UNREACH;
|
||||
}
|
||||
}
|
||||
|
||||
/* start the connect - will likely fail with EINPROGRESS */
|
||||
@ -778,7 +788,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
|
||||
* later. Otherwise, send this processes identifier to the endpoint on the
|
||||
* newly connected socket.
|
||||
*/
|
||||
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
|
||||
static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
|
||||
{
|
||||
int so_error = 0;
|
||||
opal_socklen_t so_length = sizeof(so_error);
|
||||
@ -794,32 +804,49 @@ static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_e
|
||||
|
||||
/* check connect completion status */
|
||||
if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
|
||||
BTL_ERROR(("getsockopt() to %s failed: %s (%d)",
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(), "fcntl(sd, F_GETFL, 0)",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
BTL_ERROR(("getsockopt() to %s:%d failed: %s (%d)",
|
||||
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||
((struct sockaddr_in*) &endpoint_addr)->sin_port,
|
||||
strerror(opal_socket_errno), opal_socket_errno));
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return;
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
|
||||
return;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
if(so_error != 0) {
|
||||
BTL_ERROR(("connect() to %s failed: %s (%d)",
|
||||
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||
strerror(so_error), so_error));
|
||||
char *msg;
|
||||
asprintf(&msg, "connect() to %s:%d failed",
|
||||
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "client connect fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(), msg,
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
free(msg);
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return;
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: connect() to %s:%d completed (complete_connect), sending connect ACK",
|
||||
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
|
||||
|
||||
if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) {
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
|
||||
opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
|
||||
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]");
|
||||
return;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]");
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
|
||||
|
@ -100,3 +100,68 @@ hopefully be able to continue).
|
||||
Peer hostname: %s (%s)
|
||||
Source IP of socket: %s
|
||||
Known IPs of peer: %s
|
||||
#
|
||||
[socket flag fail]
|
||||
WARNING: Open MPI failed to get or set flags on a TCP socket. This should
|
||||
not happen. It is likely that your MPI job will now fail.
|
||||
|
||||
Local host: %s
|
||||
PID: %d
|
||||
Flag: %s
|
||||
Error: %s (%d)
|
||||
#
|
||||
[server did not get guid]
|
||||
WARNING: Open MPI accepted a TCP connection from what appears to be
|
||||
another Open MPI process but the peer process did not complete the
|
||||
initial handshake properly. This should not happen.
|
||||
|
||||
This attempted connection will be ignored; your MPI job may or may not
|
||||
continue properly.
|
||||
|
||||
Local host: %s
|
||||
PID: %d
|
||||
#
|
||||
[server accept cannot find guid]
|
||||
WARNING: Open MPI accepted a TCP connection from what appears to be
|
||||
another Open MPI process but cannot find a corresponding process
|
||||
entry for that peer.
|
||||
|
||||
This attempted connection will be ignored; your MPI job may or may not
|
||||
continue properly.
|
||||
|
||||
Local host: %s
|
||||
PID: %d
|
||||
#
|
||||
[server getpeername failed]
|
||||
WARNING: Open MPI failed to look up the peer IP address information of
|
||||
a TCP connection that it just accepted. This should not happen.
|
||||
|
||||
This attempted connection will be ignored; your MPI job may or may not
|
||||
continue properly.
|
||||
|
||||
Local host: %s
|
||||
PID: %d
|
||||
Error: %s (%d)
|
||||
#
|
||||
[server cannot find endpoint]
|
||||
WARNING: Open MPI accepted a TCP connection from what appears to be a
|
||||
valid peer Open MPI process but cannot find a corresponding endpoint
|
||||
entry for that peer. This should not happen.
|
||||
|
||||
This attempted connection will be ignored; your MPI job may or may not
|
||||
continue properly.
|
||||
|
||||
Local host: %s
|
||||
PID: %d
|
||||
#
|
||||
[client connect fail]
|
||||
WARNING: Open MPI failed to connect to a peer MPI process via
|
||||
TCP. This should not happen.
|
||||
|
||||
Your Open MPI job may now fail.
|
||||
|
||||
Local host: %s
|
||||
PID: %d
|
||||
Message: %s
|
||||
Error: %s (%d)
|
||||
#
|
Загрузка…
x
Ссылка в новой задаче
Block a user