Btl tcp: BTL_ERROR to show_help & update func behaviour
As part of improving TCP debugging, we are moving a few BTL_ERROR calls to show_help and also updating mca_btl_tcp_endpoint_complete_connect to return OPAL_SUCCESS or OPAL_ERROR instead of returning nothing. Signed-off-by: Mohan Gandhi <mohgan@amazon.com>
Этот коммит содержится в:
родитель
368f9f0dfc
Коммит
0741fad479
@ -729,7 +729,9 @@ static int mca_btl_tcp_component_create_instances(void)
|
|||||||
char* if_name = *argv;
|
char* if_name = *argv;
|
||||||
int if_index = opal_ifnametokindex(if_name);
|
int if_index = opal_ifnametokindex(if_name);
|
||||||
if(if_index < 0) {
|
if(if_index < 0) {
|
||||||
BTL_ERROR(("invalid interface \"%s\"", if_name));
|
opal_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude",
|
||||||
|
true, "include", opal_process_info.nodename,
|
||||||
|
if_name, "Unknown interface name");
|
||||||
ret = OPAL_ERR_NOT_FOUND;
|
ret = OPAL_ERR_NOT_FOUND;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -960,15 +962,20 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
|
|||||||
|
|
||||||
/* set socket up to be non-blocking, otherwise accept could block */
|
/* set socket up to be non-blocking, otherwise accept could block */
|
||||||
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
|
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
|
||||||
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
|
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||||
strerror(opal_socket_errno), opal_socket_errno));
|
true, opal_process_info.nodename,
|
||||||
|
getpid(), "fcntl(sd, F_GETFL, 0)",
|
||||||
|
strerror(opal_socket_errno), opal_socket_errno);
|
||||||
CLOSE_THE_SOCKET(sd);
|
CLOSE_THE_SOCKET(sd);
|
||||||
return OPAL_ERROR;
|
return OPAL_ERROR;
|
||||||
} else {
|
} else {
|
||||||
flags |= O_NONBLOCK;
|
flags |= O_NONBLOCK;
|
||||||
if(fcntl(sd, F_SETFL, flags) < 0) {
|
if(fcntl(sd, F_SETFL, flags) < 0) {
|
||||||
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
|
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||||
strerror(opal_socket_errno), opal_socket_errno));
|
true, opal_process_info.nodename,
|
||||||
|
getpid(),
|
||||||
|
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
|
||||||
|
strerror(opal_socket_errno), opal_socket_errno);
|
||||||
CLOSE_THE_SOCKET(sd);
|
CLOSE_THE_SOCKET(sd);
|
||||||
return OPAL_ERROR;
|
return OPAL_ERROR;
|
||||||
}
|
}
|
||||||
|
@ -721,13 +721,23 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
|
|||||||
|
|
||||||
/* setup the socket as non-blocking */
|
/* setup the socket as non-blocking */
|
||||||
if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
|
if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
|
||||||
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
|
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||||
strerror(opal_socket_errno), opal_socket_errno));
|
true, opal_process_info.nodename,
|
||||||
|
getpid(), "fcntl(sd, F_GETFL, 0)",
|
||||||
|
strerror(opal_socket_errno), opal_socket_errno);
|
||||||
|
/* Upper layer will handle the error */
|
||||||
|
return OPAL_ERR_UNREACH;
|
||||||
} else {
|
} else {
|
||||||
flags |= O_NONBLOCK;
|
flags |= O_NONBLOCK;
|
||||||
if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
|
if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) {
|
||||||
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
|
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||||
strerror(opal_socket_errno), opal_socket_errno));
|
true, opal_process_info.nodename,
|
||||||
|
getpid(),
|
||||||
|
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
|
||||||
|
strerror(opal_socket_errno), opal_socket_errno);
|
||||||
|
/* Upper layer will handle the error */
|
||||||
|
return OPAL_ERR_UNREACH;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* start the connect - will likely fail with EINPROGRESS */
|
/* start the connect - will likely fail with EINPROGRESS */
|
||||||
@ -778,7 +788,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
|
|||||||
* later. Otherwise, send this processes identifier to the endpoint on the
|
* later. Otherwise, send this processes identifier to the endpoint on the
|
||||||
* newly connected socket.
|
* newly connected socket.
|
||||||
*/
|
*/
|
||||||
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
|
static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
|
||||||
{
|
{
|
||||||
int so_error = 0;
|
int so_error = 0;
|
||||||
opal_socklen_t so_length = sizeof(so_error);
|
opal_socklen_t so_length = sizeof(so_error);
|
||||||
@ -794,32 +804,49 @@ static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_e
|
|||||||
|
|
||||||
/* check connect completion status */
|
/* check connect completion status */
|
||||||
if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
|
if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
|
||||||
BTL_ERROR(("getsockopt() to %s failed: %s (%d)",
|
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||||
|
true, opal_process_info.nodename,
|
||||||
|
getpid(), "fcntl(sd, F_GETFL, 0)",
|
||||||
|
strerror(opal_socket_errno), opal_socket_errno);
|
||||||
|
BTL_ERROR(("getsockopt() to %s:%d failed: %s (%d)",
|
||||||
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||||
|
((struct sockaddr_in*) &endpoint_addr)->sin_port,
|
||||||
strerror(opal_socket_errno), opal_socket_errno));
|
strerror(opal_socket_errno), opal_socket_errno));
|
||||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||||
return;
|
return OPAL_ERROR;
|
||||||
}
|
}
|
||||||
if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
|
if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
|
||||||
return;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
if(so_error != 0) {
|
if(so_error != 0) {
|
||||||
BTL_ERROR(("connect() to %s failed: %s (%d)",
|
char *msg;
|
||||||
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
asprintf(&msg, "connect() to %s:%d failed",
|
||||||
strerror(so_error), so_error));
|
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||||
|
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
|
||||||
|
opal_show_help("help-mpi-btl-tcp.txt", "client connect fail",
|
||||||
|
true, opal_process_info.nodename,
|
||||||
|
getpid(), msg,
|
||||||
|
strerror(opal_socket_errno), opal_socket_errno);
|
||||||
|
free(msg);
|
||||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||||
return;
|
return OPAL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||||
|
"btl:tcp: connect() to %s:%d completed (complete_connect), sending connect ACK",
|
||||||
|
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||||
|
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
|
||||||
|
|
||||||
if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) {
|
if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) {
|
||||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
|
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
|
||||||
opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
|
opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
|
||||||
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]");
|
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]");
|
||||||
return;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]");
|
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]");
|
||||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||||
|
return OPAL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -100,3 +100,68 @@ hopefully be able to continue).
|
|||||||
Peer hostname: %s (%s)
|
Peer hostname: %s (%s)
|
||||||
Source IP of socket: %s
|
Source IP of socket: %s
|
||||||
Known IPs of peer: %s
|
Known IPs of peer: %s
|
||||||
|
#
|
||||||
|
[socket flag fail]
|
||||||
|
WARNING: Open MPI failed to set flags on a TCP socket. This should
|
||||||
|
not happen. It is likely that your MPI job will now fail.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
PID: %d
|
||||||
|
Flag: %s
|
||||||
|
Error: %s (%d)
|
||||||
|
#
|
||||||
|
[server did not get guid]
|
||||||
|
WARNING: Open MPI accepted a TCP connection from what appears to be
|
||||||
|
another Open MPI process but the peer process did not complete the
|
||||||
|
initial handshake properly. This should not happen.
|
||||||
|
|
||||||
|
This attempted connection will be ignored; your MPI job may or may not
|
||||||
|
continue properly.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
PID: %d
|
||||||
|
#
|
||||||
|
[server accept cannot find guid]
|
||||||
|
WARNING: Open MPI accepted a TCP connection from what appears to be
|
||||||
|
another Open MPI process but cannot find a corresponding process
|
||||||
|
entry for that peer.
|
||||||
|
|
||||||
|
This attempted connection will be ignored; your MPI job may or may not
|
||||||
|
continue properly.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
PID: %d
|
||||||
|
#
|
||||||
|
[server getpeername failed]
|
||||||
|
WARNING: Open MPI failed to look up the peer IP address information of
|
||||||
|
a TCP connection that it just accepted. This should not happen.
|
||||||
|
|
||||||
|
This attempted connection will be ignored; your MPI job may or may not
|
||||||
|
continue properly.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
PID: %d
|
||||||
|
Error: %s (%d)
|
||||||
|
#
|
||||||
|
[server cannot find endpoint]
|
||||||
|
WARNING: Open MPI accepted a TCP connection from what appears to be a
|
||||||
|
valid peer Open MPI process but cannot find a corresponding endpoint
|
||||||
|
entry for that peer. This should not happen.
|
||||||
|
|
||||||
|
This attempted connection will be ignored; your MPI job may or may not
|
||||||
|
continue properly.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
PID: %d
|
||||||
|
#
|
||||||
|
[client connect fail]
|
||||||
|
WARNING: Open MPI failed to connect to a peer MPI process via
|
||||||
|
TCP. This should not happen.
|
||||||
|
|
||||||
|
Your Open MPI job may now fail.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
PID: %d
|
||||||
|
Message: %s
|
||||||
|
Error: %s (%d)
|
||||||
|
#
|
Загрузка…
x
Ссылка в новой задаче
Block a user