From 0741fad479a7bd4050aa17059bea2179c72f077f Mon Sep 17 00:00:00 2001
From: Mohan
Date: Wed, 19 Jul 2017 15:58:32 -0700
Subject: [PATCH] Btl tcp: BTL_ERROR to show_help & update func behaviour

As part of the improvements to TCP debugging, we are moving a few
BTL_ERROR calls to show_help and also updating the behaviour of
mca_btl_tcp_endpoint_complete_connect so that it returns OPAL_SUCCESS
or OPAL_ERROR.

Signed-off-by: Mohan Gandhi
---
Review notes (this section is ignored by git am):
 * "socket flag fail" now describes the F_SETFL operation as
   "flags | O_NONBLOCK", matching the code (flags |= O_NONBLOCK).
 * complete_connect: the getsockopt() failure no longer reports that
   fcntl(F_GETFL) failed, and its BTL_ERROR prints the port via ntohs()
   like the other messages in this patch.
 * complete_connect: the connect() failure reports so_error (the value
   fetched with SO_ERROR) instead of opal_socket_errno.
 * Typos fixed in the new comments and help text.
 * NOTE(review): the asprintf() result is still unchecked; msg may be
   left undefined if it fails -- worth a follow-up.
 * NOTE(review): leading whitespace in the hunks was reconstructed; if
   they do not apply cleanly, try "git am --ignore-whitespace".

 opal/mca/btl/tcp/btl_tcp_component.c  | 17 ++++---
 opal/mca/btl/tcp/btl_tcp_endpoint.c   | 55 +++++++++++++++++------
 opal/mca/btl/tcp/help-mpi-btl-tcp.txt | 65 +++++++++++++++++++++++++++
 3 files changed, 118 insertions(+), 19 deletions(-)

diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c
index 120abee313..76e8febe50 100644
--- a/opal/mca/btl/tcp/btl_tcp_component.c
+++ b/opal/mca/btl/tcp/btl_tcp_component.c
@@ -729,7 +729,9 @@ static int mca_btl_tcp_component_create_instances(void)
         char* if_name = *argv;
         int if_index = opal_ifnametokindex(if_name);
         if(if_index < 0) {
-            BTL_ERROR(("invalid interface \"%s\"", if_name));
+            opal_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude",
+                           true, "include", opal_process_info.nodename,
+                           if_name, "Unknown interface name");
             ret = OPAL_ERR_NOT_FOUND;
             goto cleanup;
         }
@@ -960,15 +962,20 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
 
     /* set socket up to be non-blocking, otherwise accept could block */
     if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
-        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
-                   strerror(opal_socket_errno), opal_socket_errno));
+        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                       true, opal_process_info.nodename,
+                       getpid(), "fcntl(sd, F_GETFL, 0)",
+                       strerror(opal_socket_errno), opal_socket_errno);
         CLOSE_THE_SOCKET(sd);
         return OPAL_ERROR;
     } else {
         flags |= O_NONBLOCK;
         if(fcntl(sd, F_SETFL, flags) < 0) {
-            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
-                       strerror(opal_socket_errno), opal_socket_errno));
+            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                           true, opal_process_info.nodename,
+                           getpid(),
+                           "fcntl(sd, F_SETFL, flags | O_NONBLOCK)",
+                           strerror(opal_socket_errno), opal_socket_errno);
             CLOSE_THE_SOCKET(sd);
             return OPAL_ERROR;
         }
diff --git a/opal/mca/btl/tcp/btl_tcp_endpoint.c b/opal/mca/btl/tcp/btl_tcp_endpoint.c
index f8e9ab5b8c..68ba66e955 100644
--- a/opal/mca/btl/tcp/btl_tcp_endpoint.c
+++ b/opal/mca/btl/tcp/btl_tcp_endpoint.c
@@ -721,13 +721,23 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
 
     /* setup the socket as non-blocking */
     if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
-        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
-                   strerror(opal_socket_errno), opal_socket_errno));
+        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                       true, opal_process_info.nodename,
+                       getpid(), "fcntl(sd, F_GETFL, 0)",
+                       strerror(opal_socket_errno), opal_socket_errno);
+        /* Upper layer will handle the error */
+        return OPAL_ERR_UNREACH;
     } else {
         flags |= O_NONBLOCK;
-        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
-            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
-                       strerror(opal_socket_errno), opal_socket_errno));
+        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) {
+            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                           true, opal_process_info.nodename,
+                           getpid(),
+                           "fcntl(sd, F_SETFL, flags | O_NONBLOCK)",
+                           strerror(opal_socket_errno), opal_socket_errno);
+            /* Upper layer will handle the error */
+            return OPAL_ERR_UNREACH;
+        }
     }
 
     /* start the connect - will likely fail with EINPROGRESS */
@@ -778,7 +788,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
  * later. Otherwise, send this processes identifier to the endpoint on the
  * newly connected socket.
  */
-static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
+static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
 {
     int so_error = 0;
     opal_socklen_t so_length = sizeof(so_error);
@@ -794,32 +804,49 @@ static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_e
 
     /* check connect completion status */
     if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
-        BTL_ERROR(("getsockopt() to %s failed: %s (%d)",
+        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                       true, opal_process_info.nodename,
+                       getpid(), "getsockopt(sd, SOL_SOCKET, SO_ERROR)",
+                       strerror(opal_socket_errno), opal_socket_errno);
+        BTL_ERROR(("getsockopt() to %s:%d failed: %s (%d)",
                    opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
+                   ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port),
                    strerror(opal_socket_errno), opal_socket_errno));
         mca_btl_tcp_endpoint_close(btl_endpoint);
-        return;
+        return OPAL_ERROR;
     }
     if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
-        return;
+        return OPAL_SUCCESS;
     }
     if(so_error != 0) {
-        BTL_ERROR(("connect() to %s failed: %s (%d)",
-                   opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
-                   strerror(so_error), so_error));
+        char *msg;
+        asprintf(&msg, "connect() to %s:%d failed",
+                 opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
+                 ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
+        opal_show_help("help-mpi-btl-tcp.txt", "client connect fail",
+                       true, opal_process_info.nodename,
+                       getpid(), msg,
+                       strerror(so_error), so_error);
+        free(msg);
         mca_btl_tcp_endpoint_close(btl_endpoint);
-        return;
+        return OPAL_ERROR;
     }
 
+    opal_output_verbose(10, opal_btl_base_framework.framework_output,
+                        "btl:tcp: connect() to %s:%d completed (complete_connect), sending connect ACK",
+                        opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
+                        ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
+
     if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) {
         btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
         opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
         MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]");
-        return;
+        return OPAL_SUCCESS;
     }
     MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]");
     btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
     mca_btl_tcp_endpoint_close(btl_endpoint);
+    return OPAL_ERROR;
 }
 
 
diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt
index a5781f7ed0..320ee110ff 100644
--- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt
+++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt
@@ -100,3 +100,68 @@ hopefully be able to continue).
   Peer hostname:       %s (%s)
   Source IP of socket: %s
   Known IPs of peer:   %s
+#
+[socket flag fail]
+WARNING: Open MPI failed to set flags on a TCP socket. This should
+not happen. It is likely that your MPI job will now fail.
+
+  Local host: %s
+  PID:        %d
+  Flag:       %s
+  Error:      %s (%d)
+#
+[server did not get guid]
+WARNING: Open MPI accepted a TCP connection from what appears to be
+another Open MPI process but the peer process did not complete the
+initial handshake properly. This should not happen.
+
+This attempted connection will be ignored; your MPI job may or may not
+continue properly.
+
+  Local host: %s
+  PID:        %d
+#
+[server accept cannot find guid]
+WARNING: Open MPI accepted a TCP connection from what appears to be
+another Open MPI process but cannot find a corresponding process
+entry for that peer.
+
+This attempted connection will be ignored; your MPI job may or may not
+continue properly.
+
+  Local host: %s
+  PID:        %d
+#
+[server getpeername failed]
+WARNING: Open MPI failed to look up the peer IP address information of
+a TCP connection that it just accepted. This should not happen.
+
+This attempted connection will be ignored; your MPI job may or may not
+continue properly.
+
+  Local host: %s
+  PID:        %d
+  Error:      %s (%d)
+#
+[server cannot find endpoint]
+WARNING: Open MPI accepted a TCP connection from what appears to be a
+valid peer Open MPI process but cannot find a corresponding endpoint
+entry for that peer. This should not happen.
+
+This attempted connection will be ignored; your MPI job may or may not
+continue properly.
+
+  Local host: %s
+  PID:        %d
+#
+[client connect fail]
+WARNING: Open MPI failed to connect to a peer MPI process via
+TCP. This should not happen.
+
+Your Open MPI job may now fail.
+
+  Local host: %s
+  PID:        %d
+  Message:    %s
+  Error:      %s (%d)
+#
\ No newline at end of file