btl/tcp: Improve verbose output around TCP
As part of the ongoing improvements to the TCP BTL, we are improving its verbose output in general.
Signed-off-by: Mohan Gandhi <mohgan@amazon.com>
Этот коммит содержится в:
родитель
4bc7b214dc
Коммит
e3dfe11da9
@ -964,8 +964,13 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
char str[16];
|
||||
mca_btl_tcp_component.tcp_listen_port = ((struct sockaddr_in*) &inaddr)->sin_port;
|
||||
mca_btl_tcp_component.tcp_listen_sd = sd;
|
||||
inet_ntop(AF_INET, &(((struct sockaddr_in*)&inaddr)->sin_addr), str, sizeof(str));
|
||||
opal_output_verbose(30, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: my listening v4 socket is %s:%u",
|
||||
str, ntohs(mca_btl_tcp_component.tcp_listen_port));
|
||||
}
|
||||
|
||||
/* setup listen backlog to maximum allowed by kernel */
|
||||
@ -1104,6 +1109,7 @@ static int mca_btl_tcp_component_exchange(void)
|
||||
size_t current_addr = 0;
|
||||
|
||||
if(mca_btl_tcp_component.tcp_num_btls != 0) {
|
||||
char ifn[32];
|
||||
mca_btl_tcp_addr_t *addrs = (mca_btl_tcp_addr_t *)malloc(size);
|
||||
memset(addrs, 0, size);
|
||||
|
||||
@ -1121,6 +1127,9 @@ static int mca_btl_tcp_component_exchange(void)
|
||||
continue;
|
||||
}
|
||||
|
||||
opal_ifindextoname(index, ifn, sizeof(ifn));
|
||||
opal_output_verbose(30, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: examining interface %s", ifn);
|
||||
if (OPAL_SUCCESS !=
|
||||
opal_ifindextoaddr(index, (struct sockaddr*) &my_ss,
|
||||
sizeof (my_ss))) {
|
||||
@ -1144,6 +1153,8 @@ static int mca_btl_tcp_component_exchange(void)
|
||||
addrs[current_addr].addr_ifkindex =
|
||||
opal_ifindextokindex (index);
|
||||
current_addr++;
|
||||
opal_output_verbose(30, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: using ipv4 interface %s", ifn);
|
||||
} else
|
||||
#endif
|
||||
if ((AF_INET == my_ss.ss_family) &&
|
||||
@ -1382,6 +1393,9 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
|
||||
/* recv the process identifier */
|
||||
retval = mca_btl_tcp_recv_blocking(sd, (char *)&guid, sizeof(guid));
|
||||
if(retval != sizeof(guid)) {
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "server did not get guid",
|
||||
true, opal_process_info.nodename,
|
||||
getpid());
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return;
|
||||
}
|
||||
@ -1389,31 +1403,66 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
|
||||
|
||||
/* now set socket up to be non-blocking */
|
||||
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
|
||||
BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
|
||||
strerror(opal_socket_errno), opal_socket_errno));
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(), "fcntl(sd, F_GETFL, 0)",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
} else {
|
||||
flags |= O_NONBLOCK;
|
||||
if(fcntl(sd, F_SETFL, flags) < 0) {
|
||||
BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
|
||||
strerror(opal_socket_errno), opal_socket_errno));
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(),
|
||||
"fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
}
|
||||
}
|
||||
|
||||
/* lookup the corresponding process */
|
||||
btl_proc = mca_btl_tcp_proc_lookup(&guid);
|
||||
if(NULL == btl_proc) {
|
||||
opal_show_help("help-mpi-btl-tcp.txt",
|
||||
"server accept cannot find guid",
|
||||
true, opal_process_info.nodename,
|
||||
getpid());
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return;
|
||||
}
|
||||
|
||||
/* lookup peer address */
|
||||
if(getpeername(sd, (struct sockaddr*)&addr, &addr_len) != 0) {
|
||||
BTL_ERROR(("getpeername() failed: %s (%d)",
|
||||
strerror(opal_socket_errno), opal_socket_errno));
|
||||
opal_show_help("help-mpi-btl-tcp.txt",
|
||||
"server getpeername failed",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(),
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return;
|
||||
}
|
||||
|
||||
/* are there any existing peer instances willing to accept this connection */
|
||||
(void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd);
|
||||
|
||||
switch (addr.ss_family) {
|
||||
case AF_INET:
|
||||
inet_ntop(AF_INET, &(((struct sockaddr_in*) &addr)->sin_addr), str, sizeof(str));
|
||||
break;
|
||||
|
||||
#if OPAL_ENABLE_IPV6
|
||||
case AF_INET6:
|
||||
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &addr)->sin6_addr), str, sizeof(str));
|
||||
break;
|
||||
#endif
|
||||
|
||||
default:
|
||||
BTL_ERROR(("Got an accept() from an unknown address family -- this shouldn't happen"));
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return;
|
||||
|
||||
}
|
||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: now connected to %s, process %s", str,
|
||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
|
||||
}
|
||||
|
@ -416,8 +416,11 @@ static int mca_btl_tcp_endpoint_send_connect_ack(mca_btl_base_endpoint_t* btl_en
|
||||
|
||||
/* send process identifier to remote endpoint */
|
||||
OPAL_PROCESS_NAME_HTON(guid);
|
||||
if(mca_btl_tcp_endpoint_send_blocking(btl_endpoint, &guid, sizeof(guid)) !=
|
||||
sizeof(guid)) {
|
||||
if(mca_btl_tcp_endpoint_send_blocking(btl_endpoint, &guid, sizeof(guid)) != sizeof(guid)) {
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(),
|
||||
"sending connect ACK failed");
|
||||
return OPAL_ERR_UNREACH;
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
@ -743,13 +746,18 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
|
||||
/* start the connect - will likely fail with EINPROGRESS */
|
||||
mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);
|
||||
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||
"btl: tcp: attempting to connect() to %s address %s on port %d",
|
||||
OPAL_NAME_PRINT(btl_endpoint->endpoint_proc->proc_opal->proc_name),
|
||||
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||
ntohs(btl_endpoint->endpoint_addr->addr_port));
|
||||
|
||||
if(0 == connect(btl_endpoint->endpoint_sd, (struct sockaddr*)&endpoint_addr, addrlen)) {
|
||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: connect() to %s:%d completed",
|
||||
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
|
||||
ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
|
||||
|
||||
/* send our globally unique process identifier to the endpoint */
|
||||
if((rc = mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) == OPAL_SUCCESS) {
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
|
||||
@ -765,6 +773,8 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
|
||||
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "event_add(send) [start_connect]");
|
||||
MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0);
|
||||
opal_output_verbose(30, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: would block, so allowing background progress");
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
}
|
||||
@ -895,6 +905,25 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
|
||||
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
|
||||
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
|
||||
}
|
||||
else if (OPAL_ERR_BAD_PARAM == rc) {
|
||||
/* If we get a BAD_PARAM, it means that it probably wasn't
|
||||
an OMPI process on the other end of the socket (e.g.,
|
||||
the magic string ID failed). So we can probably just
|
||||
close the socket and ignore this connection. */
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
}
|
||||
else {
|
||||
/* Otherwise, it probably *was* an OMPI peer process on
|
||||
the other end, and something bad has probably
|
||||
happened. */
|
||||
mca_btl_tcp_module_t *m = btl_endpoint->endpoint_btl;
|
||||
/* Fail up to the PML */
|
||||
if (NULL != m->tcp_error_cb) {
|
||||
m->tcp_error_cb((mca_btl_base_module_t*) m, MCA_BTL_ERROR_FLAGS_FATAL,
|
||||
btl_endpoint->endpoint_proc->proc_opal,
|
||||
"TCP ACK is neither SUCCESS nor ERR (something bad has probably happened)");
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
|
||||
return;
|
||||
}
|
||||
@ -983,7 +1012,9 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
|
||||
|
||||
switch(btl_endpoint->endpoint_state) {
|
||||
case MCA_BTL_TCP_CONNECTING:
|
||||
mca_btl_tcp_endpoint_complete_connect(btl_endpoint);
|
||||
if (OPAL_SUCCESS != mca_btl_tcp_endpoint_complete_connect(btl_endpoint)) {
|
||||
mca_btl_tcp_module_t *m = btl_endpoint->endpoint_btl;
|
||||
}
|
||||
break;
|
||||
case MCA_BTL_TCP_CONNECTED:
|
||||
/* complete the current send */
|
||||
|
@ -509,10 +509,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
|
||||
default:
|
||||
opal_output(0, "unknown address family for tcp: %d\n",
|
||||
endpoint_addr_ss.ss_family);
|
||||
/*
|
||||
* return OPAL_UNREACH or some error, as this is not
|
||||
* good
|
||||
*/
|
||||
return OPAL_ERR_UNREACH;
|
||||
}
|
||||
}
|
||||
|
||||
@ -554,14 +551,26 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
|
||||
if(NULL != proc_data->local_interfaces[i]->ipv4_address &&
|
||||
NULL != peer_interfaces[j]->ipv4_address) {
|
||||
|
||||
/* Convert the IPv4 addresses into nicely-printable strings for verbose debugging output */
|
||||
inet_ntop(AF_INET, &(((struct sockaddr_in*) proc_data->local_interfaces[i]->ipv4_address))->sin_addr,
|
||||
str_local, sizeof(str_local));
|
||||
inet_ntop(AF_INET, &(((struct sockaddr_in*) peer_interfaces[j]->ipv4_address))->sin_addr,
|
||||
str_remote, sizeof(str_remote));
|
||||
|
||||
if(opal_net_addr_isipv4public((struct sockaddr*) local_interface->ipv4_address) &&
|
||||
opal_net_addr_isipv4public((struct sockaddr*) peer_interfaces[j]->ipv4_address)) {
|
||||
if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
|
||||
(struct sockaddr*) peer_interfaces[j]->ipv4_address,
|
||||
local_interface->ipv4_netmask)) {
|
||||
proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV4 PUBLIC SAME NETWORK",
|
||||
str_local, str_remote);
|
||||
} else {
|
||||
proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV4 PUBLIC DIFFERENT NETWORK",
|
||||
str_local, str_remote);
|
||||
}
|
||||
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
|
||||
continue;
|
||||
@ -570,8 +579,14 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
|
||||
(struct sockaddr*) peer_interfaces[j]->ipv4_address,
|
||||
local_interface->ipv4_netmask)) {
|
||||
proc_data->weights[i][j] = CQ_PRIVATE_SAME_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV4 PRIVATE SAME NETWORK",
|
||||
str_local, str_remote);
|
||||
} else {
|
||||
proc_data->weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK;
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: path from %s to %s: IPV4 PRIVATE DIFFERENT NETWORK",
|
||||
str_local, str_remote);
|
||||
}
|
||||
proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
|
||||
continue;
|
||||
@ -673,6 +688,12 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
|
||||
rc = OPAL_SUCCESS;
|
||||
}
|
||||
}
|
||||
if (OPAL_ERR_UNREACH == rc) {
|
||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: host %s, process %s UNREACHABLE",
|
||||
proc_hostname,
|
||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
|
||||
}
|
||||
|
||||
for(i = 0; i < perm_size; ++i) {
|
||||
free(proc_data->weights[i]);
|
||||
@ -729,7 +750,7 @@ int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_
|
||||
OBJ_RELEASE(btl_proc);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
/* The endpoint_addr may still be NULL if this enpoint is
|
||||
/* The endpoint_addr may still be NULL if this endpoint is
|
||||
being removed early in the wireup sequence (e.g., if it
|
||||
is unreachable by all other procs) */
|
||||
if (NULL != btl_endpoint->endpoint_addr) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user