1
1

Patch submitted by Brian Barrett, inspired by this thread:

http://www.open-mpi.org/community/lists/users/2007/11/4547.php

- Better handling of ECONNABORTED returned by connect() on Linux.
- Reduce extraneous output from the OOB when TCP connections must
  be retried.

This commit was SVN r16808.
Этот коммит содержится в:
Jeff Squyres 2007-11-30 21:42:15 +00:00
родитель b7c885247a
Коммит c20350b943

Просмотреть файл

@ -345,7 +345,7 @@ mca_oob_tcp_peer_create_socket(mca_oob_tcp_peer_t* peer,
static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer) static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
{ {
struct sockaddr_storage inaddr; struct sockaddr_storage inaddr;
int rc; int rc, retry_count;
opal_socklen_t addrlen = 0; opal_socklen_t addrlen = 0;
do { do {
@ -384,6 +384,8 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
addrlen = sizeof(struct sockaddr_in6); addrlen = sizeof(struct sockaddr_in6);
} }
retry_count = 0;
retry_connect:
if (connect(peer->peer_sd, (struct sockaddr*)&inaddr, addrlen) < 0) { if (connect(peer->peer_sd, (struct sockaddr*)&inaddr, addrlen) < 0) {
/* non-blocking so wait for completion */ /* non-blocking so wait for completion */
if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) { if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
@ -391,17 +393,30 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: " /* Some kernels (Linux 2.6) will automatically software
"connect to %s:%d failed: %s (%d)", abort a connection that was ECONNREFUSED on the last
ORTE_NAME_PRINT(orte_process_info.my_name), attempt, without even trying to establish the
ORTE_NAME_PRINT(&(peer->peer_name)), connection. Handle that case in a semi-rational
opal_net_get_hostname((struct sockaddr*) &inaddr), way. */
opal_net_get_port((struct sockaddr*) &inaddr), if (ECONNABORTED == opal_socket_errno && ++retry_count < 2) {
strerror(opal_socket_errno), goto retry_connect;
opal_socket_errno); }
if ((mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) ||
(ECONNABORTED != opal_socket_errno &&
ECONNREFUSED != opal_socket_errno)) {
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
"connect to %s:%d failed: %s (%d)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_net_get_hostname((struct sockaddr*) &inaddr),
opal_net_get_port((struct sockaddr*) &inaddr),
strerror(opal_socket_errno),
opal_socket_errno);
}
continue; continue;
} }
/* send our globally unique process identifier to the peer */ /* send our globally unique process identifier to the peer */
if((rc = mca_oob_tcp_peer_send_connect_ack(peer, peer->peer_sd)) == ORTE_SUCCESS) { if((rc = mca_oob_tcp_peer_send_connect_ack(peer, peer->peer_sd)) == ORTE_SUCCESS) {
peer->peer_state = MCA_OOB_TCP_CONNECT_ACK; peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
@ -420,13 +435,14 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
} }
} while(peer->peer_addr->addr_next != 0); } while(peer->peer_addr->addr_next != 0);
/* None of the interfaces worked.. */ /* None of the interfaces worked... We'll try again for a number of
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: " times, so we're not done yet, hence the debug output */
"connect to %s:%d failed, connecting over all interfaces failed!", if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
ORTE_NAME_PRINT(orte_process_info.my_name), opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
ORTE_NAME_PRINT(&(peer->peer_name)), "Connection across all interfaces failed. Likely will retry",
opal_net_get_hostname((struct sockaddr*) &inaddr), ORTE_NAME_PRINT(orte_process_info.my_name),
opal_net_get_port((struct sockaddr*) &inaddr)); ORTE_NAME_PRINT(&(peer->peer_name)));
}
mca_oob_tcp_peer_close(peer); mca_oob_tcp_peer_close(peer);
return ORTE_ERR_UNREACH; return ORTE_ERR_UNREACH;
} }
@ -483,19 +499,14 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
return; return;
} else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) { } else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) {
struct timeval tv = { 1,0 }; struct timeval tv = { 1,0 };
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: " if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
"connection failed: %s (%d) - retrying\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
strerror(so_error),
so_error);
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: " opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
"sending ack, %d", "connection failed: %s (%d) - retrying\n",
ORTE_NAME_PRINT(orte_process_info.my_name), ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)), so_error); ORTE_NAME_PRINT(&(peer->peer_name)),
strerror(so_error),
so_error);
} }
mca_oob_tcp_peer_shutdown(peer); mca_oob_tcp_peer_shutdown(peer);
opal_evtimer_add(&peer->peer_timer_event, &tv); opal_evtimer_add(&peer->peer_timer_event, &tv);
return; return;
@ -507,6 +518,13 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
return; return;
} }
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
"sending ack, %d",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)), so_error);
}
if (mca_oob_tcp_peer_send_connect_ack(peer, sd) == ORTE_SUCCESS) { if (mca_oob_tcp_peer_send_connect_ack(peer, sd) == ORTE_SUCCESS) {
peer->peer_state = MCA_OOB_TCP_CONNECT_ACK; peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
opal_event_add(&peer->peer_recv_event, 0); opal_event_add(&peer->peer_recv_event, 0);
@ -578,7 +596,7 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) { if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
mca_oob_tcp_msg_t *msg; mca_oob_tcp_msg_t *msg;
opal_output(0, "%s-%s mca_oob_tcp_peer_shutdown: retries exceeded", opal_output(0, "%s-%s oob-tcp: Communication retries exceeded. Can not communicate with peer",
ORTE_NAME_PRINT(orte_process_info.my_name), ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name))); ORTE_NAME_PRINT(&(peer->peer_name)));