Patch submitted by Brian Barrett, inspired by this thread:
http://www.open-mpi.org/community/lists/users/2007/11/4547.php

- Better handling of ECONNABORTED from connect on Linux.
- Reduce extraneous output from OOB when TCP connections must be retried.

This commit was SVN r16808.
Этот коммит содержится в:
родитель
b7c885247a
Коммит
c20350b943
@ -345,7 +345,7 @@ mca_oob_tcp_peer_create_socket(mca_oob_tcp_peer_t* peer,
|
|||||||
static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
||||||
{
|
{
|
||||||
struct sockaddr_storage inaddr;
|
struct sockaddr_storage inaddr;
|
||||||
int rc;
|
int rc, retry_count;
|
||||||
opal_socklen_t addrlen = 0;
|
opal_socklen_t addrlen = 0;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
@ -384,6 +384,8 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
|||||||
addrlen = sizeof(struct sockaddr_in6);
|
addrlen = sizeof(struct sockaddr_in6);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
retry_count = 0;
|
||||||
|
retry_connect:
|
||||||
if (connect(peer->peer_sd, (struct sockaddr*)&inaddr, addrlen) < 0) {
|
if (connect(peer->peer_sd, (struct sockaddr*)&inaddr, addrlen) < 0) {
|
||||||
/* non-blocking so wait for completion */
|
/* non-blocking so wait for completion */
|
||||||
if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
|
if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
|
||||||
@ -391,17 +393,30 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
|
/* Some kernels (Linux 2.6) will automatically software
|
||||||
"connect to %s:%d failed: %s (%d)",
|
abort a connection that was ECONNREFUSED on the last
|
||||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
attempt, without even trying to establish the
|
||||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
connection. Handle that case in a semi-rational
|
||||||
opal_net_get_hostname((struct sockaddr*) &inaddr),
|
way. */
|
||||||
opal_net_get_port((struct sockaddr*) &inaddr),
|
if (ECONNABORTED == opal_socket_errno && ++retry_count < 2) {
|
||||||
strerror(opal_socket_errno),
|
goto retry_connect;
|
||||||
opal_socket_errno);
|
}
|
||||||
|
|
||||||
|
if ((mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) ||
|
||||||
|
(ECONNABORTED != opal_socket_errno &&
|
||||||
|
ECONNREFUSED != opal_socket_errno)) {
|
||||||
|
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
|
||||||
|
"connect to %s:%d failed: %s (%d)",
|
||||||
|
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||||
|
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||||
|
opal_net_get_hostname((struct sockaddr*) &inaddr),
|
||||||
|
opal_net_get_port((struct sockaddr*) &inaddr),
|
||||||
|
strerror(opal_socket_errno),
|
||||||
|
opal_socket_errno);
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* send our globally unique process identifier to the peer */
|
/* send our globally unique process identifier to the peer */
|
||||||
if((rc = mca_oob_tcp_peer_send_connect_ack(peer, peer->peer_sd)) == ORTE_SUCCESS) {
|
if((rc = mca_oob_tcp_peer_send_connect_ack(peer, peer->peer_sd)) == ORTE_SUCCESS) {
|
||||||
peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
|
peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
|
||||||
@ -420,13 +435,14 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
|||||||
}
|
}
|
||||||
} while(peer->peer_addr->addr_next != 0);
|
} while(peer->peer_addr->addr_next != 0);
|
||||||
|
|
||||||
/* None of the interfaces worked.. */
|
/* None of the interfaces worked... We'll try again for a number of
|
||||||
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
|
times, so we're not done yet, hence the debug output */
|
||||||
"connect to %s:%d failed, connecting over all interfaces failed!",
|
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
|
||||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
"Connection across all interfaces failed. Likely will retry",
|
||||||
opal_net_get_hostname((struct sockaddr*) &inaddr),
|
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||||
opal_net_get_port((struct sockaddr*) &inaddr));
|
ORTE_NAME_PRINT(&(peer->peer_name)));
|
||||||
|
}
|
||||||
mca_oob_tcp_peer_close(peer);
|
mca_oob_tcp_peer_close(peer);
|
||||||
return ORTE_ERR_UNREACH;
|
return ORTE_ERR_UNREACH;
|
||||||
}
|
}
|
||||||
@ -483,19 +499,14 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
|
|||||||
return;
|
return;
|
||||||
} else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) {
|
} else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) {
|
||||||
struct timeval tv = { 1,0 };
|
struct timeval tv = { 1,0 };
|
||||||
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
|
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||||
"connection failed: %s (%d) - retrying\n",
|
|
||||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
|
||||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
|
||||||
strerror(so_error),
|
|
||||||
so_error);
|
|
||||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
|
||||||
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
|
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
|
||||||
"sending ack, %d",
|
"connection failed: %s (%d) - retrying\n",
|
||||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||||
ORTE_NAME_PRINT(&(peer->peer_name)), so_error);
|
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||||
|
strerror(so_error),
|
||||||
|
so_error);
|
||||||
}
|
}
|
||||||
|
|
||||||
mca_oob_tcp_peer_shutdown(peer);
|
mca_oob_tcp_peer_shutdown(peer);
|
||||||
opal_evtimer_add(&peer->peer_timer_event, &tv);
|
opal_evtimer_add(&peer->peer_timer_event, &tv);
|
||||||
return;
|
return;
|
||||||
@ -507,6 +518,13 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||||
|
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
|
||||||
|
"sending ack, %d",
|
||||||
|
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||||
|
ORTE_NAME_PRINT(&(peer->peer_name)), so_error);
|
||||||
|
}
|
||||||
|
|
||||||
if (mca_oob_tcp_peer_send_connect_ack(peer, sd) == ORTE_SUCCESS) {
|
if (mca_oob_tcp_peer_send_connect_ack(peer, sd) == ORTE_SUCCESS) {
|
||||||
peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
|
peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
|
||||||
opal_event_add(&peer->peer_recv_event, 0);
|
opal_event_add(&peer->peer_recv_event, 0);
|
||||||
@ -578,7 +596,7 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
|||||||
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
|
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
|
||||||
mca_oob_tcp_msg_t *msg;
|
mca_oob_tcp_msg_t *msg;
|
||||||
|
|
||||||
opal_output(0, "%s-%s mca_oob_tcp_peer_shutdown: retries exceeded",
|
opal_output(0, "%s-%s oob-tcp: Communication retries exceeded. Can not communicate with peer",
|
||||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||||
ORTE_NAME_PRINT(&(peer->peer_name)));
|
ORTE_NAME_PRINT(&(peer->peer_name)));
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user