1
1

Rather than setting the connect event timeout to something big and hoping

it's bigger than the timeout for the connect() call, just don't register
the handler by default and fall back to connect() timing out.  Should give
much happier performance on big clusters.

This commit was SVN r13639.
Этот коммит содержится в:
Brian Barrett 2007-02-13 18:36:50 +00:00
родитель c00d841741
Коммит f6a5d58885
2 изменённых файлов: 20 добавлений и 14 удалений

Просмотреть файл

@ -226,10 +226,10 @@ int mca_oob_tcp_component_open(void)
see AWF comment in oob_tcp_peer.c */
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
"connect_timeout",
"connect() timeout in seconds, before trying next interface",
"connect timeout in seconds, before trying next interface (0 means block until connect() times out)",
false,
false,
600,
0,
&mca_oob_tcp_component.tcp_timeout);

Просмотреть файл

@ -289,20 +289,26 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
(struct sockaddr*)&inaddr, sizeof(struct sockaddr_in)) < 0) {
/* non-blocking so wait for completion */
if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
/* AWF - the connect_timeout MCA parameter defaults to a low setting (10secs)
to minimize job startup time on IU BigRed. However, on large machines
such a short timeout may not be suitable -- the head node may not be
able to accept connections fast enough. If this is the case, increase
the connect_timeout MCA parameter.
*/
struct timeval tv;
if (mca_oob_tcp_component.tcp_timeout > 0) {
/* AWF - the connect_timeout MCA parameter
defaults to a low setting (10secs) to minimize
job startup time on IU BigRed. However, on
large machines such a short timeout may not be
suitable -- the head node may not be able to
accept connections fast enough. If this is the
case, increase the connect_timeout MCA
parameter.
*/
struct timeval tv;
tv.tv_sec = mca_oob_tcp_component.tcp_timeout;
tv.tv_usec = 0;
tv.tv_sec = mca_oob_tcp_component.tcp_timeout;
tv.tv_usec = 0;
/* The first event is responsible for our timeout, while the second event
may occur sooner, due to a successful connect() */
opal_evtimer_add(&peer->peer_timer_event, &tv);
/* The first event is responsible for our timeout,
while the second event may occur sooner, due to
a successful connect() */
opal_evtimer_add(&peer->peer_timer_event, &tv);
}
opal_event_add(&peer->peer_send_event, 0);
return ORTE_SUCCESS;