Rather than setting the connect event timeout number to something big and hoping
it's bigger than the timeout for the connect() call, just don't register the handler by default and fall back to connect() timing out. Should give much happier performance on big clusters. This commit was SVN r13639.
Этот коммит содержится в:
родитель
c00d841741
Коммит
f6a5d58885
@ -226,10 +226,10 @@ int mca_oob_tcp_component_open(void)
|
||||
see AWF comment in oob_tcp_peer.c */
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"connect_timeout",
|
||||
"connect() timeout in seconds, before trying next interface",
|
||||
"connect timeout in seconds, before trying next interface (0 means block until connect() times out)",
|
||||
false,
|
||||
false,
|
||||
600,
|
||||
0,
|
||||
&mca_oob_tcp_component.tcp_timeout);
|
||||
|
||||
|
||||
|
@ -289,20 +289,26 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
||||
(struct sockaddr*)&inaddr, sizeof(struct sockaddr_in)) < 0) {
|
||||
/* non-blocking so wait for completion */
|
||||
if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
|
||||
/* AWF - the connect_timeout MCA parameter defaults to a low setting (10secs)
|
||||
to minimize job startup time on IU BigRed. However, on large machines
|
||||
such a short timeout may not be suitable -- the head node may not be
|
||||
able to accept connections fast enough. If this is the case, increase
|
||||
the connect_timeout MCA parameter.
|
||||
if (mca_oob_tcp_component.tcp_timeout > 0) {
|
||||
/* AWF - the connect_timeout MCA parameter
|
||||
defaults to a low setting (10secs) to minimize
|
||||
job startup time on IU BigRed. However, on
|
||||
large machines such a short timeout may not be
|
||||
suitable -- the head node may not be able to
|
||||
accept connections fast enough. If this is the
|
||||
case, increase the connect_timeout MCA
|
||||
parameter.
|
||||
*/
|
||||
struct timeval tv;
|
||||
|
||||
tv.tv_sec = mca_oob_tcp_component.tcp_timeout;
|
||||
tv.tv_usec = 0;
|
||||
|
||||
/* The first event is responsible for our timeout, while the second event
|
||||
may occur sooner, due to a successful connect() */
|
||||
/* The first event is responsible for our timeout,
|
||||
while the second event may occur sooner, due to
|
||||
a successful connect() */
|
||||
opal_evtimer_add(&peer->peer_timer_event, &tv);
|
||||
}
|
||||
opal_event_add(&peer->peer_send_event, 0);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user