From 1a4c9960e1df53e32cab4f6c98e882111dced72b Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 16 May 2015 08:33:17 -0400 Subject: [PATCH] oob tcp: set KEEPALIVE timeout 60s, retry interval 5s The timeout is the frequency at which to send keepalive pings; the retry interval is how often to send successive pings once a keepalive has not been answered. Also update comments and MCA param help strings. 60 seconds -- squashme --- orte/mca/oob/tcp/oob_tcp_component.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 53ce5b6cc7..2045189e68 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -404,25 +404,29 @@ static int tcp_component_register(void) &mca_oob_tcp_component.disable_ipv6_family); #endif -#if !defined(__APPLE__) - mca_oob_tcp_component.keepalive_time = 10; + // Default to keepalives every 60 seconds + mca_oob_tcp_component.keepalive_time = 60; (void)mca_base_component_var_register(component, "keepalive_time", - "Idle time in seconds before starting to send keepalives (num <= 0 ----> disable keepalive)", + "Idle time in seconds before starting to send keepalives (keepalive_time <= 0 disables keepalive functionality)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.keepalive_time); - mca_oob_tcp_component.keepalive_intvl = 60; + // Default to keepalive retry interval time of 5 seconds + mca_oob_tcp_component.keepalive_intvl = 5; (void)mca_base_component_var_register(component, "keepalive_intvl", - "Time between keepalives, in seconds", + "Time between successive keepalive pings when peer has not responded, in seconds (ignored if keepalive_time <= 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.keepalive_intvl); + + // Default to retrying a keepalive 3 times before declaring the + // peer kaput 
mca_oob_tcp_component.keepalive_probes = 3; (void)mca_base_component_var_register(component, "keepalive_probes", - "Number of keepalives that can be missed before declaring error", + "Number of keepalives that can be missed before declaring error (ignored if keepalive_time <= 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,