Merge pull request #477 from rhc54/topic/keepalive
Add keepalive support to the TCP OOB component
Этот коммит содержится в:
Коммит
ee23b7f300
@ -13,7 +13,8 @@
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -114,7 +115,7 @@ int ompi_osc_base_process_op (void *outbuf, void *inbuf, size_t inbuflen,
|
||||
iov_count = OMPI_OSC_BASE_DECODE_MAX;
|
||||
done = opal_convertor_raw (&convertor, iov, &iov_count, &size);
|
||||
|
||||
for (int i = 0 ; i < iov_count ; ++i) {
|
||||
for (uint32_t i = 0 ; i < iov_count ; ++i) {
|
||||
int primitive_count = iov[i].iov_len / primitive_size;
|
||||
ompi_op_reduce (op, inbuf, iov[i].iov_base, primitive_count, primitive_datatype);
|
||||
inbuf = (void *)((intptr_t) inbuf + iov[i].iov_len);
|
||||
|
@ -13,7 +13,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -42,32 +42,28 @@
|
||||
#ifdef HAVE_NETINET_IN_H
|
||||
#include <netinet/in.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETINET_TCP_H
|
||||
#include <netinet/tcp.h>
|
||||
#endif
|
||||
#ifdef HAVE_ARPA_INET_H
|
||||
#include <arpa/inet.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_SOCKET_H
|
||||
#include <sys/socket.h>
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/opal_socket_errno.h"
|
||||
#include "opal/util/if.h"
|
||||
#include "opal/util/net.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/backtrace/backtrace.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/parse_options.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/oob/tcp/oob_tcp.h"
|
||||
#include "orte/mca/oob/tcp/oob_tcp_component.h"
|
||||
#include "oob_tcp_peer.h"
|
||||
@ -77,12 +73,81 @@
|
||||
* Set socket buffering
|
||||
*/
|
||||
|
||||
static void set_keepalive(int sd)
|
||||
{
|
||||
int option;
|
||||
socklen_t optlen;
|
||||
|
||||
/* see if the keepalive option is available */
|
||||
optlen = sizeof(option);
|
||||
if (getsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, &optlen) < 0) {
|
||||
/* not available, so just return */
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set the option active */
|
||||
option = 1;
|
||||
if (setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, optlen) < 0) {
|
||||
opal_output(0, "[%s:%d] setsockopt(SO_KEEPALIVE) failed: %s (%d)",
|
||||
__FILE__, __LINE__,
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
return;
|
||||
}
|
||||
#if defined(TCP_KEEPALIVE)
|
||||
/* set the idle time */
|
||||
if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPALIVE,
|
||||
&mca_oob_tcp_component.keepalive_time,
|
||||
sizeof(mca_oob_tcp_component.keepalive_time)) < 0) {
|
||||
opal_output(0, "[%s:%d] setsockopt(TCP_KEEPALIVE) failed: %s (%d)",
|
||||
__FILE__, __LINE__,
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
return;
|
||||
}
|
||||
#elif defined(TCP_KEEPIDLE)
|
||||
/* set the idle time */
|
||||
if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPIDLE,
|
||||
&mca_oob_tcp_component.keepalive_time,
|
||||
sizeof(mca_oob_tcp_component.keepalive_time)) < 0) {
|
||||
opal_output(0, "[%s:%d] setsockopt(TCP_KEEPIDLE) failed: %s (%d)",
|
||||
__FILE__, __LINE__,
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
return;
|
||||
}
|
||||
#endif // TCP_KEEPIDLE
|
||||
#if defined(TCP_KEEPINTVL)
|
||||
/* set the keepalive interval */
|
||||
if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPINTVL,
|
||||
&mca_oob_tcp_component.keepalive_intvl,
|
||||
sizeof(mca_oob_tcp_component.keepalive_intvl)) < 0) {
|
||||
opal_output(0, "[%s:%d] setsockopt(TCP_KEEPINTVL) failed: %s (%d)",
|
||||
__FILE__, __LINE__,
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
return;
|
||||
}
|
||||
#endif // TCP_KEEPINTVL
|
||||
#if defined(TCP_KEEPCNT)
|
||||
/* set the miss rate */
|
||||
if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPCNT,
|
||||
&mca_oob_tcp_component.keepalive_probes,
|
||||
sizeof(mca_oob_tcp_component.keepalive_probes)) < 0) {
|
||||
opal_output(0, "[%s:%d] setsockopt(TCP_KEEPCNT) failed: %s (%d)",
|
||||
__FILE__, __LINE__,
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
}
|
||||
#endif // TCP_KEEPCNT
|
||||
}
|
||||
|
||||
void orte_oob_tcp_set_socket_options(int sd)
|
||||
{
|
||||
#if defined(TCP_NODELAY)
|
||||
int optval;
|
||||
optval = 1;
|
||||
if(setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) {
|
||||
if (setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) {
|
||||
opal_backtrace_print(stderr, NULL, 1);
|
||||
opal_output(0, "[%s:%d] setsockopt(TCP_NODELAY) failed: %s (%d)",
|
||||
__FILE__, __LINE__,
|
||||
@ -91,8 +156,8 @@ void orte_oob_tcp_set_socket_options(int sd)
|
||||
}
|
||||
#endif
|
||||
#if defined(SO_SNDBUF)
|
||||
if(mca_oob_tcp_component.tcp_sndbuf > 0 &&
|
||||
setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *)&mca_oob_tcp_component.tcp_sndbuf, sizeof(int)) < 0) {
|
||||
if (mca_oob_tcp_component.tcp_sndbuf > 0 &&
|
||||
setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *)&mca_oob_tcp_component.tcp_sndbuf, sizeof(int)) < 0) {
|
||||
opal_output(0, "[%s:%d] setsockopt(SO_SNDBUF) failed: %s (%d)",
|
||||
__FILE__, __LINE__,
|
||||
strerror(opal_socket_errno),
|
||||
@ -100,14 +165,19 @@ void orte_oob_tcp_set_socket_options(int sd)
|
||||
}
|
||||
#endif
|
||||
#if defined(SO_RCVBUF)
|
||||
if(mca_oob_tcp_component.tcp_rcvbuf > 0 &&
|
||||
setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *)&mca_oob_tcp_component.tcp_rcvbuf, sizeof(int)) < 0) {
|
||||
if (mca_oob_tcp_component.tcp_rcvbuf > 0 &&
|
||||
setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *)&mca_oob_tcp_component.tcp_rcvbuf, sizeof(int)) < 0) {
|
||||
opal_output(0, "[%s:%d] setsockopt(SO_RCVBUF) failed: %s (%d)",
|
||||
__FILE__, __LINE__,
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
}
|
||||
#endif
|
||||
#if defined(SO_KEEPALIVE)
|
||||
if (0 < mca_oob_tcp_component.keepalive_time) {
|
||||
set_keepalive(sd);
|
||||
}
|
||||
#endif // SO_KEEPALIVE
|
||||
}
|
||||
|
||||
mca_oob_tcp_peer_t* mca_oob_tcp_peer_lookup(const orte_process_name_t *name)
|
||||
|
@ -410,6 +410,29 @@ static int tcp_component_register(void)
|
||||
&mca_oob_tcp_component.disable_ipv6_family);
|
||||
#endif
|
||||
|
||||
|
||||
mca_oob_tcp_component.keepalive_time = 10;
|
||||
(void)mca_base_component_var_register(component, "keepalive_time",
|
||||
"Idle time in seconds before starting to send keepalives (num <= 0 ----> disable keepalive)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_oob_tcp_component.keepalive_time);
|
||||
|
||||
mca_oob_tcp_component.keepalive_intvl = 60;
|
||||
(void)mca_base_component_var_register(component, "keepalive_intvl",
|
||||
"Time between keepalives, in seconds",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_oob_tcp_component.keepalive_intvl);
|
||||
mca_oob_tcp_component.keepalive_probes = 3;
|
||||
(void)mca_base_component_var_register(component, "keepalive_probes",
|
||||
"Number of keepalives that can be missed before declaring error",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_oob_tcp_component.keepalive_probes);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -77,6 +77,9 @@ typedef struct {
|
||||
bool listen_thread_active;
|
||||
struct timeval listen_thread_tv; /**< Timeout when using listen thread */
|
||||
int stop_thread[2]; /**< pipe used to exit the listen thread */
|
||||
int keepalive_probes; /**< number of keepalives that can be missed before declaring error */
|
||||
int keepalive_time; /**< idle time in seconds before starting to send keepalives */
|
||||
int keepalive_intvl; /**< time between keepalives, in seconds */
|
||||
} mca_oob_tcp_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user