Merge pull request #506 from rhc54/topic/retry
Support attempts to connect async processes
Этот коммит содержится в:
Коммит
6408c87aa0
@ -433,6 +433,23 @@ static int tcp_component_register(void)
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_oob_tcp_component.keepalive_probes);
|
||||
|
||||
mca_oob_tcp_component.retry_delay = 0;
|
||||
(void)mca_base_component_var_register(component, "retry_delay",
|
||||
"Time (in sec) to wait before trying to connect to peer again",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_oob_tcp_component.retry_delay);
|
||||
|
||||
mca_oob_tcp_component.max_recon_attempts = 10;
|
||||
(void)mca_base_component_var_register(component, "max_recon_attempts",
|
||||
"Max number of times to attempt connection before giving up (-1 -> never give up)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_oob_tcp_component.max_retries);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1204,6 +1221,7 @@ static void peer_cons(mca_oob_tcp_peer_t *peer)
|
||||
OBJ_CONSTRUCT(&peer->addrs, opal_list_t);
|
||||
peer->active_addr = NULL;
|
||||
peer->state = MCA_OOB_TCP_UNCONNECTED;
|
||||
peer->num_retries = 0;
|
||||
OBJ_CONSTRUCT(&peer->send_queue, opal_list_t);
|
||||
peer->send_msg = NULL;
|
||||
peer->recv_msg = NULL;
|
||||
|
@ -80,6 +80,8 @@ typedef struct {
|
||||
int keepalive_probes; /**< number of keepalives that can be missed before declaring error */
|
||||
int keepalive_time; /**< idle time in seconds before starting to send keepalives */
|
||||
int keepalive_intvl; /**< time between keepalives, in seconds */
|
||||
int retry_delay; /**< time to wait before retrying connection */
|
||||
int max_recon_attempts; /**< maximum number of times to attempt connect before giving up (-1 for never) */
|
||||
} mca_oob_tcp_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component;
|
||||
|
@ -258,11 +258,32 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
||||
/* connection succeeded */
|
||||
addr->retries = 0;
|
||||
connected = true;
|
||||
peer->num_retries = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!connected) {
|
||||
/* it could be that the intended recipient just hasn't
|
||||
* started yet. if requested, wait awhile and try again
|
||||
* unless/until we hit the maximum number of retries */
|
||||
if (0 < mca_oob_tcp_component.retry_delay) {
|
||||
if (mca_oob_tcp_component.max_recon_attempts < 0 ||
|
||||
peer->num_retries < mca_oob_tcp_component.max_recon_attempts) {
|
||||
struct timeval tv;
|
||||
/* reset the addr states */
|
||||
OPAL_LIST_FOREACH(addr, &peer->addrs, mca_oob_tcp_addr_t) {
|
||||
addr->state = MCA_OOB_TCP_UNCONNECTED;
|
||||
addr->retries = 0;
|
||||
}
|
||||
/* give it awhile and try again */
|
||||
tv.tv_sec = mca_oob_tcp_component.retry_delay;
|
||||
tv.tv_usec = 0;
|
||||
++peer->num_retries;
|
||||
ORTE_RETRY_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect, &tv);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
/* no address succeeded, so we cannot reach this peer */
|
||||
peer->state = MCA_OOB_TCP_FAILED;
|
||||
host = orte_get_proc_hostname(&(peer->name));
|
||||
|
@ -48,6 +48,7 @@ typedef struct {
|
||||
opal_list_t addrs;
|
||||
mca_oob_tcp_addr_t *active_addr;
|
||||
mca_oob_tcp_state_t state;
|
||||
int num_retries;
|
||||
opal_event_t send_event; /**< registration with event thread for send events */
|
||||
bool send_ev_active;
|
||||
opal_event_t recv_event; /**< registration with event thread for recv events */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user