1
1

Merge pull request #506 from rhc54/topic/retry

Support attempts to connect async processes
Этот коммит содержится в:
rhc54 2015-04-02 01:33:06 -07:00
родитель 9f8ae59162 a4b466efc4
Коммит 6408c87aa0
4 изменённых файлов: 42 добавлений и 0 удалений

Просмотреть файл

@ -433,6 +433,23 @@ static int tcp_component_register(void)
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_oob_tcp_component.keepalive_probes);
mca_oob_tcp_component.retry_delay = 0;
(void)mca_base_component_var_register(component, "retry_delay",
"Time (in sec) to wait before trying to connect to peer again",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_oob_tcp_component.retry_delay);
/* Default to 10 connection attempts; a value of -1 means never give up. */
mca_oob_tcp_component.max_recon_attempts = 10;
/* The storage pointer must be the same field that was just given its
 * default and that the connect-retry logic compares against; binding the
 * parameter to any other member would make the setting have no effect. */
(void)mca_base_component_var_register(component, "max_recon_attempts",
"Max number of times to attempt connection before giving up (-1 -> never give up)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_oob_tcp_component.max_recon_attempts);
return ORTE_SUCCESS;
}
@ -1204,6 +1221,7 @@ static void peer_cons(mca_oob_tcp_peer_t *peer)
OBJ_CONSTRUCT(&peer->addrs, opal_list_t);
peer->active_addr = NULL;
peer->state = MCA_OOB_TCP_UNCONNECTED;
peer->num_retries = 0;
OBJ_CONSTRUCT(&peer->send_queue, opal_list_t);
peer->send_msg = NULL;
peer->recv_msg = NULL;

Просмотреть файл

@ -80,6 +80,8 @@ typedef struct {
int keepalive_probes; /**< number of keepalives that can be missed before declaring error */
int keepalive_time; /**< idle time in seconds before starting to send keepalives */
int keepalive_intvl; /**< time between keepalives, in seconds */
int retry_delay; /**< time to wait before retrying connection */
int max_recon_attempts; /**< maximum number of times to attempt connect before giving up (-1 for never) */
} mca_oob_tcp_component_t;
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component;

Просмотреть файл

@ -258,11 +258,32 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
/* connection succeeded */
addr->retries = 0;
connected = true;
peer->num_retries = 0;
break;
}
}
if (!connected) {
/* it could be that the intended recipient just hasn't
* started yet. if requested, wait awhile and try again
* unless/until we hit the maximum number of retries */
if (0 < mca_oob_tcp_component.retry_delay) {
if (mca_oob_tcp_component.max_recon_attempts < 0 ||
peer->num_retries < mca_oob_tcp_component.max_recon_attempts) {
struct timeval tv;
/* reset the addr states */
OPAL_LIST_FOREACH(addr, &peer->addrs, mca_oob_tcp_addr_t) {
addr->state = MCA_OOB_TCP_UNCONNECTED;
addr->retries = 0;
}
/* give it awhile and try again */
tv.tv_sec = mca_oob_tcp_component.retry_delay;
tv.tv_usec = 0;
++peer->num_retries;
ORTE_RETRY_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect, &tv);
goto cleanup;
}
}
/* no address succeeded, so we cannot reach this peer */
peer->state = MCA_OOB_TCP_FAILED;
host = orte_get_proc_hostname(&(peer->name));

Просмотреть файл

@ -48,6 +48,7 @@ typedef struct {
opal_list_t addrs;
mca_oob_tcp_addr_t *active_addr;
mca_oob_tcp_state_t state;
int num_retries;
opal_event_t send_event; /**< registration with event thread for send events */
bool send_ev_active;
opal_event_t recv_event; /**< registration with event thread for recv events */