1
1

Attempt to debug a hang that is hitting some environments. Posting to 1.7.4 as a placeholder for the eventual solution.

cmr=v1.7.4:reviewer=rhc

This commit was SVN r30060.
Этот коммит содержится в:
Ralph Castain 2013-12-23 19:57:05 +00:00
родитель 9eebb79d54
Коммит 7d8c0459a4
3 изменённых файла: 55 добавлений и 30 удалений

Просмотреть файл

@ -722,31 +722,38 @@ static void recv_handler(int sd, short flags, void *cbdata)
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
mca_oob_tcp_hdr_t hdr;
int rc;
size_t cnt;
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s:tcp:recv:handler called",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* Some mem checkers don't realize that hdr will guarantee to be
fully filled in during the read(), below :-( */
OPAL_DEBUG_ZERO(hdr);
/* ensure all is zero'd */
memset(&hdr, 0, sizeof(hdr));
/* recv the process identifier */
while ((rc = recv(sd, (char *)&hdr, sizeof(hdr), 0)) != sizeof(hdr)) {
if (rc >= 0) {
cnt = 0;
while (cnt < sizeof(hdr)) {
rc = recv(sd, (char *)&hdr, sizeof(hdr), 0);
if (0 == rc) {
if (OOB_TCP_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
opal_output(0, "%s mca_oob_tcp_recv_handler: peer closed connection",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
CLOSE_THE_SOCKET(sd);
goto cleanup;
} else if (rc < 0) {
if (opal_socket_errno != EINTR &&
opal_socket_errno != EAGAIN &&
opal_socket_errno != EWOULDBLOCK) {
opal_output(0, "%s mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd);
goto cleanup;
}
continue;
}
if (opal_socket_errno != EINTR) {
opal_output(0, "%s mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd);
goto cleanup;
}
cnt += rc;
}
MCA_OOB_TCP_HDR_NTOH(&hdr);

Просмотреть файл

@ -327,7 +327,7 @@ static int tcp_peer_send_connect_ack(mca_oob_tcp_module_t *mod,
hdr.tag = 0;
hdr.nbytes = 0;
MCA_OOB_TCP_HDR_HTON(&hdr);
if (0 > tcp_peer_send_blocking(mod, peer, &hdr, sizeof(hdr))) {
if (ORTE_SUCCESS != tcp_peer_send_blocking(mod, peer, &hdr, sizeof(hdr))) {
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
@ -478,7 +478,7 @@ static int tcp_peer_send_blocking(mca_oob_tcp_module_t *mod,
opal_socket_errno);
peer->state = MCA_OOB_TCP_FAILED;
mca_oob_tcp_peer_close(mod, peer);
return -1;
return ORTE_ERR_UNREACH;
}
continue;
}
@ -490,7 +490,7 @@ static int tcp_peer_send_blocking(mca_oob_tcp_module_t *mod,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)));
return cnt;
return ORTE_SUCCESS;
}
/*
@ -508,6 +508,9 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_module_t *mod,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->name), peer->sd);
/* ensure all is zero'd */
memset(&hdr, 0, sizeof(hdr));
if (tcp_peer_recv_blocking(mod, peer, &hdr, sizeof(hdr))) {
/* If the peer state is CONNECT_ACK, then we were waiting for
* the connection to be ack'd

Просмотреть файл

@ -297,12 +297,15 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
static int read_bytes(mca_oob_tcp_peer_t* peer)
{
int rc;
int retries;
/* read until all bytes recvd or error */
while (0 < peer->recv_msg->rdbytes) {
/* read until all bytes recvd or error - but limit the number of tries */
retries = 0;
while (0 < peer->recv_msg->rdbytes && retries < mca_oob_tcp_component.max_retries) {
rc = read(peer->sd, peer->recv_msg->rdptr, peer->recv_msg->rdbytes);
if (rc < 0) {
if(opal_socket_errno == EINTR) {
retries++;
continue;
} else if (opal_socket_errno == EAGAIN) {
/* tell the caller to keep this message on active,
@ -321,13 +324,12 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
* the error back to the RML and let the caller know
* to abort this message
*/
if (opal_output_get_verbosity(orte_oob_base_framework.framework_output) >= OOB_TCP_DEBUG_FAIL) {
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
strerror(opal_socket_errno),
opal_socket_errno);
}
opal_output_verbose(OOB_TCP_DEBUG_FAIL, orte_oob_base_framework.framework_output,
"%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
strerror(opal_socket_errno),
opal_socket_errno);
// mca_oob_tcp_peer_close(peer);
// if (NULL != mca_oob_tcp.oob_exception_callback) {
// mca_oob_tcp.oob_exception_callback(&peer->name, ORTE_RML_PEER_DISCONNECTED);
@ -337,11 +339,10 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
/* the remote peer closed the connection - report that condition
* and let the caller know
*/
if (opal_output_get_verbosity(orte_oob_base_framework.framework_output) >= OOB_TCP_DEBUG_FAIL) {
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: peer closed connection",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)));
}
opal_output_verbose(OOB_TCP_DEBUG_FAIL, orte_oob_base_framework.framework_output,
"%s-%s mca_oob_tcp_msg_recv: peer closed connection",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)));
/* stop all events */
if (peer->recv_ev_active) {
opal_event_del(&peer->recv_event);
@ -368,7 +369,14 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
/* we were able to read something, so adjust counters and location */
peer->recv_msg->rdbytes -= rc;
peer->recv_msg->rdptr += rc;
retries = 0;
}
if (0 < peer->recv_msg->rdbytes) {
/* we failed to read it all */
return ORTE_ERR_COMM_FAILURE;
}
/* we read the full data block */
return ORTE_SUCCESS;
}
@ -470,8 +478,15 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
peer->recv_msg->data = NULL; // make sure
} else {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s:tcp:recv:handler allocate data region",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
"%s:tcp:recv:handler allocate data region of size %lu",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
/* FIXME: if the data region is absurdly large, then something is wrong */
if (500000000 < peer->recv_msg->hdr.nbytes) {
opal_output(0, "%s: ABSURDLY LARGE MESSAGE OF SIZE %lu",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
mca_oob_tcp_peer_close(mod, peer);
return;
}
/* allocate the data region */
peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
/* point to it */