Attempt to debug a hang that is hitting some environments. Posting to 1.7.4 as a placeholder for the eventual solution.
cmr=v1.7.4:reviewer=rhc This commit was SVN r30060.
Этот коммит содержится в:
родитель
9eebb79d54
Коммит
7d8c0459a4
@ -722,31 +722,38 @@ static void recv_handler(int sd, short flags, void *cbdata)
|
|||||||
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
|
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
|
||||||
mca_oob_tcp_hdr_t hdr;
|
mca_oob_tcp_hdr_t hdr;
|
||||||
int rc;
|
int rc;
|
||||||
|
size_t cnt;
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s:tcp:recv:handler called",
|
"%s:tcp:recv:handler called",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
|
||||||
/* Some mem checkers don't realize that hdr will guarantee to be
|
/* ensure all is zero'd */
|
||||||
fully filled in during the read(), below :-( */
|
memset(&hdr, 0, sizeof(hdr));
|
||||||
OPAL_DEBUG_ZERO(hdr);
|
|
||||||
|
|
||||||
/* recv the process identifier */
|
/* recv the process identifier */
|
||||||
while ((rc = recv(sd, (char *)&hdr, sizeof(hdr), 0)) != sizeof(hdr)) {
|
cnt = 0;
|
||||||
if (rc >= 0) {
|
while (cnt < sizeof(hdr)) {
|
||||||
|
rc = recv(sd, (char *)&hdr, sizeof(hdr), 0);
|
||||||
|
if (0 == rc) {
|
||||||
if (OOB_TCP_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
|
if (OOB_TCP_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
|
||||||
opal_output(0, "%s mca_oob_tcp_recv_handler: peer closed connection",
|
opal_output(0, "%s mca_oob_tcp_recv_handler: peer closed connection",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
}
|
}
|
||||||
CLOSE_THE_SOCKET(sd);
|
CLOSE_THE_SOCKET(sd);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
|
} else if (rc < 0) {
|
||||||
|
if (opal_socket_errno != EINTR &&
|
||||||
|
opal_socket_errno != EAGAIN &&
|
||||||
|
opal_socket_errno != EWOULDBLOCK) {
|
||||||
|
opal_output(0, "%s mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno);
|
||||||
|
CLOSE_THE_SOCKET(sd);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
if (opal_socket_errno != EINTR) {
|
cnt += rc;
|
||||||
opal_output(0, "%s mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno);
|
|
||||||
CLOSE_THE_SOCKET(sd);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
MCA_OOB_TCP_HDR_NTOH(&hdr);
|
MCA_OOB_TCP_HDR_NTOH(&hdr);
|
||||||
|
|
||||||
|
@ -327,7 +327,7 @@ static int tcp_peer_send_connect_ack(mca_oob_tcp_module_t *mod,
|
|||||||
hdr.tag = 0;
|
hdr.tag = 0;
|
||||||
hdr.nbytes = 0;
|
hdr.nbytes = 0;
|
||||||
MCA_OOB_TCP_HDR_HTON(&hdr);
|
MCA_OOB_TCP_HDR_HTON(&hdr);
|
||||||
if (0 > tcp_peer_send_blocking(mod, peer, &hdr, sizeof(hdr))) {
|
if (ORTE_SUCCESS != tcp_peer_send_blocking(mod, peer, &hdr, sizeof(hdr))) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
|
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
|
||||||
return ORTE_ERR_UNREACH;
|
return ORTE_ERR_UNREACH;
|
||||||
}
|
}
|
||||||
@ -478,7 +478,7 @@ static int tcp_peer_send_blocking(mca_oob_tcp_module_t *mod,
|
|||||||
opal_socket_errno);
|
opal_socket_errno);
|
||||||
peer->state = MCA_OOB_TCP_FAILED;
|
peer->state = MCA_OOB_TCP_FAILED;
|
||||||
mca_oob_tcp_peer_close(mod, peer);
|
mca_oob_tcp_peer_close(mod, peer);
|
||||||
return -1;
|
return ORTE_ERR_UNREACH;
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -490,7 +490,7 @@ static int tcp_peer_send_blocking(mca_oob_tcp_module_t *mod,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&(peer->name)));
|
ORTE_NAME_PRINT(&(peer->name)));
|
||||||
|
|
||||||
return cnt;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -508,6 +508,9 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_module_t *mod,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&peer->name), peer->sd);
|
ORTE_NAME_PRINT(&peer->name), peer->sd);
|
||||||
|
|
||||||
|
/* ensure all is zero'd */
|
||||||
|
memset(&hdr, 0, sizeof(hdr));
|
||||||
|
|
||||||
if (tcp_peer_recv_blocking(mod, peer, &hdr, sizeof(hdr))) {
|
if (tcp_peer_recv_blocking(mod, peer, &hdr, sizeof(hdr))) {
|
||||||
/* If the peer state is CONNECT_ACK, then we were waiting for
|
/* If the peer state is CONNECT_ACK, then we were waiting for
|
||||||
* the connection to be ack'd
|
* the connection to be ack'd
|
||||||
|
@ -297,12 +297,15 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
|
|||||||
static int read_bytes(mca_oob_tcp_peer_t* peer)
|
static int read_bytes(mca_oob_tcp_peer_t* peer)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
int retries;
|
||||||
|
|
||||||
/* read until all bytes recvd or error */
|
/* read until all bytes recvd or error - but limit the number of tries */
|
||||||
while (0 < peer->recv_msg->rdbytes) {
|
retries = 0;
|
||||||
|
while (0 < peer->recv_msg->rdbytes && retries < mca_oob_tcp_component.max_retries) {
|
||||||
rc = read(peer->sd, peer->recv_msg->rdptr, peer->recv_msg->rdbytes);
|
rc = read(peer->sd, peer->recv_msg->rdptr, peer->recv_msg->rdbytes);
|
||||||
if (rc < 0) {
|
if (rc < 0) {
|
||||||
if(opal_socket_errno == EINTR) {
|
if(opal_socket_errno == EINTR) {
|
||||||
|
retries++;
|
||||||
continue;
|
continue;
|
||||||
} else if (opal_socket_errno == EAGAIN) {
|
} else if (opal_socket_errno == EAGAIN) {
|
||||||
/* tell the caller to keep this message on active,
|
/* tell the caller to keep this message on active,
|
||||||
@ -321,13 +324,12 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
|
|||||||
* the error back to the RML and let the caller know
|
* the error back to the RML and let the caller know
|
||||||
* to abort this message
|
* to abort this message
|
||||||
*/
|
*/
|
||||||
if (opal_output_get_verbosity(orte_oob_base_framework.framework_output) >= OOB_TCP_DEBUG_FAIL) {
|
opal_output_verbose(OOB_TCP_DEBUG_FAIL, orte_oob_base_framework.framework_output,
|
||||||
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)",
|
"%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&(peer->name)),
|
ORTE_NAME_PRINT(&(peer->name)),
|
||||||
strerror(opal_socket_errno),
|
strerror(opal_socket_errno),
|
||||||
opal_socket_errno);
|
opal_socket_errno);
|
||||||
}
|
|
||||||
// mca_oob_tcp_peer_close(peer);
|
// mca_oob_tcp_peer_close(peer);
|
||||||
// if (NULL != mca_oob_tcp.oob_exception_callback) {
|
// if (NULL != mca_oob_tcp.oob_exception_callback) {
|
||||||
// mca_oob_tcp.oob_exception_callback(&peer->name, ORTE_RML_PEER_DISCONNECTED);
|
// mca_oob_tcp.oob_exception_callback(&peer->name, ORTE_RML_PEER_DISCONNECTED);
|
||||||
@ -337,11 +339,10 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
|
|||||||
/* the remote peer closed the connection - report that condition
|
/* the remote peer closed the connection - report that condition
|
||||||
* and let the caller know
|
* and let the caller know
|
||||||
*/
|
*/
|
||||||
if (opal_output_get_verbosity(orte_oob_base_framework.framework_output) >= OOB_TCP_DEBUG_FAIL) {
|
opal_output_verbose(OOB_TCP_DEBUG_FAIL, orte_oob_base_framework.framework_output,
|
||||||
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: peer closed connection",
|
"%s-%s mca_oob_tcp_msg_recv: peer closed connection",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&(peer->name)));
|
ORTE_NAME_PRINT(&(peer->name)));
|
||||||
}
|
|
||||||
/* stop all events */
|
/* stop all events */
|
||||||
if (peer->recv_ev_active) {
|
if (peer->recv_ev_active) {
|
||||||
opal_event_del(&peer->recv_event);
|
opal_event_del(&peer->recv_event);
|
||||||
@ -368,7 +369,14 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
|
|||||||
/* we were able to read something, so adjust counters and location */
|
/* we were able to read something, so adjust counters and location */
|
||||||
peer->recv_msg->rdbytes -= rc;
|
peer->recv_msg->rdbytes -= rc;
|
||||||
peer->recv_msg->rdptr += rc;
|
peer->recv_msg->rdptr += rc;
|
||||||
|
retries = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (0 < peer->recv_msg->rdbytes) {
|
||||||
|
/* we failed to read it all */
|
||||||
|
return ORTE_ERR_COMM_FAILURE;
|
||||||
|
}
|
||||||
|
|
||||||
/* we read the full data block */
|
/* we read the full data block */
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -470,8 +478,15 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
|||||||
peer->recv_msg->data = NULL; // make sure
|
peer->recv_msg->data = NULL; // make sure
|
||||||
} else {
|
} else {
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s:tcp:recv:handler allocate data region",
|
"%s:tcp:recv:handler allocate data region of size %lu",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
|
||||||
|
/* FIXME: if the data region is absurdly large, then something is wrong */
|
||||||
|
if (500000000 < peer->recv_msg->hdr.nbytes) {
|
||||||
|
opal_output(0, "%s: ABSURDLY LARGE MESSAGE OF SIZE %lu",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
|
||||||
|
mca_oob_tcp_peer_close(mod, peer);
|
||||||
|
return;
|
||||||
|
}
|
||||||
/* allocate the data region */
|
/* allocate the data region */
|
||||||
peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
|
peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
|
||||||
/* point to it */
|
/* point to it */
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user