Attempt to debug a hang that is hitting some environments. Posting to 1.7.4 as a placeholder for the eventual solution.
cmr=v1.7.4:reviewer=rhc This commit was SVN r30060.
Этот коммит содержится в:
родитель
9eebb79d54
Коммит
7d8c0459a4
@ -722,31 +722,38 @@ static void recv_handler(int sd, short flags, void *cbdata)
|
||||
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
|
||||
mca_oob_tcp_hdr_t hdr;
|
||||
int rc;
|
||||
size_t cnt;
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s:tcp:recv:handler called",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
/* Some mem checkers don't realize that hdr will guarantee to be
|
||||
fully filled in during the read(), below :-( */
|
||||
OPAL_DEBUG_ZERO(hdr);
|
||||
/* ensure all is zero'd */
|
||||
memset(&hdr, 0, sizeof(hdr));
|
||||
|
||||
/* recv the process identifier */
|
||||
while ((rc = recv(sd, (char *)&hdr, sizeof(hdr), 0)) != sizeof(hdr)) {
|
||||
if (rc >= 0) {
|
||||
cnt = 0;
|
||||
while (cnt < sizeof(hdr)) {
|
||||
rc = recv(sd, (char *)&hdr, sizeof(hdr), 0);
|
||||
if (0 == rc) {
|
||||
if (OOB_TCP_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: peer closed connection",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
goto cleanup;
|
||||
} else if (rc < 0) {
|
||||
if (opal_socket_errno != EINTR &&
|
||||
opal_socket_errno != EAGAIN &&
|
||||
opal_socket_errno != EWOULDBLOCK) {
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
goto cleanup;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (opal_socket_errno != EINTR) {
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
goto cleanup;
|
||||
}
|
||||
cnt += rc;
|
||||
}
|
||||
MCA_OOB_TCP_HDR_NTOH(&hdr);
|
||||
|
||||
|
@ -327,7 +327,7 @@ static int tcp_peer_send_connect_ack(mca_oob_tcp_module_t *mod,
|
||||
hdr.tag = 0;
|
||||
hdr.nbytes = 0;
|
||||
MCA_OOB_TCP_HDR_HTON(&hdr);
|
||||
if (0 > tcp_peer_send_blocking(mod, peer, &hdr, sizeof(hdr))) {
|
||||
if (ORTE_SUCCESS != tcp_peer_send_blocking(mod, peer, &hdr, sizeof(hdr))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
@ -478,7 +478,7 @@ static int tcp_peer_send_blocking(mca_oob_tcp_module_t *mod,
|
||||
opal_socket_errno);
|
||||
peer->state = MCA_OOB_TCP_FAILED;
|
||||
mca_oob_tcp_peer_close(mod, peer);
|
||||
return -1;
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@ -490,7 +490,7 @@ static int tcp_peer_send_blocking(mca_oob_tcp_module_t *mod,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(peer->name)));
|
||||
|
||||
return cnt;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -508,6 +508,9 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_module_t *mod,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer->name), peer->sd);
|
||||
|
||||
/* ensure all is zero'd */
|
||||
memset(&hdr, 0, sizeof(hdr));
|
||||
|
||||
if (tcp_peer_recv_blocking(mod, peer, &hdr, sizeof(hdr))) {
|
||||
/* If the peer state is CONNECT_ACK, then we were waiting for
|
||||
* the connection to be ack'd
|
||||
|
@ -297,12 +297,15 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
|
||||
static int read_bytes(mca_oob_tcp_peer_t* peer)
|
||||
{
|
||||
int rc;
|
||||
int retries;
|
||||
|
||||
/* read until all bytes recvd or error */
|
||||
while (0 < peer->recv_msg->rdbytes) {
|
||||
/* read until all bytes recvd or error - but limit the number of tries */
|
||||
retries = 0;
|
||||
while (0 < peer->recv_msg->rdbytes && retries < mca_oob_tcp_component.max_retries) {
|
||||
rc = read(peer->sd, peer->recv_msg->rdptr, peer->recv_msg->rdbytes);
|
||||
if (rc < 0) {
|
||||
if(opal_socket_errno == EINTR) {
|
||||
retries++;
|
||||
continue;
|
||||
} else if (opal_socket_errno == EAGAIN) {
|
||||
/* tell the caller to keep this message on active,
|
||||
@ -321,13 +324,12 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
|
||||
* the error back to the RML and let the caller know
|
||||
* to abort this message
|
||||
*/
|
||||
if (opal_output_get_verbosity(orte_oob_base_framework.framework_output) >= OOB_TCP_DEBUG_FAIL) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(peer->name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
}
|
||||
opal_output_verbose(OOB_TCP_DEBUG_FAIL, orte_oob_base_framework.framework_output,
|
||||
"%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(peer->name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
// mca_oob_tcp_peer_close(peer);
|
||||
// if (NULL != mca_oob_tcp.oob_exception_callback) {
|
||||
// mca_oob_tcp.oob_exception_callback(&peer->name, ORTE_RML_PEER_DISCONNECTED);
|
||||
@ -337,11 +339,10 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
|
||||
/* the remote peer closed the connection - report that condition
|
||||
* and let the caller know
|
||||
*/
|
||||
if (opal_output_get_verbosity(orte_oob_base_framework.framework_output) >= OOB_TCP_DEBUG_FAIL) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: peer closed connection",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(peer->name)));
|
||||
}
|
||||
opal_output_verbose(OOB_TCP_DEBUG_FAIL, orte_oob_base_framework.framework_output,
|
||||
"%s-%s mca_oob_tcp_msg_recv: peer closed connection",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(peer->name)));
|
||||
/* stop all events */
|
||||
if (peer->recv_ev_active) {
|
||||
opal_event_del(&peer->recv_event);
|
||||
@ -368,7 +369,14 @@ static int read_bytes(mca_oob_tcp_peer_t* peer)
|
||||
/* we were able to read something, so adjust counters and location */
|
||||
peer->recv_msg->rdbytes -= rc;
|
||||
peer->recv_msg->rdptr += rc;
|
||||
retries = 0;
|
||||
}
|
||||
|
||||
if (0 < peer->recv_msg->rdbytes) {
|
||||
/* we failed to read it all */
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
/* we read the full data block */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -470,8 +478,15 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
||||
peer->recv_msg->data = NULL; // make sure
|
||||
} else {
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s:tcp:recv:handler allocate data region",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
"%s:tcp:recv:handler allocate data region of size %lu",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
|
||||
/* FIXME: if the data region is absurdly large, then something is wrong */
|
||||
if (500000000 < peer->recv_msg->hdr.nbytes) {
|
||||
opal_output(0, "%s: ABSURDLY LARGE MESSAGE OF SIZE %lu",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
|
||||
mca_oob_tcp_peer_close(mod, peer);
|
||||
return;
|
||||
}
|
||||
/* allocate the data region */
|
||||
peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
|
||||
/* point to it */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user