Merge pull request #7720 from abouteiller/bugfix/tcp-failed-lock
Race condition when closing TCP endpoint with error
Этот коммит содержится в:
Коммит
9996b9f54d
@ -388,6 +388,8 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
|
||||
{
|
||||
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
|
||||
if (ret < 0) {
|
||||
/* send-lock not needed because never called when the socket is in the
|
||||
* event set. */
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
}
|
||||
@ -1077,6 +1079,7 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
|
||||
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
|
||||
int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
|
||||
assert(btl_endpoint->endpoint_state == MCA_BTL_TCP_CONNECTED);
|
||||
if(mca_btl_tcp_frag_send(frag, btl_endpoint->endpoint_sd) == false) {
|
||||
break;
|
||||
}
|
||||
|
@ -49,6 +49,7 @@
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "btl_tcp_frag.h"
|
||||
#include "btl_tcp_proc.h"
|
||||
#include "btl_tcp_endpoint.h"
|
||||
#include "btl_tcp_proc.h"
|
||||
|
||||
@ -130,6 +131,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
|
||||
BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n",
|
||||
frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
|
||||
strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
|
||||
/* send_lock held by caller */
|
||||
frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(frag->endpoint);
|
||||
return false;
|
||||
@ -137,6 +139,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
|
||||
BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)",
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno));
|
||||
/* send_lock held by caller */
|
||||
frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(frag->endpoint);
|
||||
return false;
|
||||
@ -215,9 +218,11 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
|
||||
cnt = readv(sd, frag->iov_ptr, num_vecs);
|
||||
if( 0 < cnt ) goto advance_iov_position;
|
||||
if( cnt == 0 ) {
|
||||
OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
|
||||
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state)
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
|
||||
return false;
|
||||
}
|
||||
switch(opal_socket_errno) {
|
||||
@ -229,28 +234,25 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
|
||||
BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, %lu)\n\t%s(%lu)\n",
|
||||
frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
|
||||
strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return false;
|
||||
|
||||
break;
|
||||
case ECONNRESET:
|
||||
errhost = opal_get_proc_hostname(btl_endpoint->endpoint_proc->proc_opal);
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(), errhost);
|
||||
free(errhost);
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return false;
|
||||
|
||||
break;
|
||||
default:
|
||||
BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)",
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno));
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
|
||||
return false;
|
||||
} while( cnt < 0 );
|
||||
|
||||
advance_iov_position:
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user