1
1

Merge pull request #7720 from abouteiller/bugfix/tcp-failed-lock

Race condition when closing TCP endpoint with error
Этот коммит содержится в:
Austen Lauria 2020-05-13 16:52:21 -04:00 коммит произвёл GitHub
родитель f744668f5f 0e93d0f647
Коммит 9996b9f54d
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файлов: 16 добавлений и 11 удалений

Просмотреть файл

@ -388,6 +388,8 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
{
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
if (ret < 0) {
/* send-lock not needed because never called when the socket is in the
* event set. */
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
}
@ -1077,6 +1079,7 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
assert(btl_endpoint->endpoint_state == MCA_BTL_TCP_CONNECTED);
if(mca_btl_tcp_frag_send(frag, btl_endpoint->endpoint_sd) == false) {
break;
}

Просмотреть файл

@ -49,6 +49,7 @@
#include "opal/util/show_help.h"
#include "btl_tcp_frag.h"
#include "btl_tcp_proc.h"
#include "btl_tcp_endpoint.h"
#include "btl_tcp_proc.h"
@ -130,6 +131,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n",
frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
/* send_lock held by caller */
frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(frag->endpoint);
return false;
@ -137,6 +139,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)",
strerror(opal_socket_errno),
opal_socket_errno));
/* send_lock held by caller */
frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(frag->endpoint);
return false;
@ -215,9 +218,11 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
cnt = readv(sd, frag->iov_ptr, num_vecs);
if( 0 < cnt ) goto advance_iov_position;
if( cnt == 0 ) {
OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state)
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
return false;
}
switch(opal_socket_errno) {
@ -229,28 +234,25 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, %lu)\n\t%s(%lu)\n",
frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return false;
break;
case ECONNRESET:
errhost = opal_get_proc_hostname(btl_endpoint->endpoint_proc->proc_opal);
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
true, opal_process_info.nodename,
getpid(), errhost);
free(errhost);
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return false;
break;
default:
BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)",
strerror(opal_socket_errno),
opal_socket_errno));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return false;
break;
}
OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
return false;
} while( cnt < 0 );
advance_iov_position: