1
1

Merge pull request #5916 from bwbarrett/revert/6acebc4

Revert "Handle error cases in TCP BTL"
Этот коммит содержится в:
Brian Barrett 2018-10-16 13:54:18 -07:00 коммит произвёл GitHub
родитель 069084e6ad 5162011428
Коммит da1189d771
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 12 добавлений и 42 удалений

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University
* Copyright (c) 2004-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -38,8 +38,6 @@
#include "btl_tcp_proc.h"
#include "btl_tcp_endpoint.h"
/* Forward declaration: the module initializer below references this function
 * through its .btl_register_error field before the definition appears. */
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
mca_btl_base_module_error_cb_fn_t cbfunc);
mca_btl_tcp_module_t mca_btl_tcp_module = {
.super = {
@ -53,20 +51,11 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
.btl_send = mca_btl_tcp_send,
.btl_put = mca_btl_tcp_put,
.btl_dump = mca_btl_base_dump,
.btl_register_error = mca_btl_tcp_register_error_cb, /* register error */
.btl_ft_event = mca_btl_tcp_ft_event
},
.tcp_endpoints_mutex = OPAL_MUTEX_STATIC_INIT
};
/**
 * Record an error callback on a TCP BTL module.
 *
 * The generic module pointer is treated as a mca_btl_tcp_module_t and the
 * supplied callback is stored in its tcp_error_cb field, replacing any
 * previously stored value.
 *
 * @param btl     Module on which to record the callback; cast to
 *                mca_btl_tcp_module_t.
 * @param cbfunc  Callback to store (stored as given, no validation).
 *
 * @return OPAL_SUCCESS unconditionally.
 */
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
                                         mca_btl_base_module_error_cb_fn_t cbfunc)
{
    ((mca_btl_tcp_module_t*)btl)->tcp_error_cb = cbfunc;

    return OPAL_SUCCESS;
}
/**
*
*/

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University
* Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -388,7 +388,6 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
{
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
if (ret < 0) {
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
}
return ret;
@ -539,9 +538,8 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
btl_endpoint->endpoint_sd = -1;
/**
* If we keep failing to connect to the peer let the caller know about
* this situation by triggering the callback on all pending fragments and
* reporting the error. The upper layer has then the opportunity to
* re-route or re-schedule the fragments.
* this situation by triggering the callback of every pending fragment and
* reporting the error.
*/
if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) {
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
@ -549,20 +547,11 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
while(NULL != frag) {
frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, OPAL_ERR_UNREACH);
if( frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ) {
MCA_BTL_TCP_FRAG_RETURN(frag);
}
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
}
btl_endpoint->endpoint_send_frag = NULL;
/* Let's report the error upstream */
if(NULL != btl_endpoint->endpoint_btl->tcp_error_cb) {
btl_endpoint->endpoint_btl->tcp_error_cb((mca_btl_base_module_t*)btl_endpoint->endpoint_btl, 0,
btl_endpoint->endpoint_proc->proc_opal, "Socket closed");
}
} else {
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
}
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
}
/*
@ -619,6 +608,7 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
true, opal_process_info.nodename,
getpid(), "did not receive entire connect ACK from peer");
return OPAL_ERR_BAD_PARAM;
}
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
@ -638,7 +628,6 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
if (0 != opal_compare_proc(btl_proc->proc_opal->proc_name, guid)) {
BTL_ERROR(("received unexpected process identifier %s",
OPAL_NAME_PRINT(guid)));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERR_UNREACH;
}
@ -845,7 +834,6 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
((struct sockaddr_in*) &endpoint_addr)->sin_port,
strerror(opal_socket_errno), opal_socket_errno));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERROR;
}
@ -862,7 +850,6 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
getpid(), msg,
strerror(opal_socket_errno), opal_socket_errno);
free(msg);
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERROR;
}
@ -934,15 +921,12 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
}
else if (OPAL_ERR_BAD_PARAM == rc
|| OPAL_ERROR == rc) {
else if (OPAL_ERR_BAD_PARAM == rc) {
/* If we get a BAD_PARAM, it means that it probably wasn't
an OMPI process on the other end of the socket (e.g.,
the magic string ID failed). recv_connect_ack already cleaned
up the socket. */
/* If we get OPAL_ERROR, the other end closed the connection
* because it has initiated a symmetrical connection on its end.
* recv_connect_ack already cleaned up the socket. */
the magic string ID failed). So we can probably just
close the socket and ignore this connection. */
CLOSE_THE_SOCKET(sd);
}
else {
/* Otherwise, it probably *was* an OMPI peer process on
@ -1081,8 +1065,6 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
opal_event_del(&btl_endpoint->endpoint_send_event);
}
break;
case MCA_BTL_TCP_FAILED:
break;
default:
BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state));
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]");

Просмотреть файл

@ -212,8 +212,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
cnt = readv(sd, frag->iov_ptr, num_vecs);
if( 0 < cnt ) goto advance_iov_position;
if( cnt == 0 ) {
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state)
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return false;
}