1
1

Merge pull request #5916 from bwbarrett/revert/6acebc4

Revert "Handle error cases in TCP BTL"
Этот коммит содержится в:
Brian Barrett 2018-10-16 13:54:18 -07:00 коммит произвёл GitHub
родитель 069084e6ad 5162011428
Коммит da1189d771
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 12 добавлений и 42 удалений

Просмотреть файл

@@ -3,7 +3,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University * Copyright (c) 2004-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -38,8 +38,6 @@
#include "btl_tcp_proc.h" #include "btl_tcp_proc.h"
#include "btl_tcp_endpoint.h" #include "btl_tcp_endpoint.h"
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
mca_btl_base_module_error_cb_fn_t cbfunc);
mca_btl_tcp_module_t mca_btl_tcp_module = { mca_btl_tcp_module_t mca_btl_tcp_module = {
.super = { .super = {
@@ -53,20 +51,11 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
.btl_send = mca_btl_tcp_send, .btl_send = mca_btl_tcp_send,
.btl_put = mca_btl_tcp_put, .btl_put = mca_btl_tcp_put,
.btl_dump = mca_btl_base_dump, .btl_dump = mca_btl_base_dump,
.btl_register_error = mca_btl_tcp_register_error_cb, /* register error */
.btl_ft_event = mca_btl_tcp_ft_event .btl_ft_event = mca_btl_tcp_ft_event
}, },
.tcp_endpoints_mutex = OPAL_MUTEX_STATIC_INIT .tcp_endpoints_mutex = OPAL_MUTEX_STATIC_INIT
}; };
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
mca_btl_base_module_error_cb_fn_t cbfunc)
{
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*)btl;
tcp_btl->tcp_error_cb = cbfunc;
return OPAL_SUCCESS;
}
/** /**
* *
*/ */

Просмотреть файл

@@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University * Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -388,7 +388,6 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
{ {
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size); int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
if (ret < 0) { if (ret < 0) {
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint); mca_btl_tcp_endpoint_close(btl_endpoint);
} }
return ret; return ret;
@@ -539,9 +538,8 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
btl_endpoint->endpoint_sd = -1; btl_endpoint->endpoint_sd = -1;
/** /**
* If we keep failing to connect to the peer let the caller know about * If we keep failing to connect to the peer let the caller know about
* this situation by triggering the callback on all pending fragments and * this situation by triggering all the pending fragments callback and
* reporting the error. The upper layer has then the opportunity to * reporting the error.
* re-route or re-schedule the fragments.
*/ */
if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) { if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) {
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag; mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
@@ -549,20 +547,11 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags); frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
while(NULL != frag) { while(NULL != frag) {
frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, OPAL_ERR_UNREACH); frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, OPAL_ERR_UNREACH);
if( frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ) {
MCA_BTL_TCP_FRAG_RETURN(frag);
}
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags); frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
} }
btl_endpoint->endpoint_send_frag = NULL;
/* Let's report the error upstream */
if(NULL != btl_endpoint->endpoint_btl->tcp_error_cb) {
btl_endpoint->endpoint_btl->tcp_error_cb((mca_btl_base_module_t*)btl_endpoint->endpoint_btl, 0,
btl_endpoint->endpoint_proc->proc_opal, "Socket closed");
}
} else {
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
} }
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
} }
/* /*
@@ -619,6 +608,7 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail", opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
true, opal_process_info.nodename, true, opal_process_info.nodename,
getpid(), "did not receive entire connect ACK from peer"); getpid(), "did not receive entire connect ACK from peer");
return OPAL_ERR_BAD_PARAM; return OPAL_ERR_BAD_PARAM;
} }
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) { if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
@@ -638,7 +628,6 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
if (0 != opal_compare_proc(btl_proc->proc_opal->proc_name, guid)) { if (0 != opal_compare_proc(btl_proc->proc_opal->proc_name, guid)) {
BTL_ERROR(("received unexpected process identifier %s", BTL_ERROR(("received unexpected process identifier %s",
OPAL_NAME_PRINT(guid))); OPAL_NAME_PRINT(guid)));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint); mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERR_UNREACH; return OPAL_ERR_UNREACH;
} }
@@ -845,7 +834,6 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
opal_net_get_hostname((struct sockaddr*) &endpoint_addr), opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
((struct sockaddr_in*) &endpoint_addr)->sin_port, ((struct sockaddr_in*) &endpoint_addr)->sin_port,
strerror(opal_socket_errno), opal_socket_errno)); strerror(opal_socket_errno), opal_socket_errno));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint); mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERROR; return OPAL_ERROR;
} }
@@ -862,7 +850,6 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
getpid(), msg, getpid(), msg,
strerror(opal_socket_errno), opal_socket_errno); strerror(opal_socket_errno), opal_socket_errno);
free(msg); free(msg);
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint); mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERROR; return OPAL_ERROR;
} }
@@ -934,15 +921,12 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected"); MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
} }
else if (OPAL_ERR_BAD_PARAM == rc else if (OPAL_ERR_BAD_PARAM == rc) {
|| OPAL_ERROR == rc) {
/* If we get a BAD_PARAM, it means that it probably wasn't /* If we get a BAD_PARAM, it means that it probably wasn't
an OMPI process on the other end of the socket (e.g., an OMPI process on the other end of the socket (e.g.,
the magic string ID failed). recv_connect_ack already cleaned the magic string ID failed). So we can probably just
up the socket. */ close the socket and ignore this connection. */
/* If we get OPAL_ERROR, the other end closed the connection CLOSE_THE_SOCKET(sd);
* because it has initiated a symetrical connexion on its end.
* recv_connect_ack already cleaned up the socket. */
} }
else { else {
/* Otherwise, it probably *was* an OMPI peer process on /* Otherwise, it probably *was* an OMPI peer process on
@@ -1081,8 +1065,6 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
opal_event_del(&btl_endpoint->endpoint_send_event); opal_event_del(&btl_endpoint->endpoint_send_event);
} }
break; break;
case MCA_BTL_TCP_FAILED:
break;
default: default:
BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state)); BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state));
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]"); MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]");

Просмотреть файл

@@ -212,8 +212,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
cnt = readv(sd, frag->iov_ptr, num_vecs); cnt = readv(sd, frag->iov_ptr, num_vecs);
if( 0 < cnt ) goto advance_iov_position; if( 0 < cnt ) goto advance_iov_position;
if( cnt == 0 ) { if( cnt == 0 ) {
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state) btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint); mca_btl_tcp_endpoint_close(btl_endpoint);
return false; return false;
} }