1
1

Merge pull request #7315 from abouteiller/export/tcp_errors_v2

Handle error cases in TCP BTL (v2)
Этот коммит содержится в:
Austen Lauria 2020-01-27 17:03:07 -05:00 коммит произвёл GitHub
родитель 969eb0286c 76021e35ee
Коммит 10f6a77640
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 84 добавлений и 15 удалений

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2018 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -817,6 +817,14 @@ void mca_pml_ob1_error_handler(
return;
}
#endif /* OPAL_CUDA_SUPPORT */
/* Some BTLs report unreachable errors during normal MPI_Finalize
 * termination. Let's simply ignore such errors once MPI is no longer
 * supposed to be operational anyway.
 */
if(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
return;
}
ompi_rte_abort(-1, btlinfo);
}

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2017 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -38,6 +38,8 @@
#include "btl_tcp_proc.h"
#include "btl_tcp_endpoint.h"
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
mca_btl_base_module_error_cb_fn_t cbfunc);
mca_btl_tcp_module_t mca_btl_tcp_module = {
.super = {
@ -51,11 +53,20 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
.btl_send = mca_btl_tcp_send,
.btl_put = mca_btl_tcp_put,
.btl_dump = mca_btl_base_dump,
.btl_register_error = mca_btl_tcp_register_error_cb, /* register error */
.btl_ft_event = mca_btl_tcp_ft_event
},
.tcp_endpoints_mutex = OPAL_MUTEX_STATIC_INIT
};
/**
 * Store the upper layer's error callback on this TCP BTL module.
 *
 * The callback is later invoked (e.g. from endpoint close on a failed
 * connection) to report unrecoverable peer errors upstream.
 *
 * @param btl    the BTL module (must be a TCP BTL module)
 * @param cbfunc callback to invoke on fatal endpoint errors
 * @return OPAL_SUCCESS (registration cannot fail)
 */
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
                                         mca_btl_base_module_error_cb_fn_t cbfunc)
{
    mca_btl_tcp_module_t* tcp_module = (mca_btl_tcp_module_t*)btl;

    tcp_module->tcp_error_cb = cbfunc;
    return OPAL_SUCCESS;
}
/**
*
*/

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -388,6 +388,7 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
{
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
if (ret < 0) {
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
}
return ret;
@ -534,12 +535,26 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
btl_endpoint->endpoint_cache_length = 0;
#endif /* MCA_BTL_TCP_ENDPOINT_CACHE */
/* send a message before closing to differentiate between failures and
* clean disconnect during finalize */
if( MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state ) {
mca_btl_tcp_hdr_t fin_msg = {
.base.tag = 0,
.type = MCA_BTL_TCP_HDR_TYPE_FIN,
.count = 0,
.size = 0,
};
mca_btl_tcp_endpoint_send_blocking(btl_endpoint,
&fin_msg, sizeof(fin_msg));
}
CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd);
btl_endpoint->endpoint_sd = -1;
/**
* If we keep failing to connect to the peer let the caller know about
* this situation by triggering all the pending fragments callback and
* reporting the error.
* this situation by triggering the callback on all pending fragments and
* reporting the error. The upper layer has then the opportunity to
* re-route or re-schedule the fragments.
*/
if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) {
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
@ -547,11 +562,20 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
while(NULL != frag) {
frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, OPAL_ERR_UNREACH);
if( frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ) {
MCA_BTL_TCP_FRAG_RETURN(frag);
}
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
}
btl_endpoint->endpoint_send_frag = NULL;
/* Let's report the error upstream */
if(NULL != btl_endpoint->endpoint_btl->tcp_error_cb) {
btl_endpoint->endpoint_btl->tcp_error_cb((mca_btl_base_module_t*)btl_endpoint->endpoint_btl, 0,
btl_endpoint->endpoint_proc->proc_opal, "Socket closed");
}
} else {
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
}
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
}
/*
@ -608,7 +632,6 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
true, opal_process_info.nodename,
getpid(), "did not receive entire connect ACK from peer");
return OPAL_ERR_BAD_PARAM;
}
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
@ -628,6 +651,7 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
if (0 != opal_compare_proc(btl_proc->proc_opal->proc_name, guid)) {
BTL_ERROR(("received unexpected process identifier %s",
OPAL_NAME_PRINT(guid)));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERR_UNREACH;
}
@ -834,6 +858,7 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
((struct sockaddr_in*) &endpoint_addr)->sin_port,
strerror(opal_socket_errno), opal_socket_errno));
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERROR;
}
@ -850,6 +875,7 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
getpid(), msg,
strerror(so_error), so_error);
free(msg);
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return OPAL_ERROR;
}
@ -921,12 +947,15 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
}
else if (OPAL_ERR_BAD_PARAM == rc) {
else if (OPAL_ERR_BAD_PARAM == rc
|| OPAL_ERROR == rc) {
/* If we get a BAD_PARAM, it means that it probably wasn't
an OMPI process on the other end of the socket (e.g.,
the magic string ID failed). So we can probably just
close the socket and ignore this connection. */
CLOSE_THE_SOCKET(sd);
the magic string ID failed). recv_connect_ack already cleaned
up the socket. */
/* If we get OPAL_ERROR, the other end closed the connection
 * because it has initiated a symmetrical connection on its end.
 * recv_connect_ack already cleaned up the socket. */
}
else {
/* Otherwise, it probably *was* an OMPI peer process on
@ -1065,6 +1094,10 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
opal_event_del(&btl_endpoint->endpoint_send_event);
}
break;
case MCA_BTL_TCP_FAILED:
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]");
opal_event_del(&btl_endpoint->endpoint_send_event);
break;
default:
BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state));
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]");

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -212,7 +212,8 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
cnt = readv(sd, frag->iov_ptr, num_vecs);
if( 0 < cnt ) goto advance_iov_position;
if( cnt == 0 ) {
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state)
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return false;
}
@ -272,6 +273,10 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
if(frag->iov_cnt == 0) {
if (btl_endpoint->endpoint_nbo && frag->iov_idx == 1) MCA_BTL_TCP_HDR_NTOH(frag->hdr);
switch(frag->hdr.type) {
case MCA_BTL_TCP_HDR_TYPE_FIN:
frag->endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
mca_btl_tcp_endpoint_close(frag->endpoint);
break;
case MCA_BTL_TCP_HDR_TYPE_SEND:
if(frag->iov_idx == 1 && frag->hdr.size) {
frag->segments[0].seg_addr.pval = frag+1;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -33,6 +33,18 @@ BEGIN_C_DECLS
#define MCA_BTL_TCP_HDR_TYPE_SEND 1
#define MCA_BTL_TCP_HDR_TYPE_PUT 2
#define MCA_BTL_TCP_HDR_TYPE_GET 3
#define MCA_BTL_TCP_HDR_TYPE_FIN 4
/* The MCA_BTL_TCP_HDR_TYPE_FIN is a special kind of message sent during normal
 * connection closing. Before the endpoint closes the socket, it performs a
 * 1-way handshake by sending a FIN message in the socket. This lets the other
 * end of the connection discriminate between the case in which the peer has
 * closed intentionally (e.g., during MPI_FINALIZE), or unintentionally (e.g.,
 * as the result of some transmission or process failure).
 * The process initiating the close sends the FIN message but does not wait
 * for a 2-way handshake and closes the socket immediately. Thus, the recipient
 * of a FIN message can simply close the socket and mark the endpoint as closed
 * without error, and without answering a FIN message itself.
 */
struct mca_btl_tcp_hdr_t {
mca_btl_base_header_t base;