Adding error handling in OpenIB BTL
bugfix: major: openib send credits are returned correctly after a fault for pending frags to dead processes; also tweak the default IB retry timeouts to make this happen faster. Make it compile in non-debug builds. Mark the IB endpoint as failed when invoking an error; this resolves UDCM connection deadlocks. Changing the default IB retry timeouts is not a good idea; we'll need to find another way to speed up credit recovery in failure cases. Remove ULFM-specific cases. Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
Этот коммит содержится в:
родитель
1b96be5f2f
Коммит
e46c907468
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2017 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -3667,7 +3667,7 @@ error:
|
||||
#endif
|
||||
|
||||
if(IBV_WC_WR_FLUSH_ERR != wc->status || !flush_err_printed[cq]++) {
|
||||
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
|
||||
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s"
|
||||
"status number %d for wr_id %" PRIx64 " opcode %d vendor error %d qp_idx %d",
|
||||
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||
wc->status, wc->wr_id,
|
||||
@ -3708,9 +3708,36 @@ error:
|
||||
}
|
||||
}
|
||||
|
||||
if(openib_btl)
|
||||
if(openib_btl) {
|
||||
/* return send wqe */
|
||||
qp_put_wqe(endpoint, qp);
|
||||
|
||||
/* return wqes that were sent before this frag */
|
||||
n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des));
|
||||
|
||||
/* force emptying the pending frags toward the dead endpoint
|
||||
* in progress_pending_frags* below */
|
||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
|
||||
if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
BTL_VERBOSE(("frag %p returning %d credits", frag, 1+n));
|
||||
OPAL_THREAD_FETCH_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n);
|
||||
/* new SRQ credit available. Try to progress pending frags*/
|
||||
progress_pending_frags_srq(openib_btl, qp);
|
||||
}
|
||||
/* new wqe or/and get token available. Try to progress pending frags */
|
||||
progress_pending_frags_wqe(endpoint, qp);
|
||||
mca_btl_openib_frag_progress_pending_put_get(endpoint, qp);
|
||||
|
||||
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
||||
des->des_cbfunc(&openib_btl->super, endpoint, des, wc->status);
|
||||
}
|
||||
if (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) {
|
||||
mca_btl_openib_free(&openib_btl->super, des);
|
||||
}
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
|
||||
(struct opal_proc_t*)remote_proc, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
static int poll_device(mca_btl_openib_device_t* device, int count)
|
||||
|
@ -502,6 +502,7 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
|
||||
mca_btl_openib_frag_t *openib_frag;
|
||||
mca_btl_openib_com_frag_t *com_frag;
|
||||
mca_btl_openib_control_header_t *ctl_hdr;
|
||||
int rc;
|
||||
|
||||
OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)",
|
||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
|
||||
@ -538,11 +539,14 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
|
||||
ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS;
|
||||
|
||||
/* Send the fragment */
|
||||
if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) {
|
||||
BTL_ERROR(("Failed to post CTS send"));
|
||||
mca_btl_openib_endpoint_invoke_error(endpoint);
|
||||
if (OPAL_SUCCESS != (rc = mca_btl_openib_endpoint_post_send(endpoint, sc_frag))) {
|
||||
if( OPAL_ERR_RESOURCE_BUSY != rc ) {
|
||||
BTL_ERROR(("Failed to post CTS send"));
|
||||
mca_btl_openib_endpoint_invoke_error(endpoint);
|
||||
}
|
||||
} else {
|
||||
endpoint->endpoint_cts_sent = true;
|
||||
}
|
||||
endpoint->endpoint_cts_sent = true;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -611,8 +615,8 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
|
||||
mca_btl_openib_send_frag_t *frag;
|
||||
mca_btl_openib_endpoint_t *ep;
|
||||
bool master = false;
|
||||
int rc;
|
||||
|
||||
opal_output(-1, "Now we are CONNECTED");
|
||||
if (MCA_BTL_XRC_ENABLED) {
|
||||
opal_mutex_lock (&endpoint->ib_addr->addr_lock);
|
||||
if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
|
||||
@ -664,8 +668,11 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
|
||||
frag = to_send_frag(frag_item);
|
||||
/* We need to post this one */
|
||||
|
||||
if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
|
||||
BTL_ERROR(("Error posting send"));
|
||||
if(OPAL_SUCCESS != (rc = mca_btl_openib_endpoint_post_send(endpoint, frag))) {
|
||||
/* if we are out of resources, let's try to reschedule everything later */
|
||||
if( OPAL_ERR_RESOURCE_BUSY != rc ) {
|
||||
BTL_ERROR(("Error posting send"));
|
||||
}
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
|
||||
@ -1030,6 +1037,7 @@ void *mca_btl_openib_endpoint_invoke_error(void *context)
|
||||
}
|
||||
} else {
|
||||
btl = endpoint->endpoint_btl;
|
||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
}
|
||||
|
||||
/* If we didn't find a BTL, then just bail :-( */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user