diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 28fd735ab9..966796f193 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -3666,7 +3666,7 @@ error: #endif if(IBV_WC_WR_FLUSH_ERR != wc->status || !flush_err_printed[cq]++) { - BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s " + BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s" "status number %d for wr_id %" PRIx64 " opcode %d vendor error %d qp_idx %d", cq_name[cq], btl_openib_component_status_to_string(wc->status), wc->status, wc->wr_id, @@ -3707,9 +3707,36 @@ error: } } - if(openib_btl) + if(openib_btl) { + /* return send wqe */ + qp_put_wqe(endpoint, qp); + + /* return wqes that were sent before this frag */ + n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des)); + + /* force emptying the pending frags toward the dead endpoint + * in progress_pending_frags* below */ + endpoint->endpoint_state = MCA_BTL_IB_FAILED; + + if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) { + BTL_VERBOSE(("frag %p returning %d credits", frag, 1+n)); + OPAL_THREAD_FETCH_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n); + /* new SRQ credit available. Try to progress pending frags*/ + progress_pending_frags_srq(openib_btl, qp); + } + /* new wqe or/and get token available. Try to progress pending frags */ + progress_pending_frags_wqe(endpoint, qp); + mca_btl_openib_frag_progress_pending_put_get(endpoint, qp); + + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + des->des_cbfunc(&openib_btl->super, endpoint, des, wc->status); + } + if (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) { + mca_btl_openib_free(&openib_btl->super, des); + } openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, (struct opal_proc_t*)remote_proc, NULL); + } } static int poll_device(mca_btl_openib_device_t* device, int count) diff --git a/opal/mca/btl/openib/btl_openib_endpoint.c b/opal/mca/btl/openib/btl_openib_endpoint.c index e1f9a32681..6fbf276d9f 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.c +++ b/opal/mca/btl/openib/btl_openib_endpoint.c @@ -502,6 +502,7 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint) mca_btl_openib_frag_t *openib_frag; mca_btl_openib_com_frag_t *com_frag; mca_btl_openib_control_header_t *ctl_hdr; + int rc; OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)", opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), @@ -538,11 +539,14 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint) ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS; /* Send the fragment */ - if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) { - BTL_ERROR(("Failed to post CTS send")); - mca_btl_openib_endpoint_invoke_error(endpoint); + if (OPAL_SUCCESS != (rc = mca_btl_openib_endpoint_post_send(endpoint, sc_frag))) { + if( OPAL_ERR_RESOURCE_BUSY != rc ) { + BTL_ERROR(("Failed to post CTS send")); + mca_btl_openib_endpoint_invoke_error(endpoint); + } + } else { + endpoint->endpoint_cts_sent = true; } - endpoint->endpoint_cts_sent = true; } /* @@ -611,8 +615,8 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) mca_btl_openib_send_frag_t *frag; mca_btl_openib_endpoint_t *ep; bool master = false; + int rc; - opal_output(-1, "Now we are CONNECTED"); if (MCA_BTL_XRC_ENABLED) { opal_mutex_lock (&endpoint->ib_addr->addr_lock); if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) { @@ -664,8 +668,11 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) frag = to_send_frag(frag_item); /* We need to post this one */ - if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) { - BTL_ERROR(("Error posting send")); + if(OPAL_SUCCESS != (rc = mca_btl_openib_endpoint_post_send(endpoint, frag))) { + /* if we are out of resources, let's try to reschedule everything later */ + if( OPAL_ERR_RESOURCE_BUSY != rc ) { + BTL_ERROR(("Error posting send")); + } } } OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); @@ -1029,6 +1036,7 @@ void *mca_btl_openib_endpoint_invoke_error(void *context) } } else { btl = endpoint->endpoint_btl; + endpoint->endpoint_state = MCA_BTL_IB_FAILED; } /* If we didn't find a BTL, then just bail :-( */