diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index e0e99b1770..8ce8835fc6 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -496,6 +496,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl, trigger credit management (because the rd_credits will still be negative), and Bad Things will happen. */ if (ep->endpoint_posted_recvs) { + /* need to hold the lock for both send_cts and connected */ + OPAL_THREAD_LOCK(&ep->endpoint_lock); if (!ep->endpoint_cts_sent) { mca_btl_openib_endpoint_send_cts(ep); } diff --git a/opal/mca/btl/openib/btl_openib_endpoint.c b/opal/mca/btl/openib/btl_openib_endpoint.c index 484beb56dc..ea57ff1764 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.c +++ b/opal/mca/btl/openib/btl_openib_endpoint.c @@ -537,13 +537,11 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint) ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS; /* Send the fragment */ - OPAL_THREAD_LOCK(&endpoint->endpoint_lock); if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) { BTL_ERROR(("Failed to post CTS send")); mca_btl_openib_endpoint_invoke_error(endpoint); } endpoint->endpoint_cts_sent = true; - OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); } /* @@ -588,6 +586,9 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint) OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete", opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); mca_btl_openib_endpoint_connected(endpoint); + } else { + /* the caller holds the lock and expects us to drop it */ + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); } } diff --git a/opal/mca/btl/openib/btl_openib_endpoint.h b/opal/mca/btl/openib/btl_openib_endpoint.h index c74cd5b0a6..d6846da957 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.h +++ b/opal/mca/btl/openib/btl_openib_endpoint.h @@ -342,8 +342,11 @@ int 
mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*, void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int); void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*); int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t*); + +/* the endpoint lock must be held with OPAL_THREAD_LOCK for both CTS and cpc complete */ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint); void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t*); + void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t*); void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*, mca_btl_base_endpoint_t*, diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c b/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c index 508ff74183..46811aae11 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c @@ -1246,6 +1246,7 @@ static void *local_endpoint_cpc_complete(void *context) OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s", opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); + OPAL_THREAD_LOCK(&endpoint->endpoint_lock); mca_btl_openib_endpoint_cpc_complete(endpoint); return NULL;