From d7375ec1024ea9337dd454b92f33e3294d7ed6b8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 31 Oct 2006 09:54:52 +0000 Subject: [PATCH] Fix deadlock reported by Andrew Friedley: What's happening is that we're holding openib_btl->eager_rdma_lock when we call mca_btl_openib_endpoint_send_eager_rdma() on btl_openib_endpoint.c:1227. This in turn calls mca_btl_openib_endpoint_send() on line 1179. Then, if the endpoint state isn't MCA_BTL_IB_CONNECTED or MCA_BTL_IB_FAILED, we call opal_progress(), where we eventually try to lock openib_btl->eager_rdma_lock at btl_openib_component.c:997. The fix removes this lock altogether. Instead, we atomically set the local RDMA pointer to prevent other threads from creating an RDMA buffer for the same endpoint. We also increment eager_rdma_buffers_count atomically, so the polling thread doesn't need a lock around it. This commit was SVN r12369. --- ompi/mca/btl/openib/btl_openib.h | 3 +- ompi/mca/btl/openib/btl_openib_component.c | 3 -- ompi/mca/btl/openib/btl_openib_endpoint.c | 39 +++++++++++----------- ompi/mca/btl/openib/btl_openib_endpoint.h | 1 - 4 files changed, 21 insertions(+), 25 deletions(-) diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 0a6ef7c832..0c48f77f9e 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -201,10 +201,9 @@ struct mca_btl_openib_module_t { opal_list_t pending_frags[2]; /**< list of pending frags */ - opal_mutex_t eager_rdma_lock; size_t eager_rdma_frag_size; /**< length of eager frag */ orte_pointer_array_t *eager_rdma_buffers; /**< RDMA buffers to poll */ - uint32_t eager_rdma_buffers_count; /**< number of RDMA buffers */ + volatile uint32_t eager_rdma_buffers_count; /**< number of RDMA buffers */ mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */ diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 9201a35afd..f706d12355 100644 --- 
a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -689,7 +689,6 @@ btl_openib_component_init(int *num_btl_modules, mca_btl_openib_component.max_eager_rdma, 0); openib_btl->eager_rdma_buffers_count = 0; - OBJ_CONSTRUCT(&openib_btl->eager_rdma_lock, opal_mutex_t); orte_pointer_array_init(&openib_btl->endpoints, 10, INT_MAX, 100); btls[i] = &openib_btl->super; @@ -994,9 +993,7 @@ static int btl_openib_component_progress(void) for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { mca_btl_openib_module_t* openib_btl = &mca_btl_openib_component.openib_btls[i]; - OPAL_THREAD_LOCK(&openib_btl->eager_rdma_lock); c = openib_btl->eager_rdma_buffers_count; - OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_lock); for(j = 0; j < c; j++) { endpoint = diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index b373e1d4c6..a49bcc3518 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -1191,10 +1191,13 @@ void mca_btl_openib_endpoint_connect_eager_rdma( mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; char *buf; unsigned int i; + orte_std_cntr_t index; - OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock); - if (endpoint->eager_rdma_local.base.pval) - goto unlock_rdma_local; + /* Set local rdma pointer to 1 temporarily so other threads will not try + * to enter the function */ + if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL, + (void*)1)) + return; buf = openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool, openib_btl->eager_rdma_frag_size * @@ -1221,28 +1224,26 @@ void mca_btl_openib_endpoint_connect_eager_rdma( ((mca_btl_openib_frag_t*)item)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA; } - OPAL_THREAD_LOCK(&openib_btl->eager_rdma_lock); - if(orte_pointer_array_add (&endpoint->eager_rdma_index, - openib_btl->eager_rdma_buffers, endpoint) < 0) + /* set local rdma pointer to real value */ + 
opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1, + buf); + + if(orte_pointer_array_add(&index, openib_btl->eager_rdma_buffers, endpoint) + != ORTE_SUCCESS) goto cleanup; - endpoint->eager_rdma_local.base.pval = buf; - openib_btl->eager_rdma_buffers_count++; - if (mca_btl_openib_endpoint_send_eager_rdma(endpoint) == 0) { - OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_lock); - OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock); + if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == 0) { + /* from this point progress function starts to poll new buffer */ + OPAL_THREAD_ADD32(&openib_btl->eager_rdma_buffers_count, 1); return; } - openib_btl->eager_rdma_buffers_count--; - endpoint->eager_rdma_local.base.pval = NULL; - orte_pointer_array_set_item(openib_btl->eager_rdma_buffers, - endpoint->eager_rdma_index, NULL); - + orte_pointer_array_set_item(openib_btl->eager_rdma_buffers, index, NULL); cleanup: - OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_lock); openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool, - buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg); + buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg); unlock_rdma_local: - OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock); + /* set local rdma pointer back to zero. 
Will retry later */ + opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, + endpoint->eager_rdma_local.base.pval, NULL); } diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index c952199a32..9b683eaa03 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -154,7 +154,6 @@ struct mca_btl_base_endpoint_t { /**< info about remote RDMA buffer */ mca_btl_openib_eager_rdma_local_t eager_rdma_local; /**< info about local RDMA buffer */ - int32_t eager_rdma_index; /**< index into RDMA buffers pointer array */ uint32_t index; /**< index of the endpoint in endpoints array */ struct mca_btl_openib_frag_t *credit_frag[2]; /**< frags for sending explicit high priority credits */ };