As Andrew Friedley pointed out, my previous patch may cause a deadlock if
mca_btl_openib_endpoint_connect_eager_rdma() is called recursively. He also noticed that orte_pointer_array_add() can't fail because we allocate the maximum number of elements at init time. So just remove the error handling and the locking. No locking - no deadlocks. This commit was SVN r12388.
Этот коммит содержится в:
родитель
3bf31fe4a3
Коммит
4c784b6403
@ -203,7 +203,6 @@ struct mca_btl_openib_module_t {
|
||||
|
||||
size_t eager_rdma_frag_size; /**< length of eager frag */
|
||||
orte_pointer_array_t *eager_rdma_buffers; /**< RDMA buffers to poll */
|
||||
opal_mutex_t eager_rdma_buffres_lock; /**< should be held while adding new rdma buffer */
|
||||
volatile int32_t eager_rdma_buffers_count; /**< number of RDMA buffers */
|
||||
|
||||
mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
|
||||
|
@ -689,7 +689,6 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
mca_btl_openib_component.max_eager_rdma,
|
||||
0);
|
||||
openib_btl->eager_rdma_buffers_count = 0;
|
||||
OBJ_CONSTRUCT(&openib_btl->eager_rdma_buffres_lock, opal_mutex_t);
|
||||
|
||||
orte_pointer_array_init(&openib_btl->endpoints, 10, INT_MAX, 100);
|
||||
btls[i] = &openib_btl->super;
|
||||
|
@ -1228,24 +1228,16 @@ void mca_btl_openib_endpoint_connect_eager_rdma(
|
||||
opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1,
|
||||
buf);
|
||||
|
||||
/* lock is held during eager_rdma_buffers update in order to prevent hole
|
||||
* in the array if mca_btl_openib_endpoint_send_eager_rdma() fails and
|
||||
* another thread was creating eager RDMA buffer for another endpoint and
|
||||
* allocated array index bigger then ours */
|
||||
OPAL_THREAD_LOCK(&openib_btl->eager_rdma_buffres_lock);
|
||||
if(orte_pointer_array_add(&index, openib_btl->eager_rdma_buffers, endpoint)
|
||||
!= ORTE_SUCCESS)
|
||||
goto cleanup;
|
||||
|
||||
if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == 0) {
|
||||
/* This can never fail because max number of entries allocated
|
||||
* at init time */
|
||||
orte_pointer_array_add(&index, openib_btl->eager_rdma_buffers,
|
||||
endpoint);
|
||||
/* from this point progress function starts to poll new buffer */
|
||||
OPAL_THREAD_ADD32(&openib_btl->eager_rdma_buffers_count, 1);
|
||||
OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_buffres_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
orte_pointer_array_set_item(openib_btl->eager_rdma_buffers, index, NULL);
|
||||
OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_buffres_lock);
|
||||
cleanup:
|
||||
openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
|
||||
buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user