From 9ef08218565691f6c5a6a2eef1ff608e9c5cd073 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 12 Nov 2015 20:25:57 -0700 Subject: [PATCH] osc/rdma: fix some threading bugs There were two bugs in osc/rdma when using threads: - Deadlock in ompi_osc_rdma_start_atomic. This occurs because ompi_osc_rdma_frag_alloc is called with the module lock held. To fix the issue, the module lock is now recursive. In the future I will add a new lock to protect just the current rdma fragment. - Do not drop the lock in ompi_osc_rdma_frag_alloc when calling ompi_osc_rdma_frag_complete. Not only is it not needed, but dropping the lock at this point can cause a competing thread to mess up the state. Signed-off-by: Nathan Hjelm --- ompi/mca/osc/rdma/osc_rdma_component.c | 2 +- ompi/mca/osc/rdma/osc_rdma_frag.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index d436300400..2491ab0f21 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -998,7 +998,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } /* initialize the objects, so that always free in cleanup */ - OBJ_CONSTRUCT(&module->lock, opal_mutex_t); + OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t); OBJ_CONSTRUCT(&module->pending_posts, opal_list_t); OBJ_CONSTRUCT(&module->peer_lock, opal_mutex_t); diff --git a/ompi/mca/osc/rdma/osc_rdma_frag.h b/ompi/mca/osc/rdma/osc_rdma_frag.h index 6a5215f770..fcd7243fd0 100644 --- a/ompi/mca/osc/rdma/osc_rdma_frag.h +++ b/ompi/mca/osc/rdma/osc_rdma_frag.h @@ -73,9 +73,7 @@ static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size module->rdma_frag = NULL; if (curr) { - OPAL_THREAD_UNLOCK(&module->lock); ompi_osc_rdma_frag_complete (curr); - OPAL_THREAD_LOCK(&module->lock); } item = opal_free_list_get 
(&mca_osc_rdma_component.frags);