From 7bd97ac17be1cd3bb8ac59640ae46e19975b8deb Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 19 May 2009 22:50:44 +0000 Subject: [PATCH] Try to deal with ticket #1831. I think we might reach a case where the shm_fifos values are only partially updated, and this leads to wrong values for the offset. Moving the write barrier to the right place, plus forcing some read barriers, might help. In addition, I get rid of the sm_offset array, which is completely useless. This commit was SVN r21253. --- ompi/mca/btl/sm/btl_sm.c | 17 ++++++----------- ompi/mca/btl/sm/btl_sm.h | 4 +--- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index 0adff49cf9..30adbac113 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -227,11 +227,6 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) mca_btl_sm_component.sm_mpool_base = mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]); - /* set the shared memory offset */ - mca_btl_sm_component.sm_offset = (ptrdiff_t*)calloc(n, sizeof(ptrdiff_t)); - if(NULL == mca_btl_sm_component.sm_offset) - return OMPI_ERR_OUT_OF_RESOURCE; - /* create a list of peers */ mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**) calloc(n, sizeof(struct mca_btl_base_endpoint_t*)); @@ -277,7 +272,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) return OMPI_ERROR; } - mca_btl_sm_component.shm_fifo = (sm_fifo_t **)mca_btl_sm_component.mmap_file->data_addr; + mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.mmap_file->data_addr; mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n); mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n); @@ -293,8 +288,6 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos; - opal_atomic_wmb(); - /* 
cache the pointer to the 2d fifo array. These addresses * are valid in the current process space */ mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n); @@ -499,14 +492,16 @@ int mca_btl_sm_add_procs( goto CLEANUP; } + opal_atomic_wmb(); + /* Sync with other local procs. Force the FIFO initialization to always * happens before the readers access it. */ opal_atomic_add_32( &mca_btl_sm_component.mmap_file->map_seg->seg_inited, 1); while( n_local_procs > mca_btl_sm_component.mmap_file->map_seg->seg_inited) { - opal_atomic_rmb(); opal_progress(); + opal_atomic_rmb(); } /* coordinate with other processes */ @@ -516,14 +511,14 @@ int mca_btl_sm_add_procs( /* spin until this element is allocated */ /* doesn't really wait for that process... FIFO might be allocated, but not initialized */ + opal_atomic_rmb(); while(NULL == mca_btl_sm_component.shm_fifo[j]) { - opal_atomic_rmb(); opal_progress(); + opal_atomic_rmb(); } /* Calculate the difference as (my_base - their_base) */ diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]); - mca_btl_sm_component.sm_offset[j] = diff; /* store local address of remote fifos */ mca_btl_sm_component.fifo[j] = diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index ddf4ef44f5..c4e101f907 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -134,7 +134,7 @@ struct mca_btl_sm_component_t { mca_common_sm_mmap_t *mmap_file; /**< description of mmap'ed file */ mca_common_sm_file_header_t *sm_ctl_header; /* control header in shared memory */ - sm_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */ + volatile sm_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */ char **shm_bases; /**< pointer to base pointers in shared memory */ uint16_t *shm_mem_nodes; /**< pointer to mem noded in shared memory */ sm_fifo_t **fifo; /**< cached copy of the pointer to the 2D @@ -146,8 +146,6 @@ struct mca_btl_sm_component_t { size_t fifo_size; /**< number of FIFO queue entries */ 
size_t fifo_lazy_free; /**< number of reads before lazy fifo free is triggered */ int nfifos; /**< number of FIFOs per receiver */ - ptrdiff_t *sm_offset; /**< offset to be applied to shared memory - addresses, per local process value */ int32_t num_smp_procs; /**< current number of smp procs on this host */ int32_t my_smp_rank; /**< My SMP process rank. Used for accessing * SMP specfic data structures. */