Try to deal with ticket #1831. I think we might reach a case where the
shm_fifos values are only partially updated, which leads to wrong values for the offset. Moving the write barrier to the right place, plus forcing some read barriers, might help. In addition, I get rid of the sm_offset array, which is completely useless. This commit was SVN r21253.
This commit is contained in:
parent
b48f13cda4
commit
7bd97ac17b
@ -227,11 +227,6 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
|
||||
mca_btl_sm_component.sm_mpool_base =
|
||||
mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]);
|
||||
|
||||
/* set the shared memory offset */
|
||||
mca_btl_sm_component.sm_offset = (ptrdiff_t*)calloc(n, sizeof(ptrdiff_t));
|
||||
if(NULL == mca_btl_sm_component.sm_offset)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
/* create a list of peers */
|
||||
mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**)
|
||||
calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
|
||||
@ -277,7 +272,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
mca_btl_sm_component.shm_fifo = (sm_fifo_t **)mca_btl_sm_component.mmap_file->data_addr;
|
||||
mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.mmap_file->data_addr;
|
||||
mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n);
|
||||
mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n);
|
||||
|
||||
@ -293,8 +288,6 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
|
||||
|
||||
mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;
|
||||
|
||||
opal_atomic_wmb();
|
||||
|
||||
/* cache the pointer to the 2d fifo array. These addresses
|
||||
* are valid in the current process space */
|
||||
mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);
|
||||
@ -499,14 +492,16 @@ int mca_btl_sm_add_procs(
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
opal_atomic_wmb();
|
||||
|
||||
/* Sync with other local procs. Force the FIFO initialization to always
|
||||
* happen before the readers access it.
|
||||
*/
|
||||
opal_atomic_add_32( &mca_btl_sm_component.mmap_file->map_seg->seg_inited, 1);
|
||||
while( n_local_procs >
|
||||
mca_btl_sm_component.mmap_file->map_seg->seg_inited) {
|
||||
opal_atomic_rmb();
|
||||
opal_progress();
|
||||
opal_atomic_rmb();
|
||||
}
|
||||
|
||||
/* coordinate with other processes */
|
||||
@ -516,14 +511,14 @@ int mca_btl_sm_add_procs(
|
||||
|
||||
/* spin until this element is allocated */
|
||||
/* doesn't really wait for that process... FIFO might be allocated, but not initialized */
|
||||
opal_atomic_rmb();
|
||||
while(NULL == mca_btl_sm_component.shm_fifo[j]) {
|
||||
opal_atomic_rmb();
|
||||
opal_progress();
|
||||
opal_atomic_rmb();
|
||||
}
|
||||
|
||||
/* Calculate the difference as (my_base - their_base) */
|
||||
diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]);
|
||||
mca_btl_sm_component.sm_offset[j] = diff;
|
||||
|
||||
/* store local address of remote fifos */
|
||||
mca_btl_sm_component.fifo[j] =
|
||||
|
@ -134,7 +134,7 @@ struct mca_btl_sm_component_t {
|
||||
mca_common_sm_mmap_t *mmap_file; /**< description of mmap'ed file */
|
||||
mca_common_sm_file_header_t *sm_ctl_header; /* control header in
|
||||
shared memory */
|
||||
sm_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */
|
||||
volatile sm_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */
|
||||
char **shm_bases; /**< pointer to base pointers in shared memory */
|
||||
uint16_t *shm_mem_nodes; /**< pointer to mem nodes in shared memory */
|
||||
sm_fifo_t **fifo; /**< cached copy of the pointer to the 2D
|
||||
@ -146,8 +146,6 @@ struct mca_btl_sm_component_t {
|
||||
size_t fifo_size; /**< number of FIFO queue entries */
|
||||
size_t fifo_lazy_free; /**< number of reads before lazy fifo free is triggered */
|
||||
int nfifos; /**< number of FIFOs per receiver */
|
||||
ptrdiff_t *sm_offset; /**< offset to be applied to shared memory
|
||||
addresses, per local process value */
|
||||
int32_t num_smp_procs; /**< current number of smp procs on this host */
|
||||
int32_t my_smp_rank; /**< My SMP process rank. Used for accessing
|
||||
* SMP specific data structures. */
|
||||
|
Loading…
Reference in New Issue
Block a user