1
1

Try to address ticket #1831. I think we might reach a case where the
shm_fifo values are only partially updated, and this leads to wrong values
for the offset. Moving the write barrier to the right place, plus forcing
some read barriers, might help.

In addition, I got rid of the sm_offset array, which is completely useless.

This commit was SVN r21253.
Этот коммит содержится в:
George Bosilca 2009-05-19 22:50:44 +00:00
родитель b48f13cda4
Коммит 7bd97ac17b
2 изменённых файлов: 7 добавлений и 14 удалений

Просмотреть файл

@ -227,11 +227,6 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
mca_btl_sm_component.sm_mpool_base = mca_btl_sm_component.sm_mpool_base =
mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]); mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]);
/* set the shared memory offset */
mca_btl_sm_component.sm_offset = (ptrdiff_t*)calloc(n, sizeof(ptrdiff_t));
if(NULL == mca_btl_sm_component.sm_offset)
return OMPI_ERR_OUT_OF_RESOURCE;
/* create a list of peers */ /* create a list of peers */
mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**) mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**)
calloc(n, sizeof(struct mca_btl_base_endpoint_t*)); calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
@ -277,7 +272,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
return OMPI_ERROR; return OMPI_ERROR;
} }
mca_btl_sm_component.shm_fifo = (sm_fifo_t **)mca_btl_sm_component.mmap_file->data_addr; mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.mmap_file->data_addr;
mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n); mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n);
mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n); mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n);
@ -293,8 +288,6 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos; mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;
opal_atomic_wmb();
/* cache the pointer to the 2d fifo array. These addresses /* cache the pointer to the 2d fifo array. These addresses
* are valid in the current process space */ * are valid in the current process space */
mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n); mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);
@ -499,14 +492,16 @@ int mca_btl_sm_add_procs(
goto CLEANUP; goto CLEANUP;
} }
opal_atomic_wmb();
/* Sync with other local procs. Force the FIFO initialization to always /* Sync with other local procs. Force the FIFO initialization to always
* happens before the readers access it. * happens before the readers access it.
*/ */
opal_atomic_add_32( &mca_btl_sm_component.mmap_file->map_seg->seg_inited, 1); opal_atomic_add_32( &mca_btl_sm_component.mmap_file->map_seg->seg_inited, 1);
while( n_local_procs > while( n_local_procs >
mca_btl_sm_component.mmap_file->map_seg->seg_inited) { mca_btl_sm_component.mmap_file->map_seg->seg_inited) {
opal_atomic_rmb();
opal_progress(); opal_progress();
opal_atomic_rmb();
} }
/* coordinate with other processes */ /* coordinate with other processes */
@ -516,14 +511,14 @@ int mca_btl_sm_add_procs(
/* spin until this element is allocated */ /* spin until this element is allocated */
/* doesn't really wait for that process... FIFO might be allocated, but not initialized */ /* doesn't really wait for that process... FIFO might be allocated, but not initialized */
opal_atomic_rmb();
while(NULL == mca_btl_sm_component.shm_fifo[j]) { while(NULL == mca_btl_sm_component.shm_fifo[j]) {
opal_atomic_rmb();
opal_progress(); opal_progress();
opal_atomic_rmb();
} }
/* Calculate the difference as (my_base - their_base) */ /* Calculate the difference as (my_base - their_base) */
diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]); diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]);
mca_btl_sm_component.sm_offset[j] = diff;
/* store local address of remote fifos */ /* store local address of remote fifos */
mca_btl_sm_component.fifo[j] = mca_btl_sm_component.fifo[j] =

Просмотреть файл

@ -134,7 +134,7 @@ struct mca_btl_sm_component_t {
mca_common_sm_mmap_t *mmap_file; /**< description of mmap'ed file */ mca_common_sm_mmap_t *mmap_file; /**< description of mmap'ed file */
mca_common_sm_file_header_t *sm_ctl_header; /* control header in mca_common_sm_file_header_t *sm_ctl_header; /* control header in
shared memory */ shared memory */
sm_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */ volatile sm_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */
char **shm_bases; /**< pointer to base pointers in shared memory */ char **shm_bases; /**< pointer to base pointers in shared memory */
uint16_t *shm_mem_nodes; /**< pointer to mem noded in shared memory */ uint16_t *shm_mem_nodes; /**< pointer to mem noded in shared memory */
sm_fifo_t **fifo; /**< cached copy of the pointer to the 2D sm_fifo_t **fifo; /**< cached copy of the pointer to the 2D
@ -146,8 +146,6 @@ struct mca_btl_sm_component_t {
size_t fifo_size; /**< number of FIFO queue entries */ size_t fifo_size; /**< number of FIFO queue entries */
size_t fifo_lazy_free; /**< number of reads before lazy fifo free is triggered */ size_t fifo_lazy_free; /**< number of reads before lazy fifo free is triggered */
int nfifos; /**< number of FIFOs per receiver */ int nfifos; /**< number of FIFOs per receiver */
ptrdiff_t *sm_offset; /**< offset to be applied to shared memory
addresses, per local process value */
int32_t num_smp_procs; /**< current number of smp procs on this host */ int32_t num_smp_procs; /**< current number of smp procs on this host */
int32_t my_smp_rank; /**< My SMP process rank. Used for accessing int32_t my_smp_rank; /**< My SMP process rank. Used for accessing
* SMP specfic data structures. */ * SMP specfic data structures. */