fix some barrier addressing errors.
This commit was SVN r17830.
Этот коммит содержится в:
родитель
0453e7d2f4
Коммит
a1766b29f6
@ -435,36 +435,36 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
|
||||
/* get pointers to my work buffers */
|
||||
my_ctl_pointer=sm_buffer_desc->proc_memory[my_rank].control_region;
|
||||
my_write_pointer=sm_buffer_desc->proc_memory[my_rank].data_segment;
|
||||
my_read_pointer=my_write_pointer+len_data_buffer;
|
||||
my_tmp_data_buffer[0]=my_write_pointer;
|
||||
my_tmp_data_buffer[1]=my_read_pointer;
|
||||
/* debug */
|
||||
t2=opal_sys_timer_get_cycles();
|
||||
timers[0]+=(t2-t1);
|
||||
/* end debug */
|
||||
my_write_pointer=sm_buffer_desc->proc_memory[my_rank].data_segment;
|
||||
my_read_pointer=my_write_pointer+len_data_buffer;
|
||||
my_tmp_data_buffer[0]=my_write_pointer;
|
||||
my_tmp_data_buffer[1]=my_read_pointer;
|
||||
/* debug */
|
||||
t2=opal_sys_timer_get_cycles();
|
||||
timers[0]+=(t2-t1);
|
||||
/* end debug */
|
||||
|
||||
/* copy data into the write buffer */
|
||||
rc=ompi_ddt_copy_content_same_ddt(dtype, count_this_stripe,
|
||||
(char *)my_write_pointer,
|
||||
(char *)((char *)sbuf+dt_extent*count_processed));
|
||||
if( 0 != rc ) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
/* debug */
|
||||
t3=opal_sys_timer_get_cycles();
|
||||
timers[1]+=(t3-t2);
|
||||
/* end debug */
|
||||
/* copy data into the write buffer */
|
||||
rc=ompi_ddt_copy_content_same_ddt(dtype, count_this_stripe,
|
||||
(char *)my_write_pointer,
|
||||
(char *)((char *)sbuf+dt_extent*count_processed));
|
||||
if( 0 != rc ) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
/* debug */
|
||||
t3=opal_sys_timer_get_cycles();
|
||||
timers[1]+=(t3-t2);
|
||||
/* end debug */
|
||||
|
||||
/* copy data in from the "extra" source, if need be */
|
||||
tag=base_tag;
|
||||
if(0 < my_exchange_node->n_extra_sources) {
|
||||
/* copy data in from the "extra" source, if need be */
|
||||
tag=base_tag;
|
||||
if(0 < my_exchange_node->n_extra_sources) {
|
||||
|
||||
if ( EXCHANGE_NODE == my_exchange_node->node_type ) {
|
||||
if ( EXCHANGE_NODE == my_exchange_node->node_type ) {
|
||||
|
||||
extra_rank=my_exchange_node->rank_extra_source;
|
||||
extra_ctl_pointer=
|
||||
sm_buffer_desc->proc_memory[extra_rank].control_region;
|
||||
extra_rank=my_exchange_node->rank_extra_source;
|
||||
extra_ctl_pointer=
|
||||
sm_buffer_desc->proc_memory[extra_rank].control_region;
|
||||
extra_rank_write_data_pointer=
|
||||
sm_buffer_desc->proc_memory[extra_rank].data_segment;
|
||||
|
||||
|
@ -76,7 +76,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
/* get pointer to nb-barrier structure */
|
||||
index=request->sm_index;
|
||||
sm_barrier_region=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
request->barrier_base_address[index];
|
||||
(request->barrier_base_address[index]);
|
||||
|
||||
/* set barrier tag - no atomicity needed as only one outstanding
|
||||
* collective per communicator exists
|
||||
@ -95,7 +95,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.my_rank*
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
sm_address->flag=tag;
|
||||
/* don't need memory barrier here, as we are not setting any other sm
|
||||
* data for someone else to read
|
||||
@ -112,7 +112,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.parent_rank*
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
if( sm_address->flag != -tag ) {
|
||||
/* if parent has not checked in - set parameters for async
|
||||
* completion, incomplete barrier flag, and bail
|
||||
@ -139,7 +139,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.children_ranks[child] *
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
if(sm_address->flag == tag ) {
|
||||
/* child arrived */
|
||||
cnt++;
|
||||
@ -162,7 +162,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.my_rank*
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
sm_address->flag=tag;
|
||||
/* don't need memory barrier here, as we are not setting any other sm
|
||||
* data for someone else to read
|
||||
@ -178,7 +178,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.parent_rank*
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
if( sm_address->flag != -tag ) {
|
||||
/* if parent has not checked in - set parameters for async
|
||||
* completion, incomplete barrier flag, and bail
|
||||
@ -190,7 +190,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.my_rank *
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
sm_address->flag=-tag;
|
||||
|
||||
/*
|
||||
@ -212,7 +212,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.children_ranks[child] *
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
if(sm_address->flag == tag ) {
|
||||
/* child arrived */
|
||||
cnt++;
|
||||
@ -235,7 +235,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.my_rank*
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
sm_address->flag=-tag;
|
||||
|
||||
/*
|
||||
@ -297,7 +297,7 @@ int mca_coll_sm2_nbbarrier_intra_progress(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.parent_rank*
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
if( sm_address->flag != -tag ) {
|
||||
/* if parent has not checked in - set parameters for async
|
||||
* completion, incomplete barrier flag, and bail
|
||||
@ -330,7 +330,7 @@ int mca_coll_sm2_nbbarrier_intra_progress(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.children_ranks[child] *
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
if(sm_address->flag == tag ) {
|
||||
/* child arrived */
|
||||
cnt++;
|
||||
@ -353,7 +353,7 @@ int mca_coll_sm2_nbbarrier_intra_progress(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.my_rank *
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
sm_address->flag=tag;
|
||||
/* don't need memory barrier here, as we are not setting any other sm
|
||||
* data for someone else to read
|
||||
@ -370,7 +370,7 @@ int mca_coll_sm2_nbbarrier_intra_progress(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.parent_rank*
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
if( sm_address->flag != -tag ) {
|
||||
/* if parent has not checked in - set parameters for async
|
||||
* completion, incomplete barrier flag, and bail
|
||||
@ -382,7 +382,7 @@ int mca_coll_sm2_nbbarrier_intra_progress(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.my_rank *
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
sm_address->flag=-tag;
|
||||
|
||||
/*
|
||||
@ -408,7 +408,7 @@ int mca_coll_sm2_nbbarrier_intra_progress(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.children_ranks[child] *
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
if(sm_address->flag == tag ) {
|
||||
/* child arrived */
|
||||
cnt++;
|
||||
@ -431,7 +431,7 @@ int mca_coll_sm2_nbbarrier_intra_progress(struct ompi_communicator_t *comm,
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
((char *)sm_barrier_region+
|
||||
sm_module->sm_buffer_mgmt_barrier_tree.my_rank *
|
||||
sm_module->segement_size_per_process);
|
||||
sm_module->sm2_size_management_region_per_proc);
|
||||
sm_address->flag=-tag;
|
||||
|
||||
/*
|
||||
|
@ -424,9 +424,9 @@ static int init_sm2_barrier(struct ompi_communicator_t *comm,
|
||||
for( j =0 ; j < 2 ; j++ ) {
|
||||
module->barrier_request[i].barrier_base_address[j]=
|
||||
(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
(module->shared_memory_region + j*
|
||||
module->sm2_size_management_region_per_proc *
|
||||
module->sm_buffer_mgmt_barrier_tree.tree_size);
|
||||
(module->shared_memory_region +
|
||||
/* there are 2 barrier structs per bank */
|
||||
(2*i+j)*CACHE_LINE_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -435,6 +435,7 @@ static int init_sm2_barrier(struct ompi_communicator_t *comm,
|
||||
module->sm2_size_management_region_per_proc*
|
||||
module->sm_buffer_mgmt_barrier_tree.tree_size;
|
||||
|
||||
|
||||
/* set the pointer to the request that needs to be completed first */
|
||||
module->current_request_index=0;
|
||||
|
||||
@ -470,6 +471,7 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
size_t mem_management_total;
|
||||
size_t size_sm2_backing_file;
|
||||
size_t len;
|
||||
size_t size_buff_ctl_per_proc,size_data_buff_per_proc;
|
||||
|
||||
/*
|
||||
* This is activated only for intra-communicators
|
||||
@ -557,6 +559,8 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
#if 0 /* data buffers and management buffers are allocated in a single
|
||||
* contiguous region */
|
||||
/*
|
||||
* Now figure out how much memory to allocate for use as
|
||||
* working memory for the shared memory collectives.
|
||||
@ -607,8 +611,67 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
|
||||
/* compute total memory in the memory banks */
|
||||
tot_size_mem_banks=tot_size_per_bank*mca_coll_sm2_component.sm2_num_mem_banks;
|
||||
sm_module->data_memory_per_proc_per_segment=size_tot_per_proc_per_seg-
|
||||
ctl_memory_per_proc_per_segment;
|
||||
|
||||
/* compute the amount of memory needed for the asynchronous barriers used to
|
||||
#endif
|
||||
|
||||
/* management structures are allocated in one segment, and data buffers
|
||||
* in a separate segment
|
||||
*/
|
||||
/*
|
||||
* Now figure out how much memory to allocate for use as
|
||||
* working memory for the shared memory collectives.
|
||||
*/
|
||||
/*
|
||||
* get control region size
|
||||
*/
|
||||
/* just enough place for two flags per process */
|
||||
ctl_memory_per_proc_per_segment=2*sizeof(long long);
|
||||
if( mca_coll_sm2_component.sm2_ctl_size_per_proc > ctl_memory_per_proc_per_segment )
|
||||
ctl_memory_per_proc_per_segment=mca_coll_sm2_component.sm2_ctl_size_per_proc;
|
||||
|
||||
/* pad this up to the alignment needed by the data segment, as the
|
||||
* data segment will directly follow the control segment in
|
||||
* memory.
|
||||
*/
|
||||
alignment=mca_coll_sm2_component.sm2_data_alignment;
|
||||
ctl_memory_per_proc_per_segment=
|
||||
(alignment + ctl_memory_per_proc_per_segment -1) / alignment;
|
||||
ctl_memory_per_proc_per_segment*=alignment;
|
||||
mca_coll_sm2_component.sm2_ctl_size_allocated=ctl_memory_per_proc_per_segment;
|
||||
sm_module->ctl_memory_per_proc_per_segment=ctl_memory_per_proc_per_segment;
|
||||
|
||||
/* get data region size - allocation happens on a page granularity, with
|
||||
* a minimum of a page allocated per proc, so adjust to this
|
||||
*/
|
||||
size=mca_coll_sm2_component.sm2_data_seg_size;
|
||||
if( size < getpagesize() )
|
||||
size=getpagesize();
|
||||
if( size > mca_coll_sm2_component.sm2_max_data_seg_size )
|
||||
size=mca_coll_sm2_component.sm2_max_data_seg_size;
|
||||
size= ( size + getpagesize() - 1)/getpagesize();
|
||||
size*=getpagesize();
|
||||
sm_module->segment_size=size*group_size;
|
||||
size_data_buff_per_proc=size;
|
||||
|
||||
/* compute size of management region - per proc */
|
||||
size_buff_ctl_per_proc=
|
||||
ctl_memory_per_proc_per_segment*sm_module->sm2_module_num_buffers;
|
||||
size_buff_ctl_per_proc= ( size_buff_ctl_per_proc + getpagesize() - 1)/
|
||||
getpagesize();
|
||||
size_buff_ctl_per_proc*=getpagesize();
|
||||
|
||||
tot_size_mem_banks=
|
||||
/* size of buffer control region */
|
||||
size_buff_ctl_per_proc*group_size+
|
||||
/* size of data buffers */
|
||||
size*sm_module->sm2_module_num_buffers*group_size;
|
||||
sm_module->data_memory_per_proc_per_segment=size;
|
||||
|
||||
|
||||
/*
|
||||
* compute the amount of memory needed for the asynchronous barriers used to
|
||||
* manage the memory resources.
|
||||
*/
|
||||
/* for each bank, 2 sets of barrier buffers */
|
||||
@ -706,6 +769,9 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
sm_module->sm2_first_buffer_index_next_bank=0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* buffers and control region are contiguous */
|
||||
|
||||
/* setup shared memory memory descriptors */
|
||||
for( i=0 ; i < sm_module->sm2_module_num_buffers ; i++ ) {
|
||||
|
||||
@ -736,6 +802,47 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
/* setup shared memory memory descriptors */
|
||||
for( i=0 ; i < sm_module->sm2_module_num_buffers ; i++ ) {
|
||||
|
||||
char *base_buffer;
|
||||
volatile mca_coll_sm2_nb_request_process_shared_mem_t *ctl_ptr;
|
||||
|
||||
/* set the base address for this working buffer */
|
||||
base_buffer= sm_module->collective_buffer_region+
|
||||
/* offset past control data structures */
|
||||
size_buff_ctl_per_proc*group_size +
|
||||
i*sm_module->segment_size;
|
||||
sm_module->sm_buffer_descriptor[i].base_segment_address=base_buffer;
|
||||
|
||||
/* allocate array to keep data on each segment in the buffer.
|
||||
* One segment per process in the group.
|
||||
*/
|
||||
sm_module->sm_buffer_descriptor[i].proc_memory=
|
||||
(sm_memory_region_desc_t *)malloc(sizeof(sm_memory_region_desc_t)*
|
||||
group_size);
|
||||
if( NULL == sm_module->sm_buffer_descriptor[i].proc_memory ) {
|
||||
goto CLEANUP;
|
||||
}
|
||||
for(j=0 ; j < group_size ; j++ ) {
|
||||
ctl_ptr=(volatile mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
(base_buffer+j* sm_module->segement_size_per_process);
|
||||
sm_module->sm_buffer_descriptor[i].proc_memory[j].control_region=
|
||||
(volatile mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
/* offset to temp space */
|
||||
(sm_module->collective_buffer_region+
|
||||
/* offset to the per-proc control region */
|
||||
size_buff_ctl_per_proc*j+
|
||||
/* offset to control structure for the i'th buffer */
|
||||
ctl_memory_per_proc_per_segment*i);
|
||||
sm_module->sm_buffer_descriptor[i].proc_memory[j].data_segment=
|
||||
(char *)base_buffer+
|
||||
/* offset to data segment for the j'th proc */
|
||||
j*size_data_buff_per_proc;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* touch pages to apply memory affinity - Note: do we really need this or will
|
||||
* the algorithms do this */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user