move temp buffer allocation out of the iteration loop - i.e. always use the
same temp loop. The algorithm is rather synchronous already... This commit was SVN r17831.
Этот коммит содержится в:
родитель
a1766b29f6
Коммит
afcd1016fd
@ -406,21 +406,17 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
|
||||
count_processed=0;
|
||||
|
||||
/* debug */
|
||||
t0=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
sm_buffer_desc=alloc_sm2_shared_buffer(sm_module);
|
||||
/* debug */
|
||||
t1=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
|
||||
/* get a pointer to the shared-memory working buffer */
|
||||
/* NOTE: starting with a rather synchronous approach */
|
||||
for( stripe_number=0 ; stripe_number < n_data_segments ; stripe_number++ ) {
|
||||
/* debug */
|
||||
t0=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
sm_buffer_desc=alloc_sm2_shared_buffer(sm_module);
|
||||
sm_buffer=sm_buffer_desc->base_segment_address;
|
||||
if( NULL == sm_buffer) {
|
||||
rc=OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto Error;
|
||||
}
|
||||
/* debug */
|
||||
t1=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
/* get number of elements to process in this stripe */
|
||||
count_this_stripe=n_dts_per_buffer;
|
||||
if( count_processed + count_this_stripe > count )
|
||||
@ -605,6 +601,9 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* signal that I am done */
|
||||
my_ctl_pointer->flag=tag;
|
||||
|
||||
} else {
|
||||
|
||||
tag=base_tag+my_exchange_node->n_tags-1;
|
||||
@ -619,6 +618,18 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
*/
|
||||
my_ctl_pointer->flag=tag;
|
||||
|
||||
/* wait until child is done to move on - this buffer will
|
||||
* be reused for the next stripe, so don't want to move
|
||||
* on too quick.
|
||||
*/
|
||||
extra_rank=my_exchange_node->rank_extra_source;
|
||||
extra_ctl_pointer=
|
||||
sm_buffer_desc->proc_memory[extra_rank].control_region;
|
||||
|
||||
/* wait until remote data is read */
|
||||
while( extra_ctl_pointer->flag < tag ) {
|
||||
opal_progress();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -632,25 +643,30 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
if( 0 != rc ) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
/* debug */
|
||||
t9=opal_sys_timer_get_cycles();
|
||||
timers[5]+=(t9-t8);
|
||||
/* end debug */
|
||||
|
||||
/* "free" the shared-memory working buffer */
|
||||
rc=free_sm2_shared_buffer(sm_module);
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
goto Error;
|
||||
}
|
||||
/* debug */
|
||||
t10=opal_sys_timer_get_cycles();
|
||||
timers[6]+=(t10-t9);
|
||||
/* end debug */
|
||||
|
||||
/* update the count of elements processed */
|
||||
count_processed+=count_this_stripe;
|
||||
}
|
||||
|
||||
|
||||
/* debug */
|
||||
|
||||
t9=opal_sys_timer_get_cycles();
|
||||
timers[5]+=(t9-t8);
|
||||
/* end debug */
|
||||
|
||||
|
||||
/* "free" the shared-memory working buffer */
|
||||
rc=free_sm2_shared_buffer(sm_module);
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
goto Error;
|
||||
}
|
||||
|
||||
/* debug */
|
||||
t10=opal_sys_timer_get_cycles();
|
||||
timers[6]+=(t10-t9);
|
||||
/* end debug */
|
||||
|
||||
/* return */
|
||||
return rc;
|
||||
|
||||
@ -734,15 +750,25 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
/* get a pointer to the shared-memory working buffer */
|
||||
/* NOTE: starting with a rather synchronous approach */
|
||||
|
||||
|
||||
/* debug */
|
||||
t0=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
|
||||
/* use the same set of buffers for a single reduction */
|
||||
sm_buffer_desc=alloc_sm2_shared_buffer(sm_module);
|
||||
|
||||
/* get pointers to my work buffers */
|
||||
my_ctl_pointer=sm_buffer_desc->proc_memory[my_rank].control_region;
|
||||
my_write_pointer=sm_buffer_desc->proc_memory[my_rank].data_segment;
|
||||
my_read_pointer=my_write_pointer+len_data_buffer;
|
||||
my_tmp_data_buffer[0]=my_write_pointer;
|
||||
my_tmp_data_buffer[1]=my_read_pointer;
|
||||
|
||||
/* debug */
|
||||
t1=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
for( stripe_number=0 ; stripe_number < n_data_segments ; stripe_number++ ) {
|
||||
/* debug */
|
||||
t0=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
/* debug */
|
||||
t1=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
/* get number of elements to process in this stripe */
|
||||
count_this_stripe=n_dts_per_buffer;
|
||||
if( count_processed + count_this_stripe > count )
|
||||
@ -755,12 +781,6 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
base_tag=sm_module->collective_tag;
|
||||
sm_module->collective_tag+=my_exchange_node->n_tags;
|
||||
|
||||
/* get pointers to my work buffers */
|
||||
my_ctl_pointer=sm_buffer_desc->proc_memory[my_rank].control_region;
|
||||
my_write_pointer=sm_buffer_desc->proc_memory[my_rank].data_segment;
|
||||
my_read_pointer=my_write_pointer+len_data_buffer;
|
||||
my_tmp_data_buffer[0]=my_write_pointer;
|
||||
my_tmp_data_buffer[1]=my_read_pointer;
|
||||
/* debug */
|
||||
t2=opal_sys_timer_get_cycles();
|
||||
timers[0]+=(t2-t1);
|
||||
@ -823,7 +843,6 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
|
||||
/* loop over data exchanges */
|
||||
for(exchange=0 ; exchange < my_exchange_node->n_exchanges ; exchange++) {
|
||||
|
||||
/* debug */
|
||||
t4=opal_sys_timer_get_cycles();
|
||||
/* end debug */
|
||||
@ -927,6 +946,10 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* signal that I am done */
|
||||
my_ctl_pointer->flag=tag;
|
||||
|
||||
|
||||
} else {
|
||||
|
||||
tag=base_tag+my_exchange_node->n_tags-1;
|
||||
@ -941,6 +964,18 @@ int mca_coll_sm2_allreduce_intra_recursive_doubling(void *sbuf, void *rbuf,
|
||||
*/
|
||||
my_ctl_pointer->flag=tag;
|
||||
|
||||
/* wait until child is done to move on - this buffer will
|
||||
* be reused for the next stripe, so don't want to move
|
||||
* on too quick.
|
||||
*/
|
||||
extra_rank=my_exchange_node->rank_extra_source;
|
||||
extra_ctl_pointer=
|
||||
sm_buffer_desc->proc_memory[extra_rank].control_region;
|
||||
/* wait until remote data is read */
|
||||
while(! ( extra_ctl_pointer->flag < tag ) ) {
|
||||
opal_progress();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user