common/ompio: use avg. file view size in the aggregator selection logic
This is a fix based on a bug report from CGNS on the GitHub/mailing list. The core of the problem was that different processes entered different branches of our aggregator selection logic, because in some cases processes had a matching file_view size and contiguous chunk size (thus assuming a 1-D distribution), while other processes did not (thus assuming a 2-D distribution). The fix is to calculate the avg. file view size across all processes and use this value, thus ensuring that all processes enter the same branch. Fixes issue #7809. Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu> (cherry picked from commit 4a8a330bbaf9fe5ea07cd01146afb83b569f3138)
Этот коммит содержится в:
родитель
f334a699b7
Коммит
eeee011ac0
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2016 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
|
||||
@ -165,6 +165,7 @@ struct ompio_file_t {
|
||||
size_t f_stripe_size;
|
||||
int f_stripe_count;
|
||||
size_t f_cc_size;
|
||||
size_t f_avg_view_size;
|
||||
int f_bytes_per_agg;
|
||||
enum ompio_fs_type f_fstype;
|
||||
ompi_request_t *f_split_coll_req;
|
||||
|
@ -107,7 +107,7 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
|
||||
/* Determine whether to use the formula for 1-D or 2-D data decomposition. Anything
|
||||
** that is not 1-D is assumed to be 2-D in this version
|
||||
*/
|
||||
mode = ( fh->f_cc_size == fh->f_view_size ) ? 1 : 2;
|
||||
mode = ( fh->f_cc_size == fh->f_avg_view_size ) ? 1 : 2;
|
||||
|
||||
/* Determine the increment size when searching the optimal
|
||||
** no. of aggregators
|
||||
|
@ -328,36 +328,28 @@ exit:
|
||||
|
||||
OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (ompio_file_t *fh, int flag)
|
||||
{
|
||||
int uniform = 0;
|
||||
OMPI_MPI_OFFSET_TYPE avg[3] = {0,0,0};
|
||||
OMPI_MPI_OFFSET_TYPE global_avg[3] = {0,0,0};
|
||||
int i = 0;
|
||||
|
||||
/* This function does two things: first, it determines the average data chunk
|
||||
** size in the file view for each process and across all processes.
|
||||
** Second, it establishes whether the view across all processes is uniform.
|
||||
** By definition, uniform means:
|
||||
** 1. the file view of each process has the same number of contiguous sections
|
||||
** 2. each section in the file view has exactly the same size
|
||||
/* This function determines the average data chunk
|
||||
** size in the file view for each process and across all processes,
|
||||
** and the avg. file_view size across processes.
|
||||
*/
|
||||
|
||||
if ( flag ) {
|
||||
global_avg[0] = MCA_IO_DEFAULT_FILE_VIEW_SIZE;
|
||||
fh->f_avg_view_size = fh->f_view_size;
|
||||
}
|
||||
else {
|
||||
for (i=0 ; i<(int)fh->f_iov_count ; i++) {
|
||||
avg[0] += fh->f_decoded_iov[i].iov_len;
|
||||
if (i && 0 == uniform) {
|
||||
if (fh->f_decoded_iov[i].iov_len != fh->f_decoded_iov[i-1].iov_len) {
|
||||
uniform = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( 0 != fh->f_iov_count ) {
|
||||
avg[0] = avg[0]/fh->f_iov_count;
|
||||
}
|
||||
avg[1] = (OMPI_MPI_OFFSET_TYPE) fh->f_iov_count;
|
||||
avg[2] = (OMPI_MPI_OFFSET_TYPE) uniform;
|
||||
avg[2] = (OMPI_MPI_OFFSET_TYPE) fh->f_view_size;
|
||||
|
||||
fh->f_comm->c_coll->coll_allreduce (avg,
|
||||
global_avg,
|
||||
@ -368,37 +360,7 @@ OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (ompio_file_t *fh, int flag)
|
||||
fh->f_comm->c_coll->coll_allreduce_module);
|
||||
global_avg[0] = global_avg[0]/fh->f_size;
|
||||
global_avg[1] = global_avg[1]/fh->f_size;
|
||||
|
||||
#if 0
|
||||
/* Disabling the feature since we are not using it anyway. Saves us one allreduce operation. */
|
||||
int global_uniform=0;
|
||||
|
||||
if ( global_avg[0] == avg[0] &&
|
||||
global_avg[1] == avg[1] &&
|
||||
0 == avg[2] &&
|
||||
0 == global_avg[2] ) {
|
||||
uniform = 0;
|
||||
}
|
||||
else {
|
||||
uniform = 1;
|
||||
}
|
||||
|
||||
/* second confirmation round to see whether all processes agree
|
||||
** on having a uniform file view or not
|
||||
*/
|
||||
fh->f_comm->c_coll->coll_allreduce (&uniform,
|
||||
&global_uniform,
|
||||
1,
|
||||
MPI_INT,
|
||||
MPI_MAX,
|
||||
fh->f_comm,
|
||||
fh->f_comm->c_coll->coll_allreduce_module);
|
||||
|
||||
if ( 0 == global_uniform ){
|
||||
/* yes, everybody agrees on having a uniform file view */
|
||||
fh->f_flags |= OMPIO_UNIFORM_FVIEW;
|
||||
}
|
||||
#endif
|
||||
fh->f_avg_view_size = global_avg[2]/fh->f_size;
|
||||
}
|
||||
|
||||
return global_avg[0];
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user