1
1

common/ompio: use avg. file view size in the aggregator selection logic

This is a fix based on a bug report from CGNS on GitHub and the mailing list.
The core of the problem was that different processes entered different branches of
our aggregator selection logic: on some processes the file_view size matched the
contiguous chunk size (so they assumed a 1-D distribution), while on others it did
not (so they assumed a 2-D distribution). The fix is to calculate the average file
view size across all processes and use this value instead, ensuring that all
processes enter the same branch.

Fixes issue #7809

Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
(cherry picked from commit 4a8a330bbaf9fe5ea07cd01146afb83b569f3138)
Этот коммит содержится в:
Edgar Gabriel 2020-06-15 09:17:44 -05:00
родитель f334a699b7
Коммит eeee011ac0
3 изменённых файлов: 9 добавлений и 46 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2016 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
@ -165,6 +165,7 @@ struct ompio_file_t {
size_t f_stripe_size;
int f_stripe_count;
size_t f_cc_size;
size_t f_avg_view_size;
int f_bytes_per_agg;
enum ompio_fs_type f_fstype;
ompi_request_t *f_split_coll_req;

Просмотреть файл

@ -107,7 +107,7 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
/* Determine whether to use the formula for 1-D or 2-D data decomposition. Anything
** that is not 1-D is assumed to be 2-D in this version
*/
mode = ( fh->f_cc_size == fh->f_view_size ) ? 1 : 2;
mode = ( fh->f_cc_size == fh->f_avg_view_size ) ? 1 : 2;
/* Determine the increment size when searching the optimal
** no. of aggregators

Просмотреть файл

@ -328,36 +328,28 @@ exit:
OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (ompio_file_t *fh, int flag)
{
int uniform = 0;
OMPI_MPI_OFFSET_TYPE avg[3] = {0,0,0};
OMPI_MPI_OFFSET_TYPE global_avg[3] = {0,0,0};
int i = 0;
/* This function does two things: first, it determines the average data chunk
** size in the file view for each process and across all processes.
** Second, it establishes whether the view across all processes is uniform.
** By definition, uniform means:
** 1. the file view of each process has the same number of contiguous sections
** 2. each section in the file view has exactly the same size
/* This function determines the average data chunk
** size in the file view for each process and across all processes,
** and the avg. file_view size across processes.
*/
if ( flag ) {
global_avg[0] = MCA_IO_DEFAULT_FILE_VIEW_SIZE;
fh->f_avg_view_size = fh->f_view_size;
}
else {
for (i=0 ; i<(int)fh->f_iov_count ; i++) {
avg[0] += fh->f_decoded_iov[i].iov_len;
if (i && 0 == uniform) {
if (fh->f_decoded_iov[i].iov_len != fh->f_decoded_iov[i-1].iov_len) {
uniform = 1;
}
}
}
if ( 0 != fh->f_iov_count ) {
avg[0] = avg[0]/fh->f_iov_count;
}
avg[1] = (OMPI_MPI_OFFSET_TYPE) fh->f_iov_count;
avg[2] = (OMPI_MPI_OFFSET_TYPE) uniform;
avg[2] = (OMPI_MPI_OFFSET_TYPE) fh->f_view_size;
fh->f_comm->c_coll->coll_allreduce (avg,
global_avg,
@ -368,37 +360,7 @@ OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (ompio_file_t *fh, int flag)
fh->f_comm->c_coll->coll_allreduce_module);
global_avg[0] = global_avg[0]/fh->f_size;
global_avg[1] = global_avg[1]/fh->f_size;
#if 0
/* Disabling the feature since we are not using it anyway. Saves us one allreduce operation. */
int global_uniform=0;
if ( global_avg[0] == avg[0] &&
global_avg[1] == avg[1] &&
0 == avg[2] &&
0 == global_avg[2] ) {
uniform = 0;
}
else {
uniform = 1;
}
/* second confirmation round to see whether all processes agree
** on having a uniform file view or not
*/
fh->f_comm->c_coll->coll_allreduce (&uniform,
&global_uniform,
1,
MPI_INT,
MPI_MAX,
fh->f_comm,
fh->f_comm->c_coll->coll_allreduce_module);
if ( 0 == global_uniform ){
/* yes, everybody agrees on having a uniform file view */
fh->f_flags |= OMPIO_UNIFORM_FVIEW;
}
#endif
fh->f_avg_view_size = global_avg[2]/fh->f_size;
}
return global_avg[0];