common/ompio: fix calculation in simple-grouping option
This is based on a bug reported on the mailing list using a netcdf testcase. The problem occurs if processes are using a custom file view, but on some of them it appears as if the default file view is being used. Because of that, the simple-grouping option led to a different number of aggregators being used on different processes, and ultimately to a deadlock. This patch fixes the problem by no longer using the file_view size for the calculation in the simple-grouping option, and using the contiguous chunk size (which is identical on all processes) instead. Fixes issue #7109 Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
Этот коммит содержится в:
родитель
8343a289f2
Коммит
ad5d0df4e9
@ -126,17 +126,17 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
|
|||||||
}
|
}
|
||||||
|
|
||||||
P_a = 1;
|
P_a = 1;
|
||||||
time_prev = cost_calc ( fh->f_size, P_a, fh->f_view_size, (size_t) fh->f_bytes_per_agg, mode );
|
time_prev = cost_calc ( fh->f_size, P_a, fh->f_cc_size, (size_t) fh->f_bytes_per_agg, mode );
|
||||||
P_a_prev = P_a;
|
P_a_prev = P_a;
|
||||||
for ( P_a = incr; P_a <= fh->f_size; P_a += incr ) {
|
for ( P_a = incr; P_a <= fh->f_size; P_a += incr ) {
|
||||||
time = cost_calc ( fh->f_size, P_a, fh->f_view_size, (size_t) fh->f_bytes_per_agg, mode );
|
time = cost_calc ( fh->f_size, P_a, fh->f_cc_size, (size_t) fh->f_bytes_per_agg, mode );
|
||||||
dtime_abs = (time_prev - time);
|
dtime_abs = (time_prev - time);
|
||||||
dtime = dtime_abs / time_prev;
|
dtime = dtime_abs / time_prev;
|
||||||
dtime_diff = ( P_a == incr ) ? dtime : (dtime_prev - dtime);
|
dtime_diff = ( P_a == incr ) ? dtime : (dtime_prev - dtime);
|
||||||
#ifdef OMPIO_DEBUG
|
#ifdef OMPIO_DEBUG
|
||||||
if ( 0 == fh->f_rank ){
|
if ( 0 == fh->f_rank ){
|
||||||
printf(" d_p = %ld P_a = %d time = %lf dtime = %lf dtime_abs =%lf dtime_diff=%lf\n",
|
printf(" d_p = %ld P_a = %d time = %lf dtime = %lf dtime_abs =%lf dtime_diff=%lf\n",
|
||||||
fh->f_view_size, P_a, time, dtime, dtime_abs, dtime_diff );
|
fh->f_cc_size, P_a, time, dtime, dtime_abs, dtime_diff );
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if ( dtime_diff < dtime_threshold ) {
|
if ( dtime_diff < dtime_threshold ) {
|
||||||
@ -171,7 +171,7 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
|
|||||||
num_groups = P_a_prev;
|
num_groups = P_a_prev;
|
||||||
#ifdef OMPIO_DEBUG
|
#ifdef OMPIO_DEBUG
|
||||||
printf(" For P=%d d_p=%ld b_c=%d threshold=%f chosen P_a = %d \n",
|
printf(" For P=%d d_p=%ld b_c=%d threshold=%f chosen P_a = %d \n",
|
||||||
fh->f_size, fh->f_view_size, fh->f_bytes_per_agg, dtime_threshold, P_a_prev);
|
fh->f_size, fh->f_cc_size, fh->f_bytes_per_agg, dtime_threshold, P_a_prev);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Cap the maximum number of aggregators.*/
|
/* Cap the maximum number of aggregators.*/
|
||||||
@ -183,6 +183,7 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
|
|||||||
}
|
}
|
||||||
|
|
||||||
*num_groups_out = num_groups;
|
*num_groups_out = num_groups;
|
||||||
|
|
||||||
return mca_common_ompio_forced_grouping ( fh, num_groups, contg_groups);
|
return mca_common_ompio_forced_grouping ( fh, num_groups, contg_groups);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -576,7 +577,7 @@ int mca_common_ompio_create_groups(ompio_file_t *fh,
|
|||||||
opal_output (1, "mca_common_ompio_create_groups: error in mca_common_ompio_prepare_to_group\n");
|
opal_output (1, "mca_common_ompio_create_groups: error in mca_common_ompio_prepare_to_group\n");
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch(ompio_grouping_flag){
|
switch(ompio_grouping_flag){
|
||||||
|
|
||||||
case OMPIO_SPLIT:
|
case OMPIO_SPLIT:
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user