From f70bb4774ad6bca0b65b9b03f07610528b25633a Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Thu, 10 Dec 2020 08:53:11 -0600 Subject: [PATCH 1/8] dynamic_gen2/file_write_all: fix chunk assignment per stride the dynamic_gen_file_write_all component distinguishes between the amount of data communicated to aggregators, and the amount of data written in a cycle by the aggregator (in contrary e.g. to the vulcan component). There was a bug in calculating which chunks have to be written in a cycle by an aggregator: we added as many elements into the io_array until we filled one stripe. Unfortuantely, the metric used was the amount of data instead of ensuring that all offsets fall within a single stripe. This commit fixes this issue. Note, the bug did not create a correctness problem, just a performance problem in case there were gaps in the file view. Signed-off-by: Edgar Gabriel --- .../fcoll_dynamic_gen2_file_write_all.c | 67 +++++++++++-------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c index 9763140990..282b248f79 100644 --- a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c +++ b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2016 University of Houston. All rights reserved. + * Copyright (c) 2008-2020 University of Houston. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. @@ -92,7 +92,7 @@ typedef struct mca_io_ompio_aggregator_data { static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_io_ompio_aggregator_data *data, ompi_request_t **reqs ); -static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data, int write_chunksize ); +static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data ); int mca_fcoll_dynamic_gen2_break_file_view ( struct iovec *decoded_iov, int iov_count, struct iovec *local_iov_array, int local_count, @@ -111,8 +111,7 @@ static int local_heap_sort (mca_io_ompio_local_io_array *io_array, int *sorted); int mca_fcoll_dynamic_gen2_split_iov_array ( ompio_file_t *fh, mca_common_ompio_io_array_t *work_array, - int num_entries, int *last_array_pos, int *last_pos_in_field, - int chunk_size ); + int num_entries, int *last_array_pos, int *last_pos_in_field ); int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh, @@ -145,7 +144,7 @@ int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh, MPI_Aint *broken_total_lengths=NULL; int *aggregators=NULL; - int write_chunksize, *result_counts=NULL; + int *result_counts=NULL; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN @@ -199,15 +198,9 @@ int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh, if ( fh->f_stripe_size == 0 ) { // EDGAR: just a quick heck for testing + //fh->f_stripe_size = 1048576; fh->f_stripe_size = 65536; } - if ( -1 == mca_fcoll_dynamic_gen2_write_chunksize ) { - write_chunksize = fh->f_stripe_size; - } - else { - write_chunksize = mca_fcoll_dynamic_gen2_write_chunksize; - } - ret = mca_fcoll_dynamic_gen2_get_configuration (fh, &dynamic_gen2_num_io_procs, &aggregators); if (OMPI_SUCCESS != ret){ @@ -608,7 +601,7 @@ int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif - ret = write_init (fh, aggregators[i], aggr_data[i], write_chunksize ); + ret = write_init (fh, aggregators[i], aggr_data[i] ); if (OMPI_SUCCESS != ret){ goto exit; } @@ -637,7 +630,7 @@ int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif - ret = write_init (fh, aggregators[i], aggr_data[i], write_chunksize ); + ret = write_init (fh, aggregators[i], aggr_data[i] ); if (OMPI_SUCCESS != ret){ goto exit; } @@ -735,7 +728,7 @@ exit : } -static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data, int write_chunksize ) +static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data ) { int ret=OMPI_SUCCESS; int last_array_pos=0; @@ -746,14 +739,28 @@ static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator while ( aggr_data->prev_bytes_to_write > 0 ) { aggr_data->prev_bytes_to_write -= mca_fcoll_dynamic_gen2_split_iov_array (fh, aggr_data->prev_io_array, aggr_data->prev_num_io_entries, - &last_array_pos, &last_pos, - write_chunksize ); + &last_array_pos, &last_pos ); if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) { free ( aggr_data->prev_io_array); opal_output (1, "dynamic_gen2_write_all: fbtl_pwritev failed\n"); ret = OMPI_ERROR; goto exit; } + +#if DEBUG_ON + printf("fh->f_num_of_io_entries=%d\n", fh->f_num_of_io_entries); + printf("[%d]: fh->f_io_array[0].offset = %ld .size = %ld\n", fh->f_rank, (long)fh->f_io_array[0].offset, + fh->f_io_array[0].length); + if ( fh->f_num_of_io_entries > 1 ) + printf("[%d]: fh->f_io_array[1].offset = %ld .size = %ld\n", fh->f_rank, (long)fh->f_io_array[1].offset, + fh->f_io_array[1].length); + + + int n = fh->f_num_of_io_entries-1; + if ( fh->f_num_of_io_entries > 2 ) + printf("[%d]: fh->f_io_array[n].offset = %ld .size = %ld\n", fh->f_rank, (long)fh->f_io_array[n].offset, + fh->f_io_array[n].length); +#endif } free ( fh->f_io_array ); free ( aggr_data->prev_io_array); @@ -1595,14 +1602,15 @@ int mca_fcoll_dynamic_gen2_get_configuration (ompio_file_t *fh, int *dynamic_gen int mca_fcoll_dynamic_gen2_split_iov_array ( ompio_file_t *fh, mca_common_ompio_io_array_t *io_array, int num_entries, - int *ret_array_pos, int *ret_pos, int chunk_size ) + int *ret_array_pos, int *ret_pos ) { int array_pos = *ret_array_pos; int pos = *ret_pos; size_t bytes_written = 0; - size_t bytes_to_write = chunk_size; - + off_t baseaddr = ((off_t)io_array[array_pos].offset + pos) - (((off_t)io_array[array_pos].offset + pos) % (off_t)fh->f_stripe_size); + off_t endaddr = baseaddr + fh->f_stripe_size; + if ( 0 == array_pos && 0 == pos ) { fh->f_io_array = (mca_common_ompio_io_array_t *) malloc ( num_entries * sizeof(mca_common_ompio_io_array_t)); if ( NULL == fh->f_io_array ){ @@ -1612,20 +1620,21 @@ int mca_fcoll_dynamic_gen2_split_iov_array ( ompio_file_t *fh, mca_common_ompio_ } int i=0; - while (bytes_to_write > 0 ) { - fh->f_io_array[i].memory_address = &(((char *)io_array[array_pos].memory_address)[pos]); - fh->f_io_array[i].offset = &(((char *)io_array[array_pos].offset)[pos]); + do { + fh->f_io_array[i].memory_address = (char *)io_array[array_pos].memory_address + pos; + fh->f_io_array[i].offset = (char *)io_array[array_pos].offset + pos; - if ( (io_array[array_pos].length - pos ) >= bytes_to_write ) { - fh->f_io_array[i].length = bytes_to_write; + off_t length = io_array[array_pos].length - pos; + + if ( ( (off_t)fh->f_io_array[i].offset + length) < endaddr ) { + fh->f_io_array[i].length = length; } else { - fh->f_io_array[i].length = io_array[array_pos].length - pos; + fh->f_io_array[i].length = endaddr - (size_t)fh->f_io_array[i].offset; } - + pos += fh->f_io_array[i].length; bytes_written += fh->f_io_array[i].length; - bytes_to_write-= fh->f_io_array[i].length; i++; if ( pos == (int)io_array[array_pos].length ) { @@ -1637,7 +1646,7 @@ int mca_fcoll_dynamic_gen2_split_iov_array ( ompio_file_t *fh, mca_common_ompio_ break; } } - } + } while ( (array_pos < num_entries) && (((off_t)io_array[array_pos].offset+pos ) < endaddr) ); fh->f_num_of_io_entries = i; *ret_array_pos = array_pos; From 5385e5f85fca21f54bc3b8d1e146c8b81a48eec3 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Thu, 10 Dec 2020 14:16:07 -0600 Subject: [PATCH 2/8] fbtl/posix/preadv.c: first cut on adding data sieving the lack of performing data sieving has been identified as a main reason for the poor performance in some instances on the Lustre file system. This commit introduces the fundamental ability to perform data sieving for read operations (which should not be controversial). The code itself is correct, what is still lacking is a) the logic when and how to activate data sieving and b) the logic to limit the size of the temporary buffer when doing data sieving. Signed-off-by: Edgar Gabriel --- ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 226 +++++++++++++++++------- 1 file changed, 162 insertions(+), 64 deletions(-) diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index f8a031a926..34eab4e95b 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2017 University of Houston. All rights reserved. + * Copyright (c) 2008-2020 University of Houston. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -28,100 +28,198 @@ #include "ompi/constants.h" #include "ompi/mca/fbtl/fbtl.h" + +static int mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh); +static int mca_fbtl_posix_preadv_generic (ompio_file_t *fh); + ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) { - /*int *fp = NULL;*/ - int i, block=1, ret; - struct iovec *iov = NULL; - int iov_count = 0; - OMPI_MPI_OFFSET_TYPE iov_offset = 0; ssize_t bytes_read=0, ret_code=0; struct flock lock; - off_t total_length, end_offset=0; + int ret; if (NULL == fh->f_io_array) { return OMPI_ERROR; } + + if ( fh->f_num_of_io_entries > 1 ) { + bool do_data_sieving = false; + + if ( do_data_sieving) { + return mca_fbtl_posix_preadv_datasieving (fh); + } + else { + return mca_fbtl_posix_preadv_generic (fh); + } + } + else { + // Case num_of_io_entries == 1 + ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, (off_t)fh->f_io_array[0].offset, + (off_t)fh->f_io_array[0].length, OMPIO_LOCK_ENTIRE_REGION ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_preadv: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno)); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + return OMPI_ERROR; + } + + ret_code = pread(fh->fd, fh->f_io_array[0].memory_address, fh->f_io_array[0].length, (off_t)fh->f_io_array[0].offset ); + mca_fbtl_posix_unlock ( &lock, fh ); + if ( ret_code == -1 ) { + opal_output(1, "mca_fbtl_posix_preadv: error in (p)read(v):%s", strerror(errno)); + return OMPI_ERROR; + } + + bytes_read += ret_code; + } + + return bytes_read; +} - iov = (struct iovec *) malloc - (OMPIO_IOVEC_INITIAL_SIZE * sizeof (struct iovec)); +int mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh) +{ + size_t start, end, len; + int ret, i; + ssize_t bytes_read=0, ret_code=0; + struct flock lock; + + start = (off_t)fh->f_io_array[0].offset; + end = (off_t)fh->f_io_array[fh->f_num_of_io_entries-1].offset + fh->f_io_array[fh->f_num_of_io_entries-1].length; + len = end - start; + + char *temp_buf = (char *) malloc ( len ); + if ( NULL == temp_buf ) { + opal_output(1, "OUT OF MEMORY\n"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + // Read the entire block. + ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, start, len, OMPIO_LOCK_ENTIRE_REGION ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno)); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + return OMPI_ERROR; + } + + ret_code = pread (fh->fd, temp_buf, len, start); + mca_fbtl_posix_unlock ( &lock, fh); + if ( ret_code == -1 ) { + opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in (p)read(v):%s", strerror(errno)); + return OMPI_ERROR; + } + + // Copy out the elements that were requested. + size_t pos = 0; + size_t num_bytes; + size_t start_offset = (size_t) fh->f_io_array[0].offset; + for (i=0 ; if_num_of_io_entries ; i++) { + pos = (size_t) fh->f_io_array[i].offset - start_offset; + if ( (ssize_t) pos > ret_code ) { + break; + } + num_bytes = fh->f_io_array[i].length; + if ( ((ssize_t) pos + (ssize_t)num_bytes) > ret_code ) { + num_bytes = ret_code - (ssize_t)pos; + } + + memcpy (fh->f_io_array[i].memory_address, temp_buf + pos, num_bytes); + bytes_read += num_bytes; + } + + free ( temp_buf); + return bytes_read; +} + +int mca_fbtl_posix_preadv_generic (ompio_file_t *fh ) +{ + ssize_t bytes_read=0, ret_code=0; + struct iovec *iov = NULL; + struct flock lock; + int ret, i; + + int block=1; + int iov_count = 0; + OMPI_MPI_OFFSET_TYPE iov_offset = 0; + off_t total_length, end_offset=0; + + iov = (struct iovec *) malloc (OMPIO_IOVEC_INITIAL_SIZE * sizeof (struct iovec)); if (NULL == iov) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } - + for (i=0 ; if_num_of_io_entries ; i++) { - if (0 == iov_count) { - iov[iov_count].iov_base = fh->f_io_array[i].memory_address; - iov[iov_count].iov_len = fh->f_io_array[i].length; - iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset; + if (0 == iov_count) { + iov[iov_count].iov_base = fh->f_io_array[i].memory_address; + iov[iov_count].iov_len = fh->f_io_array[i].length; + iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset; end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length; - iov_count ++; - } - - if (OMPIO_IOVEC_INITIAL_SIZE*block <= iov_count) { - block ++; - iov = (struct iovec *)realloc - (iov, OMPIO_IOVEC_INITIAL_SIZE * block * - sizeof(struct iovec)); - if (NULL == iov) { - opal_output(1, "OUT OF MEMORY\n"); - return OMPI_ERR_OUT_OF_RESOURCE; - } - } - - if (fh->f_num_of_io_entries != i+1) { - if (((((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset + - (ptrdiff_t)fh->f_io_array[i].length) == - (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset)) && - (iov_count < IOV_MAX ) ){ - iov[iov_count].iov_base = - fh->f_io_array[i+1].memory_address; - iov[iov_count].iov_len = fh->f_io_array[i+1].length; - end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length; - iov_count ++; - continue; - } - } - + iov_count ++; + } + + if (OMPIO_IOVEC_INITIAL_SIZE*block <= iov_count) { + block ++; + iov = (struct iovec *)realloc + (iov, OMPIO_IOVEC_INITIAL_SIZE * block * + sizeof(struct iovec)); + if (NULL == iov) { + opal_output(1, "OUT OF MEMORY\n"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + } + + if (fh->f_num_of_io_entries != i+1) { + if (((((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset + + (ptrdiff_t)fh->f_io_array[i].length) == + (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset)) && + (iov_count < IOV_MAX ) ){ + iov[iov_count].iov_base = + fh->f_io_array[i+1].memory_address; + iov[iov_count].iov_len = fh->f_io_array[i+1].length; + end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length; + iov_count ++; + continue; + } + } + total_length = (end_offset - (off_t)iov_offset ); - + ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); if ( 0 < ret ) { - opal_output(1, "mca_fbtl_posix_preadv: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno)); + opal_output(1, "mca_fbtl_posix_preadv_generic: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno)); free (iov); /* Just in case some part of the lock worked */ mca_fbtl_posix_unlock ( &lock, fh); return OMPI_ERROR; } #if defined(HAVE_PREADV) - ret_code = preadv (fh->fd, iov, iov_count, iov_offset); + ret_code = preadv (fh->fd, iov, iov_count, iov_offset); #else - if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) { - opal_output(1, "mca_fbtl_posix_preadv: error in lseek:%s", strerror(errno)); + if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) { + opal_output(1, "mca_fbtl_posix_preadv_generic: error in lseek:%s", strerror(errno)); free(iov); mca_fbtl_posix_unlock ( &lock, fh ); - return OMPI_ERROR; - } - ret_code = readv (fh->fd, iov, iov_count); + return OMPI_ERROR; + } + ret_code = readv (fh->fd, iov, iov_count); #endif mca_fbtl_posix_unlock ( &lock, fh ); - if ( 0 < ret_code ) { - bytes_read+=ret_code; - } - else if ( ret_code == -1 ) { - opal_output(1, "mca_fbtl_posix_preadv: error in (p)readv:%s", strerror(errno)); + if ( 0 < ret_code ) { + bytes_read+=ret_code; + } + else if ( ret_code == -1 ) { + opal_output(1, "mca_fbtl_posix_preadv_generic: error in (p)readv:%s", strerror(errno)); free(iov); - return OMPI_ERROR; - } - else if ( 0 == ret_code ){ - /* end of file reached, no point in continue reading; */ - break; - } - iov_count = 0; - } + return OMPI_ERROR; + } + else if ( 0 == ret_code ){ + /* end of file reached, no point in continue reading; */ + break; + } + iov_count = 0; + } free (iov); - return bytes_read; } From dbf0d6e5a351e014338cd293f6ca90583f394177 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Tue, 15 Dec 2020 18:12:07 -0600 Subject: [PATCH 3/8] fbtl_posix: add control logic for data sieving only implemented for read at the moment, but the parameters for write are also in place. Signed-off-by: Edgar Gabriel --- ompi/mca/fbtl/posix/fbtl_posix.h | 7 ++- ompi/mca/fbtl/posix/fbtl_posix_component.c | 71 +++++++++++++++++++++- ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 28 +++++++-- 3 files changed, 100 insertions(+), 6 deletions(-) diff --git a/ompi/mca/fbtl/posix/fbtl_posix.h b/ompi/mca/fbtl/posix/fbtl_posix.h index b9f1c1149e..e01101f1dc 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix.h +++ b/ompi/mca/fbtl/posix/fbtl_posix.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2018 University of Houston. All rights reserved. + * Copyright (c) 2008-2020 University of Houston. All rights reserved. * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -29,6 +29,11 @@ #include "ompi/mca/common/ompio/common_ompio_request.h" extern int mca_fbtl_posix_priority; +extern bool mca_fbtl_posix_read_datasieving; +extern bool mca_fbtl_posix_write_datasieving; +extern size_t mca_fbtl_posix_max_block_size; +extern size_t mca_fbtl_posix_max_gap_size; +extern size_t mca_fbtl_posix_max_tmpbuf_size; BEGIN_C_DECLS diff --git a/ompi/mca/fbtl/posix/fbtl_posix_component.c b/ompi/mca/fbtl/posix/fbtl_posix_component.c index 8575c2cad4..32f1d41e1a 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_component.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_component.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2018 University of Houston. All rights reserved. + * Copyright (c) 2008-2020 University of Houston. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -37,6 +37,15 @@ const char *mca_fbtl_posix_component_version_string = "OMPI/MPI posix FBTL MCA component version " OMPI_VERSION; int mca_fbtl_posix_priority = 10; +bool mca_fbtl_posix_read_datasieving = true; +bool mca_fbtl_posix_write_datasieving = true; +size_t mca_fbtl_posix_max_block_size = 1048576; // 1MB +size_t mca_fbtl_posix_max_gap_size = 4096; // Size of a block in many linux fs +size_t mca_fbtl_posix_max_tmpbuf_size = 67108864; // 64 MB +/* + * Private functions + */ +static int register_component(void); /* * Instantiate the public struct with all of our public information @@ -54,6 +63,7 @@ mca_fbtl_base_component_2_0_0_t mca_fbtl_posix_component = { .mca_component_name = "posix", MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, OMPI_RELEASE_VERSION), + .mca_register_component_params = register_component, }, .fbtlm_data = { /* This component is checkpointable */ @@ -63,3 +73,62 @@ mca_fbtl_base_component_2_0_0_t mca_fbtl_posix_component = { .fbtlm_file_query = mca_fbtl_posix_component_file_query, /* get priority and actions */ .fbtlm_file_unquery = mca_fbtl_posix_component_file_unquery, /* undo what was done by previous function */ }; + +static int register_component(void) +{ + mca_fbtl_posix_priority = 10; + (void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version, + "priority", "Priority of the fbtl posix component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fbtl_posix_priority); + + mca_fbtl_posix_max_block_size = 1048576; + (void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version, + "max_block_size", "Maximum average size in bytes of a data block in an iovec for data sieving. " + "An average block size larger than this parameter will disable data sieving. Default: 1048576 bytes.", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fbtl_posix_max_block_size ); + + mca_fbtl_posix_max_gap_size = 4096; + (void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version, + "max_gap_size", "Maximum average gap size between two blocks in an iovec for data sieving. " + "An average gap size larger than this parameter will disable data sieving. Default: 4096 bytes. " , + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fbtl_posix_max_gap_size ); + + mca_fbtl_posix_max_tmpbuf_size = 67108864; + (void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version, + "max_tmpbuf_size", "Maximum size of the temporary buffer used for data sieving in bytes. " + "Default: 67108864 (64MB). " , + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fbtl_posix_max_tmpbuf_size ); + + mca_fbtl_posix_read_datasieving = true; + (void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version, + "read_datasieving", "Parameter indicating whether to perform data sieving for read operations. " + "Default: true.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fbtl_posix_read_datasieving ); + + mca_fbtl_posix_write_datasieving = true; + (void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version, + "write_datasieving", "Parameter indicating whether to perform data sieving for write operations. " + "Default: true.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fbtl_posix_write_datasieving ); + + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index 34eab4e95b..5e19080105 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -43,8 +43,27 @@ ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) } if ( fh->f_num_of_io_entries > 1 ) { - bool do_data_sieving = false; - + bool do_data_sieving = true; + + size_t avg_gap_size=0; + size_t avg_block_size = 0; + off_t prev_offset = (off_t)fh->f_io_array[0].offset; + int i; + for ( i=0; i< fh->f_num_of_io_entries; i++ ) { + avg_block_size += fh->f_io_array[i].length; + avg_gap_size += (size_t)((off_t)fh->f_io_array[i].offset - prev_offset); + prev_offset = (off_t)fh->f_io_array[i].offset; + } + avg_block_size = avg_block_size / fh->f_num_of_io_entries; + avg_gap_size = avg_gap_size / fh->f_num_of_io_entries; + + if ( mca_fbtl_posix_read_datasieving == false || + avg_gap_size == 0 || + avg_block_size > mca_fbtl_posix_max_block_size || + avg_gap_size > mca_fbtl_posix_max_gap_size ) { + do_data_sieving = false; + } + if ( do_data_sieving) { return mca_fbtl_posix_preadv_datasieving (fh); } @@ -53,7 +72,7 @@ ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) } } else { - // Case num_of_io_entries == 1 + // i.e. fh->f_num_of_io_entries == 1 ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, (off_t)fh->f_io_array[0].offset, (off_t)fh->f_io_array[0].length, OMPIO_LOCK_ENTIRE_REGION ); if ( 0 < ret ) { @@ -63,7 +82,8 @@ ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) return OMPI_ERROR; } - ret_code = pread(fh->fd, fh->f_io_array[0].memory_address, fh->f_io_array[0].length, (off_t)fh->f_io_array[0].offset ); + ret_code = pread(fh->fd, fh->f_io_array[0].memory_address, fh->f_io_array[0].length, + (off_t)fh->f_io_array[0].offset ); mca_fbtl_posix_unlock ( &lock, fh ); if ( ret_code == -1 ) { opal_output(1, "mca_fbtl_posix_preadv: error in (p)read(v):%s", strerror(errno)); From 90d8c8c39cd81baa4f3cc9d330a5bcf6cbfe5712 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Wed, 16 Dec 2020 11:17:15 -0600 Subject: [PATCH 4/8] fbtl_posix_preadv: limit the size of the temporary buffer when using data sieving. Signed-off-by: Edgar Gabriel --- ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 130 ++++++++++++++++-------- 1 file changed, 85 insertions(+), 45 deletions(-) diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index 5e19080105..15e9772981 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -29,8 +29,8 @@ #include "ompi/mca/fbtl/fbtl.h" -static int mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh); -static int mca_fbtl_posix_preadv_generic (ompio_file_t *fh); +static ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh); +static ssize_t mca_fbtl_posix_preadv_generic (ompio_file_t *fh); ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) { @@ -57,8 +57,8 @@ ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) avg_block_size = avg_block_size / fh->f_num_of_io_entries; avg_gap_size = avg_gap_size / fh->f_num_of_io_entries; - if ( mca_fbtl_posix_read_datasieving == false || - avg_gap_size == 0 || + if ( false == mca_fbtl_posix_read_datasieving || + 0 == avg_gap_size || avg_block_size > mca_fbtl_posix_max_block_size || avg_gap_size > mca_fbtl_posix_max_gap_size ) { do_data_sieving = false; @@ -76,7 +76,8 @@ ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, (off_t)fh->f_io_array[0].offset, (off_t)fh->f_io_array[0].length, OMPIO_LOCK_ENTIRE_REGION ); if ( 0 < ret ) { - opal_output(1, "mca_fbtl_posix_preadv: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno)); + opal_output(1, "mca_fbtl_posix_preadv: error in mca_fbtl_posix_lock() ret=%d: %s", + ret, strerror(errno)); /* Just in case some part of the lock worked */ mca_fbtl_posix_unlock ( &lock, fh); return OMPI_ERROR; @@ -96,62 +97,101 @@ ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) return bytes_read; } -int mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh) +ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh) { size_t start, end, len; - int ret, i; + size_t bufsize = 0; + int ret, i, j; ssize_t bytes_read=0, ret_code=0; struct flock lock; + char *temp_buf = NULL; - start = (off_t)fh->f_io_array[0].offset; - end = (off_t)fh->f_io_array[fh->f_num_of_io_entries-1].offset + fh->f_io_array[fh->f_num_of_io_entries-1].length; - len = end - start; + int startindex = 0; + int endindex = 0; + bool done = false; - char *temp_buf = (char *) malloc ( len ); - if ( NULL == temp_buf ) { - opal_output(1, "OUT OF MEMORY\n"); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - // Read the entire block. - ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, start, len, OMPIO_LOCK_ENTIRE_REGION ); - if ( 0 < ret ) { - opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno)); - /* Just in case some part of the lock worked */ - mca_fbtl_posix_unlock ( &lock, fh); - return OMPI_ERROR; - } - - ret_code = pread (fh->fd, temp_buf, len, start); - mca_fbtl_posix_unlock ( &lock, fh); - if ( ret_code == -1 ) { - opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in (p)read(v):%s", strerror(errno)); - return OMPI_ERROR; - } - - // Copy out the elements that were requested. - size_t pos = 0; - size_t num_bytes; - size_t start_offset = (size_t) fh->f_io_array[0].offset; - for (i=0 ; if_num_of_io_entries ; i++) { - pos = (size_t) fh->f_io_array[i].offset - start_offset; - if ( (ssize_t) pos > ret_code ) { + while (!done) { + // Break the io_array into chunks such that the size of the temporary + // buffer does not exceed mca_fbtl_posix_max_tmpbuf_size bytes. + // Each iteration will thus work in the range (startindex, endindex[ + startindex = endindex; + if ( startindex >= fh->f_num_of_io_entries ) { + done = true; break; } - num_bytes = fh->f_io_array[i].length; - if ( ((ssize_t) pos + (ssize_t)num_bytes) > ret_code ) { - num_bytes = ret_code - (ssize_t)pos; + + size_t sstart = (size_t)fh->f_io_array[startindex].offset; + size_t slen=0; + + for ( j = startindex; j < fh->f_num_of_io_entries; j++ ) { + endindex = j; + slen = ((size_t)fh->f_io_array[j].offset + fh->f_io_array[j].length) - sstart; + if (slen > mca_fbtl_posix_max_tmpbuf_size ) { + endindex = j-1; + break; + } + } + // Need to increment the value of endindex + // by one for the loop syntax to work correctly. + endindex++; + + start = (size_t)fh->f_io_array[startindex].offset; + end = (size_t)fh->f_io_array[endindex-1].offset + fh->f_io_array[endindex-1].length; + len = end - start; + + if ( len > bufsize ) { + if ( NULL != temp_buf ) { + free ( temp_buf); + } + temp_buf = (char *) malloc ( len ); + if ( NULL == temp_buf ) { + opal_output(1, "OUT OF MEMORY\n"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + bufsize = len; } - memcpy (fh->f_io_array[i].memory_address, temp_buf + pos, num_bytes); - bytes_read += num_bytes; + // Read the entire block. + ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, start, len, OMPIO_LOCK_ENTIRE_REGION ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in mca_fbtl_posix_lock() ret=%d: %s", + ret, strerror(errno)); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + return OMPI_ERROR; + } + + ret_code = pread (fh->fd, temp_buf, len, start); + mca_fbtl_posix_unlock ( &lock, fh); + if ( ret_code == -1 ) { + opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in (p)read(v):%s", strerror(errno)); + return OMPI_ERROR; + } + + // Copy out the elements that were requested. + size_t pos = 0; + size_t num_bytes; + size_t start_offset = (size_t) fh->f_io_array[startindex].offset; + for ( i = startindex ; i < endindex ; i++) { + pos = (size_t) fh->f_io_array[i].offset - start_offset; + if ( (ssize_t) pos > ret_code ) { + break; + } + num_bytes = fh->f_io_array[i].length; + if ( ((ssize_t) pos + (ssize_t)num_bytes) > ret_code ) { + num_bytes = ret_code - (ssize_t)pos; + } + + memcpy (fh->f_io_array[i].memory_address, temp_buf + pos, num_bytes); + bytes_read += num_bytes; + } } free ( temp_buf); return bytes_read; } -int mca_fbtl_posix_preadv_generic (ompio_file_t *fh ) +ssize_t mca_fbtl_posix_preadv_generic (ompio_file_t *fh ) { ssize_t bytes_read=0, ret_code=0; struct iovec *iov = NULL; From d65480df3558f4803b9a6ae3e276cdb6b280f05d Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Wed, 16 Dec 2020 15:34:45 -0600 Subject: [PATCH 5/8] fbtl_posix_pwritev: add datasieving support for write its however restricted to collective I/O operations, at this point only from vulcan and dynamic_gen2. required some more infrastructure to be added to recognize individual I/O and multi-threaded environments. Signed-off-by: Edgar Gabriel --- ompi/mca/common/ompio/common_ompio.h | 1 + ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 2 + ompi/mca/fbtl/posix/fbtl_posix_pwritev.c | 182 +++++++++++++++++- .../fcoll_dynamic_gen2_file_write_all.c | 8 +- .../vulcan/fcoll_vulcan_file_write_all.c | 2 + 5 files changed, 192 insertions(+), 3 deletions(-) diff --git a/ompi/mca/common/ompio/common_ompio.h b/ompi/mca/common/ompio/common_ompio.h index 33dd0dd402..1f2bbc585d 100644 --- a/ompi/mca/common/ompio/common_ompio.h +++ b/ompi/mca/common/ompio/common_ompio.h @@ -67,6 +67,7 @@ #define OMPIO_LOCK_NEVER 0x00000100 #define OMPIO_LOCK_NOT_THIS_OP 0x00000200 #define OMPIO_DATAREP_NATIVE 0x00000400 +#define OMPIO_COLLECTIVE_OP 0x00000800 #define OMPIO_ROOT 0 diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index 15e9772981..89a819a6e2 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -158,6 +158,7 @@ ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh) ret, strerror(errno)); /* Just in case some part of the lock worked */ mca_fbtl_posix_unlock ( &lock, fh); + free ( temp_buf); return OMPI_ERROR; } @@ -165,6 +166,7 @@ ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh) mca_fbtl_posix_unlock ( &lock, fh); if ( ret_code == -1 ) { opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in (p)read(v):%s", strerror(errno)); + free ( temp_buf); return OMPI_ERROR; } diff --git a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c index 7ad6e6d9d2..d54e9e0943 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2017 University of Houston. All rights reserved. + * Copyright (c) 2008-2020 University of Houston. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -30,7 +30,187 @@ #include "ompi/constants.h" #include "ompi/mca/fbtl/fbtl.h" +static ssize_t mca_fbtl_posix_pwritev_datasieving (ompio_file_t *fh ); +static ssize_t mca_fbtl_posix_pwritev_generic (ompio_file_t *fh ); + ssize_t mca_fbtl_posix_pwritev(ompio_file_t *fh ) +{ + ssize_t bytes_written=0, ret_code=0; + struct flock lock; + int ret; + + if (NULL == fh->f_io_array) { + return OMPI_ERROR; + } + + if ( fh->f_num_of_io_entries > 1 ) { + bool do_data_sieving = true; + + size_t avg_gap_size=0; + size_t avg_block_size = 0; + off_t prev_offset = (off_t)fh->f_io_array[0].offset; + int i; + for ( i=0; i< fh->f_num_of_io_entries; i++ ) { + avg_block_size += fh->f_io_array[i].length; + avg_gap_size += (size_t)((off_t)fh->f_io_array[i].offset - prev_offset); + prev_offset = (off_t)fh->f_io_array[i].offset; + } + avg_block_size = avg_block_size / fh->f_num_of_io_entries; + avg_gap_size = avg_gap_size / fh->f_num_of_io_entries; + + if ( false == mca_fbtl_posix_write_datasieving || + 0 == avg_gap_size || + avg_block_size > mca_fbtl_posix_max_block_size || + avg_gap_size > mca_fbtl_posix_max_gap_size || + ompi_mpi_thread_multiple || + !(fh->f_flags & OMPIO_COLLECTIVE_OP) ) { + do_data_sieving = false; + } + + if ( do_data_sieving) { + return mca_fbtl_posix_pwritev_datasieving (fh); + } + else { + return mca_fbtl_posix_pwritev_generic (fh); + } + } + else { + // i.e. fh->f_num_of_io_entries == 1 + ret = mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, (off_t)fh->f_io_array[0].offset, + (off_t)fh->f_io_array[0].length, OMPIO_LOCK_ENTIRE_REGION ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_pwritev: error in mca_fbtl_posix_lock() ret=%d: %s", + ret, strerror(errno)); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + return OMPI_ERROR; + } + + ret_code = pwrite(fh->fd, fh->f_io_array[0].memory_address, fh->f_io_array[0].length, + (off_t)fh->f_io_array[0].offset ); + mca_fbtl_posix_unlock ( &lock, fh ); + if ( ret_code == -1 ) { + opal_output(1, "mca_fbtl_posix_pwritev: error in (p)write(v):%s", strerror(errno)); + return OMPI_ERROR; + } + + bytes_written += ret_code; + } + + return bytes_written; +} + +ssize_t mca_fbtl_posix_pwritev_datasieving (ompio_file_t *fh) +{ + size_t start, end, len; + size_t bufsize = 0; + int ret, i, j; + ssize_t bytes_written=0, ret_code=0; + struct flock lock; + char *temp_buf = NULL; + + int startindex = 0; + int endindex = 0; + bool done = false; + + while (!done) { + // Break the io_array into chunks such that the size of the temporary + // buffer does not exceed mca_fbtl_posix_max_tmpbuf_size bytes. + // Each iteration will thus work in the range (startindex, endindex[ + startindex = endindex; + if ( startindex >= fh->f_num_of_io_entries ) { + done = true; + break; + } + + size_t sstart = (size_t)fh->f_io_array[startindex].offset; + size_t slen=0; + + for ( j = startindex; j < fh->f_num_of_io_entries; j++ ) { + endindex = j; + slen = ((size_t)fh->f_io_array[j].offset + fh->f_io_array[j].length) - sstart; + if (slen > mca_fbtl_posix_max_tmpbuf_size ) { + endindex = j-1; + break; + } + } + // Need to increment the value of endindex + // by one for the loop syntax to work correctly. + endindex++; + + start = (size_t)fh->f_io_array[startindex].offset; + end = (size_t)fh->f_io_array[endindex-1].offset + fh->f_io_array[endindex-1].length; + len = end - start; + + if ( len > bufsize ) { + if ( NULL != temp_buf ) { + free ( temp_buf); + } + temp_buf = (char *) malloc ( len ); + if ( NULL == temp_buf ) { + opal_output(1, "OUT OF MEMORY\n"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + bufsize = len; + } + + // Read the entire block. + ret = mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, start, len, OMPIO_LOCK_ENTIRE_REGION ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in mca_fbtl_posix_lock() ret=%d: %s", + ret, strerror(errno)); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + free ( temp_buf); + return OMPI_ERROR; + } + + ret_code = pread (fh->fd, temp_buf, len, start); + if ( ret_code == -1 ) { + //opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in pwrite:%s", strerror(errno)); + opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in pwrite:%s", strerror(errno)); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + free ( temp_buf); + return OMPI_ERROR; + } + + // Copy out the elements to write into temporary buffer. + size_t pos = 0; + size_t num_bytes; + size_t start_offset = (size_t) fh->f_io_array[startindex].offset; + for ( i = startindex ; i < endindex ; i++) { + pos = (size_t) fh->f_io_array[i].offset - start_offset; + num_bytes = fh->f_io_array[i].length; + memcpy (temp_buf + pos, fh->f_io_array[i].memory_address, num_bytes); + bytes_written += num_bytes; + } + ret_code = pwrite (fh->fd, temp_buf, len, start); + if ( ret_code == -1 ) { + opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in pwrite:%s", strerror(errno)); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + free ( temp_buf); + return OMPI_ERROR; + } + + mca_fbtl_posix_unlock ( &lock, fh); + if ( ret_code == -1 ) { + opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in pwrite:%s", strerror(errno)); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + free ( temp_buf); + return OMPI_ERROR; + } + + } + + free ( temp_buf); + return bytes_written; +} + + +ssize_t mca_fbtl_posix_pwritev_generic (ompio_file_t *fh ) { /*int *fp = NULL;*/ int i, block = 1, ret; diff --git a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c index 282b248f79..6e6f09a20c 100644 --- a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c +++ b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c @@ -736,11 +736,15 @@ static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator if ( aggregator == fh->f_rank && aggr_data->prev_num_io_entries) { - while ( aggr_data->prev_bytes_to_write > 0 ) { + while ( aggr_data->prev_bytes_to_write > 0 ) { + ssize_t tret; aggr_data->prev_bytes_to_write -= mca_fcoll_dynamic_gen2_split_iov_array (fh, aggr_data->prev_io_array, aggr_data->prev_num_io_entries, &last_array_pos, &last_pos ); - if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) { + fh->f_flags |= OMPIO_COLLECTIVE_OP; + tret = fh->f_fbtl->fbtl_pwritev (fh); + fh->f_flags &= ~OMPIO_COLLECTIVE_OP; + if ( 0 > tret ) { free ( aggr_data->prev_io_array); opal_output (1, "dynamic_gen2_write_all: fbtl_pwritev failed\n"); ret = OMPI_ERROR; diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c index abf281a0a3..8b2ccaa183 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c @@ -771,7 +771,9 @@ static int write_init (ompio_file_t *fh, } } else { + fh->f_flags |= OMPIO_COLLECTIVE_OP; ret_temp = fh->f_fbtl->fbtl_pwritev(fh); + fh->f_flags &= ~OMPIO_COLLECTIVE_OP; if(0 > ret_temp) { opal_output (1, "vulcan_write_all: fbtl_pwritev failed\n"); ret = ret_temp; From 2c6107473960ef5cc108d07c4715aad47940d40b Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Thu, 17 Dec 2020 11:43:23 -0600 Subject: [PATCH 6/8] dynamic_gen2: code cleanup remove now unused mca parameter, get rid of an unnecesary if-else part, and move setting the flag outside of the while loop. Signed-off-by: Edgar Gabriel --- ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2.h | 3 +-- .../fcoll/dynamic_gen2/fcoll_dynamic_gen2_component.c | 10 +--------- .../dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c | 11 +++-------- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2.h b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2.h index bd46710c45..10317ca180 100644 --- a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2.h +++ b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2016 University of Houston. All rights reserved. + * Copyright (c) 2008-2020 University of Houston. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -36,7 +36,6 @@ BEGIN_C_DECLS extern int mca_fcoll_dynamic_gen2_priority; extern int mca_fcoll_dynamic_gen2_num_groups; -extern int mca_fcoll_dynamic_gen2_write_chunksize; OMPI_MODULE_DECLSPEC extern mca_fcoll_base_component_2_0_0_t mca_fcoll_dynamic_gen2_component; diff --git a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_component.c b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_component.c index 055b6b244b..688a70138a 100644 --- a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_component.c +++ b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_component.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2008-2016 University of Houston. All rights reserved. + * Copyright (c) 2008-2020 University of Houston. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -42,7 +42,6 @@ const char *mca_fcoll_dynamic_gen2_component_version_string = */ int mca_fcoll_dynamic_gen2_priority = 10; int mca_fcoll_dynamic_gen2_num_groups = 1; -int mca_fcoll_dynamic_gen2_write_chunksize = -1; /* * Local function @@ -95,12 +94,5 @@ dynamic_gen2_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_dynamic_gen2_num_groups); - mca_fcoll_dynamic_gen2_write_chunksize = -1; - (void) mca_base_component_var_register(&mca_fcoll_dynamic_gen2_component.fcollm_version, - "write_chunksize", "Chunk size written at once. Default: stripe_size of the file system", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_dynamic_gen2_write_chunksize); - return OMPI_SUCCESS; } diff --git a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c index 6e6f09a20c..d39e94201f 100644 --- a/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c +++ b/ompi/mca/fcoll/dynamic_gen2/fcoll_dynamic_gen2_file_write_all.c @@ -736,14 +736,13 @@ static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator if ( aggregator == fh->f_rank && aggr_data->prev_num_io_entries) { + fh->f_flags |= OMPIO_COLLECTIVE_OP; while ( aggr_data->prev_bytes_to_write > 0 ) { ssize_t tret; aggr_data->prev_bytes_to_write -= mca_fcoll_dynamic_gen2_split_iov_array (fh, aggr_data->prev_io_array, aggr_data->prev_num_io_entries, &last_array_pos, &last_pos ); - fh->f_flags |= OMPIO_COLLECTIVE_OP; tret = fh->f_fbtl->fbtl_pwritev (fh); - fh->f_flags &= ~OMPIO_COLLECTIVE_OP; if ( 0 > tret ) { free ( aggr_data->prev_io_array); opal_output (1, "dynamic_gen2_write_all: fbtl_pwritev failed\n"); @@ -766,6 +765,7 @@ static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator fh->f_io_array[n].length); #endif } + fh->f_flags &= ~OMPIO_COLLECTIVE_OP; free ( fh->f_io_array ); free ( aggr_data->prev_io_array); } @@ -1643,12 +1643,7 @@ int mca_fcoll_dynamic_gen2_split_iov_array ( ompio_file_t *fh, mca_common_ompio_ if ( pos == (int)io_array[array_pos].length ) { pos = 0; - if ((array_pos + 1) < num_entries) { - array_pos++; - } - else { - break; - } + array_pos++; } } while ( (array_pos < num_entries) && (((off_t)io_array[array_pos].offset+pos ) < endaddr) ); From aa2d21ee50b7a30e9fb6251d14aa6730bd673136 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Thu, 17 Dec 2020 12:22:54 -0600 Subject: [PATCH 7/8] lustre_file_open: avoid explicit locking on lustre file systems Signed-off-by: Edgar Gabriel --- ompi/mca/fs/lustre/fs_lustre_file_open.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ompi/mca/fs/lustre/fs_lustre_file_open.c b/ompi/mca/fs/lustre/fs_lustre_file_open.c index 7d8540025d..3a0e1c049b 100644 --- a/ompi/mca/fs/lustre/fs_lustre_file_open.c +++ b/ompi/mca/fs/lustre/fs_lustre_file_open.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2018 University of Houston. All rights reserved. - * Copyright (c) 2015-2018 Research Organization for Information Science + * Copyright (c) 2015-2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -144,6 +144,7 @@ mca_fs_lustre_file_open (struct ompi_communicator_t *comm, fh->f_stripe_size = lump->lmm_stripe_size; fh->f_stripe_count = lump->lmm_stripe_count; fh->f_fs_block_size = lump->lmm_stripe_size; + fh->f_flags |= OMPIO_LOCK_NEVER; return OMPI_SUCCESS; } From 56dbd096d3d3d6cd914944d8bf7644baae7ed0ba Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Thu, 17 Dec 2020 13:26:51 -0600 Subject: [PATCH 8/8] io/ompio: remove the special handling of Lustre in the selection logic ompio is now the default on Lustre as well Signed-off-by: Edgar Gabriel --- ompi/mca/io/ompio/io_ompio_component.c | 36 ++------------------------ 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/ompi/mca/io/ompio/io_ompio_component.c b/ompi/mca/io/ompio/io_ompio_component.c index 2d82f16475..09d18aad75 100644 --- a/ompi/mca/io/ompio/io_ompio_component.c +++ b/ompi/mca/io/ompio/io_ompio_component.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2019 University of Houston. All rights reserved. + * Copyright (c) 2008-2020 University of Houston. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015-2018 Research Organization for Information Science @@ -302,42 +302,11 @@ file_query(struct ompi_file_t *file, int *priority) { mca_common_ompio_data_t *data; - char *tmp; - int rank; - int is_lustre=0; //false - tmp = strchr (file->f_filename, ':'); - rank = ompi_comm_rank ( file->f_comm); - if (!tmp) { - if ( 0 == rank) { - if (LUSTRE == mca_fs_base_get_fstype(file->f_filename)) { - is_lustre = 1; //true - } - } - - file->f_comm->c_coll->coll_bcast (&is_lustre, - 1, - MPI_INT, - 0, - file->f_comm, - file->f_comm->c_coll->coll_bcast_module); - } - else { - if (!strncasecmp(file->f_filename, "lustre:", 7) ) { - is_lustre = 1; - } - } - - if (is_lustre) { - *priority = 1; - } - else { - *priority = priority_param; - } + *priority = priority_param; /* Allocate a space for this module to hang private data (e.g., the OMPIO file handle) */ - data = calloc(1, sizeof(mca_common_ompio_data_t)); if (NULL == data) { return NULL; @@ -346,7 +315,6 @@ file_query(struct ompi_file_t *file, *private_data = (struct mca_io_base_file_t*) data; /* All done */ - return &mca_io_ompio_module; }