From f5e158c8693c007e839b351c2d23997aa10f0eaf Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Wed, 20 Sep 2017 17:40:04 -0500 Subject: [PATCH 1/6] fbtl/posix: first cut in adding locking support Signed-off-by: Edgar Gabriel --- ompi/mca/fbtl/posix/Makefile.am | 5 +- ompi/mca/fbtl/posix/fbtl_posix_lock.c | 114 +++++++++++++++++++++++ ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 9 +- ompi/mca/fbtl/posix/fbtl_posix_pwritev.c | 16 +++- 4 files changed, 135 insertions(+), 9 deletions(-) create mode 100644 ompi/mca/fbtl/posix/fbtl_posix_lock.c diff --git a/ompi/mca/fbtl/posix/Makefile.am b/ompi/mca/fbtl/posix/Makefile.am index 865aa5edcd..a7b0624d3e 100644 --- a/ompi/mca/fbtl/posix/Makefile.am +++ b/ompi/mca/fbtl/posix/Makefile.am @@ -9,7 +9,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2008-2011 University of Houston. All rights reserved. +# Copyright (c) 2008-2017 University of Houston. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved. # $COPYRIGHT$ # @@ -49,4 +49,5 @@ sources = \ fbtl_posix_preadv.c \ fbtl_posix_ipreadv.c \ fbtl_posix_pwritev.c \ - fbtl_posix_ipwritev.c + fbtl_posix_ipwritev.c \ + fbtl_posix_lock.c diff --git a/ompi/mca/fbtl/posix/fbtl_posix_lock.c b/ompi/mca/fbtl/posix/fbtl_posix_lock.c new file mode 100644 index 0000000000..cb0be53709 --- /dev/null +++ b/ompi/mca/fbtl/posix/fbtl_posix_lock.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 University of Houston. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "fbtl_posix.h" + +#include "mpi.h" +#include +#include +#include +#include "ompi/constants.h" +#include "ompi/mca/fbtl/fbtl.h" + + + +/* + op: can be F_WRLCK or F_RDLCK + flags: can be OMPIO_LOCK_ENTIRE_REGION or OMPIO_LOCK_SELECTIVE +*/ + +int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, + OMPI_MPI_OFFSET_TYPE iov_offset, off_t len, int flags) +{ + off_t lmod, bmod; + + lock->l_type = op; + lock->l_whence = SEEK_SET; + lock->l_start =-1; + lock->l_len =-1; + if ( fh->f_atomicity || + fh->f_flags & OMPIO_LOCK_ALWAYS ) { + /* Need to lock the entire region */ + lock->l_start = (off_t) iov_offset; + lock->l_len = len; + } + else { + if ( (fh->f_flags & OMPIO_LOCK_NEVER) || + (fh->f_flags & OMPIO_LOCK_NOT_THIS_OP )){ + /* OMPIO_LOCK_NEVER: + ompio tells us not to worry about locking. This can be due to three + reasons: + 1. user enforced + 2. single node job where the locking is handled already in the kernel + 3. file view is set to distinct regions such that multiple processes + do not collide on the block level. ( not entirely sure yet how + to check for this except in trivial cases). + OMPI_LOCK_NOT_THIS_OP: + will typically be set by fcoll components indicating that the file partitioning + ensures no overlap in blocks. + */ + return 0; + } + if ( OMPIO_LOCK_ENTIRE_REGION ) { + lock->l_start = (off_t) iov_offset; + lock->l_len = len; + } + else { + /* We only try to lock the first block in the data range if + the starting offset is not the starting offset of a file system + block. And the last block in the data range if the offset+len + is not equal to the end of a file system block. + If we need to lock both beginning + end, we combine + the two into a single lock. + */ + bmod = offset % fh->f_fs_block_size; + if ( !bmod ) { + lock->l_start = (off_t) iov_offset; + lock->l_len = fh->f_fs_block_size - bmod; + } + lmod = (offset+len-1)%fh->f_fs_block_size; + if ( !lmod ) { + if ( bmod ) { + lock->l_start = (offset+len-lmod ); + lock->l_len = lmod; + } + else { + lock->l_len = len; + } + } + if ( -1 == lock->l_start && -1 == lock->l_len ) { + /* no need to lock in this instance */ + return 0; + } + } + } + + return (fcntl ( fh->fd, F_SETLKW, lock)); +} + +int mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh ) +{ + if ( -1 == lock->l_start && -1 == lock->l_len ) { + return 0; + } + + lock->l_type = F_UNLCK; + return (fcntl ( fh->fd, F_SETLK, lock)); +} diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index f99e93348c..11d3de02af 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2014 University of Houston. All rights reserved. + * Copyright (c) 2008-2017 University of Houston. All rights reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -36,6 +36,8 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) int iov_count = 0; OMPI_MPI_OFFSET_TYPE iov_offset = 0; ssize_t bytes_read=0, ret_code=0; + struct flock lock; + off_t total_length; if (NULL == fh->f_io_array) { return OMPI_ERROR; @@ -53,6 +55,7 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i].memory_address; iov[iov_count].iov_len = fh->f_io_array[i].length; iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset; + total_length = iov[iov_count].iov_len; iov_count ++; } @@ -75,11 +78,12 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i+1].memory_address; iov[iov_count].iov_len = fh->f_io_array[i+1].length; + total_length += iov[iov_count].iov_len; iov_count ++; continue; } } - + mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_len, OMPIO_LOCK_SELECTIVE ); #if defined(HAVE_PREADV) ret_code = preadv (fh->fd, iov, iov_count, iov_offset); if ( 0 < ret_code ) { @@ -96,6 +100,7 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) bytes_read+=ret_code; } #endif + mca_fbtl_posix_unlock ( &lock, fh ); else if ( ret_code == -1 ) { opal_output(1, "readv:%s", strerror(errno)); free(iov); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c index 5ed6574a5b..9929823185 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2014 University of Houston. All rights reserved. + * Copyright (c) 2008-2017 University of Houston. All rights reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -38,6 +38,8 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) int iov_count = 0; OMPI_MPI_OFFSET_TYPE iov_offset = 0; ssize_t ret_code=0, bytes_written=0; + struct flock lock; + off_t total_length; if (NULL == fh->f_io_array) { return OMPI_ERROR; @@ -55,6 +57,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i].memory_address; iov[iov_count].iov_len = fh->f_io_array[i].length; iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset; + total_length = iov[iov_count].iov_len; iov_count ++; } @@ -74,10 +77,10 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) (ptrdiff_t)fh->f_io_array[i].length) == (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset) && (iov_count < IOV_MAX )) { - iov[iov_count].iov_base = - fh->f_io_array[i+1].memory_address; - iov[iov_count].iov_len = fh->f_io_array[i+1].length; - iov_count ++; + iov[iov_count].iov_base = fh->f_io_array[i+1].memory_address; + iov[iov_count].iov_len = fh->f_io_array[i+1].length; + total_length += iov[iov_count].iov_len; + iov_count ++; continue; } } @@ -93,6 +96,8 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) } */ + + mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_len, OMPIO_LOCK_SELECTIVE ); #if defined (HAVE_PWRITEV) ret_code = pwritev (fh->fd, iov, iov_count, iov_offset); if ( 0 < ret_code ) { @@ -110,6 +115,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) bytes_written += ret_code; } #endif + mca_fbtl_posix_unlock ( &lock, fh ); else if (-1 == ret_code ) { opal_output(1, "writev:%s", strerror(errno)); free (iov); From 415e76514d0189c8261bf2a795ac26f70fe611be Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Wed, 20 Sep 2017 17:57:57 -0500 Subject: [PATCH 2/6] fbtl/posix: make the code compile Signed-off-by: Edgar Gabriel --- ompi/mca/fbtl/posix/fbtl_posix.h | 6 ++++++ ompi/mca/fbtl/posix/fbtl_posix_lock.c | 8 ++++---- ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 10 ++++------ ompi/mca/fbtl/posix/fbtl_posix_pwritev.c | 11 ++++------- ompi/mca/io/ompio/io_ompio.h | 9 +++++++++ 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/ompi/mca/fbtl/posix/fbtl_posix.h b/ompi/mca/fbtl/posix/fbtl_posix.h index 9111cba761..b9353a921a 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix.h +++ b/ompi/mca/fbtl/posix/fbtl_posix.h @@ -58,6 +58,11 @@ ssize_t mca_fbtl_posix_ipwritev (mca_io_ompio_file_t *file, bool mca_fbtl_posix_progress ( mca_ompio_request_t *req); void mca_fbtl_posix_request_free ( mca_ompio_request_t *req); +int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, + OMPI_MPI_OFFSET_TYPE iov_offset, off_t len, int flags); +int mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh ); + + struct mca_fbtl_posix_request_data_t { int aio_req_count; /* total number of aio reqs */ int aio_open_reqs; /* number of unfinished reqs */ @@ -78,6 +83,7 @@ typedef struct mca_fbtl_posix_request_data_t mca_fbtl_posix_request_data_t; #define FBTL_POSIX_READ 1 #define FBTL_POSIX_WRITE 2 + /* * ****************************************************************** * ************ functions implemented in this module end ************ diff --git a/ompi/mca/fbtl/posix/fbtl_posix_lock.c b/ompi/mca/fbtl/posix/fbtl_posix_lock.c index cb0be53709..30551b4424 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_lock.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_lock.c @@ -35,7 +35,7 @@ */ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, - OMPI_MPI_OFFSET_TYPE iov_offset, off_t len, int flags) + OMPI_MPI_OFFSET_TYPE offset, off_t len, int flags) { off_t lmod, bmod; @@ -46,7 +46,7 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, if ( fh->f_atomicity || fh->f_flags & OMPIO_LOCK_ALWAYS ) { /* Need to lock the entire region */ - lock->l_start = (off_t) iov_offset; + lock->l_start = (off_t) offset; lock->l_len = len; } else { @@ -67,7 +67,7 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, return 0; } if ( OMPIO_LOCK_ENTIRE_REGION ) { - lock->l_start = (off_t) iov_offset; + lock->l_start = (off_t) offset; lock->l_len = len; } else { @@ -80,7 +80,7 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, */ bmod = offset % fh->f_fs_block_size; if ( !bmod ) { - lock->l_start = (off_t) iov_offset; + lock->l_start = (off_t) offset; lock->l_len = fh->f_fs_block_size - bmod; } lmod = (offset+len-1)%fh->f_fs_block_size; diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index 11d3de02af..cbf9d34fb9 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -83,24 +83,22 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) continue; } } - mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_len, OMPIO_LOCK_SELECTIVE ); + mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); #if defined(HAVE_PREADV) ret_code = preadv (fh->fd, iov, iov_count, iov_offset); - if ( 0 < ret_code ) { - bytes_read+=ret_code; - } #else if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) { opal_output(1, "lseek:%s", strerror(errno)); free(iov); + mca_fbtl_posix_unlock ( &lock, fh ); return OMPI_ERROR; } ret_code = readv (fh->fd, iov, iov_count); +#endif + mca_fbtl_posix_unlock ( &lock, fh ); if ( 0 < ret_code ) { bytes_read+=ret_code; } -#endif - mca_fbtl_posix_unlock ( &lock, fh ); else if ( ret_code == -1 ) { opal_output(1, "readv:%s", strerror(errno)); free(iov); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c index 9929823185..da7c31e959 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c @@ -97,25 +97,22 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) */ - mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_len, OMPIO_LOCK_SELECTIVE ); + mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); #if defined (HAVE_PWRITEV) ret_code = pwritev (fh->fd, iov, iov_count, iov_offset); - if ( 0 < ret_code ) { - bytes_written += ret_code; - } - #else if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) { opal_output(1, "lseek:%s", strerror(errno)); free(iov); + mca_fbtl_posix_unlock ( &lock, fh ); return OMPI_ERROR; } ret_code = writev (fh->fd, iov, iov_count); +#endif + mca_fbtl_posix_unlock ( &lock, fh ); if ( 0 < ret_code ) { bytes_written += ret_code; } -#endif - mca_fbtl_posix_unlock ( &lock, fh ); else if (-1 == ret_code ) { opal_output(1, "writev:%s", strerror(errno)); free (iov); diff --git a/ompi/mca/io/ompio/io_ompio.h b/ompi/mca/io/ompio/io_ompio.h index ee310c9802..07554b21cf 100644 --- a/ompi/mca/io/ompio/io_ompio.h +++ b/ompi/mca/io/ompio/io_ompio.h @@ -64,6 +64,10 @@ OMPI_DECLSPEC extern int mca_io_ompio_coll_timing_info; #define OMPIO_CONTIGUOUS_FVIEW 0x00000010 #define OMPIO_AGGREGATOR_IS_SET 0x00000020 #define OMPIO_SHAREDFP_IS_SET 0x00000040 +#define OMPIO_LOCK_ALWAYS 0x00000080 +#define OMPIO_LOCK_NEVER 0x00000100 +#define OMPIO_LOCK_NOT_THIS_OP 0x00000200 + #define QUEUESIZE 2048 #define MCA_IO_DEFAULT_FILE_VIEW_SIZE 4*1024*1024 @@ -121,6 +125,10 @@ OMPI_DECLSPEC extern int mca_io_ompio_coll_timing_info; #define OMPIO_PROCS_IN_GROUP_TAG 1 #define OMPIO_MERGE_THRESHOLD 0.5 + +#define OMPIO_LOCK_ENTIRE_REGION 10 +#define OMPIO_LOCK_SELECTIVE 11 + /*---------------------------*/ BEGIN_C_DECLS @@ -216,6 +224,7 @@ struct mca_io_ompio_file_t { opal_info_t *f_info; int32_t f_flags; void *f_fs_ptr; + int f_fs_block_size; int f_atomicity; size_t f_stripe_size; int f_stripe_count; From a3c638bc380a65788a51f802ce3c6a0e3038a0cc Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Tue, 17 Oct 2017 15:57:34 -0500 Subject: [PATCH 3/6] fbtl/posix: add support for file locking for the non-blocking operations Signed-off-by: Edgar Gabriel --- ompi/mca/fbtl/posix/fbtl_posix.c | 18 ++++++++++++++++++ ompi/mca/fbtl/posix/fbtl_posix.h | 2 ++ ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c | 9 +++++++++ ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c | 8 ++++++++ ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 4 ++-- ompi/mca/fbtl/posix/fbtl_posix_pwritev.c | 3 +-- 6 files changed, 40 insertions(+), 4 deletions(-) diff --git a/ompi/mca/fbtl/posix/fbtl_posix.c b/ompi/mca/fbtl/posix/fbtl_posix.c index 4c6d21ab01..29dbdb4197 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix.c +++ b/ompi/mca/fbtl/posix/fbtl_posix.c @@ -118,6 +118,7 @@ bool mca_fbtl_posix_progress ( mca_ompio_request_t *req) #if defined (FBTL_POSIX_HAVE_AIO) int i=0, lcount=0; mca_fbtl_posix_request_data_t *data=(mca_fbtl_posix_request_data_t *)req->req_data; + off_t start_offset, end_offset, total_length; for (i=data->aio_first_active_req; i < data->aio_last_active_req; i++ ) { if ( EINPROGRESS == data->aio_req_status[i] ) { @@ -154,6 +155,9 @@ bool mca_fbtl_posix_progress ( mca_ompio_request_t *req) #endif if ( (lcount == data->aio_req_chunks) && (0 != data->aio_open_reqs )) { + /* release the lock of the previous operations */ + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); + /* post the next batch of operations */ data->aio_first_active_req = data->aio_last_active_req; if ( (data->aio_req_count-data->aio_last_active_req) > data->aio_req_chunks ) { @@ -162,16 +166,30 @@ bool mca_fbtl_posix_progress ( mca_ompio_request_t *req) else { data->aio_last_active_req = data->aio_req_count; } + + start_offset = data->aio_reqs[data->aio_first_active_req].aio_offset; + end_offset = data->aio_reqs[data->aio_last_active_req-1].aio_offset + data->aio_reqs[data->aio_last_active_req-1].aio_nbytes; + total_length = (end_offset - start_offset); + + if ( FBTL_POSIX_READ == data->aio_req_type ) { + mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + } + else if ( FBTL_POSIX_WRITE == data->aio_req_type ) { + mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + } + for ( i=data->aio_first_active_req; i< data->aio_last_active_req; i++ ) { if ( FBTL_POSIX_READ == data->aio_req_type ) { if (-1 == aio_read(&data->aio_reqs[i])) { perror("aio_read() error"); + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); return OMPI_ERROR; } } else if ( FBTL_POSIX_WRITE == data->aio_req_type ) { if (-1 == aio_write(&data->aio_reqs[i])) { perror("aio_write() error"); + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); return OMPI_ERROR; } } diff --git a/ompi/mca/fbtl/posix/fbtl_posix.h b/ompi/mca/fbtl/posix/fbtl_posix.h index b9353a921a..b4d029dd96 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix.h +++ b/ompi/mca/fbtl/posix/fbtl_posix.h @@ -73,6 +73,8 @@ struct mca_fbtl_posix_request_data_t { struct aiocb *aio_reqs; /* pointer array of req structures */ int *aio_req_status; /* array of statuses */ ssize_t aio_total_len; /* total amount of data written */ + struct flock aio_lock; /* lock used for certain file systems */ + mca_io_ompio_file_t *aio_fh; /* pointer back to the mca_io_ompio_fh structure */ }; typedef struct mca_fbtl_posix_request_data_t mca_fbtl_posix_request_data_t; diff --git a/ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c b/ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c index 00eaedeaf7..6ac9d03703 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c @@ -40,6 +40,7 @@ ssize_t mca_fbtl_posix_ipreadv (mca_io_ompio_file_t *fh, mca_fbtl_posix_request_data_t *data; mca_ompio_request_t *req = (mca_ompio_request_t *) request; int i=0; + off_t start_offset, end_offset, total_length; data = (mca_fbtl_posix_request_data_t *) malloc ( sizeof (mca_fbtl_posix_request_data_t)); if ( NULL == data ) { @@ -67,6 +68,7 @@ ssize_t mca_fbtl_posix_ipreadv (mca_io_ompio_file_t *fh, free(data); return 0; } + data->aio_fh = fh; for ( i=0; if_num_of_io_entries; i++ ) { data->aio_reqs[i].aio_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) @@ -86,9 +88,16 @@ ssize_t mca_fbtl_posix_ipreadv (mca_io_ompio_file_t *fh, else { data->aio_last_active_req = data->aio_req_count; } + + start_offset = data->aio_reqs[data->aio_first_active_req].aio_offset; + end_offset = data->aio_reqs[data->aio_last_active_req-1].aio_offset + data->aio_reqs[data->aio_last_active_req-1].aio_nbytes; + total_length = (end_offset - start_offset); + mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + for (i=0; i < data->aio_last_active_req; i++) { if (-1 == aio_read(&data->aio_reqs[i])) { opal_output(1, "aio_read() error: %s", strerror(errno)); + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); free(data->aio_reqs); free(data->aio_req_status); free(data); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c index 1d869c2a75..2bd04ff747 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c @@ -39,6 +39,7 @@ ssize_t mca_fbtl_posix_ipwritev (mca_io_ompio_file_t *fh, mca_fbtl_posix_request_data_t *data; mca_ompio_request_t *req = (mca_ompio_request_t *) request; int i=0; + off_t start_offset, end_offset, total_length; data = (mca_fbtl_posix_request_data_t *) malloc ( sizeof (mca_fbtl_posix_request_data_t)); if ( NULL == data ) { @@ -66,6 +67,7 @@ ssize_t mca_fbtl_posix_ipwritev (mca_io_ompio_file_t *fh, free(data); return 0; } + data->aio_fh = fh; for ( i=0; if_num_of_io_entries; i++ ) { data->aio_reqs[i].aio_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) @@ -85,10 +87,16 @@ ssize_t mca_fbtl_posix_ipwritev (mca_io_ompio_file_t *fh, else { data->aio_last_active_req = data->aio_req_count; } + + start_offset = data->aio_reqs[data->aio_first_active_req].aio_offset; + end_offset = data->aio_reqs[data->aio_last_active_req-1].aio_offset + data->aio_reqs[data->aio_last_active_req-1].aio_nbytes; + total_length = (end_offset - start_offset); + mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); for (i=0; i < data->aio_last_active_req; i++) { if (-1 == aio_write(&data->aio_reqs[i])) { opal_output(1, "aio_write() error: %s", strerror(errno)); + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); free(data->aio_req_status); free(data->aio_reqs); free(data); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index cbf9d34fb9..b6360b3d97 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -55,7 +55,6 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i].memory_address; iov[iov_count].iov_len = fh->f_io_array[i].length; iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset; - total_length = iov[iov_count].iov_len; iov_count ++; } @@ -78,11 +77,12 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i+1].memory_address; iov[iov_count].iov_len = fh->f_io_array[i+1].length; - total_length += iov[iov_count].iov_len; iov_count ++; continue; } } + + total_length = ((off_t)iov[iov_count-1].iov_base + iov[iov_count-1].iov_len - (off_t)iov_offset ); mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); #if defined(HAVE_PREADV) ret_code = preadv (fh->fd, iov, iov_count, iov_offset); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c index da7c31e959..61d8008355 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c @@ -57,7 +57,6 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i].memory_address; iov[iov_count].iov_len = fh->f_io_array[i].length; iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset; - total_length = iov[iov_count].iov_len; iov_count ++; } @@ -79,7 +78,6 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) (iov_count < IOV_MAX )) { iov[iov_count].iov_base = fh->f_io_array[i+1].memory_address; iov[iov_count].iov_len = fh->f_io_array[i+1].length; - total_length += iov[iov_count].iov_len; iov_count ++; continue; } @@ -97,6 +95,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) */ + total_length = ((off_t)iov[iov_count-1].iov_base + iov[iov_count-1].iov_len - (off_t)iov_offset); mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); #if defined (HAVE_PWRITEV) ret_code = pwritev (fh->fd, iov, iov_count, iov_offset); From f66c55f77a0938424dc39bb25095df27156d1883 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Wed, 18 Oct 2017 09:02:42 -0500 Subject: [PATCH 4/6] fbtl/posix: fixes in the offset calculation and for aio operations our own internal testsuite passes now correctly. More testing to follow. Signed-off-by: Edgar Gabriel --- ompi/mca/fbtl/posix/fbtl_posix.c | 2 ++ ompi/mca/fbtl/posix/fbtl_posix_lock.c | 30 ++++++++++++++++++------ ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 6 +++-- ompi/mca/fbtl/posix/fbtl_posix_pwritev.c | 6 +++-- ompi/mca/fs/ufs/fs_ufs_file_open.c | 4 ++++ 5 files changed, 37 insertions(+), 11 deletions(-) diff --git a/ompi/mca/fbtl/posix/fbtl_posix.c b/ompi/mca/fbtl/posix/fbtl_posix.c index 29dbdb4197..65f01c20d6 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix.c +++ b/ompi/mca/fbtl/posix/fbtl_posix.c @@ -203,6 +203,7 @@ bool mca_fbtl_posix_progress ( mca_ompio_request_t *req) /* all pending operations are finished for this request */ req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; req->req_ompi.req_status._ucount = data->aio_total_len; + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); ret = true; } #endif @@ -215,6 +216,7 @@ void mca_fbtl_posix_request_free ( mca_ompio_request_t *req) /* Free the fbtl specific data structures */ mca_fbtl_posix_request_data_t *data=(mca_fbtl_posix_request_data_t *)req->req_data; if (NULL != data ) { + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); if ( NULL != data->aio_reqs ) { free ( data->aio_reqs); } diff --git a/ompi/mca/fbtl/posix/fbtl_posix_lock.c b/ompi/mca/fbtl/posix/fbtl_posix_lock.c index 30551b4424..31897d5c91 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_lock.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_lock.c @@ -43,6 +43,9 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, lock->l_whence = SEEK_SET; lock->l_start =-1; lock->l_len =-1; + if ( 0 == len ) { + return 0; + } if ( fh->f_atomicity || fh->f_flags & OMPIO_LOCK_ALWAYS ) { /* Need to lock the entire region */ @@ -66,7 +69,7 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, */ return 0; } - if ( OMPIO_LOCK_ENTIRE_REGION ) { + if ( flags == OMPIO_LOCK_ENTIRE_REGION ) { lock->l_start = (off_t) offset; lock->l_len = len; } @@ -79,13 +82,13 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, the two into a single lock. */ bmod = offset % fh->f_fs_block_size; - if ( !bmod ) { + if ( bmod ) { lock->l_start = (off_t) offset; - lock->l_len = fh->f_fs_block_size - bmod; + lock->l_len = bmod; } - lmod = (offset+len-1)%fh->f_fs_block_size; - if ( !lmod ) { - if ( bmod ) { + lmod = (offset+len)%fh->f_fs_block_size; + if ( lmod ) { + if ( !bmod ) { lock->l_start = (offset+len-lmod ); lock->l_len = lmod; } @@ -100,15 +103,28 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, } } + +#ifdef OMPIO_DEBUG + printf("%d: acquiring lock for offset %ld length %ld requested offset %ld request len %ld \n", + fh->f_rank, lock->l_start, lock->l_len, offset, len); +#endif return (fcntl ( fh->fd, F_SETLKW, lock)); } int mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh ) { + int ret; if ( -1 == lock->l_start && -1 == lock->l_len ) { return 0; } lock->l_type = F_UNLCK; - return (fcntl ( fh->fd, F_SETLK, lock)); +#ifdef OMPIO_DEBUG + printf("%d: releasing lock for offset %ld length %ld\n", fh->f_rank, lock->l_start, lock->l_len); +#endif + ret = fcntl ( fh->fd, F_SETLK, lock); + lock->l_start = -1; + lock->l_len = -1; + + return ret; } diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index b6360b3d97..3ebc69cfa3 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -37,7 +37,7 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) OMPI_MPI_OFFSET_TYPE iov_offset = 0; ssize_t bytes_read=0, ret_code=0; struct flock lock; - off_t total_length; + off_t total_length, end_offset=0; if (NULL == fh->f_io_array) { return OMPI_ERROR; @@ -55,6 +55,7 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i].memory_address; iov[iov_count].iov_len = fh->f_io_array[i].length; iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset; + end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length; iov_count ++; } @@ -77,12 +78,13 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i+1].memory_address; iov[iov_count].iov_len = fh->f_io_array[i+1].length; + end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length; iov_count ++; continue; } } - total_length = ((off_t)iov[iov_count-1].iov_base + iov[iov_count-1].iov_len - (off_t)iov_offset ); + total_length = (end_offset - (off_t)iov_offset ); mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); #if defined(HAVE_PREADV) ret_code = preadv (fh->fd, iov, iov_count, iov_offset); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c index 61d8008355..1c09827bde 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c @@ -39,7 +39,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) OMPI_MPI_OFFSET_TYPE iov_offset = 0; ssize_t ret_code=0, bytes_written=0; struct flock lock; - off_t total_length; + off_t total_length, end_offset=0; if (NULL == fh->f_io_array) { return OMPI_ERROR; @@ -57,6 +57,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) iov[iov_count].iov_base = fh->f_io_array[i].memory_address; iov[iov_count].iov_len = fh->f_io_array[i].length; iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset; + end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length; iov_count ++; } @@ -78,6 +79,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) (iov_count < IOV_MAX )) { iov[iov_count].iov_base = fh->f_io_array[i+1].memory_address; iov[iov_count].iov_len = fh->f_io_array[i+1].length; + end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length; iov_count ++; continue; } @@ -95,7 +97,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) */ - total_length = ((off_t)iov[iov_count-1].iov_base + iov[iov_count-1].iov_len - (off_t)iov_offset); + total_length = (end_offset - (off_t)iov_offset); mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); #if defined (HAVE_PWRITEV) ret_code = pwritev (fh->fd, iov, iov_count, iov_offset); diff --git a/ompi/mca/fs/ufs/fs_ufs_file_open.c b/ompi/mca/fs/ufs/fs_ufs_file_open.c index 8f0ea650c9..b579720d6a 100644 --- a/ompi/mca/fs/ufs/fs_ufs_file_open.c +++ b/ompi/mca/fs/ufs/fs_ufs_file_open.c @@ -96,6 +96,10 @@ mca_fs_ufs_file_open (struct ompi_communicator_t *comm, fh->f_stripe_size=0; fh->f_stripe_count=1; + /* Need to find a way to determine the file system block size at run time. + 4096 is the most common value, but it might not always be accurate. + */ + fh->f_fs_block_size = 4096; return OMPI_SUCCESS; } From e62f9d2e52befc12a92b6770302f7f539f200135 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Wed, 18 Oct 2017 09:36:08 -0500 Subject: [PATCH 5/6] fs/ufs: ensure that the never-lock flag is set if not on NFS Signed-off-by: Edgar Gabriel --- ompi/mca/fs/ufs/fs_ufs_file_open.c | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/ompi/mca/fs/ufs/fs_ufs_file_open.c b/ompi/mca/fs/ufs/fs_ufs_file_open.c index b579720d6a..bb174cf20b 100644 --- a/ompi/mca/fs/ufs/fs_ufs_file_open.c +++ b/ompi/mca/fs/ufs/fs_ufs_file_open.c @@ -28,9 +28,11 @@ #include #include "mpi.h" #include "ompi/constants.h" +#include "ompi/mca/fs/base/base.h" #include "ompi/mca/fs/fs.h" #include "ompi/communicator/communicator.h" #include "ompi/info/info.h" +#include "opal/util/path.h" /* * file_open_ufs @@ -101,5 +103,34 @@ mca_fs_ufs_file_open (struct ompi_communicator_t *comm, */ fh->f_fs_block_size = 4096; + /* Need to check for NFS here. If the file system is not NFS but a regular UFS file system, + we do not need to enforce locking. A regular XFS or EXT4 file system can only be used + within a single node, local environment, and in this case the OS will already ensure correct + handling of file system blocks; + */ + + char *fstype=NULL; + bool bret = opal_path_nfs ( (char *)filename, &fstype ); + + if ( false == bret ) { + char *dir; + mca_fs_base_get_parent_dir ( (char *)filename, &dir ); + bret = opal_path_nfs (dir, &fstype); + free(dir); + } + + if ( true == bret ) { + if ( 0 == strncasecmp(fstype, "nfs", sizeof("nfs")) ) { + /* Nothing really to be done in this case. Locking can stay */ + } + else { + fh->f_flags |= OMPIO_LOCK_NEVER; + } + } + else { + fh->f_flags |= OMPIO_LOCK_NEVER; + } + free (fstype); + return OMPI_SUCCESS; } From be0de21e6f13968736166c7994ef3775bd81b8af Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Thu, 19 Oct 2017 14:50:13 -0500 Subject: [PATCH 6/6] fs/ufs and fbtl/posix: cleanup lock management This commit looks large, but its really mostly a cleanup step. 1. introduce proper error handling for the return values of fcntl and the fbtl_posix_lock function 2. rename a parameter to more accurately reflect what it does 3. introduce an mca parameter in the fs/ufs component that allows to control what the level of locking the user would like to enforce 4. move the initialization of the fs_block_size parameter from fs/ufs into the common/ompio component. An fs component might be allowed to overwrite this value, but none of the actual fs components do that. Signed-off-by: Edgar Gabriel --- .../mca/common/ompio/common_ompio_file_open.c | 1 + ompi/mca/fbtl/posix/fbtl_posix.c | 18 ++++--- ompi/mca/fbtl/posix/fbtl_posix.h | 2 +- ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c | 14 +++-- ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c | 14 +++-- ompi/mca/fbtl/posix/fbtl_posix_lock.c | 45 +++++++++++----- ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 16 ++++-- ompi/mca/fbtl/posix/fbtl_posix_pwritev.c | 15 ++++-- ompi/mca/fs/ufs/fs_ufs.h | 6 +++ ompi/mca/fs/ufs/fs_ufs_component.c | 30 +++++++++++ ompi/mca/fs/ufs/fs_ufs_file_open.c | 54 ++++++++++++------- ompi/mca/io/ompio/io_ompio.h | 2 +- 12 files changed, 163 insertions(+), 54 deletions(-) diff --git a/ompi/mca/common/ompio/common_ompio_file_open.c b/ompi/mca/common/ompio/common_ompio_file_open.c index d21863a26e..3e48295349 100644 --- a/ompi/mca/common/ompio/common_ompio_file_open.c +++ b/ompi/mca/common/ompio/common_ompio_file_open.c @@ -90,6 +90,7 @@ int mca_common_ompio_file_open (ompi_communicator_t *comm, ompio_fh->f_amode = amode; ompio_fh->f_info = info; ompio_fh->f_atomicity = 0; + ompio_fh->f_fs_block_size = 4096; mca_common_ompio_set_file_defaults (ompio_fh); ompio_fh->f_filename = filename; diff --git a/ompi/mca/fbtl/posix/fbtl_posix.c b/ompi/mca/fbtl/posix/fbtl_posix.c index 65f01c20d6..a7ce4dad78 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix.c +++ b/ompi/mca/fbtl/posix/fbtl_posix.c @@ -116,7 +116,7 @@ bool mca_fbtl_posix_progress ( mca_ompio_request_t *req) { bool ret=false; #if defined (FBTL_POSIX_HAVE_AIO) - int i=0, lcount=0; + int i=0, lcount=0, ret_code; mca_fbtl_posix_request_data_t *data=(mca_fbtl_posix_request_data_t *)req->req_data; off_t start_offset, end_offset, total_length; @@ -172,23 +172,29 @@ bool mca_fbtl_posix_progress ( mca_ompio_request_t *req) total_length = (end_offset - start_offset); if ( FBTL_POSIX_READ == data->aio_req_type ) { - mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + ret_code = mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); } else if ( FBTL_POSIX_WRITE == data->aio_req_type ) { - mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + ret_code = mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); } - + if ( 0 < ret_code ) { + opal_output(1, "mca_fbtl_posix_progress: error in mca_fbtl_posix_lock() %d", ret_code); + /* Just in case some part of the lock actually succeeded. */ + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); + return OMPI_ERROR; + } + for ( i=data->aio_first_active_req; i< data->aio_last_active_req; i++ ) { if ( FBTL_POSIX_READ == data->aio_req_type ) { if (-1 == aio_read(&data->aio_reqs[i])) { - perror("aio_read() error"); + opal_output(1, "mca_fbtl_posix_progress: error in aio_read()"); mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); return OMPI_ERROR; } } else if ( FBTL_POSIX_WRITE == data->aio_req_type ) { if (-1 == aio_write(&data->aio_reqs[i])) { - perror("aio_write() error"); + opal_output(1, "mca_fbtl_posix_progress: error in aio_write()"); mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); return OMPI_ERROR; } diff --git a/ompi/mca/fbtl/posix/fbtl_posix.h b/ompi/mca/fbtl/posix/fbtl_posix.h index b4d029dd96..1ee6ffb40a 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix.h +++ b/ompi/mca/fbtl/posix/fbtl_posix.h @@ -60,7 +60,7 @@ void mca_fbtl_posix_request_free ( mca_ompio_request_t *req); int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, OMPI_MPI_OFFSET_TYPE iov_offset, off_t len, int flags); -int mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh ); +void mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh ); struct mca_fbtl_posix_request_data_t { diff --git a/ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c b/ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c index 6ac9d03703..0b56d8334a 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c @@ -39,7 +39,7 @@ ssize_t mca_fbtl_posix_ipreadv (mca_io_ompio_file_t *fh, #if defined (FBTL_POSIX_HAVE_AIO) mca_fbtl_posix_request_data_t *data; mca_ompio_request_t *req = (mca_ompio_request_t *) request; - int i=0; + int i=0, ret; off_t start_offset, end_offset, total_length; data = (mca_fbtl_posix_request_data_t *) malloc ( sizeof (mca_fbtl_posix_request_data_t)); @@ -92,11 +92,19 @@ ssize_t mca_fbtl_posix_ipreadv (mca_io_ompio_file_t *fh, start_offset = data->aio_reqs[data->aio_first_active_req].aio_offset; end_offset = data->aio_reqs[data->aio_last_active_req-1].aio_offset + data->aio_reqs[data->aio_last_active_req-1].aio_nbytes; total_length = (end_offset - start_offset); - mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + ret = mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_ipreadv: error in mca_fbtl_posix_lock() error ret=%d %s", ret, strerror(errno)); + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); + free(data->aio_reqs); + free(data->aio_req_status); + free(data); + return OMPI_ERROR; + } for (i=0; i < data->aio_last_active_req; i++) { if (-1 == aio_read(&data->aio_reqs[i])) { - opal_output(1, "aio_read() error: %s", strerror(errno)); + opal_output(1, "mca_fbtl_posix_ipreadv: error in aio_read(): %s", strerror(errno)); mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); free(data->aio_reqs); free(data->aio_req_status); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c index 2bd04ff747..11790f453f 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c @@ -38,7 +38,7 @@ ssize_t mca_fbtl_posix_ipwritev (mca_io_ompio_file_t *fh, #if defined(FBTL_POSIX_HAVE_AIO) mca_fbtl_posix_request_data_t *data; mca_ompio_request_t *req = (mca_ompio_request_t *) request; - int i=0; + int i=0, ret; off_t start_offset, end_offset, total_length; data = (mca_fbtl_posix_request_data_t *) malloc ( sizeof (mca_fbtl_posix_request_data_t)); @@ -91,11 +91,19 @@ ssize_t mca_fbtl_posix_ipwritev (mca_io_ompio_file_t *fh, start_offset = data->aio_reqs[data->aio_first_active_req].aio_offset; end_offset = data->aio_reqs[data->aio_last_active_req-1].aio_offset + data->aio_reqs[data->aio_last_active_req-1].aio_nbytes; total_length = (end_offset - start_offset); - mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + ret = mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_ipwritev: error in mca_fbtl_posix_lock() error ret=%d %s", ret, strerror(errno)); + mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); + free(data->aio_reqs); + free(data->aio_req_status); + free(data); + return OMPI_ERROR; + } for (i=0; i < data->aio_last_active_req; i++) { if (-1 == aio_write(&data->aio_reqs[i])) { - opal_output(1, "aio_write() error: %s", strerror(errno)); + opal_output(1, "mca_fbtl_posix_ipwritev: error in aio_write(): %s", strerror(errno)); mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh ); free(data->aio_req_status); free(data->aio_reqs); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_lock.c b/ompi/mca/fbtl/posix/fbtl_posix_lock.c index 31897d5c91..b59ec057e9 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_lock.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_lock.c @@ -23,21 +23,29 @@ #include "mpi.h" #include #include +#include #include #include "ompi/constants.h" #include "ompi/mca/fbtl/fbtl.h" - +#define MAX_ERRCOUNT 100 /* op: can be F_WRLCK or F_RDLCK - flags: can be OMPIO_LOCK_ENTIRE_REGION or OMPIO_LOCK_SELECTIVE + flags: can be OMPIO_LOCK_ENTIRE_REGION or OMPIO_LOCK_SELECTIVE. This is typically set by the operation, not the fs component. + e.g. a collective and an individual component might require different level of protection through locking, + also one might need to do different things for blocking (pwritev,preadv) operations and non-blocking (aio) operations. + + fh->f_flags can contain similar sounding flags, those were set by the fs component and/or user requests. + + Support for MPI atomicity operations are envisioned, but not yet tested. */ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, OMPI_MPI_OFFSET_TYPE offset, off_t len, int flags) { off_t lmod, bmod; + int ret, err_count; lock->l_type = op; lock->l_whence = SEEK_SET; @@ -46,11 +54,10 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, if ( 0 == len ) { return 0; } - if ( fh->f_atomicity || - fh->f_flags & OMPIO_LOCK_ALWAYS ) { - /* Need to lock the entire region */ - lock->l_start = (off_t) offset; - lock->l_len = len; + + if ( fh->f_flags & OMPIO_LOCK_ENTIRE_FILE ) { + lock->l_start = (off_t) 0; + lock->l_len = 0; } else { if ( (fh->f_flags & OMPIO_LOCK_NEVER) || @@ -108,23 +115,35 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op, printf("%d: acquiring lock for offset %ld length %ld requested offset %ld request len %ld \n", fh->f_rank, lock->l_start, lock->l_len, offset, len); #endif - return (fcntl ( fh->fd, F_SETLKW, lock)); + errno=0; + err_count=0; + do { + ret = fcntl ( fh->fd, F_SETLKW, lock); + if ( ret ) { +#ifdef OMPIO_DEBUG + printf("[%d] ret = %d errno=%d %s\n", fh->f_rank, ret, errno, strerror(errno) ); +#endif + err_count++; + } + } while ( ret && ((errno == EINTR) || ((errno == EINPROGRESS) && err_count < MAX_ERRCOUNT ))); + + + return ret; } -int mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh ) +void mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh ) { - int ret; if ( -1 == lock->l_start && -1 == lock->l_len ) { - return 0; + return; } lock->l_type = F_UNLCK; #ifdef OMPIO_DEBUG printf("%d: releasing lock for offset %ld length %ld\n", fh->f_rank, lock->l_start, lock->l_len); #endif - ret = fcntl ( fh->fd, F_SETLK, lock); + fcntl ( fh->fd, F_SETLK, lock); lock->l_start = -1; lock->l_len = -1; - return ret; + return; } diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index 3ebc69cfa3..5f5593c827 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -31,7 +31,7 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) { /*int *fp = NULL;*/ - int i, block=1; + int i, block=1, ret; struct iovec *iov = NULL; int iov_count = 0; OMPI_MPI_OFFSET_TYPE iov_offset = 0; @@ -85,12 +85,20 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) } total_length = (end_offset - (off_t)iov_offset ); - mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); + + ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_preadv: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno)); + free (iov); + /* Just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh); + return OMPI_ERROR; + } #if defined(HAVE_PREADV) ret_code = preadv (fh->fd, iov, iov_count, iov_offset); #else if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) { - opal_output(1, "lseek:%s", strerror(errno)); + opal_output(1, "mca_fbtl_posix_preadv: error in lseek:%s", strerror(errno)); free(iov); mca_fbtl_posix_unlock ( &lock, fh ); return OMPI_ERROR; @@ -102,7 +110,7 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh ) bytes_read+=ret_code; } else if ( ret_code == -1 ) { - opal_output(1, "readv:%s", strerror(errno)); + opal_output(1, "mca_fbtl_posix_preadv: error in (p)readv:%s", strerror(errno)); free(iov); return OMPI_ERROR; } diff --git a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c index 1c09827bde..c6a640290d 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c @@ -33,7 +33,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) { /*int *fp = NULL;*/ - int i, block = 1; + int i, block = 1, ret; struct iovec *iov = NULL; int iov_count = 0; OMPI_MPI_OFFSET_TYPE iov_offset = 0; @@ -98,12 +98,19 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) */ total_length = (end_offset - (off_t)iov_offset); - mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); + ret = mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE ); + if ( 0 < ret ) { + opal_output(1, "mca_fbtl_posix_pwritev: error in mca_fbtl_posix_lock() error ret=%d %s", ret, strerror(errno)); + free (iov); + /* just in case some part of the lock worked */ + mca_fbtl_posix_unlock ( &lock, fh ); + return OMPI_ERROR; + } #if defined (HAVE_PWRITEV) ret_code = pwritev (fh->fd, iov, iov_count, iov_offset); #else if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) { - opal_output(1, "lseek:%s", strerror(errno)); + opal_output(1, "mca_fbtl_posix_pwritev: error in lseek:%s", strerror(errno)); free(iov); mca_fbtl_posix_unlock ( &lock, fh ); return OMPI_ERROR; @@ -115,7 +122,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh ) bytes_written += ret_code; } else if (-1 == ret_code ) { - opal_output(1, "writev:%s", strerror(errno)); + opal_output(1, "mca_fbtl_posix_pwritev: error in writev:%s", strerror(errno)); free (iov); return OMPI_ERROR; } diff --git a/ompi/mca/fs/ufs/fs_ufs.h b/ompi/mca/fs/ufs/fs_ufs.h index b03ea669d3..08fb426e4e 100644 --- a/ompi/mca/fs/ufs/fs_ufs.h +++ b/ompi/mca/fs/ufs/fs_ufs.h @@ -29,6 +29,12 @@ #include "ompi/mca/common/ompio/common_ompio.h" extern int mca_fs_ufs_priority; +extern int mca_fs_ufs_lock_algorithm; + +#define FS_UFS_LOCK_AUTO 0 +#define FS_UFS_LOCK_NEVER 1 +#define FS_UFS_LOCK_ENTIRE_FILE 2 +#define FS_UFS_LOCK_RANGES 3 BEGIN_C_DECLS diff --git a/ompi/mca/fs/ufs/fs_ufs_component.c b/ompi/mca/fs/ufs/fs_ufs_component.c index d5f3c157da..7ecaf9e0fd 100644 --- a/ompi/mca/fs/ufs/fs_ufs_component.c +++ b/ompi/mca/fs/ufs/fs_ufs_component.c @@ -31,6 +31,12 @@ #include "mpi.h" int mca_fs_ufs_priority = 10; +int mca_fs_ufs_lock_algorithm=0; /* auto */ +/* + * Private functions + */ +static int register_component(void); + /* * Public string showing the fs ufs component version number @@ -54,6 +60,7 @@ mca_fs_base_component_2_0_0_t mca_fs_ufs_component = { .mca_component_name = "ufs", MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, OMPI_RELEASE_VERSION), + .mca_register_component_params = register_component, }, .fsm_data = { /* This component is checkpointable */ @@ -63,3 +70,26 @@ mca_fs_base_component_2_0_0_t mca_fs_ufs_component = { .fsm_file_query = mca_fs_ufs_component_file_query, /* get priority and actions */ .fsm_file_unquery = mca_fs_ufs_component_file_unquery, /* undo what was done by previous function */ }; + +static int register_component(void) +{ + mca_fs_ufs_priority = 10; + (void) mca_base_component_var_register(&mca_fs_ufs_component.fsm_version, + "priority", "Priority of the fs ufs component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fs_ufs_priority); + + mca_fs_ufs_lock_algorithm = 0; + (void) mca_base_component_var_register(&mca_fs_ufs_component.fsm_version, + "lock_algorithm", "Locking algorithm used by the fs ufs component. " + " 0: auto (default), 1: skip locking, 2: always lock entire file, " + "3: lock only specific ranges", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fs_ufs_lock_algorithm ); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/fs/ufs/fs_ufs_file_open.c b/ompi/mca/fs/ufs/fs_ufs_file_open.c index bb174cf20b..92d5eba322 100644 --- a/ompi/mca/fs/ufs/fs_ufs_file_open.c +++ b/ompi/mca/fs/ufs/fs_ufs_file_open.c @@ -98,39 +98,55 @@ mca_fs_ufs_file_open (struct ompi_communicator_t *comm, fh->f_stripe_size=0; fh->f_stripe_count=1; - /* Need to find a way to determine the file system block size at run time. - 4096 is the most common value, but it might not always be accurate. - */ - fh->f_fs_block_size = 4096; /* Need to check for NFS here. If the file system is not NFS but a regular UFS file system, we do not need to enforce locking. A regular XFS or EXT4 file system can only be used within a single node, local environment, and in this case the OS will already ensure correct handling of file system blocks; */ - - char *fstype=NULL; - bool bret = opal_path_nfs ( (char *)filename, &fstype ); - if ( false == bret ) { - char *dir; - mca_fs_base_get_parent_dir ( (char *)filename, &dir ); - bret = opal_path_nfs (dir, &fstype); - free(dir); - } - - if ( true == bret ) { - if ( 0 == strncasecmp(fstype, "nfs", sizeof("nfs")) ) { - /* Nothing really to be done in this case. Locking can stay */ + if ( FS_UFS_LOCK_AUTO == mca_fs_ufs_lock_algorithm ) { + char *fstype=NULL; + bool bret = opal_path_nfs ( (char *)filename, &fstype ); + + if ( false == bret ) { + char *dir; + mca_fs_base_get_parent_dir ( (char *)filename, &dir ); + bret = opal_path_nfs (dir, &fstype); + free(dir); + } + + if ( true == bret ) { + if ( 0 == strncasecmp(fstype, "nfs", sizeof("nfs")) ) { + /* Based on my tests, only locking the entire file for all operations + guarantueed for the entire teststuite to pass correctly. I would not + be surprised, if depending on the NFS configuration that might not always + be necessary, and the user can change that with an MCA parameter of this + component. */ + fh->f_flags |= OMPIO_LOCK_ENTIRE_FILE; + } + else { + fh->f_flags |= OMPIO_LOCK_NEVER; + } } else { fh->f_flags |= OMPIO_LOCK_NEVER; } + free (fstype); + } + else if ( FS_UFS_LOCK_NEVER == mca_fs_ufs_lock_algorithm ) { + fh->f_flags |= OMPIO_LOCK_NEVER; + } + else if ( FS_UFS_LOCK_ENTIRE_FILE == mca_fs_ufs_lock_algorithm ) { + fh->f_flags |= OMPIO_LOCK_ENTIRE_FILE; + } + else if ( FS_UFS_LOCK_RANGES == mca_fs_ufs_lock_algorithm ) { + /* Nothing to be done. This is what the posix fbtl component would do + anyway without additional information . */ } else { - fh->f_flags |= OMPIO_LOCK_NEVER; + opal_output ( 1, "Invalid value for mca_fs_ufs_lock_algorithm %d", mca_fs_ufs_lock_algorithm ); } - free (fstype); return OMPI_SUCCESS; } diff --git a/ompi/mca/io/ompio/io_ompio.h b/ompi/mca/io/ompio/io_ompio.h index 07554b21cf..a010ab1499 100644 --- a/ompi/mca/io/ompio/io_ompio.h +++ b/ompi/mca/io/ompio/io_ompio.h @@ -64,7 +64,7 @@ OMPI_DECLSPEC extern int mca_io_ompio_coll_timing_info; #define OMPIO_CONTIGUOUS_FVIEW 0x00000010 #define OMPIO_AGGREGATOR_IS_SET 0x00000020 #define OMPIO_SHAREDFP_IS_SET 0x00000040 -#define OMPIO_LOCK_ALWAYS 0x00000080 +#define OMPIO_LOCK_ENTIRE_FILE 0x00000080 #define OMPIO_LOCK_NEVER 0x00000100 #define OMPIO_LOCK_NOT_THIS_OP 0x00000200