1
1

fs/ufs and fbtl/posix: cleanup lock management

This commit looks large, but it's really mostly a cleanup step.
1. introduce proper error handling for the return values of fcntl and the fbtl_posix_lock function
2. rename a parameter to more accurately reflect what it does
3. introduce an mca parameter in the fs/ufs component that allows the user to
   control the level of locking to enforce
4. move the initialization of the fs_block_size parameter from fs/ufs into the
   common/ompio component. An fs component might be allowed to overwrite this
   value, but none of the actual fs components do that.

Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
Этот коммит содержится в:
Edgar Gabriel 2017-10-19 14:50:13 -05:00
родитель e62f9d2e52
Коммит be0de21e6f
12 изменённых файлов: 163 добавлений и 54 удалений

Просмотреть файл

@ -90,6 +90,7 @@ int mca_common_ompio_file_open (ompi_communicator_t *comm,
ompio_fh->f_amode = amode;
ompio_fh->f_info = info;
ompio_fh->f_atomicity = 0;
ompio_fh->f_fs_block_size = 4096;
mca_common_ompio_set_file_defaults (ompio_fh);
ompio_fh->f_filename = filename;

Просмотреть файл

@ -116,7 +116,7 @@ bool mca_fbtl_posix_progress ( mca_ompio_request_t *req)
{
bool ret=false;
#if defined (FBTL_POSIX_HAVE_AIO)
int i=0, lcount=0;
int i=0, lcount=0, ret_code;
mca_fbtl_posix_request_data_t *data=(mca_fbtl_posix_request_data_t *)req->req_data;
off_t start_offset, end_offset, total_length;
@ -172,23 +172,29 @@ bool mca_fbtl_posix_progress ( mca_ompio_request_t *req)
total_length = (end_offset - start_offset);
if ( FBTL_POSIX_READ == data->aio_req_type ) {
mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION );
ret_code = mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION );
}
else if ( FBTL_POSIX_WRITE == data->aio_req_type ) {
mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION );
ret_code = mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION );
}
if ( 0 < ret_code ) {
opal_output(1, "mca_fbtl_posix_progress: error in mca_fbtl_posix_lock() %d", ret_code);
/* Just in case some part of the lock actually succeeded. */
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
return OMPI_ERROR;
}
for ( i=data->aio_first_active_req; i< data->aio_last_active_req; i++ ) {
if ( FBTL_POSIX_READ == data->aio_req_type ) {
if (-1 == aio_read(&data->aio_reqs[i])) {
perror("aio_read() error");
opal_output(1, "mca_fbtl_posix_progress: error in aio_read()");
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
return OMPI_ERROR;
}
}
else if ( FBTL_POSIX_WRITE == data->aio_req_type ) {
if (-1 == aio_write(&data->aio_reqs[i])) {
perror("aio_write() error");
opal_output(1, "mca_fbtl_posix_progress: error in aio_write()");
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
return OMPI_ERROR;
}

Просмотреть файл

@ -60,7 +60,7 @@ void mca_fbtl_posix_request_free ( mca_ompio_request_t *req);
int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op,
OMPI_MPI_OFFSET_TYPE iov_offset, off_t len, int flags);
int mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh );
void mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh );
struct mca_fbtl_posix_request_data_t {

Просмотреть файл

@ -39,7 +39,7 @@ ssize_t mca_fbtl_posix_ipreadv (mca_io_ompio_file_t *fh,
#if defined (FBTL_POSIX_HAVE_AIO)
mca_fbtl_posix_request_data_t *data;
mca_ompio_request_t *req = (mca_ompio_request_t *) request;
int i=0;
int i=0, ret;
off_t start_offset, end_offset, total_length;
data = (mca_fbtl_posix_request_data_t *) malloc ( sizeof (mca_fbtl_posix_request_data_t));
@ -92,11 +92,19 @@ ssize_t mca_fbtl_posix_ipreadv (mca_io_ompio_file_t *fh,
start_offset = data->aio_reqs[data->aio_first_active_req].aio_offset;
end_offset = data->aio_reqs[data->aio_last_active_req-1].aio_offset + data->aio_reqs[data->aio_last_active_req-1].aio_nbytes;
total_length = (end_offset - start_offset);
mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION );
ret = mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_RDLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_ipreadv: error in mca_fbtl_posix_lock() error ret=%d %s", ret, strerror(errno));
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
free(data->aio_reqs);
free(data->aio_req_status);
free(data);
return OMPI_ERROR;
}
for (i=0; i < data->aio_last_active_req; i++) {
if (-1 == aio_read(&data->aio_reqs[i])) {
opal_output(1, "aio_read() error: %s", strerror(errno));
opal_output(1, "mca_fbtl_posix_ipreadv: error in aio_read(): %s", strerror(errno));
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
free(data->aio_reqs);
free(data->aio_req_status);

Просмотреть файл

@ -38,7 +38,7 @@ ssize_t mca_fbtl_posix_ipwritev (mca_io_ompio_file_t *fh,
#if defined(FBTL_POSIX_HAVE_AIO)
mca_fbtl_posix_request_data_t *data;
mca_ompio_request_t *req = (mca_ompio_request_t *) request;
int i=0;
int i=0, ret;
off_t start_offset, end_offset, total_length;
data = (mca_fbtl_posix_request_data_t *) malloc ( sizeof (mca_fbtl_posix_request_data_t));
@ -91,11 +91,19 @@ ssize_t mca_fbtl_posix_ipwritev (mca_io_ompio_file_t *fh,
start_offset = data->aio_reqs[data->aio_first_active_req].aio_offset;
end_offset = data->aio_reqs[data->aio_last_active_req-1].aio_offset + data->aio_reqs[data->aio_last_active_req-1].aio_nbytes;
total_length = (end_offset - start_offset);
mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION );
ret = mca_fbtl_posix_lock( &data->aio_lock, data->aio_fh, F_WRLCK, start_offset, total_length, OMPIO_LOCK_ENTIRE_REGION );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_ipwritev: error in mca_fbtl_posix_lock() error ret=%d %s", ret, strerror(errno));
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
free(data->aio_reqs);
free(data->aio_req_status);
free(data);
return OMPI_ERROR;
}
for (i=0; i < data->aio_last_active_req; i++) {
if (-1 == aio_write(&data->aio_reqs[i])) {
opal_output(1, "aio_write() error: %s", strerror(errno));
opal_output(1, "mca_fbtl_posix_ipwritev: error in aio_write(): %s", strerror(errno));
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
free(data->aio_req_status);
free(data->aio_reqs);

Просмотреть файл

@ -23,21 +23,29 @@
#include "mpi.h"
#include <unistd.h>
#include <sys/uio.h>
#include <errno.h>
#include <limits.h>
#include "ompi/constants.h"
#include "ompi/mca/fbtl/fbtl.h"
#define MAX_ERRCOUNT 100
/*
op: can be F_WRLCK or F_RDLCK
flags: can be OMPIO_LOCK_ENTIRE_REGION or OMPIO_LOCK_SELECTIVE
flags: can be OMPIO_LOCK_ENTIRE_REGION or OMPIO_LOCK_SELECTIVE. This is typically set by the operation, not the fs component.
e.g. a collective and an individual component might require different levels of protection through locking;
also, one might need to do different things for blocking (pwritev, preadv) operations and non-blocking (aio) operations.
fh->f_flags can contain similar-sounding flags; those are set by the fs component and/or by user requests.
Support for MPI atomicity operations is envisioned, but not yet tested.
*/
int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op,
OMPI_MPI_OFFSET_TYPE offset, off_t len, int flags)
{
off_t lmod, bmod;
int ret, err_count;
lock->l_type = op;
lock->l_whence = SEEK_SET;
@ -46,11 +54,10 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op,
if ( 0 == len ) {
return 0;
}
if ( fh->f_atomicity ||
fh->f_flags & OMPIO_LOCK_ALWAYS ) {
/* Need to lock the entire region */
lock->l_start = (off_t) offset;
lock->l_len = len;
if ( fh->f_flags & OMPIO_LOCK_ENTIRE_FILE ) {
lock->l_start = (off_t) 0;
lock->l_len = 0;
}
else {
if ( (fh->f_flags & OMPIO_LOCK_NEVER) ||
@ -108,23 +115,35 @@ int mca_fbtl_posix_lock ( struct flock *lock, mca_io_ompio_file_t *fh, int op,
printf("%d: acquiring lock for offset %ld length %ld requested offset %ld request len %ld \n",
fh->f_rank, lock->l_start, lock->l_len, offset, len);
#endif
return (fcntl ( fh->fd, F_SETLKW, lock));
errno=0;
err_count=0;
do {
ret = fcntl ( fh->fd, F_SETLKW, lock);
if ( ret ) {
#ifdef OMPIO_DEBUG
printf("[%d] ret = %d errno=%d %s\n", fh->f_rank, ret, errno, strerror(errno) );
#endif
err_count++;
}
} while ( ret && ((errno == EINTR) || ((errno == EINPROGRESS) && err_count < MAX_ERRCOUNT )));
return ret;
}
/*
 * Release the file lock described by *lock on the descriptor fh->fd.
 *
 * The lock type is switched to F_UNLCK and submitted with a non-blocking
 * fcntl(F_SETLK) call; afterwards l_start and l_len are both reset to -1.
 *
 * lock: flock structure describing the region to release; modified in place.
 * fh:   file handle; only fh->fd is used (plus fh->f_rank in debug output).
 *
 * NOTE(review): the lines below appear in old/new pairs (int vs. void
 * signature, "return 0" vs. "return", "ret = fcntl" vs. bare "fcntl",
 * "return ret" vs. "return"). In the void variant the result of fcntl()
 * is not reported to the caller.
 */
int mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh )
void mca_fbtl_posix_unlock ( struct flock *lock, mca_io_ompio_file_t *fh )
{
int ret;
/* l_start == -1 and l_len == -1 is the state this function leaves behind
   after an unlock, so it is treated as "nothing to release" and fcntl()
   is not called. */
if ( -1 == lock->l_start && -1 == lock->l_len ) {
return 0;
return;
}
/* Reuse the caller's structure (same l_start/l_len) as the unlock request. */
lock->l_type = F_UNLCK;
#ifdef OMPIO_DEBUG
printf("%d: releasing lock for offset %ld length %ld\n", fh->f_rank, lock->l_start, lock->l_len);
#endif
ret = fcntl ( fh->fd, F_SETLK, lock);
fcntl ( fh->fd, F_SETLK, lock);
/* Mark the structure as unused so that a second unlock call on it takes
   the early-return path above. */
lock->l_start = -1;
lock->l_len = -1;
return ret;
return;
}

Просмотреть файл

@ -31,7 +31,7 @@
ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh )
{
/*int *fp = NULL;*/
int i, block=1;
int i, block=1, ret;
struct iovec *iov = NULL;
int iov_count = 0;
OMPI_MPI_OFFSET_TYPE iov_offset = 0;
@ -85,12 +85,20 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh )
}
total_length = (end_offset - (off_t)iov_offset );
mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE );
ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_preadv: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno));
free (iov);
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
return OMPI_ERROR;
}
#if defined(HAVE_PREADV)
ret_code = preadv (fh->fd, iov, iov_count, iov_offset);
#else
if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) {
opal_output(1, "lseek:%s", strerror(errno));
opal_output(1, "mca_fbtl_posix_preadv: error in lseek:%s", strerror(errno));
free(iov);
mca_fbtl_posix_unlock ( &lock, fh );
return OMPI_ERROR;
@ -102,7 +110,7 @@ ssize_t mca_fbtl_posix_preadv (mca_io_ompio_file_t *fh )
bytes_read+=ret_code;
}
else if ( ret_code == -1 ) {
opal_output(1, "readv:%s", strerror(errno));
opal_output(1, "mca_fbtl_posix_preadv: error in (p)readv:%s", strerror(errno));
free(iov);
return OMPI_ERROR;
}

Просмотреть файл

@ -33,7 +33,7 @@
ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh )
{
/*int *fp = NULL;*/
int i, block = 1;
int i, block = 1, ret;
struct iovec *iov = NULL;
int iov_count = 0;
OMPI_MPI_OFFSET_TYPE iov_offset = 0;
@ -98,12 +98,19 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh )
*/
total_length = (end_offset - (off_t)iov_offset);
mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE );
ret = mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_pwritev: error in mca_fbtl_posix_lock() error ret=%d %s", ret, strerror(errno));
free (iov);
/* just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh );
return OMPI_ERROR;
}
#if defined (HAVE_PWRITEV)
ret_code = pwritev (fh->fd, iov, iov_count, iov_offset);
#else
if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) {
opal_output(1, "lseek:%s", strerror(errno));
opal_output(1, "mca_fbtl_posix_pwritev: error in lseek:%s", strerror(errno));
free(iov);
mca_fbtl_posix_unlock ( &lock, fh );
return OMPI_ERROR;
@ -115,7 +122,7 @@ ssize_t mca_fbtl_posix_pwritev(mca_io_ompio_file_t *fh )
bytes_written += ret_code;
}
else if (-1 == ret_code ) {
opal_output(1, "writev:%s", strerror(errno));
opal_output(1, "mca_fbtl_posix_pwritev: error in writev:%s", strerror(errno));
free (iov);
return OMPI_ERROR;
}

Просмотреть файл

@ -29,6 +29,12 @@
#include "ompi/mca/common/ompio/common_ompio.h"
extern int mca_fs_ufs_priority;
extern int mca_fs_ufs_lock_algorithm;
#define FS_UFS_LOCK_AUTO 0
#define FS_UFS_LOCK_NEVER 1
#define FS_UFS_LOCK_ENTIRE_FILE 2
#define FS_UFS_LOCK_RANGES 3
BEGIN_C_DECLS

Просмотреть файл

@ -31,6 +31,12 @@
#include "mpi.h"
int mca_fs_ufs_priority = 10;
int mca_fs_ufs_lock_algorithm=0; /* auto */
/*
* Private functions
*/
static int register_component(void);
/*
* Public string showing the fs ufs component version number
@ -54,6 +60,7 @@ mca_fs_base_component_2_0_0_t mca_fs_ufs_component = {
.mca_component_name = "ufs",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
.mca_register_component_params = register_component,
},
.fsm_data = {
/* This component is checkpointable */
@ -63,3 +70,26 @@ mca_fs_base_component_2_0_0_t mca_fs_ufs_component = {
.fsm_file_query = mca_fs_ufs_component_file_query, /* get priority and actions */
.fsm_file_unquery = mca_fs_ufs_component_file_unquery, /* undo what was done by previous function */
};
/*
 * Register the MCA parameters of the fs ufs component.
 *
 * Two integer parameters are registered against the component's version
 * structure:
 *   - "priority", backed by mca_fs_ufs_priority (reset to 10 here);
 *   - "lock_algorithm", backed by mca_fs_ufs_lock_algorithm (reset to 0,
 *     which the help text describes as "auto").
 *
 * The results of the registration calls are deliberately discarded, so
 * this function always returns OMPI_SUCCESS.
 */
static int register_component(void)
{
    /* Establish the defaults of both backing variables up front. */
    mca_fs_ufs_priority = 10;
    mca_fs_ufs_lock_algorithm = 0;

    /* Selection priority of this component. */
    (void) mca_base_component_var_register(
        &mca_fs_ufs_component.fsm_version,
        "priority",
        "Priority of the fs ufs component",
        MCA_BASE_VAR_TYPE_INT,
        NULL,
        0,
        0,
        OPAL_INFO_LVL_9,
        MCA_BASE_VAR_SCOPE_READONLY,
        &mca_fs_ufs_priority);

    /* Locking strategy selector; the accepted values are listed in the
       help text below. */
    (void) mca_base_component_var_register(
        &mca_fs_ufs_component.fsm_version,
        "lock_algorithm",
        "Locking algorithm used by the fs ufs component. "
        " 0: auto (default), 1: skip locking, 2: always lock entire file, "
        "3: lock only specific ranges",
        MCA_BASE_VAR_TYPE_INT,
        NULL,
        0,
        0,
        OPAL_INFO_LVL_9,
        MCA_BASE_VAR_SCOPE_READONLY,
        &mca_fs_ufs_lock_algorithm);

    return OMPI_SUCCESS;
}

Просмотреть файл

@ -98,39 +98,55 @@ mca_fs_ufs_file_open (struct ompi_communicator_t *comm,
fh->f_stripe_size=0;
fh->f_stripe_count=1;
/* Need to find a way to determine the file system block size at run time.
4096 is the most common value, but it might not always be accurate.
*/
fh->f_fs_block_size = 4096;
/* Need to check for NFS here. If the file system is not NFS but a regular UFS file system,
we do not need to enforce locking. A regular XFS or EXT4 file system can only be used
within a single node, local environment, and in this case the OS will already ensure correct
handling of file system blocks;
*/
char *fstype=NULL;
bool bret = opal_path_nfs ( (char *)filename, &fstype );
if ( false == bret ) {
char *dir;
mca_fs_base_get_parent_dir ( (char *)filename, &dir );
bret = opal_path_nfs (dir, &fstype);
free(dir);
}
if ( true == bret ) {
if ( 0 == strncasecmp(fstype, "nfs", sizeof("nfs")) ) {
/* Nothing really to be done in this case. Locking can stay */
if ( FS_UFS_LOCK_AUTO == mca_fs_ufs_lock_algorithm ) {
char *fstype=NULL;
bool bret = opal_path_nfs ( (char *)filename, &fstype );
if ( false == bret ) {
char *dir;
mca_fs_base_get_parent_dir ( (char *)filename, &dir );
bret = opal_path_nfs (dir, &fstype);
free(dir);
}
if ( true == bret ) {
if ( 0 == strncasecmp(fstype, "nfs", sizeof("nfs")) ) {
/* Based on my tests, only locking the entire file for all operations
guaranteed that the entire test suite passes correctly. I would not
be surprised if, depending on the NFS configuration, that might not always
be necessary; the user can change that with an MCA parameter of this
component. */
fh->f_flags |= OMPIO_LOCK_ENTIRE_FILE;
}
else {
fh->f_flags |= OMPIO_LOCK_NEVER;
}
}
else {
fh->f_flags |= OMPIO_LOCK_NEVER;
}
free (fstype);
}
else if ( FS_UFS_LOCK_NEVER == mca_fs_ufs_lock_algorithm ) {
fh->f_flags |= OMPIO_LOCK_NEVER;
}
else if ( FS_UFS_LOCK_ENTIRE_FILE == mca_fs_ufs_lock_algorithm ) {
fh->f_flags |= OMPIO_LOCK_ENTIRE_FILE;
}
else if ( FS_UFS_LOCK_RANGES == mca_fs_ufs_lock_algorithm ) {
/* Nothing to be done. This is what the posix fbtl component would do
anyway without additional information . */
}
else {
fh->f_flags |= OMPIO_LOCK_NEVER;
opal_output ( 1, "Invalid value for mca_fs_ufs_lock_algorithm %d", mca_fs_ufs_lock_algorithm );
}
free (fstype);
return OMPI_SUCCESS;
}

Просмотреть файл

@ -64,7 +64,7 @@ OMPI_DECLSPEC extern int mca_io_ompio_coll_timing_info;
#define OMPIO_CONTIGUOUS_FVIEW 0x00000010
#define OMPIO_AGGREGATOR_IS_SET 0x00000020
#define OMPIO_SHAREDFP_IS_SET 0x00000040
#define OMPIO_LOCK_ALWAYS 0x00000080
#define OMPIO_LOCK_ENTIRE_FILE 0x00000080
#define OMPIO_LOCK_NEVER 0x00000100
#define OMPIO_LOCK_NOT_THIS_OP 0x00000200