1
1

Merge pull request #8293 from edgargabriel/topic/lustre-perf-fix

Topic/lustre perf fix
Этот коммит содержится в:
Edgar Gabriel 2020-12-17 14:22:45 -06:00 коммит произвёл GitHub
родитель d36977c925 56dbd096d3
Коммит 38e3936721
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
11 изменённых файлов: 535 добавлений и 150 удалений

Просмотреть файл

@ -67,6 +67,7 @@
#define OMPIO_LOCK_NEVER 0x00000100
#define OMPIO_LOCK_NOT_THIS_OP 0x00000200
#define OMPIO_DATAREP_NATIVE 0x00000400
#define OMPIO_COLLECTIVE_OP 0x00000800
#define OMPIO_ROOT 0

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2018 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -29,6 +29,11 @@
#include "ompi/mca/common/ompio/common_ompio_request.h"
extern int mca_fbtl_posix_priority;
extern bool mca_fbtl_posix_read_datasieving;
extern bool mca_fbtl_posix_write_datasieving;
extern size_t mca_fbtl_posix_max_block_size;
extern size_t mca_fbtl_posix_max_gap_size;
extern size_t mca_fbtl_posix_max_tmpbuf_size;
BEGIN_C_DECLS

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2018 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@ -37,6 +37,15 @@ const char *mca_fbtl_posix_component_version_string =
"OMPI/MPI posix FBTL MCA component version " OMPI_VERSION;
int mca_fbtl_posix_priority = 10;
bool mca_fbtl_posix_read_datasieving = true;
bool mca_fbtl_posix_write_datasieving = true;
size_t mca_fbtl_posix_max_block_size = 1048576; // 1MB
size_t mca_fbtl_posix_max_gap_size = 4096; // Size of a block in many linux fs
size_t mca_fbtl_posix_max_tmpbuf_size = 67108864; // 64 MB
/*
* Private functions
*/
static int register_component(void);
/*
* Instantiate the public struct with all of our public information
@ -54,6 +63,7 @@ mca_fbtl_base_component_2_0_0_t mca_fbtl_posix_component = {
.mca_component_name = "posix",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
.mca_register_component_params = register_component,
},
.fbtlm_data = {
/* This component is checkpointable */
@ -63,3 +73,62 @@ mca_fbtl_base_component_2_0_0_t mca_fbtl_posix_component = {
.fbtlm_file_query = mca_fbtl_posix_component_file_query, /* get priority and actions */
.fbtlm_file_unquery = mca_fbtl_posix_component_file_unquery, /* undo what was done by previous function */
};
static int register_component(void)
{
mca_fbtl_posix_priority = 10;
(void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version,
"priority", "Priority of the fbtl posix component",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_fbtl_posix_priority);
mca_fbtl_posix_max_block_size = 1048576;
(void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version,
"max_block_size", "Maximum average size in bytes of a data block in an iovec for data sieving. "
"An average block size larger than this parameter will disable data sieving. Default: 1048576 bytes.",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_fbtl_posix_max_block_size );
mca_fbtl_posix_max_gap_size = 4096;
(void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version,
"max_gap_size", "Maximum average gap size between two blocks in an iovec for data sieving. "
"An average gap size larger than this parameter will disable data sieving. Default: 4096 bytes. " ,
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_fbtl_posix_max_gap_size );
mca_fbtl_posix_max_tmpbuf_size = 67108864;
(void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version,
"max_tmpbuf_size", "Maximum size of the temporary buffer used for data sieving in bytes. "
"Default: 67108864 (64MB). " ,
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_fbtl_posix_max_tmpbuf_size );
mca_fbtl_posix_read_datasieving = true;
(void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version,
"read_datasieving", "Parameter indicating whether to perform data sieving for read operations. "
"Default: true.",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_fbtl_posix_read_datasieving );
mca_fbtl_posix_write_datasieving = true;
(void) mca_base_component_var_register(&mca_fbtl_posix_component.fbtlm_version,
"write_datasieving", "Parameter indicating whether to perform data sieving for write operations. "
"Default: true.",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_fbtl_posix_write_datasieving );
return OMPI_SUCCESS;
}

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2017 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -28,100 +28,260 @@
#include "ompi/constants.h"
#include "ompi/mca/fbtl/fbtl.h"
static ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh);
static ssize_t mca_fbtl_posix_preadv_generic (ompio_file_t *fh);
ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh )
{
/*int *fp = NULL;*/
int i, block=1, ret;
struct iovec *iov = NULL;
int iov_count = 0;
OMPI_MPI_OFFSET_TYPE iov_offset = 0;
ssize_t bytes_read=0, ret_code=0;
struct flock lock;
off_t total_length, end_offset=0;
int ret;
if (NULL == fh->f_io_array) {
return OMPI_ERROR;
}
iov = (struct iovec *) malloc
(OMPIO_IOVEC_INITIAL_SIZE * sizeof (struct iovec));
if ( fh->f_num_of_io_entries > 1 ) {
bool do_data_sieving = true;
size_t avg_gap_size=0;
size_t avg_block_size = 0;
off_t prev_offset = (off_t)fh->f_io_array[0].offset;
int i;
for ( i=0; i< fh->f_num_of_io_entries; i++ ) {
avg_block_size += fh->f_io_array[i].length;
avg_gap_size += (size_t)((off_t)fh->f_io_array[i].offset - prev_offset);
prev_offset = (off_t)fh->f_io_array[i].offset;
}
avg_block_size = avg_block_size / fh->f_num_of_io_entries;
avg_gap_size = avg_gap_size / fh->f_num_of_io_entries;
if ( false == mca_fbtl_posix_read_datasieving ||
0 == avg_gap_size ||
avg_block_size > mca_fbtl_posix_max_block_size ||
avg_gap_size > mca_fbtl_posix_max_gap_size ) {
do_data_sieving = false;
}
if ( do_data_sieving) {
return mca_fbtl_posix_preadv_datasieving (fh);
}
else {
return mca_fbtl_posix_preadv_generic (fh);
}
}
else {
// i.e. fh->f_num_of_io_entries == 1
ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, (off_t)fh->f_io_array[0].offset,
(off_t)fh->f_io_array[0].length, OMPIO_LOCK_ENTIRE_REGION );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_preadv: error in mca_fbtl_posix_lock() ret=%d: %s",
ret, strerror(errno));
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
return OMPI_ERROR;
}
ret_code = pread(fh->fd, fh->f_io_array[0].memory_address, fh->f_io_array[0].length,
(off_t)fh->f_io_array[0].offset );
mca_fbtl_posix_unlock ( &lock, fh );
if ( ret_code == -1 ) {
opal_output(1, "mca_fbtl_posix_preadv: error in (p)read(v):%s", strerror(errno));
return OMPI_ERROR;
}
bytes_read += ret_code;
}
return bytes_read;
}
ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh)
{
size_t start, end, len;
size_t bufsize = 0;
int ret, i, j;
ssize_t bytes_read=0, ret_code=0;
struct flock lock;
char *temp_buf = NULL;
int startindex = 0;
int endindex = 0;
bool done = false;
while (!done) {
// Break the io_array into chunks such that the size of the temporary
// buffer does not exceed mca_fbtl_posix_max_tmpbuf_size bytes.
// Each iteration will thus work in the range (startindex, endindex[
startindex = endindex;
if ( startindex >= fh->f_num_of_io_entries ) {
done = true;
break;
}
size_t sstart = (size_t)fh->f_io_array[startindex].offset;
size_t slen=0;
for ( j = startindex; j < fh->f_num_of_io_entries; j++ ) {
endindex = j;
slen = ((size_t)fh->f_io_array[j].offset + fh->f_io_array[j].length) - sstart;
if (slen > mca_fbtl_posix_max_tmpbuf_size ) {
endindex = j-1;
break;
}
}
// Need to increment the value of endindex
// by one for the loop syntax to work correctly.
endindex++;
start = (size_t)fh->f_io_array[startindex].offset;
end = (size_t)fh->f_io_array[endindex-1].offset + fh->f_io_array[endindex-1].length;
len = end - start;
if ( len > bufsize ) {
if ( NULL != temp_buf ) {
free ( temp_buf);
}
temp_buf = (char *) malloc ( len );
if ( NULL == temp_buf ) {
opal_output(1, "OUT OF MEMORY\n");
return OMPI_ERR_OUT_OF_RESOURCE;
}
bufsize = len;
}
// Read the entire block.
ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, start, len, OMPIO_LOCK_ENTIRE_REGION );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in mca_fbtl_posix_lock() ret=%d: %s",
ret, strerror(errno));
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
free ( temp_buf);
return OMPI_ERROR;
}
ret_code = pread (fh->fd, temp_buf, len, start);
mca_fbtl_posix_unlock ( &lock, fh);
if ( ret_code == -1 ) {
opal_output(1, "mca_fbtl_posix_preadv_datasieving: error in (p)read(v):%s", strerror(errno));
free ( temp_buf);
return OMPI_ERROR;
}
// Copy out the elements that were requested.
size_t pos = 0;
size_t num_bytes;
size_t start_offset = (size_t) fh->f_io_array[startindex].offset;
for ( i = startindex ; i < endindex ; i++) {
pos = (size_t) fh->f_io_array[i].offset - start_offset;
if ( (ssize_t) pos > ret_code ) {
break;
}
num_bytes = fh->f_io_array[i].length;
if ( ((ssize_t) pos + (ssize_t)num_bytes) > ret_code ) {
num_bytes = ret_code - (ssize_t)pos;
}
memcpy (fh->f_io_array[i].memory_address, temp_buf + pos, num_bytes);
bytes_read += num_bytes;
}
}
free ( temp_buf);
return bytes_read;
}
ssize_t mca_fbtl_posix_preadv_generic (ompio_file_t *fh )
{
ssize_t bytes_read=0, ret_code=0;
struct iovec *iov = NULL;
struct flock lock;
int ret, i;
int block=1;
int iov_count = 0;
OMPI_MPI_OFFSET_TYPE iov_offset = 0;
off_t total_length, end_offset=0;
iov = (struct iovec *) malloc (OMPIO_IOVEC_INITIAL_SIZE * sizeof (struct iovec));
if (NULL == iov) {
opal_output(1, "OUT OF MEMORY\n");
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
if (0 == iov_count) {
iov[iov_count].iov_base = fh->f_io_array[i].memory_address;
iov[iov_count].iov_len = fh->f_io_array[i].length;
iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset;
if (0 == iov_count) {
iov[iov_count].iov_base = fh->f_io_array[i].memory_address;
iov[iov_count].iov_len = fh->f_io_array[i].length;
iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset;
end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length;
iov_count ++;
}
iov_count ++;
}
if (OMPIO_IOVEC_INITIAL_SIZE*block <= iov_count) {
block ++;
iov = (struct iovec *)realloc
(iov, OMPIO_IOVEC_INITIAL_SIZE * block *
sizeof(struct iovec));
if (NULL == iov) {
opal_output(1, "OUT OF MEMORY\n");
return OMPI_ERR_OUT_OF_RESOURCE;
}
}
if (OMPIO_IOVEC_INITIAL_SIZE*block <= iov_count) {
block ++;
iov = (struct iovec *)realloc
(iov, OMPIO_IOVEC_INITIAL_SIZE * block *
sizeof(struct iovec));
if (NULL == iov) {
opal_output(1, "OUT OF MEMORY\n");
return OMPI_ERR_OUT_OF_RESOURCE;
}
}
if (fh->f_num_of_io_entries != i+1) {
if (((((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset +
(ptrdiff_t)fh->f_io_array[i].length) ==
(OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset)) &&
(iov_count < IOV_MAX ) ){
iov[iov_count].iov_base =
fh->f_io_array[i+1].memory_address;
iov[iov_count].iov_len = fh->f_io_array[i+1].length;
end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length;
iov_count ++;
continue;
}
}
if (fh->f_num_of_io_entries != i+1) {
if (((((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset +
(ptrdiff_t)fh->f_io_array[i].length) ==
(OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset)) &&
(iov_count < IOV_MAX ) ){
iov[iov_count].iov_base =
fh->f_io_array[i+1].memory_address;
iov[iov_count].iov_len = fh->f_io_array[i+1].length;
end_offset = (off_t)fh->f_io_array[i].offset + (off_t)fh->f_io_array[i].length;
iov_count ++;
continue;
}
}
total_length = (end_offset - (off_t)iov_offset );
ret = mca_fbtl_posix_lock ( &lock, fh, F_RDLCK, iov_offset, total_length, OMPIO_LOCK_SELECTIVE );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_preadv: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno));
opal_output(1, "mca_fbtl_posix_preadv_generic: error in mca_fbtl_posix_lock() ret=%d: %s", ret, strerror(errno));
free (iov);
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
return OMPI_ERROR;
}
#if defined(HAVE_PREADV)
ret_code = preadv (fh->fd, iov, iov_count, iov_offset);
ret_code = preadv (fh->fd, iov, iov_count, iov_offset);
#else
if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) {
opal_output(1, "mca_fbtl_posix_preadv: error in lseek:%s", strerror(errno));
if (-1 == lseek (fh->fd, iov_offset, SEEK_SET)) {
opal_output(1, "mca_fbtl_posix_preadv_generic: error in lseek:%s", strerror(errno));
free(iov);
mca_fbtl_posix_unlock ( &lock, fh );
return OMPI_ERROR;
}
ret_code = readv (fh->fd, iov, iov_count);
return OMPI_ERROR;
}
ret_code = readv (fh->fd, iov, iov_count);
#endif
mca_fbtl_posix_unlock ( &lock, fh );
if ( 0 < ret_code ) {
bytes_read+=ret_code;
}
else if ( ret_code == -1 ) {
opal_output(1, "mca_fbtl_posix_preadv: error in (p)readv:%s", strerror(errno));
if ( 0 < ret_code ) {
bytes_read+=ret_code;
}
else if ( ret_code == -1 ) {
opal_output(1, "mca_fbtl_posix_preadv_generic: error in (p)readv:%s", strerror(errno));
free(iov);
return OMPI_ERROR;
}
else if ( 0 == ret_code ){
/* end of file reached, no point in continue reading; */
break;
}
iov_count = 0;
return OMPI_ERROR;
}
else if ( 0 == ret_code ){
/* end of file reached, no point in continue reading; */
break;
}
iov_count = 0;
}
free (iov);
return bytes_read;
}

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2017 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -30,7 +30,187 @@
#include "ompi/constants.h"
#include "ompi/mca/fbtl/fbtl.h"
static ssize_t mca_fbtl_posix_pwritev_datasieving (ompio_file_t *fh );
static ssize_t mca_fbtl_posix_pwritev_generic (ompio_file_t *fh );
ssize_t mca_fbtl_posix_pwritev(ompio_file_t *fh )
{
ssize_t bytes_written=0, ret_code=0;
struct flock lock;
int ret;
if (NULL == fh->f_io_array) {
return OMPI_ERROR;
}
if ( fh->f_num_of_io_entries > 1 ) {
bool do_data_sieving = true;
size_t avg_gap_size=0;
size_t avg_block_size = 0;
off_t prev_offset = (off_t)fh->f_io_array[0].offset;
int i;
for ( i=0; i< fh->f_num_of_io_entries; i++ ) {
avg_block_size += fh->f_io_array[i].length;
avg_gap_size += (size_t)((off_t)fh->f_io_array[i].offset - prev_offset);
prev_offset = (off_t)fh->f_io_array[i].offset;
}
avg_block_size = avg_block_size / fh->f_num_of_io_entries;
avg_gap_size = avg_gap_size / fh->f_num_of_io_entries;
if ( false == mca_fbtl_posix_write_datasieving ||
0 == avg_gap_size ||
avg_block_size > mca_fbtl_posix_max_block_size ||
avg_gap_size > mca_fbtl_posix_max_gap_size ||
ompi_mpi_thread_multiple ||
!(fh->f_flags & OMPIO_COLLECTIVE_OP) ) {
do_data_sieving = false;
}
if ( do_data_sieving) {
return mca_fbtl_posix_pwritev_datasieving (fh);
}
else {
return mca_fbtl_posix_pwritev_generic (fh);
}
}
else {
// i.e. fh->f_num_of_io_entries == 1
ret = mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, (off_t)fh->f_io_array[0].offset,
(off_t)fh->f_io_array[0].length, OMPIO_LOCK_ENTIRE_REGION );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_pwritev: error in mca_fbtl_posix_lock() ret=%d: %s",
ret, strerror(errno));
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
return OMPI_ERROR;
}
ret_code = pwrite(fh->fd, fh->f_io_array[0].memory_address, fh->f_io_array[0].length,
(off_t)fh->f_io_array[0].offset );
mca_fbtl_posix_unlock ( &lock, fh );
if ( ret_code == -1 ) {
opal_output(1, "mca_fbtl_posix_pwritev: error in (p)write(v):%s", strerror(errno));
return OMPI_ERROR;
}
bytes_written += ret_code;
}
return bytes_written;
}
ssize_t mca_fbtl_posix_pwritev_datasieving (ompio_file_t *fh)
{
size_t start, end, len;
size_t bufsize = 0;
int ret, i, j;
ssize_t bytes_written=0, ret_code=0;
struct flock lock;
char *temp_buf = NULL;
int startindex = 0;
int endindex = 0;
bool done = false;
while (!done) {
// Break the io_array into chunks such that the size of the temporary
// buffer does not exceed mca_fbtl_posix_max_tmpbuf_size bytes.
// Each iteration will thus work in the range (startindex, endindex[
startindex = endindex;
if ( startindex >= fh->f_num_of_io_entries ) {
done = true;
break;
}
size_t sstart = (size_t)fh->f_io_array[startindex].offset;
size_t slen=0;
for ( j = startindex; j < fh->f_num_of_io_entries; j++ ) {
endindex = j;
slen = ((size_t)fh->f_io_array[j].offset + fh->f_io_array[j].length) - sstart;
if (slen > mca_fbtl_posix_max_tmpbuf_size ) {
endindex = j-1;
break;
}
}
// Need to increment the value of endindex
// by one for the loop syntax to work correctly.
endindex++;
start = (size_t)fh->f_io_array[startindex].offset;
end = (size_t)fh->f_io_array[endindex-1].offset + fh->f_io_array[endindex-1].length;
len = end - start;
if ( len > bufsize ) {
if ( NULL != temp_buf ) {
free ( temp_buf);
}
temp_buf = (char *) malloc ( len );
if ( NULL == temp_buf ) {
opal_output(1, "OUT OF MEMORY\n");
return OMPI_ERR_OUT_OF_RESOURCE;
}
bufsize = len;
}
// Read the entire block.
ret = mca_fbtl_posix_lock ( &lock, fh, F_WRLCK, start, len, OMPIO_LOCK_ENTIRE_REGION );
if ( 0 < ret ) {
opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in mca_fbtl_posix_lock() ret=%d: %s",
ret, strerror(errno));
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
free ( temp_buf);
return OMPI_ERROR;
}
ret_code = pread (fh->fd, temp_buf, len, start);
if ( ret_code == -1 ) {
//opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in pwrite:%s", strerror(errno));
opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in pwrite:%s", strerror(errno));
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
free ( temp_buf);
return OMPI_ERROR;
}
// Copy out the elements to write into temporary buffer.
size_t pos = 0;
size_t num_bytes;
size_t start_offset = (size_t) fh->f_io_array[startindex].offset;
for ( i = startindex ; i < endindex ; i++) {
pos = (size_t) fh->f_io_array[i].offset - start_offset;
num_bytes = fh->f_io_array[i].length;
memcpy (temp_buf + pos, fh->f_io_array[i].memory_address, num_bytes);
bytes_written += num_bytes;
}
ret_code = pwrite (fh->fd, temp_buf, len, start);
if ( ret_code == -1 ) {
opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in pwrite:%s", strerror(errno));
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
free ( temp_buf);
return OMPI_ERROR;
}
mca_fbtl_posix_unlock ( &lock, fh);
if ( ret_code == -1 ) {
opal_output(1, "mca_fbtl_posix_pwritev_datasieving: error in pwrite:%s", strerror(errno));
/* Just in case some part of the lock worked */
mca_fbtl_posix_unlock ( &lock, fh);
free ( temp_buf);
return OMPI_ERROR;
}
}
free ( temp_buf);
return bytes_written;
}
ssize_t mca_fbtl_posix_pwritev_generic (ompio_file_t *fh )
{
/*int *fp = NULL;*/
int i, block = 1, ret;

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2016 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -36,7 +36,6 @@ BEGIN_C_DECLS
extern int mca_fcoll_dynamic_gen2_priority;
extern int mca_fcoll_dynamic_gen2_num_groups;
extern int mca_fcoll_dynamic_gen2_write_chunksize;
OMPI_MODULE_DECLSPEC extern mca_fcoll_base_component_2_0_0_t mca_fcoll_dynamic_gen2_component;

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2016 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@ -42,7 +42,6 @@ const char *mca_fcoll_dynamic_gen2_component_version_string =
*/
int mca_fcoll_dynamic_gen2_priority = 10;
int mca_fcoll_dynamic_gen2_num_groups = 1;
int mca_fcoll_dynamic_gen2_write_chunksize = -1;
/*
* Local function
@ -95,12 +94,5 @@ dynamic_gen2_register(void)
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_dynamic_gen2_num_groups);
mca_fcoll_dynamic_gen2_write_chunksize = -1;
(void) mca_base_component_var_register(&mca_fcoll_dynamic_gen2_component.fcollm_version,
"write_chunksize", "Chunk size written at once. Default: stripe_size of the file system",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_dynamic_gen2_write_chunksize);
return OMPI_SUCCESS;
}

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2016 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
@ -92,7 +92,7 @@ typedef struct mca_io_ompio_aggregator_data {
static int shuffle_init ( int index, int cycles, int aggregator, int rank,
mca_io_ompio_aggregator_data *data,
ompi_request_t **reqs );
static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data, int write_chunksize );
static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data );
int mca_fcoll_dynamic_gen2_break_file_view ( struct iovec *decoded_iov, int iov_count,
struct iovec *local_iov_array, int local_count,
@ -111,8 +111,7 @@ static int local_heap_sort (mca_io_ompio_local_io_array *io_array,
int *sorted);
int mca_fcoll_dynamic_gen2_split_iov_array ( ompio_file_t *fh, mca_common_ompio_io_array_t *work_array,
int num_entries, int *last_array_pos, int *last_pos_in_field,
int chunk_size );
int num_entries, int *last_array_pos, int *last_pos_in_field );
int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh,
@ -145,7 +144,7 @@ int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh,
MPI_Aint *broken_total_lengths=NULL;
int *aggregators=NULL;
int write_chunksize, *result_counts=NULL;
int *result_counts=NULL;
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
@ -199,15 +198,9 @@ int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh,
if ( fh->f_stripe_size == 0 ) {
// EDGAR: just a quick heck for testing
//fh->f_stripe_size = 1048576;
fh->f_stripe_size = 65536;
}
if ( -1 == mca_fcoll_dynamic_gen2_write_chunksize ) {
write_chunksize = fh->f_stripe_size;
}
else {
write_chunksize = mca_fcoll_dynamic_gen2_write_chunksize;
}
ret = mca_fcoll_dynamic_gen2_get_configuration (fh, &dynamic_gen2_num_io_procs, &aggregators);
if (OMPI_SUCCESS != ret){
@ -608,7 +601,7 @@ int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh,
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
start_write_time = MPI_Wtime();
#endif
ret = write_init (fh, aggregators[i], aggr_data[i], write_chunksize );
ret = write_init (fh, aggregators[i], aggr_data[i] );
if (OMPI_SUCCESS != ret){
goto exit;
}
@ -637,7 +630,7 @@ int mca_fcoll_dynamic_gen2_file_write_all (ompio_file_t *fh,
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
start_write_time = MPI_Wtime();
#endif
ret = write_init (fh, aggregators[i], aggr_data[i], write_chunksize );
ret = write_init (fh, aggregators[i], aggr_data[i] );
if (OMPI_SUCCESS != ret){
goto exit;
}
@ -735,7 +728,7 @@ exit :
}
static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data, int write_chunksize )
static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data )
{
int ret=OMPI_SUCCESS;
int last_array_pos=0;
@ -743,18 +736,36 @@ static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator
if ( aggregator == fh->f_rank && aggr_data->prev_num_io_entries) {
fh->f_flags |= OMPIO_COLLECTIVE_OP;
while ( aggr_data->prev_bytes_to_write > 0 ) {
ssize_t tret;
aggr_data->prev_bytes_to_write -= mca_fcoll_dynamic_gen2_split_iov_array (fh, aggr_data->prev_io_array,
aggr_data->prev_num_io_entries,
&last_array_pos, &last_pos,
write_chunksize );
if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) {
&last_array_pos, &last_pos );
tret = fh->f_fbtl->fbtl_pwritev (fh);
if ( 0 > tret ) {
free ( aggr_data->prev_io_array);
opal_output (1, "dynamic_gen2_write_all: fbtl_pwritev failed\n");
ret = OMPI_ERROR;
goto exit;
}
#if DEBUG_ON
printf("fh->f_num_of_io_entries=%d\n", fh->f_num_of_io_entries);
printf("[%d]: fh->f_io_array[0].offset = %ld .size = %ld\n", fh->f_rank, (long)fh->f_io_array[0].offset,
fh->f_io_array[0].length);
if ( fh->f_num_of_io_entries > 1 )
printf("[%d]: fh->f_io_array[1].offset = %ld .size = %ld\n", fh->f_rank, (long)fh->f_io_array[1].offset,
fh->f_io_array[1].length);
int n = fh->f_num_of_io_entries-1;
if ( fh->f_num_of_io_entries > 2 )
printf("[%d]: fh->f_io_array[n].offset = %ld .size = %ld\n", fh->f_rank, (long)fh->f_io_array[n].offset,
fh->f_io_array[n].length);
#endif
}
fh->f_flags &= ~OMPIO_COLLECTIVE_OP;
free ( fh->f_io_array );
free ( aggr_data->prev_io_array);
}
@ -1595,13 +1606,14 @@ int mca_fcoll_dynamic_gen2_get_configuration (ompio_file_t *fh, int *dynamic_gen
int mca_fcoll_dynamic_gen2_split_iov_array ( ompio_file_t *fh, mca_common_ompio_io_array_t *io_array, int num_entries,
int *ret_array_pos, int *ret_pos, int chunk_size )
int *ret_array_pos, int *ret_pos )
{
int array_pos = *ret_array_pos;
int pos = *ret_pos;
size_t bytes_written = 0;
size_t bytes_to_write = chunk_size;
off_t baseaddr = ((off_t)io_array[array_pos].offset + pos) - (((off_t)io_array[array_pos].offset + pos) % (off_t)fh->f_stripe_size);
off_t endaddr = baseaddr + fh->f_stripe_size;
if ( 0 == array_pos && 0 == pos ) {
fh->f_io_array = (mca_common_ompio_io_array_t *) malloc ( num_entries * sizeof(mca_common_ompio_io_array_t));
@ -1612,32 +1624,28 @@ int mca_fcoll_dynamic_gen2_split_iov_array ( ompio_file_t *fh, mca_common_ompio_
}
int i=0;
while (bytes_to_write > 0 ) {
fh->f_io_array[i].memory_address = &(((char *)io_array[array_pos].memory_address)[pos]);
fh->f_io_array[i].offset = &(((char *)io_array[array_pos].offset)[pos]);
do {
fh->f_io_array[i].memory_address = (char *)io_array[array_pos].memory_address + pos;
fh->f_io_array[i].offset = (char *)io_array[array_pos].offset + pos;
if ( (io_array[array_pos].length - pos ) >= bytes_to_write ) {
fh->f_io_array[i].length = bytes_to_write;
off_t length = io_array[array_pos].length - pos;
if ( ( (off_t)fh->f_io_array[i].offset + length) < endaddr ) {
fh->f_io_array[i].length = length;
}
else {
fh->f_io_array[i].length = io_array[array_pos].length - pos;
fh->f_io_array[i].length = endaddr - (size_t)fh->f_io_array[i].offset;
}
pos += fh->f_io_array[i].length;
bytes_written += fh->f_io_array[i].length;
bytes_to_write-= fh->f_io_array[i].length;
i++;
if ( pos == (int)io_array[array_pos].length ) {
pos = 0;
if ((array_pos + 1) < num_entries) {
array_pos++;
}
else {
break;
}
array_pos++;
}
}
} while ( (array_pos < num_entries) && (((off_t)io_array[array_pos].offset+pos ) < endaddr) );
fh->f_num_of_io_entries = i;
*ret_array_pos = array_pos;

Просмотреть файл

@ -771,7 +771,9 @@ static int write_init (ompio_file_t *fh,
}
}
else {
fh->f_flags |= OMPIO_COLLECTIVE_OP;
ret_temp = fh->f_fbtl->fbtl_pwritev(fh);
fh->f_flags &= ~OMPIO_COLLECTIVE_OP;
if(0 > ret_temp) {
opal_output (1, "vulcan_write_all: fbtl_pwritev failed\n");
ret = ret_temp;

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2018 University of Houston. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* Copyright (c) 2015-2020 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
@ -144,6 +144,7 @@ mca_fs_lustre_file_open (struct ompi_communicator_t *comm,
fh->f_stripe_size = lump->lmm_stripe_size;
fh->f_stripe_count = lump->lmm_stripe_count;
fh->f_fs_block_size = lump->lmm_stripe_size;
fh->f_flags |= OMPIO_LOCK_NEVER;
return OMPI_SUCCESS;
}

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2019 University of Houston. All rights reserved.
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
@ -302,42 +302,11 @@ file_query(struct ompi_file_t *file,
int *priority)
{
mca_common_ompio_data_t *data;
char *tmp;
int rank;
int is_lustre=0; //false
tmp = strchr (file->f_filename, ':');
rank = ompi_comm_rank ( file->f_comm);
if (!tmp) {
if ( 0 == rank) {
if (LUSTRE == mca_fs_base_get_fstype(file->f_filename)) {
is_lustre = 1; //true
}
}
file->f_comm->c_coll->coll_bcast (&is_lustre,
1,
MPI_INT,
0,
file->f_comm,
file->f_comm->c_coll->coll_bcast_module);
}
else {
if (!strncasecmp(file->f_filename, "lustre:", 7) ) {
is_lustre = 1;
}
}
if (is_lustre) {
*priority = 1;
}
else {
*priority = priority_param;
}
*priority = priority_param;
/* Allocate a space for this module to hang private data (e.g.,
the OMPIO file handle) */
data = calloc(1, sizeof(mca_common_ompio_data_t));
if (NULL == data) {
return NULL;
@ -346,7 +315,6 @@ file_query(struct ompi_file_t *file,
*private_data = (struct mca_io_base_file_t*) data;
/* All done */
return &mca_io_ompio_module;
}