1
1

Romio Refresh from mpich2-1.3.1. Work by Pascal Deveze, tested through bitbucket by Jeff Squyres (https://bitbucket.org/devezep/new-romio-for-openmpi).

This commit was SVN r24264.
Этот коммит содержится в:
Sylvain Jeaugey 2011-01-19 15:55:10 +00:00
родитель b2f3a5b7c2
Коммит 0e921bba7f
359 изменённых файлов: 27054 добавлений и 8895 удалений

Просмотреть файл

@ -35,4 +35,5 @@ __sgi_mpi
__hp_mpi
__cray_mpi
__lam_mpi
__Darwin
__open_mpi

Просмотреть файл

@ -1,58 +0,0 @@
<dir>
<file name="ad_bgl_getsh.c" info="1205188711"/>
<file name="ad_bgl_fcntl.c" info="1205188711"/>
<file name="ad_bgl_tuning.c" info="1205188711"/>
<file name="ad_bgl_pset.h" info="1205188711"/>
<file name="ad_bgl_aggrs.c" info="1205188711"/>
<file name="ad_bgl_wrcoll.c" info="1205188711"/>
<file name="ad_bgl_aggrs.h" info="1205188711"/>
<file name="ad_bgl_pset.c" info="1205188711"/>
<file name="ad_bgl_setsh.c" info="1205188711"/>
<file name="ad_bgl_close.c" info="1206398065"/>
<file name="ad_bgl.h" info="1205188711"/>
<file name="ad_bgl_read.c" info="1205188711"/>
<file name="ad_bgl_rdcoll.c" info="1205188711"/>
<file name="ad_bgl_open.c" info="1205188711"/>
<file name="ad_bgl_tuning.h" info="1205188711"/>
<file name="ad_bgl_write.c" info="1205188711"/>
<file name="ad_bgl_hints.c" info="1205188711"/>
<file name="ad_bgl.c" info="1205188711"/>
</dir>
<data>
<fileinfo name="ad_bgl_getsh.c">
</fileinfo>
<fileinfo name="ad_bgl_fcntl.c">
</fileinfo>
<fileinfo name="ad_bgl_tuning.c">
</fileinfo>
<fileinfo name="ad_bgl_pset.h">
</fileinfo>
<fileinfo name="ad_bgl_aggrs.c">
</fileinfo>
<fileinfo name="ad_bgl_wrcoll.c">
</fileinfo>
<fileinfo name="ad_bgl_aggrs.h">
</fileinfo>
<fileinfo name="ad_bgl_pset.c">
</fileinfo>
<fileinfo name="ad_bgl_setsh.c">
</fileinfo>
<fileinfo name="ad_bgl_close.c">
</fileinfo>
<fileinfo name="ad_bgl.h">
</fileinfo>
<fileinfo name="ad_bgl_read.c">
</fileinfo>
<fileinfo name="ad_bgl_rdcoll.c">
</fileinfo>
<fileinfo name="ad_bgl_open.c">
</fileinfo>
<fileinfo name="ad_bgl_tuning.h">
</fileinfo>
<fileinfo name="ad_bgl_write.c">
</fileinfo>
<fileinfo name="ad_bgl_hints.c">
</fileinfo>
<fileinfo name="ad_bgl.c">
</fileinfo>
</data>

Просмотреть файл

@ -26,6 +26,7 @@ libadio_bgl_la_SOURCES = \
ad_bgl.c \
ad_bgl_close.c \
ad_bgl_fcntl.c \
ad_bgl_flush.c \
ad_bgl_getsh.c \
ad_bgl.h \
ad_bgl_hints.c \

Просмотреть файл

@ -1,5 +1,6 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl.c
* \brief ???
@ -18,6 +19,7 @@
struct ADIOI_Fns_struct ADIO_BGL_operations = {
ADIOI_BGL_Open, /* Open */
ADIOI_GEN_OpenColl, /* Collective open */
ADIOI_BGL_ReadContig, /* ReadContig */
ADIOI_BGL_WriteContig, /* WriteContig */
#if BGL_OPTIM_STEP1_2
@ -51,7 +53,8 @@ struct ADIOI_Fns_struct ADIO_BGL_operations = {
ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_GEN_Flush, /* Flush */
ADIOI_BGL_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
};

Просмотреть файл

@ -28,8 +28,10 @@
#include <aio.h>
#endif
#if 0
int ADIOI_BGL_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
int wr, void *handle);
#endif
void ADIOI_BGL_Open(ADIO_File fd, int *error_code);
@ -87,6 +89,7 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
void ADIOI_BGL_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
void ADIOI_BGL_Flush(ADIO_File fd, int *error_code);
#include "ad_bgl_tuning.h"

Просмотреть файл

@ -1,5 +1,6 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_aggrs.c
* \brief The externally used function from this file is declared in ad_bgl_aggrs.h
@ -7,7 +8,7 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* Copyright (C) 1997-2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
@ -16,10 +17,49 @@
#include "ad_bgl.h"
#include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
int aggrsInPsetSize=0;
int *aggrsInPset=NULL;
static int aggrsInPsetSize=0;
static int *aggrsInPset=NULL;
/* Comments copied from common:
* This file contains four functions:
*
* ADIOI_Calc_aggregator()
* ADIOI_Calc_file_domains()
* ADIOI_Calc_my_req()
* ADIOI_Calc_others_req()
*
* The last three of these were originally in ad_read_coll.c, but they are
* also shared with ad_write_coll.c. I felt that they were better kept with
* the rest of the shared aggregation code.
*/
/* Discussion of values available from above:
*
* ADIO_Offset st_offsets[0..nprocs-1]
* ADIO_Offset end_offsets[0..nprocs-1]
* These contain a list of start and end offsets for each process in
* the communicator. For example, an access at loc 10, size 10 would
* have a start offset of 10 and end offset of 19.
* int nprocs
* number of processors in the collective I/O communicator
* ADIO_Offset min_st_offset
* ADIO_Offset fd_start[0..nprocs_for_coll-1]
* starting location of "file domain"; region that a given process will
* perform aggregation for (i.e. actually do I/O)
* ADIO_Offset fd_end[0..nprocs_for_coll-1]
* start + size - 1 roughly, but it can be less, or 0, in the case of
* uneven distributions
*/
/* forward declaration */
static void
@ -219,8 +259,7 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
ADIOI_BGL_ProcInfo_t *all_procInfo,
int *aggrsInPset )
{
# define DEBUG 0
# if DEBUG
# if AGG_DEBUG
int i;
# endif
int naggs;
@ -229,9 +268,10 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
/* compute the ranklist of IO aggregators and put into tmp_ranklist */
tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
# if DEBUG
for (i=0; i<confInfo->nProcs; i++)
printf( "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
# if AGG_DEBUG
for (i=0; i<confInfo->nProcs; i++) {
DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
}
# endif
naggs =
@ -239,7 +279,7 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
# define VERIFY 0
# if VERIFY
printf( "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n",
DBG_FPRINTF(stderr, "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n",
confInfo->PsetSize ,
confInfo->numPsets ,
confInfo->isVNM ,
@ -250,9 +290,10 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
naggs );
# endif
# if DEBUG
for (i=0; i<naggs; i++)
printf( "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
# if AGG_DEBUG
for (i=0; i<naggs; i++) {
DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
}
# endif
/* copy the ranklist of IO aggregators to fd->hints */
@ -267,293 +308,34 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
return;
}
/*
* Compute a dynamic, access-range-based file domain partition among the I/O
* aggregators, aligned to the GPFS block size.
*
* Divide the I/O workload among "nprocs_for_coll" processes. This is
* done by (logically) dividing the file into file domains (FDs); each
* process may directly access only its own file domain.
* Additional effort is made to ensure that each I/O aggregator gets
* a file domain that aligns to the GPFS block size, so there will
* not be any false sharing of GPFS file blocks among multiple I/O nodes.
*
* Parameters:
*   st_offsets, end_offsets   [in]  per-process start/end offsets; entries
*                                   0..nprocs-1 are read
*   nprocs                    [in]  number of entries in the two arrays above
*   nprocs_for_coll           [in]  number of aggregators, i.e. number of file
*                                   domains produced
*   min_st_offset_ptr         [out] minimum of st_offsets[]
*   fd_start_ptr, fd_end_ptr  [out] arrays of nprocs_for_coll entries allocated
*                                   here with ADIOI_Malloc and not freed here
*                                   (presumably released by the caller - confirm)
*   fd_size_ptr               [out] size of the FIRST file domain only
*   fs_ptr                    [in]  if non-NULL it is read as an ADIOI_BGL_fs; a
*                                   non-zero blksize replaces the 1M default
*/
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr,
void *fs_ptr)
{
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
int i, aggr;
/* myname is only referenced by the commented-out FPRINTF calls below */
static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
__blksize_t blksize = 1048576; /* default to 1M */
if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;
/* FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);*/
/* find the range of all the requests */
min_st_offset = st_offsets [0];
max_end_offset = end_offsets[0];
for (i=1; i<nprocs; i++) {
min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
}
// printf( "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset );
/* determine the "file domain (FD)" of each process, i.e., the portion of
the file that will be "owned" by each process */
/* gpfs_ub: max_end_offset rounded up to a multiple of blksize, minus 1.
 * NOTE(review): when max_end_offset is itself a multiple of blksize this
 * yields max_end_offset - 1, so gpfs_ub_rdoff below becomes -1 and the last
 * domain is grown by one byte rather than trimmed - confirm this is intended. */
ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1;
/* gpfs_lb: min_st_offset rounded down to a multiple of blksize */
ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
/* round-off distances: gpfs_ub - max_end_offset and min_st_offset - gpfs_lb */
ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
/* size of the rounded range; always a whole number of blocks */
ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
int naggs = nprocs_for_coll;
/* fd_size is scratch space (freed before returning); the other two arrays
 * are handed back through fd_start_ptr / fd_end_ptr */
fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
fd_start = *fd_start_ptr;
fd_end = *fd_end_ptr;
/* Distribute whole blocks: every aggregator gets nb_cn_small blocks, and the
 * naggs_large blocks left over by the integer division go one each to the
 * LAST naggs_large aggregators (indices >= naggs_small). */
ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
ADIO_Offset naggs_small = naggs - naggs_large;
for (i=0; i<naggs; i++)
if (i < naggs_small) fd_size[i] = nb_cn_small * blksize;
else fd_size[i] = (nb_cn_small+1) * blksize;
/* FPRINTF(stderr,"%s(%d): "
"gpfs_ub %llu, "
"gpfs_lb %llu, "
"gpfs_ub_rdoff %llu, "
"gpfs_lb_rdoff %llu, "
"fd_gpfs_range %llu, "
"n_gpfs_blk %llu, "
"nb_cn_small %llu, "
"naggs_large %llu, "
"naggs_small %llu, "
"\n",
myname,__LINE__,
gpfs_ub ,
gpfs_lb ,
gpfs_ub_rdoff,
gpfs_lb_rdoff,
fd_gpfs_range,
n_gpfs_blk ,
nb_cn_small ,
naggs_large ,
naggs_small
);
*/
/* Remove the round-off from the two outer domains so that the partition
 * starts at min_st_offset and ends at max_end_offset instead of at the
 * block-rounded bounds. */
fd_size[0] -= gpfs_lb_rdoff;
fd_size[naggs-1] -= gpfs_ub_rdoff;
/* compute the file domain for each aggr */
/* domains are laid out back to back: each one starts where the previous ended */
ADIO_Offset offset = min_st_offset;
for (aggr=0; aggr<naggs; aggr++) {
fd_start[aggr] = offset;
fd_end [aggr] = offset + fd_size[aggr] - 1;
offset += fd_size[aggr];
}
/* only the first domain's size is reported; the domains are not all the same size */
*fd_size_ptr = fd_size[0];
*min_st_offset_ptr = min_st_offset;
ADIOI_Free (fd_size);
}
/*
* deprecated
/* Description from common/ad_aggregate.c. (Does it completely apply to bgl?)
* ADIOI_Calc_aggregator()
*
void ADIOI_BGL_GPFS_Calc_file_domain0(ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr)
{
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
int i;
static int GPFS_BSIZE=1048576;
* find the range of all the requests *
min_st_offset = st_offsets [0];
max_end_offset = end_offsets[0];
for (i=1; i<nprocs; i++) {
min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
}
* determine the "file domain (FD)" of each process, i.e., the portion of
the file that will be "owned" by each process *
* GPFS specific, pseudo starting/end point has to round to GPFS_BSIZE *
ADIO_Offset gpfs_ub = (max_end_offset +GPFS_BSIZE-1) / GPFS_BSIZE * GPFS_BSIZE - 1;
ADIO_Offset gpfs_lb = min_st_offset / GPFS_BSIZE * GPFS_BSIZE;
ADIO_Offset gpfs_ub_rdoff = (max_end_offset +GPFS_BSIZE-1) / GPFS_BSIZE * GPFS_BSIZE - 1 - max_end_offset;
ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / GPFS_BSIZE * GPFS_BSIZE;
ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
* all computation of partition is based on the rounded pseudo-range *
ADIO_Offset fds_ub = (fd_gpfs_range +nprocs_for_coll-1) / nprocs_for_coll;
ADIO_Offset fds_lb = fd_gpfs_range / nprocs_for_coll;
int naggs = nprocs_for_coll;
int npsets = aggrsInPset[0]; * special meaning for element 0 *
fd_size = (ADIO_Offset *) ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
*fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
*fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
fd_start = *fd_start_ptr;
fd_end = *fd_end_ptr;
* some pre-computation to determine rough ratio of when to up-fit, when to low-fit *
* 1. get the estimated data per pset *
* 2. determine a factor between up and down *
int avg_aggrsInPset = (naggs +npsets-1)/npsets;
ADIO_Offset avg_bytes_perPset = fd_gpfs_range / npsets;
ADIO_Offset resid = avg_bytes_perPset % GPFS_BSIZE;
ADIO_Offset downr = GPFS_BSIZE - resid;
int small = (resid < downr);
int ratio = downr == 0 ? npsets + 2 : (resid +downr-1)/downr;
if (small) ratio = resid == 0 ? npsets + 2 : (downr +resid-1)/resid;
* go through aggrsInfo of all PSETs *
ADIO_Offset fd_range = fd_gpfs_range;
int aggr = 0, pset;
for (pset=0; pset<npsets; pset++) {
ADIO_Offset fds_try = fds_lb;
int my_naggs = aggrsInPset[pset+1];
ADIO_Offset fds_pset;
* Last pset will deal with the residuals *
if (pset == npsets-1)
fds_pset = fd_range;
else
{
int cond1 = ((pset+1) % ratio == 0);
int cond2 = ((pset+1) % ratio != 0);
if (small) {
int temp = cond1; cond1 = cond2; cond2 = temp;
}
if (cond1) {
fds_pset = fds_try * my_naggs;
if (fds_pset % GPFS_BSIZE) // align to GPFS_BSIZE
fds_pset = ((fds_pset +GPFS_BSIZE-1)/GPFS_BSIZE) * GPFS_BSIZE;
}
if (cond2)
{
fds_try = fds_ub;
fds_pset = fds_try * my_naggs;
if (fds_pset % GPFS_BSIZE) // align to GPFS_BSIZE
fds_pset = (fds_pset / GPFS_BSIZE) * GPFS_BSIZE;
}
}
* for aggrs in each PSET, divide evenly the data range *
#define CN_ALIGN 1
#if !CN_ALIGN
fd_range -= fds_pset;
if ( pset == 0 ) fds_pset -= gpfs_lb_rdoff;
if ( pset == npsets-1 ) fds_pset -= gpfs_ub_rdoff;
int p;
for (p=0; p<my_naggs; p++) {
fd_size[aggr] = (fds_pset +my_naggs-1) / my_naggs;
if (p== my_naggs-1)
fd_size[aggr] -= (fd_size[aggr]*my_naggs - fds_pset);
aggr++;
}
#else
ADIO_Offset avg_bytes_perP = fds_pset / my_naggs;
ADIO_Offset resid2 = avg_bytes_perP % GPFS_BSIZE;
ADIO_Offset downr2 = GPFS_BSIZE - resid2;
int small2 = (resid2 < downr2);
int ratio2 = downr2 == 0 ? my_naggs + 2 : (resid2 +downr2-1)/downr2;
if (small2) ratio2 = resid2 == 0 ? my_naggs + 2 : (downr2 +resid2-1)/resid2;
ADIO_Offset accu = 0;
int p;
for (p=0; p<my_naggs; p++) {
int cond1 = ((p+1) % ratio2 == 0);
int cond2 = ((p+1) % ratio2 != 0);
if (small2) {
int temp = cond1; cond1 = cond2; cond2 = temp;
}
fd_size[aggr] = avg_bytes_perP;
if (cond2) fd_size[aggr] = ((fd_size[aggr] +GPFS_BSIZE-1)/GPFS_BSIZE) * GPFS_BSIZE;
if (cond1) fd_size[aggr] = ((fd_size[aggr] )/GPFS_BSIZE) * GPFS_BSIZE;
if (p== my_naggs-1)
fd_size[aggr] = (fds_pset - accu);
accu += fd_size[aggr];
fd_range -= fd_size[aggr];
aggr++;
}
#endif
}
* after scheduling, the first and the last region has to remove the round-off effect *
#if CN_ALIGN
fd_size[0] -= gpfs_lb_rdoff;
fd_size[naggs-1] -= gpfs_ub_rdoff;
#endif
* compute the file domain for each aggr *
ADIO_Offset offset = min_st_offset;
for (aggr=0; aggr<naggs; aggr++) {
fd_start[aggr] = offset;
fd_end [aggr] = offset + fd_size[aggr] - 1;
offset += fd_size[aggr];
}
* The intention here is to implement a function which provides basically
* the same functionality as in Rajeev's original version of
* ADIOI_Calc_my_req(). He used a ceiling division approach to assign the
* file domains, and we use the same approach here when calculating the
* location of an offset/len in a specific file domain. Further we assume
* this same distribution when calculating the rank_index, which is later
* used to map to a specific process rank in charge of the file domain.
*
printf( "\t%6d : %12qd:%12qd, %12qd:%12qd:%12qd, %12qd:%12qd:%12qd\n",
naggs,
min_st_offset,
max_end_offset,
fd_start[0],
fd_end [0],
fd_size [0],
fd_start[naggs-1],
fd_end [naggs-1],
fd_size [naggs-1] );
* A better (i.e. more general) approach would be to use the list of file
* domains only. This would be slower in the case where the
* original ceiling division was used, but it would allow for arbitrary
* distributions of regions to aggregators. We'd need to know the
* nprocs_for_coll in that case though, which we don't have now.
*
*fd_size_ptr = fd_size[0];
*min_st_offset_ptr = min_st_offset;
ADIOI_Free (fd_size);
}
* Note a significant difference between this function and Rajeev's old code:
* this code doesn't necessarily return a rank in the range
* 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a
* result of the rank mapping; any set of ranks in the communicator could be
* used now.
*
* Returns an integer representing a rank in the collective I/O communicator.
*
* The "len" parameter is also modified to indicate the amount of data
* actually available in this file domain.
*/
/*
* Look up myrank in the aggregator rank list (fd->hints->ranklist).
* Returns the position of the first matching entry among the first
* fd->hints->cb_nodes entries, i.e. this process's index in the aggrs list,
* or -1 when myrank is not listed (the process is not an I/O aggregator).
*/
int ADIOI_BGL_Aggrs_index( ADIO_File fd, int myrank )
{
    int idx = 0;

    /* linear scan; the bound is re-read from the hints on every pass */
    while (idx < fd->hints->cb_nodes) {
        if (fd->hints->ranklist[idx] == myrank) {
            return idx;
        }
        idx++;
    }
    return -1;
}
/*
* This is a more general aggregator search function that does not rely on the assumption
* that every aggregator hosts a file domain of the same size
@ -574,6 +356,21 @@ int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
/* binary search --> rank_index is returned */
int ub = fd->hints->cb_nodes;
int lb = 0;
/* get an index into our array of aggregators */
/* Common code for striping - bgl doesn't use it but it's
here to make diff'ing easier.
rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);
if (fd->hints->striping_unit > 0) {
* wkliao: implementation for file domain alignment
fd_start[] and fd_end[] have been aligned with file lock
boundaries when returned from ADIOI_Calc_file_domains() so cannot
just use simple arithmetic as above *
rank_index = 0;
while (off > fd_end[rank_index]) rank_index++;
}
bgl does its own striping below
*/
rank_index = fd->hints->cb_nodes / 2;
while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
if ( off > fd_end [rank_index] ) {
@ -586,8 +383,15 @@ int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
rank_index = (rank_index + lb) / 2;
}
}
// printf ("ADIOI_BGL_Calc_aggregator: rank_index = %d\n", rank_index );
/* we index into fd_end with rank_index, and fd_end was allocated to be no
* bigger than fd->hints->cb_nodes. If we ever violate that, we're
* overrunning arrays. Obviously, we should never ever hit this abort */
if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
rank_index,fd->hints->cb_nodes,fd_size,off);
MPI_Abort(MPI_COMM_WORLD, 1);
}
// DBG_FPRINTF ("ADIOI_BGL_Calc_aggregator: rank_index = %d\n", rank_index );
/*
* remember here that even in Rajeev's original code it was the case that
@ -611,16 +415,161 @@ int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
return rank;
}
/*
* Compute a dynamic, access-range-based file domain partition among the I/O
* aggregators, aligned to the GPFS block size.
*
* Divide the I/O workload among "nprocs_for_coll" processes. This is
* done by (logically) dividing the file into file domains (FDs); each
* process may directly access only its own file domain.
* Additional effort is made to ensure that each I/O aggregator gets
* a file domain that aligns to the GPFS block size, so there will
* not be any false sharing of GPFS file blocks among multiple I/O nodes.
*
* The common version of this now accepts a min_fd_size and striping_unit.
* It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
* (e.g. we could pass striping unit instead of using fs_ptr->blksize).
*
* Parameters:
*   st_offsets, end_offsets   [in]  per-process start/end offsets; entries
*                                   0..nprocs-1 are read
*   nprocs                    [in]  number of entries in the two arrays above
*   nprocs_for_coll           [in]  number of aggregators, i.e. number of file
*                                   domains produced
*   min_st_offset_ptr         [out] minimum of st_offsets[]
*   fd_start_ptr, fd_end_ptr  [out] arrays of nprocs_for_coll entries allocated
*                                   here with ADIOI_Malloc and not freed here
*                                   (presumably released by the caller - confirm)
*   fd_size_ptr               [out] size of the FIRST file domain only
*   fs_ptr                    [in]  if non-NULL it is read as an ADIOI_BGL_fs; a
*                                   non-zero blksize replaces the 1M default
*/
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr,
void *fs_ptr)
{
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
int i, aggr;
/* profiling hook: event 5004 is logged on entry, 5005 just before returning */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5004, 0, NULL);
#endif
/* myname exists only in debug builds; every use below is under the same guard */
# if AGG_DEBUG
static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
myname,__LINE__,nprocs_for_coll);
# endif
__blksize_t blksize = 1048576; /* default to 1M */
if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;
# if AGG_DEBUG
DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
# endif
/* find min of start offsets and max of end offsets of all processes */
min_st_offset = st_offsets [0];
max_end_offset = end_offsets[0];
for (i=1; i<nprocs; i++) {
min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
}
// DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset );
/* determine the "file domain (FD)" of each process, i.e., the portion of
the file that will be "owned" by each process */
/* gpfs_ub: max_end_offset rounded up to a multiple of blksize, minus 1.
 * NOTE(review): when max_end_offset is itself a multiple of blksize this
 * yields max_end_offset - 1, so gpfs_ub_rdoff below becomes -1 and the last
 * domain is grown by one byte rather than trimmed - confirm this is intended. */
ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1;
/* gpfs_lb: min_st_offset rounded down to a multiple of blksize */
ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
/* round-off distances: gpfs_ub - max_end_offset and min_st_offset - gpfs_lb */
ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
/* size of the rounded range; always a whole number of blocks */
ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
int naggs = nprocs_for_coll;
/* Tweak the file domains so that no fd is smaller than a threshold. We
* have to strike a balance between efficiency and parallelism: somewhere
* between 10k processes sending 32-byte requests and one process sending a
* 320k request is a (system-dependent) sweet spot
This is from the common code - the new min_fd_size parm that we didn't implement.
(And common code uses a different declaration of fd_size so beware)
if (fd_size < min_fd_size)
fd_size = min_fd_size;
*/
/* fd_size is scratch space (freed before returning); the other two arrays
 * are handed back through fd_start_ptr / fd_end_ptr */
fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
fd_start = *fd_start_ptr;
fd_end = *fd_end_ptr;
/* Distribute whole blocks: every aggregator gets nb_cn_small blocks, and the
 * naggs_large blocks left over by the integer division go one each to the
 * LAST naggs_large aggregators (indices >= naggs_small). */
ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
ADIO_Offset naggs_small = naggs - naggs_large;
for (i=0; i<naggs; i++)
if (i < naggs_small) fd_size[i] = nb_cn_small * blksize;
else fd_size[i] = (nb_cn_small+1) * blksize;
# if AGG_DEBUG
DBG_FPRINTF(stderr,"%s(%d): "
"gpfs_ub %llu, "
"gpfs_lb %llu, "
"gpfs_ub_rdoff %llu, "
"gpfs_lb_rdoff %llu, "
"fd_gpfs_range %llu, "
"n_gpfs_blk %llu, "
"nb_cn_small %llu, "
"naggs_large %llu, "
"naggs_small %llu, "
"\n",
myname,__LINE__,
gpfs_ub ,
gpfs_lb ,
gpfs_ub_rdoff,
gpfs_lb_rdoff,
fd_gpfs_range,
n_gpfs_blk ,
nb_cn_small ,
naggs_large ,
naggs_small
);
# endif
/* Remove the round-off from the two outer domains so that the partition
 * starts at min_st_offset and ends at max_end_offset instead of at the
 * block-rounded bounds. */
fd_size[0] -= gpfs_lb_rdoff;
fd_size[naggs-1] -= gpfs_ub_rdoff;
/* compute the file domain for each aggr */
/* domains are laid out back to back: each one starts where the previous ended */
ADIO_Offset offset = min_st_offset;
for (aggr=0; aggr<naggs; aggr++) {
fd_start[aggr] = offset;
fd_end [aggr] = offset + fd_size[aggr] - 1;
offset += fd_size[aggr];
}
/* only the first domain's size is reported; the domains are not all the same size */
*fd_size_ptr = fd_size[0];
*min_st_offset_ptr = min_st_offset;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5005, 0, NULL);
#endif
ADIOI_Free (fd_size);
}
/*
* Look up myrank in the aggregator rank list (fd->hints->ranklist).
* Returns the position of the first matching entry among the first
* fd->hints->cb_nodes entries, i.e. this process's index in the aggrs list,
* or -1 when myrank is not listed (the process is not an I/O aggregator).
*/
int ADIOI_BGL_Aggrs_index( ADIO_File fd, int myrank )
{
    int idx = 0;

    /* linear scan; the bound is re-read from the hints on every pass */
    while (idx < fd->hints->cb_nodes) {
        if (fd->hints->ranklist[idx] == myrank) {
            return idx;
        }
        idx++;
    }
    return -1;
}
/*
* ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req because the default
* implementation is specific to static file domain partitioning.
*
* ADIOI_Calc_my_req() calculate what portions of the access requests
* ADIOI_Calc_my_req() - calculate what portions of the access requests
* of this process are located in the file domains of various processes
* (including this one)
*/
void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list,
void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size,
@ -629,12 +578,17 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int **buf_idx_ptr)
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets?
They are used as memory buffer indices so it seems like the 2G limit is in effect */
{
int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
int i, l, proc;
ADIO_Offset fd_len, rem_len, curr_idx, off;
ADIOI_Access *my_req;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5024, 0, NULL);
#endif
*count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
count_my_req_per_proc = *count_my_req_per_proc_ptr;
@ -656,10 +610,10 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
* contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
*/
for (i=0; i < contig_access_count; i++) {
/* When there is no data being processed, bypass this loop */
if (len_list[i] == 0) continue;
/* short circuit offset/len processing if len == 0
* (zero-byte read/write) */
if (len_list[i] == 0)
continue;
off = offset_list[i];
fd_len = len_list[i];
/* note: we set fd_len to be the total size of the access. then
@ -710,20 +664,24 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
/* now fill in my_req */
curr_idx = 0;
for (i=0; i<contig_access_count; i++) {
/* When there is no data being processed, bypass this loop */
if (len_list[i] == 0) continue;
/* short circuit offset/len processing if len == 0
* (zero-byte read/write) */
if (len_list[i] == 0)
continue;
off = offset_list[i];
fd_len = len_list[i];
proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
fd_start, fd_end);
/* for each separate contiguous access from this process */
if (buf_idx[proc] == -1) buf_idx[proc] = (int) curr_idx;
if (buf_idx[proc] == -1)
{
ADIOI_Assert(curr_idx == (int) curr_idx);
buf_idx[proc] = (int) curr_idx;
}
l = my_req[proc].count;
curr_idx += (int) fd_len; /* NOTE: Why is curr_idx an int? Fix? */
curr_idx += fd_len;
rem_len = len_list[i] - fd_len;
@ -733,6 +691,7 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
* and the associated count.
*/
my_req[proc].offsets[l] = off;
ADIOI_Assert(fd_len == (int) fd_len);
my_req[proc].lens[l] = (int) fd_len;
my_req[proc].count++;
@ -742,13 +701,18 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len,
fd_size, fd_start, fd_end);
if (buf_idx[proc] == -1) buf_idx[proc] = (int) curr_idx;
if (buf_idx[proc] == -1)
{
ADIOI_Assert(curr_idx == (int) curr_idx);
buf_idx[proc] = (int) curr_idx;
}
l = my_req[proc].count;
curr_idx += fd_len;
rem_len -= fd_len;
my_req[proc].offsets[l] = off;
ADIOI_Assert(fd_len == (int) fd_len);
my_req[proc].lens[l] = (int) fd_len;
my_req[proc].count++;
}
@ -757,27 +721,26 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
#ifdef AGG_DEBUG
for (i=0; i<nprocs; i++) {
if (count_my_req_per_proc[i] > 0) {
FPRINTF(stdout, "data needed from %d (count = %d):\n", i,
DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
my_req[i].count);
for (l=0; l < my_req[i].count; l++) {
FPRINTF(stdout, " off[%d] = %Ld, len[%d] = %d\n", l,
DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %d\n", l,
my_req[i].offsets[l], l, my_req[i].lens[l]);
}
}
DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
}
#if 0
for (i=0; i<nprocs; i++) {
FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
}
#endif
#endif
*count_my_req_procs_ptr = count_my_req_procs;
*buf_idx_ptr = buf_idx;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5025, 0, NULL);
#endif
}
/*
* ADIOI_Calc_others_req
* ADIOI_Calc_others_req (copied to bgl and switched to all to all for performance)
*
* param[in] count_my_req_procs Number of processes whose file domain my
* request touches.
@ -826,7 +789,9 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
*recvBufForLens =(void*)0xFFFFFFFF;
/* first find out how much to send/recv and from/to whom */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5026, 0, NULL);
#endif
/* Send 1 int to each process. count_my_req_per_proc[i] is the number of
* requests that my process will do to the file domain owned by process[i].
* Receive 1 int from each process. count_others_req_per_proc[i] is the number of
@ -866,9 +831,9 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
others_req[i].lens = (int *)
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int));
if ( (unsigned)others_req[i].offsets < (unsigned)recvBufForOffsets )
if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
recvBufForOffsets = others_req[i].offsets;
if ( (unsigned)others_req[i].lens < (unsigned)recvBufForLens )
if ( (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens )
recvBufForLens = others_req[i].lens;
others_req[i].mem_ptrs = (MPI_Aint *)
@ -883,6 +848,9 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
others_req[i].lens = NULL;
}
}
/* If no recv buffer was allocated in the loop above, make it NULL */
if ( recvBufForOffsets == (void*)0xFFFFFFFF) recvBufForOffsets = NULL;
if ( recvBufForLens == (void*)0xFFFFFFFF) recvBufForLens = NULL;
/* Now send the calculated offsets and lengths to respective processes */
@ -894,14 +862,18 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
for (i=0; i<nprocs; i++)
{
if ( (my_req[i].count) &&
((unsigned)my_req[i].offsets <= (unsigned)sendBufForOffsets) )
((MPIR_Upint)my_req[i].offsets <= (MPIR_Upint)sendBufForOffsets) )
sendBufForOffsets = my_req[i].offsets;
if ( (my_req[i].count) &&
((unsigned)my_req[i].lens <= (unsigned)sendBufForLens) )
((MPIR_Upint)my_req[i].lens <= (MPIR_Upint)sendBufForLens) )
sendBufForLens = my_req[i].lens;
}
/* If no send buffer was found in the loop above, make it NULL */
if ( sendBufForOffsets == (void*)0xFFFFFFFF) sendBufForOffsets = NULL;
if ( sendBufForLens == (void*)0xFFFFFFFF) sendBufForLens = NULL;
/* Calculate the displacements from the sendBufForOffsets/Lens */
for (i=0; i<nprocs; i++)
{
@ -910,16 +882,20 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
if ( scounts[i] == 0 )
sdispls[i] = 0;
else
sdispls[i] = ( (unsigned)my_req[i].offsets -
(unsigned)sendBufForOffsets ) / sizeof(ADIO_Offset);
sdispls[i] = (int)
( ( (MPIR_Upint)my_req[i].offsets -
(MPIR_Upint)sendBufForOffsets ) /
(MPIR_Upint)sizeof(ADIO_Offset) );
// Receive these offsets from process i.
rcounts[i] = count_others_req_per_proc[i];
if ( rcounts[i] == 0 )
rdispls[i] = 0;
else
rdispls[i] = ( (unsigned)others_req[i].offsets -
(unsigned)recvBufForOffsets ) / sizeof(ADIO_Offset);
rdispls[i] = (int)
( ( (MPIR_Upint)others_req[i].offsets -
(MPIR_Upint)recvBufForOffsets ) /
(MPIR_Upint)sizeof(ADIO_Offset) );
}
/* Exchange the offsets */
@ -940,16 +916,20 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
if ( scounts[i] == 0 )
sdispls[i] = 0;
else
sdispls[i] = ( (unsigned)my_req[i].lens -
(unsigned)sendBufForLens ) / sizeof(int);
sdispls[i] = (int)
( ( (MPIR_Upint)my_req[i].lens -
(MPIR_Upint)sendBufForLens ) /
(MPIR_Upint) sizeof(int) );
// Receive these offsets from process i.
rcounts[i] = count_others_req_per_proc[i];
if ( rcounts[i] == 0 )
rdispls[i] = 0;
else
rdispls[i] = ( (unsigned)others_req[i].lens -
(unsigned)recvBufForLens ) / sizeof(int);
rdispls[i] = (int)
( ( (MPIR_Upint)others_req[i].lens -
(MPIR_Upint)recvBufForLens ) /
(MPIR_Upint) sizeof(int) );
}
/* Exchange the lengths */
@ -967,4 +947,7 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
ADIOI_Free (rdispls);
*count_others_req_procs_ptr = count_others_req_procs;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5027, 0, NULL);
#endif
}

Просмотреть файл

@ -1,5 +1,6 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_aggrs.h
* \brief ???
@ -22,13 +23,22 @@
#include "adio.h"
#include <sys/stat.h>
extern int *aggrsInPset; /* defined in ad_bgl_aggrs.c */
#if !defined(GPFS_SUPER_MAGIC)
#define GPFS_SUPER_MAGIC (0x47504653)
#endif
#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif
/* Bit flags kept in ADIOI_BGL_fs.fsync_aggr ("fsync aggregation" state) */
#define ADIOI_BGL_FSYNC_AGGREGATION_DISABLED 0x00
#define ADIOI_BGL_FSYNC_AGGREGATION_ENABLED 0x01
#define ADIOI_BGL_FSYNC_AGGREGATOR 0x10 /* This rank is an aggregator */

/* BGL-specific per-file information.  One instance is attached to the
 * ADIOI_FileD file descriptor through fd->fs_ptr when the file is opened. */
typedef struct ADIOI_BGL_fs_s {
    __blksize_t blksize;    /* block size recorded for the file at open */
    int         fsync_aggr; /* OR of the ADIOI_BGL_FSYNC_* flags above */
} ADIOI_BGL_fs;
/* generate a list of I/O aggregators that utilizes BGL-PSET organization. */
@ -60,7 +70,7 @@
/* overriding ADIOI_Calc_my_req for the default implementation is specific for
static file domain partitioning */
void ADIOI_BGL_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, int *len_list,
void ADIOI_BGL_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size,

Просмотреть файл

@ -1,7 +1,8 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_open.c
* \file ad_bgl_close.c
* \brief ???
*/

Просмотреть файл

@ -1,5 +1,6 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_fcntl.c
* \brief ???

Просмотреть файл

@ -0,0 +1,90 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_flush.c
* \brief Scalable flush based on underlying filesystem and psets
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "ad_bgl_aggrs.h"
/*
 * ADIOI_BGL_Flush - flush the file's data with fsync().
 *
 * fd         - open ADIO file; fd->fs_ptr must point to its ADIOI_BGL_fs
 * error_code - set to MPI_SUCCESS, or to an MPI_ERR_IO class error code
 *              built from strerror(errno) when the fsync failed
 *
 * When fsync aggregation is enabled for the file, the call is collective
 * over fd->comm: only the ranks flagged as fsync aggregators call fsync()
 * and the resulting errno (0 on success) is shared with every rank through
 * an allreduce.  Otherwise every rank simply calls fsync() itself.
 */
void ADIOI_BGL_Flush(ADIO_File fd, int *error_code)
{
    int err=0;
    static char myname[] = "ADIOI_BGL_FLUSH";

    if(((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATION_ENABLED)
    {
        int rank;

        /* Barrier so we can collectively do fewer fsync's */
        MPI_Barrier(fd->comm);
        MPI_Comm_rank(fd->comm, &rank);

        /* All ranks marked as "fsync aggregators" should fsync.
           (We currently only do one fsync on rank 0 but this is general
           enough to support >1 aggregator using allreduce to get the
           results instead of simply bcast'ing the results from rank 0.)*/
        if(((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATOR)
        {
            int saved_errno;

            err = fsync(fd->fd_sys);
            /* Capture errno right away: the logging call below is allowed
             * to overwrite it. */
            saved_errno = errno;
            DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, saved_errno);
            /* We want errno, not the return code if it failed */
            if (err == -1) err = saved_errno;
            else err = 0;
        }

        /* Just pick an errno (using unsigned MPI_MAX) from any failures */
        MPI_Allreduce( MPI_IN_PLACE, (unsigned*)&err, 1, MPI_UNSIGNED, MPI_MAX, fd->comm);
        DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);

        if (err) /* if it's non-zero, it must be an errno */
        {
            errno = err;
            err = -1;
        }
    }
    else /* Non-aggregated fsync */
    {
        int saved_errno;
#ifdef USE_DBG_LOGGING
        int rank;
#endif
        err = fsync(fd->fd_sys);
        /* Remember fsync's errno before any logging/MPI call can change it */
        saved_errno = errno;
#ifdef USE_DBG_LOGGING
        MPI_Comm_rank(fd->comm, &rank);

        if(rank == 0)
        {
            DBG_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, saved_errno);
        }
        else
        {
            DBGV_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, saved_errno);
        }
#endif
        /* Make sure the error report below describes the fsync failure */
        if (err == -1) errno = saved_errno;
    }

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1)
    {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(errno));
        DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -38,8 +38,8 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
MPI_Info info;
char *value;
int flag, intval, tmp_val, nprocs, nprocs_is_valid = 0;
static char myname[] = "ADIOI_GEN_SETINFO";
int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0;
static char myname[] = "ADIOI_BGL_SETINFO";
int did_anything = 0;
@ -61,15 +61,15 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
did_anything = 1;
/* buffer size for collective I/O */
MPI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
/* default is to let romio automatically decide when to use
* collective buffering
*/
MPI_Info_set(info, "romio_cb_read", "enable");
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->cb_read = ADIOI_HINT_ENABLE;
MPI_Info_set(info, "romio_cb_write", "enable");
ADIOI_Info_set(info, "romio_cb_write", "enable");
fd->hints->cb_write = ADIOI_HINT_ENABLE;
if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
@ -78,30 +78,54 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* number of processes that perform I/O in collective I/O */
MPI_Comm_size(fd->comm, &nprocs);
nprocs_is_valid = 1;
sprintf(value, "%d", nprocs);
MPI_Info_set(info, "cb_nodes", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
ADIOI_Info_set(info, "cb_nodes", value);
fd->hints->cb_nodes = -1;
/* hint indicating that no indep. I/O will be performed on this file */
MPI_Info_set(info, "romio_no_indep_rw", "false");
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
/* deferred_open derrived from no_indep_rw and cb_{read,write} */
/* bgl is not implementing file realms (ADIOI_IOStridedColl),
so initialize the related hints to disabled. */
/* hint instructing the use of persistent file realms */
ADIOI_Info_set(info, "romio_cb_pfr", "disable");
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
/* hint guiding the assignment of persistent file realms */
ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
fd->hints->cb_fr_type = ADIOI_FR_AAR;
/* hint to align file realms with a certain byte value */
ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
fd->hints->cb_fr_alignment = 1;
/* hint to set a threshold percentage for a datatype's size/extent at
* which data sieving should be done in collective I/O */
ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
fd->hints->cb_ds_threshold = 0;
/* hint to switch between point-to-point or all-to-all for two-phase */
ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
/* deferred_open derived from no_indep_rw and cb_{read,write} */
fd->hints->deferred_open = 0;
/* buffer size for data sieving in independent reads */
MPI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
/* buffer size for data sieving in independent writes */
MPI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
if(fd->file_system == ADIO_UFS)
{
/* default for ufs/pvfs is to disable data sieving */
MPI_Info_set(info, "romio_ds_read", "disable");
ADIOI_Info_set(info, "romio_ds_read", "disable");
fd->hints->ds_read = ADIOI_HINT_DISABLE;
MPI_Info_set(info, "romio_ds_write", "disable");
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
else
@ -109,18 +133,23 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* default is to let romio automatically decide when to use data
* sieving
*/
MPI_Info_set(info, "romio_ds_read", "automatic");
ADIOI_Info_set(info, "romio_ds_read", "automatic");
fd->hints->ds_read = ADIOI_HINT_AUTO;
MPI_Info_set(info, "romio_ds_write", "automatic");
ADIOI_Info_set(info, "romio_ds_write", "automatic");
fd->hints->ds_write = ADIOI_HINT_AUTO;
}
/* still to do: tune this a bit for a variety of file systems. there's
* no good default value so just leave it unset */
fd->hints->min_fdomain_size = 0;
fd->hints->striping_unit = 0;
fd->hints->initialized = 1;
}
/* add in user's info if supplied */
if (users_info != MPI_INFO_NULL) {
MPI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
@ -135,30 +164,106 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
}
/* --END ERROR HANDLING-- */
MPI_Info_set(info, "cb_buffer_size", value);
ADIOI_Info_set(info, "cb_buffer_size", value);
fd->hints->cb_buffer_size = intval;
}
#if 0
/* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
/* aligning file realms to certain sizes (e.g. stripe sizes)
* may benefit I/O performance */
ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_fr_alignment",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_fr_alignment", value);
fd->hints->cb_fr_alignment = intval;
}
/* for collective I/O, try to be smarter about when to do data sieving
* using a specific threshold for the datatype size/extent
* (percentage 0-100%) */
ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_ds_threshold",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_ds_threshold", value);
fd->hints->cb_ds_threshold = intval;
}
ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_alltoall;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_alltoall) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_alltoall",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
#endif
/* new hints for enabling/disabling coll. buffering on
* reads/writes
*/
MPI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value, &flag);
ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
MPI_Info_set(info, "romio_cb_read", value);
ADIOI_Info_set(info, "romio_cb_read", value);
fd->hints->cb_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
/* romio_cb_read overrides no_indep_rw */
MPI_Info_set(info, "romio_cb_read", value);
MPI_Info_set(info, "romio_no_indep_rw", "false");
ADIOI_Info_set(info, "romio_cb_read", value);
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->cb_read = ADIOI_HINT_DISABLE;
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
MPI_Info_set(info, "romio_cb_read", value);
ADIOI_Info_set(info, "romio_cb_read", value);
fd->hints->cb_read = ADIOI_HINT_AUTO;
}
@ -174,24 +279,25 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
}
/* --END ERROR HANDLING-- */
}
MPI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value, &flag);
ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
MPI_Info_set(info, "romio_cb_write", value);
ADIOI_Info_set(info, "romio_cb_write", value);
fd->hints->cb_write = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE"))
{
/* romio_cb_write overrides no_indep_rw, too */
MPI_Info_set(info, "romio_cb_write", value);
MPI_Info_set(info, "romio_no_indep_rw", "false");
ADIOI_Info_set(info, "romio_cb_write", value);
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->cb_write = ADIOI_HINT_DISABLE;
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") ||
!strcmp(value, "AUTOMATIC"))
{
MPI_Info_set(info, "romio_cb_write", value);
ADIOI_Info_set(info, "romio_cb_write", value);
fd->hints->cb_write = ADIOI_HINT_AUTO;
}
@ -208,23 +314,81 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* --END ERROR HANDLING-- */
}
#if 0
/* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
/* enable/disable persistent file realms for collective I/O */
/* may want to check for no_indep_rdwr hint as well */
ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_pfr;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_pfr) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_pfr",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
/* file realm assignment types ADIOI_FR_AAR(0),
ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
a regular fr size in bytes. probably not the best way... */
ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) >= -2)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_fr_type",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_fr_type", value);
fd->hints->cb_fr_type = intval;
}
#endif
/* new hint for specifying no indep. read/write will be performed */
MPI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value, &flag);
ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "true") || !strcmp(value, "TRUE")) {
/* if 'no_indep_rw' set, also hint that we will do
* collective buffering: if we aren't doing independent io,
* then we have to do collective */
MPI_Info_set(info, "romio_no_indep_rw", value);
MPI_Info_set(info, "romio_cb_write", "enable");
MPI_Info_set(info, "romio_cb_read", "enable");
ADIOI_Info_set(info, "romio_no_indep_rw", value);
ADIOI_Info_set(info, "romio_cb_write", "enable");
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->no_indep_rw = 1;
fd->hints->cb_read = 1;
fd->hints->cb_write = 1;
tmp_val = 1;
}
else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) {
MPI_Info_set(info, "romio_no_indep_rw", value);
ADIOI_Info_set(info, "romio_no_indep_rw", value);
fd->hints->no_indep_rw = 0;
tmp_val = 0;
}
@ -246,64 +410,80 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* new hints for enabling/disabling data sieving on
* reads/writes
*/
MPI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value,
ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
MPI_Info_set(info, "romio_ds_read", value);
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
MPI_Info_set(info, "romio_ds_read", value);
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
MPI_Info_set(info, "romio_ds_read", value);
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_AUTO;
}
/* otherwise ignore */
}
MPI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value,
ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
MPI_Info_set(info, "romio_ds_write", value);
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
MPI_Info_set(info, "romio_ds_write", value);
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
MPI_Info_set(info, "romio_ds_write", value);
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_AUTO;
}
/* otherwise ignore */
}
MPI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
MPI_Info_set(info, "ind_wr_buffer_size", value);
ADIOI_Info_set(info, "ind_wr_buffer_size", value);
fd->hints->ind_wr_buffer_size = intval;
}
MPI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
MPI_Info_set(info, "ind_rd_buffer_size", value);
ADIOI_Info_set(info, "ind_rd_buffer_size", value);
fd->hints->ind_rd_buffer_size = intval;
}
memset( value, 0, MPI_MAX_INFO_VAL+1 );
MPI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "romio_min_fdomain_size", value);
fd->hints->min_fdomain_size = intval;
}
/* Now we use striping unit in common code so we should
process hints for it. */
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "striping_unit", value);
fd->hints->striping_unit = intval;
}
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
did_anything = 1;
MPI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value);
ADIOI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value);
fd->hints->cb_nodes = intval;
}
}
@ -312,24 +492,30 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (did_anything) {
ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes);
}
/* deferred_open won't be set by callers, but if the user doesn't
* explicitly disable collecitve buffering (two-phase) and does hint that
* io w/o independent io is going on, we'll set this internal hint as a
* convenience */
if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE)
&& (fd->hints->cb_write != ADIOI_HINT_DISABLE)
&& fd->hints->no_indep_rw ) )
{
fd->hints->deferred_open = 1;
} else {
/* setting romio_no_indep_rw enable and romio_cb_{read,write}
* disable at the same time doesn't make sense. honor
* romio_cb_{read,write} and force the no_indep_rw hint to
* 'disable' */
MPI_Info_set(info, "romio_no_indep_rw", "false");
/* ignore deferred open hints and do not enable it for bluegene: need all
* processors in the open path so we can stat-and-broadcast the blocksize
*/
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
fd->hints->deferred_open = 0;
/* BobC commented this out, but since hint processing runs on both bgl and
* bglockless, we need to keep DS writes enabled on gpfs and disabled on
* PVFS */
if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
/* disable data sieving for fs that do not
support file locking */
ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
/* get rid of this value if it is set */
ADIOI_Info_delete(info, "ind_wr_buffer_size");
}
/* note: leave ind_wr_buffer_size alone; used for other cases
* as well. -- Rob Ross, 04/22/2003
*/
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
ADIOI_Free(value);

Просмотреть файл

@ -15,6 +15,181 @@
#include "ad_bgl.h"
#include "ad_bgl_aggrs.h"
#include <sys/statfs.h>
#include <sys/vfs.h>
/* COPIED FROM ad_fstype.c since it is static in that file
ADIO_FileSysType_parentdir - determines a string pathname for the
parent directory of a given filename.
Input Parameters:
. filename - pointer to file name character array
Output Parameters:
. dirnamep - pointer to location in which to store a pointer to a string
Note that the caller should free the memory located at the pointer returned
after the string is no longer needed.
*/
#ifndef PATH_MAX
#define PATH_MAX 65535
#endif
/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that
here. We assume that S_ISLNK is *always* defined as a macro. If
that is not universally true, then add a test to the romio
configure that tries to link a program that references S_ISLNK */
#if !defined(S_ISLNK)
# if defined(S_IFLNK)
/* Check for the link bit */
# define S_ISLNK(mode) ((mode) & S_IFLNK)
# else
/* no way to check if it is a link, so say false */
# define S_ISLNK(mode) 0
# endif
#endif /* !(S_ISLNK) */
/* ADIO_FileSysType_parentdir
 *
 * Computes the parent directory of 'filename'.  If 'filename' is a
 * symbolic link, the parent directory of the link's target text is used
 * instead.  A name without any '/' yields ".", and a name directly under
 * the root yields "/".
 *
 * Input:
 *   filename - path whose parent directory is wanted
 * Output:
 *   dirnamep - receives a newly allocated string (from ADIOI_Strdup); the
 *              caller owns it and must release it.
 *              NOTE(review): presumably ADIOI_Free is the matching
 *              release call -- confirm against ADIOI_Strdup's allocator.
 */
static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
{
    int err;
    char *dir = NULL, *slash;
    struct stat statbuf;

    err = lstat(filename, &statbuf);

    if (err || (!S_ISLNK(statbuf.st_mode))) {
        /* no such file, or file is not a link; these are the "normal"
         * cases where we can just return the parent directory.
         */
        dir = ADIOI_Strdup(filename);
    }
    else {
        /* filename is a symlink.  we've presumably already tried
         * to stat it and found it to be missing (dangling link),
         * but this code doesn't care if the target is really there
         * or not.
         */
        int namelen;
        char *linkbuf;

        linkbuf = ADIOI_Malloc(PATH_MAX+1);
        /* Ask for at most PATH_MAX bytes so that the terminating NUL
         * written below always fits inside the PATH_MAX+1 byte buffer. */
        namelen = readlink(filename, linkbuf, PATH_MAX);
        if (namelen == -1) {
            /* something strange has happened between the time that
             * we determined that this was a link and the time that
             * we attempted to read it; punt and use the old name.
             */
            dir = ADIOI_Strdup(filename);
        }
        else {
            /* successfully read the link */
            linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */
            dir = ADIOI_Strdup(linkbuf);
        }
        /* the scratch buffer is no longer needed on either path */
        ADIOI_Free(linkbuf);
    }

    slash = strrchr(dir, '/');
    if (!slash) ADIOI_Strncpy(dir, ".", 2);
    else {
        /* keep the leading '/' when the parent is the root directory */
        if (slash == dir) *(dir + 1) = '\0';
        else *slash = '\0';
    }

    *dirnamep = dir;
    return;
}
/*
 * scaleable_stat - have rank 0 stat the file and share the result.
 *
 * Only rank 0 of fd->comm calls stat64()/statfs(); the block size and the
 * file system magic number are then broadcast to every rank.  The block
 * size is stored in the ADIOI_BGL_fs hung off fd->fs_ptr, and fsync
 * aggregation is enabled when the file system is GPFS or PVFS2 (rank 0
 * being the single fsync aggregator).
 *
 * fd->fs_ptr must already point to an initialized ADIOI_BGL_fs: if the
 * stat64() fails, the block size already stored there is kept.
 * Collective over fd->comm (uses MPI_Bcast).
 */
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bgl_stat;
    struct statfs bgl_statfs;
    int rank, rc;
    char * dir;
    long buf[2];

    MPI_Comm_rank(fd->comm, &rank);

    if (rank == 0) {
        /* Get the (real) underlying file system block size */
        rc = stat64(fd->filename, &bgl_stat);
        if (rc >= 0)
        {
            buf[0] = bgl_stat.st_blksize;
            DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                         fd->filename,(long)bgl_stat.st_blksize);
        }
        else
        {
            /* Broadcast the block size that is already recorded for this
             * file rather than an uninitialized value. */
            buf[0] = (long)((ADIOI_BGL_fs*)fd->fs_ptr)->blksize;
            DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                         fd->filename,rc,errno);
        }

        /* Get the (real) underlying file system type so we can
         * plan our fsync scaling strategy */
        rc = statfs(fd->filename,&bgl_statfs);
        if (rc >= 0)
        {
            DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#X\n",
                         fd->filename,(unsigned)bgl_statfs.f_type);
            buf[1] = bgl_statfs.f_type;
        }
        else
        {
            DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
                         fd->filename,rc,errno);
            /* The file itself could not be queried; try its directory */
            ADIO_FileSysType_parentdir(fd->filename, &dir);
            rc = statfs(dir,&bgl_statfs);
            if (rc >= 0)
            {
                DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#X\n",dir,(unsigned)bgl_statfs.f_type);
                buf[1] = bgl_statfs.f_type;
            }
            else
            {
                /* Hmm.  Guess we'll assume the worst-case, that it's not GPFS
                 * or PVFS2 below */
                buf[1] = -1; /* bogus magic number */
                DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
            }
            /* dir was allocated by ADIOI_Strdup, so release it with the
             * ADIO allocator's free routine */
            ADIOI_Free(dir);
        }
    }

    /* now we can broadcast the stat/statfs data to everyone else */
    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->comm);
    bgl_stat.st_blksize = buf[0];
    bgl_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = bgl_stat.st_blksize;

    /* data from statfs */
    if ((bgl_statfs.f_type == GPFS_SUPER_MAGIC) ||
        (bgl_statfs.f_type == PVFS2_SUPER_MAGIC))
    {
        ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BGL_FSYNC_AGGREGATION_ENABLED;

        /* Only one rank is an "fsync aggregator" because only one
         * fsync is needed */
        if (rank == 0)
        {
            ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr |=
                ADIOI_BGL_FSYNC_AGGREGATOR;
            DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
        }
        /* else: aggregation enabled but this rank is not an aggregator */
    }
    /* else: other filesystems default to no fsync aggregation */
}
void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
{
int perm, old_mask, amode;
@ -41,8 +216,14 @@ void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
amode = amode | O_RDWR;
if (fd->access_mode & ADIO_EXCL)
amode = amode | O_EXCL;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
fd->fd_direct = -1;
if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
@ -51,17 +232,28 @@ void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
if(fd->fd_sys != -1)
{
struct stat64 bgl_stat;
int rc = stat64(fd->filename,&bgl_stat);
if (rc >= 0)
{
/* store the blksize in the file system specific storage */
struct statfs bgl_statfs;
char* dir;
int rc;
/* Initialize the ad_bgl file system specific information */
AD_BGL_assert(fd->fs_ptr == NULL);
fd->fs_ptr = (ADIOI_BGL_fs*) ADIOI_Malloc(sizeof(ADIOI_BGL_fs));
((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = bgl_stat.st_blksize;
/* FPRINTF(stderr,"%s(%d):Successful stat '%s'. Blocksize=%ld\n",myname,__LINE__,fd->filename,bgl_stat.st_blksize);*/
}
/* else
FPRINTF(stderr,"%s(%d):Stat '%s' failed with rc=%d, errno=%d\n",myname,__LINE__,fd->filename,rc,errno);*/
((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */
/* default is no fsync aggregation */
((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr =
ADIOI_BGL_FSYNC_AGGREGATION_DISABLED;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
scaleable_stat(fd);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif
}
if (fd->fd_sys == -1) {
@ -112,3 +304,6 @@ void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
}
else *error_code = MPI_SUCCESS;
}
/*
*vim: ts=8 sts=4 sw=4 noexpandtab
*/

Просмотреть файл

@ -8,6 +8,7 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
@ -22,18 +23,25 @@
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define RDCOLL_DEBUG 1
#endif
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
/* prototypes of functions used for collective reads only. */
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
datatype, int nprocs,
int myrank, ADIOI_Access
*others_req, ADIO_Offset *offset_list,
int *len_list, int contig_access_count,
ADIO_Offset *len_list, int contig_access_count,
ADIO_Offset
min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code);
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, ADIO_Offset *offset_list, int
*flat_buf, ADIO_Offset *offset_list, ADIO_Offset
*len_list, int *send_size, int *recv_size,
int *count, int *start_pos,
int *partial_send,
@ -47,7 +55,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
int iter,
MPI_Aint buftype_extent, int *buf_idx);
static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, ADIO_Offset *offset_list, int
*flat_buf, ADIO_Offset *offset_list, ADIO_Offset
*len_list, int *send_size, int *recv_size,
int *count, int *start_pos,
int *partial_send,
@ -62,8 +70,8 @@ static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatl
MPI_Aint buftype_extent, int *buf_idx);
static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **recv_buf, ADIO_Offset
*offset_list, int *len_list,
int *recv_size,
*offset_list, ADIO_Offset *len_list,
unsigned *recv_size,
MPI_Request *requests, MPI_Status *statuses,
int *recd_from_proc, int nprocs,
int contig_access_count,
@ -74,7 +82,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
datatype, int file_ptr_type, ADIO_Offset
offset, ADIO_Offset **offset_list_ptr, int
offset, ADIO_Offset **offset_list_ptr, ADIO_Offset
**len_list_ptr, ADIO_Offset *start_offset_ptr,
ADIO_Offset *end_offset_ptr, int
*contig_access_count_ptr);
@ -99,25 +107,15 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
whose request lies in this process's file domain. */
int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
int contig_access_count, interleave_count = 0, buftype_is_contig;
int contig_access_count=0, interleave_count = 0, buftype_is_contig;
int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
*fd_end = NULL, *end_offsets = NULL;
ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
int ii;
int *len_list = NULL, *buf_idx = NULL;
double io_time = 0., all_time, max_all_time;
double tstep1, max_tstep1;
double tstep1_1, max_tstep1_1;
double tstep1_2, max_tstep1_2;
double tstep1_3, max_tstep1_3;
double tstep2, max_tstep2;
double tstep3, max_tstep3;
double tstep4, max_tstep4;
double sum_sz;
ADIO_Offset *len_list = NULL;
int *buf_idx = NULL;
#if BGL_PROFILE
BGLMPIO_T_CIO_RESET( 0, r )
#endif
@ -126,6 +124,14 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
int bufsize, size;
#endif
#if 0
/* From common code - not implemented for bgl. */
if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype,
file_ptr_type, offset, status, error_code);
return;
} */
#endif
#ifdef PROFILE
MPE_Log_event(13, 0, "start computation");
#endif
@ -157,14 +163,16 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP )
#endif
/* for (i=0; i<contig_access_count; i++) {
FPRINTF(stderr, "rank %d off %ld len %d\n", myrank, offset_list[i],
len_list[i]);
}*/
#ifdef RDCOLL_DEBUG
for (i=0; i<contig_access_count; i++) {
DBG_FPRINTF(stderr, "rank %d off %lld len %lld\n",
myrank, offset_list[i], len_list[i]);
}
#endif
/* each process communicates its start and end offsets to other
processes. The result is an array each of start and end offsets stored
in order of process rank. */
processes. The result is an array each of start and end offsets
stored in order of process rank. */
st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
@ -200,7 +208,9 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
/* are the accesses of different processes interleaved? */
for (i=1; i<nprocs; i++)
if (st_offsets[i] < end_offsets[i-1]) interleave_count++;
if ((st_offsets[i] < end_offsets[i-1]) &&
(st_offsets[i] <= end_offsets[i]))
interleave_count++;
/* This is a rudimentary check for interleaving, but should suffice
for the moment. */
}
@ -223,7 +233,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
if (buftype_is_contig && filetype_is_contig) {
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
off = fd->disp + (fd->etype_size) * offset;
off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
off, status, error_code);
}
@ -263,7 +273,9 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
else
ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
nprocs_for_coll, &min_st_offset,
&fd_start, &fd_end, &fd_size);
&fd_start, &fd_end,
fd->hints->min_fdomain_size, &fd_size,
fd->hints->striping_unit);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
@ -381,205 +393,11 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
fd->fp_sys_posn = -1; /* set it to null. */
}
#if 0
void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
datatype, int file_ptr_type, ADIO_Offset
offset, ADIO_Offset **offset_list_ptr, int
**len_list_ptr, ADIO_Offset *start_offset_ptr,
ADIO_Offset *end_offset_ptr, int
*contig_access_count_ptr)
{
int filetype_size, buftype_size, etype_size;
int i, j, k, frd_size=0, old_frd_size=0, st_index=0;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
int contig_access_count, *len_list, flag, filetype_is_contig;
MPI_Aint filetype_extent, filetype_lb;
ADIOI_Flatlist_node *flat_file;
ADIO_Offset *offset_list, off, end_offset=0, disp;
/* For this process's request, calculate the list of offsets and
lengths in the file and determine the start and end offsets. */
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
MPI_Type_size(fd->filetype, &filetype_size);
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_lb(fd->filetype, &filetype_lb);
MPI_Type_size(datatype, &buftype_size);
etype_size = fd->etype_size;
if ( ! filetype_size ) {
*contig_access_count_ptr = 0;
*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
*len_list_ptr = (int *) ADIOI_Malloc(2*sizeof(int));
/* 2 is for consistency. everywhere I malloc one more than needed */
offset_list = *offset_list_ptr;
len_list = *len_list_ptr;
offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
len_list[0] = 0;
*start_offset_ptr = offset_list[0];
*end_offset_ptr = offset_list[0] + len_list[0] - 1;
return;
}
if (filetype_is_contig) {
*contig_access_count_ptr = 1;
*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
*len_list_ptr = (int *) ADIOI_Malloc(2*sizeof(int));
/* 2 is for consistency. everywhere I malloc one more than needed */
offset_list = *offset_list_ptr;
len_list = *len_list_ptr;
offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
len_list[0] = bufcount * buftype_size;
*start_offset_ptr = offset_list[0];
*end_offset_ptr = offset_list[0] + len_list[0] - 1;
/* update file pointer */
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = *end_offset_ptr + 1;
}
else {
/* First calculate what size of offset_list and len_list to allocate */
/* filetype already flattened in ADIO_Open or ADIO_Fcntl */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent +
flat_file->blocklens[i] >= offset)
{
st_index = i;
frd_size = (int) (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
break;
}
}
}
}
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
frd_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
}
/* calculate how much space to allocate for offset_list, len_list */
old_frd_size = frd_size;
contig_access_count = i = 0;
j = st_index;
bufsize = buftype_size * bufcount;
frd_size = ADIOI_MIN(frd_size, bufsize);
while (i < bufsize) {
if (frd_size) contig_access_count++;
i += frd_size;
j = (j + 1) % flat_file->count;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
}
/* allocate space for offset_list and len_list */
*offset_list_ptr = (ADIO_Offset *)
ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset));
*len_list_ptr = (int *) ADIOI_Malloc((contig_access_count+1)*sizeof(int));
/* +1 to avoid a 0-size malloc */
offset_list = *offset_list_ptr;
len_list = *len_list_ptr;
/* find start offset, end offset, and fill in offset_list and len_list */
*start_offset_ptr = offset; /* calculated above */
i = k = 0;
j = st_index;
off = offset;
frd_size = ADIOI_MIN(old_frd_size, bufsize);
while (i < bufsize) {
if (frd_size) {
offset_list[k] = off;
len_list[k] = frd_size;
k++;
}
i += frd_size;
end_offset = off + frd_size - 1;
/* Note: end_offset points to the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
if (off + frd_size < disp + flat_file->indices[j] +
flat_file->blocklens[j] +
(ADIO_Offset) n_filetypes*filetype_extent)
{
off += frd_size;
/* did not reach end of contiguous block in filetype.
* no more I/O needed. off is incremented by frd_size.
*/
}
else {
if (j < (flat_file->count - 1)) j++;
else {
/* hit end of flattened filetype;
* start at beginning again
*/
j = 0;
n_filetypes++;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
}
}
/* update file pointer */
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
*contig_access_count_ptr = contig_access_count;
*end_offset_ptr = end_offset;
}
}
#endif
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
datatype, int nprocs,
int myrank, ADIOI_Access
*others_req, ADIO_Offset *offset_list,
int *len_list, int contig_access_count, ADIO_Offset
ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code)
@ -594,19 +412,21 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
array from a file, where each local array is 8Mbytes, requiring
at least another 8Mbytes of temp space is unacceptable. */
int i, j, m, size, ntimes, max_ntimes, buftype_is_contig;
int i, j, m, ntimes, max_ntimes, buftype_is_contig;
ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
char *read_buf = NULL, *tmp_buf;
int *curr_offlen_ptr, *count, *send_size, *recv_size;
int *partial_send, *recd_from_proc, *start_pos, for_next_iter;
int real_size, req_len, flag, for_curr_iter, rank;
int *partial_send, *recd_from_proc, *start_pos;
/* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
int req_len, flag, rank;
MPI_Status status;
ADIOI_Flatlist_node *flat_buf=NULL;
MPI_Aint buftype_extent;
int coll_bufsize;
#ifdef RDCOLL_DEBUG
int iii;
#endif
*error_code = MPI_SUCCESS; /* changed below if error */
/* only I/O errors are currently reported */
@ -738,7 +558,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
#ifdef PROFILE
MPE_Log_event(13, 0, "start computation");
#endif
size = (int) (ADIOI_MIN(coll_bufsize, end_loc-st_loc+1-done));
size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);
real_off = off - for_curr_iter;
real_size = size + for_curr_iter;
@ -746,7 +566,9 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
for_next_iter = 0;
for (i=0; i<nprocs; i++) {
/* FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count); */
#ifdef RDCOLL_DEBUG
DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count);
#endif
if (others_req[i].count) {
start_pos[i] = curr_offlen_ptr[i];
for (j=curr_offlen_ptr[i]; j<others_req[i].count;
@ -769,22 +591,22 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
}
if (req_off < real_off + real_size) {
count[i]++;
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
MPI_Address(read_buf+req_off-real_off,
&(others_req[i].mem_ptrs[j]));
send_size[i] += (int)(ADIOI_MIN(real_off + (ADIO_Offset)real_size -
req_off, req_len));
ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
(ADIO_Offset)(unsigned)req_len));
if (real_off+real_size-req_off < req_len) {
partial_send[i] = (int) (real_off+real_size-
req_off);
if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
partial_send[i] = (int) (real_off + real_size - req_off);
if ((j+1 < others_req[i].count) &&
(others_req[i].offsets[j+1] <
real_off+real_size)) {
/* this is the case illustrated in the
figure above. */
for_next_iter = (int) (ADIOI_MAX(for_next_iter,
real_off + real_size -
others_req[i].offsets[j+1]));
for_next_iter = ADIOI_MAX(for_next_iter,
real_off + real_size - others_req[i].offsets[j+1]);
/* max because it must cover requests
from different processes */
}
@ -805,13 +627,14 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
MPE_Log_event(14, 0, "end computation");
#endif
if (flag) {
ADIO_ReadContig(fd, read_buf+for_curr_iter, size, MPI_BYTE,
ADIOI_Assert(size == (int)size);
ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
ADIO_EXPLICIT_OFFSET, off, &status, error_code);
/*
printf( "\tread_coll: 700, data read [%3d] = ", size );
for (iii=0; iii<size; iii++) { printf( "%3d,", *((unsigned char *)read_buf + for_curr_iter + iii) ); }
printf( "\n" );
*/
#ifdef RDCOLL_DEBUG
DBG_FPRINTF(stderr, "\tread_coll: 700, data read [%lld] = ", size );
for (iii=0; iii<size && iii<80; iii++) { DBGV_FPRINTF(stderr, "%3d,", *((unsigned char *)read_buf + for_curr_iter + iii) ); }
DBG_FPRINTF(stderr, "\n" );
#endif
if (*error_code != MPI_SUCCESS) return;
}
@ -849,6 +672,8 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
if (for_next_iter) {
tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
ADIOI_Free(read_buf);
read_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
@ -902,7 +727,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
}
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, ADIO_Offset *offset_list, int
*flat_buf, ADIO_Offset *offset_list, ADIO_Offset
*len_list, int *send_size, int *recv_size,
int *count, int *start_pos, int *partial_send,
int *recd_from_proc, int nprocs,
@ -937,6 +762,10 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
/* post recvs. if buftype_is_contig, data can be directly recd. into
user buf at location given by buf_idx. else use recv_buf. */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5032, 0, NULL);
#endif
if (buftype_is_contig) {
j = 0;
for (i=0; i < nprocs; i++)
@ -960,8 +789,10 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
myrank+i+100*iter, fd->comm, requests+j);
j++;
/* FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
myrank, recv_size[i], myrank+i+100*iter); */
#ifdef RDCOLL_DEBUG
DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
myrank, recv_size[i], myrank+i+100*iter);
#endif
}
}
@ -1006,7 +837,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
/* if noncontiguous, do the copies from the recv buffers */
if (!buftype_is_contig)
ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
offset_list, len_list, recv_size,
offset_list, len_list, (unsigned*)recv_size,
requests, statuses, recd_from_proc,
nprocs, contig_access_count,
min_st_offset, fd_size, fd_start, fd_end,
@ -1024,9 +855,11 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
if (recv_size[i]) ADIOI_Free(recv_buf[i]);
ADIOI_Free(recv_buf);
}
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5033, 0, NULL);
#endif
}
#define ADIOI_BUF_INCR \
{ \
while (buf_incr) { \
@ -1040,7 +873,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
n_buftypes++; \
} \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \
n_buftypes*buftype_extent; \
(ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \
buf_incr -= size_in_buf; \
@ -1052,9 +885,11 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
{ \
while (size) { \
size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)(buf + user_buf_idx)); \
ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
memcpy(((char *) buf) + user_buf_idx, \
&(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \
recv_buf_idx[p] += size_in_buf; \
recv_buf_idx[p] += size_in_buf; /* already tested (size_t)size_in_buf*/ \
user_buf_idx += size_in_buf; \
flat_buf_sz -= size_in_buf; \
if (!flat_buf_sz) { \
@ -1064,7 +899,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
n_buftypes++; \
} \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \
n_buftypes*buftype_extent; \
(ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \
size -= size_in_buf; \
@ -1073,11 +908,10 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
ADIOI_BUF_INCR \
}
static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **recv_buf, ADIO_Offset
*offset_list, int *len_list,
int *recv_size,
*offset_list, ADIO_Offset *len_list,
unsigned *recv_size,
MPI_Request *requests, MPI_Status *statuses,
int *recd_from_proc, int nprocs,
int contig_access_count,
@ -1086,13 +920,18 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
ADIO_Offset *fd_end,
MPI_Aint buftype_extent)
{
/* this function is only called if buftype is not contig */
int i, p, flat_buf_idx, size, buf_incr;
int flat_buf_sz, size_in_buf, n_buftypes;
int i, p, flat_buf_idx;
ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
int n_buftypes;
ADIO_Offset off, len, rem_len, user_buf_idx;
/* Not sure unsigned is necessary, but it makes the math safer */
unsigned *curr_from_proc, *done_from_proc, *recv_buf_idx;
int *curr_from_proc, *done_from_proc, *recv_buf_idx;
ADIOI_UNREFERENCED_ARG(requests);
ADIOI_UNREFERENCED_ARG(statuses);
/* curr_from_proc[p] = amount of data recd from proc. p that has already
been accounted for so far
@ -1100,9 +939,9 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
filled into user buffer in previous iterations
user_buf_idx = current location in user buffer
recv_buf_idx[p] = current location in recv_buf of proc. p */
curr_from_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
done_from_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
recv_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
curr_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
done_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
recv_buf_idx = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
for (i=0; i < nprocs; i++) {
recv_buf_idx[i] = curr_from_proc[i] = 0;
@ -1120,7 +959,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
for (i=0; i<contig_access_count; i++) {
off = offset_list[i];
rem_len = (ADIO_Offset) len_list[i];
rem_len = len_list[i];
/* this request may span the file domains of more than one process */
while (rem_len > 0) {
@ -1140,29 +979,32 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
if (recv_buf_idx[p] < recv_size[p]) {
if (curr_from_proc[p]+len > done_from_proc[p]) {
if (done_from_proc[p] > curr_from_proc[p]) {
size = (int)ADIOI_MIN(curr_from_proc[p] + len -
size = ADIOI_MIN(curr_from_proc[p] + len -
done_from_proc[p], recv_size[p]-recv_buf_idx[p]);
buf_incr = done_from_proc[p] - curr_from_proc[p];
ADIOI_BUF_INCR
buf_incr = (int)(curr_from_proc[p]+len-done_from_proc[p]);
buf_incr = curr_from_proc[p]+len-done_from_proc[p];
ADIOI_Assert((done_from_proc[p] + size) == (unsigned)((ADIO_Offset)done_from_proc[p] + size));
curr_from_proc[p] = done_from_proc[p] + size;
ADIOI_BUF_COPY
}
else {
size = (int)ADIOI_MIN(len,recv_size[p]-recv_buf_idx[p]);
buf_incr = (int)len;
curr_from_proc[p] += size;
size = ADIOI_MIN(len,recv_size[p]-recv_buf_idx[p]);
buf_incr = len;
ADIOI_Assert((curr_from_proc[p] + size) == (unsigned)((ADIO_Offset)curr_from_proc[p] + size));
curr_from_proc[p] += (unsigned) size;
ADIOI_BUF_COPY
}
}
else {
curr_from_proc[p] += (int)len;
buf_incr = (int)len;
ADIOI_Assert((curr_from_proc[p] + len) == (unsigned)((ADIO_Offset)curr_from_proc[p] + len));
curr_from_proc[p] += (unsigned) len;
buf_incr = len;
ADIOI_BUF_INCR
}
}
else {
buf_incr = (int)len;
buf_incr = len;
ADIOI_BUF_INCR
}
off += len;
@ -1179,7 +1021,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
static void ADIOI_R_Exchange_data_alltoallv(
ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, ADIO_Offset *offset_list, int
*flat_buf, ADIO_Offset *offset_list, ADIO_Offset
*len_list, int *send_size, int *recv_size,
int *count, int *start_pos, int *partial_send,
int *recd_from_proc, int nprocs,
@ -1192,9 +1034,8 @@ static void ADIOI_R_Exchange_data_alltoallv(
{
int i, j, k=0, tmp=0, nprocs_recv, nprocs_send;
char **recv_buf = NULL;
MPI_Request *requests;
MPI_Datatype send_type;
MPI_Status *statuses;
MPI_Request *requests=NULL;
MPI_Status *statuses=NULL;
int rtail, stail;
char *sbuf_ptr, *from_ptr;
int len;
@ -1238,7 +1079,8 @@ static void ADIOI_R_Exchange_data_alltoallv(
}
sbuf_ptr = all_send_buf + sdispls[i];
for (j=0; j<count[i]; j++) {
from_ptr = (char *)( others_req[i].mem_ptrs[ start_pos[i]+j ] );
ADIOI_ENSURE_AINT_FITS_IN_PTR( others_req[i].mem_ptrs[ start_pos[i]+j ]);
from_ptr = (char *) ADIOI_AINT_CAST_TO_VOID_PTR ( others_req[i].mem_ptrs[ start_pos[i]+j ] );
len = others_req[i].lens[ start_pos[i]+j ] ;
memcpy( sbuf_ptr, from_ptr, len );
sbuf_ptr += len;
@ -1247,26 +1089,19 @@ static void ADIOI_R_Exchange_data_alltoallv(
}
}
#if 0
printf( "\tsend_size = " );
for (i=0; i<nprocs; i++) { printf( "%2d,", send_size[i] ); }
printf( "\n" );
printf( "\trecv_size = " );
for (i=0; i<nprocs; i++) { printf( "%2d,", recv_size[i] ); }
printf( "\n" );
printf( "\tsdispls = " );
for (i=0; i<nprocs; i++) { printf( "%2d,", sdispls [i] ); }
printf( "\n" );
printf( "\trdispls = " );
for (i=0; i<nprocs; i++) { printf( "%2d,", rdispls [i] ); }
printf( "\n" );
printf( "\ttails = %4d, %4d\n", stail, rtail );
#endif
#if 0
#if RDCOLL_DEBUG
DBG_FPRINTF(stderr, "\tsend_size = [%d]%2d,",0,send_size[0]);
for (i=1; i<nprocs; i++) if(send_size[i-1]!=send_size[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,send_size[i] ); }
DBG_FPRINTF(stderr, "\trecv_size = [%d]%2d,",0,recv_size[0]);
for (i=1; i<nprocs; i++) if(recv_size[i-1]!=recv_size[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,recv_size[i] ); }
DBG_FPRINTF(stderr, "\tsdispls = [%d]%2d,",0,sdispls[0]);
for (i=1; i<nprocs; i++) if(sdispls[i-1]!=sdispls[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,sdispls [i] ); }
DBG_FPRINTF(stderr, "\trdispls = [%d]%2d,",0,rdispls[0]);
for (i=1; i<nprocs; i++) if(rdispls[i-1]!=rdispls[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,rdispls [i] ); }
DBG_FPRINTF(stderr, "\ttails = %4d, %4d\n", stail, rtail );
if (nprocs_send) {
printf( "\tall_send_buf = " );
for (i=0; i<nprocs; i++) { printf( "%2d,", all_send_buf [i*131072] ); }
printf( "\n" );
DBG_FPRINTF(stderr, "\tall_send_buf = [%d]%2d,",0,all_send_buf[0]);
for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf [i*131072] ); }
}
#endif
@ -1277,16 +1112,16 @@ static void ADIOI_R_Exchange_data_alltoallv(
fd->comm );
#if 0
printf( "\tall_recv_buf = " );
for (i=131072; i<131073; i++) { printf( "%2d,", all_recv_buf [i] ); }
printf( "\n" );
DBG_FPRINTF(stderr, "\tall_recv_buf = " );
for (i=131072; i<131073; i++) { DBG_FPRINTF(stderr, "%2d,", all_recv_buf [i] ); }
DBG_FPRINTF(stderr, "\n" );
#endif
/* unpack at the receiver side */
if (nprocs_recv) {
if (!buftype_is_contig)
ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
offset_list, len_list, recv_size,
offset_list, len_list, (unsigned*)recv_size,
requests, statuses, /* never used inside */
recd_from_proc,
nprocs, contig_access_count,

Просмотреть файл

@ -21,9 +21,9 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
int err=-1, datatype_size, len;
int err=-1, datatype_size;
ADIO_Offset len;
static char myname[] = "ADIOI_BGL_READCONTIG";
#if BGL_PROFILE
/* timing */
double io_time, io_time2;
@ -35,7 +35,8 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
#endif
MPI_Type_size(datatype, &datatype_size);
len = datatype_size * count;
len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */
#if BGL_PROFILE
@ -48,7 +49,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
if (bglmpio_timing2) io_time2 = MPI_Wtime();
err = read(fd->fd_sys, buf, len);
err = read(fd->fd_sys, buf, (unsigned int)len);
if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_sys_posn = offset + err;
@ -64,7 +65,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
if (bglmpio_timing2) io_time2 = MPI_Wtime();
err = read(fd->fd_sys, buf, len);
err = read(fd->fd_sys, buf, (unsigned int)len);
if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_ind += err;
@ -79,7 +80,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
if (fd->atomicity)
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
err = read(fd->fd_sys, buf, len);
err = read(fd->fd_sys, buf, (unsigned int)len);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_sys_posn = offset + err;
/* individual file pointer not updated */
@ -91,7 +92,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
if (fd->atomicity)
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
err = read(fd->fd_sys, buf, len);
err = read(fd->fd_sys, buf, (unsigned int)len);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_ind += err;
fd->fp_sys_posn = fd->fp_ind;
@ -120,12 +121,11 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
}
#define ADIOI_BUFFERED_READ \
{ \
if (req_off >= readbuf_off + readbuf_len) { \
readbuf_off = req_off; \
readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
err = read(fd->fd_sys, readbuf, readbuf_len);\
@ -133,6 +133,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
if (err == -1) err_flag = 1; \
} \
while (req_len > readbuf_off + readbuf_len - req_off) { \
ADIOI_Assert((readbuf_off + readbuf_len - req_off) == (int) (readbuf_off + readbuf_len - req_off));\
partial_read = (int) (readbuf_off + readbuf_len - req_off); \
tmp_buf = (char *) ADIOI_Malloc(partial_read); \
memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
@ -141,7 +142,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
memcpy(readbuf, tmp_buf, partial_read); \
ADIOI_Free(tmp_buf); \
readbuf_off += readbuf_len-partial_read; \
readbuf_len = (int) (partial_read + ADIOI_MIN(max_bufsize, \
readbuf_len = (unsigned) (partial_read + ADIOI_MIN(max_bufsize, \
end_offset-readbuf_off+1)); \
lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
@ -149,6 +150,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
if (err == -1) err_flag = 1; \
} \
ADIOI_Assert(req_len == (size_t)req_len); \
memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
}
@ -160,20 +162,23 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
{
/* offset is in units of etype relative to the filetype. */
ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, err=-1, brd_size, frd_size=0, st_index=0;
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset i_offset, new_brd_size, brd_size, size;
int i, j, k, err=-1, st_index=0;
ADIO_Offset frd_size=0, new_frd_size, st_frd_size;
unsigned num, bufsize;
int n_etypes_in_filetype;
ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size, req_len, partial_read;
int filetype_size, etype_size, buftype_size, partial_read;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset userbuf_off;
ADIO_Offset userbuf_off, req_len, sum;
ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
char *readbuf, *tmp_buf, *value;
int flag, st_frd_size, st_n_filetypes, readbuf_len;
int new_brd_size, new_frd_size, err_flag=0, info_flag, max_bufsize;
int err_flag=0, info_flag;
unsigned max_bufsize, readbuf_len;
static char myname[] = "ADIOI_BGL_READSTRIDED";
if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
@ -207,12 +212,13 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
bufsize = buftype_size * count;
/* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag);
max_bufsize = atoi(value);
ADIOI_Free(value);
@ -226,13 +232,13 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
fd->disp + (ADIO_Offset)etype_size * offset;
start_off = off;
end_offset = off + bufsize - 1;
readbuf_off = off;
readbuf = (char *) ADIOI_Malloc(max_bufsize);
readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
/* if atomicity is true, lock (exclusive) the region to be accessed */
if (fd->atomicity)
@ -245,13 +251,16 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
if (err == -1) err_flag = 1;
for (j=0; j<count; j++)
{
int i;
for (i=0; i<flat_buf->count; i++) {
userbuf_off = j*buftype_extent + flat_buf->indices[i];
userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
req_off = off;
req_len = flat_buf->blocklens[i];
ADIOI_BUFFERED_READ
off += flat_buf->blocklens[i];
}
}
if (fd->atomicity)
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
@ -277,29 +286,36 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
/* Wei-keng reworked type processing to be a bit more efficient */
offset = fd->fp_ind - disp;
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
/* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i]
>= offset) {
st_index = i;
frd_size = (int) (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
ADIO_Offset dist;
if (flat_file->blocklens[i] == 0) continue;
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
/* frd_size is from offset to the end of block i */
if (dist == 0) {
i++;
offset = flat_file->indices[i];
frd_size = flat_file->blocklens[i];
break;
}
if (dist > 0) {
frd_size = dist;
break;
}
}
}
st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
}
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
n_filetypes = offset / n_etypes_in_filetype;
etype_in_filetype = offset % n_etypes_in_filetype;
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
@ -315,32 +331,63 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
}
/* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype;
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
}
start_off = offset;
/* Wei-keng Liao: read request is within a single flat_file contig
* block e.g. with subarray types that actually describe the whole
* array */
if (buftype_is_contig && bufsize <= frd_size) {
ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte that
* can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == frd_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
st_frd_size = frd_size;
st_n_filetypes = n_filetypes;
i = 0;
i_offset = 0;
j = st_index;
off = offset;
frd_size = ADIOI_MIN(st_frd_size, bufsize);
while (i < bufsize) {
i += frd_size;
while (i_offset < bufsize) {
i_offset += frd_size;
end_offset = off + frd_size - 1;
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
}
/* if atomicity is true, lock (exclusive) the region to be accessed */
@ -350,7 +397,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
/* initial read into readbuf */
readbuf_off = offset;
readbuf = (char *) ADIOI_Malloc(max_bufsize);
readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
lseek(fd->fd_sys, offset, SEEK_SET);
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
@ -364,12 +411,12 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
i = 0;
i_offset = 0;
j = st_index;
off = offset;
n_filetypes = st_n_filetypes;
frd_size = ADIOI_MIN(st_frd_size, bufsize);
while (i < bufsize) {
while (i_offset < bufsize) {
if (frd_size) {
/* TYPE_UB and TYPE_LB can result in
frd_size = 0. save system call in such cases */
@ -378,25 +425,26 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
req_off = off;
req_len = frd_size;
userbuf_off = i;
userbuf_off = i_offset;
ADIOI_BUFFERED_READ
}
i += frd_size;
i_offset += frd_size;
if (off + frd_size < disp + flat_file->indices[j] +
flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent)
flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
off += frd_size;
/* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by frd_size. */
else {
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
n_filetypes*(ADIO_Offset)filetype_extent;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
}
}
}
@ -408,7 +456,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
k = num = buf_count = 0;
i = (int) (flat_buf->indices[0]);
i_offset = flat_buf->indices[0];
j = st_index;
off = offset;
n_filetypes = st_n_filetypes;
@ -423,7 +471,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
req_off = off;
req_len = size;
userbuf_off = i;
userbuf_off = i_offset;
ADIOI_BUFFERED_READ
}
@ -432,18 +480,19 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
if (size == frd_size) {
/* reached end of contiguous block in file */
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
n_filetypes*(ADIO_Offset)filetype_extent;
new_frd_size = flat_file->blocklens[j];
if (size != brd_size) {
i += size;
i_offset += size;
new_brd_size -= size;
}
}
@ -453,7 +502,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
k = (k + 1)%flat_buf->count;
buf_count++;
i = (int) (buftype_extent*(buf_count/flat_buf->count) +
i_offset = ((ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
flat_buf->indices[k]);
new_brd_size = flat_buf->blocklens[k];
if (size != frd_size) {
@ -461,6 +510,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
new_frd_size -= size;
}
}
ADIOI_Assert(((ADIO_Offset)num + size) == (unsigned)(num + size));
num += size;
frd_size = new_frd_size;
brd_size = new_brd_size;

Просмотреть файл

@ -3,7 +3,13 @@
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_tuning.c
* \brief ???
* \brief defines ad_bgl performance tuning
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*---------------------------------------------------------------------
@ -26,6 +32,40 @@ double bglmpio_prof_cw [BGLMPIO_CIO_LAST];
double bglmpio_prof_cr [BGLMPIO_CIO_LAST];
/* set internal variables for tuning environment variables */
/** \page env_vars Environment Variables
* - BGLMPIO_COMM - Define how data is exchanged on collective
* reads and writes. Possible values:
* - 0 - Use MPI_Alltoallv.
* - 1 - Use MPI_Isend/MPI_Irecv.
* - Default is 0.
*
* - BGLMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
* Must also compile the library with BGL_PROFILE defined. Possible values:
* - 0 - Do not collect/report timing.
* - 1 - Collect/report timing.
* - Default is 0.
*
* - BGLMPIO_TIMING2 - collect additional averages for MPI I/O collective calls.
* Must also compile the library with BGL_PROFILE defined. Possible values:
* - 0 - Do not collect/report averages.
* - 1 - Collect/report averages.
* - Default is 0.
*
* - BGLMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
* for aggregator collective I/O. Possible values:
* - 0 - Use two MPI_Allgather calls to collect starting and ending offsets.
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
* - Default is 1.
*
* - BGLMPIO_TUNEBLOCKING - Tune how aggregate file domains are
* calculated (block size). Possible values:
* - 0 - Evenly calculate file domains across aggregators. Also use
* MPI_Isend/MPI_Irecv to exchange domain information.
* - 1 - Align file domains with the underlying file system's block size. Also use
* MPI_Alltoallv to exchange domain information.
* - Default is 1.
*
*/
void ad_bgl_get_env_vars() {
char *x;

Просмотреть файл

@ -18,6 +18,9 @@
#include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef PROFILE
#include "mpe.h"
#endif
@ -26,13 +29,13 @@
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
datatype, int nprocs, int myrank, ADIOI_Access
*others_req, ADIO_Offset *offset_list,
int *len_list, int contig_access_count, ADIO_Offset
ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code);
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
ADIOI_Flatlist_node *flat_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size,
*offset_list, ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off, int size,
int *count, int *start_pos, int *partial_recv,
int *sent_to_proc, int nprocs,
@ -49,7 +52,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
char *write_buf, /* 1 */
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
int *len_list, int *send_size, int *recv_size,
ADIO_Offset *len_list, int *send_size, int *recv_size,
ADIO_Offset off, int size, /* 2 */
int *count, int *start_pos, int *partial_recv,
int *sent_to_proc, int nprocs, int myrank,
@ -65,7 +68,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
int *error_code);
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size,
*offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc,
int nprocs, int myrank,
int contig_access_count, ADIO_Offset
@ -76,7 +79,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
MPI_Aint buftype_extent);
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size,
*offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc,
int nprocs, int myrank,
int contig_access_count, ADIO_Offset
@ -118,26 +121,27 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
int ii;
int *buf_idx = NULL, *len_list = NULL;
double io_time = 0, all_time, max_all_time;
double tstep1, max_tstep1;
double tstep1_1, max_tstep1_1;
double tstep1_2, max_tstep1_2;
double tstep1_3, max_tstep1_3;
double tstep2, max_tstep2;
double tstep3, max_tstep3;
double tstep4, max_tstep4;
double sum_sz;
int *buf_idx = NULL;
ADIO_Offset *len_list = NULL;
#if BGL_PROFILE
BGLMPIO_T_CIO_RESET( 0, w )
#endif
#if 0
/* From common code - not implemented for bgl.*/
int old_error, tmp_error;
#endif
#ifdef PROFILE
MPE_Log_event(13, 0, "start computation");
#endif
#if 0
/* From common code - not implemented for bgl. */
if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
ADIOI_IOStridedColl (fd, buf, count, ADIOI_WRITE, datatype,
file_ptr_type, offset, status, error_code);
return;
}
#endif
MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &myrank);
@ -207,7 +211,8 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
/* are the accesses of different processes interleaved? */
for (i=1; i<nprocs; i++)
if ((st_offsets[i] < end_offsets[i-1]) &&
(st_offsets[i] <= end_offsets[i])) interleave_count++;
(st_offsets[i] <= end_offsets[i]))
interleave_count++;
/* This is a rudimentary check for interleaving, but should suffice
for the moment. */
}
@ -231,7 +236,7 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
if (buftype_is_contig && filetype_is_contig) {
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
off = fd->disp + (fd->etype_size) * offset;
off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
ADIO_WriteContig(fd, buf, count, datatype,
ADIO_EXPLICIT_OFFSET,
off, status, error_code);
@ -260,7 +265,9 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
else
ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
nprocs_for_coll, &min_st_offset,
&fd_start, &fd_end, &fd_size);
&fd_start, &fd_end,
fd->hints->min_fdomain_size, &fd_size,
fd->hints->striping_unit);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
@ -329,9 +336,50 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
BGLMPIO_T_CIO_REPORT( 0, w, fd, myrank )
#endif
#if 0
/* From common code - not implemented for bgl.
*
* If this collective write is followed by an independent write,
* it's possible to have those subsequent writes on other processes
* race ahead and sneak in before the read-modify-write completes.
* We carry out a collective communication at the end here so no one
* can start independent i/o before collective I/O completes.
*
* need to do some gymnastics with the error codes so that if something
* went wrong, all processes report error, but if a process has a more
* specific error code, we can still have that process report the
* additional information */
old_error = *error_code;
if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO;
/* optimization: if only one process performing i/o, we can perform
* a less-expensive Bcast */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_postwrite_a, 0, NULL );
#endif
if (fd->hints->cb_nodes == 1)
MPI_Bcast(error_code, 1, MPI_INT,
fd->hints->ranklist[0], fd->comm);
else {
tmp_error = *error_code;
MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
MPI_MAX, fd->comm);
}
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_postwrite_b, 0, NULL );
#endif
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5012, 0, NULL);
#endif
if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) )
*error_code = old_error;
#endif
/* free all memory allocated for collective I/O */
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
for (i=0; i<nprocs; i++) {
if (others_req[i].count) {
@ -363,6 +411,9 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
#endif
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5013, 0, NULL);
#endif
}
@ -371,12 +422,12 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
* code is created and returned in error_code.
*/
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
datatype, int nprocs, int myrank,
datatype, int nprocs,
int myrank,
ADIOI_Access
*others_req, ADIO_Offset *offset_list,
int *len_list, int contig_access_count,
ADIO_Offset
min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *len_list, int contig_access_count,
ADIO_Offset min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code)
{
@ -389,7 +440,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
array to a file, where each local array is 8Mbytes, requiring
at least another 8Mbytes of temp space is unacceptable. */
int hole, i, j, m, size=0, ntimes, max_ntimes, buftype_is_contig;
/* Not convinced end_loc-st_loc cannot exceed the range of an int, so make these ADIO_Offset */
ADIO_Offset size=0;
int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
char *write_buf=NULL;
int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
@ -410,7 +463,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
That gives the no. of communication phases as well. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value,
ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag);
coll_bufsize = atoi(value);
ADIOI_Free(value);
@ -526,7 +579,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
#endif
for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0;
size = (int) (ADIOI_MIN(coll_bufsize, end_loc-st_loc+1-done));
size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);
for (i=0; i < nprocs; i++) {
if (others_req[i].count) {
@ -550,12 +603,14 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
}
if (req_off < off + size) {
count[i]++;
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
MPI_Address(write_buf+req_off-off,
&(others_req[i].mem_ptrs[j]));
recv_size[i] += (int)(ADIOI_MIN(off + (ADIO_Offset)size -
req_off, req_len));
ADIOI_Assert((off + size - req_off) == (int)(off + size - req_off));
recv_size[i] += (int)(ADIOI_MIN(off + size - req_off,
(unsigned)req_len));
if (off+size-req_off < req_len)
if (off+size-req_off < (unsigned)req_len)
{
partial_recv[i] = (int) (off + size - req_off);
@ -618,7 +673,8 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
if (count[i]) flag = 1;
if (flag) {
ADIO_WriteContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
ADIOI_Assert(size == (int)size);
ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
off, &status, error_code);
if (*error_code != MPI_SUCCESS) return;
}
@ -678,7 +734,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
*/
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
ADIOI_Flatlist_node *flat_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size,
*offset_list, ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off, int size,
int *count, int *start_pos,
int *partial_recv,
@ -758,20 +814,27 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
}
ADIOI_Free(tmp_len);
/* check if there are any holes */
/* Check if there are any holes. If yes, must do read-modify-write.
* Holes can be in three places. 'Middle' is what you'd expect: the
* processes are operating on noncontiguous data. But holes can also show
* up at the beginning or end of the file domain (see John Bent ROMIO REQ
* #835). Missing these holes would result in us writing more data than
* received by everyone else. */
*hole = 0;
/* See if there are holes before the first request or after the last request*/
if((srt_off[0] > off) ||
((srt_off[sum-1] + srt_len[sum-1]) < (off + size)))
{
if (off != srt_off[0]) /* hole at the front */
*hole = 1;
else { /* coalesce the sorted offset-length pairs */
for (i=1; i<sum; i++) {
if (srt_off[i] <= srt_off[0] + srt_len[0]) {
int new_len = srt_off[i] + srt_len[i] - srt_off[0];
if (new_len > srt_len[0]) srt_len[0] = new_len;
}
else /* See if there are holes between the requests, if there are more than one */
for (i=0; i<sum-1; i++)
if (srt_off[i]+srt_len[i] < srt_off[i+1]) {
*hole = 1;
else
break;
}
if (i < sum || size != srt_len[0]) /* hole in middle or end */
*hole = 1;
}
ADIOI_Free(srt_off);
ADIOI_Free(srt_len);
@ -821,6 +884,9 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
/* post sends. if buftype_is_contig, data can be directly sent from
user buf at location given by buf_idx. else use send_buf. */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5032, 0, NULL);
#endif
if (buftype_is_contig) {
j = 0;
for (i=0; i < nprocs; i++)
@ -895,6 +961,9 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
MPI_Waitall(nprocs_send+nprocs_recv, requests, statuses);
#endif
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5033, 0, NULL);
#endif
ADIOI_Free(statuses);
ADIOI_Free(requests);
if (!buftype_is_contig && nprocs_send) {
@ -918,7 +987,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
n_buftypes++; \
} \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \
n_buftypes*buftype_extent; \
(ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \
buf_incr -= size_in_buf; \
@ -930,6 +999,8 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
{ \
while (size) { \
size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)((MPIR_Upint)buf + user_buf_idx)); \
ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
memcpy(&(send_buf[p][send_buf_idx[p]]), \
((char *) buf) + user_buf_idx, size_in_buf); \
send_buf_idx[p] += size_in_buf; \
@ -942,7 +1013,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
n_buftypes++; \
} \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \
n_buftypes*buftype_extent; \
(ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \
size -= size_in_buf; \
@ -951,11 +1022,9 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
ADIOI_BUF_INCR \
}
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size,
*offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc,
int nprocs, int myrank,
int contig_access_count,
@ -967,8 +1036,9 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
{
/* this function is only called if buftype is not contig */
int i, p, flat_buf_idx, size;
int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
int i, p, flat_buf_idx;
ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
int jj, n_buftypes;
ADIO_Offset off, len, rem_len, user_buf_idx;
/* curr_to_proc[p] = amount of data sent to proc. p that has already
@ -995,7 +1065,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
for (i=0; i<contig_access_count; i++) {
off = offset_list[i];
rem_len = (ADIO_Offset) len_list[i];
rem_len = len_list[i];
/*this request may span the file domains of more than one process*/
while (rem_len != 0) {
@ -1015,17 +1085,20 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
if (send_buf_idx[p] < send_size[p]) {
if (curr_to_proc[p]+len > done_to_proc[p]) {
if (done_to_proc[p] > curr_to_proc[p]) {
size = (int)ADIOI_MIN(curr_to_proc[p] + len -
size = ADIOI_MIN(curr_to_proc[p] + len -
done_to_proc[p], send_size[p]-send_buf_idx[p]);
buf_incr = done_to_proc[p] - curr_to_proc[p];
ADIOI_BUF_INCR
buf_incr = (int)(curr_to_proc[p] + len - done_to_proc[p]);
ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
buf_incr = curr_to_proc[p] + len - done_to_proc[p];
ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
curr_to_proc[p] = done_to_proc[p] + size;
ADIOI_BUF_COPY
}
else {
size = (int)ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
buf_incr = (int)len;
size = ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
buf_incr = len;
ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
curr_to_proc[p] += size;
ADIOI_BUF_COPY
}
@ -1036,13 +1109,14 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
}
}
else {
curr_to_proc[p] += (int)len;
buf_incr = (int)len;
ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
curr_to_proc[p] += len;
buf_incr = len;
ADIOI_BUF_INCR
}
}
else {
buf_incr = (int)len;
buf_incr = len;
ADIOI_BUF_INCR
}
off += len;
@ -1181,7 +1255,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
char *write_buf, /* 1 */
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
int *len_list, int *send_size, int *recv_size,
ADIO_Offset *len_list, int *send_size, int *recv_size,
ADIO_Offset off, int size, /* 2 */
int *count, int *start_pos, int *partial_recv,
int *sent_to_proc, int nprocs, int myrank,
@ -1196,11 +1270,10 @@ static void ADIOI_W_Exchange_data_alltoallv(
int iter, MPI_Aint buftype_extent, int *buf_idx,
int *error_code)
{
int i, j, k=0, tmp=0, nprocs_recv, nprocs_send, erri, *tmp_len, err;
int i, j, k=0, nprocs_recv, nprocs_send, *tmp_len, err;
char **send_buf = NULL;
MPI_Request *requests, *send_req;
MPI_Datatype recv_type;
MPI_Status *statuses, status;
MPI_Request *send_req=NULL;
MPI_Status status;
int rtail, stail;
char *sbuf_ptr, *to_ptr;
int len;
@ -1324,7 +1397,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
sbuf_ptr = all_recv_buf + rdispls[i];
for (j=0; j<count[i]; j++) {
to_ptr = (char *)( others_req[i].mem_ptrs[ start_pos[i]+j ] );
ADIOI_ENSURE_AINT_FITS_IN_PTR(others_req[i].mem_ptrs[ start_pos[i]+j ]);
to_ptr = (char *) ADIOI_AINT_CAST_TO_VOID_PTR ( others_req[i].mem_ptrs[ start_pos[i]+j ] );
len = others_req[i].lens[ start_pos[i]+j ] ;
memcpy( to_ptr, sbuf_ptr, len );
sbuf_ptr += len;
@ -1349,7 +1423,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size,
*offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc,
int nprocs, int myrank,
int contig_access_count,
@ -1361,8 +1435,9 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
{
/* this function is only called if buftype is not contig */
int i, p, flat_buf_idx, size;
int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
int i, p, flat_buf_idx;
ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
int jj, n_buftypes;
ADIO_Offset off, len, rem_len, user_buf_idx;
/* curr_to_proc[p] = amount of data sent to proc. p that has already
@ -1389,7 +1464,7 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
for (i=0; i<contig_access_count; i++) {
off = offset_list[i];
rem_len = (ADIO_Offset) len_list[i];
rem_len = len_list[i];
/*this request may span the file domains of more than one process*/
while (rem_len != 0) {
@ -1409,17 +1484,20 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
if (send_buf_idx[p] < send_size[p]) {
if (curr_to_proc[p]+len > done_to_proc[p]) {
if (done_to_proc[p] > curr_to_proc[p]) {
size = (int)ADIOI_MIN(curr_to_proc[p] + len -
size = ADIOI_MIN(curr_to_proc[p] + len -
done_to_proc[p], send_size[p]-send_buf_idx[p]);
buf_incr = done_to_proc[p] - curr_to_proc[p];
ADIOI_BUF_INCR
buf_incr = (int)(curr_to_proc[p] + len - done_to_proc[p]);
ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
buf_incr = curr_to_proc[p] + len - done_to_proc[p];
ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
curr_to_proc[p] = done_to_proc[p] + size;
ADIOI_BUF_COPY
}
else {
size = (int)ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
buf_incr = (int)len;
size = ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
buf_incr = len;
ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
curr_to_proc[p] += size;
ADIOI_BUF_COPY
}
@ -1433,13 +1511,14 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
*/
}
else {
ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
curr_to_proc[p] += (int)len;
buf_incr = (int)len;
buf_incr = len;
ADIOI_BUF_INCR
}
}
else {
buf_incr = (int)len;
buf_incr = len;
ADIOI_BUF_INCR
}
off += len;

Просмотреть файл

@ -17,13 +17,20 @@
#include "ad_bgl_tuning.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
int err=-1, datatype_size, len;
int err=-1, datatype_size;
ADIO_Offset len;
static char myname[] = "ADIOI_BGL_WRITECONTIG";
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5036, 0, NULL);
#endif
#if BGL_PROFILE
/* timing */
double io_time, io_time2;
@ -35,7 +42,8 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
#endif
MPI_Type_size(datatype, &datatype_size);
len = datatype_size * count;
len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */
#if BGL_PROFILE
@ -46,7 +54,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
if (bglmpio_timing2) io_time2 = MPI_Wtime();
err = write(fd->fd_sys, buf, len);
err = write(fd->fd_sys, buf, (unsigned int)len);
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_sys_posn = offset + err;
@ -60,7 +68,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
if (bglmpio_timing2) io_time2 = MPI_Wtime();
err = write(fd->fd_sys, buf, len);
err = write(fd->fd_sys, buf, (unsigned int)len);
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_ind += err;
@ -73,7 +81,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (fd->fp_sys_posn != offset)
lseek(fd->fd_sys, offset, SEEK_SET);
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
err = write(fd->fd_sys, buf, len);
err = write(fd->fd_sys, buf, (unsigned int)len);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_sys_posn = offset + err;
/* individual file pointer not updated */
@ -83,7 +91,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (fd->fp_sys_posn != fd->fp_ind)
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
err = write(fd->fd_sys, buf, len);
err = write(fd->fd_sys, buf, (unsigned int)len);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_ind += err;
fd->fp_sys_posn = fd->fp_ind;
@ -110,11 +118,12 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
#endif
*error_code = MPI_SUCCESS;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5037, 0, NULL);
#endif
}
#define ADIOI_BUFFERED_WRITE \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
@ -123,7 +132,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
writebuf_off = req_off; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = read(fd->fd_sys, writebuf, writebuf_len); \
@ -135,7 +144,8 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
return; \
} \
} \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
@ -145,7 +155,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = read(fd->fd_sys, writebuf, writebuf_len); \
@ -173,9 +183,10 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
writebuf_off = req_off; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
} \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
@ -186,7 +197,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
} \
@ -201,19 +212,23 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
{
/* offset is in units of etype relative to the filetype. */
ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset i_offset, sum, size_in_filetype;
int i, j, k, err=-1, st_index=0;
int n_etypes_in_filetype;
ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size, req_len;
int filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset userbuf_off;
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
char *writebuf, *value;
int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize;
unsigned bufsize, writebuf_len, max_bufsize, write_sz;
int err_flag=0, info_flag;
ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
static char myname[] = "ADIOI_BGL_WRITESTRIDED";
if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
@ -247,12 +262,13 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
bufsize = buftype_size * count;
/* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag);
max_bufsize = atoi(value);
ADIOI_Free(value);
@ -272,20 +288,23 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
end_offset = off + bufsize - 1;
writebuf_off = off;
writebuf = (char *) ADIOI_Malloc(max_bufsize);
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
/* if atomicity is true, lock the region to be accessed */
if (fd->atomicity)
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
for (j=0; j<count; j++)
{
int i;
for (i=0; i<flat_buf->count; i++) {
userbuf_off = j*buftype_extent + flat_buf->indices[i];
userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
req_off = off;
req_len = flat_buf->blocklens[i];
ADIOI_BUFFERED_WRITE_WITHOUT_READ
off += flat_buf->blocklens[i];
}
}
/* write the buffer out finally */
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
@ -317,29 +336,37 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
/* Wei-keng reworked type processing to be a bit more efficient */
offset = fd->fp_ind - disp;
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
/* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i]
>= offset) {
st_index = i;
fwr_size = (int) (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
ADIO_Offset dist;
if (flat_file->blocklens[i] == 0) continue;
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
/* fwr_size is from offset to the end of block i */
if (dist == 0) {
i++;
offset = flat_file->indices[i];
fwr_size = flat_file->blocklens[i];
break;
}
if (dist > 0) {
fwr_size = dist;
break;
}
}
}
st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
}
else {
int i;
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
n_filetypes = offset / n_etypes_in_filetype;
etype_in_filetype = offset % n_etypes_in_filetype;
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
@ -355,32 +382,64 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
}
/* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype;
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
}
start_off = offset;
/* Wei-keng Liao:write request is within single flat_file contig block*/
/* this could happen, for example, with subarray types that are
* actually fairly contiguous */
if (buftype_is_contig && bufsize <= fwr_size) {
ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte
* that can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == fwr_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ (ADIO_Offset)n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
st_fwr_size = fwr_size;
st_n_filetypes = n_filetypes;
i = 0;
i_offset = 0;
j = st_index;
off = offset;
fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
while (i < bufsize) {
i += fwr_size;
while (i_offset < bufsize) {
i_offset += fwr_size;
end_offset = off + fwr_size - 1;
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
off = disp + flat_file->indices[j] +
n_filetypes*(ADIO_Offset)filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
}
/* if atomicity is true, lock the region to be accessed */
@ -390,7 +449,7 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
/* initial read for the read-modify-write */
writebuf_off = offset;
writebuf = (char *) ADIOI_Malloc(max_bufsize);
writebuf_len = (int)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
err = read(fd->fd_sys, writebuf, writebuf_len);
@ -408,39 +467,41 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
i = 0;
i_offset = 0;
j = st_index;
off = offset;
n_filetypes = st_n_filetypes;
fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
while (i < bufsize) {
while (i_offset < bufsize) {
if (fwr_size) {
/* TYPE_UB and TYPE_LB can result in
fwr_size = 0. save system call in such cases */
/* lseek(fd->fd_sys, off, SEEK_SET);
err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);*/
err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/
req_off = off;
req_len = fwr_size;
userbuf_off = i;
userbuf_off = i_offset;
ADIOI_BUFFERED_WRITE
}
i += fwr_size;
i_offset += fwr_size;
if (off + fwr_size < disp + flat_file->indices[j] +
flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent)
flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
off += fwr_size;
/* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by fwr_size. */
else {
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
n_filetypes*(ADIO_Offset)filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j],
bufsize-i_offset);
}
}
}
@ -452,7 +513,7 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
k = num = buf_count = 0;
i = (int) (flat_buf->indices[0]);
i_offset = flat_buf->indices[0];
j = st_index;
off = offset;
n_filetypes = st_n_filetypes;
@ -463,11 +524,11 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
size = ADIOI_MIN(fwr_size, bwr_size);
if (size) {
/* lseek(fd->fd_sys, off, SEEK_SET);
err = write(fd->fd_sys, ((char *) buf) + i, size); */
err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */
req_off = off;
req_len = size;
userbuf_off = i;
userbuf_off = i_offset;
ADIOI_BUFFERED_WRITE
}
@ -476,18 +537,19 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
if (size == fwr_size) {
/* reached end of contiguous block in file */
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
n_filetypes*(ADIO_Offset)filetype_extent;
new_fwr_size = flat_file->blocklens[j];
if (size != bwr_size) {
i += size;
i_offset += size;
new_bwr_size -= size;
}
}
@ -497,8 +559,8 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
k = (k + 1)%flat_buf->count;
buf_count++;
i = (int) (buftype_extent*(buf_count/flat_buf->count) +
flat_buf->indices[k]);
i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
flat_buf->indices[k];
new_bwr_size = flat_buf->blocklens[k];
if (size != fwr_size) {
off += size;

Просмотреть файл

@ -1,7 +0,0 @@
<dir>
<file name="ad_bglockless.c" info="1205188711"/>
</dir>
<data>
<fileinfo name="ad_bglockless.c">
</fileinfo>
</data>

Просмотреть файл

@ -21,4 +21,6 @@ include $(top_srcdir)/Makefile.options
noinst_LTLIBRARIES = libadio_bglockless.la
libadio_bglockless_la_SOURCES = \
ad_bglockless.c
ad_bglockless.c \
ad_bglockless.h \
ad_bglockless_features.c

Просмотреть файл

@ -6,12 +6,14 @@
*/
#include "../ad_bgl/ad_bgl.h"
#include "ad_bglockless.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations = {
ADIOI_BGL_Open, /* Open */
ADIOI_GEN_OpenColl, /* Collective open */
ADIOI_GEN_ReadContig, /* ReadContig */
ADIOI_GEN_WriteContig, /* WriteContig */
ADIOI_BGL_ReadStridedColl, /* ReadStridedColl */
@ -35,7 +37,8 @@ struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations = {
ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_GEN_Flush, /* Flush */
ADIOI_BGL_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_BGLOCKLESS_Feature /* Features */
};

Просмотреть файл

@ -0,0 +1,14 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 2008 Uchicago Argonne LLC
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_BGLOCKLESS_INCLUDE
#define AD_BGLOCKLESS_INCLUDE

/* Feature query for the BG lockless driver.
 *
 * fd   - open ADIO file handle
 * flag - one of the ADIO_* feature flags (e.g. ADIO_SCALABLE_OPEN)
 *
 * Returns nonzero when the feature named by flag is supported, 0 otherwise.
 *
 * The guard macro defined above must be the same one that is tested;
 * otherwise the header is not protected against repeated inclusion and
 * would define a guard name that does not belong to this header. */
int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag);

#endif /* AD_BGLOCKLESS_INCLUDE */

Просмотреть файл

@ -0,0 +1,15 @@
#include "adio.h"
/* Report whether the BG lockless driver supports the feature named by flag.
 *
 * fd   - file handle (not consulted)
 * flag - ADIO_* feature flag being queried
 *
 * Returns 1 for ADIO_SCALABLE_OPEN and 0 for everything else, which
 * includes ADIO_SHARED_FP, ADIO_LOCKS, ADIO_SEQUENTIAL,
 * ADIO_DATA_SIEVING_WRITES and any unrecognized flag. */
int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag)
{
    if (flag == ADIO_SCALABLE_OPEN) {
        return 1;
    }
    return 0;
}

Просмотреть файл

@ -25,6 +25,7 @@ libadio_gridftp_la_SOURCES = \
ad_gridftp_close.c \
ad_gridftp_delete.c \
ad_gridftp_fcntl.c \
ad_gridftp_features.c \
ad_gridftp_flush.c \
ad_gridftp_hints.c \
ad_gridftp_open.c \

Просмотреть файл

@ -33,4 +33,5 @@ struct ADIOI_Fns_struct ADIO_GRIDFTP_operations = {
ADIOI_GRIDFTP_Flush, /* Flush */
ADIOI_GRIDFTP_Resize, /* Resize */
ADIOI_GRIDFTP_Delete, /* Delete */
ADIOI_GRIDFTP_Feature, /* Features */
};

Просмотреть файл

@ -0,0 +1,12 @@
/* Report whether the GridFTP driver supports the feature named by flag.
 *
 * fd   - file handle (not consulted)
 * flag - ADIO_* feature flag being queried (not consulted)
 *
 * Always returns 0: ADIO_SCALABLE_OPEN, ADIO_SHARED_FP, ADIO_LOCKS,
 * ADIO_SEQUENTIAL, ADIO_DATA_SIEVING_WRITES and any other flag are all
 * reported as unsupported.
 *
 * NOTE(review): no #include is visible in this file; confirm that the
 * build provides the declaration of ADIO_File before this definition. */
int ADIOI_GRIDFTP_Feature (ADIO_File fd, int flag)
{
    (void) fd;
    (void) flag;
    return 0;
}

Просмотреть файл

@ -56,8 +56,8 @@ void ADIOI_GRIDFTP_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
MPI_Info_get_valuelen(users_info,key,&valuelen,&flag);
if (flag)
{
MPI_Info_get(users_info,key,valuelen,value,&flag);
if (flag) MPI_Info_set(fd->info,key,value);
ADIOI_Info_get(users_info,key,valuelen,value,&flag);
if (flag) ADIOI_Info_set(fd->info,key,value);
}
}
}

Просмотреть файл

@ -136,7 +136,7 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
oattr[] (eg. parallelism, striping, etc.) goes here */
if ( fd->info!=MPI_INFO_NULL )
{
MPI_Info_get(fd->info,"ftp_control_mode",MPI_MAX_INFO_VAL,hintval,&keyfound);
ADIOI_Info_get(fd->info,"ftp_control_mode",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound )
{
if ( ( !strcmp(hintval,"extended") || !strcmp(hintval,"extended_block") ) &&
@ -153,7 +153,7 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result);
}
MPI_Info_get(fd->info,"parallelism",MPI_MAX_INFO_VAL,hintval,&keyfound);
ADIOI_Info_get(fd->info,"parallelism",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound )
{
int nftpthreads;
@ -170,14 +170,14 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
}
}
MPI_Info_get(fd->info,"striped_ftp",MPI_MAX_INFO_VAL,hintval,&keyfound);
ADIOI_Info_get(fd->info,"striped_ftp",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound )
{
/* if set to "true" or "enable", set up round-robin block layout */
if ( !strncmp("true",hintval,4) || !strncmp("TRUE",hintval,4) ||
!strncmp("enable",hintval,4) || !strncmp("ENABLE",hintval,4) )
{
MPI_Info_get(fd->info,"striping_factor",MPI_MAX_INFO_VAL,hintval,&keyfound);
ADIOI_Info_get(fd->info,"striping_factor",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound )
{
int striping_factor;
@ -197,7 +197,7 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
}
}
MPI_Info_get(fd->info,"tcp_buffer",MPI_MAX_INFO_VAL,hintval,&keyfound);
ADIOI_Info_get(fd->info,"tcp_buffer",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound )
{
/* set tcp buffer size */
@ -214,7 +214,7 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
}
}
MPI_Info_get(fd->info,"transfer_type",MPI_MAX_INFO_VAL,hintval,&keyfound);
ADIOI_Info_get(fd->info,"transfer_type",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound )
{
globus_ftp_control_type_t filetype;
@ -340,84 +340,4 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
}
}
num_gridftp_handles++;
#if 0
/* Debugging info for testing PASV mode behind firewalls */
if ( myrank==0 )
{
globus_bool_t striped;
globus_ftp_control_mode_t mode;
globus_ftp_control_type_t filetype;
globus_ftp_control_parallelism_t parallelism;
FPRINTF(stderr,"--gridftp details for %s--\n",
fd->filename);
/*
FPRINTF(stderr,"Connection caching: ");
globus_ftp_client_handleattr_get_cache_all(&hattr,&cached);
if ( cached==GLOBUS_TRUE )
FPRINTF(stderr,"Y\n");
else
FPRINTF(stderr,"N\n");
*/
FPRINTF(stderr,"Control mode: ");
globus_ftp_client_operationattr_get_mode(&(oattr[fd->fd_sys]),&mode);
if ( mode==GLOBUS_FTP_CONTROL_MODE_BLOCK )
FPRINTF(stderr,"block\n");
else if ( mode==GLOBUS_FTP_CONTROL_MODE_COMPRESSED )
FPRINTF(stderr,"compressed\n");
else if ( mode==GLOBUS_FTP_CONTROL_MODE_EXTENDED_BLOCK )
FPRINTF(stderr,"extended block\n");
else if ( mode==GLOBUS_FTP_CONTROL_MODE_STREAM )
FPRINTF(stderr,"stream\n");
else
FPRINTF(stderr,"unknown\n");
FPRINTF(stderr,"File type: ");
globus_ftp_client_operationattr_get_type(&(oattr[fd->fd_sys]),&filetype);
if ( filetype==GLOBUS_FTP_CONTROL_TYPE_ASCII )
FPRINTF(stderr,"ASCII\n");
else if ( filetype==GLOBUS_FTP_CONTROL_TYPE_IMAGE )
FPRINTF(stderr,"binary\n");
else if ( filetype==GLOBUS_FTP_CONTROL_TYPE_EBCDIC )
FPRINTF(stderr,"EBCDIC\n");
else
FPRINTF(stderr,"unknown\n");
FPRINTF(stderr,"Parallelism: ");
globus_ftp_client_operationattr_get_parallelism(&(oattr[fd->fd_sys]),&parallelism);
if ( parallelism.mode==GLOBUS_FTP_CONTROL_PARALLELISM_NONE )
FPRINTF(stderr,"none\n");
else if ( parallelism.mode==GLOBUS_FTP_CONTROL_PARALLELISM_FIXED )
FPRINTF(stderr,"fixed with %d streams\n",parallelism.fixed.size);
else
FPRINTF(stderr,"unknown\n");
FPRINTF(stderr,"Striping: ");
globus_ftp_client_operationattr_get_striped(&(oattr[fd->fd_sys]),&striped);
if ( striped==GLOBUS_TRUE )
{
globus_ftp_control_layout_t layout;
FPRINTF(stderr,"Y\nLayout: ");
globus_ftp_client_operationattr_get_layout(&(oattr[fd->fd_sys]),
&layout);
if ( layout.mode==GLOBUS_FTP_CONTROL_STRIPING_NONE )
FPRINTF(stderr,"none\n");
else if ( layout.mode==GLOBUS_FTP_CONTROL_STRIPING_PARTITIONED )
FPRINTF(stderr,"partitioned, size=%d\n",layout.partitioned.size);
else if ( layout.mode==GLOBUS_FTP_CONTROL_STRIPING_BLOCKED_ROUND_ROBIN )
FPRINTF(stderr,"round-robin, block size=%d\n",layout.round_robin.block_size);
else
FPRINTF(stderr,"unknown\n");
}
else
FPRINTF(stderr,"N\n");
fflush(stderr);
}
#endif
}

Просмотреть файл

@ -50,10 +50,6 @@ static void readcontig_data_cb(void *myargs, globus_ftp_client_handle_t *handle,
readcontig_data_cb: buffer 0x404c0008 length 65536 offset 32112640 eof 0
readcontig_data_cb: buffer 0x404d0008 length 65536 offset 32178176 eof 0
*/
#if 0
FPRINTF(stderr, "%s: buffer %p length %d offset %Ld eof %d\n",
__func__, buffer, length, offset, eof);
#endif
if ( !eof )
globus_ftp_client_register_read(handle,
buffer+length,

Просмотреть файл

@ -364,10 +364,6 @@ void ADIOI_GRIDFTP_WriteDiscontig(ADIO_File fd, void *buf, int count,
{
fd->fp_ind += extent;
fd->fp_sys_posn = fd->fp_ind;
#if 0
FPRINTF(stdout, "[%d/%d] new file position is %Ld\n", myrank,
nprocs, (long long) fd->fp_ind);
#endif
}
else {
fd->fp_sys_posn = offset + extent;

Просмотреть файл

@ -8,6 +8,9 @@
#include "ad_hfs.h"
#include "adio_extern.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_HFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code)
{
int i, ntimes, err;

Просмотреть файл

@ -7,6 +7,10 @@
#include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_HFS_Open(ADIO_File fd, int *error_code)
{
int perm, old_mask, amode;

Просмотреть файл

@ -7,6 +7,10 @@
#include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_HFS_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)

Просмотреть файл

@ -7,6 +7,10 @@
#include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_HFS_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)

Просмотреть файл

@ -1,22 +0,0 @@
<dir>
<file name="ad_lustre_fcntl.c" info="1204573775"/>
<file name="ad_lustre_hints.c" info="1204573775"/>
<file name="ad_lustre_open.c" info="1204573775"/>
<file name="ad_lustre_rwcontig.c" info="1204573775"/>
<file name="ad_lustre.h" info="1204573775"/>
<file name="ad_lustre.c" info="1204573775"/>
</dir>
<data>
<fileinfo name="ad_lustre_fcntl.c">
</fileinfo>
<fileinfo name="ad_lustre_hints.c">
</fileinfo>
<fileinfo name="ad_lustre_open.c">
</fileinfo>
<fileinfo name="ad_lustre_rwcontig.c">
</fileinfo>
<fileinfo name="ad_lustre.h">
</fileinfo>
<fileinfo name="ad_lustre.c">
</fileinfo>
</data>

Просмотреть файл

@ -24,8 +24,11 @@ EXTRA_DIST = README
noinst_LTLIBRARIES = libadio_lustre.la
libadio_lustre_la_SOURCES = \
ad_lustre.c \
ad_lustre_aggregate.c \
ad_lustre_fcntl.c \
ad_lustre.h \
ad_lustre_hints.c \
ad_lustre_open.c \
ad_lustre_rwcontig.c
ad_lustre_wrcoll.c \
ad_lustre_rwcontig.c \
ad_lustre_wrstr.c

Просмотреть файл

@ -4,6 +4,21 @@ Upcoming soon:
Further out:
o To post the code for ParColl (Partitioned collective IO)
-----------------------------------------------------
V05:
-----------------------------------------------------
Improved data redistribution
o Improve I/O pattern identification. Besides checking for interleaving,
collective I/O is also performed if the requested I/O size is small.
The hint bigsize can be used to define the request size threshold.
o Provide hint CO for load balancing to control the number of
IO clients for each OST
o Produce stripe-contiguous I/O pattern that Lustre prefers
o Control read-modify-write in data sieving in collective IO
by hint ds_in_coll.
o Reduce extent lock conflicts by making each OST be accessed by one or
more constant clients.
-----------------------------------------------------
V04:
-----------------------------------------------------

Просмотреть файл

@ -4,21 +4,24 @@
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
ADIOI_LUSTRE_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_LUSTRE_ReadContig, /* ReadContig */
ADIOI_LUSTRE_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
ADIOI_GEN_Fcntl, /* Fcntl */
ADIOI_LUSTRE_SetInfo, /* SetInfo */
ADIOI_GEN_ReadStrided, /* ReadStrided */
ADIOI_GEN_WriteStrided, /* WriteStrided */
ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
ADIOI_GEN_Close, /* Close */
#if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
ADIOI_GEN_IreadContig, /* IreadContig */
@ -36,4 +39,5 @@ struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
ADIOI_GEN_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
};

Просмотреть файл

@ -4,6 +4,8 @@
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#ifndef AD_UNIX_INCLUDE
@ -17,6 +19,7 @@
#ifdef __linux__
# include <sys/ioctl.h> /* necessary for: */
# include <time.h>
# define __USE_GNU /* O_DIRECT and */
# include <fcntl.h> /* IO operations */
# undef __USE_GNU
@ -24,7 +27,7 @@
/*#include <fcntl.h>*/
#include <sys/ioctl.h>
#include "lustre/lustre_user.h"
#include <lustre/lustre_user.h>
#include "adio.h"
/*#include "adioi.h"*/
@ -43,22 +46,46 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
int *error_code);
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
/* the lustre utilities: */
int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
ADIO_Offset *len_list, int nprocs);
void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
int mode);
void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
ADIO_Offset *len_list, int contig_access_count,
int *striping_info, int nprocs,
int *count_my_req_procs_ptr,
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int ***buf_idx_ptr);
int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
ADIO_Offset *len, int *striping_info);
#endif /* End of AD_UNIX_INCLUDE */

Просмотреть файл

@ -0,0 +1,322 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
#undef AGG_DEBUG
/* ADIOI_LUSTRE_Get_striping_info() - collect the striping parameters used
 * by the Lustre collective I/O code.
 *
 * fd                - file handle; its hints supply striping_unit,
 *                     striping_factor, cb_nodes and the lustre co_ratio
 * striping_info_ptr - receives a 3-int array allocated here with
 *                     ADIOI_Malloc (it is not freed in this function):
 *                       [0] stripe_size
 *                       [1] stripe_count
 *                       [2] avail_cb_nodes
 * mode              - zero for collective read, nonzero otherwise
 */
void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
                                    int mode)
{
    int *info;
    int usable_nodes;
    int clients_per_ost;

    /* stripe size and stripe count come straight from the hints; both have
     * been validated in ADIOI_LUSTRE_Open() */
    const int unit = fd->hints->striping_unit;
    const int factor = fd->hints->striping_factor;
    const int wanted_nodes = fd->hints->cb_nodes;

    /* For collective read, several clients hitting the same OST at once
     * would cost a lot of disk seek time, so one client per OST is used.
     * XXX: maybe there is a better way for collective read.
     * Otherwise take the CO hint, which has also been validated (> 0) in
     * ADIOI_LUSTRE_Open(). */
    clients_per_ost = mode ? fd->hints->fs_hints.lustre.co_ratio : 1;

    /* Calculate how many I/O clients we need.
     * Algorithm courtesy Pascal Deveze (pascal.deveze@bull.net).
     * To avoid extent lock conflicts, avail_cb_nodes should either
     *   - be a multiple of stripe_count,
     *   - or divide stripe_count exactly
     * so that each OST is accessed by a maximum of CO constant clients. */
    if (wanted_nodes >= factor) {
        /* Use a multiple of stripe_count, limiting the number of procs per
         * OST to the smaller of nprocs_for_coll/stripe_count and CO.
         * e.g. stripe_count=20, nprocs_for_coll=42 and CO=3 gives 40. */
        int per_ost = wanted_nodes / factor;

        if (clients_per_ost < per_ost) {
            per_ost = clients_per_ost;
        }
        usable_nodes = factor * per_ost;
    } else {
        /* Fewer aggregators than stripes: pick a value that divides
         * stripe_count, e.g. stripe_count=60 and nprocs_for_coll=8 gives 6.
         * Equivalent to counting down from nprocs_for_coll until a divisor
         * is hit, but only candidates up to sqrt(stripe_count) are tried. */
        int d;

        usable_nodes = 1;
        for (d = 2; factor >= d * d; d++) {
            if (factor % d != 0) {
                continue;
            }
            if (factor / d <= wanted_nodes) {
                /* the co-divisor fits: the value is found */
                usable_nodes = factor / d;
                break;
            }
            if (d <= wanted_nodes) {
                /* d itself is a solution, but maybe not the best one yet */
                usable_nodes = d;
            }
        }
    }

    info = (int *) ADIOI_Malloc(3 * sizeof(int));
    *striping_info_ptr = info;
    info[0] = unit;
    info[1] = factor;
    info[2] = usable_nodes;
}
/* ADIOI_LUSTRE_Calc_aggregator() - find the aggregator rank responsible for
 * the stripe that contains file offset off.
 *
 * fd            - file handle; hints->cb_nodes and hints->ranklist are read
 * off           - starting file offset of the request
 * len           - in: length of the request; out: clipped so the request
 *                 does not extend past the end of the stripe containing off
 * striping_info - [0] stripe_size, [2] avail_cb_nodes
 *
 * Returns the entry of fd->hints->ranklist selected for that stripe.
 */
int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
                                 ADIO_Offset *len, int *striping_info)
{
    const int unit = striping_info[0];
    const int n_aggregators = striping_info[2];
    ADIO_Offset stripe_no, bytes_left_in_stripe;
    int slot;

    /* Produce the stripe-contiguous pattern for Lustre: consecutive
     * stripes are handed to consecutive aggregators, round-robin. */
    stripe_no = off / unit;
    slot = (int) (stripe_no % n_aggregators);

    /* slot indexes fd->hints->ranklist, so it has to stay below
     * fd->hints->cb_nodes or we would overrun the array. Obviously, we
     * should never ever hit this abort. */
    if (slot >= fd->hints->cb_nodes) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* bytes from off up to the end of the stripe that contains it */
    bytes_left_in_stripe = (stripe_no + 1) * (ADIO_Offset) unit - off;
    if (*len > bytes_left_in_stripe) {
        /* this proc only has part of the requested contig. region */
        *len = bytes_left_in_stripe;
    }

    /* map our index to a rank */
    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
    return fd->hints->ranklist[slot];
}
/* ADIOI_LUSTRE_Calc_my_req() - split this process's access requests among
 * the aggregators whose file domains they fall into (this process may be
 * one of them).
 *
 * Same job as ADIOI_Calc_my_req(), except that the owner of each piece is
 * found with ADIOI_LUSTRE_Calc_aggregator().
 *
 * Inputs:
 *   offset_list / len_list   - contig_access_count (offset, length) pairs
 *   striping_info            - passed through to the aggregator lookup
 *   nprocs                   - size of every per-process array built here
 *
 * Outputs (all allocated here):
 *   *count_my_req_procs_ptr    - number of processes that received at least
 *                                one piece
 *   *count_my_req_per_proc_ptr - nprocs ints; entry i is the number of
 *                                pieces that belong to process i
 *   *my_req_ptr                - nprocs ADIOI_Access records; offsets/lens
 *                                are allocated only where the count is
 *                                non-zero
 *   *buf_idx_ptr               - nprocs int arrays; buf_idx[i][k] is the
 *                                running byte position of piece k for
 *                                process i (see below)
 */
void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
                              ADIO_Offset *len_list, int contig_access_count,
                              int *striping_info, int nprocs,
                              int *count_my_req_procs_ptr,
                              int **count_my_req_per_proc_ptr,
                              ADIOI_Access **my_req_ptr,
                              int ***buf_idx_ptr)
{
    int *count_my_req_per_proc, **buf_idx;
    int count_my_req_procs = 0;
    int i, l, proc;
    ADIO_Offset avail_len, left, curr_idx, pos;
    ADIOI_Access *my_req;

    /* One counter per process, zeroed by calloc.  Sized nprocs so that it
     * can be used in an MPI_Alltoall later on.
     */
    count_my_req_per_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    *count_my_req_per_proc_ptr = count_my_req_per_proc;

    buf_idx = (int **) ADIOI_Malloc(nprocs * sizeof(int *));

    /* Pass 1: only count the pieces per process, so we know how much to
     * allocate.  contig_access_count comes from ADIOI_Calc_my_off_len().
     */
    for (i = 0; i < contig_access_count; i++) {
        /* nothing to do for a zero-byte access */
        if (len_list[i] == 0) {
            continue;
        }

        pos = offset_list[i];
        left = len_list[i];
        do {
            /* hand in everything that is left; the lookup shrinks
             * avail_len to what fits into the stripe at 'pos' */
            avail_len = left;
            proc = ADIOI_LUSTRE_Calc_aggregator(fd, pos, &avail_len,
                                                striping_info);
            count_my_req_per_proc[proc]++;

            pos += avail_len;   /* first byte not yet assigned */
            left -= avail_len;
        } while (left != 0);
    }

    /* buf_idx only matters when the buffer type is contiguous: it then
     * gives the position in the user buffer that belongs to each piece,
     * so no extra buffer is needed.  One extra slot is allocated so the
     * size is never zero.
     */
    for (i = 0; i < nprocs; i++) {
        buf_idx[i] = (int *) ADIOI_Malloc((count_my_req_per_proc[i] + 1)
                                          * sizeof(int));
    }

    /* space for my_req and its offset/length arrays */
    my_req = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
    *my_req_ptr = my_req;

    for (i = 0; i < nprocs; i++) {
        if (count_my_req_per_proc[i] != 0) {
            my_req[i].offsets = (ADIO_Offset *)
                ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
            my_req[i].lens = (int *)
                ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
            count_my_req_procs++;
        }
        my_req[i].count = 0;    /* grows as pieces are stored in pass 2 */
    }

    /* Pass 2: walk the accesses again and record every piece in the
     * my_req entry of its owner, together with its buffer position.
     */
    curr_idx = 0;
    for (i = 0; i < contig_access_count; i++) {
        /* nothing to do for a zero-byte access */
        if (len_list[i] == 0) {
            continue;
        }

        pos = offset_list[i];
        left = len_list[i];
        do {
            avail_len = left;
            proc = ADIOI_LUSTRE_Calc_aggregator(fd, pos, &avail_len,
                                                striping_info);

            l = my_req[proc].count;
            ADIOI_Assert(curr_idx == (int) curr_idx);
            ADIOI_Assert(l < count_my_req_per_proc[proc]);
            buf_idx[proc][l] = (int) curr_idx;
            curr_idx += avail_len;

            /* offsets and lengths are stored as int-sized lengths, hence
             * the range assertion */
            my_req[proc].offsets[l] = pos;
            ADIOI_Assert(avail_len == (int) avail_len);
            my_req[proc].lens[l] = (int) avail_len;
            my_req[proc].count++;

            pos += avail_len;
            left -= avail_len;
        } while (left != 0);
    }

#ifdef AGG_DEBUG
    for (i = 0; i < nprocs; i++) {
        if (count_my_req_per_proc[i] > 0) {
            FPRINTF(stdout, "data needed from %d (count = %d):\n",
                    i, my_req[i].count);
            for (l = 0; l < my_req[i].count; l++) {
                FPRINTF(stdout, " off[%d] = %lld, len[%d] = %d\n",
                        l, my_req[i].offsets[l], l, my_req[i].lens[l]);
            }
        }
    }
#endif

    *count_my_req_procs_ptr = count_my_req_procs;
    *buf_idx_ptr = buf_idx;
}
/* ADIOI_LUSTRE_Docollect() - decide whether collective I/O is worthwhile
 * when the accesses of the processes are not interleaved.
 *
 * The average request size over all processes of fd->comm is compared with
 * the coll_threshold Lustre hint: if the threshold is positive and the
 * average is larger than it, collective I/O is not performed.
 *
 *   fd                  - file; supplies the communicator and the hint
 *   contig_access_count - number of entries in len_list on this process
 *   len_list            - lengths of this process's contiguous accesses
 *   nprocs              - not used; kept for the existing callers
 *
 * Returns 1 to do collective I/O, 0 to fall back to independent I/O.
 * This routine is collective over fd->comm (two MPI_Allreduce calls).
 */
int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
                           ADIO_Offset *len_list, int nprocs)
{
    int i, docollect = 1, big_req_size = 0;
    ADIO_Offset req_size = 0, total_req_size;
    /* kept in ADIO_Offset: narrowing to int could wrap for averages above
     * INT_MAX and make a huge request look small */
    ADIO_Offset avg_req_size = 0;
    int total_access_count;

    /* calculate total_req_size and total_access_count */
    for (i = 0; i < contig_access_count; i++)
        req_size += len_list[i];
    MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
                  fd->comm);
    MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT,
                  MPI_SUM, fd->comm);

    /* estimate average req_size; when nobody accesses anything the count
     * is zero and the division must be skipped (average stays 0) */
    if (total_access_count > 0)
        avg_req_size = total_req_size / total_access_count;

    /* get hint of big_req_size */
    big_req_size = fd->hints->fs_hints.lustre.coll_threshold;

    /* Don't perform collective I/O if there are big requests */
    if ((big_req_size > 0) && (avg_req_size > big_req_size))
        docollect = 0;

    return docollect;
}

Просмотреть файл

@ -25,7 +25,7 @@ void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
if (fd->fp_sys_posn != -1)
lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
if (fcntl_struct->fsize == -1) {
*error_code = MPIR_Err_create_code(MPI_SUCCESS,
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s", strerror(errno));
}
@ -56,7 +56,7 @@ void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, done,
&status, error_code);
if (*error_code != MPI_SUCCESS) {
*error_code = MPIR_Err_create_code(MPI_SUCCESS,
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s", strerror(errno));
return;

Просмотреть файл

@ -4,6 +4,8 @@
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
@ -11,10 +13,12 @@
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
char *value, *value_in_fd;
int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
char *value;
int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
struct lov_user_md lum = { 0 };
int err, myrank, fd_sys, perm, amode, old_mask;
int int_val, tmp_val;
static char myname[] = "ADIOI_LUSTRE_SETINFO";
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
if ( (fd->info) == MPI_INFO_NULL) {
@ -22,54 +26,63 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if necessary. */
MPI_Info_create(&(fd->info));
MPI_Info_set(fd->info, "direct_read", "false");
MPI_Info_set(fd->info, "direct_write", "false");
ADIOI_Info_set(fd->info, "direct_read", "false");
ADIOI_Info_set(fd->info, "direct_write", "false");
fd->direct_read = fd->direct_write = 0;
/* initialize lustre hints */
ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
fd->hints->fs_hints.lustre.co_ratio = 1;
ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
fd->hints->fs_hints.lustre.coll_threshold = 0;
ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;
/* has user specified striping or server buffering parameters
and do they have the same value on all processes? */
if (users_info != MPI_INFO_NULL) {
MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
/* striping information */
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag)
str_unit=atoi(value);
MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag);
if (flag)
str_factor=atoi(value);
MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag);
ADIOI_Info_get(users_info, "romio_lustre_start_iodevice",
MPI_MAX_INFO_VAL, value, &flag);
if (flag)
start_iodev=atoi(value);
MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
/* direct read and write */
ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
MPI_Info_set(fd->info, "direct_read", "true");
ADIOI_Info_set(fd->info, "direct_read", "true");
fd->direct_read = 1;
}
MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
MPI_Info_set(fd->info, "direct_write", "true");
ADIOI_Info_set(fd->info, "direct_write", "true");
fd->direct_write = 1;
}
}
/* set striping information with ioctl */
MPI_Comm_rank(fd->comm, &myrank);
if (myrank == 0) {
tmp_val[0] = str_factor;
tmp_val[1] = str_unit;
tmp_val[2] = start_iodev;
stripe_val[0] = str_factor;
stripe_val[1] = str_unit;
stripe_val[2] = start_iodev;
}
MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm);
if (tmp_val[0] != str_factor
|| tmp_val[1] != str_unit
|| tmp_val[2] != start_iodev) {
if (stripe_val[0] != str_factor
|| stripe_val[1] != str_unit
|| stripe_val[2] != start_iodev) {
FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
"-striping_factor:striping_unit:start_iodevice "
"need to be identical across all processes\n");
@ -119,17 +132,65 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
}
} /* End of striping parameters validation */
}
MPI_Barrier(fd->comm);
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
} else {
/* The file has been opened previously and fd->fd_sys is a valid
file descriptor. cannot set striping parameters now. */
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
}
/* get other hint */
if (users_info != MPI_INFO_NULL) {
/* CO: IO Clients/OST,
* to keep the load balancing between clients and OSTs */
ADIOI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value,
&flag);
if (flag && (int_val = atoi(value)) > 0) {
tmp_val = int_val;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_co_ratio",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", value);
fd->hints->fs_hints.lustre.co_ratio = atoi(value);
}
/* coll_threshold:
* if the req size is bigger than this, collective IO may not be performed.
*/
ADIOI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value,
&flag);
if (flag && (int_val = atoi(value)) > 0) {
tmp_val = int_val;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_coll_threshold",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", value);
fd->hints->fs_hints.lustre.coll_threshold = atoi(value);
}
/* ds_in_coll: disable data sieving in collective IO */
ADIOI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))) {
tmp_val = int_val = 2;
MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_ds_in_coll",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable");
fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE;
}
}
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
if (ADIOI_Direct_read) fd->direct_read = 1;
if (ADIOI_Direct_write) fd->direct_write = 1;

Просмотреть файл

@ -4,14 +4,22 @@
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
/* what is the basis for this define?
* what happens if there are more than 1k UUIDs? */
#define MAX_LOV_UUID_COUNT 1000
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
{
int perm, old_mask, amode, amode_direct;
struct lov_user_md lum = { 0 };
int lumlen;
struct lov_user_md *lum = NULL;
char *value;
#if defined(MPICH2) || !defined(PRINT_ERR_MSG)
@ -44,23 +52,37 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
if (fd->fd_sys != -1) {
int err;
/* get file striping information and set it in info */
/* odd malloc here because lov_user_md contains some fixed data and
* then a list of 'lmm_objects' representing stripe */
lumlen = sizeof(struct lov_user_md) +
MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
/* Furthermore, Pascal Deveze reports that, even though we pass a
* "GETSTRIPE" (read) flag to the ioctl, the call can give an error if some
* of the values of this struct are uninitialized. Use calloc in case
* there are other members that must be initialized and in case the
* lov_user_md struct changes in the future. */
lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen);
lum->lmm_magic = LOV_USER_MAGIC;
err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum);
if (!err) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
/* get file striping information and set it in info */
lum.lmm_magic = LOV_USER_MAGIC;
err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
fd->hints->striping_unit = lum->lmm_stripe_size;
sprintf(value, "%d", lum->lmm_stripe_size);
ADIOI_Info_set(fd->info, "striping_unit", value);
if (!err) {
sprintf(value, "%d", lum.lmm_stripe_size);
MPI_Info_set(fd->info, "striping_unit", value);
fd->hints->striping_factor = lum->lmm_stripe_count;
sprintf(value, "%d", lum->lmm_stripe_count);
ADIOI_Info_set(fd->info, "striping_factor", value);
sprintf(value, "%d", lum.lmm_stripe_count);
MPI_Info_set(fd->info, "striping_factor", value);
fd->hints->fs_hints.lustre.start_iodevice = lum->lmm_stripe_offset;
sprintf(value, "%d", lum->lmm_stripe_offset);
ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
sprintf(value, "%d", lum.lmm_stripe_offset);
MPI_Info_set(fd->info, "start_iodevice", value);
}
ADIOI_Free(value);
}
ADIOI_Free(lum);
if (fd->access_mode & ADIO_APPEND)
fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

Просмотреть файл

@ -4,6 +4,8 @@
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#define _XOPEN_SOURCE 600
@ -18,7 +20,7 @@ static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len,
ADIO_Offset offset, int *err)
{
int ntimes, rem, newrem, i, size, nbytes;
int rem, size, nbytes;
if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz)) {
*err = pwrite(fd->fd_direct, buf, len, offset);
} else if (len < fd->d_miniosz) {
@ -37,7 +39,7 @@ static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len,
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len,
ADIO_Offset offset, int *err)
{
int ntimes, rem, newrem, i, size, nbytes;
int rem, size, nbytes;
if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz))
*err = pread(fd->fd_direct, buf, len, offset);
else if (len < fd->d_miniosz)
@ -59,7 +61,6 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
{
int err=-1, diff, size=len, nbytes = 0;
void *newbuf;
static char myname[] = "ADIOI_LUSTRE_Directio";
if (offset % fd->d_miniosz) {
diff = fd->d_miniosz - (offset % fd->d_miniosz);
@ -87,7 +88,7 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
memcpy(newbuf, buf, size);
ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, newbuf, size, offset, &err);
nbytes += err;
free(newbuf);
ADIOI_Free(newbuf);
}
else nbytes += pwrite(fd->fd_sys, buf, size, offset);
}
@ -102,7 +103,7 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, newbuf, size, offset, &err);
if (err > 0) memcpy(buf, newbuf, err);
nbytes += err;
free(newbuf);
ADIOI_Free(newbuf);
}
else nbytes += pread(fd->fd_sys, buf, size, offset);
}
@ -136,10 +137,23 @@ static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count,
if (err == -1) goto ioerr;
}
if (io_mode)
if (io_mode) {
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
#endif
err = write(fd->fd_sys, buf, len);
else
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
#endif
} else {
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
#endif
err = read(fd->fd_sys, buf, len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
#endif
}
} else {
err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
}

Просмотреть файл

@ -0,0 +1,954 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
/* prototypes of functions used for collective writes only. */

/* Sends data to the appropriate processes and writes it in pieces of no
 * more than the Lustre stripe size; error_code is set to MPI_SUCCESS on
 * success. */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
MPI_Datatype datatype, int nprocs,
int myrank,
ADIOI_Access *others_req,
ADIOI_Access *my_req,
ADIO_Offset *offset_list,
ADIO_Offset *len_list,
int contig_access_count,
int *striping_info,
int **buf_idx, int *error_code);

/* NOTE(review): definition is not in the visible part of this file;
 * judging by the name and parameters it fills the per-process send buffers
 * from a non-contiguous user buffer -- confirm against the definition. */
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
ADIOI_Flatlist_node *flat_buf,
char **send_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
MPI_Request *requests,
int *sent_to_proc, int nprocs,
int myrank, int contig_access_count,
int *striping_info,
int *send_buf_idx,
int *curr_to_proc,
int *done_to_proc, int iter,
MPI_Aint buftype_extent);

/* NOTE(review): definition is not in the visible part of this file;
 * presumably performs the data exchange of one iteration of the collective
 * write -- confirm against the definition. */
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
char *write_buf,
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off,
int size, int *count,
int *start_pos, int *partial_recv,
int *sent_to_proc, int nprocs,
int myrank, int buftype_is_contig,
int contig_access_count,
int *striping_info,
ADIOI_Access *others_req,
int *send_buf_idx,
int *curr_to_proc,
int *done_to_proc, int *hole,
int iter, MPI_Aint buftype_extent,
int *buf_idx, int *error_code);

/* Declared without 'static', i.e. with external linkage; the definition is
 * not in the visible part of this file. */
void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
ADIO_Offset *srt_off, int *srt_len, int *start_pos,
int nprocs, int nprocs_recv, int total_elements);
/* ADIOI_LUSTRE_WriteStridedColl() - collective strided write for Lustre.
 *
 * Uses a generalized version of the extended two-phase method described
 * in "An Extended Two-Phase Method for Accessing Sections of
 * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
 * Scientific Programming, (5)4:301--317, Winter 1996.
 * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
 *
 * Outline:
 *   1. unless cb_write is disabled, work out this process's offsets and
 *      lengths and look at the access pattern of all processes;
 *   2. if collective buffering is disabled, or is "auto" and the pattern
 *      does not call for it, write independently and return;
 *   3. otherwise map the requests onto aggregators, exchange the data and
 *      write it, then agree on a common error status and free everything.
 *
 * error_code receives the result; status is filled in (when
 * HAVE_STATUS_SET_BYTES is defined and status is non-NULL) with the
 * requested byte count.
 */
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
                                   MPI_Datatype datatype,
                                   int file_ptr_type, ADIO_Offset offset,
                                   ADIO_Status *status, int *error_code)
{
    /* my_req: nprocs access structures, one per process that holds part
     * of this process's request */
    ADIOI_Access *my_req;
    /* others_req: nprocs access structures, one per process whose request
     * is written by this process */
    ADIOI_Access *others_req;

    int r, filetype_is_contig, nprocs, myrank, do_collect = 0;
    int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    int go_independent;
    ADIO_Offset orig_fp, start_offset, end_offset, abs_off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
    ADIO_Offset *len_list = NULL;
    int **buf_idx = NULL, *striping_info = NULL;
    int saved_error, local_error;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    orig_fp = fd->fp_ind;

    /* I/O pattern identification, skipped when cb_write is disabled */
    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
        /* Offsets and lengths of this process's request in the file, plus
         * its first and last byte.  end_offset is the last byte accessed,
         * e.g. start_offset=0 and 100 bytes give end_offset=99.
         */
        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &offset_list, &len_list, &start_offset,
                              &end_offset, &contig_access_count);

        /* Everybody learns everybody's start and end offsets, stored in
         * rank order.
         */
        st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
        end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                      ADIO_OFFSET, fd->comm);

        /* Rudimentary interleaving check: does a process start before its
         * predecessor in rank order has finished?
         */
        for (r = 1; r < nprocs; r++) {
            if ((st_offsets[r] < end_offsets[r - 1]) &&
                (st_offsets[r] <= end_offsets[r])) {
                interleave_count++;
            }
        }

        /* Two access patterns benefit from a collective write:
         * 1) the processes are interleaved, and
         * 2) the request size is small.
         */
        if (interleave_count > 0) {
            do_collect = 1;
        } else {
            do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
                                                len_list, nprocs);
        }
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    /* Decide if collective I/O should be done */
    go_independent = (fd->hints->cb_write == ADIOI_HINT_DISABLE) ||
                     (fd->hints->cb_write == ADIOI_HINT_AUTO && !do_collect);

    if (go_independent) {
        /* the lists exist only if the pattern analysis above ran */
        if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
            ADIOI_Free(offset_list);
            ADIOI_Free(len_list);
            ADIOI_Free(st_offsets);
            ADIOI_Free(end_offsets);
        }

        fd->fp_ind = orig_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

        if (!(buftype_is_contig && filetype_is_contig)) {
            ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                              offset, status, error_code);
        } else if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            abs_off = fd->disp + (ADIO_Offset) (fd->etype_size) * offset;
            ADIO_WriteContig(fd, buf, count, datatype,
                             ADIO_EXPLICIT_OFFSET,
                             abs_off, status, error_code);
        } else {
            ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                             0, status, error_code);
        }
        return;
    }

    /* Get Lustre hints information */
    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);

    /* Which portions of this process's request are located in which
     * process?
     */
    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
                             striping_info, nprocs, &count_my_req_procs,
                             &count_my_req_per_proc, &my_req,
                             &buf_idx);

    /* From everyone's my_req, derive the requests of other processes that
     * this process will access.  count_others_req_procs is the number of
     * processes (possibly including this one) whose requests are accessed
     * here.
     */
    ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
                          my_req, nprocs, myrank, &count_others_req_procs,
                          &others_req);
    ADIOI_Free(count_my_req_per_proc);

    /* exchange data and write in sizes of no more than stripe_size. */
    ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
                                others_req, my_req, offset_list, len_list,
                                contig_access_count, striping_info,
                                buf_idx, error_code);

    /* If this collective write is followed by an independent write, the
     * later writes of other processes could race ahead and sneak in before
     * the read-modify-write completes.  A collective communication at the
     * end makes sure nobody starts independent I/O before the collective
     * I/O is finished.
     *
     * The error codes need some care: if something went wrong, every
     * process reports an error, but a process that has a more specific
     * error code still reports that one.
     */
    saved_error = *error_code;
    if (*error_code != MPI_SUCCESS) {
        *error_code = MPI_ERR_IO;
    }

    /* optimization: with a single process doing I/O, a Bcast is cheaper */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
#endif
    if (fd->hints->cb_nodes == 1) {
        MPI_Bcast(error_code, 1, MPI_INT,
                  fd->hints->ranklist[0], fd->comm);
    } else {
        local_error = *error_code;
        MPI_Allreduce(&local_error, error_code, 1, MPI_INT,
                      MPI_MAX, fd->comm);
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
#endif

    if ((saved_error != MPI_SUCCESS) && (saved_error != MPI_ERR_IO)) {
        *error_code = saved_error;
    }

    if (!buftype_is_contig) {
        ADIOI_Delete_flattened(datatype);
    }

    /* free all memory allocated for collective I/O */

    /* others_req */
    for (r = 0; r < nprocs; r++) {
        if (others_req[r].count) {
            ADIOI_Free(others_req[r].offsets);
            ADIOI_Free(others_req[r].lens);
            ADIOI_Free(others_req[r].mem_ptrs);
        }
    }
    ADIOI_Free(others_req);

    /* my_req */
    for (r = 0; r < nprocs; r++) {
        if (my_req[r].count) {
            ADIOI_Free(my_req[r].offsets);
            ADIOI_Free(my_req[r].lens);
        }
    }
    ADIOI_Free(my_req);

    /* buf_idx */
    for (r = 0; r < nprocs; r++) {
        ADIOI_Free(buf_idx[r]);
    }
    ADIOI_Free(buf_idx);

    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(striping_info);

#ifdef HAVE_STATUS_SET_BYTES
    if (status) {
        int bufsize, size;
        /* Don't set status if it isn't needed */
        MPI_Type_size(datatype, &size);
        bufsize = size * count;
        MPIR_Status_set_bytes(status, datatype, bufsize);
    }
    /* This is a temporary way of filling in status.  The right way is to
     * keep track of how much data was actually written during collective
     * I/O.
     */
#endif

    fd->fp_sys_posn = -1;       /* set it to null. */
}
/* ADIOI_LUSTRE_Exch_and_write - exchange data between the processes and
 * write it to the file in pieces of at most one Lustre stripe.
 *
 * Parameters (as used by this function):
 *   fd            - file being written; fd->comm is used for the reductions
 *                   and fd->hints supplies the "ds_in_coll" hint
 *   buf, datatype - user buffer and its datatype
 *   nprocs        - number of processes; every per-process array below has
 *                   nprocs entries
 *   myrank        - rank of this process (forwarded to the exchange routine)
 *   others_req    - per-process offset/length lists of the data this process
 *                   receives and writes
 *   my_req        - per-process offset/length lists of the data this process
 *                   sends
 *   offset_list, len_list, contig_access_count
 *                 - this process's own access list; only forwarded to
 *                   ADIOI_LUSTRE_W_Exchange_data
 *   striping_info - [0] is read as the stripe size, [2] as the number of
 *                   available I/O aggregators
 *   buf_idx       - buf_idx[i][k] is handed to the exchange routine as the
 *                   user-buffer index of the k-th request for process i
 *   error_code    - set to MPI_SUCCESS if successful. Otherwise an error
 *                   code is created and returned here.
 */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
                                        MPI_Datatype datatype, int nprocs,
                                        int myrank, ADIOI_Access *others_req,
                                        ADIOI_Access *my_req,
                                        ADIO_Offset *offset_list,
                                        ADIO_Offset *len_list,
                                        int contig_access_count,
                                        int *striping_info, int **buf_idx,
                                        int *error_code)
{
    /* Send data to appropriate processes and write in sizes of no more
     * than lustre stripe_size.
     * The idea is to reduce the amount of extra memory required for
     * collective I/O. If all data were written all at once, which is much
     * easier, it would require temp space more than the size of user_buf,
     * which is often unacceptable. For example, to write a distributed
     * array to a file, where each local array is 8Mbytes, requiring
     * at least another 8Mbytes of temp space is unacceptable.
     */
    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
    ADIO_Offset max_size, step_size = 0;
    int real_size, req_len, send_len;
    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
    int *send_curr_offlen_ptr, *send_size;
    /* partial_recv is never allocated here; it is only forwarded to
     * ADIOI_LUSTRE_W_Exchange_data.  Initialize it so that an indeterminate
     * pointer value is never read when it is passed along. */
    int *partial_recv = NULL, *sent_to_proc, *recv_start_pos;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    int *this_buf_idx;
    char *write_buf = NULL;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
    int data_sieving = 0;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of writes of stripe size to be done.
     * That gives the no. of communication phases as well.
     * Note:
     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
     *   each process has the same no. of communication phases.
     */
    for (i = 0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
                                          others_req[i].lens[j] - 1));
        }
    }
    /* this process does no writing. */
    if ((st_loc == -1) && (end_loc == -1))
        ntimes = 0;
    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
    /* avoid min_st_loc be -1 */
    if (st_loc == -1)
        st_loc = max_end_loc;
    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
    /* align downward */
    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;

    /* Each time, only avail_cb_nodes number of IO clients perform IO,
     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
     * and ntimes=whole_file_portion/step_size
     */
    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
    max_ntimes = (max_end_loc - min_st_loc + 1) / step_size
        + (((max_end_loc - min_st_loc + 1) % step_size) ? 1 : 0);
    /* max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); */
    if (ntimes)
        write_buf = (char *) ADIOI_Malloc(stripe_size);

    /* calculate the start offset for each iteration */
    off_list = (ADIO_Offset *) ADIOI_Malloc((max_ntimes + 1) *
                                            sizeof(ADIO_Offset));
    /* +1 to avoid a 0-size malloc when no process writes anything */
    for (m = 0; m < max_ntimes; m ++)
        off_list[m] = max_end_loc;
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j ++) {
            req_off = others_req[i].offsets[j];
            m = (int)((req_off - min_st_loc) / step_size);
            off_list[m] = ADIOI_MIN(off_list[m], req_off);
        }
    }

    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* their use is explained below. calloc initializes to 0. */

    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer */

    this_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));

    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* used to store the starting value of recv_curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype)
            flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    /* I need to check if there are any outstanding nonblocking writes to
     * the file, which could potentially interfere with the writes taking
     * place in this collective write call. Since this is not likely to be
     * common, let me do the simplest thing possible here: Each process
     * completes all pending nonblocking operations before completing.
     */
    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    iter_st_off = min_st_loc;

    /* Although we have recognized the data according to OST index,
     * a read-modify-write will be done if there is a hole between the data.
     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
     * then rank0 will collect data [0, 30] and [60, 90] then write. There
     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
     *
     * To reduce its impact on the performance, we can disable data sieving
     * by hint "ds_in_coll".
     */
    /* check the hint for data sieving */
    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;

    for (m = 0; m < max_ntimes; m++) {
        /* go through all others_req and my_req to check which will be received
         * and sent in this iteration.
         */

        /* Note that MPI guarantees that displacements in filetypes are in
           monotonically nondecreasing order and that, for writes, the
           filetypes cannot specify overlapping regions in the file. This
           simplifies implementation a bit compared to reads. */

        /*
           off         = start offset in the file for the data to be written in
                         this iteration
           iter_st_off = start offset of this iteration
           real_size   = size of data written (bytes) corresponding to off
           max_size    = possible maximum size of data written in this iteration
           req_off     = offset in the file for a particular contiguous request minus
                         what was satisfied in previous iteration
           send_off    = offset the request needed by other processes in this iteration
           req_len     = size corresponding to req_off
           send_len    = size corresponding to send_off
         */

        /* first calculate what should be communicated */
        for (i = 0; i < nprocs; i++)
            recv_count[i] = recv_size[i] = send_size[i] = 0;

        off = off_list[m];
        max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
        real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size -
                                    off,
                                    end_loc - off + 1);

        for (i = 0; i < nprocs; i++) {
            if (my_req[i].count) {
                this_buf_idx[i] = buf_idx[i][send_curr_offlen_ptr[i]];
                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
                    send_off = my_req[i].offsets[j];
                    send_len = my_req[i].lens[j];
                    if (send_off < iter_st_off + max_size) {
                        send_size[i] += send_len;
                    } else {
                        break;
                    }
                }
                send_curr_offlen_ptr[i] = j;
            }
            if (others_req[i].count) {
                recv_start_pos[i] = recv_curr_offlen_ptr[i];
                for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
                    if (req_off < iter_st_off + max_size) {
                        recv_count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
                        MPI_Address(write_buf + req_off - off,
                                    &(others_req[i].mem_ptrs[j]));
                        recv_size[i] += req_len;
                    } else {
                        break;
                    }
                }
                recv_curr_offlen_ptr[i] = j;
            }
        }

        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
        hole = data_sieving;
        ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                                     len_list, send_size, recv_size, off, real_size,
                                     recv_count, recv_start_pos, partial_recv,
                                     sent_to_proc, nprocs, myrank,
                                     buftype_is_contig, contig_access_count,
                                     striping_info, others_req, send_buf_idx,
                                     curr_to_proc, done_to_proc, &hole, m,
                                     buftype_extent, this_buf_idx, error_code);
        if (*error_code != MPI_SUCCESS)
            goto over;

        flag = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_count[i]) {
                flag = 1;
                break;
            }
        if (flag) {
            /* check whether to do data sieving */
            if(data_sieving == ADIOI_HINT_ENABLE) {
                ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                 ADIO_EXPLICIT_OFFSET, off, &status,
                                 error_code);
            } else {
                /* if there is no hole, write data in one time;
                 * otherwise, write data in several times */
                if (!hole) {
                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, off, &status,
                                     error_code);
                } else {
                    for (i = 0; i < nprocs; i++) {
                        if (others_req[i].count) {
                            for (j = 0; j < others_req[i].count; j++) {
                                if (others_req[i].offsets[j] < off + real_size &&
                                    others_req[i].offsets[j] >= off) {
                                    ADIO_WriteContig(fd,
                                                     write_buf + others_req[i].offsets[j] - off,
                                                     others_req[i].lens[j],
                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                                     others_req[i].offsets[j], &status,
                                                     error_code);
                                    if (*error_code != MPI_SUCCESS)
                                        goto over;
                                }
                            }
                        }
                    }
                }
            }
            if (*error_code != MPI_SUCCESS)
                goto over;
        }
        iter_st_off += max_size;
    }
over:
    /* common cleanup for both the success and the error paths */
    if (ntimes)
        ADIOI_Free(write_buf);
    ADIOI_Free(recv_curr_offlen_ptr);
    ADIOI_Free(send_curr_offlen_ptr);
    ADIOI_Free(recv_count);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(recv_start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
    ADIOI_Free(this_buf_idx);
    ADIOI_Free(off_list);
}
/* ADIOI_LUSTRE_W_Exchange_data - one communication phase of the collective
 * write: receive into write_buf the pieces described by others_req and send
 * this process's pieces to the processes that write them.
 *
 * Parameters (as used by this function):
 *   fd              - file; fd->comm carries the messages, fd->atomicity
 *                     selects blocking receives instead of MPI_Irecv
 *   buf             - user buffer (sent from directly when the buftype is
 *                     contiguous)
 *   write_buf       - buffer that is pre-read on a read-modify-write; the
 *                     receive datatypes use the absolute addresses stored in
 *                     others_req[i].mem_ptrs
 *   flat_buf        - flattened buftype, forwarded to
 *                     ADIOI_LUSTRE_Fill_send_buffer
 *   send_size, recv_size
 *                   - bytes to send to / receive from each process
 *   off, size       - file offset and length used for the read-modify-write
 *   count, start_pos
 *                   - number of, and first index of, the offset/length pairs
 *                     of others_req[i] received in this phase
 *   partial_recv    - not used by this function
 *   hole            - in: the data-sieving hint value; out: 1 if the received
 *                     pieces leave a hole in [off, off+size), else 0
 *   iter            - phase number, used to build the message tag
 *   buf_idx         - buf_idx[i] is the byte index into buf of the data sent
 *                     to process i (contiguous buftype only)
 *   error_code      - sets error_code to an error code created with
 *                     MPIO_Err_create_code if the read-modify-write read
 *                     fails; it is not written otherwise
 *
 * The remaining parameters are only forwarded to
 * ADIOI_LUSTRE_Fill_send_buffer.
 */
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
                                         char *write_buf,
                                         ADIOI_Flatlist_node *flat_buf,
                                         ADIO_Offset *offset_list,
                                         ADIO_Offset *len_list, int *send_size,
                                         int *recv_size, ADIO_Offset off,
                                         int size, int *count,
                                         int *start_pos, int *partial_recv,
                                         int *sent_to_proc, int nprocs,
                                         int myrank, int buftype_is_contig,
                                         int contig_access_count,
                                         int *striping_info,
                                         ADIOI_Access *others_req,
                                         int *send_buf_idx,
                                         int *curr_to_proc, int *done_to_proc,
                                         int *hole, int iter,
                                         MPI_Aint buftype_extent,
                                         int *buf_idx, int *error_code)
{
    int i, j, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int *srt_len, sum, sum_recv;
    ADIO_Offset *srt_off;
    int data_sieving = *hole;
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

    /* create derived datatypes for recv */
    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++)
        if (recv_size[i])
            nprocs_recv++;

    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
                                               sizeof(MPI_Datatype));
    /* +1 to avoid a 0-size malloc */

    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (recv_size[i]) {
            MPI_Type_hindexed(count[i],
                              &(others_req[i].lens[start_pos[i]]),
                              &(others_req[i].mem_ptrs[start_pos[i]]),
                              MPI_BYTE, recv_types + j);
            /* absolute displacements; use MPI_BOTTOM in recv */
            MPI_Type_commit(recv_types + j);
            j++;
        }
    }

    /* To avoid a read-modify-write,
     * check if there are holes in the data to be written.
     * For this, merge the (sorted) offset lists others_req using a heap-merge.
     */
    sum = 0;
    for (i = 0; i < nprocs; i++)
        sum += count[i];
    srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
    srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
    /* +1 to avoid a 0-size malloc */

    ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
                     nprocs, nprocs_recv, sum);

    /* check if there are any holes */
    *hole = 0;
    for (i = 0; i < sum - 1; i++) {
        if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
            *hole = 1;
            break;
        }
    }
    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
     * between aggregation, nominally contiguous regions, and cb_buffer_size
     * should be handled with a read-modify-write (otherwise we will write out
     * more data than we receive from everyone else (inclusive), so override
     * hole detection
     */
    if (*hole == 0) {
        sum_recv = 0;
        for (i = 0; i < nprocs; i++)
            sum_recv += recv_size[i];
        if (size > sum_recv)
            *hole = 1;
    }
    /* check the hint for data sieving */
    if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
        /* --BEGIN ERROR HANDLING-- */
        if (err != MPI_SUCCESS) {
            *error_code = MPIO_Err_create_code(err,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "**ioRMWrdwr", 0);
            /* the receive datatypes were committed above; release the
             * handles before freeing the array that holds them */
            for (i = 0; i < nprocs_recv; i++)
                MPI_Type_free(recv_types + i);
            ADIOI_Free(recv_types);
            ADIOI_Free(srt_off);
            ADIOI_Free(srt_len);
            return;
        }
        /* --END ERROR HANDLING-- */
    }
    ADIOI_Free(srt_off);
    ADIOI_Free(srt_len);

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++)
        if (send_size[i])
            nprocs_send++;

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
                                                sizeof(MPI_Request));
        send_req = requests;
    } else {
        requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
                                                sizeof(MPI_Request));
        /* +1 to avoid a 0-size malloc */

        /* post receives */
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (recv_size[i]) {
                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
                          myrank + i + 100 * iter, fd->comm, requests + j);
                j++;
            }
        }
        send_req = requests + nprocs_recv;
    }

    /* post sends.
     * if buftype_is_contig, data can be directly sent from
     * user buf at location given by buf_idx. else use send_buf.
     */
    if (buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (send_size[i]) {
                ADIOI_Assert(buf_idx[i] != -1);
                MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
                          MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
                          send_req + j);
                j++;
            }
    } else
        if (nprocs_send) {
            /* buftype is not contig */
            send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
            for (i = 0; i < nprocs; i++)
                if (send_size[i])
                    send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);

            ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
                                          len_list, send_size, send_req,
                                          sent_to_proc, nprocs, myrank,
                                          contig_access_count, striping_info,
                                          send_buf_idx, curr_to_proc, done_to_proc,
                                          iter, buftype_extent);
            /* the send is done in ADIOI_Fill_send_buffer */
        }

    /* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity) {
        j = 0;
        for (i = 0; i < nprocs; i++) {
            MPI_Status wkl_status;
            if (recv_size[i]) {
                MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
                         myrank + i + 100 * iter, fd->comm, &wkl_status);
                j++;
            }
        }
    }

    for (i = 0; i < nprocs_recv; i++)
        MPI_Type_free(recv_types + i);
    ADIOI_Free(recv_types);

    /* bug fix from Wei-keng Liao and Kenin Coloma */
    /* +1 to avoid a 0-size malloc */
    if (fd->atomicity) {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
                                               sizeof(MPI_Status));
    } else {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
                                               sizeof(MPI_Status));
    }

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        while (!i)
            MPI_Testall(nprocs_send, send_req, &i, statuses);
    } else {
        while (!i)
            MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
    }
#else
    /* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity)
        MPI_Waitall(nprocs_send, send_req, statuses);
    else
        MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
#endif
    ADIOI_Free(statuses);
    ADIOI_Free(requests);
    if (!buftype_is_contig && nprocs_send) {
        for (i = 0; i < nprocs; i++)
            if (send_size[i])
                ADIOI_Free(send_buf[i]);
        ADIOI_Free(send_buf);
    }
}
/* ADIOI_BUF_INCR - advance the current position in the flattened user
 * buffer by buf_incr bytes without copying any data.
 *
 * The macro expands in place and reads/writes these variables of the
 * enclosing function: buf_incr, size_in_buf, user_buf_idx, flat_buf_sz,
 * flat_buf_idx, n_buftypes, flat_buf and buftype_extent.
 * Whenever the current contiguous piece is used up (flat_buf_sz reaches 0)
 * it moves on to the next piece of flat_buf; after the last piece it wraps
 * to piece 0 and counts one more buftype instance (n_buftypes++).
 * The loop runs until buf_incr is 0.
 * The expansion is a brace block, so it is invoked without a trailing
 * semicolon.
 */
#define ADIOI_BUF_INCR \
{ \
while (buf_incr) { \
size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
user_buf_idx += size_in_buf; \
flat_buf_sz -= size_in_buf; \
if (!flat_buf_sz) { \
if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
else { \
flat_buf_idx = 0; \
n_buftypes++; \
} \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \
(ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \
buf_incr -= size_in_buf; \
} \
}
/* ADIOI_BUF_COPY - copy `size` bytes from the flattened user buffer into
 * send_buf[p], starting at send_buf_idx[p], then skip whatever is left of
 * buf_incr with ADIOI_BUF_INCR.
 *
 * The macro expands in place and reads/writes these variables of the
 * enclosing function: size, buf_incr, size_in_buf, user_buf_idx,
 * flat_buf_sz, flat_buf_idx, n_buftypes, flat_buf, buftype_extent, buf,
 * send_buf, send_buf_idx and p.
 * Both size and buf_incr are decremented by every chunk copied, so the
 * caller sets buf_incr to the total distance to advance (copied plus
 * skipped bytes) before invoking the macro.
 * The expansion is a brace block, so it is invoked without a trailing
 * semicolon.
 */
#define ADIOI_BUF_COPY \
{ \
while (size) { \
size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)((MPIR_Upint)buf + user_buf_idx)); \
ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
memcpy(&(send_buf[p][send_buf_idx[p]]), \
((char *) buf) + user_buf_idx, size_in_buf); \
send_buf_idx[p] += size_in_buf; \
user_buf_idx += size_in_buf; \
flat_buf_sz -= size_in_buf; \
if (!flat_buf_sz) { \
if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
else { \
flat_buf_idx = 0; \
n_buftypes++; \
} \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \
(ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \
size -= size_in_buf; \
buf_incr -= size_in_buf; \
} \
ADIOI_BUF_INCR \
}
/* ADIOI_LUSTRE_Fill_send_buffer - pack the data of a noncontiguous user
 * buffer into the per-process send buffers for one communication phase and
 * post the sends.
 *
 * The function walks this process's access list (offset_list/len_list,
 * contig_access_count entries) from the start on every call. Each access is
 * split into the pieces owned by individual aggregators (as reported by
 * ADIOI_LUSTRE_Calc_aggregator). Data already sent in earlier phases
 * (sent_to_proc) is skipped; the next send_size[p] bytes for process p are
 * copied into send_buf[p]. As soon as send_buf[p] is full an MPI_Isend to p
 * is posted, using consecutive slots of `requests`.
 *
 * On return sent_to_proc[i] has been updated for every process with a
 * non-zero send_size[i]. send_buf_idx, curr_to_proc and done_to_proc are
 * caller-provided scratch arrays of nprocs entries that are re-initialized
 * here.
 *
 * The local names flat_buf_idx, flat_buf_sz, buf_incr, size_in_buf,
 * n_buftypes, user_buf_idx, size and p are referenced by the ADIOI_BUF_INCR
 * and ADIOI_BUF_COPY macros and must not be renamed.
 */
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
ADIOI_Flatlist_node *flat_buf,
char **send_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
MPI_Request *requests,
int *sent_to_proc, int nprocs,
int myrank,
int contig_access_count,
int *striping_info,
int *send_buf_idx,
int *curr_to_proc,
int *done_to_proc, int iter,
MPI_Aint buftype_extent)
{
/* this function is only called if buftype is not contig */
int i, p, flat_buf_idx, size;
int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
ADIO_Offset off, len, rem_len, user_buf_idx;
/* curr_to_proc[p] = amount of data sent to proc. p that has already
* been accounted for so far
* done_to_proc[p] = amount of data already sent to proc. p in
* previous iterations
* user_buf_idx = current location in user buffer
* send_buf_idx[p] = current location in send_buf of proc. p
*/
for (i = 0; i < nprocs; i++) {
send_buf_idx[i] = curr_to_proc[i] = 0;
done_to_proc[i] = sent_to_proc[i];
}
/* jj = index of the next free slot in requests[] */
jj = 0;
user_buf_idx = flat_buf->indices[0];
flat_buf_idx = 0;
n_buftypes = 0;
flat_buf_sz = flat_buf->blocklens[0];
/* flat_buf_idx = current index into flattened buftype
* flat_buf_sz = size of current contiguous component in flattened buf
*/
for (i = 0; i < contig_access_count; i++) {
off = offset_list[i];
rem_len = (ADIO_Offset) len_list[i];
/*this request may span to more than one process */
while (rem_len != 0) {
len = rem_len;
/* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
* longer than the single region that processor "p" is responsible
* for.
*/
p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);
/* send_buf[p] still has room in this phase */
if (send_buf_idx[p] < send_size[p]) {
/* this piece extends beyond what was sent to p in earlier phases */
if (curr_to_proc[p] + len > done_to_proc[p]) {
if (done_to_proc[p] > curr_to_proc[p]) {
/* the front of this piece was sent earlier: skip that part
* (ADIOI_BUF_INCR), then copy from the remainder */
size = (int) ADIOI_MIN(curr_to_proc[p] + len -
done_to_proc[p],
send_size[p] -
send_buf_idx[p]);
buf_incr = done_to_proc[p] - curr_to_proc[p];
ADIOI_BUF_INCR
ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
buf_incr = (int) (curr_to_proc[p] + len -
done_to_proc[p]);
ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
curr_to_proc[p] = done_to_proc[p] + size;
ADIOI_BUF_COPY
} else {
/* nothing of this piece was sent earlier: copy as much as fits
* in send_buf[p]; the macro skips the rest of the piece */
size = (int) ADIOI_MIN(len, send_size[p] -
send_buf_idx[p]);
buf_incr = (int) len;
ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
curr_to_proc[p] += size;
ADIOI_BUF_COPY
}
/* send buffer for p is complete: post the send */
if (send_buf_idx[p] == send_size[p]) {
MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
myrank + p + 100 * iter, fd->comm,
requests + jj);
jj++;
}
} else {
/* the whole piece was sent in an earlier phase: account for it
* and skip it in the user buffer */
ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
curr_to_proc[p] += (int) len;
buf_incr = (int) len;
ADIOI_BUF_INCR
}
} else {
/* nothing (more) to send to p in this phase: only advance the
* position in the user buffer */
buf_incr = (int) len;
ADIOI_BUF_INCR
}
off += len;
rem_len -= len;
}
}
/* remember how much has been sent to each process so far */
for (i = 0; i < nprocs; i++)
if (send_size[i])
sent_to_proc[i] = curr_to_proc[i];
}

Просмотреть файл

@ -0,0 +1,530 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
/* ADIOI_BUFFERED_WRITE - copy one contiguous request (req_off, req_len,
 * taken from buf at userbuf_off) into the write buffer, doing a
 * read-modify-write of the file one stripe-aligned window at a time.
 *
 * The macro expands in place and reads/writes these variables of the
 * enclosing function: fd, buf, writebuf, writebuf_off, writebuf_len,
 * write_sz, req_off, req_len, userbuf_off, end_offset, stripe_size,
 * status1, error_code and myname.
 *
 * If req_off lies at or beyond the end of the current window, the window is
 * written out (when writebuf_len is non-zero) and a new window starting at
 * req_off is read from the file; the window ends at the next stripe_size
 * boundary or at end_offset, whichever comes first. While the request does
 * not fit into the current window, the window is written out and the
 * following one is read in. When atomicity is off, each window is locked
 * before it is read and unlocked after it has been written.
 *
 * On an I/O error the macro frees writebuf, stores a new error code in
 * *error_code and returns from the enclosing function.
 * NOTE(review): when ADIO_ReadContig fails, the window lock taken just
 * before the read is not released, and nothing else the caller holds is
 * released either - confirm that callers can tolerate this.
 *
 * The expansion is a brace block, so it is invoked without a trailing
 * semicolon.
 */
#define ADIOI_BUFFERED_WRITE \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
if (writebuf_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, \
&status1, error_code); \
if (!(fd->atomicity)) \
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
} \
writebuf_off = req_off; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
if (!(fd->atomicity)) \
ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, \
writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowsrc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
} \
write_sz = (unsigned) (ADIOI_MIN(req_len, \
writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == \
ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf + req_off - writebuf_off, (char *)buf +userbuf_off, write_sz); \
while (write_sz != req_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
if (!(fd->atomicity)) \
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
if (!(fd->atomicity)) \
ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, \
writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowsrc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
} \
}
/* ADIOI_BUFFERED_WRITE_WITHOUT_READ - copy one contiguous request (req_off,
 * req_len, taken from buf at userbuf_off) into the write buffer one
 * stripe-aligned window at a time.
 *
 * This macro is used when filetype is contig and buftype is not contig.
 * It does not do a read-modify-write and does not lock.
 *
 * The macro expands in place and reads/writes these variables of the
 * enclosing function: fd, buf, writebuf, writebuf_off, writebuf_len,
 * write_sz, req_off, req_len, userbuf_off, end_offset, stripe_size,
 * status1, error_code and myname.
 *
 * If req_off lies at or beyond the end of the current window, the window is
 * written out (even when writebuf_len is 0, as on the first use) and a new
 * window is started at req_off; it ends at the next stripe_size boundary or
 * at end_offset, whichever comes first. While the request does not fit into
 * the current window, the window is written out and the next one is
 * started. The last window is left in writebuf for the caller to write.
 *
 * On a write error the macro frees writebuf, stores a new error code in
 * *error_code and returns from the enclosing function.
 *
 * The expansion is a brace block, so it is invoked without a trailing
 * semicolon.
 */
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, \
error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
writebuf_off = req_off; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
} \
write_sz = (unsigned) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf + req_off - writebuf_off, \
(char *)buf + userbuf_off, write_sz); \
while (write_sz != req_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
} \
}
/* ADIOI_LUSTRE_WriteStrided - independent strided write that buffers the
 * data in stripe-sized pieces (data sieving for writes).
 *
 * Parameters:
 *   fd            - file to write; its filetype, disp, etype_size, hints,
 *                   atomicity flag and individual file pointer are used
 *   buf           - user buffer
 *   count         - number of `datatype` elements in buf
 *   datatype      - datatype describing the layout of buf
 *   file_ptr_type - ADIO_INDIVIDUAL to write at (and then advance) fd->fp_ind;
 *                   otherwise `offset` is used as an explicit offset
 *   offset        - offset in units of etype relative to the filetype
 *   status        - receives the byte count when HAVE_STATUS_SET_BYTES is
 *                   defined
 *   error_code    - MPI_SUCCESS on success, otherwise an error code
 *
 * If the ds_write hint is ADIOI_HINT_DISABLE the work is delegated to
 * ADIOI_GEN_WriteStrided_naive. Otherwise three cases are handled:
 * noncontiguous buffer with contiguous filetype (no read-modify-write),
 * a contiguous buffer that fits into a single contiguous block of the
 * filetype (no read-modify-write), and the general noncontiguous-in-file
 * case (read-modify-write through ADIOI_BUFFERED_WRITE).
 *
 * Note: the ADIOI_BUFFERED_WRITE* macros reference the locals of this
 * function by name and return from it directly on an I/O error.
 */
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status * status,
                               int *error_code)
{
    /* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, sum, size_in_filetype;
    int i, j, k, st_index=0;
    int n_etypes_in_filetype;
    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
    char *writebuf;
    unsigned bufsize, writebuf_len, write_sz;
    ADIO_Status status1;
    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
    int stripe_size;
    static char myname[] = "ADIOI_LUSTRE_WriteStrided";

    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
        /* if user has disabled data sieving on writes, use naive
         * approach instead.
         */
        ADIOI_GEN_WriteStrided_naive(fd,
                                     buf,
                                     count,
                                     datatype,
                                     file_ptr_type,
                                     offset, status, error_code);
        return;
    }

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size(fd->filetype, &filetype_size);
    if (!filetype_size) {
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get striping info */
    stripe_size = fd->hints->striping_unit;

    /* Different buftype to different filetype */
    if (!buftype_is_contig && filetype_is_contig) {
        /* noncontiguous in memory, contiguous in file. */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype)
            flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
            fd->disp + (ADIO_Offset)etype_size * offset;

        start_off = off;
        end_offset = start_off + bufsize - 1;
        /* write stripe size buffer each time */
        writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
        writebuf_off = 0;
        writebuf_len = 0;

        /* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);

        for (j = 0; j < count; j++) {
            for (i = 0; i < flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j * (ADIO_Offset)buftype_extent +
                    flat_buf->indices[i];
                req_off = off;
                req_len = flat_buf->blocklens[i];
                ADIOI_BUFFERED_WRITE_WITHOUT_READ
                off += flat_buf->blocklens[i];
            }
        }

        /* write the buffer out finally */
        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
                         error_code);

        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
        if (*error_code != MPI_SUCCESS) {
            ADIOI_Free(writebuf);
            return;
        }
        ADIOI_Free(writebuf);
        if (file_ptr_type == ADIO_INDIVIDUAL)
            fd->fp_ind = off;
    } else {
        /* noncontiguous in file */
        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype)
            flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset = fd->fp_ind - disp;
            n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* fwr_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset = flat_file->indices[i];
                    fwr_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    fwr_size = dist;
                    break;
                }
            }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i = 0; i < flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
                abs_off_in_filetype;
        }

        start_off = offset;

        /* Wei-keng Liao:write request is within single flat_file
         * contig block*/
        /* this could happen, for example, with subarray types that are
         * actually fairly contiguous */
        if (buftype_is_contig && bufsize <= fwr_size) {
            req_off = start_off;
            req_len = bufsize;
            end_offset = start_off + bufsize - 1;
            writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
            memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
            writebuf_off = 0;
            writebuf_len = 0;
            userbuf_off = 0;
            ADIOI_BUFFERED_WRITE_WITHOUT_READ
            /* write the buffer out finally */
            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
                             error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte
                 * that can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == fwr_size) {
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                        + (ADIO_Offset)n_filetypes*filetype_extent;
                }
            }
            fd->fp_sys_posn = -1;   /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            ADIOI_Free(writebuf);
            return;
        }

        /* Calculate end_offset, the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes are to be written,
           end_offset=99 */

        st_fwr_size = fwr_size;
        st_n_filetypes = n_filetypes;
        i_offset = 0;
        j = st_index;
        off = offset;
        fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += fwr_size;
            end_offset = off + fwr_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }

            off = disp + flat_file->indices[j] +
                n_filetypes*(ADIO_Offset)filetype_extent;
            fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
        }

        /* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        writebuf_off = 0;
        writebuf_len = 0;
        writebuf = (char *) ADIOI_Malloc(stripe_size);
        memset(writebuf, -1, stripe_size);

        if (buftype_is_contig && !filetype_is_contig) {

            /* contiguous in memory, noncontiguous in file. should be the most
               common case. */

            i_offset = 0;
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
            while (i_offset < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       fwr_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/

                    req_off = off;
                    req_len = fwr_size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_WRITE
                }
                i_offset += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] +
                    n_filetypes*(ADIO_Offset)filetype_extent)
                    off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
                    off = disp + flat_file->indices[j] +
                        n_filetypes*(ADIO_Offset)filetype_extent;
                    fwr_size = ADIOI_MIN(flat_file->blocklens[j],
                                         bufsize-i_offset);
                }
            }
        }
        else {
            /* noncontiguous in memory as well as in file */

            ADIOI_Flatten_datatype(datatype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != datatype) flat_buf = flat_buf->next;

            k = num = buf_count = 0;
            i_offset = flat_buf->indices[0];
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = st_fwr_size;
            bwr_size = flat_buf->blocklens[0];

            while (num < bufsize) {
                size = ADIOI_MIN(fwr_size, bwr_size);
                if (size) {
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */

                    req_off = off;
                    req_len = size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_WRITE
                }

                new_fwr_size = fwr_size;
                new_bwr_size = bwr_size;

                if (size == fwr_size) {
                    /* reached end of contiguous block in file */
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }

                    off = disp + flat_file->indices[j] +
                        n_filetypes*(ADIO_Offset)filetype_extent;

                    new_fwr_size = flat_file->blocklens[j];
                    if (size != bwr_size) {
                        i_offset += size;
                        new_bwr_size -= size;
                    }
                }

                if (size == bwr_size) {
                    /* reached end of contiguous block in memory */

                    k = (k + 1)%flat_buf->count;
                    buf_count++;
                    i_offset = (ADIO_Offset)buftype_extent *
                        (ADIO_Offset)(buf_count/flat_buf->count) +
                        flat_buf->indices[k];
                    new_bwr_size = flat_buf->blocklens[k];
                    if (size != fwr_size) {
                        off += size;
                        new_fwr_size -= size;
                    }
                }
                num += size;
                fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
            }
        }

        /* write the buffer out finally */
        if (writebuf_len) {
            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET,
                             writebuf_off, &status1, error_code);
            if (!(fd->atomicity))
                ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
            if (*error_code != MPI_SUCCESS) {
                /* do not leave the region locked or leak the write buffer
                 * when the final flush fails */
                if (fd->atomicity)
                    ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
                ADIOI_Free(writebuf);
                return;
            }
        }
        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        ADIOI_Free(writebuf);

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig)
        ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -24,6 +24,7 @@ libadio_nfs_la_SOURCES = \
ad_nfs.h \
ad_nfs_done.c \
ad_nfs_fcntl.c \
ad_nfs_features.c \
ad_nfs_getsh.c \
ad_nfs_hints.c \
ad_nfs_iread.c \

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_NFS_operations = {
ADIOI_NFS_Open, /* Open */
ADIOI_FAILSAFE_OpenColl, /* OpenColl */
ADIOI_NFS_ReadContig, /* ReadContig */
ADIOI_NFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -35,4 +36,5 @@ struct ADIOI_Fns_struct ADIO_NFS_operations = {
ADIOI_GEN_Flush, /* Flush */
ADIOI_NFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_NFS_Feature, /* Features */
};

Просмотреть файл

@ -78,5 +78,6 @@ void ADIOI_NFS_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp,
int *error_code);
void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
int ADIOI_NFS_Feature(ADIO_File fd, int feature_flag);
#endif

Просмотреть файл

@ -0,0 +1,16 @@
#include "adio.h"
#include "ad_nfs.h"
/*
 * ADIOI_NFS_Feature - report whether the NFS driver supports an optional
 * ADIO feature.
 *
 * fd   - file handle; not consulted by this implementation.
 * flag - one of the ADIO_* feature constants.
 *
 * Returns 1 for ADIO_SHARED_FP, ADIO_LOCKS, ADIO_SEQUENTIAL and
 * ADIO_DATA_SIEVING_WRITES; returns 0 for ADIO_SCALABLE_OPEN and for any
 * other value of flag.
 */
int ADIOI_NFS_Feature(ADIO_File fd, int flag)
{
    int supported;

    switch (flag) {
    case ADIO_SHARED_FP:
    case ADIO_LOCKS:
    case ADIO_SEQUENTIAL:
    case ADIO_DATA_SIEVING_WRITES:
        supported = 1;
        break;
    case ADIO_SCALABLE_OPEN:    /* listed explicitly: reported as unsupported */
    default:
        supported = 0;
        break;
    }

    return supported;
}

Просмотреть файл

@ -59,6 +59,7 @@ int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
struct aiocb *aiocbp;
ADIOI_AIO_Request *aio_req;
MPI_Status status;
fd_sys = fd->fd_sys;
@ -108,7 +109,7 @@ int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
/* exceeded the max. no. of outstanding requests.
complete all previous async. requests and try again. */
ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, NULL, &error_code);
offset, &status, &error_code);
MPIO_Completed_request_create(&fd, len, &error_code, request);
return 0;
} else {

Просмотреть файл

@ -177,7 +177,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
ADIO_Offset userbuf_off;
ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
char *readbuf, *tmp_buf, *value;
int flag, st_frd_size, st_n_filetypes, readbuf_len;
int st_frd_size, st_n_filetypes, readbuf_len;
int new_brd_size, new_frd_size, err_flag=0, info_flag, max_bufsize;
static char myname[] = "ADIOI_NFS_READSTRIDED";
@ -201,7 +201,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
/* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag);
max_bufsize = atoi(value);
ADIOI_Free(value);
@ -278,24 +278,31 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
/* Wei-keng reworked type processing to be a bit more efficient */
offset = fd->fp_ind - disp;
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
/* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i]
>= offset) {
st_index = i;
frd_size = (int) (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
ADIO_Offset dist;
if (flat_file->blocklens[i] == 0) continue;
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
/* frd_size is from offset to the end of block i */
if (dist == 0) {
i++;
offset = flat_file->indices[i];
frd_size = flat_file->blocklens[i];
break;
}
if (dist > 0 ) {
frd_size = dist;
break;
}
}
}
st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
}
else {
n_etypes_in_filetype = filetype_size/etype_size;
@ -316,11 +323,42 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
}
/* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype;
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
}
start_off = offset;
/* Wei-keng Liao: read request is within a single flat_file contig
* block e.g. with subarray types that actually describe the whole
* array */
if (buftype_is_contig && bufsize <= frd_size) {
ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte that
* can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == frd_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
@ -333,11 +371,11 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
while (i < bufsize) {
i += frd_size;
end_offset = off + frd_size - 1;
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent;
@ -402,10 +440,11 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
/* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by frd_size. */
else {
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
@ -445,12 +484,12 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
if (size == frd_size) {
/* reached end of contiguous block in file */
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;

Просмотреть файл

@ -10,120 +10,6 @@ void ADIOI_NFS_ReadComplete(ADIO_Request *request, ADIO_Status *status,
int *error_code)
{
return;
#if 0
#ifdef ROMIO_HAVE_WORKING_AIO
int err;
static char myname[] = "ADIOI_NFS_READCOMPLETE";
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_HANDLE
struct aiocb *tmp1;
#endif
#endif
if (*request == ADIO_REQUEST_NULL) {
*error_code = MPI_SUCCESS;
return;
}
#ifdef ROMIO_HAVE_AIO_SUSPEND_TWO_ARGS
/* old IBM */
if ((*request)->queued) {
do {
#if !defined(_AIO_AIX_SOURCE) && !defined(_NO_PROTO)
err = aio_suspend((*request)->handle,1,NULL);
#else
err = aio_suspend(1, (struct aiocb **) &((*request)->handle));
#endif
} while ((err == -1) && (errno == EINTR));
tmp1 = (struct aiocb *) (*request)->handle;
if (err != -1) {
err = aio_return(tmp1->aio_handle);
(*request)->nbytes = err;
errno = aio_error(tmp1->aio_handle);
}
else (*request)->nbytes = -1;
/* on DEC, it is required to call aio_return to dequeue the request.
IBM man pages don't indicate what function to use for dequeue.
I'm assuming it is aio_return! */
if (err == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}
else *error_code = MPI_SUCCESS; /* if ( (*request)->queued ) */
#ifdef HAVE_STATUS_SET_BYTES
if ((*request)->nbytes != -1)
MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif
#elif defined(ROMIO_HAVE_WORKING_AIO)
/* all other aio types */
if ((*request)->queued) {
do {
err = aio_suspend((const struct aiocb **) &((*request)->handle), 1, 0);
} while ((err == -1) && (errno == EINTR));
if (err != -1) {
err = aio_return((struct aiocb *) (*request)->handle);
(*request)->nbytes = err;
errno = aio_error((struct aiocb *) (*request)->handle);
}
else (*request)->nbytes = -1;
if (err == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}
else *error_code = MPI_SUCCESS; /* if ((*request)->queued) ... */
#ifdef HAVE_STATUS_SET_BYTES
if ((*request)->nbytes != -1)
MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif
#endif
#ifdef ROMIO_HAVE_WORKING_AIO
if ((*request)->queued != -1) {
/* queued = -1 is an internal hack used when the request must
be completed, but the request object should not be
freed. This is used in ADIOI_Complete_async, because the user
will call MPI_Wait later, which would require status to
be filled. Ugly but works. queued = -1 should be used only
in ADIOI_Complete_async.
This should not affect the user in any way. */
/* if request is still queued in the system, it is also there
on ADIOI_Async_list. Delete it from there. */
if ((*request)->queued) ADIOI_Del_req_from_list(request);
(*request)->fd->async_count--;
if ((*request)->handle) ADIOI_Free((*request)->handle);
ADIOI_Free_request((ADIOI_Req_node *) (*request));
*request = ADIO_REQUEST_NULL;
}
#else
/* no aio */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif
(*request)->fd->async_count--;
ADIOI_Free_request((ADIOI_Req_node *) (*request));
*request = ADIO_REQUEST_NULL;
*error_code = MPI_SUCCESS;
#endif
#endif
}

Просмотреть файл

@ -281,7 +281,7 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
ADIO_Offset userbuf_off;
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
char *writebuf, *value;
int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
int st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize;
static char myname[] = "ADIOI_NFS_WRITESTRIDED";
@ -304,7 +304,7 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
/* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag);
max_bufsize = atoi(value);
ADIOI_Free(value);
@ -381,24 +381,31 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
/* Wei-keng reworked type processing to be a bit more efficient */
offset = fd->fp_ind - disp;
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
/* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i]
>= offset) {
st_index = i;
fwr_size = (int) (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
ADIO_Offset dist;
if (flat_file->blocklens[i] == 0) continue;
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
/* fwr_size is from offset to the end of block i */
if (dist == 0) {
i++;
offset = flat_file->indices[i];
fwr_size = flat_file->blocklens[i];
break;
}
if (dist > 0) {
fwr_size = dist;
break;
}
}
}
st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
}
else {
n_etypes_in_filetype = filetype_size/etype_size;
@ -419,10 +426,40 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
}
/* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype;
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
}
start_off = offset;
/* Wei-keng Liao:write request is within single flat_file contig block*/
/* this could happen, for example, with subarray types that are
* actually fairly contiguous */
if (buftype_is_contig && bufsize <= fwr_size) {
ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte
* that can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == fwr_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ (ADIO_Offset)n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
@ -436,14 +473,15 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
while (i < bufsize) {
i += fwr_size;
end_offset = off + fwr_size - 1;
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent;
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
}
@ -509,10 +547,11 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
/* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by fwr_size. */
else {
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
@ -552,10 +591,11 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
if (size == fwr_size) {
/* reached end of contiguous block in file */
if (j < (flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_NTFS_operations = {
ADIOI_NTFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_NTFS_ReadContig, /* ReadContig */
ADIOI_NTFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -33,4 +34,5 @@ struct ADIOI_Fns_struct ADIO_NTFS_operations = {
ADIOI_NTFS_Flush, /* Flush */
ADIOI_NTFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature /* Features */
};

Просмотреть файл

@ -60,7 +60,6 @@ int ADIOI_NTFS_aio_poll_fn(void *extra_state, MPI_Status *status)
/* TODO: unsure how to handle this */
}
}else{
MPIR_Nest_incr();
mpi_errno = MPI_Grequest_complete(aio_req->req);
if (mpi_errno != MPI_SUCCESS) {
mpi_errno = MPIO_Err_create_code(MPI_SUCCESS,
@ -69,7 +68,6 @@ int ADIOI_NTFS_aio_poll_fn(void *extra_state, MPI_Status *status)
MPI_ERR_IO, "**mpi_grequest_complete",
0);
}
MPIR_Nest_decr();
}
return mpi_errno;
}
@ -111,7 +109,6 @@ int ADIOI_NTFS_aio_wait_fn(int count, void **array_of_states,
aio_reqlist[retObject]->lpOvl, &(aio_reqlist[retObject]->nbytes),
FALSE)){
/* XXX: mark completed requests as 'done'*/
MPIR_Nest_incr();
mpi_errno = MPI_Grequest_complete(aio_reqlist[retObject]->req);
if (mpi_errno != MPI_SUCCESS) {
mpi_errno = MPIO_Err_create_code(MPI_SUCCESS,
@ -120,7 +117,6 @@ int ADIOI_NTFS_aio_wait_fn(int count, void **array_of_states,
MPI_ERR_IO, "**mpi_grequest_complete",
0);
}
MPIR_Nest_decr();
}else{
if(GetLastError() == ERROR_IO_INCOMPLETE){
/* IO in progress */
@ -146,7 +142,6 @@ int ADIOI_NTFS_aio_query_fn(void *extra_state, MPI_Status *status)
MPI_Status_set_elements(status, MPI_BYTE, aio_req->nbytes);
/* do i need to nest_incr/nest_decr here? */
/* can never cancel so always true */
MPI_Status_set_cancelled(status, 0);

Просмотреть файл

@ -13,6 +13,7 @@
struct ADIOI_Fns_struct ADIO_PANFS_operations = {
ADIOI_PANFS_Open, /* Open */
ADIOI_GEN_OpenColl,
ADIOI_PANFS_ReadContig, /* ReadContig */
ADIOI_PANFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -39,4 +40,5 @@ struct ADIOI_Fns_struct ADIO_PANFS_operations = {
ADIOI_GEN_Flush, /* Flush */
ADIOI_PANFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature,
};

Просмотреть файл

@ -36,7 +36,7 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
concurrent_write = strtoul(value,NULL,10);
@ -46,10 +46,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_concurrent_write\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Info_set(fd->info, "panfs_concurrent_write", value);
ADIOI_Info_set(fd->info, "panfs_concurrent_write", value);
}
MPI_Info_get(users_info, "panfs_layout_type", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "panfs_layout_type", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_type = strtoul(value,NULL,10);
@ -59,10 +59,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_type\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Info_set(fd->info, "panfs_layout_type", value);
ADIOI_Info_set(fd->info, "panfs_layout_type", value);
}
MPI_Info_get(users_info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_stripe_unit = strtoul(value,NULL,10);
@ -72,10 +72,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_stripe_unit\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Info_set(fd->info, "panfs_layout_stripe_unit", value);
ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", value);
}
MPI_Info_get(users_info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) {
layout_parity_stripe_width = strtoul(value,NULL,10);
@ -85,10 +85,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_width\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Info_set(fd->info, "panfs_layout_parity_stripe_width", value);
ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", value);
}
MPI_Info_get(users_info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) {
layout_parity_stripe_depth = strtoul(value,NULL,10);
@ -98,10 +98,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_depth\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", value);
ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", value);
}
MPI_Info_get(users_info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_total_num_comps = strtoul(value,NULL,10);
@ -111,10 +111,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_total_num_comps\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Info_set(fd->info, "panfs_layout_total_num_comps", value);
ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", value);
}
MPI_Info_get(users_info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE || layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
layout_visit_policy = strtoul(value,NULL,10);
@ -124,7 +124,7 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_visit_policy\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Info_set(fd->info, "panfs_layout_visit_policy", value);
ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", value);
}
ADIOI_Free(value);

Просмотреть файл

@ -39,32 +39,32 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
*error_code = MPI_SUCCESS;
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "panfs_layout_type", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "panfs_layout_type", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_type = strtoul(value,NULL,10);
}
MPI_Info_get(fd->info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_stripe_unit = strtoul(value,NULL,10);
}
MPI_Info_get(fd->info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_total_num_comps = strtoul(value,NULL,10);
}
MPI_Info_get(fd->info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_parity_stripe_width = strtoul(value,NULL,10);
}
MPI_Info_get(fd->info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_parity_stripe_depth = strtoul(value,NULL,10);
}
MPI_Info_get(fd->info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_visit_policy = strtoul(value,NULL,10);
@ -266,7 +266,7 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
amode = amode | O_EXCL;
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
unsigned long int concurrent_write = strtoul(value,NULL,10);
@ -291,41 +291,41 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
if (rc < 0)
{
/* Error - set layout type to unknown */
MPI_Info_set(fd->info, "panfs_layout_type", "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
ADIOI_Info_set(fd->info, "panfs_layout_type", "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
}
else
{
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.agg_type);
MPI_Info_set(fd->info, "panfs_layout_type", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_type", temp_buffer);
if (file_query_args.layout.layout_is_valid == 1)
{
switch (file_query_args.layout.agg_type)
{
case PAN_FS_CLIENT_LAYOUT_TYPE__RAID0:
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid0.stripe_unit);
MPI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid0.total_num_comps);
MPI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
break;
case PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE:
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.stripe_unit);
MPI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_width);
MPI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth);
MPI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.total_num_comps);
MPI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.layout_visit_policy);
MPI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
break;
case PAN_FS_CLIENT_LAYOUT_TYPE__RAID10:
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.stripe_unit);
MPI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.total_num_comps);
MPI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.layout_visit_policy);
MPI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
break;
}
}

Просмотреть файл

@ -24,7 +24,7 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
str_factor=atoi(value);
@ -40,7 +40,7 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* --END ERROR HANDLING-- */
}
MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
str_unit=atoi(value);
@ -56,7 +56,7 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* --END ERROR HANDLING-- */
}
MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
start_iodev=atoi(value);
@ -119,15 +119,15 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
If so, mark it as true in fd->info and turn it on in
ADIOI_PFS_Open after the file is opened */
MPI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "true")))
MPI_Info_set(fd->info, "pfs_svr_buf", "true");
else MPI_Info_set(fd->info, "pfs_svr_buf", "false");
ADIOI_Info_set(fd->info, "pfs_svr_buf", "true");
else ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
ADIOI_Free(value);
}
else MPI_Info_set(fd->info, "pfs_svr_buf", "false");
else ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
@ -144,23 +144,23 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "false"))) {
value_in_fd = (char *)
ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
value_in_fd, &flag);
if (strcmp(value, value_in_fd)) {
if (!strcmp(value, "true")) {
err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE);
if (!err)
MPI_Info_set(fd->info, "pfs_svr_buf", "true");
ADIOI_Info_set(fd->info, "pfs_svr_buf", "true");
}
else {
err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, FALSE);
if (!err)
MPI_Info_set(fd->info, "pfs_svr_buf", "false");
ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
}
}
ADIOI_Free(value_in_fd);

Просмотреть файл

@ -49,11 +49,11 @@ void ADIOI_PFS_Open(ADIO_File fd, int *error_code)
to ADIOI_PFS_SetInfo. Turn it on now, since we now have a
valid file descriptor. */
MPI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "true"))) {
err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE);
if (err) MPI_Info_set(fd->info, "pfs_svr_buf", "false");
if (err) ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
}
/* get file striping information and set it in info */
@ -61,13 +61,13 @@ void ADIOI_PFS_Open(ADIO_File fd, int *error_code)
if (!err) {
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_sunitsize);
MPI_Info_set(fd->info, "striping_unit", value);
ADIOI_Info_set(fd->info, "striping_unit", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_sfactor);
MPI_Info_set(fd->info, "striping_factor", value);
ADIOI_Info_set(fd->info, "striping_factor", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_start_sdir);
MPI_Info_set(fd->info, "start_iodevice", value);
ADIOI_Info_set(fd->info, "start_iodevice", value);
}
ADIOI_Free(value);

Просмотреть файл

@ -25,6 +25,7 @@ libadio_piofs_la_SOURCES = \
ad_piofs.c \
ad_piofs.h \
ad_piofs_fcntl.c \
ad_piofs_features.c \
ad_piofs_hints.c \
ad_piofs_open.c \
ad_piofs_read.c \

Просмотреть файл

@ -33,4 +33,5 @@ struct ADIOI_Fns_struct ADIO_PIOFS_operations = {
ADIOI_GEN_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_PIOFS_Feature,
};

Просмотреть файл

@ -35,4 +35,6 @@ void ADIOI_PIOFS_WriteStrided(ADIO_File fd, void *buf, int count,
*error_code);
void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
void ADIOI_PIOFS_Feature(ADIO_File fd, int flag);
#endif

Просмотреть файл

@ -0,0 +1,13 @@
/*
 * ADIOI_PIOFS_Features - feature query for the PIOFS driver.
 *
 * flag: one of the ADIO_* feature constants.
 *
 * Returns 0 for every value of flag: the features named in the switch
 * and any value not named there are all reported as unsupported.
 *
 * NOTE(review): ad_piofs.h declares, and the ADIO_PIOFS_operations table
 * references, ADIOI_PIOFS_Feature(ADIO_File fd, int flag); this
 * definition differs in both name and parameter list -- confirm which
 * spelling is intended.
 */
int ADIOI_PIOFS_Features(int flag)
{
    int supported = 0;

    switch (flag) {
    /* Listed explicitly to record that none of these is available. */
    case ADIO_LOCKS:
    case ADIO_SHARED_FP:
    case ADIO_ATOMIC_MODE:
    case ADIO_DATA_SIEVING_WRITES:
    case ADIO_SCALABLE_OPEN:
    default:
        break;
    }

    return supported;
}

Просмотреть файл

@ -25,7 +25,7 @@ void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
str_factor=atoi(value);
@ -37,7 +37,7 @@ void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
}
}
MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
str_unit=atoi(value);
@ -49,7 +49,7 @@ void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
}
}
MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
start_iodev=atoi(value);

Просмотреть файл

@ -49,13 +49,13 @@ void ADIOI_PIOFS_Open(ADIO_File fd, int *error_code)
if (!err) {
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_bsu);
MPI_Info_set(fd->info, "striping_unit", value);
ADIOI_Info_set(fd->info, "striping_unit", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_cells);
MPI_Info_set(fd->info, "striping_factor", value);
ADIOI_Info_set(fd->info, "striping_factor", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_base_node);
MPI_Info_set(fd->info, "start_iodevice", value);
ADIOI_Info_set(fd->info, "start_iodevice", value);
}
ADIOI_Free(value);

Просмотреть файл

@ -33,4 +33,5 @@ struct ADIOI_Fns_struct ADIO_PVFS_operations = {
ADIOI_PVFS_Flush, /* Flush */
ADIOI_PVFS_Resize, /* Resize */
ADIOI_PVFS_Delete, /* Delete */
ADIOI_PVFS_Feature, /* Features */
};

Просмотреть файл

@ -17,8 +17,8 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* This must be part of the open call. can set striping parameters
if necessary. */
MPI_Info_create(&(fd->info));
MPI_Info_set(fd->info, "romio_pvfs_listio_read", "disable");
MPI_Info_set(fd->info, "romio_pvfs_listio_write", "disable");
ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", "disable");
ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", "disable");
fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE;
fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE;
@ -27,7 +27,7 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
str_factor=atoi(value);
@ -41,10 +41,10 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
return;
/* --END ERROR HANDLING-- */
}
else MPI_Info_set(fd->info, "striping_factor", value);
else ADIOI_Info_set(fd->info, "striping_factor", value);
}
MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
str_unit=atoi(value);
@ -58,10 +58,10 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
return;
/* --END ERROR HANDLING-- */
}
else MPI_Info_set(fd->info, "striping_unit", value);
else ADIOI_Info_set(fd->info, "striping_unit", value);
}
MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
start_iodev=atoi(value);
@ -75,25 +75,25 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
return;
/* --END ERROR HANDLING-- */
}
else MPI_Info_set(fd->info, "start_iodevice", value);
else ADIOI_Info_set(fd->info, "start_iodevice", value);
}
MPI_Info_get(users_info, "romio_pvfs_listio_read",
ADIOI_Info_get(users_info, "romio_pvfs_listio_read",
MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
MPI_Info_set(fd->info, "romio_pvfs_listio_read", value);
ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_ENABLE;
} else if ( !strcmp(value, "disable") || !strcmp(value, "DISABLE"))
{
MPI_Info_set(fd->info , "romio_pvfs_listio_read", value);
ADIOI_Info_set(fd->info , "romio_pvfs_listio_read", value);
fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE;
}
else if ( !strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
MPI_Info_set(fd->info, "romio_pvfs_listio_read", value);
ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->fs_hints.pvfs.listio_read;
@ -107,21 +107,21 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* --END ERROR HANDLING-- */
}
}
MPI_Info_get(users_info, "romio_pvfs_listio_write", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "romio_pvfs_listio_write", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
MPI_Info_set(fd->info, "romio_pvfs_listio_write", value);
ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_ENABLE;
} else if ( !strcmp(value, "disable") || !strcmp(value, "DISABLE"))
{
MPI_Info_set(fd->info, "romio_pvfs_listio_write", value);
ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE;
}
else if ( !strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
MPI_Info_set(fd->info, "romio_pvfs_listio_write", value);
ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->fs_hints.pvfs.listio_write;

Просмотреть файл

@ -37,15 +37,15 @@ void ADIOI_PVFS_Open(ADIO_File fd, int *error_code)
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (atoi(value) > 0)) pstat.pcount = atoi(value);
MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (atoi(value) > 0)) pstat.ssize = atoi(value);
MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL,
ADIOI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (atoi(value) >= 0)) pstat.base = atoi(value);
@ -71,11 +71,11 @@ void ADIOI_PVFS_Open(ADIO_File fd, int *error_code)
if (fd->fd_sys != -1) {
pvfs_ioctl(fd->fd_sys, GETMETA, &pstat);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.pcount);
MPI_Info_set(fd->info, "striping_factor", value);
ADIOI_Info_set(fd->info, "striping_factor", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.ssize);
MPI_Info_set(fd->info, "striping_unit", value);
ADIOI_Info_set(fd->info, "striping_unit", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.base);
MPI_Info_set(fd->info, "start_iodevice", value);
ADIOI_Info_set(fd->info, "start_iodevice", value);
}
ADIOI_Free(value);

Просмотреть файл

@ -43,6 +43,7 @@ void ADIOI_PVFS_ReadContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
if (err>0)
fd->fp_sys_posn = offset + err;
/* individual file pointer not updated */
}
@ -63,6 +64,7 @@ void ADIOI_PVFS_ReadContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
if (err > 0)
fd->fp_ind += err;
fd->fp_sys_posn = fd->fp_ind;
}

Просмотреть файл

@ -43,6 +43,7 @@ void ADIOI_PVFS_WriteContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
if (err > 0)
fd->fp_sys_posn = offset + err;
/* individual file pointer not updated */
}
@ -63,6 +64,7 @@ void ADIOI_PVFS_WriteContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
if (err > 0)
fd->fp_ind += err;
fd->fp_sys_posn = fd->fp_ind;
}

Просмотреть файл

@ -28,9 +28,15 @@ libadio_pvfs2_la_SOURCES = \
ad_pvfs2_common.c \
ad_pvfs2_delete.c \
ad_pvfs2_fcntl.c \
ad_pvfs2_features.c \
ad_pvfs2_flush.c \
ad_pvfs2_hints.c \
ad_pvfs2_io.h \
ad_pvfs2_io_dtype.c \
ad_pvfs2_io_list.c \
ad_pvfs2_open.c \
ad_pvfs2_read.c \
ad_pvfs2_read_list_classic.c \
ad_pvfs2_resize.c \
ad_pvfs2_write.c
ad_pvfs2_write.c \
ad_pvfs2_write_list_classic.c

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_PVFS2_operations = {
ADIOI_PVFS2_Open, /* Open */
ADIOI_SCALEABLE_OpenColl, /* OpenColl */
ADIOI_PVFS2_ReadContig, /* ReadContig */
ADIOI_PVFS2_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -22,13 +23,8 @@ struct ADIOI_Fns_struct ADIO_PVFS2_operations = {
ADIOI_PVFS2_ReadStrided, /* ReadStrided */
ADIOI_PVFS2_WriteStrided, /* WriteStrided */
ADIOI_PVFS2_Close, /* Close */
#ifdef ROMIO_HAVE_WORKING_AIO
ADIOI_PVFS2_IReadContig, /* IreadContig */
ADIOI_PVFS2_IWriteContig, /* IwriteContig */
#else
ADIOI_FAKE_IreadContig, /* IreadContig */
ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif
ADIOI_FAKE_IODone, /* ReadDone */
ADIOI_FAKE_IODone, /* WriteDone */
ADIOI_FAKE_IOComplete, /* ReadComplete */
@ -38,6 +34,7 @@ struct ADIOI_Fns_struct ADIO_PVFS2_operations = {
ADIOI_PVFS2_Flush, /* Flush */
ADIOI_PVFS2_Resize, /* Resize */
ADIOI_PVFS2_Delete, /* Delete */
ADIOI_PVFS2_Feature,
};
/*

Просмотреть файл

@ -17,7 +17,6 @@
#include "pvfs2-compat.h"
#endif
void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code);
void ADIOI_PVFS2_Close(ADIO_File fd, int *error_code);
void ADIOI_PVFS2_ReadContig(ADIO_File fd, void *buf, int count,
@ -42,6 +41,8 @@ void ADIOI_PVFS2_Flush(ADIO_File fd, int *error_code);
void ADIOI_PVFS2_Delete(char *filename, int *error_code);
void ADIOI_PVFS2_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
int ADIOI_PVFS2_Feature(ADIO_File fd, int flag);
void ADIOI_PVFS2_IReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, MPI_Request *request,
@ -54,4 +55,12 @@ void ADIOI_PVFS2_AIO_contig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, MPI_Request *request,
int flag, int *error_code);
void ADIOI_PVFS2_OldWriteStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_PVFS2_OldReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
#endif

Просмотреть файл

@ -17,7 +17,6 @@
#define READ 0
#define WRITE 1
#ifdef ROMIO_HAVE_WORKING_AIO
static int ADIOI_PVFS2_greq_class = 0;
int ADIOI_PVFS2_aio_free_fn(void *extra_state);
int ADIOI_PVFS2_aio_poll_fn(void *extra_state, MPI_Status *status);
@ -168,12 +167,10 @@ int ADIOI_PVFS2_aio_poll_fn(void *extra_state, MPI_Status *status)
aio_req = (ADIOI_AIO_Request *)extra_state;
/* BUG: cannot PVFS_sys_testsome: does not work for a specific request */
ret = PVFS_sys_wait(aio_req->op_id, __FUNCTION__, &error);
ret = PVFS_sys_wait(aio_req->op_id, "ADIOI_PVFS2_aio_poll_fn", &error);
if (ret == 0) {
aio_req->nbytes = aio_req->resp_io.total_completed;
MPIR_Nest_incr();
MPI_Grequest_complete(aio_req->req);
MPIR_Nest_decr();
return MPI_SUCCESS;
} else
return MPI_UNDEFINED; /* TODO: what's this error? */
@ -186,7 +183,7 @@ int ADIOI_PVFS2_aio_wait_fn(int count, void ** array_of_states,
ADIOI_AIO_Request **aio_reqlist;
PVFS_sys_op_id *op_id_array;
int i,j, greq_count;
int i,j, greq_count, completed_count=0;
int *error_array;
aio_reqlist = (ADIOI_AIO_Request **)array_of_states;
@ -195,25 +192,27 @@ int ADIOI_PVFS2_aio_wait_fn(int count, void ** array_of_states,
error_array = (int *)ADIOI_Calloc(count, sizeof(int));
greq_count = count;
/* PVFS-2.6: testsome actually tests all requests and fills in op_id_array
* with the ones that have completed. count is an in/out parameter.
* returns with the number of completed operations. what a mess! */
while (completed_count < greq_count ) {
count = greq_count;
PVFS_sys_testsome(op_id_array, &count, NULL, error_array, INT_MAX);
completed_count += count;
for (i=0; i< count; i++) {
for (j=0; j<greq_count; j++) {
if (op_id_array[i] == aio_reqlist[j]->op_id) {
aio_reqlist[j]->nbytes =
aio_reqlist[j]->resp_io.total_completed;
MPIR_Nest_incr();
MPI_Grequest_complete(aio_reqlist[j]->req);
MPIR_Nest_decr();
}
}
}
}
return MPI_SUCCESS; /* TODO: no idea how to deal with errors */
}
#endif
/*
* vim: ts=8 sts=4 sw=4 noexpandtab

Просмотреть файл

@ -42,6 +42,7 @@ int ADIOI_PVFS2_End_call(MPI_Comm comm, int keyval,
{
int error_code;
ADIOI_PVFS2_End(&error_code);
MPI_Keyval_free(&keyval);
return error_code;
}
@ -81,7 +82,7 @@ void ADIOI_PVFS2_Init(int *error_code )
&ADIOI_PVFS2_Initialized, (void *)0);
/* just like romio does, we make a dummy attribute so we
* get cleaned up */
MPI_Attr_put(MPI_COMM_WORLD, ADIOI_PVFS2_Initialized, (void *)0);
MPI_Attr_put(MPI_COMM_SELF, ADIOI_PVFS2_Initialized, (void *)0);
}
void ADIOI_PVFS2_makeattribs(PVFS_sys_attr * attribs)
@ -107,9 +108,43 @@ void ADIOI_PVFS2_makecredentials(PVFS_credentials * credentials)
/*
 * ADIOI_PVFS2_error_convert - translate a PVFS2 error value into an MPI
 * error class.
 *
 * pvfs_error: value compared directly against the PVFS_E* constants
 *             (and against the plain errno value EDQUOT).
 *
 * Returns the matching MPI_ERR_* class, or MPI_UNDEFINED when the value
 * matches none of the cases.
 */
int ADIOI_PVFS2_error_convert(int pvfs_error)
{
    int mpi_class;

    switch (pvfs_error) {
    case PVFS_EPERM:
    case PVFS_EACCES:
        mpi_class = MPI_ERR_ACCESS;
        break;

    case PVFS_ENOENT:
    case PVFS_ENXIO:
    case PVFS_ENODEV:
        mpi_class = MPI_ERR_NO_SUCH_FILE;
        break;

    case PVFS_EIO:
        mpi_class = MPI_ERR_IO;
        break;

    case PVFS_EEXIST:
        mpi_class = MPI_ERR_FILE_EXISTS;
        break;

    case PVFS_ENOTDIR:      /* mapping marked as uncertain by the author */
    case PVFS_EISDIR:       /* mapping marked as uncertain by the author */
    case PVFS_ENAMETOOLONG:
        mpi_class = MPI_ERR_BAD_FILE;
        break;

    case PVFS_EINVAL:
        mpi_class = MPI_ERR_FILE;
        break;

    case PVFS_EFBIG:        /* mapping marked as uncertain by the author */
    case PVFS_ENOSPC:
        mpi_class = MPI_ERR_NO_SPACE;
        break;

    case PVFS_EROFS:
        mpi_class = MPI_ERR_READ_ONLY;
        break;

    case PVFS_ENOSYS:
        mpi_class = MPI_ERR_UNSUPPORTED_OPERATION;
        break;

    /* Plain errno constant: the author notes that PVFS does not
     * support quotas. */
    case EDQUOT:
        mpi_class = MPI_ERR_QUOTA;
        break;

    case PVFS_ENOMEM:
        mpi_class = MPI_ERR_INTERN;
        break;

    default:
        mpi_class = MPI_UNDEFINED;
        break;
    }

    return mpi_class;
}
/*
* vim: ts=8 sts=4 sw=4 noexpandtab
*/

Просмотреть файл

@ -0,0 +1,16 @@
#include "adio.h"
#include "ad_pvfs2.h"
/*
 * ADIOI_PVFS2_Feature - feature query for the PVFS2 driver.
 *
 * fd:   file handle; not consulted by this implementation.
 * flag: one of the ADIO_* feature constants.
 *
 * Returns 1 for ADIO_SCALABLE_OPEN and 0 for everything else, which
 * covers ADIO_SHARED_FP, ADIO_LOCKS, ADIO_SEQUENTIAL,
 * ADIO_DATA_SIEVING_WRITES and any other value.
 */
int ADIOI_PVFS2_Feature(ADIO_File fd, int flag)
{
    if (flag == ADIO_SCALABLE_OPEN) {
        return 1;
    }

    return 0;
}

Просмотреть файл

@ -17,20 +17,37 @@ void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if ((fd->info) == MPI_INFO_NULL) {
/* part of the open call */
MPI_Info_create(&(fd->info));
MPI_Info_set(fd->info, "romio_pvfs2_debugmask", "0");
ADIOI_Info_set(fd->info, "romio_pvfs2_debugmask", "0");
fd->hints->fs_hints.pvfs2.debugmask = 0;
MPI_Info_set(fd->info, "striping_factor", "0");
ADIOI_Info_set(fd->info, "striping_factor", "0");
fd->hints->striping_factor = 0;
MPI_Info_set(fd->info, "striping_unit", "0");
ADIOI_Info_set(fd->info, "striping_unit", "0");
fd->hints->striping_unit = 0;
/* disable the aggressive strided optimizations by default */
ADIOI_Info_set(fd->info, "romio_pvfs2_posix_read", "disable");
ADIOI_Info_set(fd->info, "romio_pvfs2_posix_write", "disable");
fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_DISABLE;
fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_DISABLE;
ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_read", "disable");
ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_write", "disable");
fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_DISABLE;
fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_DISABLE;
ADIOI_Info_set(fd->info, "romio_pvfs2_listio_read", "disable");
ADIOI_Info_set(fd->info, "romio_pvfs2_listio_write", "disable");
fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_DISABLE;
fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_DISABLE;
/* any user-provided hints? */
if (users_info != MPI_INFO_NULL) {
/* pvfs2 debugging */
value = (char *) ADIOI_Malloc( (MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "romio_pvfs2_debugmask",
ADIOI_Info_get(users_info, "romio_pvfs2_debugmask",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
tmp_value = fd->hints->fs_hints.pvfs2.debugmask =
@ -46,11 +63,11 @@ void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
}
/* --END ERROR HANDLING-- */
MPI_Info_set(fd->info, "romio_pvfs2_debugmask", value);
ADIOI_Info_set(fd->info, "romio_pvfs2_debugmask", value);
}
/* the striping factor */
MPI_Info_get(users_info, "striping_factor",
ADIOI_Info_get(users_info, "striping_factor",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
tmp_value = fd->hints->striping_factor = atoi(value);
@ -65,11 +82,11 @@ void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
}
/* --END ERROR HANDLING-- */
MPI_Info_set(fd->info, "striping_factor", value);
ADIOI_Info_set(fd->info, "striping_factor", value);
}
/* the striping unit */
MPI_Info_get(users_info, "striping_unit",
ADIOI_Info_get(users_info, "striping_unit",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
tmp_value = fd->hints->striping_unit = atoi(value);
@ -83,16 +100,167 @@ void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
}
/* --END ERROR HANDLING-- */
MPI_Info_set(fd->info, "striping_unit", value);
ADIOI_Info_set(fd->info, "striping_unit", value);
}
/* distribution name */
MPI_Info_get(users_info, "romio_pvfs2_distribution_name",
ADIOI_Info_get(users_info, "romio_pvfs2_distribution_name",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
}
/* POSIX read */
ADIOI_Info_get(users_info, "romio_pvfs2_posix_read",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_posix_read", value);
fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_posix_read", value);
fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.posix_read;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.posix_read) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"posix_read",
error_code);
return;
}
}
/* POSIX write */
ADIOI_Info_get(users_info, "romio_pvfs2_posix_write",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_posix_write", value);
fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_posix_write", value);
fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.posix_write;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.posix_write) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"posix_write",
error_code);
return;
}
}
/* Datatype read */
ADIOI_Info_get(users_info, "romio_pvfs2_dtype_read",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_read", value);
fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_dtype_read", value);
fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.dtype_read;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.dtype_read) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"dtype_read",
error_code);
return;
}
}
/* Datatype write */
ADIOI_Info_get(users_info, "romio_pvfs2_dtype_write",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_write", value);
fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_dtype_write", value);
fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.dtype_write;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.dtype_write) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"dtype_write",
error_code);
return;
}
}
/* Listio read */
ADIOI_Info_get(users_info, "romio_pvfs2_listio_read",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_listio_read", value);
fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_listio_read", value);
fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.listio_read;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.listio_read) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"listio_read",
error_code);
return;
}
}
/* Listio write */
ADIOI_Info_get(users_info, "romio_pvfs2_listio_write",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_listio_write", value);
fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_listio_write", value);
fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.listio_write;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.listio_write) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"listio_write",
error_code);
return;
}
}
ADIOI_Free(value);
}
}
/* set the values for collective I/O and data sieving parameters */

Просмотреть файл

@ -0,0 +1,79 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2006 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/* Transfer-direction selectors: the I/O helpers declared below take one
 * of these as their rw_type argument. */
#define READ 0
#define WRITE 1
/* Debug switches, all commented out here.  DEBUG_DTYPE guards diagnostic
 * fprintf calls in ADIOI_PVFS2_StridedDtypeIO; the other two presumably
 * do the same for the contig and list paths -- TODO confirm. */
/* #define DEBUG_CONTIG */
/* #define DEBUG_LIST */
/* #define DEBUG_DTYPE */
/* Contig I/O helper prototypes */
int ADIOI_PVFS2_Contig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code, int rw_type);
/* List I/O helper prototypes */
int ADIOI_PVFS2_StridedListIO(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code, int rw_type);
/* Builds offset/length arrays for the memory buffer and for the file
 * from two flattened datatypes.  Parameters ending in _p and the two
 * *_ol_count_p pointers are passed by address; presumably they are
 * updated by the call -- verify against the definition. */
int gen_listio_arr(ADIOI_Flatlist_node *flat_buf,
int *flat_buf_index_p,
int64_t *cur_flat_buf_reg_off_p,
int flat_buf_size,
int flat_buf_extent,
ADIOI_Flatlist_node *flat_file,
int *flat_file_index_p,
int64_t *cur_flat_file_reg_off_p,
int flat_file_size,
int flat_file_extent,
int max_ol_count,
ADIO_Offset disp,
int bytes_into_filetype,
int64_t *bytes_completed,
int64_t total_io_size,
int64_t buf_off_arr[],
int32_t buf_len_arr[],
int32_t *buf_ol_count_p,
int64_t file_off_arr[],
int32_t file_len_arr[],
int32_t *file_ol_count_p);
/* Debug helper taking the buffer and file offset/length arrays together
 * with the user buffer and the transfer direction. */
void print_buf_file_ol_pairs(int64_t buf_off_arr[],
int32_t buf_len_arr[],
int32_t buf_ol_count,
int64_t file_off_arr[],
int32_t file_len_arr[],
int32_t file_ol_count,
void *buf,
int rw_type);
/* Datatype I/O helper prototypes */
int ADIOI_PVFS2_StridedDtypeIO(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code, int rw_type);
/* Conversion of a named (predefined) MPI datatype; called by
 * convert_mpi_pvfs2_dtype when the combiner is MPI_COMBINER_NAMED. */
int convert_named(MPI_Datatype *mpi_dtype,
PVFS_Request *pvfs_dtype, int combiner);
/* Debug helper taking the envelope (combiner and counts) and contents
 * arrays of an MPI datatype. */
void print_dtype_info(int combiner,
int num_int,
int num_addr,
int num_dtype,
int *arr_int,
MPI_Aint *arr_addr,
MPI_Datatype *arr_dtype);
/* Converts an MPI datatype into a PVFS2 request type.  Per the comment
 * on its definition: returns 1 for a leaf node, 0 on normal return and
 * -1 on problems. */
int convert_mpi_pvfs2_dtype(MPI_Datatype *mpi_dtype,
PVFS_Request *pvfs_dtype);

Просмотреть файл

@ -0,0 +1,720 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2006 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include <assert.h>
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_io.h"
#include "ad_pvfs2_common.h"
/*
 * ADIOI_PVFS2_StridedDtypeIO - strided read or write that hands PVFS2
 * native request datatypes instead of flattened offset/length lists.
 *
 * fd:            open ADIO file; fd->filetype, fd->etype, fd->disp and
 *                fd->fp_ind are read, fd->fp_ind and fd->fp_sys_posn
 *                are written.
 * buf:           user memory buffer.
 * count:         number of `datatype` elements in buf.
 * datatype:      MPI memory datatype.
 * file_ptr_type: ADIO_INDIVIDUAL uses fd->fp_ind; anything else is
 *                treated as an explicit offset.
 * offset:        explicit offset in etype units (explicit case only).
 * status:        filled in only when HAVE_STATUS_SET_BYTES is defined.
 * error_code:    MPI_SUCCESS, or an MPIO error code if the PVFS I/O call
 *                fails.
 * rw_type:       READ selects PVFS_sys_read, anything else
 *                PVFS_sys_write.
 *
 * Returns -1 when the filetype has zero size, the negative result of a
 * failed datatype conversion, or otherwise the return value of the
 * PVFS_sys_read / PVFS_sys_write call.
 */
int ADIOI_PVFS2_StridedDtypeIO(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status,
                               int *error_code,
                               int rw_type)
{
    int filetype_size = -1, ret = -1, filetype_is_contig = -1;
    int num_filetypes = 0, cur_flat_file_reg_off = 0;
    PVFS_Request tmp_mem_req, mem_req, tmp_file_req, file_req;
    PVFS_sysresp_io resp_io;
    ADIO_Offset off = -1, bytes_into_filetype = 0;
    MPI_Aint filetype_extent = -1;
    int etype_size = -1, i = -1;
    PVFS_size pvfs_disp = -1;
    ADIOI_Flatlist_node *flat_file_p = ADIOI_Flatlist;

    /* Use for offseting the PVFS2 filetype */
    int pvfs_blk = 1;
    ADIOI_PVFS2_fs *pvfs_fs;
    static char myname[] = "ADIOI_PVFS2_STRIDED_DTYPE";

    memset(&tmp_mem_req, 0, sizeof(PVFS_Request));
    memset(&mem_req, 0, sizeof(PVFS_Request));
    memset(&tmp_file_req, 0, sizeof(PVFS_Request));
    memset(&file_req, 0, sizeof(PVFS_Request));
    /* resp_io.total_completed is read at error_state even when we jump
     * there before any PVFS I/O call has filled it in, so it must start
     * out zeroed. */
    memset(&resp_io, 0, sizeof(resp_io));

    pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;

    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* changed below if error */
    *error_code = MPI_SUCCESS;

    /* datatype is the memory type
     * fd->filetype is the file type */
    MPI_Type_size(fd->filetype, &filetype_size);
    if (filetype_size == 0) {
        /* nothing can be transferred through an empty filetype */
        *error_code = MPI_SUCCESS;
        return -1;
    }
    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(fd->etype, &etype_size);

    /* offset is in units of etype relative to the filetype.  We
     * convert this to off in terms of actual data bytes (the offset
     * minus the number of bytes that are not used).  We are allowed
     * to do this since PVFS2 handles offsets with respect to a
     * file_req in bytes, otherwise we would have to convert into a
     * pure byte offset as is done in other methods.  Explicit offset
     * case is handled by using fd->disp and byte-converted off. */
    pvfs_disp = fd->disp;
    if (file_ptr_type == ADIO_INDIVIDUAL)
    {
        if (filetype_is_contig)
        {
            off = fd->fp_ind - fd->disp;
        }
        else
        {
            int flag = 0;

            /* Should have already been flattened in ADIO_Open */
            while (flat_file_p->type != fd->filetype)
            {
                flat_file_p = flat_file_p->next;
            }

            /* Walk whole filetype instances until the region that
             * contains fd->fp_ind is found, summing the data bytes
             * passed over on the way. */
            num_filetypes = -1;
            while (!flag)
            {
                num_filetypes++;
                for (i = 0; i < flat_file_p->count; i++)
                {
                    /* Start on a non zero-length region */
                    if (flat_file_p->blocklens[i])
                    {
                        if (fd->disp + flat_file_p->indices[i] +
                            (num_filetypes * filetype_extent) +
                            flat_file_p->blocklens[i] > fd->fp_ind &&
                            fd->disp + flat_file_p->indices[i] <=
                            fd->fp_ind)
                        {
                            cur_flat_file_reg_off = fd->fp_ind -
                                (fd->disp + flat_file_p->indices[i] +
                                 (num_filetypes * filetype_extent));
                            flag = 1;
                            break;
                        }
                        else
                            bytes_into_filetype += flat_file_p->blocklens[i];
                    }
                }
            }
            /* Impossible that we don't find it in this datatype */
            assert(i != flat_file_p->count);
            off = bytes_into_filetype + cur_flat_file_reg_off;
        }
    }
    else /* ADIO_EXPLICIT */
    {
        off = etype_size * offset;
    }

#ifdef DEBUG_DTYPE
    fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: (fd->fp_ind=%Ld,fd->disp=%Ld,"
            " offset=%Ld),(pvfs_disp=%Ld,off=%Ld)\n",
            fd->fp_ind, fd->disp, offset, pvfs_disp, off);
#endif

    /* Convert the MPI memory and file datatypes into
     * PVFS2 datatypes */
    ret = convert_mpi_pvfs2_dtype(&datatype, &tmp_mem_req);
    if (ret < 0)
    {
        goto error_state;
    }
    ret = convert_mpi_pvfs2_dtype(&(fd->filetype), &tmp_file_req);
    if (ret < 0)
    {
        goto error_state;
    }

    ret = PVFS_Request_contiguous(count, tmp_mem_req, &mem_req);
    if (ret != 0) /* TODO: convert this to MPIO error handling */
        fprintf(stderr, "ADIOI_PVFS2_stridedDtypeIO: error in final"
                " CONTIG memory type\n");
    PVFS_Request_free(&tmp_mem_req);

    /* pvfs_disp is used to offset the filetype */
    ret = PVFS_Request_hindexed(1, &pvfs_blk, &pvfs_disp,
                                tmp_file_req, &file_req);
    if (ret != 0)
        fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: error in final"
                " HINDEXED file type\n");
    PVFS_Request_free(&tmp_file_req);

    if (rw_type == READ)
        ret = PVFS_sys_read(pvfs_fs->object_ref, file_req, off, buf,
                            mem_req, &(pvfs_fs->credentials), &resp_io);
    else
        ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, off, buf,
                             mem_req, &(pvfs_fs->credentials), &resp_io);

    if (ret != 0) {
        fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: Warning - PVFS_sys_"
                "read/write returned %d and completed %Ld bytes.\n",
                ret, resp_io.total_completed);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           ADIOI_PVFS2_error_convert(ret),
                                           "Error in PVFS_sys_io \n", 0);
        goto error_state;
    }

    if (file_ptr_type == ADIO_INDIVIDUAL)
    {
        /* NOTE(review): off was derived relative to fd->disp (contiguous
         * case) or as a count of data bytes (noncontiguous case), while
         * fd->fp_ind is compared against fd->disp-based positions above;
         * confirm that storing off back into fd->fp_ind is intended. */
        fd->fp_ind = off += resp_io.total_completed;
    }

error_state:
    fd->fp_sys_posn = -1;   /* set it to null. */

    PVFS_Request_free(&mem_req);
    PVFS_Request_free(&file_req);

#ifdef DEBUG_DTYPE
    fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: "
            "resp_io.total_completed=%Ld,ret=%d\n",
            resp_io.total_completed, ret);
#endif

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, (int)resp_io.total_completed);
    /* This is a temporary way of filling in status.  The right way is to
     * keep track of how much data was actually accessed by
     * ADIOI_BUFFERED operations */
#endif
    return ret;
}
/* convert_mpi_pvfs2_dtype - Convert a MPI datatype into
 * a PVFS2 datatype so that we can natively use the PVFS2
 * datatypes in the PVFS2 I/O calls instead of converting
 * all datatypes to the hindexed method.
 *
 * The datatype's constructor is read back with MPI_Type_get_envelope /
 * MPI_Type_get_contents, every child datatype is converted recursively,
 * and the matching PVFS_Request_* constructor rebuilds this level.
 *
 * mpi_dtype  - datatype to convert
 * pvfs_dtype - receives the newly built PVFS2 request
 *
 * return 1 - a leaf node (named datatype; convert_named() prints its
 *            own diagnostics and its status is not propagated)
 * return 0 - normal return
 * any other value - problems: -1 for allocation failures and
 *            unsupported combiners, otherwise the nonzero status of
 *            the last PVFS_Request_* call */
int convert_mpi_pvfs2_dtype(MPI_Datatype *mpi_dtype,
                            PVFS_Request *pvfs_dtype)
{
    int num_int = -1, num_addr = -1, num_dtype = -1,
        combiner = -1, i = -1, ret = -1, leaf = -1;
    int *arr_int = NULL;
    /* MPI_Type_get_contents() stores addresses as MPI_Aint, so the
     * array must be declared and sized as such */
    MPI_Aint *arr_addr = NULL;
    MPI_Datatype *arr_dtype = NULL;
    PVFS_Request *old_pvfs_dtype = NULL;
    PVFS_Request *old_pvfs_dtype_arr = NULL;
    int arr_count = -1;
    PVFS_size *pvfs_arr_disp = NULL;
    int *pvfs_arr_len = NULL;

    MPI_Type_get_envelope(*mpi_dtype,
                          &num_int,
                          &num_addr,
                          &num_dtype,
                          &combiner);

    /* Depending on type of datatype do the following
     * operations */
    if (combiner == MPI_COMBINER_NAMED)
    {
        convert_named(mpi_dtype, pvfs_dtype, combiner);
        return 1;
    }

    /* Allocate space for the arrays necessary for
     * MPI_Type_get_contents */
    if ((arr_int = ADIOI_Malloc(sizeof(int)*num_int)) == NULL)
    {
        fprintf(stderr, "Failed to allocate array_int\n");
        return -1;
    }
    if ((arr_addr = ADIOI_Malloc(sizeof(MPI_Aint)*num_addr)) == NULL)
    {
        ADIOI_Free(arr_int);
        fprintf(stderr, "Failed to allocate array_addr\n");
        return -1;
    }
    if ((arr_dtype = ADIOI_Malloc(sizeof(MPI_Datatype)*num_dtype)) == NULL)
    {
        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        fprintf(stderr, "Failed to allocate array_dtypes\n");
        return -1;
    }

    MPI_Type_get_contents(*mpi_dtype,
                          num_int,
                          num_addr,
                          num_dtype,
                          arr_int,
                          arr_addr,
                          arr_dtype);

    /* If it's not a predefined datatype, it is either a
     * derived datatype or a structured datatype */
    if (combiner != MPI_COMBINER_STRUCT)
    {
        /* name of the combiner when PVFS2 has no equivalent */
        const char *unsupported = NULL;
        /* set once the child datatype has been handed to the recursive
         * conversion; only then are *old_pvfs_dtype and arr_dtype[0]
         * ours to release */
        int child_converted = 0;

        if ((old_pvfs_dtype = ADIOI_Malloc(sizeof(PVFS_Request))) == NULL)
        {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate PVFS_Request\n");
            ADIOI_Free(arr_int);
            ADIOI_Free(arr_addr);
            ADIOI_Free(arr_dtype);
            return -1;
        }

        switch (combiner)
        {
        case MPI_COMBINER_CONTIGUOUS:
            leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
            child_converted = 1;
            ret = PVFS_Request_contiguous(arr_int[0],
                                          *old_pvfs_dtype, pvfs_dtype);
            break;
        case MPI_COMBINER_VECTOR:
            leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
            child_converted = 1;
            ret = PVFS_Request_vector(arr_int[0], arr_int[1],
                                      arr_int[2], *old_pvfs_dtype,
                                      pvfs_dtype);
            break;
        case MPI_COMBINER_HVECTOR:
            leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
            child_converted = 1;
            ret = PVFS_Request_hvector(arr_int[0], arr_int[1],
                                       arr_addr[0], *old_pvfs_dtype,
                                       pvfs_dtype);
            break;
        /* Both INDEXED and HINDEXED types require PVFS_size
         * address arrays.  Therefore, we need to copy and
         * convert the data from MPI_get_contents() into
         * a PVFS_size buffer */
        case MPI_COMBINER_INDEXED:
            leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
            child_converted = 1;
            if ((pvfs_arr_disp =
                 ADIOI_Malloc(arr_int[0]*sizeof(PVFS_size))) == NULL)
            {
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "Failed to allocate pvfs_arr_disp\n");
                ret = -1;
                break;
            }
            /* displacements follow the count and the block lengths */
            for (i = 0; i < arr_int[0]; i++)
            {
                pvfs_arr_disp[i] =
                    (PVFS_size) arr_int[arr_int[0]+1+i];
            }
            ret = PVFS_Request_indexed(arr_int[0], &arr_int[1],
                                       pvfs_arr_disp,
                                       *old_pvfs_dtype, pvfs_dtype);
            ADIOI_Free(pvfs_arr_disp);
            pvfs_arr_disp = NULL;
            break;
        case MPI_COMBINER_HINDEXED:
            leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
            child_converted = 1;
            if ((pvfs_arr_disp =
                 ADIOI_Malloc(arr_int[0]*sizeof(PVFS_size))) == NULL)
            {
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "Failed to allocate pvfs_arr_disp\n");
                ret = -1;
                break;
            }
            for (i = 0; i < arr_int[0]; i++)
            {
                pvfs_arr_disp[i] =
                    (PVFS_size) arr_addr[i];
            }
            /* hand over the converted displacements, not a cast of the
             * MPI_Aint array (which need not be 64 bits wide) */
            ret = PVFS_Request_hindexed(arr_int[0], &arr_int[1],
                                        pvfs_arr_disp,
                                        *old_pvfs_dtype, pvfs_dtype);
            ADIOI_Free(pvfs_arr_disp);
            pvfs_arr_disp = NULL;
            break;
        case MPI_COMBINER_DUP:
            leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
            child_converted = 1;
            ret = PVFS_Request_contiguous(1,
                                          *old_pvfs_dtype, pvfs_dtype);
            break;
        /* No native PVFS2 support for the following combiners
         * currently; ret stays -1 and nothing is built.  The scratch
         * request is released once, after the switch. */
        case MPI_COMBINER_INDEXED_BLOCK:
            unsupported = "INDEXED_BLOCK";
            break;
        case MPI_COMBINER_HINDEXED_INTEGER:
            unsupported = "HINDEXED_INTEGER";
            break;
        case MPI_COMBINER_STRUCT_INTEGER:
            unsupported = "STRUCT_INTEGER";
            break;
        case MPI_COMBINER_SUBARRAY:
            unsupported = "SUBARRAY";
            break;
        case MPI_COMBINER_DARRAY:
            unsupported = "DARRAY";
            break;
        case MPI_COMBINER_F90_REAL:
            unsupported = "F90_REAL";
            break;
        case MPI_COMBINER_F90_COMPLEX:
            unsupported = "F90_COMPLEX";
            break;
        case MPI_COMBINER_F90_INTEGER:
            unsupported = "F90_INTEGER";
            break;
        case MPI_COMBINER_RESIZED:
            unsupported = "RESIZED";
            break;
        default:
            break;
        }

        if (unsupported != NULL)
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "%s is unsupported\n", unsupported);
        if (ret != 0)
            fprintf(stderr, "Error in PVFS_Request_* "
                    "for a derived datatype\n");

#ifdef DEBUG_DTYPE
        print_dtype_info(combiner,
                         num_int,
                         num_addr,
                         num_dtype,
                         arr_int,
                         arr_addr,
                         arr_dtype);
#endif

        /* A non-leaf child is a derived datatype handle returned by
         * MPI_Type_get_contents() and has to be released.  Children of
         * unsupported combiners were never inspected (and may not even
         * exist, e.g. for the F90 combiners), so they are left alone. */
        if (child_converted && leaf != 1 && combiner != MPI_COMBINER_DUP)
            MPI_Type_free(&arr_dtype[0]);

        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        ADIOI_Free(arr_dtype);

        /* only release the child request if the recursion produced one */
        if (child_converted)
            PVFS_Request_free(old_pvfs_dtype);
        ADIOI_Free(old_pvfs_dtype);

        return ret;
    }
    else /* MPI_COMBINER_STRUCT */
    {
        MPI_Aint mpi_lb = -1, mpi_extent = -1;
        PVFS_offset pvfs_lb = -1;
        PVFS_size pvfs_extent = -1;
        int has_lb_ub = 0;
        int alloc_ok = 1;

        /* When converting into a PVFS_Request_struct, we no longer
         * can use MPI_LB and MPI_UB.  Therefore, we have to do the
         * following.
         * We simply ignore all the MPI_LB and MPI_UB types and
         * get the lb and extent and pass it on through a
         * PVFS resized_req */

        arr_count = 0;
        for (i = 0; i < arr_int[0]; i++)
        {
            if (arr_dtype[i] != MPI_LB &&
                arr_dtype[i] != MPI_UB)
            {
                arr_count++;
            }
        }

        if (arr_int[0] != arr_count)
        {
            MPI_Type_get_extent(*mpi_dtype, &mpi_lb, &mpi_extent);
            pvfs_lb = mpi_lb;
            pvfs_extent = mpi_extent;
            if ((pvfs_arr_len = ADIOI_Malloc(arr_count*sizeof(int)))
                == NULL)
            {
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "Failed to allocate pvfs_arr_len\n");
                alloc_ok = 0;
            }
            has_lb_ub = 1;
        }

        if ((old_pvfs_dtype_arr
             = ADIOI_Malloc(arr_count*sizeof(PVFS_Request))) == NULL)
        {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate PVFS_Requests\n");
            alloc_ok = 0;
        }

        if ((pvfs_arr_disp = ADIOI_Malloc(arr_count*sizeof(PVFS_size)))
            == NULL)
        {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate pvfs_arr_disp\n");
            alloc_ok = 0;
        }

        /* ret is still -1 here, which is what gets reported when any
         * of the work arrays could not be allocated */
        if (alloc_ok)
        {
            /* Convert every real member; LB/UB markers are dropped and
             * the remaining members are packed to the front. */
            arr_count = 0;
            for (i = 0; i < arr_int[0]; i++)
            {
                if (arr_dtype[i] != MPI_LB &&
                    arr_dtype[i] != MPI_UB)
                {
                    leaf = convert_mpi_pvfs2_dtype(
                        &arr_dtype[i], &old_pvfs_dtype_arr[arr_count]);
                    if (leaf != 1)
                        MPI_Type_free(&arr_dtype[i]);
                    pvfs_arr_disp[arr_count] =
                        (PVFS_size) arr_addr[i];
                    if (has_lb_ub)
                    {
                        pvfs_arr_len[arr_count] =
                            arr_int[i+1];
                    }
                    arr_count++;
                }
            }

            /* If a MPI_UB or MPI_LB did exist, we have to
             * resize the datatype */
            if (has_lb_ub)
            {
                PVFS_Request *tmp_pvfs_dtype =
                    ADIOI_Malloc(sizeof(PVFS_Request));

                if (tmp_pvfs_dtype == NULL)
                {
                    fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                            "Failed to allocate PVFS_Request\n");
                    ret = -1;
                }
                else
                {
                    ret = PVFS_Request_struct(arr_count, pvfs_arr_len,
                                              pvfs_arr_disp,
                                              old_pvfs_dtype_arr,
                                              tmp_pvfs_dtype);
                    if (ret != 0)
                    {
                        fprintf(stderr, "Error in PVFS_Request_struct\n");
                    }
                    else
                    {
#ifdef DEBUG_DTYPE
                        fprintf(stderr, "STRUCT(WITHOUT %d LB or UB)(%d,[",
                                arr_int[0] - arr_count, arr_count);
                        for (i = 0; i < arr_count; i++)
                            fprintf(stderr, "(%d,%Ld) ",
                                    pvfs_arr_len[i],
                                    pvfs_arr_disp[i]);
                        fprintf(stderr, "]\n");
                        fprintf(stderr, "RESIZED(LB = %Ld, EXTENT = %Ld)\n",
                                pvfs_lb, pvfs_extent);
#endif
                        /* only resize (and later free) a request that
                         * PVFS_Request_struct actually produced */
                        ret = PVFS_Request_resized(*tmp_pvfs_dtype,
                                                   pvfs_lb, pvfs_extent,
                                                   pvfs_dtype);
                        if (ret != 0)
                            fprintf(stderr,
                                    "Error in PVFS_Request_resize\n");

                        PVFS_Request_free(tmp_pvfs_dtype);
                    }
                    ADIOI_Free(tmp_pvfs_dtype);
                }
            }
            else /* No MPI_LB or MPI_UB datatypes */
            {
                ret = PVFS_Request_struct(arr_int[0], &arr_int[1],
                                          pvfs_arr_disp,
                                          old_pvfs_dtype_arr, pvfs_dtype);
                if (ret != 0)
                    fprintf(stderr, "Error in PVFS_Request_struct\n");

#ifdef DEBUG_DTYPE
                print_dtype_info(combiner,
                                 num_int,
                                 num_addr,
                                 num_dtype,
                                 arr_int,
                                 arr_addr,
                                 arr_dtype);
#endif
            }

            /* the member requests are no longer needed once the struct
             * request has been built (or has failed to build) */
            for (i = 0; i < arr_count; i++)
                PVFS_Request_free(&old_pvfs_dtype_arr[i]);
        }

        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        ADIOI_Free(arr_dtype);

        /* ADIOI_Free is a project wrapper whose treatment of NULL is
         * not visible from here, so never pass it a pointer that was
         * not allocated (pvfs_arr_len only exists with LB/UB markers) */
        if (old_pvfs_dtype_arr != NULL)
            ADIOI_Free(old_pvfs_dtype_arr);
        if (pvfs_arr_disp != NULL)
            ADIOI_Free(pvfs_arr_disp);
        if (pvfs_arr_len != NULL)
            ADIOI_Free(pvfs_arr_len);

        return ret;
    }
}
/* convert_named - build a one-element PVFS2 request for a predefined
 * (named) MPI datatype.
 *
 * mpi_dtype  - named datatype to translate
 * pvfs_dtype - receives the PVFS2 request
 * combiner   - unused; kept so existing callers keep compiling
 *
 * Returns the status of PVFS_Request_contiguous() (0 on success), or
 * -1 when the datatype has no PVFS2 counterpart in the table below, in
 * which case *pvfs_dtype is left untouched.
 *
 * NOTE(review): the switch requires MPI_Datatype handles to be integer
 * constant expressions in the MPI implementation being built against. */
int convert_named(MPI_Datatype *mpi_dtype,
                  PVFS_Request *pvfs_dtype, int combiner)
{
    int ret = -1;
    PVFS_Request pvfs_base;
    const char *type_name = NULL;

    (void) combiner;

#ifdef DEBUG_DTYPE
    fprintf(stderr, "NAMED");
#endif

    /* Pick the PVFS2 elementary type; the request itself is created
     * once, after the switch. */
    switch (*mpi_dtype)
    {
    case MPI_CHAR:
        pvfs_base = PVFS_CHAR;
        type_name = "MPI_CHAR";
        break;
    case MPI_BYTE:
        pvfs_base = PVFS_BYTE;
        type_name = "MPI_BYTE";
        break;
    case MPI_SHORT:
        pvfs_base = PVFS_SHORT;
        type_name = "MPI_SHORT";
        break;
    case MPI_INT:
        pvfs_base = PVFS_INT;
        type_name = "MPI_INT";
        break;
    case MPI_LONG:
        pvfs_base = PVFS_LONG;
        type_name = "MPI_LONG";
        break;
    case MPI_FLOAT:
        pvfs_base = PVFS_FLOAT;
        type_name = "MPI_FLOAT";
        break;
    case MPI_DOUBLE:
        pvfs_base = PVFS_DOUBLE;
        type_name = "MPI_DOUBLE";
        break;
    case MPI_UNSIGNED_CHAR:
        pvfs_base = PVFS_UNSIGNED_CHAR;
        type_name = "MPI_UNSIGNED_CHAR";
        break;
    case MPI_UNSIGNED_SHORT:
        /* was PVFS_UNSIGNED, which describes an unsigned int and so
         * the wrong element width */
        pvfs_base = PVFS_UNSIGNED_SHORT;
        type_name = "MPI_UNSIGNED_SHORT";
        break;
    case MPI_UNSIGNED:
        pvfs_base = PVFS_UNSIGNED;
        type_name = "MPI_UNSIGNED";
        break;
    case MPI_UNSIGNED_LONG:
        pvfs_base = PVFS_UNSIGNED_LONG;
        type_name = "MPI_UNSIGNED_LONG";
        break;
    case MPI_LONG_DOUBLE:
        pvfs_base = PVFS_LONG_DOUBLE;
        type_name = "MPI_LONG_DOUBLE";
        break;
    default:
        fprintf(stderr, "convert_named: predefined type not found\n");
        return -1;
    }

    ret = PVFS_Request_contiguous(1, pvfs_base, pvfs_dtype);

#ifdef DEBUG_DTYPE
    fprintf(stderr, "-%s\n", type_name);
#else
    (void) type_name;
#endif

    if (ret != 0)
        fprintf(stderr, "convert_named: Datatype creation failed\n");

    return ret;
}
/* print_dtype_info - debugging aid that writes a one-line description
 * of a datatype constructor to stderr.
 *
 * combiner  - MPI_COMBINER_* value selecting the layout to print
 * arr_int   - integer arguments as returned by MPI_Type_get_contents()
 * arr_addr  - address arguments as returned by MPI_Type_get_contents()
 * num_int, num_addr, num_dtype, arr_dtype - accepted for symmetry with
 *             MPI_Type_get_contents() but not read here
 *
 * Address-sized values are cast to long long and printed with %lld so
 * the format always matches the argument, whatever width MPI_Aint has. */
void print_dtype_info(int combiner,
                      int num_int,
                      int num_addr,
                      int num_dtype,
                      int *arr_int,
                      MPI_Aint *arr_addr,
                      MPI_Datatype *arr_dtype)
{
    int i = -1;

    (void) num_int;
    (void) num_addr;
    (void) num_dtype;
    (void) arr_dtype;

    switch (combiner)
    {
    case MPI_COMBINER_CONTIGUOUS:
        fprintf(stderr, "CONTIG(%d)\n", arr_int[0]);
        break;
    case MPI_COMBINER_VECTOR:
        fprintf(stderr, "VECTOR(%d,%d,%d)\n",
                arr_int[0], arr_int[1], arr_int[2]);
        break;
    case MPI_COMBINER_HVECTOR:
        /* the stride is an MPI_Aint, not an int */
        fprintf(stderr, "HVECTOR(%d,%d,%lld)\n",
                arr_int[0], arr_int[1], (long long) arr_addr[0]);
        break;
    case MPI_COMBINER_INDEXED:
        fprintf(stderr, "INDEXED(%d,[",
                arr_int[0]);
        /* block lengths start at arr_int[1], displacements follow them */
        for (i = 0; i < arr_int[0]; i++)
            fprintf(stderr, "(%d,%lld) ",
                    arr_int[1+i],
                    (long long) arr_int[arr_int[0]+1+i]);
        fprintf(stderr, "]\n");
        break;
    case MPI_COMBINER_HINDEXED:
        fprintf(stderr, "HINDEXED(%d,[",
                arr_int[0]);
        for (i = 0; i < arr_int[0]; i++)
            fprintf(stderr, "(%d,%lld) ",
                    arr_int[1+i],
                    (long long) arr_addr[i]);
        fprintf(stderr, "]\n");
        break;
    case MPI_COMBINER_STRUCT:
        fprintf(stderr, "STRUCT(%d,[",
                arr_int[0]);
        for (i = 0; i < arr_int[0]; i++)
            fprintf(stderr, "(%d,%lld) ",
                    arr_int[1+i],
                    (long long) arr_addr[i]);
        fprintf(stderr, "]\n");
        break;
    case MPI_COMBINER_DUP:
        fprintf(stderr, "DUP\n");
        break;
    default:
        fprintf(stderr, "no available information on this datatype\n");
        break;
    }
}

Просмотреть файл

@ -0,0 +1,665 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2006 Unknown (TODO: fix this)
*/
#include <assert.h>
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_io.h"
#include "ad_pvfs2_common.h"
#define COALESCE_REGIONS /* TODO: would we ever want to *not* coalesce? */
#define MAX_OL_COUNT 64
/* ADIOI_PVFS2_StridedListIO - noncontiguous read or write implemented
 * with PVFS2 list I/O.
 *
 * The memory and file datatypes are walked as flattened
 * (offset, length) lists.  At most MAX_OL_COUNT pairs per side are
 * generated at a time by gen_listio_arr(), turned into PVFS2 hindexed
 * requests and submitted with PVFS_sys_read()/PVFS_sys_write(), until
 * buftype_size * count bytes have been described.
 *
 * fd            - open file; disp, fp_ind, etype_size, filetype,
 *                 atomicity and fs_ptr are read, fp_ind is advanced for
 *                 ADIO_INDIVIDUAL accesses
 * buf           - user buffer
 * count         - number of 'datatype' elements in buf
 * datatype      - memory datatype
 * file_ptr_type - ADIO_INDIVIDUAL selects the individual file pointer;
 *                 any other value uses 'offset'
 * offset        - explicit offset, in etypes relative to the filetype
 * status        - filled in when HAVE_STATUS_SET_BYTES is defined
 * error_code    - MPI_SUCCESS, or an MPIO error code on failure
 * rw_type       - READ selects PVFS_sys_read, anything else writes
 *
 * Returns -1 for the two early exits (atomic mode, zero-size filetype)
 * and 0 otherwise; I/O failures are reported through *error_code. */
int ADIOI_PVFS2_StridedListIO(ADIO_File fd, void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Status *status,
                              int *error_code, int rw_type)
{
    /* list I/O parameters */
    int i = -1, ret = -1;
    int tmp_filetype_size = -1;
    int64_t cur_io_size = 0, io_size = 0;
    int etype_size = -1;
    int num_etypes_in_filetype = -1, num_filetypes = -1;
    int etypes_in_filetype = -1, size_in_filetype = -1;
    int bytes_into_filetype = 0;
    MPI_Offset total_bytes_accessed = 0;

    /* parameters for offset-length pairs arrays */
    int64_t buf_off_arr[MAX_OL_COUNT];
    int32_t buf_len_arr[MAX_OL_COUNT];
    int64_t file_off_arr[MAX_OL_COUNT];
    int32_t file_len_arr[MAX_OL_COUNT];
    int32_t buf_ol_count = 0;
    int32_t file_ol_count = 0;

    /* parameters for flattened memory and file datatypes*/
    int flat_buf_index = 0;
    int flat_file_index = 0;
    int64_t cur_flat_buf_reg_off = 0;
    int64_t cur_flat_file_reg_off = 0;
    ADIOI_Flatlist_node *flat_buf_p, *flat_file_p;
    int buftype_size = -1, filetype_size = -1;
    /* MPI_Type_extent() stores an MPI_Aint; receiving it in an int
     * would overwrite neighbouring stack memory where MPI_Aint is
     * wider than int */
    MPI_Aint buftype_extent = -1, filetype_extent = -1;
    int buftype_is_contig = -1, filetype_is_contig = -1;

    /* PVFS2 specific parameters */
    PVFS_Request mem_req, file_req;
    ADIOI_PVFS2_fs * pvfs_fs;
    PVFS_sysresp_io resp_io;
    static char myname[] = "ADIOI_PVFS2_STRIDED_LISTIO";

    if (fd->atomicity) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_ARG,
                                           "Atomic noncontiguous writes"
                                           " are not supported by PVFS2", 0);
        return -1;
    }

    MPI_Type_size(fd->filetype, &filetype_size);
    if (filetype_size == 0) {
        *error_code = MPI_SUCCESS;
        return -1;
    }
    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    /* widen before multiplying so large transfers do not overflow int */
    io_size = (int64_t) buftype_size * count;

    pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;

    /* Flatten the memory datatype
     * (file datatype has already been flattened in ADIO open
     * unless it is contiguous, then we need to flatten it manually)
     * and set the correct buffers for flat_buf and flat_file */
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
    if (buftype_is_contig == 0)
    {
        ADIOI_Flatten_datatype(datatype);
        flat_buf_p = ADIOI_Flatlist;
        while (flat_buf_p->type != datatype)
            flat_buf_p = flat_buf_p->next;
    }
    else
    {
        /* flatten and add to the list */
        flat_buf_p = (ADIOI_Flatlist_node *) ADIOI_Malloc
            (sizeof(ADIOI_Flatlist_node));
        flat_buf_p->blocklens = (ADIO_Offset*)ADIOI_Malloc(sizeof(ADIO_Offset));
        flat_buf_p->indices =
            (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset));
        /* For the buffer, we can optimize the buftype, this is not
         * possible with the filetype since it is tiled */
        buftype_size = buftype_size*count;
        /* the whole buffer is now one block, so its extent is its size
         * (this used to multiply by count a second time) */
        buftype_extent = buftype_size;
        flat_buf_p->blocklens[0] = buftype_size;
        flat_buf_p->indices[0] = 0;
        flat_buf_p->count = 1;
    }
    if (filetype_is_contig == 0)
    {
        /* TODO: why does avery say this should already have been
         * flattened in Open, but also says contig types don't get
         * flattened */
        ADIOI_Flatten_datatype(fd->filetype);
        flat_file_p = ADIOI_Flatlist;
        while (flat_file_p->type != fd->filetype)
            flat_file_p = flat_file_p->next;
    }
    else
    {
        /* flatten and add to the list */
        flat_file_p = (ADIOI_Flatlist_node *) ADIOI_Malloc
            (sizeof(ADIOI_Flatlist_node));
        flat_file_p->blocklens =(ADIO_Offset*)ADIOI_Malloc(sizeof(ADIO_Offset));
        flat_file_p->indices =
            (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset));
        flat_file_p->blocklens[0] = filetype_size;
        flat_file_p->indices[0] = 0;
        flat_file_p->count = 1;
    }

    /* Find out where we are in the flattened filetype (the block index,
     * how far into the block, and how many bytes_into_filetype)
     * If the file_ptr_type == ADIO_INDIVIDUAL we will use disp, fp_ind
     * to figure this out (offset should always be zero)
     * If file_ptr_type == ADIO_EXPLICIT, we will use disp and offset
     * to figure this out. */

    etype_size = fd->etype_size;
    num_etypes_in_filetype = filetype_size / etype_size;
    if (file_ptr_type == ADIO_INDIVIDUAL)
    {
        int flag = 0;
        /* Should have already been flattened in ADIO_Open*/
        num_filetypes = -1;
        while (!flag)
        {
            num_filetypes++;
            for (i = 0; i < flat_file_p->count; i++)
            {
                /* Start on a non zero-length region */
                if (flat_file_p->blocklens[i])
                {
                    /* NOTE(review): the lower bound below does not add
                     * num_filetypes * filetype_extent the way the upper
                     * bound does; kept as in the original - confirm */
                    if (fd->disp + flat_file_p->indices[i] +
                        (num_filetypes * filetype_extent) +
                        flat_file_p->blocklens[i] > fd->fp_ind &&
                        fd->disp + flat_file_p->indices[i] <=
                        fd->fp_ind)
                    {
                        flat_file_index = i;
                        cur_flat_file_reg_off = fd->fp_ind -
                            (fd->disp + flat_file_p->indices[i] +
                             (num_filetypes * filetype_extent));
                        flag = 1;
                        break;
                    }
                    else
                        bytes_into_filetype += flat_file_p->blocklens[i];
                }
            }
        }
        /* Impossible that we don't find it in this datatype */
        assert(i != flat_file_p->count);
    }
    else
    {
        num_filetypes = (int) (offset / num_etypes_in_filetype);
        etypes_in_filetype = (int) (offset % num_etypes_in_filetype);
        size_in_filetype = etypes_in_filetype * etype_size;

        tmp_filetype_size = 0;
        for (i=0; i<flat_file_p->count; i++) {
            tmp_filetype_size += flat_file_p->blocklens[i];
            if (tmp_filetype_size > size_in_filetype)
            {
                flat_file_index = i;
                cur_flat_file_reg_off = flat_file_p->blocklens[i] -
                    (tmp_filetype_size - size_in_filetype);
                /* NOTE(review): 'offset' counts etypes while
                 * filetype_size is in bytes; kept as in the original -
                 * confirm the intended unit */
                bytes_into_filetype = offset * filetype_size -
                    flat_file_p->blocklens[i];
                break;
            }
        }
    }
#ifdef DEBUG_LIST
    fprintf(stderr, "ADIOI_PVFS2_StridedListIO: (fd->fp_ind=%Ld,fd->disp=%Ld,"
            " offset=%Ld)\n(flat_file_index=%d,cur_flat_file_reg_off=%Ld,"
            "bytes_into_filetype=%d)\n",
            fd->fp_ind, fd->disp, offset, flat_file_index,
            cur_flat_file_reg_off, bytes_into_filetype);
#endif
#ifdef DEBUG_LIST2
    fprintf(stderr, "flat_buf:\n");
    for (i = 0; i < flat_buf_p->count; i++)
        fprintf(stderr, "(offset, length) = (%Ld, %d)\n",
                flat_buf_p->indices[i],
                flat_buf_p->blocklens[i]);
    fprintf(stderr, "flat_file:\n");
    for (i = 0; i < flat_file_p->count; i++)
        fprintf(stderr, "(offset, length) = (%Ld, %d)\n",
                flat_file_p->indices[i],
                flat_file_p->blocklens[i]);
#endif

    /* total data written */
    cur_io_size = 0;
    while (cur_io_size != io_size)
    {
        /* Initialize the temporarily unrolling lists and
         * and associated variables */
        buf_ol_count = 0;
        file_ol_count = 0;
        for (i = 0; i < MAX_OL_COUNT; i++)
        {
            buf_off_arr[i] = 0;
            buf_len_arr[i] = 0;
            file_off_arr[i] = 0;
            file_len_arr[i] = 0;
        }

        /* Generate the offset-length pairs for a
         * list I/O operation (gen_listio_arr takes int sizes/extents) */
        gen_listio_arr(flat_buf_p,
                       &flat_buf_index,
                       &cur_flat_buf_reg_off,
                       buftype_size,
                       (int) buftype_extent,
                       flat_file_p,
                       &flat_file_index,
                       &cur_flat_file_reg_off,
                       filetype_size,
                       (int) filetype_extent,
                       MAX_OL_COUNT,
                       fd->disp,
                       bytes_into_filetype,
                       &cur_io_size,
                       io_size,
                       buf_off_arr,
                       buf_len_arr,
                       &buf_ol_count,
                       file_off_arr,
                       file_len_arr,
                       &file_ol_count);

        assert(buf_ol_count <= MAX_OL_COUNT);
        assert(file_ol_count <= MAX_OL_COUNT);
#ifdef DEBUG_LIST2
        print_buf_file_ol_pairs(buf_off_arr,
                                buf_len_arr,
                                buf_ol_count,
                                file_off_arr,
                                file_len_arr,
                                file_ol_count,
                                buf,
                                rw_type);
#endif
#ifdef DEBUG_LIST2
        do {
            int y, z;
            fprintf(stderr, "ad_pvfs2_io_list.c::\n");
            for (y = 0; y < buf_ol_count; y++)
            {
                for (z = 0; z < buf_len_arr[y]; z++)
                {
                    fprintf(stderr, "buf[%d][%d]=%c\n",
                            y, z, ((char *) buf + buf_off_arr[y])[z]);
                }
            }
        } while (0);
#endif

        /* Run list I/O operation.  Each request constructor is checked
         * on its own so a failed one is never used or freed. */
        ret = PVFS_Request_hindexed(buf_ol_count, buf_len_arr,
                                    buf_off_arr, PVFS_BYTE, &mem_req);
        if (ret != 0)
        {
            fprintf(stderr, "ADIOI_PVFS2_StridedListIO: error in "
                    "HINDEXED memory type\n");
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               ADIOI_PVFS2_error_convert(ret),
                                               "Error in PVFS_Request_hindexed \n", 0);
            goto error_state;
        }

        ret = PVFS_Request_hindexed(file_ol_count, file_len_arr,
                                    file_off_arr, PVFS_BYTE, &file_req);
        if (ret != 0)
        {
            fprintf(stderr, "ADIOI_PVFS2_StridedListIO: error in "
                    "HINDEXED file type\n");
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               ADIOI_PVFS2_error_convert(ret),
                                               "Error in PVFS_Request_hindexed \n", 0);
            PVFS_Request_free(&mem_req);
            goto error_state;
        }

        if (rw_type == READ)
        {
            ret = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
                                buf, mem_req,
                                &(pvfs_fs->credentials), &resp_io);
        }
        else
        {
            ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
                                 buf, mem_req,
                                 &(pvfs_fs->credentials), &resp_io);
        }
        if (ret != 0)
        {
            fprintf(stderr, "ADIOI_PVFS2_StridedListIO: Warning - PVFS_sys_"
                    "read/write returned %d and completed %Ld bytes.\n",
                    ret, resp_io.total_completed);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               ADIOI_PVFS2_error_convert(ret),
                                               "Error in PVFS_sys_io \n", 0);
            PVFS_Request_free(&mem_req);
            PVFS_Request_free(&file_req);
            goto error_state;
        }
        total_bytes_accessed += resp_io.total_completed;

        PVFS_Request_free(&mem_req);
        PVFS_Request_free(&file_req);
    }

#ifdef DEBUG_LIST
    fprintf(stderr, "ADIOI_PVFS2_StridedListIO: "
            "total_bytes_accessed=%Ld,ret=%d\n",
            total_bytes_accessed, ret);
#endif

    if (file_ptr_type == ADIO_INDIVIDUAL)
        fd->fp_ind += total_bytes_accessed;
    *error_code = MPI_SUCCESS;

error_state:
#ifdef HAVE_STATUS_SET_BYTES
    /* TODO: why the cast? */
    MPIR_Status_set_bytes(status, datatype, (int)total_bytes_accessed);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif
    /* Release the flattened representations: list entries created by
     * ADIOI_Flatten_datatype go through ADIOI_Delete_flattened, the
     * single-block nodes built above are freed by hand. */
    if (buftype_is_contig == 0)
        ADIOI_Delete_flattened(datatype);
    else
    {
        ADIOI_Free(flat_buf_p->blocklens);
        ADIOI_Free(flat_buf_p->indices);
        ADIOI_Free(flat_buf_p);
    }

    if (filetype_is_contig == 0)
        ADIOI_Delete_flattened(fd->filetype);
    else
    {
        ADIOI_Free(flat_file_p->blocklens);
        ADIOI_Free(flat_file_p->indices);
        ADIOI_Free(flat_file_p);
    }

    return 0;
}
/* To do: Fix the code to coalesce the offset-length pairs for memory
* and file. */
/* gen_listio_arr - fills in offset-length pairs for memory and file
* for list I/O.
*
* Walks the flattened memory list and the flattened file list in
* lock step.  Each iteration consumes the largest region that fits in
* the current memory block, the current file block and the bytes still
* outstanding, and records it in both output arrays.  The walk stops
* when all bytes are described or either array holds max_ol_count
* entries; the caller calls again to continue from the saved cursors.
*
* flat_buf_p / flat_file_p   - flattened lists (blocklens, indices and
*                              count are read)
* flat_buf_index_p /
* flat_file_index_p          - in/out: index of the current block
* cur_flat_buf_reg_off_p /
* cur_flat_file_reg_off_p    - in/out: bytes already consumed from the
*                              current block
* flat_buf_size / _extent,
* flat_file_size / _extent   - bytes completed divided by the size gives
*                              the number of whole repetitions, which is
*                              multiplied by the extent to get the base
*                              offset of the current repetition
* max_ol_count               - capacity of each output array
* disp                       - added to every file offset
* bytes_into_filetype        - added to *bytes_completed before the
*                              file repetition count is computed
* bytes_completed            - in/out: bytes described so far
* total_io_size              - total bytes to describe
* buf_off_arr, buf_len_arr,
* buf_ol_count_p             - out: memory offset/length pairs + count
* file_off_arr, file_len_arr,
* file_ol_count_p            - out: file offset/length pairs + count
*
* Returns -1 (after printing a message) if either count is nonzero on
* entry, 0 otherwise. */
int gen_listio_arr(ADIOI_Flatlist_node *flat_buf_p,
int *flat_buf_index_p,
int64_t *cur_flat_buf_reg_off_p,
int flat_buf_size,
int flat_buf_extent,
ADIOI_Flatlist_node *flat_file_p,
int *flat_file_index_p,
int64_t *cur_flat_file_reg_off_p,
int flat_file_size,
int flat_file_extent,
int max_ol_count,
ADIO_Offset disp,
int bytes_into_filetype,
int64_t *bytes_completed,
int64_t total_io_size,
int64_t buf_off_arr[],
int32_t buf_len_arr[],
int32_t *buf_ol_count_p,
int64_t file_off_arr[],
int32_t file_len_arr[],
int32_t *file_ol_count_p)
{
int region_size = -1;
/* parameters for flattened memory and file datatypes*/
int64_t cur_flat_buf_reg_left = 0;
int64_t cur_flat_file_reg_left = 0;
#ifdef DEBUG_LIST2
fprintf(stderr, "gen_list_arr:\n");
#endif
/* The output arrays are filled from index 0, so both counts must
* arrive as zero. */
if ((*buf_ol_count_p) != 0 ||(*file_ol_count_p) != 0)
{
fprintf(stderr, "buf_ol_count != 0 || file_ol_count != 0\n");
return -1;
}
/* Start on a non-zero memory and file region
* Note this does not affect the bytes_completed
* since no data is in these regions. Initialize the
* first memory and file offsets. */
while (flat_buf_p->blocklens[(*flat_buf_index_p)] == 0)
{
(*flat_buf_index_p) = ((*flat_buf_index_p) + 1) %
flat_buf_p->count;
}
/* memory offset = base of the current repetition + block start +
* position inside the block */
buf_off_arr[*buf_ol_count_p] =
(*bytes_completed / flat_buf_size) *
flat_buf_extent +
flat_buf_p->indices[*flat_buf_index_p] +
*cur_flat_buf_reg_off_p;
buf_len_arr[*buf_ol_count_p] = 0;
while (flat_file_p->blocklens[(*flat_file_index_p)] == 0)
{
(*flat_file_index_p) = ((*flat_file_index_p) + 1) %
flat_file_p->count;
}
/* file offset is built the same way, shifted by disp */
file_off_arr[*file_ol_count_p] = disp +
(((bytes_into_filetype + *bytes_completed) / flat_file_size) *
flat_file_extent) +
flat_file_p->indices[*flat_file_index_p] +
*cur_flat_file_reg_off_p;
file_len_arr[*file_ol_count_p] = 0;
#ifdef DEBUG_LIST2
fprintf(stderr, "initial buf_off_arr[%d] = %Ld\n", *buf_ol_count_p,
buf_off_arr[*buf_ol_count_p]);
fprintf(stderr, "initial file_off_arr[%d] = %Ld\n", *file_ol_count_p,
file_off_arr[*file_ol_count_p]);
#endif
/* One region per iteration, until everything is described or one
* of the output arrays is full. */
while (*bytes_completed != total_io_size
&& (*buf_ol_count_p) < max_ol_count
&& (*file_ol_count_p) < max_ol_count)
{
/* How much data is left in the current piece in
* the flattened datatypes */
cur_flat_buf_reg_left = flat_buf_p->blocklens[*flat_buf_index_p]
- *cur_flat_buf_reg_off_p;
cur_flat_file_reg_left = flat_file_p->blocklens[*flat_file_index_p]
- *cur_flat_file_reg_off_p;
#ifdef DEBUG_LIST2
fprintf(stderr,
"flat_buf_index=%d flat_buf->blocklens[%d]=%d\n"
"cur_flat_buf_reg_left=%Ld "
"*cur_flat_buf_reg_off_p=%Ld\n"
"flat_file_index=%d flat_file->blocklens[%d]=%d\n"
"cur_flat_file_reg_left=%Ld "
"*cur_flat_file_reg_off_p=%Ld\n"
"bytes_completed=%Ld\n"
"buf_ol_count=%d file_ol_count=%d\n"
"buf_len_arr[%d]=%d file_len_arr[%d]=%d\n\n",
*flat_buf_index_p, *flat_buf_index_p,
flat_buf_p->blocklens[*flat_buf_index_p],
cur_flat_buf_reg_left,
*cur_flat_buf_reg_off_p,
*flat_file_index_p, *flat_file_index_p,
flat_file_p->blocklens[*flat_file_index_p],
cur_flat_file_reg_left,
*cur_flat_file_reg_off_p,
*bytes_completed,
*buf_ol_count_p, *file_ol_count_p,
*buf_ol_count_p,
buf_len_arr[*buf_ol_count_p],
*file_ol_count_p,
file_len_arr[*file_ol_count_p]);
#endif
/* What is the size of the next contiguous region agreed
* upon by both memory and file regions that does not
* surpass the file size */
if (cur_flat_buf_reg_left > cur_flat_file_reg_left)
region_size = cur_flat_file_reg_left;
else
region_size = cur_flat_buf_reg_left;
if (region_size > total_io_size - *bytes_completed)
region_size = total_io_size - *bytes_completed;
/* Add this piece to both the mem and file arrays
* coalescing offset-length pairs if possible and advance
* the pointers through the flatten mem and file datatypes
* as well Note: no more than a single piece can be done
* since we take the smallest one possible */
if (cur_flat_buf_reg_left == region_size)
{
#ifdef DEBUG_LIST2
fprintf(stderr, "reached end of memory block...\n");
#endif
/* The memory block is used up: step to the next non-empty
* block (wrapping around the list) and restart at its
* beginning. */
(*flat_buf_index_p) = ((*flat_buf_index_p) + 1) %
flat_buf_p->count;
while (flat_buf_p->blocklens[(*flat_buf_index_p)] == 0)
{
(*flat_buf_index_p) = ((*flat_buf_index_p) + 1) %
flat_buf_p->count;
}
*cur_flat_buf_reg_off_p = 0;
#ifdef COALESCE_REGIONS
if (*buf_ol_count_p != 0)
{
/* If the previous pair ends exactly where this one
* starts, extend the previous pair instead of closing a
* new one. */
if (buf_off_arr[(*buf_ol_count_p) - 1] +
buf_len_arr[(*buf_ol_count_p) - 1] ==
buf_off_arr[*buf_ol_count_p])
{
buf_len_arr[(*buf_ol_count_p) - 1] +=
region_size;
}
else
{
buf_len_arr[*buf_ol_count_p] += region_size;
(*buf_ol_count_p)++;
}
}
else
{
#endif
/* Close the current pair.  With COALESCE_REGIONS defined
* this is the "first pair" branch of the if above. */
buf_len_arr[*buf_ol_count_p] += region_size;
(*buf_ol_count_p)++;
#ifdef COALESCE_REGIONS
}
#endif
/* Don't prepare for the next piece if we have reached
* the limit or else it will segment fault. */
if ((*buf_ol_count_p) != max_ol_count)
{
buf_off_arr[*buf_ol_count_p] =
((*bytes_completed + region_size) / flat_buf_size) *
flat_buf_extent +
flat_buf_p->indices[*flat_buf_index_p] +
(*cur_flat_buf_reg_off_p);
buf_len_arr[*buf_ol_count_p] = 0;
}
}
else if (cur_flat_buf_reg_left > region_size)
{
#ifdef DEBUG_LIST2
fprintf(stderr, "advanced %d in memory block...\n",
region_size);
#endif
/* Still inside the same memory block: grow the open pair. */
(*cur_flat_buf_reg_off_p) += region_size;
buf_len_arr[*buf_ol_count_p] += region_size;
}
else
{
/* region_size exceeded what is left in the block; only
* reported, the walk continues */
fprintf(stderr, "gen_listio_arr: Error\n");
}
/* To calculate the absolute file offset we need to
* add the disp, how many filetypes we have gone through,
* the relative block offset in the filetype and how far
* into the block we have gone. */
if (cur_flat_file_reg_left == region_size)
{
#ifdef DEBUG_LIST2
fprintf(stderr, "reached end of file block...\n");
#endif
/* Same bookkeeping as for memory, applied to the file list. */
(*flat_file_index_p) = ((*flat_file_index_p) + 1) %
flat_file_p->count;
while (flat_file_p->blocklens[(*flat_file_index_p)] == 0)
{
(*flat_file_index_p) = ((*flat_file_index_p) + 1) %
flat_file_p->count;
}
(*cur_flat_file_reg_off_p) = 0;
#ifdef COALESCE_REGIONS
if (*file_ol_count_p != 0)
{
if (file_off_arr[(*file_ol_count_p) - 1] +
file_len_arr[(*file_ol_count_p) - 1] ==
file_off_arr[*file_ol_count_p])
{
file_len_arr[(*file_ol_count_p) - 1] +=
region_size;
}
else
{
file_len_arr[*file_ol_count_p] += region_size;
(*file_ol_count_p)++;
}
}
else
{
#endif
file_len_arr[*file_ol_count_p] += region_size;
(*file_ol_count_p)++;
#ifdef COALESCE_REGIONS
}
#endif
/* Don't prepare for the next piece if we have reached
* the limit or else it will segment fault. */
if ((*file_ol_count_p) != max_ol_count)
{
file_off_arr[*file_ol_count_p] = disp +
(((bytes_into_filetype + *bytes_completed + region_size)
/ flat_file_size) *
flat_file_extent) +
flat_file_p->indices[*flat_file_index_p] +
(*cur_flat_file_reg_off_p);
file_len_arr[*file_ol_count_p] = 0;
}
}
else if (cur_flat_file_reg_left > region_size)
{
#ifdef DEBUG_LIST2
fprintf(stderr, "advanced %d in file block...\n",
region_size);
#endif
(*cur_flat_file_reg_off_p) += region_size;
file_len_arr[*file_ol_count_p] += region_size;
}
else
{
fprintf(stderr, "gen_listio_arr: Error\n");
}
#ifdef DEBUG_LIST2
fprintf(stderr,
"------------------------------\n\n");
#endif
*bytes_completed += region_size;
}
/* Increment the count if we stopped in the middle of a
* memory or file region */
if (*cur_flat_buf_reg_off_p != 0)
(*buf_ol_count_p)++;
if (*cur_flat_file_reg_off_p != 0)
(*file_ol_count_p)++;
return 0;
}
/* Print one labelled list of (offset, length) pairs to stderr. */
static void print_ol_pair_list(const char *label,
                               const int64_t off_arr[],
                               const int32_t len_arr[],
                               int32_t ol_count)
{
    int32_t i;

    fprintf(stderr, "%s_ol_pairs(offset,length) count = %d\n",
            label, (int) ol_count);
    for (i = 0; i < ol_count; i++)
    {
        /* cast so the arguments match %lld / %d on every platform
         * ("%Ld" is not a valid integer conversion) */
        fprintf(stderr, "(%lld, %d) ",
                (long long) off_arr[i], (int) len_arr[i]);
    }
    fprintf(stderr, "\n");
}

/* print_buf_file_ol_pairs - debugging aid that dumps the memory and the
 * file offset-length pairs of one list I/O step to stderr.
 *
 * buf_off_arr / buf_len_arr / buf_ol_count    - memory pairs and count
 * file_off_arr / file_len_arr / file_ol_count - file pairs and count
 * buf, rw_type - accepted for the caller's convenience; not used */
void print_buf_file_ol_pairs(int64_t buf_off_arr[],
                             int32_t buf_len_arr[],
                             int32_t buf_ol_count,
                             int64_t file_off_arr[],
                             int32_t file_len_arr[],
                             int32_t file_ol_count,
                             void *buf,
                             int rw_type)
{
    (void) buf;
    (void) rw_type;

    print_ol_pair_list("buf", buf_off_arr, buf_len_arr, buf_ol_count);
    print_ol_pair_list("file", file_off_arr, file_len_arr, file_ol_count);
    /* blank line separating this dump from whatever follows */
    fprintf(stderr, "\n");
}

Просмотреть файл

@ -8,7 +8,7 @@
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_io.h"
#include "ad_pvfs2_common.h"
void ADIOI_PVFS2_ReadContig(ADIO_File fd, void *buf, int count,
@ -92,898 +92,76 @@ fn_exit:
return;
}
/* Read-direction front end for the shared list I/O routine: forwards
 * every argument unchanged and selects READ as the transfer type. */
static int ADIOI_PVFS2_ReadStridedListIO(ADIO_File fd, void *buf, int count,
                                         MPI_Datatype datatype, int file_ptr_type,
                                         ADIO_Offset offset, ADIO_Status *status,
                                         int *error_code)
{
    const int rc = ADIOI_PVFS2_StridedListIO(fd, buf, count, datatype,
                                             file_ptr_type, offset, status,
                                             error_code, READ);

    return rc;
}
/* Read-direction front end for the shared datatype I/O routine:
 * forwards every argument unchanged and selects READ as the transfer
 * type. */
static int ADIOI_PVFS2_ReadStridedDtypeIO(ADIO_File fd, void *buf, int count,
                                          MPI_Datatype datatype, int file_ptr_type,
                                          ADIO_Offset offset, ADIO_Status *status,
                                          int *error_code)
{
    const int rc = ADIOI_PVFS2_StridedDtypeIO(fd, buf, count, datatype,
                                              file_ptr_type, offset, status,
                                              error_code, READ);

    return rc;
}
void ADIOI_PVFS2_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code)
{
/* offset is in units of etype relative to the filetype. */
ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, brd_size, frd_size=0, st_index=0;
int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset off, disp, start_off, initial_off;
int flag, st_frd_size, st_n_filetypes;
/* four ways (to date) that we can carry out strided i/o accesses:
* - naive posix
* - 'true' Datatype (from avery)
* - new List I/O (from avery)
* - classic List I/O (the one that's always been in ROMIO)
* I imagine we'll keep Datatype as an optional optimization, and afer a
* release or two promote it to the default
*/
int ret = -1;
int mem_list_count, file_list_count;
PVFS_size *mem_offsets;
int64_t *file_offsets;
int *mem_lengths;
int32_t *file_lengths;
int total_blks_to_read;
int max_mem_list, max_file_list;
int b_blks_read;
int f_data_read;
int size_read=0, n_read_lists, extra_blks;
int end_brd_size, end_frd_size;
int start_k, start_j, new_file_read, new_buffer_read;
int start_mem_offset;
PVFS_Request mem_req, file_req;
ADIOI_PVFS2_fs * pvfs_fs;
PVFS_sysresp_io resp_io;
int err_flag=0;
MPI_Offset total_bytes_read = 0;
static char myname[] = "ADIOI_PVFS2_ReadStrided";
#define MAX_ARRAY_SIZE 64
*error_code = MPI_SUCCESS; /* changed below if error */
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
/* the HDF5 tests showed a bug in this list processing code (see many many
* lines down below). We added a workaround, but common HDF5 file types
* are actually contiguous and do not need the expensive workaround */
if (!filetype_is_contig) {
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
if (flat_file->count == 1 && !buftype_is_contig)
filetype_is_contig = 1;
}
MPI_Type_size(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
*error_code = MPI_SUCCESS;
if (fd->hints->fs_hints.pvfs2.posix_read == ADIOI_HINT_ENABLE) {
ADIOI_GEN_ReadStrided(fd, buf, count, datatype,
file_ptr_type, offset, status, error_code);
return;
}
if (fd->hints->fs_hints.pvfs2.dtype_read == ADIOI_HINT_ENABLE) {
ret = ADIOI_PVFS2_ReadStridedDtypeIO(fd, buf, count,
datatype, file_ptr_type,
offset, status, error_code);
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
bufsize = buftype_size * count;
pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;
if (!buftype_is_contig && filetype_is_contig) {
/* noncontiguous in memory, contiguous in file. */
int64_t file_offsets;
int32_t file_lengths;
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
file_list_count = 1;
file_offsets = off;
file_lengths = 0;
total_blks_to_read = count*flat_buf->count;
b_blks_read = 0;
/* allocate arrays according to max usage */
if (total_blks_to_read > MAX_ARRAY_SIZE)
mem_list_count = MAX_ARRAY_SIZE;
else mem_list_count = total_blks_to_read;
mem_offsets = (PVFS_size*)ADIOI_Malloc(mem_list_count*sizeof(PVFS_size));
mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));
/* TODO: CHECK RESULTS OF MEMORY ALLOCATION */
j = 0;
/* step through each block in memory, filling memory arrays */
while (b_blks_read < total_blks_to_read) {
for (i=0; i<flat_buf->count; i++) {
mem_offsets[b_blks_read % MAX_ARRAY_SIZE] =
/* TODO: fix this compiler warning */
((PVFS_size)buf + j*buftype_extent + flat_buf->indices[i]);
mem_lengths[b_blks_read % MAX_ARRAY_SIZE] =
flat_buf->blocklens[i];
file_lengths += flat_buf->blocklens[i];
b_blks_read++;
if (!(b_blks_read % MAX_ARRAY_SIZE) ||
(b_blks_read == total_blks_to_read)) {
/* in the case of the last read list call,
adjust mem_list_count */
if (b_blks_read == total_blks_to_read) {
mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE;
/* in case last read list call fills max arrays */
if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
}
err_flag = PVFS_Request_hindexed(mem_list_count,
mem_lengths, mem_offsets, PVFS_BYTE, &mem_req);
if (err_flag < 0) break;
err_flag = PVFS_Request_contiguous(file_lengths,
PVFS_BYTE, &file_req);
if (err_flag < 0) break;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req,
file_offsets, PVFS_BOTTOM, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
/* --END ERROR HANDLING-- */
/* in the case of error or the last read list call,
* leave here */
if (err_flag || b_blks_read == total_blks_to_read) break;
file_offsets += file_lengths;
file_lengths = 0;
}
} /* for (i=0; i<flat_buf->count; i++) */
j++;
} /* while (b_blks_read < total_blks_to_read) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
if (file_ptr_type == ADIO_INDIVIDUAL)
fd->fp_ind += total_bytes_read;
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually read and placed in buf
by ADIOI_BUFFERED_READ. */
#endif
ADIOI_Delete_flattened(datatype);
return;
} /* if (!buftype_is_contig && filetype_is_contig) */
/* know file is noncontiguous from above */
/* noncontiguous in file */
/* filetype already flattened in ADIO_Open */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
initial_off = offset;
/* for each case - ADIO_Individual pointer or explicit, find the file
offset in bytes (offset), n_filetypes (how many filetypes into
file to start), frd_size (remaining amount of data in present
file block), and st_index (start point in terms of blocks in
starting filetype) */
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent +
flat_file->blocklens[i] >= offset) {
st_index = i;
frd_size = (int) (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
break;
}
}
} /* while (!flag) */
} /* if (file_ptr_type == ADIO_INDIVIDUAL) */
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
frd_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent +
abs_off_in_filetype;
} /* else [file_ptr_type != ADIO_INDIVIDUAL] */
start_off = offset;
st_frd_size = frd_size;
st_n_filetypes = n_filetypes;
if (buftype_is_contig && !filetype_is_contig) {
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
int mem_lengths;
char *mem_offsets;
i = 0;
j = st_index;
n_filetypes = st_n_filetypes;
mem_list_count = 1;
/* determine how many blocks in file to read */
f_data_read = ADIOI_MIN(st_frd_size, bufsize);
total_blks_to_read = 1;
if (j < (flat_file->count-1)) j++;
else {
j = 0;
n_filetypes++;
}
while (f_data_read < bufsize) {
f_data_read += flat_file->blocklens[j];
total_blks_to_read++;
if (j<(flat_file->count-1)) j++;
else j = 0;
}
j = st_index;
n_filetypes = st_n_filetypes;
n_read_lists = total_blks_to_read/MAX_ARRAY_SIZE;
extra_blks = total_blks_to_read%MAX_ARRAY_SIZE;
mem_offsets = buf;
mem_lengths = 0;
/* if at least one full readlist, allocate file arrays
at max array size and don't free until very end */
if (n_read_lists) {
file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int32_t));
}
/* if there's no full readlist allocate file arrays according
to needed size (extra_blks) */
else {
file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
sizeof(int32_t));
}
/* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
for (i=0; i<n_read_lists; i++) {
file_list_count = MAX_ARRAY_SIZE;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = st_frd_size;
mem_lengths = st_frd_size;
}
for (k=0; k<MAX_ARRAY_SIZE; k++) {
if (i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent
+ flat_file->indices[j];
file_lengths[k] = flat_file->blocklens[j];
mem_lengths += file_lengths[k];
}
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* PVFS_Request_hindexed already expresses the offsets into the
* file, so we should not pass in an offset if we are using
* hindexed for the file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
mem_offsets += mem_lengths;
mem_lengths = 0;
} /* for (i=0; i<n_read_lists; i++) */
/* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */
if (extra_blks) {
file_list_count = extra_blks;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = st_frd_size;
}
for (k=0; k<extra_blks; k++) {
if(i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent +
flat_file->indices[j];
if (k == (extra_blks - 1)) {
file_lengths[k] = bufsize - (int32_t) mem_lengths
- (int32_t) mem_offsets + (int32_t) buf;
}
else file_lengths[k] = flat_file->blocklens[j];
} /* if(i || k) */
mem_lengths += file_lengths[k];
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<extra_blks; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* as above, use 0 for 'offset' when using hindexed file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req, &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
}
}
else {
/* noncontiguous in memory as well as in file */
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
size_read = 0;
n_filetypes = st_n_filetypes;
frd_size = st_frd_size;
brd_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
max_mem_list = 0;
max_file_list = 0;
/* run through and find max_file_list and max_mem_list so that you
can allocate the file and memory arrays less than MAX_ARRAY_SIZE
if possible */
while (size_read < bufsize) {
k = start_k;
new_buffer_read = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data read and data to be
read in the next immediate read list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k] +
size_read) > bufsize) {
end_brd_size = new_buffer_read +
flat_buf->blocklens[k] - (bufsize - size_read);
new_buffer_read = bufsize - size_read;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
else new_buffer_read = brd_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
j = start_j;
new_file_read = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_read < new_buffer_read)) {
if(file_list_count) {
if((new_file_read + flat_file->blocklens[j]) >
new_buffer_read) {
end_frd_size = new_buffer_read - new_file_read;
new_file_read = new_buffer_read;
j--;
}
else {
new_file_read += flat_file->blocklens[j];
end_frd_size = flat_file->blocklens[j];
}
}
else {
if (frd_size > new_buffer_read) {
new_file_read = new_buffer_read;
frd_size = new_file_read;
}
else new_file_read = frd_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_read < new_buffer_read) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_read = 0;
mem_list_count = 0;
while (new_buffer_read < new_file_read) {
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k]) >
new_file_read) {
end_brd_size = new_file_read - new_buffer_read;
new_buffer_read = new_file_read;
k--;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_read = brd_size;
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_read < new_file_read) */
} /* if ((new_file_read < new_buffer_read) && (file_list_count
== MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
/* fakes filling the readlist arrays of lengths found above */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
if(i) {
if (i == (mem_list_count - 1)) {
if (flat_buf->blocklens[k] == end_brd_size)
brd_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
brd_size = flat_buf->blocklens[k] - end_brd_size;
k--;
buf_count--;
}
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
if (i) {
if (i == (file_list_count - 1)) {
if (flat_file->blocklens[j] == end_frd_size)
frd_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
frd_size = flat_file->blocklens[j] - end_frd_size;
j--;
}
}
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
size_read += new_buffer_read;
start_k = k;
start_j = j;
if (max_mem_list < mem_list_count)
max_mem_list = mem_list_count;
if (max_file_list < file_list_count)
max_file_list = file_list_count;
} /* while (size_read < bufsize) */
/* one last check before we actually carry out the operation:
* this code has hard-to-fix bugs when a noncontiguous file type has
* such large pieces that the sum of the lengths of the memory type is
* not larger than one of those pieces (and vice versa for large memory
* types and many pieces of file types). In these cases, give up and
* fall back to naive reads and writes. The testphdf5 test created a
* type with two very large memory regions and 600 very small file
* regions. The same test also created a type with one very large file
* region and many (700) very small memory regions. Both cases caused
* problems for this code */
if ( ( (file_list_count == 1) &&
(new_file_read < flat_file->blocklens[0] ) ) ||
((mem_list_count == 1) &&
(new_buffer_read < flat_buf->blocklens[0]) ) ||
((file_list_count == MAX_ARRAY_SIZE) &&
(new_file_read < flat_buf->blocklens[0]) ) ||
( (mem_list_count == MAX_ARRAY_SIZE) &&
(new_buffer_read < flat_file->blocklens[0])) )
/* Fall back to list I/O if datatype I/O didn't work */
if (ret != 0)
{
fprintf(stderr,
"Falling back to list I/O since datatype I/O failed\n");
ret = ADIOI_PVFS2_ReadStridedListIO(fd, buf, count,
datatype, file_ptr_type,
offset, status, error_code);
}
return;
}
if (fd->hints->fs_hints.pvfs2.listio_read == ADIOI_HINT_ENABLE) {
ret = ADIOI_PVFS2_ReadStridedListIO(fd, buf, count, datatype,
file_ptr_type, offset, status, error_code);
return;
}
/* Use classic list I/O if no hints given base case */
ADIOI_Delete_flattened(datatype);
ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype,
file_ptr_type, initial_off, status, error_code);
ADIOI_PVFS2_OldReadStrided(fd, buf, count, datatype,
file_ptr_type, offset, status, error_code);
return;
}
mem_offsets = (PVFS_size*)ADIOI_Malloc(max_mem_list*sizeof(PVFS_size));
mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));
size_read = 0;
n_filetypes = st_n_filetypes;
frd_size = st_frd_size;
brd_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
/* this section calculates mem_list_count and file_list_count
and also finds the possibly odd sized last array elements
in end_frd_size and end_brd_size */
while (size_read < bufsize) {
k = start_k;
new_buffer_read = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data read and data to be
read in the next immediate read list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k] +
size_read) > bufsize) {
end_brd_size = new_buffer_read +
flat_buf->blocklens[k] - (bufsize - size_read);
new_buffer_read = bufsize - size_read;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
else new_buffer_read = brd_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
j = start_j;
new_file_read = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_read < new_buffer_read)) {
if(file_list_count) {
if((new_file_read + flat_file->blocklens[j]) >
new_buffer_read) {
end_frd_size = new_buffer_read - new_file_read;
new_file_read = new_buffer_read;
j--;
}
else {
new_file_read += flat_file->blocklens[j];
end_frd_size = flat_file->blocklens[j];
}
}
else {
if (frd_size > new_buffer_read) {
new_file_read = new_buffer_read;
frd_size = new_file_read;
}
else new_file_read = frd_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_read < new_buffer_read) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_read = 0;
mem_list_count = 0;
while (new_buffer_read < new_file_read) {
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k]) >
new_file_read) {
end_brd_size = new_file_read - new_buffer_read;
new_buffer_read = new_file_read;
k--;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_read = brd_size;
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_read < new_file_read) */
} /* if ((new_file_read < new_buffer_read) && (file_list_count
== MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
/* fills the allocated readlist arrays */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
mem_offsets[i] = ((PVFS_size)buf + buftype_extent*
(buf_count/flat_buf->count) +
(int)flat_buf->indices[k]);
if(!i) {
mem_lengths[0] = brd_size;
mem_offsets[0] += flat_buf->blocklens[k] - brd_size;
}
else {
if (i == (mem_list_count - 1)) {
mem_lengths[i] = end_brd_size;
if (flat_buf->blocklens[k] == end_brd_size)
brd_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
brd_size = flat_buf->blocklens[k] - end_brd_size;
k--;
buf_count--;
}
}
else {
mem_lengths[i] = flat_buf->blocklens[k];
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
file_offsets[i] = disp + flat_file->indices[j] +
((ADIO_Offset)n_filetypes) * filetype_extent;
if (!i) {
file_lengths[0] = frd_size;
file_offsets[0] += flat_file->blocklens[j] - frd_size;
}
else {
if (i == (file_list_count - 1)) {
file_lengths[i] = end_frd_size;
if (flat_file->blocklens[j] == end_frd_size)
frd_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
frd_size = flat_file->blocklens[j] - end_frd_size;
j--;
}
}
else file_lengths[i] = flat_file->blocklens[j];
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
mem_offsets, PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0 ) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (memory)", 0);
goto error_state;
}
/* -- END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* offset will be expressed in memory and file datatypes */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
PVFS_BOTTOM, mem_req, &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
size_read += new_buffer_read;
start_k = k;
start_j = j;
} /* while (size_read < bufsize) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
}
ADIOI_Free(file_offsets);
ADIOI_Free(file_lengths);
/* Other ADIO routines will convert absolute bytes into counts of datatypes */
/* when incrementing fp_ind, need to also take into account the file type:
* consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----|
* if we wrote N elements, offset needs to point at beginning of type, not
* at empty region at offset N+1) */
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* this is closer, but still incorrect for the cases where a small
* amount of a file type is "leftover" after a write */
fd->fp_ind = disp + flat_file->indices[j] +
((ADIO_Offset)n_filetypes)*filetype_extent;
}
if (err_flag == 0) *error_code = MPI_SUCCESS;
error_state:
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually read and placed in buf
by ADIOI_BUFFERED_READ. */
#endif
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/*
* vim: ts=8 sts=4 sw=4 noexpandtab

Просмотреть файл

@ -0,0 +1,909 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_common.h"
void ADIOI_PVFS2_OldReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code)
{
/* offset is in units of etype relative to the filetype. */
ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, brd_size, frd_size=0, st_index=0;
int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset off, disp, start_off, initial_off;
int flag, st_frd_size, st_n_filetypes;
int mem_list_count, file_list_count;
PVFS_size *mem_offsets;
int64_t *file_offsets;
int *mem_lengths;
int32_t *file_lengths;
int total_blks_to_read;
int max_mem_list, max_file_list;
int b_blks_read;
int f_data_read;
int size_read=0, n_read_lists, extra_blks;
int end_brd_size, end_frd_size;
int start_k, start_j, new_file_read, new_buffer_read;
int start_mem_offset;
PVFS_Request mem_req, file_req;
ADIOI_PVFS2_fs * pvfs_fs;
PVFS_sysresp_io resp_io;
int err_flag=0;
MPI_Offset total_bytes_read = 0;
static char myname[] = "ADIOI_PVFS2_ReadStrided";
#define MAX_ARRAY_SIZE 64
*error_code = MPI_SUCCESS; /* changed below if error */
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
/* the HDF5 tests showed a bug in this list processing code (see many many
* lines down below). We added a workaround, but common HDF5 file types
* are actually contiguous and do not need the expensive workaround */
if (!filetype_is_contig) {
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
if (flat_file->count == 1 && !buftype_is_contig)
filetype_is_contig = 1;
}
MPI_Type_size(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
*error_code = MPI_SUCCESS;
return;
}
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
bufsize = buftype_size * count;
pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;
if (!buftype_is_contig && filetype_is_contig) {
/* noncontiguous in memory, contiguous in file. */
int64_t file_offsets;
int32_t file_lengths;
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
file_list_count = 1;
file_offsets = off;
file_lengths = 0;
total_blks_to_read = count*flat_buf->count;
b_blks_read = 0;
/* allocate arrays according to max usage */
if (total_blks_to_read > MAX_ARRAY_SIZE)
mem_list_count = MAX_ARRAY_SIZE;
else mem_list_count = total_blks_to_read;
mem_offsets = (PVFS_size*)ADIOI_Malloc(mem_list_count*sizeof(PVFS_size));
mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));
/* TODO: CHECK RESULTS OF MEMORY ALLOCATION */
j = 0;
/* step through each block in memory, filling memory arrays */
while (b_blks_read < total_blks_to_read) {
for (i=0; i<flat_buf->count; i++) {
mem_offsets[b_blks_read % MAX_ARRAY_SIZE] =
/* TODO: fix this compiler warning */
((PVFS_size)buf + j*buftype_extent + flat_buf->indices[i]);
mem_lengths[b_blks_read % MAX_ARRAY_SIZE] =
flat_buf->blocklens[i];
file_lengths += flat_buf->blocklens[i];
b_blks_read++;
if (!(b_blks_read % MAX_ARRAY_SIZE) ||
(b_blks_read == total_blks_to_read)) {
/* in the case of the last read list call,
adjust mem_list_count */
if (b_blks_read == total_blks_to_read) {
mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE;
/* in case last read list call fills max arrays */
if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
}
err_flag = PVFS_Request_hindexed(mem_list_count,
mem_lengths, mem_offsets, PVFS_BYTE, &mem_req);
if (err_flag < 0) break;
err_flag = PVFS_Request_contiguous(file_lengths,
PVFS_BYTE, &file_req);
if (err_flag < 0) break;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req,
file_offsets, PVFS_BOTTOM, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
/* --END ERROR HANDLING-- */
/* in the case of error or the last read list call,
* leave here */
if (err_flag || b_blks_read == total_blks_to_read) break;
file_offsets += file_lengths;
file_lengths = 0;
}
} /* for (i=0; i<flat_buf->count; i++) */
j++;
} /* while (b_blks_read < total_blks_to_read) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
if (file_ptr_type == ADIO_INDIVIDUAL)
fd->fp_ind += total_bytes_read;
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually read and placed in buf
by ADIOI_BUFFERED_READ. */
#endif
ADIOI_Delete_flattened(datatype);
return;
} /* if (!buftype_is_contig && filetype_is_contig) */
/* know file is noncontiguous from above */
/* noncontiguous in file */
/* filetype already flattened in ADIO_Open */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
initial_off = offset;
/* for each case - ADIO_Individual pointer or explicit, find the file
offset in bytes (offset), n_filetypes (how many filetypes into
file to start), frd_size (remaining amount of data in present
file block), and st_index (start point in terms of blocks in
starting filetype) */
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent +
flat_file->blocklens[i] >= offset) {
st_index = i;
frd_size = (int) (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
break;
}
}
} /* while (!flag) */
} /* if (file_ptr_type == ADIO_INDIVIDUAL) */
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
frd_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent +
abs_off_in_filetype;
} /* else [file_ptr_type != ADIO_INDIVIDUAL] */
start_off = offset;
st_frd_size = frd_size;
st_n_filetypes = n_filetypes;
if (buftype_is_contig && !filetype_is_contig) {
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
int mem_lengths;
char *mem_offsets;
i = 0;
j = st_index;
n_filetypes = st_n_filetypes;
mem_list_count = 1;
/* determine how many blocks in file to read */
f_data_read = ADIOI_MIN(st_frd_size, bufsize);
total_blks_to_read = 1;
if (j < (flat_file->count-1)) j++;
else {
j = 0;
n_filetypes++;
}
while (f_data_read < bufsize) {
f_data_read += flat_file->blocklens[j];
total_blks_to_read++;
if (j<(flat_file->count-1)) j++;
else j = 0;
}
j = st_index;
n_filetypes = st_n_filetypes;
n_read_lists = total_blks_to_read/MAX_ARRAY_SIZE;
extra_blks = total_blks_to_read%MAX_ARRAY_SIZE;
mem_offsets = buf;
mem_lengths = 0;
/* if at least one full readlist, allocate file arrays
at max array size and don't free until very end */
if (n_read_lists) {
file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int32_t));
}
/* if there's no full readlist allocate file arrays according
to needed size (extra_blks) */
else {
file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
sizeof(int32_t));
}
/* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
for (i=0; i<n_read_lists; i++) {
file_list_count = MAX_ARRAY_SIZE;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = st_frd_size;
mem_lengths = st_frd_size;
}
for (k=0; k<MAX_ARRAY_SIZE; k++) {
if (i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent
+ flat_file->indices[j];
file_lengths[k] = flat_file->blocklens[j];
mem_lengths += file_lengths[k];
}
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* PVFS_Request_hindexed already expresses the offsets into the
* file, so we should not pass in an offset if we are using
* hindexed for the file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
mem_offsets += mem_lengths;
mem_lengths = 0;
} /* for (i=0; i<n_read_lists; i++) */
/* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */
if (extra_blks) {
file_list_count = extra_blks;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = ADIOI_MIN(st_frd_size, bufsize);
}
for (k=0; k<extra_blks; k++) {
if(i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent +
flat_file->indices[j];
if (k == (extra_blks - 1)) {
file_lengths[k] = bufsize - (int32_t) mem_lengths
- (int32_t) mem_offsets + (int32_t) buf;
}
else file_lengths[k] = flat_file->blocklens[j];
} /* if(i || k) */
mem_lengths += file_lengths[k];
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<extra_blks; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* as above, use 0 for 'offset' when using hindexed file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req, &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
}
}
else {
/* noncontiguous in memory as well as in file */
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
size_read = 0;
n_filetypes = st_n_filetypes;
frd_size = st_frd_size;
brd_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
max_mem_list = 0;
max_file_list = 0;
/* run through and find max_file_list and max_mem_list so that you
can allocate the file and memory arrays less than MAX_ARRAY_SIZE
if possible */
while (size_read < bufsize) {
k = start_k;
new_buffer_read = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data read and data to be
read in the next immediate read list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k] +
size_read) > bufsize) {
end_brd_size = new_buffer_read +
flat_buf->blocklens[k] - (bufsize - size_read);
new_buffer_read = bufsize - size_read;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
else new_buffer_read = brd_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
j = start_j;
new_file_read = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_read < new_buffer_read)) {
if(file_list_count) {
if((new_file_read + flat_file->blocklens[j]) >
new_buffer_read) {
end_frd_size = new_buffer_read - new_file_read;
new_file_read = new_buffer_read;
j--;
}
else {
new_file_read += flat_file->blocklens[j];
end_frd_size = flat_file->blocklens[j];
}
}
else {
if (frd_size > new_buffer_read) {
new_file_read = new_buffer_read;
frd_size = new_file_read;
}
else new_file_read = frd_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_read < new_buffer_read) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_read = 0;
mem_list_count = 0;
while (new_buffer_read < new_file_read) {
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k]) >
new_file_read) {
end_brd_size = new_file_read - new_buffer_read;
new_buffer_read = new_file_read;
k--;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_read = brd_size;
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_read < new_file_read) */
} /* if ((new_file_read < new_buffer_read) && (file_list_count
== MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
/* fakes filling the readlist arrays of lengths found above */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
if(i) {
if (i == (mem_list_count - 1)) {
if (flat_buf->blocklens[k] == end_brd_size)
brd_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
brd_size = flat_buf->blocklens[k] - end_brd_size;
k--;
buf_count--;
}
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
if (i) {
if (i == (file_list_count - 1)) {
if (flat_file->blocklens[j] == end_frd_size)
frd_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
frd_size = flat_file->blocklens[j] - end_frd_size;
j--;
}
}
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
size_read += new_buffer_read;
start_k = k;
start_j = j;
if (max_mem_list < mem_list_count)
max_mem_list = mem_list_count;
if (max_file_list < file_list_count)
max_file_list = file_list_count;
} /* while (size_read < bufsize) */
/* one last check before we actually carry out the operation:
* this code has hard-to-fix bugs when a noncontiguous file type has
* such large pieces that the sum of the lengths of the memory type is
* not larger than one of those pieces (and vice versa for large memory
* types and many pieces of file types). In these cases, give up and
* fall back to naive reads and writes. The testphdf5 test created a
* type with two very large memory regions and 600 very small file
* regions. The same test also created a type with one very large file
* region and many (700) very small memory regions. both cases caused
* problems for this code */
if ( ( (file_list_count == 1) &&
(new_file_read < flat_file->blocklens[0] ) ) ||
((mem_list_count == 1) &&
(new_buffer_read < flat_buf->blocklens[0]) ) ||
((file_list_count == MAX_ARRAY_SIZE) &&
(new_file_read < flat_buf->blocklens[0]) ) ||
( (mem_list_count == MAX_ARRAY_SIZE) &&
(new_buffer_read < flat_file->blocklens[0])) )
{
ADIOI_Delete_flattened(datatype);
ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype,
file_ptr_type, initial_off, status, error_code);
return;
}
mem_offsets = (PVFS_size*)ADIOI_Malloc(max_mem_list*sizeof(PVFS_size));
mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));
size_read = 0;
n_filetypes = st_n_filetypes;
frd_size = st_frd_size;
brd_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
/* this section calculates mem_list_count and file_list_count
and also finds the possibly odd sized last array elements
in new_frd_size and new_brd_size */
while (size_read < bufsize) {
k = start_k;
new_buffer_read = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data read and data to be
read in the next immediate read list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k] +
size_read) > bufsize) {
end_brd_size = new_buffer_read +
flat_buf->blocklens[k] - (bufsize - size_read);
new_buffer_read = bufsize - size_read;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
else new_buffer_read = brd_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
j = start_j;
new_file_read = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_read < new_buffer_read)) {
if(file_list_count) {
if((new_file_read + flat_file->blocklens[j]) >
new_buffer_read) {
end_frd_size = new_buffer_read - new_file_read;
new_file_read = new_buffer_read;
j--;
}
else {
new_file_read += flat_file->blocklens[j];
end_frd_size = flat_file->blocklens[j];
}
}
else {
if (frd_size > new_buffer_read) {
new_file_read = new_buffer_read;
frd_size = new_file_read;
}
else new_file_read = frd_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_read < new_buffer_read) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_read = 0;
mem_list_count = 0;
while (new_buffer_read < new_file_read) {
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k]) >
new_file_read) {
end_brd_size = new_file_read - new_buffer_read;
new_buffer_read = new_file_read;
k--;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_read = brd_size;
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_read < new_file_read) */
} /* if ((new_file_read < new_buffer_read) && (file_list_count
== MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
/* fills the allocated readlist arrays */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
mem_offsets[i] = ((PVFS_size)buf + buftype_extent*
(buf_count/flat_buf->count) +
(int)flat_buf->indices[k]);
if(!i) {
mem_lengths[0] = brd_size;
mem_offsets[0] += flat_buf->blocklens[k] - brd_size;
}
else {
if (i == (mem_list_count - 1)) {
mem_lengths[i] = end_brd_size;
if (flat_buf->blocklens[k] == end_brd_size)
brd_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
brd_size = flat_buf->blocklens[k] - end_brd_size;
k--;
buf_count--;
}
}
else {
mem_lengths[i] = flat_buf->blocklens[k];
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
file_offsets[i] = disp + flat_file->indices[j] +
((ADIO_Offset)n_filetypes) * filetype_extent;
if (!i) {
file_lengths[0] = frd_size;
file_offsets[0] += flat_file->blocklens[j] - frd_size;
}
else {
if (i == (file_list_count - 1)) {
file_lengths[i] = end_frd_size;
if (flat_file->blocklens[j] == end_frd_size)
frd_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
frd_size = flat_file->blocklens[j] - end_frd_size;
j--;
}
}
else file_lengths[i] = flat_file->blocklens[j];
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
mem_offsets, PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0 ) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (memory)", 0);
goto error_state;
}
/* -- END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* offset will be expressed in memory and file datatypes */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
PVFS_BOTTOM, mem_req, &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
size_read += new_buffer_read;
start_k = k;
start_j = j;
} /* while (size_read < bufsize) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
}
/* Other ADIO routines will convert absolute bytes into counts of datatypes */
/* when incrementing fp_ind, need to also take into account the file type:
* consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----|
* if we wrote N elements, offset needs to point at beginning of type, not
* at empty region at offset N+1)
*
* As we discussed on mpich-discuss in may/june 2009, the code below might
* look weird, but by putting fp_ind at the last byte written, the next
* time we run through the strided code we'll update the fp_ind to the
* right location. */
if (file_ptr_type == ADIO_INDIVIDUAL) {
fd->fp_ind = file_offsets[file_list_count-1]+
file_lengths[file_list_count-1];
}
ADIOI_Free(file_offsets);
ADIOI_Free(file_lengths);
if (err_flag == 0) *error_code = MPI_SUCCESS;
error_state:
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually read and placed in buf
by ADIOI_BUFFERED_READ. */
#endif
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,963 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_common.h"
void ADIOI_PVFS2_OldWriteStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code)
{
/* as with all the other WriteStrided functions, offset is in units of
* etype relative to the filetype */
/* Since PVFS2 does not support file locking, can't do buffered writes
as on Unix */
ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, bwr_size, fwr_size=0, st_index=0;
int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset off, disp, start_off, initial_off;
int flag, st_fwr_size, st_n_filetypes;
int err_flag=0;
int mem_list_count, file_list_count;
PVFS_size * mem_offsets;
int64_t *file_offsets;
int *mem_lengths;
int32_t *file_lengths;
int total_blks_to_write;
int max_mem_list, max_file_list;
int b_blks_wrote;
int f_data_wrote;
int size_wrote=0, n_write_lists, extra_blks;
int end_bwr_size, end_fwr_size;
int start_k, start_j, new_file_write, new_buffer_write;
int start_mem_offset;
PVFS_Request mem_req, file_req;
ADIOI_PVFS2_fs * pvfs_fs;
PVFS_sysresp_io resp_io;
MPI_Offset total_bytes_written=0;
static char myname[] = "ADIOI_PVFS2_WRITESTRIDED";
/* note: don't increase this: several parts of PVFS2 now
* assume this limit*/
#define MAX_ARRAY_SIZE 64
/* --BEGIN ERROR HANDLING-- */
if (fd->atomicity) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
MPI_ERR_ARG,
"Atomic noncontiguous writes are not supported by PVFS2", 0);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
/* the HDF5 tests showed a bug in this list processing code (see many many
* lines down below). We added a workaround, but common HDF5 file types
* are actually contiguous and do not need the expensive workaround */
if (!filetype_is_contig) {
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
if (flat_file->count == 1 && !buftype_is_contig)
filetype_is_contig = 1;
}
MPI_Type_size(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
*error_code = MPI_SUCCESS;
return;
}
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
bufsize = buftype_size * count;
pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;
if (!buftype_is_contig && filetype_is_contig) {
/* noncontiguous in memory, contiguous in file. */
int64_t file_offsets;
int32_t file_lengths;
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
off = fd->disp + etype_size * offset;
}
else off = fd->fp_ind;
file_list_count = 1;
file_offsets = off;
file_lengths = 0;
total_blks_to_write = count*flat_buf->count;
b_blks_wrote = 0;
/* allocate arrays according to max usage */
if (total_blks_to_write > MAX_ARRAY_SIZE)
mem_list_count = MAX_ARRAY_SIZE;
else mem_list_count = total_blks_to_write;
mem_offsets = (PVFS_size*)ADIOI_Malloc(mem_list_count*sizeof(PVFS_size));
mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));
j = 0;
/* step through each block in memory, filling memory arrays */
while (b_blks_wrote < total_blks_to_write) {
for (i=0; i<flat_buf->count; i++) {
mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] =
/* TODO: fix this warning by casting to an integer that's
* the same size as a char * and /then/ casting to
* PVFS_size */
((PVFS_size)buf + j*buftype_extent + flat_buf->indices[i]);
mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] =
flat_buf->blocklens[i];
file_lengths += flat_buf->blocklens[i];
b_blks_wrote++;
if (!(b_blks_wrote % MAX_ARRAY_SIZE) ||
(b_blks_wrote == total_blks_to_write)) {
/* in the case of the last write list call,
adjust mem_list_count */
if (b_blks_wrote == total_blks_to_write) {
mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE;
/* in case last write list call fills max arrays */
if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
}
err_flag = PVFS_Request_hindexed(mem_list_count,
mem_lengths, mem_offsets,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (memory)", 0);
break;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_contiguous(file_lengths,
PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (file)", 0);
break;
}
/* --END ERROR HANDLING-- */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req,
file_offsets, PVFS_BOTTOM,
mem_req,
&(pvfs_fs->credentials),
&resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
total_bytes_written += resp_io.total_completed;
/* in the case of error or the last write list call,
* leave here */
/* --BEGIN ERROR HANDLING-- */
if (err_flag) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_write", 0);
break;
}
/* --END ERROR HANDLING-- */
if (b_blks_wrote == total_blks_to_write) break;
file_offsets += file_lengths;
file_lengths = 0;
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
}
} /* for (i=0; i<flat_buf->count; i++) */
j++;
} /* while (b_blks_wrote < total_blks_to_write) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
if (file_ptr_type == ADIO_INDIVIDUAL)
fd->fp_ind += total_bytes_written;
if (!err_flag) *error_code = MPI_SUCCESS;
fd->fp_sys_posn = -1; /* clear this. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif
ADIOI_Delete_flattened(datatype);
return;
} /* if (!buftype_is_contig && filetype_is_contig) */
/* already know that file is noncontiguous from above */
/* noncontiguous in file */
/* filetype already flattened in ADIO_Open */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
initial_off = offset;
/* for each case - ADIO_Individual pointer or explicit, find offset
(file offset in bytes), n_filetypes (how many filetypes into file
to start), fwr_size (remaining amount of data in present file
block), and st_index (start point in terms of blocks in starting
filetype) */
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent +
flat_file->blocklens[i] >= offset) {
st_index = i;
fwr_size = disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent
+ flat_file->blocklens[i] - offset;
flag = 1;
break;
}
}
} /* while (!flag) */
} /* if (file_ptr_type == ADIO_INDIVIDUAL) */
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
fwr_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent +
abs_off_in_filetype;
} /* else [file_ptr_type != ADIO_INDIVIDUAL] */
start_off = offset;
st_fwr_size = fwr_size;
st_n_filetypes = n_filetypes;
if (buftype_is_contig && !filetype_is_contig) {
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
int mem_lengths;
char *mem_offsets;
i = 0;
j = st_index;
off = offset;
n_filetypes = st_n_filetypes;
mem_list_count = 1;
/* determine how many blocks in file to write */
f_data_wrote = ADIOI_MIN(st_fwr_size, bufsize);
total_blks_to_write = 1;
if (j < (flat_file->count -1)) j++;
else {
j = 0;
n_filetypes++;
}
while (f_data_wrote < bufsize) {
f_data_wrote += flat_file->blocklens[j];
total_blks_to_write++;
if (j<(flat_file->count-1)) j++;
else j = 0;
}
j = st_index;
n_filetypes = st_n_filetypes;
n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE;
extra_blks = total_blks_to_write%MAX_ARRAY_SIZE;
mem_offsets = buf;
mem_lengths = 0;
/* if at least one full writelist, allocate file arrays
at max array size and don't free until very end */
if (n_write_lists) {
file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int32_t));
}
/* if there's no full writelist allocate file arrays according
to needed size (extra_blks) */
else {
file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
sizeof(int32_t));
}
/* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
for (i=0; i<n_write_lists; i++) {
file_list_count = MAX_ARRAY_SIZE;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = st_fwr_size;
mem_lengths = st_fwr_size;
}
for (k=0; k<MAX_ARRAY_SIZE; k++) {
if (i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent
+ flat_file->indices[j];
file_lengths[k] = flat_file->blocklens[j];
mem_lengths += file_lengths[k];
}
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* PVFS_Request_hindexed already expresses the offsets into the
* file, so we should not pass in an offset if we are using
* hindexed for the file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_write", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
total_bytes_written += resp_io.total_completed;
mem_offsets += mem_lengths;
mem_lengths = 0;
PVFS_Request_free(&file_req);
PVFS_Request_free(&mem_req);
} /* for (i=0; i<n_write_lists; i++) */
/* for file arrays smaller than MAX_ARRAY_SIZE (last write_list call) */
if (extra_blks) {
file_list_count = extra_blks;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = ADIOI_MIN(st_fwr_size, bufsize);
}
for (k=0; k<extra_blks; k++) {
if(i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent +
flat_file->indices[j];
if (k == (extra_blks - 1)) {
file_lengths[k] = bufsize - (int32_t) mem_lengths
- (int32_t) mem_offsets + (int32_t) buf;
}
else file_lengths[k] = flat_file->blocklens[j];
} /* if(i || k) */
mem_lengths += file_lengths[k];
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<extra_blks; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed(file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* as above, use 0 for 'offset' when using hindexed file type*/
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_write", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
total_bytes_written += resp_io.total_completed;
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
}
}
else {
/* noncontiguous in memory as well as in file */
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
size_wrote = 0;
n_filetypes = st_n_filetypes;
fwr_size = st_fwr_size;
bwr_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
max_mem_list = 0;
max_file_list = 0;
/* run through and find max_file_list and max_mem_list so that you
can allocate the file and memory arrays smaller than MAX_ARRAY_SIZE
if possible */
while (size_wrote < bufsize) {
k = start_k;
new_buffer_write = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data written and data to be
written in the next immediate write list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_write + flat_buf->blocklens[k] +
size_wrote) > bufsize) {
end_bwr_size = new_buffer_write +
flat_buf->blocklens[k] - (bufsize - size_wrote);
new_buffer_write = bufsize - size_wrote;
}
else {
new_buffer_write += flat_buf->blocklens[k];
end_bwr_size = flat_buf->blocklens[k];
}
}
else {
if (bwr_size > (bufsize - size_wrote)) {
new_buffer_write = bufsize - size_wrote;
bwr_size = new_buffer_write;
}
else new_buffer_write = bwr_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) */
j = start_j;
new_file_write = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_write < new_buffer_write)) {
if(file_list_count) {
if((new_file_write + flat_file->blocklens[j]) >
new_buffer_write) {
end_fwr_size = new_buffer_write - new_file_write;
new_file_write = new_buffer_write;
j--;
}
else {
new_file_write += flat_file->blocklens[j];
end_fwr_size = flat_file->blocklens[j];
}
}
else {
if (fwr_size > new_buffer_write) {
new_file_write = new_buffer_write;
fwr_size = new_file_write;
}
else new_file_write = fwr_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_write < new_buffer_write) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_write = 0;
mem_list_count = 0;
while (new_buffer_write < new_file_write) {
if(mem_list_count) {
if((new_buffer_write + flat_buf->blocklens[k]) >
new_file_write) {
end_bwr_size = new_file_write -
new_buffer_write;
new_buffer_write = new_file_write;
k--;
}
else {
new_buffer_write += flat_buf->blocklens[k];
end_bwr_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_write = bwr_size;
if (bwr_size > (bufsize - size_wrote)) {
new_buffer_write = bufsize - size_wrote;
bwr_size = new_buffer_write;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_write < new_file_write) */
} /* if ((new_file_write < new_buffer_write) &&
(file_list_count == MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) */
/* fakes filling the writelist arrays of lengths found above */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
if(i) {
if (i == (mem_list_count - 1)) {
if (flat_buf->blocklens[k] == end_bwr_size)
bwr_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
bwr_size = flat_buf->blocklens[k] - end_bwr_size;
k--;
buf_count--;
}
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
if (i) {
if (i == (file_list_count - 1)) {
if (flat_file->blocklens[j] == end_fwr_size)
fwr_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
fwr_size = flat_file->blocklens[j] - end_fwr_size;
j--;
}
}
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
size_wrote += new_buffer_write;
start_k = k;
start_j = j;
if (max_mem_list < mem_list_count)
max_mem_list = mem_list_count;
if (max_file_list < file_list_count)
max_file_list = file_list_count;
} /* while (size_wrote < bufsize) */
/* one last check before we actually carry out the operation:
* this code has hard-to-fix bugs when a noncontiguous file type has
* such large pieces that the sum of the lengths of the memory type is
* not larger than one of those pieces (and vice versa for large memory
* types and many pieces of file types). In these cases, give up and
* fall back to naive reads and writes. The testphdf5 test created a
* type with two very large memory regions and 600 very small file
* regions. The same test also created a type with one very large file
* region and many (700) very small memory regions. Both cases caused
* problems for this code. */
if ( ( (file_list_count == 1) &&
(new_file_write < flat_file->blocklens[0] ) ) ||
((mem_list_count == 1) &&
(new_buffer_write < flat_buf->blocklens[0]) ) ||
((file_list_count == MAX_ARRAY_SIZE) &&
(new_file_write < flat_buf->blocklens[0]) ) ||
( (mem_list_count == MAX_ARRAY_SIZE) &&
(new_buffer_write < flat_file->blocklens[0])) )
{
ADIOI_Delete_flattened(datatype);
ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype,
file_ptr_type, initial_off, status, error_code);
return;
}
mem_offsets = (PVFS_size*)ADIOI_Malloc(max_mem_list*sizeof(PVFS_size));
mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));
size_wrote = 0;
n_filetypes = st_n_filetypes;
fwr_size = st_fwr_size;
bwr_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
/* this section calculates mem_list_count and file_list_count
and also finds the possibly odd sized last array elements
in new_fwr_size and new_bwr_size */
while (size_wrote < bufsize) {
k = start_k;
new_buffer_write = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data written and data to be
written in the next immediate write list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_write + flat_buf->blocklens[k] +
size_wrote) > bufsize) {
end_bwr_size = new_buffer_write +
flat_buf->blocklens[k] - (bufsize - size_wrote);
new_buffer_write = bufsize - size_wrote;
}
else {
new_buffer_write += flat_buf->blocklens[k];
end_bwr_size = flat_buf->blocklens[k];
}
}
else {
if (bwr_size > (bufsize - size_wrote)) {
new_buffer_write = bufsize - size_wrote;
bwr_size = new_buffer_write;
}
else new_buffer_write = bwr_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) */
j = start_j;
new_file_write = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_write < new_buffer_write)) {
if(file_list_count) {
if((new_file_write + flat_file->blocklens[j]) >
new_buffer_write) {
end_fwr_size = new_buffer_write - new_file_write;
new_file_write = new_buffer_write;
j--;
}
else {
new_file_write += flat_file->blocklens[j];
end_fwr_size = flat_file->blocklens[j];
}
}
else {
if (fwr_size > new_buffer_write) {
new_file_write = new_buffer_write;
fwr_size = new_file_write;
}
else new_file_write = fwr_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_write < new_buffer_write) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_write = 0;
mem_list_count = 0;
while (new_buffer_write < new_file_write) {
if(mem_list_count) {
if((new_buffer_write + flat_buf->blocklens[k]) >
new_file_write) {
end_bwr_size = new_file_write -
new_buffer_write;
new_buffer_write = new_file_write;
k--;
}
else {
new_buffer_write += flat_buf->blocklens[k];
end_bwr_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_write = bwr_size;
if (bwr_size > (bufsize - size_wrote)) {
new_buffer_write = bufsize - size_wrote;
bwr_size = new_buffer_write;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_write < new_file_write) */
} /* if ((new_file_write < new_buffer_write) &&
(file_list_count == MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) */
/* fills the allocated writelist arrays */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
/* TODO: fix this warning by casting to an integer that's the
* same size as a char * and /then/ casting to PVFS_size */
mem_offsets[i] = ((PVFS_size)buf + buftype_extent*
(buf_count/flat_buf->count) +
(int)flat_buf->indices[k]);
if(!i) {
mem_lengths[0] = bwr_size;
mem_offsets[0] += flat_buf->blocklens[k] - bwr_size;
}
else {
if (i == (mem_list_count - 1)) {
mem_lengths[i] = end_bwr_size;
if (flat_buf->blocklens[k] == end_bwr_size)
bwr_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
bwr_size = flat_buf->blocklens[k] - end_bwr_size;
k--;
buf_count--;
}
}
else {
mem_lengths[i] = flat_buf->blocklens[k];
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
file_offsets[i] = disp + flat_file->indices[j] +
((ADIO_Offset)n_filetypes) * filetype_extent;
if (!i) {
file_lengths[0] = fwr_size;
file_offsets[0] += flat_file->blocklens[j] - fwr_size;
}
else {
if (i == (file_list_count - 1)) {
file_lengths[i] = end_fwr_size;
if (flat_file->blocklens[j] == end_fwr_size)
fwr_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
fwr_size = flat_file->blocklens[j] - end_fwr_size;
j--;
}
}
else file_lengths[i] = flat_file->blocklens[j];
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
mem_offsets, PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0 ) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* offset will be expressed in memory and file datatypes */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
PVFS_BOTTOM, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_write", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
size_wrote += new_buffer_write;
total_bytes_written += resp_io.total_completed;
start_k = k;
start_j = j;
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
} /* while (size_wrote < bufsize) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
}
/* when incrementing fp_ind, need to also take into account the file type:
* consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----|
* if we wrote N elements, offset needs to point at beginning of type, not
* at empty region at offset N+1).
*
* As we discussed on mpich-discuss in May/June 2009, the code below might
* look weird, but by putting fp_ind at the last byte written, the next
* time we run through the strided code we'll update the fp_ind to the
* right location. */
if (file_ptr_type == ADIO_INDIVIDUAL) {
fd->fp_ind = file_offsets[file_list_count-1]+
file_lengths[file_list_count-1];
}
ADIOI_Free(file_offsets);
ADIOI_Free(file_lengths);
*error_code = MPI_SUCCESS;
error_state:
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_TESTFS_operations = {
ADIOI_TESTFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_TESTFS_ReadContig, /* ReadContig */
ADIOI_TESTFS_WriteContig, /* WriteContig */
ADIOI_TESTFS_ReadStridedColl, /* ReadStridedColl */
@ -33,4 +34,5 @@ struct ADIOI_Fns_struct ADIO_TESTFS_operations = {
ADIOI_TESTFS_Flush, /* Flush */
ADIOI_TESTFS_Resize, /* Resize */
ADIOI_TESTFS_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
};

Просмотреть файл

@ -7,7 +7,9 @@
#include "ad_testfs.h"
#include "adioi.h"
#ifdef ROMIO_BGL
#include "../ad_bgl/ad_bgl.h"
#endif
void ADIOI_TESTFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
int myrank, nprocs;
@ -21,5 +23,10 @@ void ADIOI_TESTFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stdout, "[%d/%d] calling ADIOI_GEN_SetInfo\n",
myrank, nprocs);
#ifdef ROMIO_BGL /* BlueGene support for pvfs through ufs */
/* BlueGene hack: force testfs to mimic BlueGene hints */
ADIOI_BGL_SetInfo(fd, users_info, error_code);
#else
ADIOI_GEN_SetInfo(fd, users_info, error_code);
#endif
}

Просмотреть файл

@ -26,10 +26,6 @@ void ADIOI_TESTFS_ReadContig(ADIO_File fd, void *buf, int count,
offset = fd->fp_ind;
fd->fp_ind += datatype_size * count;
fd->fp_sys_posn = fd->fp_ind;
#if 0
FPRINTF(stdout, "[%d/%d] new file position is %lld\n", myrank,
nprocs, (long long) fd->fp_ind);
#endif
}
else {
fd->fp_sys_posn = offset + datatype_size * count;

Просмотреть файл

@ -26,8 +26,8 @@ ADIO_Offset ADIOI_TESTFS_SeekIndividual(ADIO_File fd, ADIO_Offset offset,
ADIO_Offset off;
ADIOI_Flatlist_node *flat_file;
int i, n_etypes_in_filetype, n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int size_in_filetype, sum;
ADIO_Offset abs_off_in_filetype=0, sum;
int size_in_filetype;
int filetype_size, etype_size, filetype_is_contig;
MPI_Aint filetype_extent;
@ -54,6 +54,7 @@ ADIO_Offset ADIOI_TESTFS_SeekIndividual(ADIO_File fd, ADIO_Offset offset,
}
n_etypes_in_filetype = filetype_size/etype_size;
ADIOI_Assert((offset / n_etypes_in_filetype) == (int) (offset / n_etypes_in_filetype));
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
@ -70,7 +71,7 @@ ADIO_Offset ADIOI_TESTFS_SeekIndividual(ADIO_File fd, ADIO_Offset offset,
}
/* abs. offset in bytes in the file */
off = fd->disp + (ADIO_Offset) n_filetypes * filetype_extent +
off = fd->disp + (ADIO_Offset)n_filetypes * (ADIO_Offset)filetype_extent +
abs_off_in_filetype;
}

Просмотреть файл

@ -23,7 +23,7 @@ void ADIOI_TESTFS_WriteContig(ADIO_File fd, void *buf, int count,
nprocs, fd->filename);
FPRINTF(stdout, "[%d/%d] writing (buf = %p, loc = %lld, sz = %lld)\n",
myrank, nprocs, buf, (long long) offset,
(long long) datatype_size * count);
(long long)datatype_size * (long long)count);
if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
{

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_UFS_operations = {
ADIOI_UFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_GEN_ReadContig, /* ReadContig */
ADIOI_GEN_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -38,4 +39,5 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
ADIOI_GEN_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
};

Просмотреть файл

@ -22,13 +22,9 @@ noinst_LTLIBRARIES = libadio_xfs.la
libadio_xfs_la_SOURCES = \
ad_xfs.c \
ad_xfs.h \
ad_xfs_done.c \
ad_xfs_fcntl.c \
ad_xfs_hints.c \
ad_xfs_iread.c \
ad_xfs_iwrite.c \
ad_xfs_open.c \
ad_xfs_read.c \
ad_xfs_resize.c \
ad_xfs_wait.c \
ad_xfs_write.c

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_XFS_operations = {
ADIOI_XFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_XFS_ReadContig, /* ReadContig */
ADIOI_XFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -22,15 +23,21 @@ struct ADIOI_Fns_struct ADIO_XFS_operations = {
ADIOI_GEN_ReadStrided, /* ReadStrided */
ADIOI_GEN_WriteStrided, /* WriteStrided */
ADIOI_GEN_Close, /* Close */
ADIOI_XFS_IreadContig, /* IreadContig */
ADIOI_XFS_IwriteContig, /* IwriteContig */
ADIOI_XFS_ReadDone, /* ReadDone */
ADIOI_XFS_WriteDone, /* WriteDone */
ADIOI_XFS_ReadComplete, /* ReadComplete */
ADIOI_XFS_WriteComplete, /* WriteComplete */
#if defined(ROMIO_HAVE_WORKING_AIO)
ADIOI_GEN_IreadContig, /* IreadContig */
ADIOI_GEN_IwriteContig, /* IwriteContig */
#else
ADIOI_FAKE_IreadContig, /* IreadContig */
ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif /* ROMIO_HAVE_WORKING_AIO */
ADIOI_GEN_IODone, /* ReadDone */
ADIOI_GEN_IODone, /* WriteDone */
ADIOI_GEN_IOComplete, /* ReadComplete */
ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_GEN_Flush, /* Flush */
ADIOI_XFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
};

Просмотреть файл

@ -8,20 +8,19 @@
#ifndef AD_XFS_INCLUDE
#define AD_XFS_INCLUDE
#define _XOPEN_SOURCE 500
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"
#include <aio.h>
int ADIOI_XFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
int wr, void *handle);
#if (defined(HAVE_PREAD64) && (_ABIO32 == 1))
# define pread pread64
# define pwrite pwrite64
#if defined(MPISGI)
#include "xfs/xfs_fs.h"
#ifndef __USE_LARGEFILE64
#define __USE_LARGEFILE64
#endif
typedef struct aiocb64 aiocb64_t;
#endif
/* above needed for IRIX 6.5 */
void ADIOI_XFS_Open(ADIO_File fd, int *error_code);
void ADIOI_XFS_Close(ADIO_File fd, int *error_code);
@ -33,22 +32,6 @@ void ADIOI_XFS_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_XFS_IwriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Request *request, int
*error_code);
void ADIOI_XFS_IreadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Request *request, int
*error_code);
int ADIOI_XFS_ReadDone(ADIO_Request *request, ADIO_Status *status, int
*error_code);
int ADIOI_XFS_WriteDone(ADIO_Request *request, ADIO_Status *status, int
*error_code);
void ADIOI_XFS_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
*error_code);
void ADIOI_XFS_WriteComplete(ADIO_Request *request, ADIO_Status *status,
int *error_code);
void ADIOI_XFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
*error_code);
void ADIOI_XFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);

Просмотреть файл

@ -1,69 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_xfs.h"
/*
 * ADIOI_XFS_ReadDone - test a nonblocking request for completion without
 * blocking.
 *
 * request    - in/out: the request to test.  Reset to ADIO_REQUEST_NULL once
 *              the operation has completed and its resources were released.
 * status     - receives the transferred byte count when
 *              HAVE_STATUS_SET_BYTES is defined; otherwise untouched.
 * error_code - out: MPI_SUCCESS, or an MPI_ERR_IO-class code when the
 *              asynchronous operation reported failure.
 *
 * Returns 1 when the request is complete (a null request counts as
 * complete), 0 when the operation is still in progress.
 */
int ADIOI_XFS_ReadDone(ADIO_Request *request, ADIO_Status *status,
                       int *error_code)
{
    int err, aio_err, done = 0;
    static char myname[] = "ADIOI_XFS_READDONE";

    if (*request == ADIO_REQUEST_NULL) {
        *error_code = MPI_SUCCESS;
        return 1;
    }

    if ((*request)->queued) {
        /* Fetch the operation's error status exactly once, and do it BEFORE
         * aio_return64(): once the return status has been retrieved the
         * control block must not be handed to aio_error64() again, so the
         * value obtained here is the only reliable one. */
        aio_err = aio_error64((const aiocb64_t *) (*request)->handle);
        errno = aio_err;

        if (aio_err == EINPROGRESS) {
            done = 0;
            *error_code = MPI_SUCCESS;
        }
        else {
            err = aio_return64((aiocb64_t *) (*request)->handle);
            (*request)->nbytes = err;
            /* aio_return64() may itself touch errno; restore the status of
             * the I/O operation so errno stays meaningful for callers. */
            errno = aio_err;
            done = 1;

            if (err == -1) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE, myname,
                                                   __LINE__, MPI_ERR_IO, "**io",
                                                   "**io %s", strerror(aio_err));
            }
            else *error_code = MPI_SUCCESS;
        }
    } /* if ((*request)->queued) */
    else {
        /* Nothing was queued for this request: report it as complete. */
        done = 1;
        *error_code = MPI_SUCCESS;
    }

#ifdef HAVE_STATUS_SET_BYTES
    if (done && ((*request)->nbytes != -1))
        MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif

    if (done) {
        /* if request is still queued in the system, it is also there
           on ADIOI_Async_list. Delete it from there. */
        if ((*request)->queued) ADIOI_Del_req_from_list(request);

        (*request)->fd->async_count--;
        if ((*request)->handle) ADIOI_Free((*request)->handle);
        ADIOI_Free_request((ADIOI_Req_node *) (*request));
        *request = ADIO_REQUEST_NULL;
        /* status to be filled */
    }
    return done;
}
/*
 * ADIOI_XFS_WriteDone - completion test for a nonblocking write.
 *
 * Forwards all arguments to ADIOI_XFS_ReadDone and returns its result
 * unchanged.
 */
int ADIOI_XFS_WriteDone(ADIO_Request *request, ADIO_Status *status, int *error_code)
{
    int finished;

    finished = ADIOI_XFS_ReadDone(request, status, error_code);
    return finished;
}

Просмотреть файл

@ -7,6 +7,11 @@
#include "ad_xfs.h"
#include "adio_extern.h"
#include <sys/ioctl.h>
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_XFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code)
{
@ -37,7 +42,7 @@ void ADIOI_XFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *er
fl.l_len = fcntl_struct->diskspace;
#if defined(LINUX) && defined(MPISGI)
err = fcntl(fd->fd_sys, XFS_IOC_RESVSP64, &fl);
err = ioctl(fd->fd_sys, XFS_IOC_RESVSP64, &fl);
#else
err = fcntl(fd->fd_sys, F_RESVSP64, &fl);
#endif

Просмотреть файл

@ -8,36 +8,76 @@
#include "ad_xfs.h"
#include "adio_extern.h"
static unsigned xfs_direct_read_chunk_size;
static unsigned xfs_direct_write_chunk_size;
void ADIOI_XFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
char *value;
char *value, * c;
int flag;
static char xfs_initialized = 0;
if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
/* the nightly builds say somthing is calling MPI_Info_set w/ a null info,
* so protect the calls to MPI_Info_set */
if (fd->info != MPI_INFO_NULL ) {
MPI_Info_set(fd->info, "direct_read", "false");
MPI_Info_set(fd->info, "direct_write", "false");
ADIOI_Info_set(fd->info, "direct_read", "false");
ADIOI_Info_set(fd->info, "direct_write", "false");
fd->direct_read = fd->direct_write = 0;
if (!xfs_initialized) {
xfs_initialized = 1;
c = getenv("MPIO_DIRECT_READ_CHUNK_SIZE");
if (c) {
int io;
io = atoi(c);
if (io <= 0) {
fprintf(stderr,
"MPI: Ignoring an invalid setting for MPIO_DIRECT_READ_CHUNK_SIZE.\n"
" It must be set to a positive integer value.\n");
} else {
xfs_direct_read_chunk_size = io;
}
} else {
xfs_direct_read_chunk_size = 0;
}
/* has user specified values for keys "direct_read" and "direct wirte"? */
c = getenv("MPIO_DIRECT_WRITE_CHUNK_SIZE");
if (c) {
int io;
io = atoi(c);
if (io <= 0) {
fprintf(stderr,
"MPI: Ignoring an invalid setting for MPIO_DIRECT_WRITE_CHUNK_SIZE.\n"
" It must be set to a positive integer value.\n");
} else {
xfs_direct_write_chunk_size = io;
}
} else {
xfs_direct_write_chunk_size = 0;
}
}
if (!fd->hints->initialized) {
fd->hints->fs_hints.xfs.read_chunk_sz =
xfs_direct_read_chunk_size;
fd->hints->fs_hints.xfs.write_chunk_sz =
xfs_direct_write_chunk_size;
}
/* has user specified values for keys "direct_read" and "direct write"? */
if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && !strcmp(value, "true")) {
MPI_Info_set(fd->info, "direct_read", "true");
ADIOI_Info_set(fd->info, "direct_read", "true");
fd->direct_read = 1;
}
MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && !strcmp(value, "true")) {
MPI_Info_set(fd->info, "direct_write", "true");
ADIOI_Info_set(fd->info, "direct_write", "true");
fd->direct_write = 1;
}
@ -47,8 +87,10 @@ void ADIOI_XFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
/* Environment variables override MPI_Info hints */
if (ADIOI_Direct_read) fd->direct_read = 1;
if (ADIOI_Direct_write) fd->direct_write = 1;
/* environment variables checked in ADIO_Init */
*error_code = MPI_SUCCESS;

Просмотреть файл

@ -1,42 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_xfs.h"
/*
 * ADIOI_XFS_IreadContig - start a nonblocking contiguous read.
 *
 * Allocates a request, submits the read through ADIOI_XFS_aio and puts the
 * request on the async list.  With file_ptr_type == ADIO_INDIVIDUAL the read
 * starts at fd->fp_ind (the offset argument is ignored) and fd->fp_ind is
 * advanced by the number of bytes requested.
 *
 * error_code - out: MPI_SUCCESS, or a code built from the errno value
 *              reported by ADIOI_XFS_aio.
 */
void ADIOI_XFS_IreadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Request *request, int *error_code)
{
    static char myname[] = "ADIOI_XFS_IREADCONTIG";
    int type_sz, nbytes;
    int rc = 0;

    /* set up the bookkeeping node describing this read */
    *request = ADIOI_Malloc_request();
    (*request)->datatype = datatype;
    (*request)->fd = fd;
    (*request)->optype = ADIOI_READ;

    MPI_Type_size(datatype, &type_sz);
    nbytes = count * type_sz;

    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind;
    }

    /* wr == 0 selects a read */
    rc = ADIOI_XFS_aio(fd, buf, nbytes, offset, 0, &((*request)->handle));

    if (file_ptr_type == ADIO_INDIVIDUAL) {
        fd->fp_ind += nbytes;
    }

    (*request)->queued = 1;
    ADIOI_Add_req_to_list(request);

    /* the system file position is no longer known */
    fd->fp_sys_posn = -1;

    /* --BEGIN ERROR HANDLING-- */
    if (rc != 0) {
        MPIO_ERR_CREATE_CODE_ERRNO(myname, rc, error_code);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
    fd->async_count++;
}

Просмотреть файл

@ -1,145 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_xfs.h"
/*
 * ADIOI_XFS_IwriteContig - start a nonblocking contiguous write.
 *
 * Allocates a request, submits the write through ADIOI_XFS_aio and puts the
 * request on the async list.  With file_ptr_type == ADIO_INDIVIDUAL the write
 * starts at fd->fp_ind (the offset argument is ignored) and fd->fp_ind is
 * advanced by the number of bytes requested.
 *
 * error_code - out: MPI_SUCCESS, or a code built from the errno value
 *              reported by ADIOI_XFS_aio.
 */
void ADIOI_XFS_IwriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Request *request,
                            int *error_code)
{
    static char myname[] = "ADIOI_XFS_IWRITECONTIG";
    int type_sz, nbytes;
    int rc = 0;

    /* set up the bookkeeping node describing this write */
    *request = ADIOI_Malloc_request();
    (*request)->datatype = datatype;
    (*request)->fd = fd;
    (*request)->optype = ADIOI_WRITE;

    MPI_Type_size(datatype, &type_sz);
    nbytes = count * type_sz;

    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind;
    }

    /* wr == 1 selects a write */
    rc = ADIOI_XFS_aio(fd, buf, nbytes, offset, 1, &((*request)->handle));

    if (file_ptr_type == ADIO_INDIVIDUAL) {
        fd->fp_ind += nbytes;
    }

    (*request)->queued = 1;
    ADIOI_Add_req_to_list(request);

    /* the system file position is no longer known */
    fd->fp_sys_posn = -1;

    /* --BEGIN ERROR HANDLING-- */
    if (rc != 0) {
        MPIO_ERR_CREATE_CODE_ERRNO(myname, rc, error_code);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
    fd->async_count++;
}
/*
 * ADIOI_XFS_IwriteStrided - "nonblocking" strided write.
 *
 * The write is carried out immediately with the blocking ADIO_WriteStrided;
 * the request that is handed back is marked as not queued and has no handle.
 * When HAVE_STATUS_SET_BYTES is defined and the write succeeded, the byte
 * count (count * size of datatype) is recorded in the request.
 */
void ADIOI_XFS_IwriteStrided(ADIO_File fd, void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Request *request, int
                             *error_code)
{
    ADIO_Status blocking_status;
#ifdef HAVE_STATUS_SET_BYTES
    int type_sz;
#endif

    *request = ADIOI_Malloc_request();
    (*request)->handle = 0;
    (*request)->queued = 0;
    (*request)->datatype = datatype;
    (*request)->fd = fd;
    (*request)->optype = ADIOI_WRITE;

    /* call the blocking version. It is faster because it does data sieving. */
    ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                      offset, &blocking_status, error_code);

    fd->async_count++;

#ifdef HAVE_STATUS_SET_BYTES
    if (*error_code == MPI_SUCCESS) {
        MPI_Type_size(datatype, &type_sz);
        (*request)->nbytes = count * type_sz;
    }
#endif
}
/* This function is for implementation convenience. It is not user-visible.
 * It takes care of the differences in the interface for nonblocking I/O
 * on various Unix machines! If wr==1 write, wr==0 read.
 *
 * fd     - file to operate on; supplies the regular and direct-I/O
 *          descriptors and the direct-I/O alignment/size limits
 * buf    - user buffer
 * len    - number of bytes to transfer
 * offset - absolute file offset of the transfer
 * wr     - 1 for write, 0 for read
 * handle - treated as aiocb64_t **; receives the newly allocated control
 *          block on success and is left untouched on failure
 *
 * Returns 0 on success, -errno on failure.
 *
 * NOTE(review): the control block allocated here is not released on the
 * failure paths, and callers cannot release it because handle is not
 * written on failure -- looks like a leak; confirm and free it there.
 */
int ADIOI_XFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
                  int wr, void *handle)
{
    int err;
    aiocb64_t *aiocbp;

    aiocbp = (aiocb64_t *) ADIOI_Calloc(sizeof(aiocb64_t), 1);

    /* Use the direct-I/O descriptor only when direct I/O was requested for
     * this direction AND buffer address, offset and length all satisfy the
     * alignment and size limits stored in fd; otherwise use fd_sys. */
    if (((wr && fd->direct_write) || (!wr && fd->direct_read))
        && !(((long) buf) % fd->d_mem) && !(offset % fd->d_miniosz) &&
        !(len % fd->d_miniosz) && (len >= fd->d_miniosz) &&
        (len <= fd->d_maxiosz))
        aiocbp->aio_fildes = fd->fd_direct;
    else aiocbp->aio_fildes = fd->fd_sys;

    aiocbp->aio_offset = offset;
    aiocbp->aio_buf = buf;
    aiocbp->aio_nbytes = len;
    aiocbp->aio_reqprio = 0;

#ifdef AIO_SIGNOTIFY_NONE
    /* SGI IRIX 6 */
    aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE;
#else
    aiocbp->aio_sigevent.sigev_signo = 0;
#endif

    if (wr) err = aio_write64(aiocbp);
    else err = aio_read64(aiocbp);

    if (err != 0) {
        if (errno != EAGAIN) {
            return -errno;
        }

        /* EAGAIN: exceeded the max. no. of outstanding requests.
         * No status variable is consulted before retrying: nothing in this
         * function produces one (completing earlier requests is not done
         * here), so testing one would read an indeterminate value.
         * Retry once right away, then back off one second per attempt. */
        if (wr) err = aio_write64(aiocbp);
        else err = aio_read64(aiocbp);

        while (err != 0) {
            if (errno != EAGAIN) {
                return -errno;
            }
            /* sleep and try again */
            sleep(1);
            if (wr) err = aio_write64(aiocbp);
            else err = aio_read64(aiocbp);
        }
    }

    *((aiocb64_t **) handle) = aiocbp;
    return 0;
}

Просмотреть файл

@ -5,22 +5,26 @@
* See COPYRIGHT notice in top-level directory.
*/
#define _GNU_SOURCE // for O_DIRECT
#include "ad_xfs.h"
#include <sys/ioctl.h>
#ifdef HAVE_STDDEF_H
#include <stddef.h>
#endif
#if defined(MPISGI)
#include <mpitypedefs.h>
#include <mpifunctions.h>
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_XFS_Open(ADIO_File fd, int *error_code)
{
int perm, amode, amode_direct;
int perm, amode, amode_direct, factor;
unsigned int old_mask;
struct dioattr st;
static char myname[] = "ADIOI_XFS_OPEN";
unsigned read_chunk_sz = fd->hints->fs_hints.xfs.read_chunk_sz;
unsigned write_chunk_sz = fd->hints->fs_hints.xfs.write_chunk_sz;
if (fd->perm == ADIO_PERM_NULL) {
old_mask = umask(022);
@ -49,7 +53,7 @@ void ADIOI_XFS_Open(ADIO_File fd, int *error_code)
fd->fd_direct = open(fd->filename, amode_direct, perm);
if (fd->fd_direct != -1) {
#if defined(LINUX) && defined(MPISGI)
#if defined(MPISGI)
ioctl(fd->fd_direct, XFS_IOC_DIOINFO, &st);
#else
fcntl(fd->fd_direct, F_DIOINFO, &st);
@ -57,7 +61,34 @@ void ADIOI_XFS_Open(ADIO_File fd, int *error_code)
fd->d_mem = st.d_mem;
fd->d_miniosz = st.d_miniosz;
fd->d_maxiosz = st.d_maxiosz;
if (read_chunk_sz == 0) {
fd->hints->fs_hints.xfs.read_chunk_sz = st.d_maxiosz;
} else {
/*
* MPIO_DIRECT_READ_CHUNK_SIZE was set.
* Make read_chunk_sz a multiple of d_miniosz.
*/
factor = read_chunk_sz / fd->d_miniosz;
if (factor == 0 || read_chunk_sz != fd->d_miniosz * factor) {
fd->hints->fs_hints.xfs.read_chunk_sz =
fd->d_miniosz * (factor + 1);
}
}
if (write_chunk_sz == 0) {
fd->hints->fs_hints.xfs.write_chunk_sz = st.d_maxiosz;
} else {
/*
* MPIO_DIRECT_WRITE_CHUNK_SIZE was set.
* Make write_chunk_sz a multiple of d_miniosz.
*/
factor = write_chunk_sz / fd->d_miniosz;
if (factor == 0 || write_chunk_sz != fd->d_miniosz * factor) {
fd->hints->fs_hints.xfs.write_chunk_sz =
fd->d_miniosz * (factor + 1);
}
}
if (fd->d_mem > XFS_MEMALIGN) {
FPRINTF(stderr, "MPI: Run-time Direct-IO memory alignment, %d, does not match compile-time value, %d.\n",

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше