1
1

Romio Refresh from mpich2-1.3.1. Work by Pascal Deveze, tested through bitbucket by Jeff Squyres (https://bitbucket.org/devezep/new-romio-for-openmpi).

This commit was SVN r24264.
Этот коммит содержится в:
Sylvain Jeaugey 2011-01-19 15:55:10 +00:00
родитель b2f3a5b7c2
Коммит 0e921bba7f
359 изменённых файлов: 27054 добавлений и 8895 удалений

Просмотреть файл

@ -35,4 +35,5 @@ __sgi_mpi
__hp_mpi __hp_mpi
__cray_mpi __cray_mpi
__lam_mpi __lam_mpi
__Darwin
__open_mpi __open_mpi

Просмотреть файл

@ -1,58 +0,0 @@
<dir>
<file name="ad_bgl_getsh.c" info="1205188711"/>
<file name="ad_bgl_fcntl.c" info="1205188711"/>
<file name="ad_bgl_tuning.c" info="1205188711"/>
<file name="ad_bgl_pset.h" info="1205188711"/>
<file name="ad_bgl_aggrs.c" info="1205188711"/>
<file name="ad_bgl_wrcoll.c" info="1205188711"/>
<file name="ad_bgl_aggrs.h" info="1205188711"/>
<file name="ad_bgl_pset.c" info="1205188711"/>
<file name="ad_bgl_setsh.c" info="1205188711"/>
<file name="ad_bgl_close.c" info="1206398065"/>
<file name="ad_bgl.h" info="1205188711"/>
<file name="ad_bgl_read.c" info="1205188711"/>
<file name="ad_bgl_rdcoll.c" info="1205188711"/>
<file name="ad_bgl_open.c" info="1205188711"/>
<file name="ad_bgl_tuning.h" info="1205188711"/>
<file name="ad_bgl_write.c" info="1205188711"/>
<file name="ad_bgl_hints.c" info="1205188711"/>
<file name="ad_bgl.c" info="1205188711"/>
</dir>
<data>
<fileinfo name="ad_bgl_getsh.c">
</fileinfo>
<fileinfo name="ad_bgl_fcntl.c">
</fileinfo>
<fileinfo name="ad_bgl_tuning.c">
</fileinfo>
<fileinfo name="ad_bgl_pset.h">
</fileinfo>
<fileinfo name="ad_bgl_aggrs.c">
</fileinfo>
<fileinfo name="ad_bgl_wrcoll.c">
</fileinfo>
<fileinfo name="ad_bgl_aggrs.h">
</fileinfo>
<fileinfo name="ad_bgl_pset.c">
</fileinfo>
<fileinfo name="ad_bgl_setsh.c">
</fileinfo>
<fileinfo name="ad_bgl_close.c">
</fileinfo>
<fileinfo name="ad_bgl.h">
</fileinfo>
<fileinfo name="ad_bgl_read.c">
</fileinfo>
<fileinfo name="ad_bgl_rdcoll.c">
</fileinfo>
<fileinfo name="ad_bgl_open.c">
</fileinfo>
<fileinfo name="ad_bgl_tuning.h">
</fileinfo>
<fileinfo name="ad_bgl_write.c">
</fileinfo>
<fileinfo name="ad_bgl_hints.c">
</fileinfo>
<fileinfo name="ad_bgl.c">
</fileinfo>
</data>

Просмотреть файл

@ -26,6 +26,7 @@ libadio_bgl_la_SOURCES = \
ad_bgl.c \ ad_bgl.c \
ad_bgl_close.c \ ad_bgl_close.c \
ad_bgl_fcntl.c \ ad_bgl_fcntl.c \
ad_bgl_flush.c \
ad_bgl_getsh.c \ ad_bgl_getsh.c \
ad_bgl.h \ ad_bgl.h \
ad_bgl_hints.c \ ad_bgl_hints.c \

Просмотреть файл

@ -1,5 +1,6 @@
/* ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */ /* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/** /**
* \file ad_bgl.c * \file ad_bgl.c
* \brief ??? * \brief ???
@ -18,6 +19,7 @@
struct ADIOI_Fns_struct ADIO_BGL_operations = { struct ADIOI_Fns_struct ADIO_BGL_operations = {
ADIOI_BGL_Open, /* Open */ ADIOI_BGL_Open, /* Open */
ADIOI_GEN_OpenColl, /* Collective open */
ADIOI_BGL_ReadContig, /* ReadContig */ ADIOI_BGL_ReadContig, /* ReadContig */
ADIOI_BGL_WriteContig, /* WriteContig */ ADIOI_BGL_WriteContig, /* WriteContig */
#if BGL_OPTIM_STEP1_2 #if BGL_OPTIM_STEP1_2
@ -51,7 +53,8 @@ struct ADIOI_Fns_struct ADIO_BGL_operations = {
ADIOI_GEN_IOComplete, /* WriteComplete */ ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */ ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */ ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_GEN_Flush, /* Flush */ ADIOI_BGL_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */ ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
}; };

Просмотреть файл

@ -28,8 +28,10 @@
#include <aio.h> #include <aio.h>
#endif #endif
#if 0
int ADIOI_BGL_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset, int ADIOI_BGL_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
int wr, void *handle); int wr, void *handle);
#endif
void ADIOI_BGL_Open(ADIO_File fd, int *error_code); void ADIOI_BGL_Open(ADIO_File fd, int *error_code);
@ -87,6 +89,7 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
void ADIOI_BGL_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code); void ADIOI_BGL_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code); void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
void ADIOI_BGL_Flush(ADIO_File fd, int *error_code);
#include "ad_bgl_tuning.h" #include "ad_bgl_tuning.h"

Просмотреть файл

@ -1,5 +1,6 @@
/* ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */ /* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/** /**
* \file ad_bgl_aggrs.c * \file ad_bgl_aggrs.c
* \brief The externally used function from this file is is declared in ad_bgl_aggrs.h * \brief The externally used function from this file is is declared in ad_bgl_aggrs.h
@ -7,7 +8,7 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* /*
* Copyright (C) 1997 University of Chicago. * Copyright (C) 1997-2001 University of Chicago.
* See COPYRIGHT notice in top-level directory. * See COPYRIGHT notice in top-level directory.
*/ */
@ -16,10 +17,49 @@
#include "ad_bgl.h" #include "ad_bgl.h"
#include "ad_bgl_pset.h" #include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h" #include "ad_bgl_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
int aggrsInPsetSize=0;
int *aggrsInPset=NULL; static int aggrsInPsetSize=0;
static int *aggrsInPset=NULL;
/* Comments copied from common:
* This file contains four functions:
*
* ADIOI_Calc_aggregator()
* ADIOI_Calc_file_domains()
* ADIOI_Calc_my_req()
* ADIOI_Calc_others_req()
*
* The last three of these were originally in ad_read_coll.c, but they are
* also shared with ad_write_coll.c. I felt that they were better kept with
* the rest of the shared aggregation code.
*/
/* Discussion of values available from above:
*
* ADIO_Offset st_offsets[0..nprocs-1]
* ADIO_Offset end_offsets[0..nprocs-1]
* These contain a list of start and end offsets for each process in
* the communicator. For example, an access at loc 10, size 10 would
* have a start offset of 10 and end offset of 19.
* int nprocs
* number of processors in the collective I/O communicator
* ADIO_Offset min_st_offset
* ADIO_Offset fd_start[0..nprocs_for_coll-1]
* starting location of "file domain"; region that a given process will
* perform aggregation for (i.e. actually do I/O)
* ADIO_Offset fd_end[0..nprocs_for_coll-1]
* start + size - 1 roughly, but it can be less, or 0, in the case of
* uneven distributions
*/
/* forward declaration */ /* forward declaration */
static void static void
@ -219,8 +259,7 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
ADIOI_BGL_ProcInfo_t *all_procInfo, ADIOI_BGL_ProcInfo_t *all_procInfo,
int *aggrsInPset ) int *aggrsInPset )
{ {
# define DEBUG 0 # if AGG_DEBUG
# if DEBUG
int i; int i;
# endif # endif
int naggs; int naggs;
@ -229,9 +268,10 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
/* compute the ranklist of IO aggregators and put into tmp_ranklist */ /* compute the ranklist of IO aggregators and put into tmp_ranklist */
tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int)); tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
# if DEBUG # if AGG_DEBUG
for (i=0; i<confInfo->nProcs; i++) for (i=0; i<confInfo->nProcs; i++) {
printf( "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank ); DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
}
# endif # endif
naggs = naggs =
@ -239,7 +279,7 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
# define VERIFY 0 # define VERIFY 0
# if VERIFY # if VERIFY
printf( "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n", DBG_FPRINTF(stderr, "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n",
confInfo->PsetSize , confInfo->PsetSize ,
confInfo->numPsets , confInfo->numPsets ,
confInfo->isVNM , confInfo->isVNM ,
@ -250,9 +290,10 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
naggs ); naggs );
# endif # endif
# if DEBUG # if AGG_DEBUG
for (i=0; i<naggs; i++) for (i=0; i<naggs; i++) {
printf( "\taggr %-4d = %6d\n", i, tmp_ranklist[i] ); DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
}
# endif # endif
/* copy the ranklist of IO aggregators to fd->hints */ /* copy the ranklist of IO aggregators to fd->hints */
@ -267,293 +308,34 @@ ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
return; return;
} }
/* Description from common/ad_aggregate.c. (Does it completely apply to bgl?)
* ADIOI_Calc_aggregator()
/*
* Compute a dynamic access range based file domain partition among I/O aggregators,
* which align to the GPFS block size
* Divide the I/O workload among "nprocs_for_coll" processes. This is
* done by (logically) dividing the file into file domains (FDs); each
* process may directly access only its own file domain.
* Additional effort is to make sure that each I/O aggregator get
* a file domain that aligns to the GPFS block size. So, there will
* not be any false sharing of GPFS file blocks among multiple I/O nodes.
*/
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr,
void *fs_ptr)
{
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
int i, aggr;
static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
__blksize_t blksize = 1048576; /* default to 1M */
if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;
/* FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);*/
/* find the range of all the requests */
min_st_offset = st_offsets [0];
max_end_offset = end_offsets[0];
for (i=1; i<nprocs; i++) {
min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
}
// printf( "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset );
/* determine the "file domain (FD)" of each process, i.e., the portion of
the file that will be "owned" by each process */
ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1;
ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
int naggs = nprocs_for_coll;
fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
fd_start = *fd_start_ptr;
fd_end = *fd_end_ptr;
ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
ADIO_Offset naggs_small = naggs - naggs_large;
for (i=0; i<naggs; i++)
if (i < naggs_small) fd_size[i] = nb_cn_small * blksize;
else fd_size[i] = (nb_cn_small+1) * blksize;
/* FPRINTF(stderr,"%s(%d): "
"gpfs_ub %llu, "
"gpfs_lb %llu, "
"gpfs_ub_rdoff %llu, "
"gpfs_lb_rdoff %llu, "
"fd_gpfs_range %llu, "
"n_gpfs_blk %llu, "
"nb_cn_small %llu, "
"naggs_large %llu, "
"naggs_small %llu, "
"\n",
myname,__LINE__,
gpfs_ub ,
gpfs_lb ,
gpfs_ub_rdoff,
gpfs_lb_rdoff,
fd_gpfs_range,
n_gpfs_blk ,
nb_cn_small ,
naggs_large ,
naggs_small
);
*/
fd_size[0] -= gpfs_lb_rdoff;
fd_size[naggs-1] -= gpfs_ub_rdoff;
/* compute the file domain for each aggr */
ADIO_Offset offset = min_st_offset;
for (aggr=0; aggr<naggs; aggr++) {
fd_start[aggr] = offset;
fd_end [aggr] = offset + fd_size[aggr] - 1;
offset += fd_size[aggr];
}
*fd_size_ptr = fd_size[0];
*min_st_offset_ptr = min_st_offset;
ADIOI_Free (fd_size);
}
/*
* deprecated
* *
void ADIOI_BGL_GPFS_Calc_file_domain0(ADIO_Offset *st_offsets, * The intention here is to implement a function which provides basically
ADIO_Offset *end_offsets, * the same functionality as in Rajeev's original version of
int nprocs, * ADIOI_Calc_my_req(). He used a ceiling division approach to assign the
int nprocs_for_coll, * file domains, and we use the same approach here when calculating the
ADIO_Offset *min_st_offset_ptr, * location of an offset/len in a specific file domain. Further we assume
ADIO_Offset **fd_start_ptr, * this same distribution when calculating the rank_index, which is later
ADIO_Offset **fd_end_ptr, * used to map to a specific process rank in charge of the file domain.
ADIO_Offset *fd_size_ptr)
{
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
int i;
static int GPFS_BSIZE=1048576;
* find the range of all the requests *
min_st_offset = st_offsets [0];
max_end_offset = end_offsets[0];
for (i=1; i<nprocs; i++) {
min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
}
* determine the "file domain (FD)" of each process, i.e., the portion of
the file that will be "owned" by each process *
* GPFS specific, pseudo starting/end point has to round to GPFS_BSIZE *
ADIO_Offset gpfs_ub = (max_end_offset +GPFS_BSIZE-1) / GPFS_BSIZE * GPFS_BSIZE - 1;
ADIO_Offset gpfs_lb = min_st_offset / GPFS_BSIZE * GPFS_BSIZE;
ADIO_Offset gpfs_ub_rdoff = (max_end_offset +GPFS_BSIZE-1) / GPFS_BSIZE * GPFS_BSIZE - 1 - max_end_offset;
ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / GPFS_BSIZE * GPFS_BSIZE;
ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
* all computation of partition is based on the rounded pseudo-range *
ADIO_Offset fds_ub = (fd_gpfs_range +nprocs_for_coll-1) / nprocs_for_coll;
ADIO_Offset fds_lb = fd_gpfs_range / nprocs_for_coll;
int naggs = nprocs_for_coll;
int npsets = aggrsInPset[0]; * special meaning for element 0 *
fd_size = (ADIO_Offset *) ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
*fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
*fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
fd_start = *fd_start_ptr;
fd_end = *fd_end_ptr;
* some pre-computation to determine rough ratio of when to up-fit, when to low-fit *
* 1. get the estimated data per pset *
* 2. determine a factor between up and down *
int avg_aggrsInPset = (naggs +npsets-1)/npsets;
ADIO_Offset avg_bytes_perPset = fd_gpfs_range / npsets;
ADIO_Offset resid = avg_bytes_perPset % GPFS_BSIZE;
ADIO_Offset downr = GPFS_BSIZE - resid;
int small = (resid < downr);
int ratio = downr == 0 ? npsets + 2 : (resid +downr-1)/downr;
if (small) ratio = resid == 0 ? npsets + 2 : (downr +resid-1)/resid;
* go through aggrsInfo of all PSETs *
ADIO_Offset fd_range = fd_gpfs_range;
int aggr = 0, pset;
for (pset=0; pset<npsets; pset++) {
ADIO_Offset fds_try = fds_lb;
int my_naggs = aggrsInPset[pset+1];
ADIO_Offset fds_pset;
* Last pset will deal with the residuals *
if (pset == npsets-1)
fds_pset = fd_range;
else
{
int cond1 = ((pset+1) % ratio == 0);
int cond2 = ((pset+1) % ratio != 0);
if (small) {
int temp = cond1; cond1 = cond2; cond2 = temp;
}
if (cond1) {
fds_pset = fds_try * my_naggs;
if (fds_pset % GPFS_BSIZE) // align to GPFS_BSIZE
fds_pset = ((fds_pset +GPFS_BSIZE-1)/GPFS_BSIZE) * GPFS_BSIZE;
}
if (cond2)
{
fds_try = fds_ub;
fds_pset = fds_try * my_naggs;
if (fds_pset % GPFS_BSIZE) // align to GPFS_BSIZE
fds_pset = (fds_pset / GPFS_BSIZE) * GPFS_BSIZE;
}
}
* for aggrs in each PSET, divide evenly the data range *
#define CN_ALIGN 1
#if !CN_ALIGN
fd_range -= fds_pset;
if ( pset == 0 ) fds_pset -= gpfs_lb_rdoff;
if ( pset == npsets-1 ) fds_pset -= gpfs_ub_rdoff;
int p;
for (p=0; p<my_naggs; p++) {
fd_size[aggr] = (fds_pset +my_naggs-1) / my_naggs;
if (p== my_naggs-1)
fd_size[aggr] -= (fd_size[aggr]*my_naggs - fds_pset);
aggr++;
}
#else
ADIO_Offset avg_bytes_perP = fds_pset / my_naggs;
ADIO_Offset resid2 = avg_bytes_perP % GPFS_BSIZE;
ADIO_Offset downr2 = GPFS_BSIZE - resid2;
int small2 = (resid2 < downr2);
int ratio2 = downr2 == 0 ? my_naggs + 2 : (resid2 +downr2-1)/downr2;
if (small2) ratio2 = resid2 == 0 ? my_naggs + 2 : (downr2 +resid2-1)/resid2;
ADIO_Offset accu = 0;
int p;
for (p=0; p<my_naggs; p++) {
int cond1 = ((p+1) % ratio2 == 0);
int cond2 = ((p+1) % ratio2 != 0);
if (small2) {
int temp = cond1; cond1 = cond2; cond2 = temp;
}
fd_size[aggr] = avg_bytes_perP;
if (cond2) fd_size[aggr] = ((fd_size[aggr] +GPFS_BSIZE-1)/GPFS_BSIZE) * GPFS_BSIZE;
if (cond1) fd_size[aggr] = ((fd_size[aggr] )/GPFS_BSIZE) * GPFS_BSIZE;
if (p== my_naggs-1)
fd_size[aggr] = (fds_pset - accu);
accu += fd_size[aggr];
fd_range -= fd_size[aggr];
aggr++;
}
#endif
}
* after scheduling, the first and the last region has to remove the round-off effect *
#if CN_ALIGN
fd_size[0] -= gpfs_lb_rdoff;
fd_size[naggs-1] -= gpfs_ub_rdoff;
#endif
* compute the file domain for each aggr *
ADIO_Offset offset = min_st_offset;
for (aggr=0; aggr<naggs; aggr++) {
fd_start[aggr] = offset;
fd_end [aggr] = offset + fd_size[aggr] - 1;
offset += fd_size[aggr];
}
* *
printf( "\t%6d : %12qd:%12qd, %12qd:%12qd:%12qd, %12qd:%12qd:%12qd\n", * A better (i.e. more general) approach would be to use the list of file
naggs, * domains only. This would be slower in the case where the
min_st_offset, * original ceiling division was used, but it would allow for arbitrary
max_end_offset, * distributions of regions to aggregators. We'd need to know the
fd_start[0], * nprocs_for_coll in that case though, which we don't have now.
fd_end [0],
fd_size [0],
fd_start[naggs-1],
fd_end [naggs-1],
fd_size [naggs-1] );
* *
* Note a significant difference between this function and Rajeev's old code:
* this code doesn't necessarily return a rank in the range
*fd_size_ptr = fd_size[0]; * 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a
*min_st_offset_ptr = min_st_offset; * result of the rank mapping; any set of ranks in the communicator could be
* used now.
ADIOI_Free (fd_size); *
} * Returns an integer representing a rank in the collective I/O communicator.
*
* The "len" parameter is also modified to indicate the amount of data
* actually available in this file domain.
*/ */
/*
* When a process is an IO aggregator, this will return its index in the aggrs list.
* Otherwise, this will return -1
*/
int ADIOI_BGL_Aggrs_index( ADIO_File fd, int myrank )
{
int i;
for (i=0; i<fd->hints->cb_nodes; i++)
if (fd->hints->ranklist[i] == myrank) return i;
return -1;
}
/* /*
* This is more general aggregator search function which does not base on the assumption * This is more general aggregator search function which does not base on the assumption
* that each aggregator hosts the file domain with the same size * that each aggregator hosts the file domain with the same size
@ -574,6 +356,21 @@ int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
/* binary search --> rank_index is returned */ /* binary search --> rank_index is returned */
int ub = fd->hints->cb_nodes; int ub = fd->hints->cb_nodes;
int lb = 0; int lb = 0;
/* get an index into our array of aggregators */
/* Common code for striping - bgl doesn't use it but it's
here to make diff'ing easier.
rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);
if (fd->hints->striping_unit > 0) {
* wkliao: implementation for file domain alignment
fd_start[] and fd_end[] have been aligned with file lock
boundaries when returned from ADIOI_Calc_file_domains() so cannot
just use simple arithmetic as above *
rank_index = 0;
while (off > fd_end[rank_index]) rank_index++;
}
bgl does its own striping below
*/
rank_index = fd->hints->cb_nodes / 2; rank_index = fd->hints->cb_nodes / 2;
while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) { while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
if ( off > fd_end [rank_index] ) { if ( off > fd_end [rank_index] ) {
@ -586,8 +383,15 @@ int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
rank_index = (rank_index + lb) / 2; rank_index = (rank_index + lb) / 2;
} }
} }
/* we index into fd_end with rank_index, and fd_end was allocated to be no
// printf ("ADIOI_BGL_Calc_aggregator: rank_index = %d\n", rank_index ); * bigger than fd->hints->cb_nodes. If we ever violate that, we're
* overrunning arrays. Obviously, we should never ever hit this abort */
if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
rank_index,fd->hints->cb_nodes,fd_size,off);
MPI_Abort(MPI_COMM_WORLD, 1);
}
// DBG_FPRINTF ("ADIOI_BGL_Calc_aggregator: rank_index = %d\n", rank_index );
/* /*
* remember here that even in Rajeev's original code it was the case that * remember here that even in Rajeev's original code it was the case that
@ -611,16 +415,161 @@ int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
return rank; return rank;
} }
/*
* Compute a dynamic access range based file domain partition among I/O aggregators,
* which align to the GPFS block size
* Divide the I/O workload among "nprocs_for_coll" processes. This is
* done by (logically) dividing the file into file domains (FDs); each
* process may directly access only its own file domain.
* Additional effort is to make sure that each I/O aggregator get
* a file domain that aligns to the GPFS block size. So, there will
* not be any false sharing of GPFS file blocks among multiple I/O nodes.
*
* The common version of this now accepts a min_fd_size and striping_unit.
* It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
* (e.g. we could pass striping unit instead of using fs_ptr->blksize).
*/
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr,
void *fs_ptr)
{
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
int i, aggr;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5004, 0, NULL);
#endif
# if AGG_DEBUG
static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
myname,__LINE__,nprocs_for_coll);
# endif
__blksize_t blksize = 1048576; /* default to 1M */
if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;
# if AGG_DEBUG
DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
# endif
/* find min of start offsets and max of end offsets of all processes */
min_st_offset = st_offsets [0];
max_end_offset = end_offsets[0];
for (i=1; i<nprocs; i++) {
min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
}
// DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset );
/* determine the "file domain (FD)" of each process, i.e., the portion of
the file that will be "owned" by each process */
ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1;
ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
int naggs = nprocs_for_coll;
/* Tweak the file domains so that no fd is smaller than a threshold. We
* have to strike a balance between efficiency and parallelism: somewhere
* between 10k processes sending 32-byte requests and one process sending a
* 320k request is a (system-dependent) sweet spot
This is from the common code - the new min_fd_size parameter that we didn't implement.
(And common code uses a different declaration of fd_size so beware)
if (fd_size < min_fd_size)
fd_size = min_fd_size;
*/
fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
fd_start = *fd_start_ptr;
fd_end = *fd_end_ptr;
ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
ADIO_Offset naggs_small = naggs - naggs_large;
for (i=0; i<naggs; i++)
if (i < naggs_small) fd_size[i] = nb_cn_small * blksize;
else fd_size[i] = (nb_cn_small+1) * blksize;
# if AGG_DEBUG
DBG_FPRINTF(stderr,"%s(%d): "
"gpfs_ub %llu, "
"gpfs_lb %llu, "
"gpfs_ub_rdoff %llu, "
"gpfs_lb_rdoff %llu, "
"fd_gpfs_range %llu, "
"n_gpfs_blk %llu, "
"nb_cn_small %llu, "
"naggs_large %llu, "
"naggs_small %llu, "
"\n",
myname,__LINE__,
gpfs_ub ,
gpfs_lb ,
gpfs_ub_rdoff,
gpfs_lb_rdoff,
fd_gpfs_range,
n_gpfs_blk ,
nb_cn_small ,
naggs_large ,
naggs_small
);
# endif
fd_size[0] -= gpfs_lb_rdoff;
fd_size[naggs-1] -= gpfs_ub_rdoff;
/* compute the file domain for each aggr */
ADIO_Offset offset = min_st_offset;
for (aggr=0; aggr<naggs; aggr++) {
fd_start[aggr] = offset;
fd_end [aggr] = offset + fd_size[aggr] - 1;
offset += fd_size[aggr];
}
*fd_size_ptr = fd_size[0];
*min_st_offset_ptr = min_st_offset;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5005, 0, NULL);
#endif
ADIOI_Free (fd_size);
}
/*
* When a process is an IO aggregator, this will return its index in the aggrs list.
* Otherwise, this will return -1
*/
int ADIOI_BGL_Aggrs_index( ADIO_File fd, int myrank )
{
int i;
for (i=0; i<fd->hints->cb_nodes; i++)
if (fd->hints->ranklist[i] == myrank) return i;
return -1;
}
/* /*
* ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation * ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation
* is specific for static file domain partitioning. * is specific for static file domain partitioning.
* *
* ADIOI_Calc_my_req() calculate what portions of the access requests * ADIOI_Calc_my_req() - calculate what portions of the access requests
* of this process are located in the file domains of various processes * of this process are located in the file domains of various processes
* (including this one) * (including this one)
*/ */
void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list, void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start, min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size, ADIO_Offset *fd_end, ADIO_Offset fd_size,
@ -629,12 +578,17 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
int **count_my_req_per_proc_ptr, int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr, ADIOI_Access **my_req_ptr,
int **buf_idx_ptr) int **buf_idx_ptr)
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets?
They are used as memory buffer indices so it seems like the 2G limit is in effect */
{ {
int *count_my_req_per_proc, count_my_req_procs, *buf_idx; int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
int i, l, proc; int i, l, proc;
ADIO_Offset fd_len, rem_len, curr_idx, off; ADIO_Offset fd_len, rem_len, curr_idx, off;
ADIOI_Access *my_req; ADIOI_Access *my_req;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5024, 0, NULL);
#endif
*count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int)); *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
count_my_req_per_proc = *count_my_req_per_proc_ptr; count_my_req_per_proc = *count_my_req_per_proc_ptr;
@ -656,10 +610,10 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
* contig_access_count was calculated way back in ADIOI_Calc_my_off_len() * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
*/ */
for (i=0; i < contig_access_count; i++) { for (i=0; i < contig_access_count; i++) {
/* short circuit offset/len processing if len == 0
/* When there is no data being processed, bypass this loop */ * (zero-byte read/write) */
if (len_list[i] == 0) continue; if (len_list[i] == 0)
continue;
off = offset_list[i]; off = offset_list[i];
fd_len = len_list[i]; fd_len = len_list[i];
/* note: we set fd_len to be the total size of the access. then /* note: we set fd_len to be the total size of the access. then
@ -710,20 +664,24 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
/* now fill in my_req */ /* now fill in my_req */
curr_idx = 0; curr_idx = 0;
for (i=0; i<contig_access_count; i++) { for (i=0; i<contig_access_count; i++) {
/* short circuit offset/len processing if len == 0
/* When there is no data being processed, bypass this loop */ * (zero-byte read/write) */
if (len_list[i] == 0) continue; if (len_list[i] == 0)
continue;
off = offset_list[i]; off = offset_list[i];
fd_len = len_list[i]; fd_len = len_list[i];
proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
fd_start, fd_end); fd_start, fd_end);
/* for each separate contiguous access from this process */ /* for each separate contiguous access from this process */
if (buf_idx[proc] == -1) buf_idx[proc] = (int) curr_idx; if (buf_idx[proc] == -1)
{
ADIOI_Assert(curr_idx == (int) curr_idx);
buf_idx[proc] = (int) curr_idx;
}
l = my_req[proc].count; l = my_req[proc].count;
curr_idx += (int) fd_len; /* NOTE: Why is curr_idx an int? Fix? */ curr_idx += fd_len;
rem_len = len_list[i] - fd_len; rem_len = len_list[i] - fd_len;
@ -733,6 +691,7 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
* and the associated count. * and the associated count.
*/ */
my_req[proc].offsets[l] = off; my_req[proc].offsets[l] = off;
ADIOI_Assert(fd_len == (int) fd_len);
my_req[proc].lens[l] = (int) fd_len; my_req[proc].lens[l] = (int) fd_len;
my_req[proc].count++; my_req[proc].count++;
@ -742,13 +701,18 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len,
fd_size, fd_start, fd_end); fd_size, fd_start, fd_end);
if (buf_idx[proc] == -1) buf_idx[proc] = (int) curr_idx; if (buf_idx[proc] == -1)
{
ADIOI_Assert(curr_idx == (int) curr_idx);
buf_idx[proc] = (int) curr_idx;
}
l = my_req[proc].count; l = my_req[proc].count;
curr_idx += fd_len; curr_idx += fd_len;
rem_len -= fd_len; rem_len -= fd_len;
my_req[proc].offsets[l] = off; my_req[proc].offsets[l] = off;
ADIOI_Assert(fd_len == (int) fd_len);
my_req[proc].lens[l] = (int) fd_len; my_req[proc].lens[l] = (int) fd_len;
my_req[proc].count++; my_req[proc].count++;
} }
@ -757,27 +721,26 @@ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, int *len_list
#ifdef AGG_DEBUG #ifdef AGG_DEBUG
for (i=0; i<nprocs; i++) { for (i=0; i<nprocs; i++) {
if (count_my_req_per_proc[i] > 0) { if (count_my_req_per_proc[i] > 0) {
FPRINTF(stdout, "data needed from %d (count = %d):\n", i, DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
my_req[i].count); my_req[i].count);
for (l=0; l < my_req[i].count; l++) { for (l=0; l < my_req[i].count; l++) {
FPRINTF(stdout, " off[%d] = %Ld, len[%d] = %d\n", l, DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %d\n", l,
my_req[i].offsets[l], l, my_req[i].lens[l]); my_req[i].offsets[l], l, my_req[i].lens[l]);
} }
} }
DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
} }
#if 0
for (i=0; i<nprocs; i++) {
FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
}
#endif
#endif #endif
*count_my_req_procs_ptr = count_my_req_procs; *count_my_req_procs_ptr = count_my_req_procs;
*buf_idx_ptr = buf_idx; *buf_idx_ptr = buf_idx;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5025, 0, NULL);
#endif
} }
/* /*
* ADIOI_Calc_others_req * ADIOI_Calc_others_req (copied to bgl and switched to all to all for performance)
* *
* param[in] count_my_req_procs Number of processes whose file domain my * param[in] count_my_req_procs Number of processes whose file domain my
* request touches. * request touches.
@ -826,7 +789,9 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
*recvBufForLens =(void*)0xFFFFFFFF; *recvBufForLens =(void*)0xFFFFFFFF;
/* first find out how much to send/recv and from/to whom */ /* first find out how much to send/recv and from/to whom */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5026, 0, NULL);
#endif
/* Send 1 int to each process. count_my_req_per_proc[i] is the number of /* Send 1 int to each process. count_my_req_per_proc[i] is the number of
* requests that my process will do to the file domain owned by process[i]. * requests that my process will do to the file domain owned by process[i].
* Receive 1 int from each process. count_others_req_per_proc[i] is the number of * Receive 1 int from each process. count_others_req_per_proc[i] is the number of
@ -866,9 +831,9 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
others_req[i].lens = (int *) others_req[i].lens = (int *)
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int)); ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int));
if ( (unsigned)others_req[i].offsets < (unsigned)recvBufForOffsets ) if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
recvBufForOffsets = others_req[i].offsets; recvBufForOffsets = others_req[i].offsets;
if ( (unsigned)others_req[i].lens < (unsigned)recvBufForLens ) if ( (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens )
recvBufForLens = others_req[i].lens; recvBufForLens = others_req[i].lens;
others_req[i].mem_ptrs = (MPI_Aint *) others_req[i].mem_ptrs = (MPI_Aint *)
@ -883,6 +848,9 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
others_req[i].lens = NULL; others_req[i].lens = NULL;
} }
} }
/* If no recv buffer was allocated in the loop above, make it NULL */
if ( recvBufForOffsets == (void*)0xFFFFFFFF) recvBufForOffsets = NULL;
if ( recvBufForLens == (void*)0xFFFFFFFF) recvBufForLens = NULL;
/* Now send the calculated offsets and lengths to respective processes */ /* Now send the calculated offsets and lengths to respective processes */
@ -894,14 +862,18 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
for (i=0; i<nprocs; i++) for (i=0; i<nprocs; i++)
{ {
if ( (my_req[i].count) && if ( (my_req[i].count) &&
((unsigned)my_req[i].offsets <= (unsigned)sendBufForOffsets) ) ((MPIR_Upint)my_req[i].offsets <= (MPIR_Upint)sendBufForOffsets) )
sendBufForOffsets = my_req[i].offsets; sendBufForOffsets = my_req[i].offsets;
if ( (my_req[i].count) && if ( (my_req[i].count) &&
((unsigned)my_req[i].lens <= (unsigned)sendBufForLens) ) ((MPIR_Upint)my_req[i].lens <= (MPIR_Upint)sendBufForLens) )
sendBufForLens = my_req[i].lens; sendBufForLens = my_req[i].lens;
} }
/* If no send buffer was found in the loop above, make it NULL */
if ( sendBufForOffsets == (void*)0xFFFFFFFF) sendBufForOffsets = NULL;
if ( sendBufForLens == (void*)0xFFFFFFFF) sendBufForLens = NULL;
/* Calculate the displacements from the sendBufForOffsets/Lens */ /* Calculate the displacements from the sendBufForOffsets/Lens */
for (i=0; i<nprocs; i++) for (i=0; i<nprocs; i++)
{ {
@ -910,16 +882,20 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
if ( scounts[i] == 0 ) if ( scounts[i] == 0 )
sdispls[i] = 0; sdispls[i] = 0;
else else
sdispls[i] = ( (unsigned)my_req[i].offsets - sdispls[i] = (int)
(unsigned)sendBufForOffsets ) / sizeof(ADIO_Offset); ( ( (MPIR_Upint)my_req[i].offsets -
(MPIR_Upint)sendBufForOffsets ) /
(MPIR_Upint)sizeof(ADIO_Offset) );
// Receive these offsets from process i. // Receive these offsets from process i.
rcounts[i] = count_others_req_per_proc[i]; rcounts[i] = count_others_req_per_proc[i];
if ( rcounts[i] == 0 ) if ( rcounts[i] == 0 )
rdispls[i] = 0; rdispls[i] = 0;
else else
rdispls[i] = ( (unsigned)others_req[i].offsets - rdispls[i] = (int)
(unsigned)recvBufForOffsets ) / sizeof(ADIO_Offset); ( ( (MPIR_Upint)others_req[i].offsets -
(MPIR_Upint)recvBufForOffsets ) /
(MPIR_Upint)sizeof(ADIO_Offset) );
} }
/* Exchange the offsets */ /* Exchange the offsets */
@ -940,16 +916,20 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
if ( scounts[i] == 0 ) if ( scounts[i] == 0 )
sdispls[i] = 0; sdispls[i] = 0;
else else
sdispls[i] = ( (unsigned)my_req[i].lens - sdispls[i] = (int)
(unsigned)sendBufForLens ) / sizeof(int); ( ( (MPIR_Upint)my_req[i].lens -
(MPIR_Upint)sendBufForLens ) /
(MPIR_Upint) sizeof(int) );
// Receive these offsets from process i. // Receive these offsets from process i.
rcounts[i] = count_others_req_per_proc[i]; rcounts[i] = count_others_req_per_proc[i];
if ( rcounts[i] == 0 ) if ( rcounts[i] == 0 )
rdispls[i] = 0; rdispls[i] = 0;
else else
rdispls[i] = ( (unsigned)others_req[i].lens - rdispls[i] = (int)
(unsigned)recvBufForLens ) / sizeof(int); ( ( (MPIR_Upint)others_req[i].lens -
(MPIR_Upint)recvBufForLens ) /
(MPIR_Upint) sizeof(int) );
} }
/* Exchange the lengths */ /* Exchange the lengths */
@ -967,4 +947,7 @@ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
ADIOI_Free (rdispls); ADIOI_Free (rdispls);
*count_others_req_procs_ptr = count_others_req_procs; *count_others_req_procs_ptr = count_others_req_procs;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5027, 0, NULL);
#endif
} }

Просмотреть файл

@ -1,5 +1,6 @@
/* ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */ /* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/** /**
* \file ad_bgl_aggrs.h * \file ad_bgl_aggrs.h
* \brief ??? * \brief ???
@ -22,13 +23,22 @@
#include "adio.h" #include "adio.h"
#include <sys/stat.h> #include <sys/stat.h>
extern int *aggrsInPset; /* defined in ad_bgl_aggrs.c */ #if !defined(GPFS_SUPER_MAGIC)
#define GPFS_SUPER_MAGIC (0x47504653)
#endif
#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif
/* File system (BGL) specific information - /* File system (BGL) specific information -
hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */ hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */
typedef struct ADIOI_BGL_fs_s { typedef struct ADIOI_BGL_fs_s {
__blksize_t blksize; __blksize_t blksize;
int fsync_aggr; /* "fsync aggregation" flags (below) */
#define ADIOI_BGL_FSYNC_AGGREGATION_DISABLED 0x00
#define ADIOI_BGL_FSYNC_AGGREGATION_ENABLED 0x01
#define ADIOI_BGL_FSYNC_AGGREGATOR 0x10 /* This rank is an aggregator */
} ADIOI_BGL_fs; } ADIOI_BGL_fs;
/* generate a list of I/O aggregators that utilizes BGL-PSET orginization. */ /* generate a list of I/O aggregators that utilizes BGL-PSET orginization. */
@ -60,7 +70,7 @@
/* overriding ADIOI_Calc_my_req for the default implementation is specific for /* overriding ADIOI_Calc_my_req for the default implementation is specific for
static file domain partitioning */ static file domain partitioning */
void ADIOI_BGL_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, int *len_list, void ADIOI_BGL_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start, min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size, ADIO_Offset *fd_end, ADIO_Offset fd_size,

Просмотреть файл

@ -1,7 +1,8 @@
/* ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */ /* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/** /**
* \file ad_bgl_open.c * \file ad_bgl_close.c
* \brief ??? * \brief ???
*/ */

Просмотреть файл

@ -1,5 +1,6 @@
/* ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */ /* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/** /**
* \file ad_bgl_fcntl.c * \file ad_bgl_fcntl.c
* \brief ??? * \brief ???

Просмотреть файл

@ -0,0 +1,90 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_flush.c
* \brief Scalable flush based on underlying filesystem and psets
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "ad_bgl_aggrs.h"
/**
 * \brief Flush the file to storage with fsync(), optionally aggregated.
 *
 * If the ADIOI_BGL_FSYNC_AGGREGATION_ENABLED bit is set in the fsync_aggr
 * flags hung off fd->fs_ptr, every rank of fd->comm enters a barrier, only
 * ranks flagged ADIOI_BGL_FSYNC_AGGREGATOR call fsync(), and the outcome is
 * combined with MPI_Allreduce (MPI_MAX over errno values, 0 == success) so
 * that all ranks report the same result.  Otherwise each caller simply
 * fsync()s its own descriptor.
 *
 * The errno of a failed fsync() is captured immediately after the call;
 * the debug-logging macros that follow may go through stdio, which is
 * allowed to modify errno, so reading errno after them is not reliable.
 *
 * \param[in]  fd          File whose fd_sys descriptor is synced.
 * \param[out] error_code  MPI_SUCCESS on success; otherwise a code built by
 *                         MPIO_Err_create_code() with MPI_ERR_IO and the
 *                         strerror() text of the failing errno.
 */
void ADIOI_BGL_Flush(ADIO_File fd, int *error_code)
{
    int err = 0;
    int saved_errno = 0;   /* errno of the failing fsync(), 0 if none */
    static char myname[] = "ADIOI_BGL_FLUSH";

    if (((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATION_ENABLED)
    {
        int rank;

        /* Barrier so we can collectively do fewer fsync's */
        MPI_Barrier(fd->comm);
        MPI_Comm_rank(fd->comm, &rank);

        /* All ranks marked as "fsync aggregators" should fsync.
           (We currently only do one fsync on rank 0 but this is general
           enough to support >1 aggregator using allreduce to get the
           results instead of simply bcast'ing the results from rank 0.) */
        if (((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATOR)
        {
            err = fsync(fd->fd_sys);
            saved_errno = errno;   /* grab it before any logging can clobber it */
            DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, saved_errno);
            /* We want errno, not the return code if it failed */
            if (err == -1) err = saved_errno;
            else err = 0;
        }

        /* Just pick an errno (using unsigned MPI_MAX) from any failures */
        MPI_Allreduce( MPI_IN_PLACE, (unsigned*)&err, 1, MPI_UNSIGNED, MPI_MAX, fd->comm);
        DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);
        if (err) /* if it's non-zero, it must be an errno */
        {
            saved_errno = err;
            err = -1;
        }
    }
    else /* Non-aggregated fsync */
    {
#ifdef USE_DBG_LOGGING
        int rank;
#endif
        err = fsync(fd->fd_sys);
        saved_errno = errno;   /* grab it before any logging can clobber it */
#ifdef USE_DBG_LOGGING
        MPI_Comm_rank(fd->comm, &rank);
        if(rank == 0)
        {
            DBG_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, saved_errno);
        }
        else
        {
            DBGV_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, saved_errno);
        }
#endif
    }

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1)
    {
        /* Leave errno describing the fsync failure, as callers saw before. */
        errno = saved_errno;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(saved_errno));
        DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, saved_errno);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -38,8 +38,8 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
MPI_Info info; MPI_Info info;
char *value; char *value;
int flag, intval, tmp_val, nprocs, nprocs_is_valid = 0; int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0;
static char myname[] = "ADIOI_GEN_SETINFO"; static char myname[] = "ADIOI_BGL_SETINFO";
int did_anything = 0; int did_anything = 0;
@ -61,15 +61,15 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
did_anything = 1; did_anything = 1;
/* buffer size for collective I/O */ /* buffer size for collective I/O */
MPI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT); ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT); fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
/* default is to let romio automatically decide when to use /* default is to let romio automatically decide when to use
* collective buffering * collective buffering
*/ */
MPI_Info_set(info, "romio_cb_read", "enable"); ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->cb_read = ADIOI_HINT_ENABLE; fd->hints->cb_read = ADIOI_HINT_ENABLE;
MPI_Info_set(info, "romio_cb_write", "enable"); ADIOI_Info_set(info, "romio_cb_write", "enable");
fd->hints->cb_write = ADIOI_HINT_ENABLE; fd->hints->cb_write = ADIOI_HINT_ENABLE;
if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list); if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
@ -78,30 +78,54 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* number of processes that perform I/O in collective I/O */ /* number of processes that perform I/O in collective I/O */
MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_size(fd->comm, &nprocs);
nprocs_is_valid = 1; nprocs_is_valid = 1;
sprintf(value, "%d", nprocs); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
MPI_Info_set(info, "cb_nodes", value); ADIOI_Info_set(info, "cb_nodes", value);
fd->hints->cb_nodes = -1; fd->hints->cb_nodes = -1;
/* hint indicating that no indep. I/O will be performed on this file */ /* hint indicating that no indep. I/O will be performed on this file */
MPI_Info_set(info, "romio_no_indep_rw", "false"); ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0; fd->hints->no_indep_rw = 0;
/* deferred_open derrived from no_indep_rw and cb_{read,write} */
/* bgl is not implementing file realms (ADIOI_IOStridedColl),
initialize to disabled it. */
/* hint instructing the use of persistent file realms */
ADIOI_Info_set(info, "romio_cb_pfr", "disable");
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
/* hint guiding the assignment of persistent file realms */
ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
fd->hints->cb_fr_type = ADIOI_FR_AAR;
/* hint to align file realms with a certain byte value */
ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
fd->hints->cb_fr_alignment = 1;
/* hint to set a threshold percentage for a datatype's size/extent at
* which data sieving should be done in collective I/O */
ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
fd->hints->cb_ds_threshold = 0;
/* hint to switch between point-to-point or all-to-all for two-phase */
ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
/* deferred_open derived from no_indep_rw and cb_{read,write} */
fd->hints->deferred_open = 0; fd->hints->deferred_open = 0;
/* buffer size for data sieving in independent reads */ /* buffer size for data sieving in independent reads */
MPI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT); ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT); fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
/* buffer size for data sieving in independent writes */ /* buffer size for data sieving in independent writes */
MPI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT); ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT); fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
if(fd->file_system == ADIO_UFS) if(fd->file_system == ADIO_UFS)
{ {
/* default for ufs/pvfs is to disable data sieving */ /* default for ufs/pvfs is to disable data sieving */
MPI_Info_set(info, "romio_ds_read", "disable"); ADIOI_Info_set(info, "romio_ds_read", "disable");
fd->hints->ds_read = ADIOI_HINT_DISABLE; fd->hints->ds_read = ADIOI_HINT_DISABLE;
MPI_Info_set(info, "romio_ds_write", "disable"); ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE; fd->hints->ds_write = ADIOI_HINT_DISABLE;
} }
else else
@ -109,18 +133,23 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* default is to let romio automatically decide when to use data /* default is to let romio automatically decide when to use data
* sieving * sieving
*/ */
MPI_Info_set(info, "romio_ds_read", "automatic"); ADIOI_Info_set(info, "romio_ds_read", "automatic");
fd->hints->ds_read = ADIOI_HINT_AUTO; fd->hints->ds_read = ADIOI_HINT_AUTO;
MPI_Info_set(info, "romio_ds_write", "automatic"); ADIOI_Info_set(info, "romio_ds_write", "automatic");
fd->hints->ds_write = ADIOI_HINT_AUTO; fd->hints->ds_write = ADIOI_HINT_AUTO;
} }
/* still to do: tune this a bit for a variety of file systems. there's
* no good default value so just leave it unset */
fd->hints->min_fdomain_size = 0;
fd->hints->striping_unit = 0;
fd->hints->initialized = 1; fd->hints->initialized = 1;
} }
/* add in user's info if supplied */ /* add in user's info if supplied */
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
MPI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && ((intval=atoi(value)) > 0)) { if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval; tmp_val = intval;
@ -135,30 +164,106 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
} }
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
MPI_Info_set(info, "cb_buffer_size", value); ADIOI_Info_set(info, "cb_buffer_size", value);
fd->hints->cb_buffer_size = intval; fd->hints->cb_buffer_size = intval;
} }
#if 0
/* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
/* aligning file realms to certain sizes (e.g. stripe sizes)
* may benefit I/O performance */
ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_fr_alignment",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_fr_alignment", value);
fd->hints->cb_fr_alignment = intval;
}
/* for collective I/O, try to be smarter about when to do data sieving
* using a specific threshold for the datatype size/extent
* (percentage 0-100%) */
ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_ds_threshold",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_ds_threshold", value);
fd->hints->cb_ds_threshold = intval;
}
ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_alltoall;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_alltoall) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_alltoall",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
#endif
/* new hints for enabling/disabling coll. buffering on /* new hints for enabling/disabling coll. buffering on
* reads/writes * reads/writes
*/ */
MPI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value, &flag); ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) { if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
MPI_Info_set(info, "romio_cb_read", value); ADIOI_Info_set(info, "romio_cb_read", value);
fd->hints->cb_read = ADIOI_HINT_ENABLE; fd->hints->cb_read = ADIOI_HINT_ENABLE;
} }
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
/* romio_cb_read overrides no_indep_rw */ /* romio_cb_read overrides no_indep_rw */
MPI_Info_set(info, "romio_cb_read", value); ADIOI_Info_set(info, "romio_cb_read", value);
MPI_Info_set(info, "romio_no_indep_rw", "false"); ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->cb_read = ADIOI_HINT_DISABLE; fd->hints->cb_read = ADIOI_HINT_DISABLE;
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE; fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
} }
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{ {
MPI_Info_set(info, "romio_cb_read", value); ADIOI_Info_set(info, "romio_cb_read", value);
fd->hints->cb_read = ADIOI_HINT_AUTO; fd->hints->cb_read = ADIOI_HINT_AUTO;
} }
@ -174,24 +279,25 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
} }
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
} }
MPI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value, &flag); ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) { if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
MPI_Info_set(info, "romio_cb_write", value); ADIOI_Info_set(info, "romio_cb_write", value);
fd->hints->cb_write = ADIOI_HINT_ENABLE; fd->hints->cb_write = ADIOI_HINT_ENABLE;
} }
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE"))
{ {
/* romio_cb_write overrides no_indep_rw, too */ /* romio_cb_write overrides no_indep_rw, too */
MPI_Info_set(info, "romio_cb_write", value); ADIOI_Info_set(info, "romio_cb_write", value);
MPI_Info_set(info, "romio_no_indep_rw", "false"); ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->cb_write = ADIOI_HINT_DISABLE; fd->hints->cb_write = ADIOI_HINT_DISABLE;
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE; fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
} }
else if (!strcmp(value, "automatic") || else if (!strcmp(value, "automatic") ||
!strcmp(value, "AUTOMATIC")) !strcmp(value, "AUTOMATIC"))
{ {
MPI_Info_set(info, "romio_cb_write", value); ADIOI_Info_set(info, "romio_cb_write", value);
fd->hints->cb_write = ADIOI_HINT_AUTO; fd->hints->cb_write = ADIOI_HINT_AUTO;
} }
@ -208,23 +314,81 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
} }
#if 0
/* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
/* enable/disable persistent file realms for collective I/O */
/* may want to check for no_indep_rdwr hint as well */
ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_pfr;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_pfr) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_pfr",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
/* file realm assignment types ADIOI_FR_AAR(0),
ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
a regular fr size in bytes. probably not the best way... */
ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) >= -2)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_fr_type",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_fr_type", value);
fd->hints->cb_fr_type = intval;
}
#endif
/* new hint for specifying no indep. read/write will be performed */ /* new hint for specifying no indep. read/write will be performed */
MPI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value, &flag); ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) { if (flag) {
if (!strcmp(value, "true") || !strcmp(value, "TRUE")) { if (!strcmp(value, "true") || !strcmp(value, "TRUE")) {
/* if 'no_indep_rw' set, also hint that we will do /* if 'no_indep_rw' set, also hint that we will do
* collective buffering: if we aren't doing independent io, * collective buffering: if we aren't doing independent io,
* then we have to do collective */ * then we have to do collective */
MPI_Info_set(info, "romio_no_indep_rw", value); ADIOI_Info_set(info, "romio_no_indep_rw", value);
MPI_Info_set(info, "romio_cb_write", "enable"); ADIOI_Info_set(info, "romio_cb_write", "enable");
MPI_Info_set(info, "romio_cb_read", "enable"); ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->no_indep_rw = 1; fd->hints->no_indep_rw = 1;
fd->hints->cb_read = 1; fd->hints->cb_read = 1;
fd->hints->cb_write = 1; fd->hints->cb_write = 1;
tmp_val = 1; tmp_val = 1;
} }
else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) { else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) {
MPI_Info_set(info, "romio_no_indep_rw", value); ADIOI_Info_set(info, "romio_no_indep_rw", value);
fd->hints->no_indep_rw = 0; fd->hints->no_indep_rw = 0;
tmp_val = 0; tmp_val = 0;
} }
@ -246,64 +410,80 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* new hints for enabling/disabling data sieving on /* new hints for enabling/disabling data sieving on
* reads/writes * reads/writes
*/ */
MPI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value, ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value,
&flag); &flag);
if (flag) { if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
MPI_Info_set(info, "romio_ds_read", value); ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_ENABLE; fd->hints->ds_read = ADIOI_HINT_ENABLE;
} }
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
MPI_Info_set(info, "romio_ds_read", value); ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_DISABLE; fd->hints->ds_read = ADIOI_HINT_DISABLE;
} }
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{ {
MPI_Info_set(info, "romio_ds_read", value); ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_AUTO; fd->hints->ds_read = ADIOI_HINT_AUTO;
} }
/* otherwise ignore */ /* otherwise ignore */
} }
MPI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value, ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value,
&flag); &flag);
if (flag) { if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
MPI_Info_set(info, "romio_ds_write", value); ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_ENABLE; fd->hints->ds_write = ADIOI_HINT_ENABLE;
} }
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
MPI_Info_set(info, "romio_ds_write", value); ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_DISABLE; fd->hints->ds_write = ADIOI_HINT_DISABLE;
} }
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{ {
MPI_Info_set(info, "romio_ds_write", value); ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_AUTO; fd->hints->ds_write = ADIOI_HINT_AUTO;
} }
/* otherwise ignore */ /* otherwise ignore */
} }
MPI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && ((intval = atoi(value)) > 0)) { if (flag && ((intval = atoi(value)) > 0)) {
MPI_Info_set(info, "ind_wr_buffer_size", value); ADIOI_Info_set(info, "ind_wr_buffer_size", value);
fd->hints->ind_wr_buffer_size = intval; fd->hints->ind_wr_buffer_size = intval;
} }
MPI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && ((intval = atoi(value)) > 0)) { if (flag && ((intval = atoi(value)) > 0)) {
MPI_Info_set(info, "ind_rd_buffer_size", value); ADIOI_Info_set(info, "ind_rd_buffer_size", value);
fd->hints->ind_rd_buffer_size = intval; fd->hints->ind_rd_buffer_size = intval;
} }
memset( value, 0, MPI_MAX_INFO_VAL+1 ); memset( value, 0, MPI_MAX_INFO_VAL+1 );
MPI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "romio_min_fdomain_size", value);
fd->hints->min_fdomain_size = intval;
}
/* Now we use striping unit in common code so we should
process hints for it. */
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "striping_unit", value);
fd->hints->striping_unit = intval;
}
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && ((intval = atoi(value)) > 0)) { if (flag && ((intval = atoi(value)) > 0)) {
did_anything = 1; did_anything = 1;
MPI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value); ADIOI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value);
fd->hints->cb_nodes = intval; fd->hints->cb_nodes = intval;
} }
} }
@ -312,24 +492,30 @@ void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (did_anything) { if (did_anything) {
ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes); ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes);
} }
/* ignore deferred open hints and do not enable it for bluegene: need all
/* deferred_open won't be set by callers, but if the user doesn't * processors in the open path so we can stat-and-broadcast the blocksize
* explicitly disable collecitve buffering (two-phase) and does hint that */
* io w/o independent io is going on, we'll set this internal hint as a ADIOI_Info_set(info, "romio_no_indep_rw", "false");
* convenience */
if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE)
&& (fd->hints->cb_write != ADIOI_HINT_DISABLE)
&& fd->hints->no_indep_rw ) )
{
fd->hints->deferred_open = 1;
} else {
/* setting romio_no_indep_rw enable and romio_cb_{read,write}
* disable at the same time doesn't make sense. honor
* romio_cb_{read,write} and force the no_indep_rw hint to
* 'disable' */
MPI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0; fd->hints->no_indep_rw = 0;
fd->hints->deferred_open = 0; fd->hints->deferred_open = 0;
/* BobC commented this out, but since hint processing runs on both bgl and
* bglockless, we need to keep DS writes enabled on gpfs and disabled on
* PVFS */
if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
/* disable data sieving for fs that do not
support file locking */
ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
/* get rid of this value if it is set */
ADIOI_Info_delete(info, "ind_wr_buffer_size");
}
/* note: leave ind_wr_buffer_size alone; used for other cases
* as well. -- Rob Ross, 04/22/2003
*/
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
} }
ADIOI_Free(value); ADIOI_Free(value);

Просмотреть файл

@ -15,6 +15,181 @@
#include "ad_bgl.h" #include "ad_bgl.h"
#include "ad_bgl_aggrs.h" #include "ad_bgl_aggrs.h"
#include <sys/statfs.h>
#include <sys/vfs.h>
/* COPIED FROM ad_fstype.c since it is static in that file
ADIO_FileSysType_parentdir - determines a string pathname for the
parent directory of a given filename.
Input Parameters:
. filename - pointer to file name character array
Output Parameters:
. dirnamep - pointer to location in which to store a pointer to a string
Note that the caller should free the memory located at the pointer returned
after the string is no longer needed.
*/
#ifndef PATH_MAX
#define PATH_MAX 65535
#endif
/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that
here. We assume that S_ISLNK is *always* defined as a macro. If
that is not universally true, then add a test to the romio
configure that tries to link a program that references S_ISLNK */
#if !defined(S_ISLNK)
# if defined(S_IFLNK)
/* Check for the link bit */
# define S_ISLNK(mode) ((mode) & S_IFLNK)
# else
/* no way to check if it is a link, so say false */
# define S_ISLNK(mode) 0
# endif
#endif /* !(S_ISLNK) */
/* ADIO_FileSysType_parentdir
 *
 * Stores in *dirnamep a newly allocated string naming the parent
 * directory of filename.  If filename is a symbolic link, the parent
 * directory of the link's target text (as returned by readlink) is used
 * instead.
 *
 * Input Parameters:
 * . filename - path whose parent directory is wanted
 *
 * Output Parameters:
 * . dirnamep - receives the string; it is allocated with ADIOI_Strdup
 *              and ownership passes to the caller, who must release it
 *              (presumably with ADIOI_Free, the counterpart of the ADIOI
 *              allocation wrappers used here).
 *
 * A name that contains no '/' yields ".", a name whose only '/' is the
 * leading one yields "/".
 */
static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
{
    int err;
    char *dir = NULL, *slash;
    struct stat statbuf;

    err = lstat(filename, &statbuf);

    if (err || (!S_ISLNK(statbuf.st_mode))) {
        /* no such file, or file is not a link; these are the "normal"
         * cases where we can just return the parent directory.
         */
        dir = ADIOI_Strdup(filename);
    }
    else {
        /* filename is a symlink.  we've presumably already tried
         * to stat it and found it to be missing (dangling link),
         * but this code doesn't care if the target is really there
         * or not.
         */
        int namelen;
        char *linkbuf;

        linkbuf = ADIOI_Malloc(PATH_MAX+1);
        /* Request at most PATH_MAX bytes: the buffer holds PATH_MAX+1,
         * so the terminator written below always stays in bounds.
         */
        namelen = readlink(filename, linkbuf, PATH_MAX);
        if (namelen == -1) {
            /* something strange has happened between the time that
             * we determined that this was a link and the time that
             * we attempted to read it; punt and use the old name.
             */
            dir = ADIOI_Strdup(filename);
        }
        else {
            /* successfully read the link */
            linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */
            dir = ADIOI_Strdup(linkbuf);
        }
        /* the scratch buffer is no longer needed on either path */
        ADIOI_Free(linkbuf);
    }

    /* Truncate the copy in place at the last path separator. */
    slash = strrchr(dir, '/');
    if (!slash) {
        /* no directory component: the parent is the current directory.
         * NOTE(review): this writes 2 bytes into dir, which assumes the
         * duplicated name was non-empty -- confirm callers never pass "".
         */
        ADIOI_Strncpy(dir, ".", 2);
    }
    else {
        if (slash == dir) *(dir + 1) = '\0';   /* parent is the root "/" */
        else *slash = '\0';
    }

    *dirnamep = dir;
    return;
}
/* scaleable_stat
 *
 * Has rank 0 of fd->comm query the file system once and broadcasts the
 * result, so that the other ranks do not each issue their own stat calls.
 *
 * Rank 0 collects two values:
 *   buf[0] - st_blksize from stat64() on fd->filename; if that call fails
 *            the block size already stored in fd->fs_ptr is kept.
 *   buf[1] - f_type from statfs() on fd->filename, falling back to
 *            statfs() on the file's parent directory, and finally to -1
 *            when both fail.
 *
 * After the broadcast every rank stores the block size in its
 * ADIOI_BGL_fs structure.  When the file system type is
 * GPFS_SUPER_MAGIC or PVFS2_SUPER_MAGIC, fsync aggregation is enabled
 * and rank 0 is additionally flagged as the fsync aggregator; for any
 * other type fsync_aggr is left as the caller set it.
 *
 * fd->fs_ptr must already point to an ADIOI_BGL_fs before this is called.
 */
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bgl_stat;
    struct statfs bgl_statfs;
    int rank, rc;
    char * dir;
    long buf[2];
    MPI_Comm_rank(fd->comm, &rank);

    if (rank == 0) {
        /* Start from the block size currently recorded for this file so
         * that a failed stat64 does not broadcast an indeterminate value.
         */
        buf[0] = ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize;

        /* Get the (real) underlying file system block size */
        rc = stat64(fd->filename, &bgl_stat);
        if (rc >= 0)
        {
            buf[0] = bgl_stat.st_blksize;
            DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                    fd->filename,bgl_stat.st_blksize);
        }
        else
        {
            DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                    fd->filename,rc,errno);
        }

        /* Get the (real) underlying file system type so we can
         * plan our fsync scaling strategy */
        rc = statfs(fd->filename,&bgl_statfs);
        if (rc >= 0)
        {
            DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#X\n",
                    fd->filename,bgl_statfs.f_type);
            buf[1] = bgl_statfs.f_type;
        }
        else
        {
            DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
                    fd->filename,rc,errno);

            /* retry against the directory that contains the file */
            ADIO_FileSysType_parentdir(fd->filename, &dir);
            rc = statfs(dir,&bgl_statfs);
            if (rc >= 0)
            {
                DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#X\n",dir,bgl_statfs.f_type);
                buf[1] = bgl_statfs.f_type;
            }
            else
            {
                /* Hmm.  Guess we'll assume the worst-case, that it's not GPFS
                 * or PVFS2 below */
                buf[1] = -1; /* bogus magic number */
                DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
            }
            /* dir was produced by ADIOI_Strdup, so release it through the
             * matching ADIOI wrapper rather than plain free() */
            ADIOI_Free(dir);
        }
    }

    /* now we can broadcast the stat/statfs data to everyone else */
    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->comm);
    bgl_stat.st_blksize = buf[0];
    bgl_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = bgl_stat.st_blksize;

    /* data from statfs */
    if ((bgl_statfs.f_type == GPFS_SUPER_MAGIC) ||
        (bgl_statfs.f_type == PVFS2_SUPER_MAGIC))
    {
        ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BGL_FSYNC_AGGREGATION_ENABLED;

        /* Only one rank is an "fsync aggregator" because only one
         * fsync is needed; on every other rank aggregation is enabled
         * but the rank is not an aggregator */
        if (rank == 0)
        {
            ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr |=
                ADIOI_BGL_FSYNC_AGGREGATOR;
            DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
        }
    }
    /* Other filesystems default to no fsync aggregation */
}
void ADIOI_BGL_Open(ADIO_File fd, int *error_code) void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
{ {
int perm, old_mask, amode; int perm, old_mask, amode;
@ -41,8 +216,14 @@ void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
amode = amode | O_RDWR; amode = amode | O_RDWR;
if (fd->access_mode & ADIO_EXCL) if (fd->access_mode & ADIO_EXCL)
amode = amode | O_EXCL; amode = amode | O_EXCL;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
fd->fd_sys = open(fd->filename, amode, perm); fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
fd->fd_direct = -1; fd->fd_direct = -1;
if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
@ -51,17 +232,28 @@ void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
if(fd->fd_sys != -1) if(fd->fd_sys != -1)
{ {
struct stat64 bgl_stat; struct stat64 bgl_stat;
int rc = stat64(fd->filename,&bgl_stat); struct statfs bgl_statfs;
if (rc >= 0) char* dir;
{ int rc;
/* store the blksize in the file system specific storage */
/* Initialize the ad_bgl file system specific information */
AD_BGL_assert(fd->fs_ptr == NULL); AD_BGL_assert(fd->fs_ptr == NULL);
fd->fs_ptr = (ADIOI_BGL_fs*) ADIOI_Malloc(sizeof(ADIOI_BGL_fs)); fd->fs_ptr = (ADIOI_BGL_fs*) ADIOI_Malloc(sizeof(ADIOI_BGL_fs));
((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = bgl_stat.st_blksize;
/* FPRINTF(stderr,"%s(%d):Successful stat '%s'. Blocksize=%ld\n",myname,__LINE__,fd->filename,bgl_stat.st_blksize);*/ ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */
}
/* else /* default is no fsync aggregation */
FPRINTF(stderr,"%s(%d):Stat '%s' failed with rc=%d, errno=%d\n",myname,__LINE__,fd->filename,rc,errno);*/ ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr =
ADIOI_BGL_FSYNC_AGGREGATION_DISABLED;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
scaleable_stat(fd);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif
} }
if (fd->fd_sys == -1) { if (fd->fd_sys == -1) {
@ -112,3 +304,6 @@ void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
} }
else *error_code = MPI_SUCCESS; else *error_code = MPI_SUCCESS;
} }
/*
*vim: ts=8 sts=4 sw=4 noexpandtab
*/

Просмотреть файл

@ -8,6 +8,7 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* /*
*
* Copyright (C) 1997 University of Chicago. * Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory. * See COPYRIGHT notice in top-level directory.
*/ */
@ -22,18 +23,25 @@
#include "mpe.h" #include "mpe.h"
#endif #endif
#ifdef USE_DBG_LOGGING
#define RDCOLL_DEBUG 1
#endif
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
/* prototypes of functions used for collective reads only. */ /* prototypes of functions used for collective reads only. */
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
datatype, int nprocs, datatype, int nprocs,
int myrank, ADIOI_Access int myrank, ADIOI_Access
*others_req, ADIO_Offset *offset_list, *others_req, ADIO_Offset *offset_list,
int *len_list, int contig_access_count, ADIO_Offset *len_list, int contig_access_count,
ADIO_Offset ADIO_Offset
min_st_offset, ADIO_Offset fd_size, min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end, ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code); int *buf_idx, int *error_code);
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, ADIO_Offset *offset_list, int *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
*len_list, int *send_size, int *recv_size, *len_list, int *send_size, int *recv_size,
int *count, int *start_pos, int *count, int *start_pos,
int *partial_send, int *partial_send,
@ -47,7 +55,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
int iter, int iter,
MPI_Aint buftype_extent, int *buf_idx); MPI_Aint buftype_extent, int *buf_idx);
static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, ADIO_Offset *offset_list, int *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
*len_list, int *send_size, int *recv_size, *len_list, int *send_size, int *recv_size,
int *count, int *start_pos, int *count, int *start_pos,
int *partial_send, int *partial_send,
@ -62,8 +70,8 @@ static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatl
MPI_Aint buftype_extent, int *buf_idx); MPI_Aint buftype_extent, int *buf_idx);
static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **recv_buf, ADIO_Offset *flat_buf, char **recv_buf, ADIO_Offset
*offset_list, int *len_list, *offset_list, ADIO_Offset *len_list,
int *recv_size, unsigned *recv_size,
MPI_Request *requests, MPI_Status *statuses, MPI_Request *requests, MPI_Status *statuses,
int *recd_from_proc, int nprocs, int *recd_from_proc, int nprocs,
int contig_access_count, int contig_access_count,
@ -74,7 +82,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
datatype, int file_ptr_type, ADIO_Offset datatype, int file_ptr_type, ADIO_Offset
offset, ADIO_Offset **offset_list_ptr, int offset, ADIO_Offset **offset_list_ptr, ADIO_Offset
**len_list_ptr, ADIO_Offset *start_offset_ptr, **len_list_ptr, ADIO_Offset *start_offset_ptr,
ADIO_Offset *end_offset_ptr, int ADIO_Offset *end_offset_ptr, int
*contig_access_count_ptr); *contig_access_count_ptr);
@ -99,25 +107,15 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
whose request lies in this process's file domain. */ whose request lies in this process's file domain. */
int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
int contig_access_count, interleave_count = 0, buftype_is_contig; int contig_access_count=0, interleave_count = 0, buftype_is_contig;
int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off; ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL, ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
*fd_end = NULL, *end_offsets = NULL; *fd_end = NULL, *end_offsets = NULL;
ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL; ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
int ii; int ii;
int *len_list = NULL, *buf_idx = NULL; ADIO_Offset *len_list = NULL;
int *buf_idx = NULL;
double io_time = 0., all_time, max_all_time;
double tstep1, max_tstep1;
double tstep1_1, max_tstep1_1;
double tstep1_2, max_tstep1_2;
double tstep1_3, max_tstep1_3;
double tstep2, max_tstep2;
double tstep3, max_tstep3;
double tstep4, max_tstep4;
double sum_sz;
#if BGL_PROFILE #if BGL_PROFILE
BGLMPIO_T_CIO_RESET( 0, r ) BGLMPIO_T_CIO_RESET( 0, r )
#endif #endif
@ -126,6 +124,14 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
int bufsize, size; int bufsize, size;
#endif #endif
#if 0
/* From common code - not implemented for bgl. */
if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype,
file_ptr_type, offset, status, error_code);
return;
} */
#endif
#ifdef PROFILE #ifdef PROFILE
MPE_Log_event(13, 0, "start computation"); MPE_Log_event(13, 0, "start computation");
#endif #endif
@ -157,14 +163,16 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP ) BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP )
#endif #endif
/* for (i=0; i<contig_access_count; i++) { #ifdef RDCOLL_DEBUG
FPRINTF(stderr, "rank %d off %ld len %d\n", myrank, offset_list[i], for (i=0; i<contig_access_count; i++) {
len_list[i]); DBG_FPRINTF(stderr, "rank %d off %lld len %lld\n",
}*/ myrank, offset_list[i], len_list[i]);
}
#endif
/* each process communicates its start and end offsets to other /* each process communicates its start and end offsets to other
processes. The result is an array each of start and end offsets stored processes. The result is an array each of start and end offsets
in order of process rank. */ stored in order of process rank. */
st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset)); st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset)); end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
@ -200,7 +208,9 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
/* are the accesses of different processes interleaved? */ /* are the accesses of different processes interleaved? */
for (i=1; i<nprocs; i++) for (i=1; i<nprocs; i++)
if (st_offsets[i] < end_offsets[i-1]) interleave_count++; if ((st_offsets[i] < end_offsets[i-1]) &&
(st_offsets[i] <= end_offsets[i]))
interleave_count++;
/* This is a rudimentary check for interleaving, but should suffice /* This is a rudimentary check for interleaving, but should suffice
for the moment. */ for the moment. */
} }
@ -223,7 +233,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
if (buftype_is_contig && filetype_is_contig) { if (buftype_is_contig && filetype_is_contig) {
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
off = fd->disp + (fd->etype_size) * offset; off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
off, status, error_code); off, status, error_code);
} }
@ -263,7 +273,9 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
else else
ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
nprocs_for_coll, &min_st_offset, nprocs_for_coll, &min_st_offset,
&fd_start, &fd_end, &fd_size); &fd_start, &fd_end,
fd->hints->min_fdomain_size, &fd_size,
fd->hints->striping_unit);
#if BGL_PROFILE #if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART ) BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
@ -381,205 +393,11 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
fd->fp_sys_posn = -1; /* set it to null. */ fd->fp_sys_posn = -1; /* set it to null. */
} }
#if 0
void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
datatype, int file_ptr_type, ADIO_Offset
offset, ADIO_Offset **offset_list_ptr, int
**len_list_ptr, ADIO_Offset *start_offset_ptr,
ADIO_Offset *end_offset_ptr, int
*contig_access_count_ptr)
{
int filetype_size, buftype_size, etype_size;
int i, j, k, frd_size=0, old_frd_size=0, st_index=0;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
int contig_access_count, *len_list, flag, filetype_is_contig;
MPI_Aint filetype_extent, filetype_lb;
ADIOI_Flatlist_node *flat_file;
ADIO_Offset *offset_list, off, end_offset=0, disp;
/* For this process's request, calculate the list of offsets and
lengths in the file and determine the start and end offsets. */
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
MPI_Type_size(fd->filetype, &filetype_size);
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_lb(fd->filetype, &filetype_lb);
MPI_Type_size(datatype, &buftype_size);
etype_size = fd->etype_size;
if ( ! filetype_size ) {
*contig_access_count_ptr = 0;
*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
*len_list_ptr = (int *) ADIOI_Malloc(2*sizeof(int));
/* 2 is for consistency. everywhere I malloc one more than needed */
offset_list = *offset_list_ptr;
len_list = *len_list_ptr;
offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
len_list[0] = 0;
*start_offset_ptr = offset_list[0];
*end_offset_ptr = offset_list[0] + len_list[0] - 1;
return;
}
if (filetype_is_contig) {
*contig_access_count_ptr = 1;
*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
*len_list_ptr = (int *) ADIOI_Malloc(2*sizeof(int));
/* 2 is for consistency. everywhere I malloc one more than needed */
offset_list = *offset_list_ptr;
len_list = *len_list_ptr;
offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
len_list[0] = bufcount * buftype_size;
*start_offset_ptr = offset_list[0];
*end_offset_ptr = offset_list[0] + len_list[0] - 1;
/* update file pointer */
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = *end_offset_ptr + 1;
}
else {
/* First calculate what size of offset_list and len_list to allocate */
/* filetype already flattened in ADIO_Open or ADIO_Fcntl */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent +
flat_file->blocklens[i] >= offset)
{
st_index = i;
frd_size = (int) (disp + flat_file->indices[i] +
(ADIO_Offset) n_filetypes*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
break;
}
}
}
}
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
frd_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
}
/* calculate how much space to allocate for offset_list, len_list */
old_frd_size = frd_size;
contig_access_count = i = 0;
j = st_index;
bufsize = buftype_size * bufcount;
frd_size = ADIOI_MIN(frd_size, bufsize);
while (i < bufsize) {
if (frd_size) contig_access_count++;
i += frd_size;
j = (j + 1) % flat_file->count;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
}
/* allocate space for offset_list and len_list */
*offset_list_ptr = (ADIO_Offset *)
ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset));
*len_list_ptr = (int *) ADIOI_Malloc((contig_access_count+1)*sizeof(int));
/* +1 to avoid a 0-size malloc */
offset_list = *offset_list_ptr;
len_list = *len_list_ptr;
/* find start offset, end offset, and fill in offset_list and len_list */
*start_offset_ptr = offset; /* calculated above */
i = k = 0;
j = st_index;
off = offset;
frd_size = ADIOI_MIN(old_frd_size, bufsize);
while (i < bufsize) {
if (frd_size) {
offset_list[k] = off;
len_list[k] = frd_size;
k++;
}
i += frd_size;
end_offset = off + frd_size - 1;
/* Note: end_offset points to the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
if (off + frd_size < disp + flat_file->indices[j] +
flat_file->blocklens[j] +
(ADIO_Offset) n_filetypes*filetype_extent)
{
off += frd_size;
/* did not reach end of contiguous block in filetype.
* no more I/O needed. off is incremented by frd_size.
*/
}
else {
if (j < (flat_file->count - 1)) j++;
else {
/* hit end of flattened filetype;
* start at beginning again
*/
j = 0;
n_filetypes++;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
}
}
/* update file pointer */
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
*contig_access_count_ptr = contig_access_count;
*end_offset_ptr = end_offset;
}
}
#endif
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
datatype, int nprocs, datatype, int nprocs,
int myrank, ADIOI_Access int myrank, ADIOI_Access
*others_req, ADIO_Offset *offset_list, *others_req, ADIO_Offset *offset_list,
int *len_list, int contig_access_count, ADIO_Offset ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset fd_size, min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end, ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code) int *buf_idx, int *error_code)
@ -594,19 +412,21 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
array from a file, where each local array is 8Mbytes, requiring array from a file, where each local array is 8Mbytes, requiring
at least another 8Mbytes of temp space is unacceptable. */ at least another 8Mbytes of temp space is unacceptable. */
int i, j, m, size, ntimes, max_ntimes, buftype_is_contig; int i, j, m, ntimes, max_ntimes, buftype_is_contig;
ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off; ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
char *read_buf = NULL, *tmp_buf; char *read_buf = NULL, *tmp_buf;
int *curr_offlen_ptr, *count, *send_size, *recv_size; int *curr_offlen_ptr, *count, *send_size, *recv_size;
int *partial_send, *recd_from_proc, *start_pos, for_next_iter; int *partial_send, *recd_from_proc, *start_pos;
int real_size, req_len, flag, for_curr_iter, rank; /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
int req_len, flag, rank;
MPI_Status status; MPI_Status status;
ADIOI_Flatlist_node *flat_buf=NULL; ADIOI_Flatlist_node *flat_buf=NULL;
MPI_Aint buftype_extent; MPI_Aint buftype_extent;
int coll_bufsize; int coll_bufsize;
#ifdef RDCOLL_DEBUG
int iii; int iii;
#endif
*error_code = MPI_SUCCESS; /* changed below if error */ *error_code = MPI_SUCCESS; /* changed below if error */
/* only I/O errors are currently reported */ /* only I/O errors are currently reported */
@ -738,7 +558,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
#ifdef PROFILE #ifdef PROFILE
MPE_Log_event(13, 0, "start computation"); MPE_Log_event(13, 0, "start computation");
#endif #endif
size = (int) (ADIOI_MIN(coll_bufsize, end_loc-st_loc+1-done)); size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);
real_off = off - for_curr_iter; real_off = off - for_curr_iter;
real_size = size + for_curr_iter; real_size = size + for_curr_iter;
@ -746,7 +566,9 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
for_next_iter = 0; for_next_iter = 0;
for (i=0; i<nprocs; i++) { for (i=0; i<nprocs; i++) {
/* FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count); */ #ifdef RDCOLL_DEBUG
DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count);
#endif
if (others_req[i].count) { if (others_req[i].count) {
start_pos[i] = curr_offlen_ptr[i]; start_pos[i] = curr_offlen_ptr[i];
for (j=curr_offlen_ptr[i]; j<others_req[i].count; for (j=curr_offlen_ptr[i]; j<others_req[i].count;
@ -769,22 +591,22 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
} }
if (req_off < real_off + real_size) { if (req_off < real_off + real_size) {
count[i]++; count[i]++;
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
MPI_Address(read_buf+req_off-real_off, MPI_Address(read_buf+req_off-real_off,
&(others_req[i].mem_ptrs[j])); &(others_req[i].mem_ptrs[j]));
send_size[i] += (int)(ADIOI_MIN(real_off + (ADIO_Offset)real_size - ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
req_off, req_len)); send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
(ADIO_Offset)(unsigned)req_len));
if (real_off+real_size-req_off < req_len) { if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
partial_send[i] = (int) (real_off+real_size- partial_send[i] = (int) (real_off + real_size - req_off);
req_off);
if ((j+1 < others_req[i].count) && if ((j+1 < others_req[i].count) &&
(others_req[i].offsets[j+1] < (others_req[i].offsets[j+1] <
real_off+real_size)) { real_off+real_size)) {
/* this is the case illustrated in the /* this is the case illustrated in the
figure above. */ figure above. */
for_next_iter = (int) (ADIOI_MAX(for_next_iter, for_next_iter = ADIOI_MAX(for_next_iter,
real_off + real_size - real_off + real_size - others_req[i].offsets[j+1]);
others_req[i].offsets[j+1]));
/* max because it must cover requests /* max because it must cover requests
from different processes */ from different processes */
} }
@ -805,13 +627,14 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
MPE_Log_event(14, 0, "end computation"); MPE_Log_event(14, 0, "end computation");
#endif #endif
if (flag) { if (flag) {
ADIO_ReadContig(fd, read_buf+for_curr_iter, size, MPI_BYTE, ADIOI_Assert(size == (int)size);
ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
ADIO_EXPLICIT_OFFSET, off, &status, error_code); ADIO_EXPLICIT_OFFSET, off, &status, error_code);
/* #ifdef RDCOLL_DEBUG
printf( "\tread_coll: 700, data read [%3d] = ", size ); DBG_FPRINTF(stderr, "\tread_coll: 700, data read [%lld] = ", size );
for (iii=0; iii<size; iii++) { printf( "%3d,", *((unsigned char *)read_buf + for_curr_iter + iii) ); } for (iii=0; iii<size && iii<80; iii++) { DBGV_FPRINTF(stderr, "%3d,", *((unsigned char *)read_buf + for_curr_iter + iii) ); }
printf( "\n" ); DBG_FPRINTF(stderr, "\n" );
*/ #endif
if (*error_code != MPI_SUCCESS) return; if (*error_code != MPI_SUCCESS) return;
} }
@ -849,6 +672,8 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
if (for_next_iter) { if (for_next_iter) {
tmp_buf = (char *) ADIOI_Malloc(for_next_iter); tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter); memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
ADIOI_Free(read_buf); ADIOI_Free(read_buf);
read_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize); read_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
@ -902,7 +727,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
} }
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, ADIO_Offset *offset_list, int *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
*len_list, int *send_size, int *recv_size, *len_list, int *send_size, int *recv_size,
int *count, int *start_pos, int *partial_send, int *count, int *start_pos, int *partial_send,
int *recd_from_proc, int nprocs, int *recd_from_proc, int nprocs,
@ -937,6 +762,10 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
/* post recvs. if buftype_is_contig, data can be directly recd. into /* post recvs. if buftype_is_contig, data can be directly recd. into
user buf at location given by buf_idx. else use recv_buf. */ user buf at location given by buf_idx. else use recv_buf. */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5032, 0, NULL);
#endif
if (buftype_is_contig) { if (buftype_is_contig) {
j = 0; j = 0;
for (i=0; i < nprocs; i++) for (i=0; i < nprocs; i++)
@ -960,8 +789,10 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
myrank+i+100*iter, fd->comm, requests+j); myrank+i+100*iter, fd->comm, requests+j);
j++; j++;
/* FPRINTF(stderr, "node %d, recv_size %d, tag %d \n", #ifdef RDCOLL_DEBUG
myrank, recv_size[i], myrank+i+100*iter); */ DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
myrank, recv_size[i], myrank+i+100*iter);
#endif
} }
} }
@ -1006,7 +837,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
/* if noncontiguous, to the copies from the recv buffers */ /* if noncontiguous, to the copies from the recv buffers */
if (!buftype_is_contig) if (!buftype_is_contig)
ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf, ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
offset_list, len_list, recv_size, offset_list, len_list, (unsigned*)recv_size,
requests, statuses, recd_from_proc, requests, statuses, recd_from_proc,
nprocs, contig_access_count, nprocs, contig_access_count,
min_st_offset, fd_size, fd_start, fd_end, min_st_offset, fd_size, fd_start, fd_end,
@ -1024,9 +855,11 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
if (recv_size[i]) ADIOI_Free(recv_buf[i]); if (recv_size[i]) ADIOI_Free(recv_buf[i]);
ADIOI_Free(recv_buf); ADIOI_Free(recv_buf);
} }
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5033, 0, NULL);
#endif
} }
#define ADIOI_BUF_INCR \ #define ADIOI_BUF_INCR \
{ \ { \
while (buf_incr) { \ while (buf_incr) { \
@ -1040,7 +873,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
n_buftypes++; \ n_buftypes++; \
} \ } \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
n_buftypes*buftype_extent; \ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \ } \
buf_incr -= size_in_buf; \ buf_incr -= size_in_buf; \
@ -1052,9 +885,11 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
{ \ { \
while (size) { \ while (size) { \
size_in_buf = ADIOI_MIN(size, flat_buf_sz); \ size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)(buf + user_buf_idx)); \
ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
memcpy(((char *) buf) + user_buf_idx, \ memcpy(((char *) buf) + user_buf_idx, \
&(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \ &(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \
recv_buf_idx[p] += size_in_buf; \ recv_buf_idx[p] += size_in_buf; /* already tested (size_t)size_in_buf*/ \
user_buf_idx += size_in_buf; \ user_buf_idx += size_in_buf; \
flat_buf_sz -= size_in_buf; \ flat_buf_sz -= size_in_buf; \
if (!flat_buf_sz) { \ if (!flat_buf_sz) { \
@ -1064,7 +899,7 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
n_buftypes++; \ n_buftypes++; \
} \ } \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
n_buftypes*buftype_extent; \ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \ } \
size -= size_in_buf; \ size -= size_in_buf; \
@ -1073,11 +908,10 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
ADIOI_BUF_INCR \ ADIOI_BUF_INCR \
} }
static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **recv_buf, ADIO_Offset *flat_buf, char **recv_buf, ADIO_Offset
*offset_list, int *len_list, *offset_list, ADIO_Offset *len_list,
int *recv_size, unsigned *recv_size,
MPI_Request *requests, MPI_Status *statuses, MPI_Request *requests, MPI_Status *statuses,
int *recd_from_proc, int nprocs, int *recd_from_proc, int nprocs,
int contig_access_count, int contig_access_count,
@ -1086,13 +920,18 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
ADIO_Offset *fd_end, ADIO_Offset *fd_end,
MPI_Aint buftype_extent) MPI_Aint buftype_extent)
{ {
/* this function is only called if buftype is not contig */ /* this function is only called if buftype is not contig */
int i, p, flat_buf_idx, size, buf_incr; int i, p, flat_buf_idx;
int flat_buf_sz, size_in_buf, n_buftypes; ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
int n_buftypes;
ADIO_Offset off, len, rem_len, user_buf_idx; ADIO_Offset off, len, rem_len, user_buf_idx;
/* Not sure unsigned is necessary, but it makes the math safer */
unsigned *curr_from_proc, *done_from_proc, *recv_buf_idx;
int *curr_from_proc, *done_from_proc, *recv_buf_idx; ADIOI_UNREFERENCED_ARG(requests);
ADIOI_UNREFERENCED_ARG(statuses);
/* curr_from_proc[p] = amount of data recd from proc. p that has already /* curr_from_proc[p] = amount of data recd from proc. p that has already
been accounted for so far been accounted for so far
@ -1100,9 +939,9 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
filled into user buffer in previous iterations filled into user buffer in previous iterations
user_buf_idx = current location in user buffer user_buf_idx = current location in user buffer
recv_buf_idx[p] = current location in recv_buf of proc. p */ recv_buf_idx[p] = current location in recv_buf of proc. p */
curr_from_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); curr_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
done_from_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); done_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
recv_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int)); recv_buf_idx = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
for (i=0; i < nprocs; i++) { for (i=0; i < nprocs; i++) {
recv_buf_idx[i] = curr_from_proc[i] = 0; recv_buf_idx[i] = curr_from_proc[i] = 0;
@ -1120,7 +959,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
for (i=0; i<contig_access_count; i++) { for (i=0; i<contig_access_count; i++) {
off = offset_list[i]; off = offset_list[i];
rem_len = (ADIO_Offset) len_list[i]; rem_len = len_list[i];
/* this request may span the file domains of more than one process */ /* this request may span the file domains of more than one process */
while (rem_len > 0) { while (rem_len > 0) {
@ -1140,29 +979,32 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
if (recv_buf_idx[p] < recv_size[p]) { if (recv_buf_idx[p] < recv_size[p]) {
if (curr_from_proc[p]+len > done_from_proc[p]) { if (curr_from_proc[p]+len > done_from_proc[p]) {
if (done_from_proc[p] > curr_from_proc[p]) { if (done_from_proc[p] > curr_from_proc[p]) {
size = (int)ADIOI_MIN(curr_from_proc[p] + len - size = ADIOI_MIN(curr_from_proc[p] + len -
done_from_proc[p], recv_size[p]-recv_buf_idx[p]); done_from_proc[p], recv_size[p]-recv_buf_idx[p]);
buf_incr = done_from_proc[p] - curr_from_proc[p]; buf_incr = done_from_proc[p] - curr_from_proc[p];
ADIOI_BUF_INCR ADIOI_BUF_INCR
buf_incr = (int)(curr_from_proc[p]+len-done_from_proc[p]); buf_incr = curr_from_proc[p]+len-done_from_proc[p];
ADIOI_Assert((done_from_proc[p] + size) == (unsigned)((ADIO_Offset)done_from_proc[p] + size));
curr_from_proc[p] = done_from_proc[p] + size; curr_from_proc[p] = done_from_proc[p] + size;
ADIOI_BUF_COPY ADIOI_BUF_COPY
} }
else { else {
size = (int)ADIOI_MIN(len,recv_size[p]-recv_buf_idx[p]); size = ADIOI_MIN(len,recv_size[p]-recv_buf_idx[p]);
buf_incr = (int)len; buf_incr = len;
curr_from_proc[p] += size; ADIOI_Assert((curr_from_proc[p] + size) == (unsigned)((ADIO_Offset)curr_from_proc[p] + size));
curr_from_proc[p] += (unsigned) size;
ADIOI_BUF_COPY ADIOI_BUF_COPY
} }
} }
else { else {
curr_from_proc[p] += (int)len; ADIOI_Assert((curr_from_proc[p] + len) == (unsigned)((ADIO_Offset)curr_from_proc[p] + len));
buf_incr = (int)len; curr_from_proc[p] += (unsigned) len;
buf_incr = len;
ADIOI_BUF_INCR ADIOI_BUF_INCR
} }
} }
else { else {
buf_incr = (int)len; buf_incr = len;
ADIOI_BUF_INCR ADIOI_BUF_INCR
} }
off += len; off += len;
@ -1179,7 +1021,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
static void ADIOI_R_Exchange_data_alltoallv( static void ADIOI_R_Exchange_data_alltoallv(
ADIO_File fd, void *buf, ADIOI_Flatlist_node ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, ADIO_Offset *offset_list, int *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
*len_list, int *send_size, int *recv_size, *len_list, int *send_size, int *recv_size,
int *count, int *start_pos, int *partial_send, int *count, int *start_pos, int *partial_send,
int *recd_from_proc, int nprocs, int *recd_from_proc, int nprocs,
@ -1192,9 +1034,8 @@ static void ADIOI_R_Exchange_data_alltoallv(
{ {
int i, j, k=0, tmp=0, nprocs_recv, nprocs_send; int i, j, k=0, tmp=0, nprocs_recv, nprocs_send;
char **recv_buf = NULL; char **recv_buf = NULL;
MPI_Request *requests; MPI_Request *requests=NULL;
MPI_Datatype send_type; MPI_Status *statuses=NULL;
MPI_Status *statuses;
int rtail, stail; int rtail, stail;
char *sbuf_ptr, *from_ptr; char *sbuf_ptr, *from_ptr;
int len; int len;
@ -1238,7 +1079,8 @@ static void ADIOI_R_Exchange_data_alltoallv(
} }
sbuf_ptr = all_send_buf + sdispls[i]; sbuf_ptr = all_send_buf + sdispls[i];
for (j=0; j<count[i]; j++) { for (j=0; j<count[i]; j++) {
from_ptr = (char *)( others_req[i].mem_ptrs[ start_pos[i]+j ] ); ADIOI_ENSURE_AINT_FITS_IN_PTR( others_req[i].mem_ptrs[ start_pos[i]+j ]);
from_ptr = (char *) ADIOI_AINT_CAST_TO_VOID_PTR ( others_req[i].mem_ptrs[ start_pos[i]+j ] );
len = others_req[i].lens[ start_pos[i]+j ] ; len = others_req[i].lens[ start_pos[i]+j ] ;
memcpy( sbuf_ptr, from_ptr, len ); memcpy( sbuf_ptr, from_ptr, len );
sbuf_ptr += len; sbuf_ptr += len;
@ -1247,26 +1089,19 @@ static void ADIOI_R_Exchange_data_alltoallv(
} }
} }
#if 0 #if RDCOLL_DEBUG
printf( "\tsend_size = " ); DBG_FPRINTF(stderr, "\tsend_size = [%d]%2d,",0,send_size[0]);
for (i=0; i<nprocs; i++) { printf( "%2d,", send_size[i] ); } for (i=1; i<nprocs; i++) if(send_size[i-1]!=send_size[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,send_size[i] ); }
printf( "\n" ); DBG_FPRINTF(stderr, "\trecv_size = [%d]%2d,",0,recv_size[0]);
printf( "\trecv_size = " ); for (i=1; i<nprocs; i++) if(recv_size[i-1]!=recv_size[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,recv_size[i] ); }
for (i=0; i<nprocs; i++) { printf( "%2d,", recv_size[i] ); } DBG_FPRINTF(stderr, "\tsdispls = [%d]%2d,",0,sdispls[0]);
printf( "\n" ); for (i=1; i<nprocs; i++) if(sdispls[i-1]!=sdispls[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,sdispls [i] ); }
printf( "\tsdispls = " ); DBG_FPRINTF(stderr, "\trdispls = [%d]%2d,",0,rdispls[0]);
for (i=0; i<nprocs; i++) { printf( "%2d,", sdispls [i] ); } for (i=1; i<nprocs; i++) if(rdispls[i-1]!=rdispls[i]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i,rdispls [i] ); }
printf( "\n" ); DBG_FPRINTF(stderr, "\ttails = %4d, %4d\n", stail, rtail );
printf( "\trdispls = " );
for (i=0; i<nprocs; i++) { printf( "%2d,", rdispls [i] ); }
printf( "\n" );
printf( "\ttails = %4d, %4d\n", stail, rtail );
#endif
#if 0
if (nprocs_send) { if (nprocs_send) {
printf( "\tall_send_buf = " ); DBG_FPRINTF(stderr, "\tall_send_buf = [%d]%2d,",0,all_send_buf[0]);
for (i=0; i<nprocs; i++) { printf( "%2d,", all_send_buf [i*131072] ); } for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf [i*131072] ); }
printf( "\n" );
} }
#endif #endif
@ -1277,16 +1112,16 @@ static void ADIOI_R_Exchange_data_alltoallv(
fd->comm ); fd->comm );
#if 0 #if 0
printf( "\tall_recv_buf = " ); DBG_FPRINTF(stderr, "\tall_recv_buf = " );
for (i=131072; i<131073; i++) { printf( "%2d,", all_recv_buf [i] ); } for (i=131072; i<131073; i++) { DBG_FPRINTF(stderr, "%2d,", all_recv_buf [i] ); }
printf( "\n" ); DBG_FPRINTF(stderr, "\n" );
#endif #endif
/* unpack at the receiver side */ /* unpack at the receiver side */
if (nprocs_recv) { if (nprocs_recv) {
if (!buftype_is_contig) if (!buftype_is_contig)
ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf, ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
offset_list, len_list, recv_size, offset_list, len_list, (unsigned*)recv_size,
requests, statuses, /* never used inside */ requests, statuses, /* never used inside */
recd_from_proc, recd_from_proc,
nprocs, contig_access_count, nprocs, contig_access_count,

Просмотреть файл

@ -21,9 +21,9 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code) ADIO_Offset offset, ADIO_Status *status, int *error_code)
{ {
int err=-1, datatype_size, len; int err=-1, datatype_size;
ADIO_Offset len;
static char myname[] = "ADIOI_BGL_READCONTIG"; static char myname[] = "ADIOI_BGL_READCONTIG";
#if BGL_PROFILE #if BGL_PROFILE
/* timing */ /* timing */
double io_time, io_time2; double io_time, io_time2;
@ -35,7 +35,8 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
#endif #endif
MPI_Type_size(datatype, &datatype_size); MPI_Type_size(datatype, &datatype_size);
len = datatype_size * count; len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */
#if BGL_PROFILE #if BGL_PROFILE
@ -48,7 +49,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len); else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
if (bglmpio_timing2) io_time2 = MPI_Wtime(); if (bglmpio_timing2) io_time2 = MPI_Wtime();
err = read(fd->fd_sys, buf, len); err = read(fd->fd_sys, buf, (unsigned int)len);
if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2); if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len); ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_sys_posn = offset + err; fd->fp_sys_posn = offset + err;
@ -64,7 +65,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len); else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
if (bglmpio_timing2) io_time2 = MPI_Wtime(); if (bglmpio_timing2) io_time2 = MPI_Wtime();
err = read(fd->fd_sys, buf, len); err = read(fd->fd_sys, buf, (unsigned int)len);
if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2); if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len); ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_ind += err; fd->fp_ind += err;
@ -79,7 +80,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
if (fd->atomicity) if (fd->atomicity)
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len); else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
err = read(fd->fd_sys, buf, len); err = read(fd->fd_sys, buf, (unsigned int)len);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len); ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_sys_posn = offset + err; fd->fp_sys_posn = offset + err;
/* individual file pointer not updated */ /* individual file pointer not updated */
@ -91,7 +92,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
if (fd->atomicity) if (fd->atomicity)
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len); else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
err = read(fd->fd_sys, buf, len); err = read(fd->fd_sys, buf, (unsigned int)len);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len); ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_ind += err; fd->fp_ind += err;
fd->fp_sys_posn = fd->fp_ind; fd->fp_sys_posn = fd->fp_ind;
@ -120,12 +121,11 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
} }
#define ADIOI_BUFFERED_READ \ #define ADIOI_BUFFERED_READ \
{ \ { \
if (req_off >= readbuf_off + readbuf_len) { \ if (req_off >= readbuf_off + readbuf_len) { \
readbuf_off = req_off; \ readbuf_off = req_off; \
readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\ readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
lseek(fd->fd_sys, readbuf_off, SEEK_SET);\ lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\ if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
err = read(fd->fd_sys, readbuf, readbuf_len);\ err = read(fd->fd_sys, readbuf, readbuf_len);\
@ -133,6 +133,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
if (err == -1) err_flag = 1; \ if (err == -1) err_flag = 1; \
} \ } \
while (req_len > readbuf_off + readbuf_len - req_off) { \ while (req_len > readbuf_off + readbuf_len - req_off) { \
ADIOI_Assert((readbuf_off + readbuf_len - req_off) == (int) (readbuf_off + readbuf_len - req_off));\
partial_read = (int) (readbuf_off + readbuf_len - req_off); \ partial_read = (int) (readbuf_off + readbuf_len - req_off); \
tmp_buf = (char *) ADIOI_Malloc(partial_read); \ tmp_buf = (char *) ADIOI_Malloc(partial_read); \
memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \ memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
@ -141,7 +142,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
memcpy(readbuf, tmp_buf, partial_read); \ memcpy(readbuf, tmp_buf, partial_read); \
ADIOI_Free(tmp_buf); \ ADIOI_Free(tmp_buf); \
readbuf_off += readbuf_len-partial_read; \ readbuf_off += readbuf_len-partial_read; \
readbuf_len = (int) (partial_read + ADIOI_MIN(max_bufsize, \ readbuf_len = (unsigned) (partial_read + ADIOI_MIN(max_bufsize, \
end_offset-readbuf_off+1)); \ end_offset-readbuf_off+1)); \
lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\ lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\ if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
@ -149,6 +150,7 @@ void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
if (err == -1) err_flag = 1; \ if (err == -1) err_flag = 1; \
} \ } \
ADIOI_Assert(req_len == (size_t)req_len); \
memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \ memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
} }
@ -160,20 +162,23 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
{ {
/* offset is in units of etype relative to the filetype. */ /* offset is in units of etype relative to the filetype. */
ADIOI_Flatlist_node *flat_buf, *flat_file; ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, err=-1, brd_size, frd_size=0, st_index=0; ADIO_Offset i_offset, new_brd_size, brd_size, size;
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; int i, j, k, err=-1, st_index=0;
int n_filetypes, etype_in_filetype; ADIO_Offset frd_size=0, new_frd_size, st_frd_size;
unsigned num, bufsize;
int n_etypes_in_filetype;
ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
ADIO_Offset abs_off_in_filetype=0; ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size, req_len, partial_read; int filetype_size, etype_size, buftype_size, partial_read;
MPI_Aint filetype_extent, buftype_extent; MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig; int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset userbuf_off; ADIO_Offset userbuf_off, req_len, sum;
ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off; ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
char *readbuf, *tmp_buf, *value; char *readbuf, *tmp_buf, *value;
int flag, st_frd_size, st_n_filetypes, readbuf_len; int err_flag=0, info_flag;
int new_brd_size, new_frd_size, err_flag=0, info_flag, max_bufsize; unsigned max_bufsize, readbuf_len;
static char myname[] = "ADIOI_BGL_READSTRIDED"; static char myname[] = "ADIOI_BGL_READSTRIDED";
if (fd->hints->ds_read == ADIOI_HINT_DISABLE) { if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
@ -207,12 +212,13 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Type_extent(datatype, &buftype_extent); MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size; etype_size = fd->etype_size;
ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
bufsize = buftype_size * count; bufsize = buftype_size * count;
/* get max_bufsize from the info object. */ /* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag); &info_flag);
max_bufsize = atoi(value); max_bufsize = atoi(value);
ADIOI_Free(value); ADIOI_Free(value);
@ -226,13 +232,13 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
while (flat_buf->type != datatype) flat_buf = flat_buf->next; while (flat_buf->type != datatype) flat_buf = flat_buf->next;
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset; fd->disp + (ADIO_Offset)etype_size * offset;
start_off = off; start_off = off;
end_offset = off + bufsize - 1; end_offset = off + bufsize - 1;
readbuf_off = off; readbuf_off = off;
readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf = (char *) ADIOI_Malloc(max_bufsize);
readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1)); readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
/* if atomicity is true, lock (exclusive) the region to be accessed */ /* if atomicity is true, lock (exclusive) the region to be accessed */
if (fd->atomicity) if (fd->atomicity)
@ -245,13 +251,16 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
if (err == -1) err_flag = 1; if (err == -1) err_flag = 1;
for (j=0; j<count; j++) for (j=0; j<count; j++)
{
int i;
for (i=0; i<flat_buf->count; i++) { for (i=0; i<flat_buf->count; i++) {
userbuf_off = j*buftype_extent + flat_buf->indices[i]; userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
req_off = off; req_off = off;
req_len = flat_buf->blocklens[i]; req_len = flat_buf->blocklens[i];
ADIOI_BUFFERED_READ ADIOI_BUFFERED_READ
off += flat_buf->blocklens[i]; off += flat_buf->blocklens[i];
} }
}
if (fd->atomicity) if (fd->atomicity)
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
@ -277,29 +286,36 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
disp = fd->disp; disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) { if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */ /* Wei-keng reworked type processing to be a bit more efficient */
n_filetypes = -1; offset = fd->fp_ind - disp;
flag = 0; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
while (!flag) { offset -= (ADIO_Offset)n_filetypes * filetype_extent;
n_filetypes++; /* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) { for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] + ADIO_Offset dist;
(ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] if (flat_file->blocklens[i] == 0) continue;
>= offset) { dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
st_index = i; /* frd_size is from offset to the end of block i */
frd_size = (int) (disp + flat_file->indices[i] + if (dist == 0) {
(ADIO_Offset) n_filetypes*filetype_extent i++;
+ flat_file->blocklens[i] - offset); offset = flat_file->indices[i];
flag = 1; frd_size = flat_file->blocklens[i];
break;
}
if (dist > 0) {
frd_size = dist;
break; break;
} }
} }
} st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
} }
else { else {
n_etypes_in_filetype = filetype_size/etype_size; n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype); n_filetypes = offset / n_etypes_in_filetype;
etype_in_filetype = (int) (offset % n_etypes_in_filetype); etype_in_filetype = offset % n_etypes_in_filetype;
size_in_filetype = etype_in_filetype * etype_size; size_in_filetype = etype_in_filetype * etype_size;
sum = 0; sum = 0;
@ -315,32 +331,63 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
} }
/* abs. offset in bytes in the file */ /* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
} }
start_off = offset; start_off = offset;
/* Wei-keng Liao: read request is within a single flat_file contig
* block e.g. with subarray types that actually describe the whole
* array */
if (buftype_is_contig && bufsize <= frd_size) {
ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte that
* can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == frd_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed. /* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
st_frd_size = frd_size; st_frd_size = frd_size;
st_n_filetypes = n_filetypes; st_n_filetypes = n_filetypes;
i = 0; i_offset = 0;
j = st_index; j = st_index;
off = offset; off = offset;
frd_size = ADIOI_MIN(st_frd_size, bufsize); frd_size = ADIOI_MIN(st_frd_size, bufsize);
while (i < bufsize) { while (i_offset < bufsize) {
i += frd_size; i_offset += frd_size;
end_offset = off + frd_size - 1; end_offset = off + frd_size - 1;
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
} }
/* if atomicity is true, lock (exclusive) the region to be accessed */ /* if atomicity is true, lock (exclusive) the region to be accessed */
@ -350,7 +397,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
/* initial read into readbuf */ /* initial read into readbuf */
readbuf_off = offset; readbuf_off = offset;
readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf = (char *) ADIOI_Malloc(max_bufsize);
readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1)); readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
lseek(fd->fd_sys, offset, SEEK_SET); lseek(fd->fd_sys, offset, SEEK_SET);
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len); if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
@ -364,12 +411,12 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
/* contiguous in memory, noncontiguous in file. should be the most /* contiguous in memory, noncontiguous in file. should be the most
common case. */ common case. */
i = 0; i_offset = 0;
j = st_index; j = st_index;
off = offset; off = offset;
n_filetypes = st_n_filetypes; n_filetypes = st_n_filetypes;
frd_size = ADIOI_MIN(st_frd_size, bufsize); frd_size = ADIOI_MIN(st_frd_size, bufsize);
while (i < bufsize) { while (i_offset < bufsize) {
if (frd_size) { if (frd_size) {
/* TYPE_UB and TYPE_LB can result in /* TYPE_UB and TYPE_LB can result in
frd_size = 0. save system call in such cases */ frd_size = 0. save system call in such cases */
@ -378,25 +425,26 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
req_off = off; req_off = off;
req_len = frd_size; req_len = frd_size;
userbuf_off = i; userbuf_off = i_offset;
ADIOI_BUFFERED_READ ADIOI_BUFFERED_READ
} }
i += frd_size; i_offset += frd_size;
if (off + frd_size < disp + flat_file->indices[j] + if (off + frd_size < disp + flat_file->indices[j] +
flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
off += frd_size; off += frd_size;
/* did not reach end of contiguous block in filetype. /* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by frd_size. */ no more I/O needed. off is incremented by frd_size. */
else { else {
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent; n_filetypes*(ADIO_Offset)filetype_extent;
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
} }
} }
} }
@ -408,7 +456,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
while (flat_buf->type != datatype) flat_buf = flat_buf->next; while (flat_buf->type != datatype) flat_buf = flat_buf->next;
k = num = buf_count = 0; k = num = buf_count = 0;
i = (int) (flat_buf->indices[0]); i_offset = flat_buf->indices[0];
j = st_index; j = st_index;
off = offset; off = offset;
n_filetypes = st_n_filetypes; n_filetypes = st_n_filetypes;
@ -423,7 +471,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
req_off = off; req_off = off;
req_len = size; req_len = size;
userbuf_off = i; userbuf_off = i_offset;
ADIOI_BUFFERED_READ ADIOI_BUFFERED_READ
} }
@ -432,18 +480,19 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
if (size == frd_size) { if (size == frd_size) {
/* reached end of contiguous block in file */ /* reached end of contiguous block in file */
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent; n_filetypes*(ADIO_Offset)filetype_extent;
new_frd_size = flat_file->blocklens[j]; new_frd_size = flat_file->blocklens[j];
if (size != brd_size) { if (size != brd_size) {
i += size; i_offset += size;
new_brd_size -= size; new_brd_size -= size;
} }
} }
@ -453,7 +502,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
k = (k + 1)%flat_buf->count; k = (k + 1)%flat_buf->count;
buf_count++; buf_count++;
i = (int) (buftype_extent*(buf_count/flat_buf->count) + i_offset = ((ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
flat_buf->indices[k]); flat_buf->indices[k]);
new_brd_size = flat_buf->blocklens[k]; new_brd_size = flat_buf->blocklens[k];
if (size != frd_size) { if (size != frd_size) {
@ -461,6 +510,7 @@ void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
new_frd_size -= size; new_frd_size -= size;
} }
} }
ADIOI_Assert(((ADIO_Offset)num + size) == (unsigned)(num + size));
num += size; num += size;
frd_size = new_frd_size; frd_size = new_frd_size;
brd_size = new_brd_size; brd_size = new_brd_size;

Просмотреть файл

@ -3,7 +3,13 @@
/* ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- */
/** /**
* \file ad_bgl_tuning.c * \file ad_bgl_tuning.c
* \brief ??? * \brief defines ad_bgl performance tuning
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/ */
/*--------------------------------------------------------------------- /*---------------------------------------------------------------------
@ -26,6 +32,40 @@ double bglmpio_prof_cw [BGLMPIO_CIO_LAST];
double bglmpio_prof_cr [BGLMPIO_CIO_LAST]; double bglmpio_prof_cr [BGLMPIO_CIO_LAST];
/* set internal variables for tuning environment variables */ /* set internal variables for tuning environment variables */
/** \page env_vars Environment Variables
* - BGLMPIO_COMM - Define how data is exchanged on collective
* reads and writes. Possible values:
* - 0 - Use MPI_Alltoallv.
* - 1 - Use MPI_Isend/MPI_Irecv.
* - Default is 0.
*
* - BGLMPIO_TIMING - Collect a timing breakdown for MPI I/O collective calls.
* The library must also be compiled with BGL_PROFILE defined. Possible values:
* - 0 - Do not collect/report timing.
* - 1 - Collect/report timing.
* - Default is 0.
*
* - BGLMPIO_TIMING2 - Collect additional averages for MPI I/O collective calls.
* The library must also be compiled with BGL_PROFILE defined. Possible values:
* - 0 - Do not collect/report averages.
* - 1 - Collect/report averages.
* - Default is 0.
*
* - BGLMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
* for aggregator collective I/O. Possible values:
* - 0 - Use two MPI_Allgather calls to collect starting and ending offsets.
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
* - Default is 1.
*
* - BGLMPIO_TUNEBLOCKING - Tune how aggregate file domains are
* calculated (block size). Possible values:
* - 0 - Evenly calculate file domains across aggregators. Also use
* MPI_Isend/MPI_Irecv to exchange domain information.
* - 1 - Align file domains with the underlying file system's block size. Also use
* MPI_Alltoallv to exchange domain information.
* - Default is 1.
*
*/
void ad_bgl_get_env_vars() { void ad_bgl_get_env_vars() {
char *x; char *x;

Просмотреть файл

@ -18,6 +18,9 @@
#include "ad_bgl_pset.h" #include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h" #include "ad_bgl_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef PROFILE #ifdef PROFILE
#include "mpe.h" #include "mpe.h"
#endif #endif
@ -26,13 +29,13 @@
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
datatype, int nprocs, int myrank, ADIOI_Access datatype, int nprocs, int myrank, ADIOI_Access
*others_req, ADIO_Offset *offset_list, *others_req, ADIO_Offset *offset_list,
int *len_list, int contig_access_count, ADIO_Offset ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset fd_size, min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end, ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code); int *buf_idx, int *error_code);
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf, static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
ADIOI_Flatlist_node *flat_buf, ADIO_Offset ADIOI_Flatlist_node *flat_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size, *offset_list, ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off, int size, int *recv_size, ADIO_Offset off, int size,
int *count, int *start_pos, int *partial_recv, int *count, int *start_pos, int *partial_recv,
int *sent_to_proc, int nprocs, int *sent_to_proc, int nprocs,
@ -49,7 +52,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
char *write_buf, /* 1 */ char *write_buf, /* 1 */
ADIOI_Flatlist_node *flat_buf, ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list, ADIO_Offset *offset_list,
int *len_list, int *send_size, int *recv_size, ADIO_Offset *len_list, int *send_size, int *recv_size,
ADIO_Offset off, int size, /* 2 */ ADIO_Offset off, int size, /* 2 */
int *count, int *start_pos, int *partial_recv, int *count, int *start_pos, int *partial_recv,
int *sent_to_proc, int nprocs, int myrank, int *sent_to_proc, int nprocs, int myrank,
@ -65,7 +68,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
int *error_code); int *error_code);
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset *flat_buf, char **send_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size, *offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc, MPI_Request *requests, int *sent_to_proc,
int nprocs, int myrank, int nprocs, int myrank,
int contig_access_count, ADIO_Offset int contig_access_count, ADIO_Offset
@ -76,7 +79,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
MPI_Aint buftype_extent); MPI_Aint buftype_extent);
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset *flat_buf, char **send_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size, *offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc, MPI_Request *requests, int *sent_to_proc,
int nprocs, int myrank, int nprocs, int myrank,
int contig_access_count, ADIO_Offset int contig_access_count, ADIO_Offset
@ -118,26 +121,27 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL; ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
int ii; int ii;
int *buf_idx = NULL, *len_list = NULL; int *buf_idx = NULL;
ADIO_Offset *len_list = NULL;
double io_time = 0, all_time, max_all_time;
double tstep1, max_tstep1;
double tstep1_1, max_tstep1_1;
double tstep1_2, max_tstep1_2;
double tstep1_3, max_tstep1_3;
double tstep2, max_tstep2;
double tstep3, max_tstep3;
double tstep4, max_tstep4;
double sum_sz;
#if BGL_PROFILE #if BGL_PROFILE
BGLMPIO_T_CIO_RESET( 0, w ) BGLMPIO_T_CIO_RESET( 0, w )
#endif #endif
#if 0
/* From common code - not implemented for bgl.*/
int old_error, tmp_error;
#endif
#ifdef PROFILE #ifdef PROFILE
MPE_Log_event(13, 0, "start computation"); MPE_Log_event(13, 0, "start computation");
#endif #endif
#if 0
/* From common code - not implemented for bgl. */
if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
ADIOI_IOStridedColl (fd, buf, count, ADIOI_WRITE, datatype,
file_ptr_type, offset, status, error_code);
return;
}
#endif
MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &myrank); MPI_Comm_rank(fd->comm, &myrank);
@ -207,7 +211,8 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
/* are the accesses of different processes interleaved? */ /* are the accesses of different processes interleaved? */
for (i=1; i<nprocs; i++) for (i=1; i<nprocs; i++)
if ((st_offsets[i] < end_offsets[i-1]) && if ((st_offsets[i] < end_offsets[i-1]) &&
(st_offsets[i] <= end_offsets[i])) interleave_count++; (st_offsets[i] <= end_offsets[i]))
interleave_count++;
/* This is a rudimentary check for interleaving, but should suffice /* This is a rudimentary check for interleaving, but should suffice
for the moment. */ for the moment. */
} }
@ -231,7 +236,7 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
if (buftype_is_contig && filetype_is_contig) { if (buftype_is_contig && filetype_is_contig) {
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
off = fd->disp + (fd->etype_size) * offset; off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
ADIO_WriteContig(fd, buf, count, datatype, ADIO_WriteContig(fd, buf, count, datatype,
ADIO_EXPLICIT_OFFSET, ADIO_EXPLICIT_OFFSET,
off, status, error_code); off, status, error_code);
@ -260,7 +265,9 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
else else
ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
nprocs_for_coll, &min_st_offset, nprocs_for_coll, &min_st_offset,
&fd_start, &fd_end, &fd_size); &fd_start, &fd_end,
fd->hints->min_fdomain_size, &fd_size,
fd->hints->striping_unit);
#if BGL_PROFILE #if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART ) BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
@ -329,9 +336,50 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
BGLMPIO_T_CIO_REPORT( 0, w, fd, myrank ) BGLMPIO_T_CIO_REPORT( 0, w, fd, myrank )
#endif #endif
#if 0
/* From common code - not implemented for bgl.
*
* If this collective write is followed by an independent write,
* it's possible to have those subsequent writes on other processes
* race ahead and sneak in before the read-modify-write completes.
* We carry out a collective communication at the end here so no one
* can start independent i/o before collective I/O completes.
*
* need to do some gymnastics with the error codes so that if something
* went wrong, all processes report error, but if a process has a more
* specific error code, we can still have that process report the
* additional information */
old_error = *error_code;
if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO;
/* optimization: if only one process performing i/o, we can perform
* a less-expensive Bcast */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_postwrite_a, 0, NULL );
#endif
if (fd->hints->cb_nodes == 1)
MPI_Bcast(error_code, 1, MPI_INT,
fd->hints->ranklist[0], fd->comm);
else {
tmp_error = *error_code;
MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
MPI_MAX, fd->comm);
}
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_postwrite_b, 0, NULL );
#endif
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5012, 0, NULL);
#endif
if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) )
*error_code = old_error;
#endif
/* free all memory allocated for collective I/O */ /* free all memory allocated for collective I/O */
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
for (i=0; i<nprocs; i++) { for (i=0; i<nprocs; i++) {
if (others_req[i].count) { if (others_req[i].count) {
@ -363,6 +411,9 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
#endif #endif
fd->fp_sys_posn = -1; /* set it to null. */ fd->fp_sys_posn = -1; /* set it to null. */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5013, 0, NULL);
#endif
} }
@ -371,12 +422,12 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
* code is created and returned in error_code. * code is created and returned in error_code.
*/ */
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
datatype, int nprocs, int myrank, datatype, int nprocs,
int myrank,
ADIOI_Access ADIOI_Access
*others_req, ADIO_Offset *offset_list, *others_req, ADIO_Offset *offset_list,
int *len_list, int contig_access_count, ADIO_Offset *len_list, int contig_access_count,
ADIO_Offset ADIO_Offset min_st_offset, ADIO_Offset fd_size,
min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end, ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code) int *buf_idx, int *error_code)
{ {
@ -389,7 +440,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
array to a file, where each local array is 8Mbytes, requiring array to a file, where each local array is 8Mbytes, requiring
at least another 8Mbytes of temp space is unacceptable. */ at least another 8Mbytes of temp space is unacceptable. */
int hole, i, j, m, size=0, ntimes, max_ntimes, buftype_is_contig; /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
ADIO_Offset size=0;
int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off; ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
char *write_buf=NULL; char *write_buf=NULL;
int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size; int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
@ -410,7 +463,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
That gives the no. of communication phases as well. */ That gives the no. of communication phases as well. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag); &info_flag);
coll_bufsize = atoi(value); coll_bufsize = atoi(value);
ADIOI_Free(value); ADIOI_Free(value);
@ -526,7 +579,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
#endif #endif
for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0; for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0;
size = (int) (ADIOI_MIN(coll_bufsize, end_loc-st_loc+1-done)); size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);
for (i=0; i < nprocs; i++) { for (i=0; i < nprocs; i++) {
if (others_req[i].count) { if (others_req[i].count) {
@ -550,12 +603,14 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
} }
if (req_off < off + size) { if (req_off < off + size) {
count[i]++; count[i]++;
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
MPI_Address(write_buf+req_off-off, MPI_Address(write_buf+req_off-off,
&(others_req[i].mem_ptrs[j])); &(others_req[i].mem_ptrs[j]));
recv_size[i] += (int)(ADIOI_MIN(off + (ADIO_Offset)size - ADIOI_Assert((off + size - req_off) == (int)(off + size - req_off));
req_off, req_len)); recv_size[i] += (int)(ADIOI_MIN(off + size - req_off,
(unsigned)req_len));
if (off+size-req_off < req_len) if (off+size-req_off < (unsigned)req_len)
{ {
partial_recv[i] = (int) (off + size - req_off); partial_recv[i] = (int) (off + size - req_off);
@ -618,7 +673,8 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
if (count[i]) flag = 1; if (count[i]) flag = 1;
if (flag) { if (flag) {
ADIO_WriteContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, ADIOI_Assert(size == (int)size);
ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
off, &status, error_code); off, &status, error_code);
if (*error_code != MPI_SUCCESS) return; if (*error_code != MPI_SUCCESS) return;
} }
@ -678,7 +734,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
*/ */
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf, static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
ADIOI_Flatlist_node *flat_buf, ADIO_Offset ADIOI_Flatlist_node *flat_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size, *offset_list, ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off, int size, int *recv_size, ADIO_Offset off, int size,
int *count, int *start_pos, int *count, int *start_pos,
int *partial_recv, int *partial_recv,
@ -758,20 +814,27 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
} }
ADIOI_Free(tmp_len); ADIOI_Free(tmp_len);
/* check if there are any holes */ /* check if there are any holes. If yes, must do read-modify-write.
* holes can be in three places. 'middle' is what you'd expect: the
* processes are operating on noncontiguous data. But holes can also show
* up at the beginning or end of the file domain (see John Bent ROMIO REQ
* #835). Missing these holes would result in us writing more data than
* received by everyone else. */
*hole = 0; *hole = 0;
/* See if there are holes before the first request or after the last request*/ if (off != srt_off[0]) /* hole at the front */
if((srt_off[0] > off) ||
((srt_off[sum-1] + srt_len[sum-1]) < (off + size)))
{
*hole = 1; *hole = 1;
else { /* coalesce the sorted offset-length pairs */
for (i=1; i<sum; i++) {
if (srt_off[i] <= srt_off[0] + srt_len[0]) {
int new_len = srt_off[i] + srt_len[i] - srt_off[0];
if (new_len > srt_len[0]) srt_len[0] = new_len;
} }
else /* See if there are holes between the requests, if there are more than one */ else
for (i=0; i<sum-1; i++)
if (srt_off[i]+srt_len[i] < srt_off[i+1]) {
*hole = 1;
break; break;
} }
if (i < sum || size != srt_len[0]) /* hole in middle or end */
*hole = 1;
}
ADIOI_Free(srt_off); ADIOI_Free(srt_off);
ADIOI_Free(srt_len); ADIOI_Free(srt_len);
@ -821,6 +884,9 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
/* post sends. if buftype_is_contig, data can be directly sent from /* post sends. if buftype_is_contig, data can be directly sent from
user buf at location given by buf_idx. else use send_buf. */ user buf at location given by buf_idx. else use send_buf. */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5032, 0, NULL);
#endif
if (buftype_is_contig) { if (buftype_is_contig) {
j = 0; j = 0;
for (i=0; i < nprocs; i++) for (i=0; i < nprocs; i++)
@ -895,6 +961,9 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
MPI_Waitall(nprocs_send+nprocs_recv, requests, statuses); MPI_Waitall(nprocs_send+nprocs_recv, requests, statuses);
#endif #endif
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5033, 0, NULL);
#endif
ADIOI_Free(statuses); ADIOI_Free(statuses);
ADIOI_Free(requests); ADIOI_Free(requests);
if (!buftype_is_contig && nprocs_send) { if (!buftype_is_contig && nprocs_send) {
@ -918,7 +987,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
n_buftypes++; \ n_buftypes++; \
} \ } \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
n_buftypes*buftype_extent; \ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \ } \
buf_incr -= size_in_buf; \ buf_incr -= size_in_buf; \
@ -930,6 +999,8 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
{ \ { \
while (size) { \ while (size) { \
size_in_buf = ADIOI_MIN(size, flat_buf_sz); \ size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)((MPIR_Upint)buf + user_buf_idx)); \
ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
memcpy(&(send_buf[p][send_buf_idx[p]]), \ memcpy(&(send_buf[p][send_buf_idx[p]]), \
((char *) buf) + user_buf_idx, size_in_buf); \ ((char *) buf) + user_buf_idx, size_in_buf); \
send_buf_idx[p] += size_in_buf; \ send_buf_idx[p] += size_in_buf; \
@ -942,7 +1013,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
n_buftypes++; \ n_buftypes++; \
} \ } \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \ user_buf_idx = flat_buf->indices[flat_buf_idx] + \
n_buftypes*buftype_extent; \ (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \ } \
size -= size_in_buf; \ size -= size_in_buf; \
@ -951,11 +1022,9 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
ADIOI_BUF_INCR \ ADIOI_BUF_INCR \
} }
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset *flat_buf, char **send_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size, *offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc, MPI_Request *requests, int *sent_to_proc,
int nprocs, int myrank, int nprocs, int myrank,
int contig_access_count, int contig_access_count,
@ -967,8 +1036,9 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
{ {
/* this function is only called if buftype is not contig */ /* this function is only called if buftype is not contig */
int i, p, flat_buf_idx, size; int i, p, flat_buf_idx;
int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes; ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
int jj, n_buftypes;
ADIO_Offset off, len, rem_len, user_buf_idx; ADIO_Offset off, len, rem_len, user_buf_idx;
/* curr_to_proc[p] = amount of data sent to proc. p that has already /* curr_to_proc[p] = amount of data sent to proc. p that has already
@ -995,7 +1065,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
for (i=0; i<contig_access_count; i++) { for (i=0; i<contig_access_count; i++) {
off = offset_list[i]; off = offset_list[i];
rem_len = (ADIO_Offset) len_list[i]; rem_len = len_list[i];
/*this request may span the file domains of more than one process*/ /*this request may span the file domains of more than one process*/
while (rem_len != 0) { while (rem_len != 0) {
@ -1015,17 +1085,20 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
if (send_buf_idx[p] < send_size[p]) { if (send_buf_idx[p] < send_size[p]) {
if (curr_to_proc[p]+len > done_to_proc[p]) { if (curr_to_proc[p]+len > done_to_proc[p]) {
if (done_to_proc[p] > curr_to_proc[p]) { if (done_to_proc[p] > curr_to_proc[p]) {
size = (int)ADIOI_MIN(curr_to_proc[p] + len - size = ADIOI_MIN(curr_to_proc[p] + len -
done_to_proc[p], send_size[p]-send_buf_idx[p]); done_to_proc[p], send_size[p]-send_buf_idx[p]);
buf_incr = done_to_proc[p] - curr_to_proc[p]; buf_incr = done_to_proc[p] - curr_to_proc[p];
ADIOI_BUF_INCR ADIOI_BUF_INCR
buf_incr = (int)(curr_to_proc[p] + len - done_to_proc[p]); ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
buf_incr = curr_to_proc[p] + len - done_to_proc[p];
ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
curr_to_proc[p] = done_to_proc[p] + size; curr_to_proc[p] = done_to_proc[p] + size;
ADIOI_BUF_COPY ADIOI_BUF_COPY
} }
else { else {
size = (int)ADIOI_MIN(len,send_size[p]-send_buf_idx[p]); size = ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
buf_incr = (int)len; buf_incr = len;
ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
curr_to_proc[p] += size; curr_to_proc[p] += size;
ADIOI_BUF_COPY ADIOI_BUF_COPY
} }
@ -1036,13 +1109,14 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
} }
} }
else { else {
curr_to_proc[p] += (int)len; ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
buf_incr = (int)len; curr_to_proc[p] += len;
buf_incr = len;
ADIOI_BUF_INCR ADIOI_BUF_INCR
} }
} }
else { else {
buf_incr = (int)len; buf_incr = len;
ADIOI_BUF_INCR ADIOI_BUF_INCR
} }
off += len; off += len;
@ -1181,7 +1255,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
char *write_buf, /* 1 */ char *write_buf, /* 1 */
ADIOI_Flatlist_node *flat_buf, ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list, ADIO_Offset *offset_list,
int *len_list, int *send_size, int *recv_size, ADIO_Offset *len_list, int *send_size, int *recv_size,
ADIO_Offset off, int size, /* 2 */ ADIO_Offset off, int size, /* 2 */
int *count, int *start_pos, int *partial_recv, int *count, int *start_pos, int *partial_recv,
int *sent_to_proc, int nprocs, int myrank, int *sent_to_proc, int nprocs, int myrank,
@ -1196,11 +1270,10 @@ static void ADIOI_W_Exchange_data_alltoallv(
int iter, MPI_Aint buftype_extent, int *buf_idx, int iter, MPI_Aint buftype_extent, int *buf_idx,
int *error_code) int *error_code)
{ {
int i, j, k=0, tmp=0, nprocs_recv, nprocs_send, erri, *tmp_len, err; int i, j, k=0, nprocs_recv, nprocs_send, *tmp_len, err;
char **send_buf = NULL; char **send_buf = NULL;
MPI_Request *requests, *send_req; MPI_Request *send_req=NULL;
MPI_Datatype recv_type; MPI_Status status;
MPI_Status *statuses, status;
int rtail, stail; int rtail, stail;
char *sbuf_ptr, *to_ptr; char *sbuf_ptr, *to_ptr;
int len; int len;
@ -1324,7 +1397,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
sbuf_ptr = all_recv_buf + rdispls[i]; sbuf_ptr = all_recv_buf + rdispls[i];
for (j=0; j<count[i]; j++) { for (j=0; j<count[i]; j++) {
to_ptr = (char *)( others_req[i].mem_ptrs[ start_pos[i]+j ] ); ADIOI_ENSURE_AINT_FITS_IN_PTR(others_req[i].mem_ptrs[ start_pos[i]+j ]);
to_ptr = (char *) ADIOI_AINT_CAST_TO_VOID_PTR ( others_req[i].mem_ptrs[ start_pos[i]+j ] );
len = others_req[i].lens[ start_pos[i]+j ] ; len = others_req[i].lens[ start_pos[i]+j ] ;
memcpy( to_ptr, sbuf_ptr, len ); memcpy( to_ptr, sbuf_ptr, len );
sbuf_ptr += len; sbuf_ptr += len;
@ -1349,7 +1423,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset *flat_buf, char **send_buf, ADIO_Offset
*offset_list, int *len_list, int *send_size, *offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc, MPI_Request *requests, int *sent_to_proc,
int nprocs, int myrank, int nprocs, int myrank,
int contig_access_count, int contig_access_count,
@ -1361,8 +1435,9 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
{ {
/* this function is only called if buftype is not contig */ /* this function is only called if buftype is not contig */
int i, p, flat_buf_idx, size; int i, p, flat_buf_idx;
int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes; ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
int jj, n_buftypes;
ADIO_Offset off, len, rem_len, user_buf_idx; ADIO_Offset off, len, rem_len, user_buf_idx;
/* curr_to_proc[p] = amount of data sent to proc. p that has already /* curr_to_proc[p] = amount of data sent to proc. p that has already
@ -1389,7 +1464,7 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
for (i=0; i<contig_access_count; i++) { for (i=0; i<contig_access_count; i++) {
off = offset_list[i]; off = offset_list[i];
rem_len = (ADIO_Offset) len_list[i]; rem_len = len_list[i];
/*this request may span the file domains of more than one process*/ /*this request may span the file domains of more than one process*/
while (rem_len != 0) { while (rem_len != 0) {
@ -1409,17 +1484,20 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
if (send_buf_idx[p] < send_size[p]) { if (send_buf_idx[p] < send_size[p]) {
if (curr_to_proc[p]+len > done_to_proc[p]) { if (curr_to_proc[p]+len > done_to_proc[p]) {
if (done_to_proc[p] > curr_to_proc[p]) { if (done_to_proc[p] > curr_to_proc[p]) {
size = (int)ADIOI_MIN(curr_to_proc[p] + len - size = ADIOI_MIN(curr_to_proc[p] + len -
done_to_proc[p], send_size[p]-send_buf_idx[p]); done_to_proc[p], send_size[p]-send_buf_idx[p]);
buf_incr = done_to_proc[p] - curr_to_proc[p]; buf_incr = done_to_proc[p] - curr_to_proc[p];
ADIOI_BUF_INCR ADIOI_BUF_INCR
buf_incr = (int)(curr_to_proc[p] + len - done_to_proc[p]); ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
buf_incr = curr_to_proc[p] + len - done_to_proc[p];
ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
curr_to_proc[p] = done_to_proc[p] + size; curr_to_proc[p] = done_to_proc[p] + size;
ADIOI_BUF_COPY ADIOI_BUF_COPY
} }
else { else {
size = (int)ADIOI_MIN(len,send_size[p]-send_buf_idx[p]); size = ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
buf_incr = (int)len; buf_incr = len;
ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
curr_to_proc[p] += size; curr_to_proc[p] += size;
ADIOI_BUF_COPY ADIOI_BUF_COPY
} }
@ -1433,13 +1511,14 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
*/ */
} }
else { else {
ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
curr_to_proc[p] += (int)len; curr_to_proc[p] += (int)len;
buf_incr = (int)len; buf_incr = len;
ADIOI_BUF_INCR ADIOI_BUF_INCR
} }
} }
else { else {
buf_incr = (int)len; buf_incr = len;
ADIOI_BUF_INCR ADIOI_BUF_INCR
} }
off += len; off += len;

Просмотреть файл

@ -17,13 +17,20 @@
#include "ad_bgl_tuning.h" #include "ad_bgl_tuning.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count, void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code) ADIO_Offset offset, ADIO_Status *status, int *error_code)
{ {
int err=-1, datatype_size, len; int err=-1, datatype_size;
ADIO_Offset len;
static char myname[] = "ADIOI_BGL_WRITECONTIG"; static char myname[] = "ADIOI_BGL_WRITECONTIG";
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5036, 0, NULL);
#endif
#if BGL_PROFILE #if BGL_PROFILE
/* timing */ /* timing */
double io_time, io_time2; double io_time, io_time2;
@ -35,7 +42,8 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
#endif #endif
MPI_Type_size(datatype, &datatype_size); MPI_Type_size(datatype, &datatype_size);
len = datatype_size * count; len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */
#if BGL_PROFILE #if BGL_PROFILE
@ -46,7 +54,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2); if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
if (bglmpio_timing2) io_time2 = MPI_Wtime(); if (bglmpio_timing2) io_time2 = MPI_Wtime();
err = write(fd->fd_sys, buf, len); err = write(fd->fd_sys, buf, (unsigned int)len);
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2); if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len); ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_sys_posn = offset + err; fd->fp_sys_posn = offset + err;
@ -60,7 +68,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2); if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
if (bglmpio_timing2) io_time2 = MPI_Wtime(); if (bglmpio_timing2) io_time2 = MPI_Wtime();
err = write(fd->fd_sys, buf, len); err = write(fd->fd_sys, buf, (unsigned int)len);
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2); if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len); ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_ind += err; fd->fp_ind += err;
@ -73,7 +81,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (fd->fp_sys_posn != offset) if (fd->fp_sys_posn != offset)
lseek(fd->fd_sys, offset, SEEK_SET); lseek(fd->fd_sys, offset, SEEK_SET);
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
err = write(fd->fd_sys, buf, len); err = write(fd->fd_sys, buf, (unsigned int)len);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len); ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_sys_posn = offset + err; fd->fp_sys_posn = offset + err;
/* individual file pointer not updated */ /* individual file pointer not updated */
@ -83,7 +91,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (fd->fp_sys_posn != fd->fp_ind) if (fd->fp_sys_posn != fd->fp_ind)
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
err = write(fd->fd_sys, buf, len); err = write(fd->fd_sys, buf, (unsigned int)len);
ADIOI_UNLOCK(fd, offset, SEEK_SET, len); ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
fd->fp_ind += err; fd->fp_ind += err;
fd->fp_sys_posn = fd->fp_ind; fd->fp_sys_posn = fd->fp_ind;
@ -110,11 +118,12 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
#endif #endif
*error_code = MPI_SUCCESS; *error_code = MPI_SUCCESS;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5037, 0, NULL);
#endif
} }
#define ADIOI_BUFFERED_WRITE \ #define ADIOI_BUFFERED_WRITE \
{ \ { \
if (req_off >= writebuf_off + writebuf_len) { \ if (req_off >= writebuf_off + writebuf_len) { \
@ -123,7 +132,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \ if (err == -1) err_flag = 1; \
writebuf_off = req_off; \ writebuf_off = req_off; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\ writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = read(fd->fd_sys, writebuf, writebuf_len); \ err = read(fd->fd_sys, writebuf, writebuf_len); \
@ -135,7 +144,8 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
return; \ return; \
} \ } \
} \ } \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \ write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\ memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \ while (write_sz != req_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
@ -145,7 +155,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
req_len -= write_sz; \ req_len -= write_sz; \
userbuf_off += write_sz; \ userbuf_off += write_sz; \
writebuf_off += writebuf_len; \ writebuf_off += writebuf_len; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\ writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = read(fd->fd_sys, writebuf, writebuf_len); \ err = read(fd->fd_sys, writebuf, writebuf_len); \
@ -173,9 +183,10 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \ if (err == -1) err_flag = 1; \
writebuf_off = req_off; \ writebuf_off = req_off; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\ writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
} \ } \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \ write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\ memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \ while (write_sz != req_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \ lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
@ -186,7 +197,7 @@ void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
req_len -= write_sz; \ req_len -= write_sz; \
userbuf_off += write_sz; \ userbuf_off += write_sz; \
writebuf_off += writebuf_len; \ writebuf_off += writebuf_len; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\ writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
write_sz = ADIOI_MIN(req_len, writebuf_len); \ write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\ memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
} \ } \
@ -201,19 +212,23 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
{ {
/* offset is in units of etype relative to the filetype. */ /* offset is in units of etype relative to the filetype. */
ADIOI_Flatlist_node *flat_buf, *flat_file; ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; ADIO_Offset i_offset, sum, size_in_filetype;
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; int i, j, k, err=-1, st_index=0;
int n_filetypes, etype_in_filetype; int n_etypes_in_filetype;
ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
ADIO_Offset abs_off_in_filetype=0; ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size, req_len; int filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent; MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig; int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset userbuf_off; ADIO_Offset userbuf_off;
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
char *writebuf, *value; char *writebuf, *value;
int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz; unsigned bufsize, writebuf_len, max_bufsize, write_sz;
int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize; int err_flag=0, info_flag;
ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
static char myname[] = "ADIOI_BGL_WRITESTRIDED"; static char myname[] = "ADIOI_BGL_WRITESTRIDED";
if (fd->hints->ds_write == ADIOI_HINT_DISABLE) { if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
@ -247,12 +262,13 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
MPI_Type_extent(datatype, &buftype_extent); MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size; etype_size = fd->etype_size;
ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
bufsize = buftype_size * count; bufsize = buftype_size * count;
/* get max_bufsize from the info object. */ /* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag); &info_flag);
max_bufsize = atoi(value); max_bufsize = atoi(value);
ADIOI_Free(value); ADIOI_Free(value);
@ -272,20 +288,23 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
end_offset = off + bufsize - 1; end_offset = off + bufsize - 1;
writebuf_off = off; writebuf_off = off;
writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf = (char *) ADIOI_Malloc(max_bufsize);
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
/* if atomicity is true, lock the region to be accessed */ /* if atomicity is true, lock the region to be accessed */
if (fd->atomicity) if (fd->atomicity)
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
for (j=0; j<count; j++) for (j=0; j<count; j++)
{
int i;
for (i=0; i<flat_buf->count; i++) { for (i=0; i<flat_buf->count; i++) {
userbuf_off = j*buftype_extent + flat_buf->indices[i]; userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
req_off = off; req_off = off;
req_len = flat_buf->blocklens[i]; req_len = flat_buf->blocklens[i];
ADIOI_BUFFERED_WRITE_WITHOUT_READ ADIOI_BUFFERED_WRITE_WITHOUT_READ
off += flat_buf->blocklens[i]; off += flat_buf->blocklens[i];
} }
}
/* write the buffer out finally */ /* write the buffer out finally */
lseek(fd->fd_sys, writebuf_off, SEEK_SET); lseek(fd->fd_sys, writebuf_off, SEEK_SET);
@ -317,29 +336,37 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
disp = fd->disp; disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) { if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */ /* Wei-keng reworked type processing to be a bit more efficient */
n_filetypes = -1; offset = fd->fp_ind - disp;
flag = 0; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
while (!flag) { offset -= (ADIO_Offset)n_filetypes * filetype_extent;
n_filetypes++; /* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) { for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] + ADIO_Offset dist;
(ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] if (flat_file->blocklens[i] == 0) continue;
>= offset) { dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
st_index = i; /* fwr_size is from offset to the end of block i */
fwr_size = (int) (disp + flat_file->indices[i] + if (dist == 0) {
(ADIO_Offset) n_filetypes*filetype_extent i++;
+ flat_file->blocklens[i] - offset); offset = flat_file->indices[i];
flag = 1; fwr_size = flat_file->blocklens[i];
break;
}
if (dist > 0) {
fwr_size = dist;
break; break;
} }
} }
} st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
} }
else { else {
int i;
n_etypes_in_filetype = filetype_size/etype_size; n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype); n_filetypes = offset / n_etypes_in_filetype;
etype_in_filetype = (int) (offset % n_etypes_in_filetype); etype_in_filetype = offset % n_etypes_in_filetype;
size_in_filetype = etype_in_filetype * etype_size; size_in_filetype = etype_in_filetype * etype_size;
sum = 0; sum = 0;
@ -355,32 +382,64 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
} }
/* abs. offset in bytes in the file */ /* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
} }
start_off = offset; start_off = offset;
/* Wei-keng Liao:write request is within single flat_file contig block*/
/* this could happen, for example, with subarray types that are
* actually fairly contiguous */
if (buftype_is_contig && bufsize <= fwr_size) {
ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte
* that can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == fwr_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ (ADIO_Offset)n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed. /* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
st_fwr_size = fwr_size; st_fwr_size = fwr_size;
st_n_filetypes = n_filetypes; st_n_filetypes = n_filetypes;
i = 0; i_offset = 0;
j = st_index; j = st_index;
off = offset; off = offset;
fwr_size = ADIOI_MIN(st_fwr_size, bufsize); fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
while (i < bufsize) { while (i_offset < bufsize) {
i += fwr_size; i_offset += fwr_size;
end_offset = off + fwr_size - 1; end_offset = off + fwr_size - 1;
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; off = disp + flat_file->indices[j] +
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); n_filetypes*(ADIO_Offset)filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
} }
/* if atomicity is true, lock the region to be accessed */ /* if atomicity is true, lock the region to be accessed */
@ -390,7 +449,7 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
/* initial read for the read-modify-write */ /* initial read for the read-modify-write */
writebuf_off = offset; writebuf_off = offset;
writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf = (char *) ADIOI_Malloc(max_bufsize);
writebuf_len = (int)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
lseek(fd->fd_sys, writebuf_off, SEEK_SET); lseek(fd->fd_sys, writebuf_off, SEEK_SET);
err = read(fd->fd_sys, writebuf, writebuf_len); err = read(fd->fd_sys, writebuf, writebuf_len);
@ -408,39 +467,41 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
/* contiguous in memory, noncontiguous in file. should be the most /* contiguous in memory, noncontiguous in file. should be the most
common case. */ common case. */
i = 0; i_offset = 0;
j = st_index; j = st_index;
off = offset; off = offset;
n_filetypes = st_n_filetypes; n_filetypes = st_n_filetypes;
fwr_size = ADIOI_MIN(st_fwr_size, bufsize); fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
while (i < bufsize) { while (i_offset < bufsize) {
if (fwr_size) { if (fwr_size) {
/* TYPE_UB and TYPE_LB can result in /* TYPE_UB and TYPE_LB can result in
fwr_size = 0. save system call in such cases */ fwr_size = 0. save system call in such cases */
/* lseek(fd->fd_sys, off, SEEK_SET); /* lseek(fd->fd_sys, off, SEEK_SET);
err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);*/ err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/
req_off = off; req_off = off;
req_len = fwr_size; req_len = fwr_size;
userbuf_off = i; userbuf_off = i_offset;
ADIOI_BUFFERED_WRITE ADIOI_BUFFERED_WRITE
} }
i += fwr_size; i_offset += fwr_size;
if (off + fwr_size < disp + flat_file->indices[j] + if (off + fwr_size < disp + flat_file->indices[j] +
flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
off += fwr_size; off += fwr_size;
/* did not reach end of contiguous block in filetype. /* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by fwr_size. */ no more I/O needed. off is incremented by fwr_size. */
else { else {
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent; n_filetypes*(ADIO_Offset)filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); fwr_size = ADIOI_MIN(flat_file->blocklens[j],
bufsize-i_offset);
} }
} }
} }
@ -452,7 +513,7 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
while (flat_buf->type != datatype) flat_buf = flat_buf->next; while (flat_buf->type != datatype) flat_buf = flat_buf->next;
k = num = buf_count = 0; k = num = buf_count = 0;
i = (int) (flat_buf->indices[0]); i_offset = flat_buf->indices[0];
j = st_index; j = st_index;
off = offset; off = offset;
n_filetypes = st_n_filetypes; n_filetypes = st_n_filetypes;
@ -463,11 +524,11 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
size = ADIOI_MIN(fwr_size, bwr_size); size = ADIOI_MIN(fwr_size, bwr_size);
if (size) { if (size) {
/* lseek(fd->fd_sys, off, SEEK_SET); /* lseek(fd->fd_sys, off, SEEK_SET);
err = write(fd->fd_sys, ((char *) buf) + i, size); */ err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */
req_off = off; req_off = off;
req_len = size; req_len = size;
userbuf_off = i; userbuf_off = i_offset;
ADIOI_BUFFERED_WRITE ADIOI_BUFFERED_WRITE
} }
@ -476,18 +537,19 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
if (size == fwr_size) { if (size == fwr_size) {
/* reached end of contiguous block in file */ /* reached end of contiguous block in file */
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent; n_filetypes*(ADIO_Offset)filetype_extent;
new_fwr_size = flat_file->blocklens[j]; new_fwr_size = flat_file->blocklens[j];
if (size != bwr_size) { if (size != bwr_size) {
i += size; i_offset += size;
new_bwr_size -= size; new_bwr_size -= size;
} }
} }
@ -497,8 +559,8 @@ void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
k = (k + 1)%flat_buf->count; k = (k + 1)%flat_buf->count;
buf_count++; buf_count++;
i = (int) (buftype_extent*(buf_count/flat_buf->count) + i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
flat_buf->indices[k]); flat_buf->indices[k];
new_bwr_size = flat_buf->blocklens[k]; new_bwr_size = flat_buf->blocklens[k];
if (size != fwr_size) { if (size != fwr_size) {
off += size; off += size;

Просмотреть файл

@ -1,7 +0,0 @@
<dir>
<file name="ad_bglockless.c" info="1205188711"/>
</dir>
<data>
<fileinfo name="ad_bglockless.c">
</fileinfo>
</data>

Просмотреть файл

@ -21,4 +21,6 @@ include $(top_srcdir)/Makefile.options
noinst_LTLIBRARIES = libadio_bglockless.la noinst_LTLIBRARIES = libadio_bglockless.la
libadio_bglockless_la_SOURCES = \ libadio_bglockless_la_SOURCES = \
ad_bglockless.c ad_bglockless.c \
ad_bglockless.h \
ad_bglockless_features.c

Просмотреть файл

@ -6,12 +6,14 @@
*/ */
#include "../ad_bgl/ad_bgl.h" #include "../ad_bgl/ad_bgl.h"
#include "ad_bglockless.h"
/* adioi.h has the ADIOI_Fns_struct define */ /* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h" #include "adioi.h"
struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations = { struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations = {
ADIOI_BGL_Open, /* Open */ ADIOI_BGL_Open, /* Open */
ADIOI_GEN_OpenColl, /* Collective open */
ADIOI_GEN_ReadContig, /* ReadContig */ ADIOI_GEN_ReadContig, /* ReadContig */
ADIOI_GEN_WriteContig, /* WriteContig */ ADIOI_GEN_WriteContig, /* WriteContig */
ADIOI_BGL_ReadStridedColl, /* ReadStridedColl */ ADIOI_BGL_ReadStridedColl, /* ReadStridedColl */
@ -35,7 +37,8 @@ struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations = {
ADIOI_GEN_IOComplete, /* WriteComplete */ ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */ ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */ ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_GEN_Flush, /* Flush */ ADIOI_BGL_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */ ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_BGLOCKLESS_Feature /* Features */
}; };

Просмотреть файл

@ -0,0 +1,14 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *
 *   Copyright (C) 2008 Uchicago Argonne LLC
 *   See COPYRIGHT notice in top-level directory.
 */

/*
 * Include guard.  The macro defined here must be the same one tested by
 * the #ifndef; defining a different name (previously AD_PVFS2_INCLUDE)
 * leaves this header unguarded and sets a guard name that belongs to
 * another driver's header.
 */
#ifndef AD_BGLOCKLESS_INCLUDE
#define AD_BGLOCKLESS_INCLUDE

/*
 * Feature-query entry point installed as the "Features" slot of
 * ADIO_BGLOCKLESS_operations.  Takes the file handle and the feature
 * flag being queried and returns an int.
 */
int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag);

#endif /* AD_BGLOCKLESS_INCLUDE */

Просмотреть файл

@ -0,0 +1,15 @@
#include "adio.h"
/*
 * Report whether the bglockless driver supports the feature named by
 * `flag`.
 *
 * Returns 1 when `flag` is ADIO_SCALABLE_OPEN and 0 for every other
 * value, which includes ADIO_SHARED_FP, ADIO_LOCKS, ADIO_SEQUENTIAL and
 * ADIO_DATA_SIEVING_WRITES.  The file handle `fd` is not consulted.
 */
int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag)
{
    /* Scalable open is the only feature answered with "yes". */
    if (flag == ADIO_SCALABLE_OPEN) {
        return 1;
    }

    /* Shared file pointers, locks, sequential mode, data-sieving writes
     * and any unrecognised flag are all answered with "no". */
    return 0;
}

Просмотреть файл

@ -25,6 +25,7 @@ libadio_gridftp_la_SOURCES = \
ad_gridftp_close.c \ ad_gridftp_close.c \
ad_gridftp_delete.c \ ad_gridftp_delete.c \
ad_gridftp_fcntl.c \ ad_gridftp_fcntl.c \
ad_gridftp_features.c \
ad_gridftp_flush.c \ ad_gridftp_flush.c \
ad_gridftp_hints.c \ ad_gridftp_hints.c \
ad_gridftp_open.c \ ad_gridftp_open.c \

Просмотреть файл

@ -33,4 +33,5 @@ struct ADIOI_Fns_struct ADIO_GRIDFTP_operations = {
ADIOI_GRIDFTP_Flush, /* Flush */ ADIOI_GRIDFTP_Flush, /* Flush */
ADIOI_GRIDFTP_Resize, /* Resize */ ADIOI_GRIDFTP_Resize, /* Resize */
ADIOI_GRIDFTP_Delete, /* Delete */ ADIOI_GRIDFTP_Delete, /* Delete */
ADIOI_GRIDFTP_Feature, /* Features */
}; };

Просмотреть файл

@ -0,0 +1,12 @@
/*
 * Report whether the GridFTP driver supports the feature named by `flag`.
 *
 * Always returns 0: ADIO_SCALABLE_OPEN, ADIO_SHARED_FP, ADIO_LOCKS,
 * ADIO_SEQUENTIAL, ADIO_DATA_SIEVING_WRITES and any other flag value are
 * all answered with "no".  Neither argument influences the result.
 *
 * NOTE(review): no #include providing ADIO_File is visible ahead of this
 * definition -- confirm the translation unit pulls in adio.h.
 */
int ADIOI_GRIDFTP_Feature (ADIO_File fd, int flag)
{
    (void) fd;
    (void) flag;

    return 0;
}

Просмотреть файл

@ -56,8 +56,8 @@ void ADIOI_GRIDFTP_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
MPI_Info_get_valuelen(users_info,key,&valuelen,&flag); MPI_Info_get_valuelen(users_info,key,&valuelen,&flag);
if (flag) if (flag)
{ {
MPI_Info_get(users_info,key,valuelen,value,&flag); ADIOI_Info_get(users_info,key,valuelen,value,&flag);
if (flag) MPI_Info_set(fd->info,key,value); if (flag) ADIOI_Info_set(fd->info,key,value);
} }
} }
} }

Просмотреть файл

@ -136,7 +136,7 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
oattr[] (eg. parallelism, striping, etc.) goes here */ oattr[] (eg. parallelism, striping, etc.) goes here */
if ( fd->info!=MPI_INFO_NULL ) if ( fd->info!=MPI_INFO_NULL )
{ {
MPI_Info_get(fd->info,"ftp_control_mode",MPI_MAX_INFO_VAL,hintval,&keyfound); ADIOI_Info_get(fd->info,"ftp_control_mode",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound ) if ( keyfound )
{ {
if ( ( !strcmp(hintval,"extended") || !strcmp(hintval,"extended_block") ) && if ( ( !strcmp(hintval,"extended") || !strcmp(hintval,"extended_block") ) &&
@ -153,7 +153,7 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result); globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result);
} }
MPI_Info_get(fd->info,"parallelism",MPI_MAX_INFO_VAL,hintval,&keyfound); ADIOI_Info_get(fd->info,"parallelism",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound ) if ( keyfound )
{ {
int nftpthreads; int nftpthreads;
@ -170,14 +170,14 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
} }
} }
MPI_Info_get(fd->info,"striped_ftp",MPI_MAX_INFO_VAL,hintval,&keyfound); ADIOI_Info_get(fd->info,"striped_ftp",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound ) if ( keyfound )
{ {
/* if set to "true" or "enable", set up round-robin block layout */ /* if set to "true" or "enable", set up round-robin block layout */
if ( !strncmp("true",hintval,4) || !strncmp("TRUE",hintval,4) || if ( !strncmp("true",hintval,4) || !strncmp("TRUE",hintval,4) ||
!strncmp("enable",hintval,4) || !strncmp("ENABLE",hintval,4) ) !strncmp("enable",hintval,4) || !strncmp("ENABLE",hintval,4) )
{ {
MPI_Info_get(fd->info,"striping_factor",MPI_MAX_INFO_VAL,hintval,&keyfound); ADIOI_Info_get(fd->info,"striping_factor",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound ) if ( keyfound )
{ {
int striping_factor; int striping_factor;
@ -197,7 +197,7 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
} }
} }
MPI_Info_get(fd->info,"tcp_buffer",MPI_MAX_INFO_VAL,hintval,&keyfound); ADIOI_Info_get(fd->info,"tcp_buffer",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound ) if ( keyfound )
{ {
/* set tcp buffer size */ /* set tcp buffer size */
@ -214,7 +214,7 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
} }
} }
MPI_Info_get(fd->info,"transfer_type",MPI_MAX_INFO_VAL,hintval,&keyfound); ADIOI_Info_get(fd->info,"transfer_type",MPI_MAX_INFO_VAL,hintval,&keyfound);
if ( keyfound ) if ( keyfound )
{ {
globus_ftp_control_type_t filetype; globus_ftp_control_type_t filetype;
@ -340,84 +340,4 @@ void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
} }
} }
num_gridftp_handles++; num_gridftp_handles++;
#if 0
/* Debugging info for testing PASV mode behind firewalls */
if ( myrank==0 )
{
globus_bool_t striped;
globus_ftp_control_mode_t mode;
globus_ftp_control_type_t filetype;
globus_ftp_control_parallelism_t parallelism;
FPRINTF(stderr,"--gridftp details for %s--\n",
fd->filename);
/*
FPRINTF(stderr,"Connection caching: ");
globus_ftp_client_handleattr_get_cache_all(&hattr,&cached);
if ( cached==GLOBUS_TRUE )
FPRINTF(stderr,"Y\n");
else
FPRINTF(stderr,"N\n");
*/
FPRINTF(stderr,"Control mode: ");
globus_ftp_client_operationattr_get_mode(&(oattr[fd->fd_sys]),&mode);
if ( mode==GLOBUS_FTP_CONTROL_MODE_BLOCK )
FPRINTF(stderr,"block\n");
else if ( mode==GLOBUS_FTP_CONTROL_MODE_COMPRESSED )
FPRINTF(stderr,"compressed\n");
else if ( mode==GLOBUS_FTP_CONTROL_MODE_EXTENDED_BLOCK )
FPRINTF(stderr,"extended block\n");
else if ( mode==GLOBUS_FTP_CONTROL_MODE_STREAM )
FPRINTF(stderr,"stream\n");
else
FPRINTF(stderr,"unknown\n");
FPRINTF(stderr,"File type: ");
globus_ftp_client_operationattr_get_type(&(oattr[fd->fd_sys]),&filetype);
if ( filetype==GLOBUS_FTP_CONTROL_TYPE_ASCII )
FPRINTF(stderr,"ASCII\n");
else if ( filetype==GLOBUS_FTP_CONTROL_TYPE_IMAGE )
FPRINTF(stderr,"binary\n");
else if ( filetype==GLOBUS_FTP_CONTROL_TYPE_EBCDIC )
FPRINTF(stderr,"EBCDIC\n");
else
FPRINTF(stderr,"unknown\n");
FPRINTF(stderr,"Parallelism: ");
globus_ftp_client_operationattr_get_parallelism(&(oattr[fd->fd_sys]),&parallelism);
if ( parallelism.mode==GLOBUS_FTP_CONTROL_PARALLELISM_NONE )
FPRINTF(stderr,"none\n");
else if ( parallelism.mode==GLOBUS_FTP_CONTROL_PARALLELISM_FIXED )
FPRINTF(stderr,"fixed with %d streams\n",parallelism.fixed.size);
else
FPRINTF(stderr,"unknown\n");
FPRINTF(stderr,"Striping: ");
globus_ftp_client_operationattr_get_striped(&(oattr[fd->fd_sys]),&striped);
if ( striped==GLOBUS_TRUE )
{
globus_ftp_control_layout_t layout;
FPRINTF(stderr,"Y\nLayout: ");
globus_ftp_client_operationattr_get_layout(&(oattr[fd->fd_sys]),
&layout);
if ( layout.mode==GLOBUS_FTP_CONTROL_STRIPING_NONE )
FPRINTF(stderr,"none\n");
else if ( layout.mode==GLOBUS_FTP_CONTROL_STRIPING_PARTITIONED )
FPRINTF(stderr,"partitioned, size=%d\n",layout.partitioned.size);
else if ( layout.mode==GLOBUS_FTP_CONTROL_STRIPING_BLOCKED_ROUND_ROBIN )
FPRINTF(stderr,"round-robin, block size=%d\n",layout.round_robin.block_size);
else
FPRINTF(stderr,"unknown\n");
}
else
FPRINTF(stderr,"N\n");
fflush(stderr);
}
#endif
} }

Просмотреть файл

@ -50,10 +50,6 @@ static void readcontig_data_cb(void *myargs, globus_ftp_client_handle_t *handle,
readcontig_data_cb: buffer 0x404c0008 length 65536 offset 32112640 eof 0 readcontig_data_cb: buffer 0x404c0008 length 65536 offset 32112640 eof 0
readcontig_data_cb: buffer 0x404d0008 length 65536 offset 32178176 eof 0 readcontig_data_cb: buffer 0x404d0008 length 65536 offset 32178176 eof 0
*/ */
#if 0
FPRINTF(stderr, "%s: buffer %p length %d offset %Ld eof %d\n",
__func__, buffer, length, offset, eof);
#endif
if ( !eof ) if ( !eof )
globus_ftp_client_register_read(handle, globus_ftp_client_register_read(handle,
buffer+length, buffer+length,

Просмотреть файл

@ -364,10 +364,6 @@ void ADIOI_GRIDFTP_WriteDiscontig(ADIO_File fd, void *buf, int count,
{ {
fd->fp_ind += extent; fd->fp_ind += extent;
fd->fp_sys_posn = fd->fp_ind; fd->fp_sys_posn = fd->fp_ind;
#if 0
FPRINTF(stdout, "[%d/%d] new file position is %Ld\n", myrank,
nprocs, (long long) fd->fp_ind);
#endif
} }
else { else {
fd->fp_sys_posn = offset + extent; fd->fp_sys_posn = offset + extent;

Просмотреть файл

@ -8,6 +8,9 @@
#include "ad_hfs.h" #include "ad_hfs.h"
#include "adio_extern.h" #include "adio_extern.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_HFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code) void ADIOI_HFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code)
{ {
int i, ntimes, err; int i, ntimes, err;

Просмотреть файл

@ -7,6 +7,10 @@
#include "ad_hfs.h" #include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_HFS_Open(ADIO_File fd, int *error_code) void ADIOI_HFS_Open(ADIO_File fd, int *error_code)
{ {
int perm, old_mask, amode; int perm, old_mask, amode;

Просмотреть файл

@ -7,6 +7,10 @@
#include "ad_hfs.h" #include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_HFS_ReadContig(ADIO_File fd, void *buf, int count, void ADIOI_HFS_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code) ADIO_Offset offset, ADIO_Status *status, int *error_code)

Просмотреть файл

@ -7,6 +7,10 @@
#include "ad_hfs.h" #include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_HFS_WriteContig(ADIO_File fd, void *buf, int count, void ADIOI_HFS_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code) ADIO_Offset offset, ADIO_Status *status, int *error_code)

Просмотреть файл

@ -1,22 +0,0 @@
<dir>
<file name="ad_lustre_fcntl.c" info="1204573775"/>
<file name="ad_lustre_hints.c" info="1204573775"/>
<file name="ad_lustre_open.c" info="1204573775"/>
<file name="ad_lustre_rwcontig.c" info="1204573775"/>
<file name="ad_lustre.h" info="1204573775"/>
<file name="ad_lustre.c" info="1204573775"/>
</dir>
<data>
<fileinfo name="ad_lustre_fcntl.c">
</fileinfo>
<fileinfo name="ad_lustre_hints.c">
</fileinfo>
<fileinfo name="ad_lustre_open.c">
</fileinfo>
<fileinfo name="ad_lustre_rwcontig.c">
</fileinfo>
<fileinfo name="ad_lustre.h">
</fileinfo>
<fileinfo name="ad_lustre.c">
</fileinfo>
</data>

Просмотреть файл

@ -24,8 +24,11 @@ EXTRA_DIST = README
noinst_LTLIBRARIES = libadio_lustre.la noinst_LTLIBRARIES = libadio_lustre.la
libadio_lustre_la_SOURCES = \ libadio_lustre_la_SOURCES = \
ad_lustre.c \ ad_lustre.c \
ad_lustre_aggregate.c \
ad_lustre_fcntl.c \ ad_lustre_fcntl.c \
ad_lustre.h \ ad_lustre.h \
ad_lustre_hints.c \ ad_lustre_hints.c \
ad_lustre_open.c \ ad_lustre_open.c \
ad_lustre_rwcontig.c ad_lustre_wrcoll.c \
ad_lustre_rwcontig.c \
ad_lustre_wrstr.c

Просмотреть файл

@ -4,6 +4,21 @@ Upcoming soon:
Further out: Further out:
o To post the code for ParColl (Partitioned collective IO) o To post the code for ParColl (Partitioned collective IO)
-----------------------------------------------------
V05:
-----------------------------------------------------
Improved data redistribution
o Improve I/O pattern identification. Besides checking interleaving,
if the requested I/O size is small, collective I/O will be performed.
The hint bigsize can be used to define the request size value.
o Provide hint CO for load balancing to control the number of
IO clients for each OST
o Produce stripe-contiguous I/O pattern that Lustre prefers
o Control read-modify-write in data sieving in collective IO
via the hint ds_in_coll.
o Reduce extent lock conflicts by making each OST be accessed by one or
more constant clients.
----------------------------------------------------- -----------------------------------------------------
V04: V04:
----------------------------------------------------- -----------------------------------------------------

Просмотреть файл

@ -4,21 +4,24 @@
* See COPYRIGHT notice in top-level directory. * See COPYRIGHT notice in top-level directory.
* *
* Copyright (C) 2007 Oak Ridge National Laboratory * Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/ */
#include "ad_lustre.h" #include "ad_lustre.h"
struct ADIOI_Fns_struct ADIO_LUSTRE_operations = { struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
ADIOI_LUSTRE_Open, /* Open */ ADIOI_LUSTRE_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_LUSTRE_ReadContig, /* ReadContig */ ADIOI_LUSTRE_ReadContig, /* ReadContig */
ADIOI_LUSTRE_WriteContig, /* WriteContig */ ADIOI_LUSTRE_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
ADIOI_GEN_SeekIndividual, /* SeekIndividual */ ADIOI_GEN_SeekIndividual, /* SeekIndividual */
ADIOI_GEN_Fcntl, /* Fcntl */ ADIOI_GEN_Fcntl, /* Fcntl */
ADIOI_LUSTRE_SetInfo, /* SetInfo */ ADIOI_LUSTRE_SetInfo, /* SetInfo */
ADIOI_GEN_ReadStrided, /* ReadStrided */ ADIOI_GEN_ReadStrided, /* ReadStrided */
ADIOI_GEN_WriteStrided, /* WriteStrided */ ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
ADIOI_GEN_Close, /* Close */ ADIOI_GEN_Close, /* Close */
#if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE) #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
ADIOI_GEN_IreadContig, /* IreadContig */ ADIOI_GEN_IreadContig, /* IreadContig */
@ -36,4 +39,5 @@ struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
ADIOI_GEN_Flush, /* Flush */ ADIOI_GEN_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */ ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
}; };

Просмотреть файл

@ -4,6 +4,8 @@
* See COPYRIGHT notice in top-level directory. * See COPYRIGHT notice in top-level directory.
* *
* Copyright (C) 2007 Oak Ridge National Laboratory * Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/ */
#ifndef AD_UNIX_INCLUDE #ifndef AD_UNIX_INCLUDE
@ -17,6 +19,7 @@
#ifdef __linux__ #ifdef __linux__
# include <sys/ioctl.h> /* necessary for: */ # include <sys/ioctl.h> /* necessary for: */
# include <time.h>
# define __USE_GNU /* O_DIRECT and */ # define __USE_GNU /* O_DIRECT and */
# include <fcntl.h> /* IO operations */ # include <fcntl.h> /* IO operations */
# undef __USE_GNU # undef __USE_GNU
@ -24,7 +27,7 @@
/*#include <fcntl.h>*/ /*#include <fcntl.h>*/
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include "lustre/lustre_user.h" #include <lustre/lustre_user.h>
#include "adio.h" #include "adio.h"
/*#include "adioi.h"*/ /*#include "adioi.h"*/
@ -43,22 +46,46 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int ADIO_Offset offset, ADIO_Status *status,
*error_code); int *error_code);
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int ADIO_Offset offset, ADIO_Status *status,
*error_code); int *error_code);
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int ADIO_Offset offset, ADIO_Status *status,
*error_code); int *error_code);
void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int ADIO_Offset offset, ADIO_Status *status,
*error_code); int *error_code);
void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
int *error_code); int *error_code);
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
/* the lustre utilities: */
int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
ADIO_Offset *len_list, int nprocs);
void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
int mode);
void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
ADIO_Offset *len_list, int contig_access_count,
int *striping_info, int nprocs,
int *count_my_req_procs_ptr,
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int ***buf_idx_ptr);
int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
ADIO_Offset *len, int *striping_info);
#endif /* End of AD_UNIX_INCLUDE */ #endif /* End of AD_UNIX_INCLUDE */

Просмотреть файл

@ -0,0 +1,322 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
#undef AGG_DEBUG
/* ADIOI_LUSTRE_Get_striping_info() - collect the striping parameters used by
 * the Lustre collective I/O code.
 *
 * On return *striping_info_ptr points to a freshly allocated array of three
 * ints (the caller is responsible for freeing it):
 *   [0] stripe size        (fd->hints->striping_unit)
 *   [1] stripe count       (fd->hints->striping_factor)
 *   [2] number of I/O clients (aggregators) that will actually be used
 *
 * mode == 0 selects the collective-read policy (one client per OST);
 * any other value uses the romio_lustre_co_ratio hint as the number of
 * clients allowed per OST.
 */
void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
                                    int mode)
{
    /* stripe size/count are expected to have been validated at open time */
    int unit = fd->hints->striping_unit;
    int n_osts = fd->hints->striping_factor;
    int max_clients = fd->hints->cb_nodes;
    int clients_per_ost;
    int n_clients;
    int d;
    int *info;

    /* For reads, several clients hitting the same OST would cost extra disk
     * seeks, so stick to one client per OST.  For writes use the (already
     * validated, > 0) co_ratio hint. */
    if (mode)
        clients_per_ost = fd->hints->fs_hints.lustre.co_ratio;
    else
        clients_per_ost = 1;

    /* Algorithm courtesy Pascal Deveze (pascal.deveze@bull.net).
     * To avoid extent lock conflicts the number of clients must either be a
     * multiple of the stripe count or divide it exactly, so that every OST
     * is served by a constant number of clients (at most clients_per_ost). */
    if (max_clients >= n_osts) {
        /* Multiple of the stripe count, limited per OST by the smaller of
         * max_clients/n_osts and clients_per_ost.
         * e.g. stripe count 20, 42 candidate procs, CO 3  ->  40 clients */
        n_clients = n_osts * ADIOI_MIN(max_clients / n_osts, clients_per_ost);
    } else {
        /* Fewer candidates than stripes: pick the largest divisor of the
         * stripe count that does not exceed max_clients.
         * e.g. stripe count 60, 8 candidate procs  ->  6 clients.
         * Only divisors up to sqrt(n_osts) are tried; each one also yields
         * its cofactor n_osts/d as a candidate. */
        n_clients = 1;
        for (d = 2; n_osts >= d * d; d++) {
            if ((n_osts % d) != 0)
                continue;
            if (n_osts / d <= max_clients) {
                /* cofactors shrink as d grows, so the first one that fits
                 * is the best possible answer */
                n_clients = n_osts / d;
                break;
            }
            /* d itself is a valid answer, though maybe not the best one */
            if (d <= max_clients)
                n_clients = d;
        }
    }

    info = (int *) ADIOI_Malloc(3 * sizeof(int));
    info[0] = unit;
    info[1] = n_osts;
    info[2] = n_clients;
    *striping_info_ptr = info;
}
/* ADIOI_LUSTRE_Calc_aggregator() - find which aggregator handles the byte at
 * file offset 'off'.
 *
 * Stripes are dealt out round-robin over striping_info[2] aggregators, using
 * striping_info[0] as the stripe size.  On entry *len is the length of the
 * contiguous request starting at 'off'; on return it has been shortened, if
 * necessary, to the number of bytes left in that stripe.
 *
 * Returns the rank taken from fd->hints->ranklist for the chosen aggregator.
 */
int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
                                 ADIO_Offset *len, int *striping_info)
{
    const ADIO_Offset unit = (ADIO_Offset) striping_info[0];
    const int n_clients = striping_info[2];
    ADIO_Offset stripe_no, bytes_in_stripe;
    int slot;

    /* Produce the stripe-contiguous pattern for Lustre */
    stripe_no = off / unit;
    slot = (int) (stripe_no % n_clients);

    /* slot is used to index arrays that were sized for at most
     * fd->hints->cb_nodes entries; going past that would overrun them.
     * This abort should never be reached. */
    if (slot >= fd->hints->cb_nodes)
        MPI_Abort(MPI_COMM_WORLD, 1);

    /* bytes from 'off' up to the end of the stripe that contains it */
    bytes_in_stripe = (stripe_no + 1) * unit - off;
    if (bytes_in_stripe < *len) {
        /* this aggregator only covers part of the requested contig. region */
        *len = bytes_in_stripe;
    }

    /* map the aggregator index to a rank */
    return fd->hints->ranklist[slot];
}
/* ADIOI_LUSTRE_Calc_my_req() - work out which pieces of this process's
 * access requests belong to which aggregator (possibly this process itself).
 *
 * Same structure as the generic ADIOI_Calc_my_req(), except that pieces are
 * assigned with ADIOI_LUSTRE_Calc_aggregator().
 *
 * Outputs (all allocated here):
 *   *count_my_req_procs_ptr     number of processes that receive at least
 *                               one piece
 *   *count_my_req_per_proc_ptr  array[nprocs], number of pieces per process
 *   *my_req_ptr                 array[nprocs] of offset/length lists
 *   *buf_idx_ptr                array[nprocs] of int arrays; entry [p][k] is
 *                               the running byte position, within this
 *                               process's request, of piece k sent to p
 */
void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
                              ADIO_Offset *len_list, int contig_access_count,
                              int *striping_info, int nprocs,
                              int *count_my_req_procs_ptr,
                              int **count_my_req_per_proc_ptr,
                              ADIOI_Access **my_req_ptr,
                              int ***buf_idx_ptr)
{
    int *count_my_req_per_proc, n_targets, **idx_tab;
    int a, p, l, proc;
    ADIO_Offset avail_len, left, curr_idx, pos;
    ADIOI_Access *reqs;

    /* count_my_req_per_proc[p] = number of contiguous pieces of this
     * process's request that land on process p.  Zero-initialized and sized
     * nprocs so that it can be used in an MPI_Alltoall later on. */
    count_my_req_per_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    *count_my_req_per_proc_ptr = count_my_req_per_proc;

    idx_tab = (int **) ADIOI_Malloc(nprocs * sizeof(int *));

    /* Pass 1: only count the pieces, so we know how much to allocate.
     * contig_access_count comes from ADIOI_Calc_my_off_len(). */
    for (a = 0; a < contig_access_count; a++) {
        /* nothing to do for a zero-byte access */
        if (len_list[a] == 0)
            continue;

        pos = offset_list[a];
        left = len_list[a];
        do {
            /* hand the whole remainder to the aggregator lookup; it trims
             * avail_len down to what fits in the current stripe */
            avail_len = left;
            proc = ADIOI_LUSTRE_Calc_aggregator(fd, pos, &avail_len,
                                                striping_info);
            count_my_req_per_proc[proc]++;
            left -= avail_len;
            pos += avail_len;   /* first byte not yet assigned */
        } while (left != 0);
    }

    /* buf_idx is only meaningful when the buffer datatype is contiguous:
     * it lets data be moved straight to/from the user buffer without an
     * intermediate copy. */
    for (p = 0; p < nprocs; p++) {
        /* one extra element so that we never request a zero-size block */
        idx_tab[p] = (int *) ADIOI_Malloc((count_my_req_per_proc[p] + 1)
                                          * sizeof(int));
    }

    /* per-process offset/length lists */
    reqs = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
    *my_req_ptr = reqs;

    n_targets = 0;
    for (p = 0; p < nprocs; p++) {
        reqs[p].count = 0;      /* filled in by pass 2 */
        if (count_my_req_per_proc[p] == 0)
            continue;
        reqs[p].offsets = (ADIO_Offset *)
            ADIOI_Malloc(count_my_req_per_proc[p] * sizeof(ADIO_Offset));
        reqs[p].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[p] *
                                            sizeof(int));
        n_targets++;
    }

    /* Pass 2: walk the accesses again and record every piece. */
    curr_idx = 0;
    for (a = 0; a < contig_access_count; a++) {
        /* nothing to do for a zero-byte access */
        if (len_list[a] == 0)
            continue;

        pos = offset_list[a];
        left = len_list[a];
        do {
            avail_len = left;
            proc = ADIOI_LUSTRE_Calc_aggregator(fd, pos, &avail_len,
                                                striping_info);

            l = reqs[proc].count;
            /* buf_idx entries are ints: make sure nothing is truncated and
             * that we stay within what pass 1 counted */
            ADIOI_Assert(curr_idx == (int) curr_idx);
            ADIOI_Assert(l < count_my_req_per_proc[proc]);
            idx_tab[proc][l] = (int) curr_idx;
            curr_idx += avail_len;

            /* remember offset and length of this piece for process 'proc' */
            reqs[proc].offsets[l] = pos;
            ADIOI_Assert(avail_len == (int) avail_len);
            reqs[proc].lens[l] = (int) avail_len;
            reqs[proc].count++;

            left -= avail_len;
            pos += avail_len;
        } while (left != 0);
    }

#ifdef AGG_DEBUG
    for (p = 0; p < nprocs; p++) {
        if (count_my_req_per_proc[p] > 0) {
            FPRINTF(stdout, "data needed from %d (count = %d):\n",
                    p, reqs[p].count);
            for (l = 0; l < reqs[p].count; l++) {
                FPRINTF(stdout, " off[%d] = %lld, len[%d] = %d\n",
                        l, reqs[p].offsets[l], l, reqs[p].lens[l]);
            }
        }
    }
#endif

    *count_my_req_procs_ptr = n_targets;
    *buf_idx_ptr = idx_tab;
}
/* ADIOI_LUSTRE_Docollect() - decide whether a collective write is worth
 * doing when the processes' accesses are not interleaved.
 *
 * The average request size over all processes in fd->comm is compared with
 * the romio_lustre_coll_threshold hint: if the hint is positive and the
 * average is larger, collective I/O is not used.
 *
 * This routine is collective over fd->comm (it calls MPI_Allreduce).
 *
 *   fd                   - open file; supplies the communicator and hints
 *   contig_access_count  - number of entries in len_list for this process
 *   len_list             - lengths of this process's contiguous accesses
 *   nprocs               - not used here
 *
 * Returns 1 if collective I/O should be performed, 0 otherwise.
 */
int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
                           ADIO_Offset *len_list, int nprocs)
{
    int i, docollect = 1, big_req_size = 0;
    ADIO_Offset req_size = 0, total_req_size = 0;
    /* kept as ADIO_Offset so that a large average is not truncated before
     * it is compared with the threshold */
    ADIO_Offset avg_req_size = 0;
    int total_access_count = 0;

    /* calculate total_req_size and total_access_count */
    for (i = 0; i < contig_access_count; i++)
        req_size += len_list[i];
    MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
                  fd->comm);
    MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT,
                  MPI_SUM, fd->comm);

    /* estimate average req_size.  The total access count can be zero
     * (presumably when every process issues a zero-length request); avoid
     * the divide-by-zero and treat the average as 0 in that case. */
    if (total_access_count != 0)
        avg_req_size = total_req_size / total_access_count;

    /* get hint of big_req_size */
    big_req_size = fd->hints->fs_hints.lustre.coll_threshold;

    /* Don't perform collective I/O if there are big requests */
    if ((big_req_size > 0) && (avg_req_size > big_req_size))
        docollect = 0;

    return docollect;
}

Просмотреть файл

@ -25,7 +25,7 @@ void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
if (fd->fp_sys_posn != -1) if (fd->fp_sys_posn != -1)
lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
if (fcntl_struct->fsize == -1) { if (fcntl_struct->fsize == -1) {
*error_code = MPIR_Err_create_code(MPI_SUCCESS, *error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__, MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s", strerror(errno)); MPI_ERR_IO, "**io", "**io %s", strerror(errno));
} }
@ -56,7 +56,7 @@ void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, done, ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, done,
&status, error_code); &status, error_code);
if (*error_code != MPI_SUCCESS) { if (*error_code != MPI_SUCCESS) {
*error_code = MPIR_Err_create_code(MPI_SUCCESS, *error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__, MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s", strerror(errno)); MPI_ERR_IO, "**io", "**io %s", strerror(errno));
return; return;

Просмотреть файл

@ -4,6 +4,8 @@
* See COPYRIGHT notice in top-level directory. * See COPYRIGHT notice in top-level directory.
* *
* Copyright (C) 2007 Oak Ridge National Laboratory * Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/ */
#include "ad_lustre.h" #include "ad_lustre.h"
@ -11,10 +13,12 @@
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{ {
char *value, *value_in_fd; char *value;
int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1; int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
struct lov_user_md lum = { 0 }; struct lov_user_md lum = { 0 };
int err, myrank, fd_sys, perm, amode, old_mask; int err, myrank, fd_sys, perm, amode, old_mask;
int int_val, tmp_val;
static char myname[] = "ADIOI_LUSTRE_SETINFO";
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
if ( (fd->info) == MPI_INFO_NULL) { if ( (fd->info) == MPI_INFO_NULL) {
@ -22,54 +26,63 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if necessary. */ if necessary. */
MPI_Info_create(&(fd->info)); MPI_Info_create(&(fd->info));
MPI_Info_set(fd->info, "direct_read", "false"); ADIOI_Info_set(fd->info, "direct_read", "false");
MPI_Info_set(fd->info, "direct_write", "false"); ADIOI_Info_set(fd->info, "direct_write", "false");
fd->direct_read = fd->direct_write = 0; fd->direct_read = fd->direct_write = 0;
/* initialize lustre hints */
ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
fd->hints->fs_hints.lustre.co_ratio = 1;
ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
fd->hints->fs_hints.lustre.coll_threshold = 0;
ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;
/* has user specified striping or server buffering parameters /* has user specified striping or server buffering parameters
and do they have the same value on all processes? */ and do they have the same value on all processes? */
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, /* striping information */
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) if (flag)
str_unit=atoi(value); str_unit=atoi(value);
MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) if (flag)
str_factor=atoi(value); str_factor=atoi(value);
MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "romio_lustre_start_iodevice",
value, &flag); MPI_MAX_INFO_VAL, value, &flag);
if (flag) if (flag)
start_iodev=atoi(value); start_iodev=atoi(value);
MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, /* direct read and write */
ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
MPI_Info_set(fd->info, "direct_read", "true"); ADIOI_Info_set(fd->info, "direct_read", "true");
fd->direct_read = 1; fd->direct_read = 1;
} }
ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
MPI_Info_set(fd->info, "direct_write", "true"); ADIOI_Info_set(fd->info, "direct_write", "true");
fd->direct_write = 1; fd->direct_write = 1;
} }
} }
/* set striping information with ioctl */
MPI_Comm_rank(fd->comm, &myrank); MPI_Comm_rank(fd->comm, &myrank);
if (myrank == 0) { if (myrank == 0) {
tmp_val[0] = str_factor; stripe_val[0] = str_factor;
tmp_val[1] = str_unit; stripe_val[1] = str_unit;
tmp_val[2] = start_iodev; stripe_val[2] = start_iodev;
} }
MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm); MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm);
if (tmp_val[0] != str_factor if (stripe_val[0] != str_factor
|| tmp_val[1] != str_unit || stripe_val[1] != str_unit
|| tmp_val[2] != start_iodev) { || stripe_val[2] != start_iodev) {
FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys" FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
"-striping_factor:striping_unit:start_iodevice " "-striping_factor:striping_unit:start_iodevice "
"need to be identical across all processes\n"); "need to be identical across all processes\n");
@ -119,17 +132,65 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
} }
} /* End of striping parameters validation */ } /* End of striping parameters validation */
} }
MPI_Barrier(fd->comm); MPI_Barrier(fd->comm);
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
} else {
/* The file has been opened previously and fd->fd_sys is a valid
file descriptor. cannot set striping parameters now. */
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
} }
/* get other hint */
if (users_info != MPI_INFO_NULL) {
/* CO: IO Clients/OST,
* to keep the load balancing between clients and OSTs */
ADIOI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value,
&flag);
if (flag && (int_val = atoi(value)) > 0) {
tmp_val = int_val;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_co_ratio",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", value);
fd->hints->fs_hints.lustre.co_ratio = atoi(value);
}
/* coll_threshold:
* if the req size is bigger than this, collective IO may not be performed.
*/
ADIOI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value,
&flag);
if (flag && (int_val = atoi(value)) > 0) {
tmp_val = int_val;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_coll_threshold",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", value);
fd->hints->fs_hints.lustre.coll_threshold = atoi(value);
}
/* ds_in_coll: disable data sieving in collective IO */
ADIOI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))) {
tmp_val = int_val = 2;
MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_ds_in_coll",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable");
fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE;
}
}
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
if (ADIOI_Direct_read) fd->direct_read = 1; if (ADIOI_Direct_read) fd->direct_read = 1;
if (ADIOI_Direct_write) fd->direct_write = 1; if (ADIOI_Direct_write) fd->direct_write = 1;

Просмотреть файл

@ -4,14 +4,22 @@
* See COPYRIGHT notice in top-level directory. * See COPYRIGHT notice in top-level directory.
* *
* Copyright (C) 2007 Oak Ridge National Laboratory * Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/ */
#include "ad_lustre.h" #include "ad_lustre.h"
/* what is the basis for this define?
* what happens if there are more than 1k UUIDs? */
#define MAX_LOV_UUID_COUNT 1000
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
{ {
int perm, old_mask, amode, amode_direct; int perm, old_mask, amode, amode_direct;
struct lov_user_md lum = { 0 }; int lumlen;
struct lov_user_md *lum = NULL;
char *value; char *value;
#if defined(MPICH2) || !defined(PRINT_ERR_MSG) #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
@ -44,23 +52,37 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
if (fd->fd_sys != -1) { if (fd->fd_sys != -1) {
int err; int err;
/* get file striping information and set it in info */
/* odd malloc here because lov_user_md contains some fixed data and
* then a list of 'lmm_objects' representing stripe */
lumlen = sizeof(struct lov_user_md) +
MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
/* furthermore, Pascal Deveze reports that, even though we pass a
* "GETSTRIPE" (read) flag to the ioctl, if some of the values of this
* struct are uninitialized, the call can give an error. calloc in case
* there are other members that must be initialized and in case
* lov_user_md struct changes in future */
lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen);
lum->lmm_magic = LOV_USER_MAGIC;
err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum);
if (!err) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
/* get file striping information and set it in info */ fd->hints->striping_unit = lum->lmm_stripe_size;
lum.lmm_magic = LOV_USER_MAGIC; sprintf(value, "%d", lum->lmm_stripe_size);
err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); ADIOI_Info_set(fd->info, "striping_unit", value);
if (!err) { fd->hints->striping_factor = lum->lmm_stripe_count;
sprintf(value, "%d", lum.lmm_stripe_size); sprintf(value, "%d", lum->lmm_stripe_count);
MPI_Info_set(fd->info, "striping_unit", value); ADIOI_Info_set(fd->info, "striping_factor", value);
sprintf(value, "%d", lum.lmm_stripe_count); fd->hints->fs_hints.lustre.start_iodevice = lum->lmm_stripe_offset;
MPI_Info_set(fd->info, "striping_factor", value); sprintf(value, "%d", lum->lmm_stripe_offset);
ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
sprintf(value, "%d", lum.lmm_stripe_offset);
MPI_Info_set(fd->info, "start_iodevice", value);
}
ADIOI_Free(value); ADIOI_Free(value);
}
ADIOI_Free(lum);
if (fd->access_mode & ADIO_APPEND) if (fd->access_mode & ADIO_APPEND)
fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

Просмотреть файл

@ -4,6 +4,8 @@
* See COPYRIGHT notice in top-level directory. * See COPYRIGHT notice in top-level directory.
* *
* Copyright (C) 2007 Oak Ridge National Laboratory * Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/ */
#define _XOPEN_SOURCE 600 #define _XOPEN_SOURCE 600
@ -18,7 +20,7 @@ static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len, static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len,
ADIO_Offset offset, int *err) ADIO_Offset offset, int *err)
{ {
int ntimes, rem, newrem, i, size, nbytes; int rem, size, nbytes;
if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz)) { if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz)) {
*err = pwrite(fd->fd_direct, buf, len, offset); *err = pwrite(fd->fd_direct, buf, len, offset);
} else if (len < fd->d_miniosz) { } else if (len < fd->d_miniosz) {
@ -37,7 +39,7 @@ static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len,
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len, static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len,
ADIO_Offset offset, int *err) ADIO_Offset offset, int *err)
{ {
int ntimes, rem, newrem, i, size, nbytes; int rem, size, nbytes;
if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz)) if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz))
*err = pread(fd->fd_direct, buf, len, offset); *err = pread(fd->fd_direct, buf, len, offset);
else if (len < fd->d_miniosz) else if (len < fd->d_miniosz)
@ -59,7 +61,6 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
{ {
int err=-1, diff, size=len, nbytes = 0; int err=-1, diff, size=len, nbytes = 0;
void *newbuf; void *newbuf;
static char myname[] = "ADIOI_LUSTRE_Directio";
if (offset % fd->d_miniosz) { if (offset % fd->d_miniosz) {
diff = fd->d_miniosz - (offset % fd->d_miniosz); diff = fd->d_miniosz - (offset % fd->d_miniosz);
@ -87,7 +88,7 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
memcpy(newbuf, buf, size); memcpy(newbuf, buf, size);
ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, newbuf, size, offset, &err); ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, newbuf, size, offset, &err);
nbytes += err; nbytes += err;
free(newbuf); ADIOI_Free(newbuf);
} }
else nbytes += pwrite(fd->fd_sys, buf, size, offset); else nbytes += pwrite(fd->fd_sys, buf, size, offset);
} }
@ -102,7 +103,7 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, newbuf, size, offset, &err); ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, newbuf, size, offset, &err);
if (err > 0) memcpy(buf, newbuf, err); if (err > 0) memcpy(buf, newbuf, err);
nbytes += err; nbytes += err;
free(newbuf); ADIOI_Free(newbuf);
} }
else nbytes += pread(fd->fd_sys, buf, size, offset); else nbytes += pread(fd->fd_sys, buf, size, offset);
} }
@ -136,10 +137,23 @@ static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count,
if (err == -1) goto ioerr; if (err == -1) goto ioerr;
} }
if (io_mode) if (io_mode) {
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
#endif
err = write(fd->fd_sys, buf, len); err = write(fd->fd_sys, buf, len);
else #ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
#endif
} else {
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
#endif
err = read(fd->fd_sys, buf, len); err = read(fd->fd_sys, buf, len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
#endif
}
} else { } else {
err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode); err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
} }

Просмотреть файл

@ -0,0 +1,954 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
/* prototypes of functions used for collective writes only. */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
MPI_Datatype datatype, int nprocs,
int myrank,
ADIOI_Access *others_req,
ADIOI_Access *my_req,
ADIO_Offset *offset_list,
ADIO_Offset *len_list,
int contig_access_count,
int *striping_info,
int **buf_idx, int *error_code);
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
ADIOI_Flatlist_node *flat_buf,
char **send_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
MPI_Request *requests,
int *sent_to_proc, int nprocs,
int myrank, int contig_access_count,
int *striping_info,
int *send_buf_idx,
int *curr_to_proc,
int *done_to_proc, int iter,
MPI_Aint buftype_extent);
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
char *write_buf,
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off,
int size, int *count,
int *start_pos, int *partial_recv,
int *sent_to_proc, int nprocs,
int myrank, int buftype_is_contig,
int contig_access_count,
int *striping_info,
ADIOI_Access *others_req,
int *send_buf_idx,
int *curr_to_proc,
int *done_to_proc, int *hole,
int iter, MPI_Aint buftype_extent,
int *buf_idx, int *error_code);
void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
ADIO_Offset *srt_off, int *srt_len, int *start_pos,
int nprocs, int nprocs_recv, int total_elements);
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
                                   MPI_Datatype datatype,
                                   int file_ptr_type, ADIO_Offset offset,
                                   ADIO_Status *status, int *error_code)
{
    /* Collective strided write for Lustre.
     *
     * Implements a generalized version of the extended two-phase method
     * described in "An Extended Two-Phase Method for Accessing Sections of
     * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
     * Scientific Programming, (5)4:301--317, Winter 1996.
     * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
     *
     * Depending on the cb_write hint and on the access pattern detected
     * below, the request is either handed to the independent write
     * routines or carried out collectively through
     * ADIOI_LUSTRE_Exch_and_write().
     */

    /* array of nprocs access structures, one per process that holds part
     * of this process's request */
    ADIOI_Access *my_req;
    /* array of nprocs access structures, one per process whose request is
     * written by this process */
    ADIOI_Access *others_req;

    int p, filetype_is_contig, nprocs, myrank;
    int do_collect = 0;
    int contig_access_count = 0, buftype_is_contig;
    int interleave_count = 0;
    int use_independent;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset saved_fp, start_offset, end_offset, off;
    ADIO_Offset *offset_list = NULL, *len_list = NULL;
    ADIO_Offset *st_offsets = NULL, *end_offsets = NULL;
    int **buf_idx = NULL;
    int *striping_info = NULL;
    int local_error, reduce_in;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* remember the individual file pointer so the independent path can
     * restore it */
    saved_fp = fd->fp_ind;

    /* I/O pattern identification, unless cb_write is disabled */
    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
        /* Build the list of file offsets and lengths for this process's
         * request and find its start and end offsets.
         * Note: end_offset is the last byte-offset that will be accessed,
         * e.g. start_offset=0 and 100 bytes accessed gives end_offset=99.
         */
        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &offset_list, &len_list, &start_offset,
                              &end_offset, &contig_access_count);

        /* Every process publishes its start and end offsets; the result
         * is one array of each, ordered by process rank.
         */
        st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
        end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                      ADIO_OFFSET, fd->comm);

        /* Are the accesses of different processes interleaved?  This is
         * a rudimentary check, but should suffice for the moment. */
        for (p = 1; p < nprocs; p++) {
            if (st_offsets[p] >= end_offsets[p - 1])
                continue;
            if (st_offsets[p] <= end_offsets[p])
                interleave_count++;
        }

        /* Two typical access patterns can benefit from collective write:
         * 1) the processes are interleaved, and
         * 2) the request size is small.
         */
        do_collect = (interleave_count > 0)
            ? 1
            : ADIOI_LUSTRE_Docollect(fd, contig_access_count,
                                     len_list, nprocs);
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    /* Decide whether collective I/O should be done */
    use_independent = (fd->hints->cb_write == ADIOI_HINT_DISABLE) ||
        (fd->hints->cb_write == ADIOI_HINT_AUTO && !do_collect);

    if (use_independent) {
        /* the lists only exist if the pattern analysis above was run */
        if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
            ADIOI_Free(offset_list);
            ADIOI_Free(len_list);
            ADIOI_Free(st_offsets);
            ADIOI_Free(end_offsets);
        }

        fd->fp_ind = saved_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

        if (!buftype_is_contig || !filetype_is_contig) {
            ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                              offset, status, error_code);
            return;
        }

        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
            ADIO_WriteContig(fd, buf, count, datatype,
                             ADIO_EXPLICIT_OFFSET,
                             off, status, error_code);
        } else {
            ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                             0, status, error_code);
        }
        return;
    }

    /* Get Lustre hints information */
    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);

    /* Work out which portions of this process's access requests are
     * located in which process.
     */
    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
                             striping_info, nprocs, &count_my_req_procs,
                             &count_my_req_per_proc, &my_req,
                             &buf_idx);

    /* From everyone's my_req, work out which requests of other processes
     * will be accessed by this process.
     * count_others_req_procs = number of processes whose requests
     * (including this process itself) will be accessed by this process.
     */
    ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
                          my_req, nprocs, myrank, &count_others_req_procs,
                          &others_req);
    ADIOI_Free(count_my_req_per_proc);

    /* exchange data and write in sizes of no more than stripe_size */
    ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
                                others_req, my_req, offset_list, len_list,
                                contig_access_count, striping_info,
                                buf_idx, error_code);

    /* If this collective write is followed by an independent write, the
     * subsequent writes on other processes could race ahead and sneak in
     * before the read-modify-write completes.  A collective communication
     * is carried out here so that no one can start independent I/O before
     * the collective I/O completes.
     *
     * The error codes need some gymnastics: if something went wrong, all
     * processes report an error, but a process that has a more specific
     * error code still reports that additional information.
     */
    local_error = *error_code;
    if (*error_code != MPI_SUCCESS)
        *error_code = MPI_ERR_IO;

    /* optimization: if only one process performs I/O, a less expensive
     * Bcast is enough */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
#endif
    if (fd->hints->cb_nodes == 1) {
        MPI_Bcast(error_code, 1, MPI_INT,
                  fd->hints->ranklist[0], fd->comm);
    } else {
        reduce_in = *error_code;
        MPI_Allreduce(&reduce_in, error_code, 1, MPI_INT,
                      MPI_MAX, fd->comm);
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
#endif

    /* restore the more specific local error, if there was one */
    if ((local_error != MPI_SUCCESS) && (local_error != MPI_ERR_IO))
        *error_code = local_error;

    if (!buftype_is_contig)
        ADIOI_Delete_flattened(datatype);

    /* release everything allocated for collective I/O */

    /* others_req and its per-process arrays */
    for (p = 0; p < nprocs; p++) {
        if (!others_req[p].count)
            continue;
        ADIOI_Free(others_req[p].offsets);
        ADIOI_Free(others_req[p].lens);
        ADIOI_Free(others_req[p].mem_ptrs);
    }
    ADIOI_Free(others_req);

    /* my_req and its per-process arrays */
    for (p = 0; p < nprocs; p++) {
        if (!my_req[p].count)
            continue;
        ADIOI_Free(my_req[p].offsets);
        ADIOI_Free(my_req[p].lens);
    }
    ADIOI_Free(my_req);

    for (p = 0; p < nprocs; p++)
        ADIOI_Free(buf_idx[p]);
    ADIOI_Free(buf_idx);

    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(striping_info);

#ifdef HAVE_STATUS_SET_BYTES
    if (status) {
        int nbytes, type_size;
        /* Don't set status if it isn't needed */
        MPI_Type_size(datatype, &type_size);
        nbytes = type_size * count;
        MPIR_Status_set_bytes(status, datatype, nbytes);
    }
    /* This is a temporary way of filling in status.  The right way is to
     * keep track of how much data was actually written during collective
     * I/O.
     */
#endif

    fd->fp_sys_posn = -1;       /* set it to null. */
}
/* Exchange data between processes and write it to the file, one
 * stripe-sized piece per process and communication phase.
 *
 * fd, buf, datatype    file, user buffer and buffer datatype of the write
 * nprocs, myrank       size of and rank in fd->comm
 * others_req           per process: requests this process must write
 * my_req               per process: parts of this process's request that
 *                      are located in that process
 * offset_list/len_list this process's flattened file accesses
 *                      (contig_access_count entries)
 * striping_info        [0] is used as the stripe size and [2] as the
 *                      number of available aggregator nodes
 * buf_idx              per process, per request index into the user buffer
 *
 * If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
                                        MPI_Datatype datatype, int nprocs,
                                        int myrank, ADIOI_Access *others_req,
                                        ADIOI_Access *my_req,
                                        ADIO_Offset *offset_list,
                                        ADIO_Offset *len_list,
                                        int contig_access_count,
                                        int *striping_info, int **buf_idx,
                                        int *error_code)
{
    /* Send data to appropriate processes and write in sizes of no more
     * than lustre stripe_size.
     * The idea is to reduce the amount of extra memory required for
     * collective I/O. If all data were written all at once, which is much
     * easier, it would require temp space more than the size of user_buf,
     * which is often unacceptable. For example, to write a distributed
     * array to a file, where each local array is 8Mbytes, requiring
     * at least another 8Mbytes of temp space is unacceptable.
     */
    int hole, i, j, m, flag, ntimes = 1, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
    ADIO_Offset max_size, step_size = 0;
    int real_size, req_len, send_len;
    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
    int *send_curr_offlen_ptr, *send_size;
    /* partial_recv is never allocated here; it is only handed through to
     * ADIOI_LUSTRE_W_Exchange_data().  It is initialised to NULL so that
     * no indeterminate pointer value is read when it is passed. */
    int *partial_recv = NULL, *sent_to_proc, *recv_start_pos;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    int *this_buf_idx;
    char *write_buf = NULL;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
    int data_sieving = 0;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of writes of stripe size to be done.
     * That gives the no. of communication phases as well.
     * Note:
     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
     *   each process has the same no. of communication phases.
     */

    /* seed st_loc/end_loc with the first request this process has to write */
    for (i = 0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }
    /* then widen them to cover every request this process has to write */
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
                                          others_req[i].lens[j] - 1));
        }
    }
    /* this process does no writing. */
    if ((st_loc == -1) && (end_loc == -1))
        ntimes = 0;
    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
    /* avoid min_st_loc be -1 */
    if (st_loc == -1)
        st_loc = max_end_loc;
    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
    /* align downward */
    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;

    /* Each time, only avail_cb_nodes number of IO clients perform IO,
     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
     * and ntimes=whole_file_portion/step_size
     */
    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
    max_ntimes = (max_end_loc - min_st_loc + 1) / step_size
        + (((max_end_loc - min_st_loc + 1) % step_size) ? 1 : 0);
    /* max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); */
    if (ntimes)
        write_buf = (char *) ADIOI_Malloc(stripe_size);

    /* calculate the start offset for each iteration */
    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
    for (m = 0; m < max_ntimes; m ++)
        off_list[m] = max_end_loc;
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j ++) {
            req_off = others_req[i].offsets[j];
            m = (int)((req_off - min_st_loc) / step_size);
            off_list[m] = ADIOI_MIN(off_list[m], req_off);
        }
    }

    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* their use is explained below. calloc initializes to 0. */

    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer */

    this_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));

    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* used to store the starting value of recv_curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype)
            flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    /* I need to check if there are any outstanding nonblocking writes to
     * the file, which could potentially interfere with the writes taking
     * place in this collective write call. Since this is not likely to be
     * common, let me do the simplest thing possible here: Each process
     * completes all pending nonblocking operations before completing.
     */
    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    iter_st_off = min_st_loc;

    /* Although we have recognized the data according to OST index,
     * a read-modify-write will be done if there is a hole between the data.
     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
     * then rank0 will collect data [0, 30] and [60, 90] then write. There
     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
     *
     * To reduce its impact on the performance, we can disable data sieving
     * by hint "ds_in_coll".
     */
    /* check the hint for data sieving */
    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;

    for (m = 0; m < max_ntimes; m++) {
        /* go through all others_req and my_req to check which will be received
         * and sent in this iteration.
         */

        /* Note that MPI guarantees that displacements in filetypes are in
           monotonically nondecreasing order and that, for writes, the
           filetypes cannot specify overlapping regions in the file. This
           simplifies implementation a bit compared to reads. */

        /*
           off         = start offset in the file for the data to be written in
                         this iteration
           iter_st_off = start offset of this iteration
           real_size   = size of data written (bytes) corresponding to off
           max_size    = possible maximum size of data written in this iteration
           req_off     = offset in the file for a particular contiguous request minus
                         what was satisfied in previous iteration
           send_off    = offset the request needed by other processes in this iteration
           req_len     = size corresponding to req_off
           send_len    = size corresponding to send_off
         */

        /* first calculate what should be communicated */
        for (i = 0; i < nprocs; i++)
            recv_count[i] = recv_size[i] = send_size[i] = 0;

        off = off_list[m];
        max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
        real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size -
                                    off,
                                    end_loc - off + 1);

        for (i = 0; i < nprocs; i++) {
            if (my_req[i].count) {
                this_buf_idx[i] = buf_idx[i][send_curr_offlen_ptr[i]];
                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
                    send_off = my_req[i].offsets[j];
                    send_len = my_req[i].lens[j];
                    if (send_off < iter_st_off + max_size) {
                        send_size[i] += send_len;
                    } else {
                        break;
                    }
                }
                send_curr_offlen_ptr[i] = j;
            }
            if (others_req[i].count) {
                recv_start_pos[i] = recv_curr_offlen_ptr[i];
                for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
                    if (req_off < iter_st_off + max_size) {
                        recv_count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
                        /* absolute address inside write_buf where this
                         * piece will be received */
                        MPI_Address(write_buf + req_off - off,
                                    &(others_req[i].mem_ptrs[j]));
                        recv_size[i] += req_len;
                    } else {
                        break;
                    }
                }
                recv_curr_offlen_ptr[i] = j;
            }
        }

        /* use variable "hole" to pass data_sieving flag into W_Exchange_data;
         * on return it holds the result of the hole detection */
        hole = data_sieving;
        ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                                     len_list, send_size, recv_size, off, real_size,
                                     recv_count, recv_start_pos, partial_recv,
                                     sent_to_proc, nprocs, myrank,
                                     buftype_is_contig, contig_access_count,
                                     striping_info, others_req, send_buf_idx,
                                     curr_to_proc, done_to_proc, &hole, m,
                                     buftype_extent, this_buf_idx, error_code);
        if (*error_code != MPI_SUCCESS)
            goto over;

        /* did this process receive anything to write in this iteration? */
        flag = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_count[i]) {
                flag = 1;
                break;
            }
        if (flag) {
            /* check whether to do data sieving */
            if(data_sieving == ADIOI_HINT_ENABLE) {
                ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                 ADIO_EXPLICIT_OFFSET, off, &status,
                                 error_code);
            } else {
                /* if there is no hole, write data in one time;
                 * otherwise, write data in several times */
                if (!hole) {
                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, off, &status,
                                     error_code);
                } else {
                    for (i = 0; i < nprocs; i++) {
                        if (others_req[i].count) {
                            for (j = 0; j < others_req[i].count; j++) {
                                if (others_req[i].offsets[j] < off + real_size &&
                                    others_req[i].offsets[j] >= off) {
                                    ADIO_WriteContig(fd,
                                                     write_buf + others_req[i].offsets[j] - off,
                                                     others_req[i].lens[j],
                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                                     others_req[i].offsets[j], &status,
                                                     error_code);
                                    if (*error_code != MPI_SUCCESS)
                                        goto over;
                                }
                            }
                        }
                    }
                }
            }
            if (*error_code != MPI_SUCCESS)
                goto over;
        }
        iter_st_off += max_size;
    }
over:
    /* common cleanup for the success and the error path */
    if (ntimes)
        ADIOI_Free(write_buf);
    ADIOI_Free(recv_curr_offlen_ptr);
    ADIOI_Free(send_curr_offlen_ptr);
    ADIOI_Free(recv_count);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(recv_start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
    ADIOI_Free(this_buf_idx);
    ADIOI_Free(off_list);
}
/* Perform the data exchange of one communication phase: build the receive
 * datatypes, detect holes in the region to be written, optionally
 * pre-read the region (read-modify-write) and exchange the data with
 * nonblocking (or, in atomic mode, blocking) point-to-point calls.
 *
 * write_buf      buffer receiving the data this process will write
 * off, size      file offset and byte count of the region held in write_buf
 * send_size /    bytes to send to / receive from each process in this
 * recv_size      iteration
 * count /        number of others_req entries per process that are
 * start_pos      satisfied in this iteration and the index of the first
 * hole           in: data-sieving hint; out: 1 if a hole was detected in
 *                the received region (or less data is received than
 *                `size`), else 0
 * iter           iteration number, folded into the message tags
 * buf_idx        per-process start index into buf (contiguous buftype)
 *
 * error_code is written only on failure of the read-modify-write read, in
 * which case a new error code is created; on success it is left as set by
 * the caller.
 */
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
                                         char *write_buf,
                                         ADIOI_Flatlist_node *flat_buf,
                                         ADIO_Offset *offset_list,
                                         ADIO_Offset *len_list, int *send_size,
                                         int *recv_size, ADIO_Offset off,
                                         int size, int *count,
                                         int *start_pos, int *partial_recv,
                                         int *sent_to_proc, int nprocs,
                                         int myrank, int buftype_is_contig,
                                         int contig_access_count,
                                         int *striping_info,
                                         ADIOI_Access *others_req,
                                         int *send_buf_idx,
                                         int *curr_to_proc, int *done_to_proc,
                                         int *hole, int iter,
                                         MPI_Aint buftype_extent,
                                         int *buf_idx, int *error_code)
{
    int i, j, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int *srt_len, sum, sum_recv;
    ADIO_Offset *srt_off;
    /* the caller passes the data-sieving hint in through *hole */
    int data_sieving = *hole;
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

    /* create derived datatypes for recv */
    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++)
        if (recv_size[i])
            nprocs_recv++;

    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
                                               sizeof(MPI_Datatype));
    /* +1 to avoid a 0-size malloc */

    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (recv_size[i]) {
            MPI_Type_hindexed(count[i],
                              &(others_req[i].lens[start_pos[i]]),
                              &(others_req[i].mem_ptrs[start_pos[i]]),
                              MPI_BYTE, recv_types + j);
            /* absolute displacements; use MPI_BOTTOM in recv */
            MPI_Type_commit(recv_types + j);
            j++;
        }
    }

    /* To avoid a read-modify-write,
     * check if there are holes in the data to be written.
     * For this, merge the (sorted) offset lists others_req using a heap-merge.
     */
    sum = 0;
    for (i = 0; i < nprocs; i++)
        sum += count[i];
    srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
    srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
    /* +1 to avoid a 0-size malloc */

    ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
                     nprocs, nprocs_recv, sum);

    /* check if there are any holes */
    *hole = 0;
    for (i = 0; i < sum - 1; i++) {
        if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
            *hole = 1;
            break;
        }
    }
    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
     * between aggregation, nominally contiguous regions, and cb_buffer_size
     * should be handled with a read-modify-write (otherwise we will write out
     * more data than we receive from everyone else (inclusive), so override
     * hole detection
     */
    if (*hole == 0) {
        sum_recv = 0;
        for (i = 0; i < nprocs; i++)
            sum_recv += recv_size[i];
        if (size > sum_recv)
            *hole = 1;
    }
    /* check the hint for data sieving */
    if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
        /* --BEGIN ERROR HANDLING-- */
        if (err != MPI_SUCCESS) {
            *error_code = MPIO_Err_create_code(err,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "**ioRMWrdwr", 0);
            /* release the committed receive datatypes as well as the
             * array holding them, so the early return does not leak them */
            for (i = 0; i < nprocs_recv; i++)
                MPI_Type_free(recv_types + i);
            ADIOI_Free(recv_types);
            ADIOI_Free(srt_off);
            ADIOI_Free(srt_len);
            return;
        }
        /* --END ERROR HANDLING-- */
    }
    ADIOI_Free(srt_off);
    ADIOI_Free(srt_len);

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++)
        if (send_size[i])
            nprocs_send++;

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        /* in atomic mode only the sends are nonblocking; the receives are
         * done with blocking MPI_Recv further down */
        requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
                                                sizeof(MPI_Request));
        send_req = requests;
    } else {
        requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
                                                sizeof(MPI_Request));
        /* +1 to avoid a 0-size malloc */

        /* post receives */
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (recv_size[i]) {
                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
                          myrank + i + 100 * iter, fd->comm, requests + j);
                j++;
            }
        }
        send_req = requests + nprocs_recv;
    }

    /* post sends.
     * if buftype_is_contig, data can be directly sent from
     * user buf at location given by buf_idx. else use send_buf.
     */
    if (buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (send_size[i]) {
                ADIOI_Assert(buf_idx[i] != -1);
                MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
                          MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
                          send_req + j);
                j++;
            }
    } else
        if (nprocs_send) {
            /* buftype is not contig */
            send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
            for (i = 0; i < nprocs; i++)
                if (send_size[i])
                    send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);

            ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
                                          len_list, send_size, send_req,
                                          sent_to_proc, nprocs, myrank,
                                          contig_access_count, striping_info,
                                          send_buf_idx, curr_to_proc, done_to_proc,
                                          iter, buftype_extent);
            /* the send is done in ADIOI_Fill_send_buffer */
        }

    /* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity) {
        j = 0;
        for (i = 0; i < nprocs; i++) {
            MPI_Status wkl_status;
            if (recv_size[i]) {
                MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
                         myrank + i + 100 * iter, fd->comm, &wkl_status);
                j++;
            }
        }
    }

    for (i = 0; i < nprocs_recv; i++)
        MPI_Type_free(recv_types + i);
    ADIOI_Free(recv_types);

    /* bug fix from Wei-keng Liao and Kenin Coloma */
    /* +1 to avoid a 0-size malloc */
    if (fd->atomicity) {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
                                               sizeof(MPI_Status));
    } else {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
                                               sizeof(MPI_Status));
    }

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        while (!i)
            MPI_Testall(nprocs_send, send_req, &i, statuses);
    } else {
        while (!i)
            MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
    }
#else
    /* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity)
        MPI_Waitall(nprocs_send, send_req, statuses);
    else
        MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
#endif
    ADIOI_Free(statuses);
    ADIOI_Free(requests);
    if (!buftype_is_contig && nprocs_send) {
        for (i = 0; i < nprocs; i++)
            if (send_size[i])
                ADIOI_Free(send_buf[i]);
        ADIOI_Free(send_buf);
    }
}
/* ADIOI_BUF_INCR: advance the position in the (noncontiguous) user buffer
 * by buf_incr bytes without copying any data.
 *
 * The macro operates directly on variables of the enclosing function:
 * buf_incr, size_in_buf, user_buf_idx, flat_buf_sz, flat_buf_idx,
 * n_buftypes, flat_buf and buftype_extent.  Whenever the current
 * contiguous piece of the flattened datatype is used up it moves to the
 * next piece, wrapping around to piece 0 of the next datatype instance
 * (n_buftypes++) after the last one.  buf_incr is 0 when it finishes.
 * The expansion is a brace block, so call sites use it without a
 * trailing semicolon.
 */
#define ADIOI_BUF_INCR \
{ \
while (buf_incr) { \
size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
user_buf_idx += size_in_buf; \
flat_buf_sz -= size_in_buf; \
if (!flat_buf_sz) { \
if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
else { \
flat_buf_idx = 0; \
n_buftypes++; \
} \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \
(ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \
buf_incr -= size_in_buf; \
} \
}
/* ADIOI_BUF_COPY: copy `size` bytes from the (noncontiguous) user buffer
 * into send_buf[p], starting at send_buf_idx[p], and advance both
 * positions.
 *
 * Like ADIOI_BUF_INCR it works on variables of the enclosing function
 * (size, buf_incr, size_in_buf, user_buf_idx, flat_buf_sz, flat_buf_idx,
 * n_buftypes, flat_buf, buftype_extent, buf, send_buf, send_buf_idx, p)
 * and steps through the pieces of the flattened datatype as they are
 * consumed.  buf_incr is decremented by every byte copied; the trailing
 * ADIOI_BUF_INCR then skips whatever remains of buf_incr without copying.
 */
#define ADIOI_BUF_COPY \
{ \
while (size) { \
size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)((MPIR_Upint)buf + user_buf_idx)); \
ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
memcpy(&(send_buf[p][send_buf_idx[p]]), \
((char *) buf) + user_buf_idx, size_in_buf); \
send_buf_idx[p] += size_in_buf; \
user_buf_idx += size_in_buf; \
flat_buf_sz -= size_in_buf; \
if (!flat_buf_sz) { \
if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
else { \
flat_buf_idx = 0; \
n_buftypes++; \
} \
user_buf_idx = flat_buf->indices[flat_buf_idx] + \
(ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
} \
size -= size_in_buf; \
buf_incr -= size_in_buf; \
} \
ADIOI_BUF_INCR \
}
/* Pack the data of this iteration from the noncontiguous user buffer into
 * the per-destination buffers send_buf[p] and post an MPI_Isend to each
 * destination as soon as its buffer is full.
 *
 * The function walks over all of this process's file accesses
 * (offset_list / len_list), asks ADIOI_LUSTRE_Calc_aggregator() which
 * process each piece belongs to, and either copies the piece
 * (ADIOI_BUF_COPY) or merely skips over it in the user buffer
 * (ADIOI_BUF_INCR) when it was already sent in an earlier iteration or
 * does not fit in this iteration's send_size[p].
 *
 * requests      array receiving one request per posted MPI_Isend
 * sent_to_proc  in: bytes sent to each process in previous iterations;
 *               out: updated for every process with send_size[i] != 0
 * send_buf_idx, curr_to_proc, done_to_proc
 *               scratch arrays of nprocs ints, (re)initialised here
 * iter          iteration number, folded into the message tag
 */
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
ADIOI_Flatlist_node *flat_buf,
char **send_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
MPI_Request *requests,
int *sent_to_proc, int nprocs,
int myrank,
int contig_access_count,
int *striping_info,
int *send_buf_idx,
int *curr_to_proc,
int *done_to_proc, int iter,
MPI_Aint buftype_extent)
{
/* this function is only called if buftype is not contig */
/* NOTE: most of the locals below are referenced by name inside the
 * ADIOI_BUF_INCR / ADIOI_BUF_COPY macros and must keep their names. */
int i, p, flat_buf_idx, size;
int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
ADIO_Offset off, len, rem_len, user_buf_idx;
/* curr_to_proc[p] = amount of data sent to proc. p that has already
* been accounted for so far
* done_to_proc[p] = amount of data already sent to proc. p in
* previous iterations
* user_buf_idx = current location in user buffer
* send_buf_idx[p] = current location in send_buf of proc. p
*/
for (i = 0; i < nprocs; i++) {
send_buf_idx[i] = curr_to_proc[i] = 0;
done_to_proc[i] = sent_to_proc[i];
}
/* jj counts the sends posted so far and indexes into requests */
jj = 0;
/* start at the first contiguous piece of the first datatype instance */
user_buf_idx = flat_buf->indices[0];
flat_buf_idx = 0;
n_buftypes = 0;
flat_buf_sz = flat_buf->blocklens[0];
/* flat_buf_idx = current index into flattened buftype
* flat_buf_sz = size of current contiguous component in flattened buf
*/
for (i = 0; i < contig_access_count; i++) {
off = offset_list[i];
rem_len = (ADIO_Offset) len_list[i];
/* this request may span more than one process */
while (rem_len != 0) {
len = rem_len;
/* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
* longer than the single region that processor "p" is responsible
* for.
*/
p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);
/* is there still room in this iteration's send buffer for p? */
if (send_buf_idx[p] < send_size[p]) {
/* does this piece extend past what was sent to p earlier? */
if (curr_to_proc[p] + len > done_to_proc[p]) {
if (done_to_proc[p] > curr_to_proc[p]) {
/* the piece was partly sent in a previous iteration:
 * skip the part already sent, then copy the rest (as far
 * as it fits into the send buffer) */
size = (int) ADIOI_MIN(curr_to_proc[p] + len -
done_to_proc[p],
send_size[p] -
send_buf_idx[p]);
buf_incr = done_to_proc[p] - curr_to_proc[p];
ADIOI_BUF_INCR
ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
buf_incr = (int) (curr_to_proc[p] + len -
done_to_proc[p]);
ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
curr_to_proc[p] = done_to_proc[p] + size;
ADIOI_BUF_COPY
} else {
/* nothing of this piece was sent before: copy as much as
 * fits into the send buffer */
size = (int) ADIOI_MIN(len, send_size[p] -
send_buf_idx[p]);
buf_incr = (int) len;
ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
curr_to_proc[p] += size;
ADIOI_BUF_COPY
}
/* send buffer for p is complete: post the send */
if (send_buf_idx[p] == send_size[p]) {
MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
myrank + p + 100 * iter, fd->comm,
requests + jj);
jj++;
}
} else {
/* the whole piece was sent in a previous iteration: just
 * account for it and skip it in the user buffer */
ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
curr_to_proc[p] += (int) len;
buf_incr = (int) len;
ADIOI_BUF_INCR
}
} else {
/* send buffer for p is already full (or nothing is sent to p in
 * this iteration): skip the piece in the user buffer */
buf_incr = (int) len;
ADIOI_BUF_INCR
}
off += len;
rem_len -= len;
}
}
/* remember how much has been handed to each destination so far */
for (i = 0; i < nprocs; i++)
if (send_size[i])
sent_to_proc[i] = curr_to_proc[i];
}

Просмотреть файл

@ -0,0 +1,530 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
/* ADIOI_BUFFERED_WRITE: write the request (req_off, req_len) taken from
 * buf + userbuf_off through a stripe-aligned write buffer, using
 * read-modify-write.
 *
 * The macro works on variables of the enclosing function: fd, buf,
 * writebuf, writebuf_off, writebuf_len, write_sz, req_off, req_len,
 * userbuf_off, end_offset, stripe_size, status1, myname and error_code.
 *
 * Steps:
 *  - If req_off lies beyond the window currently held in writebuf, a
 *    non-empty window is written out (and unlocked when not in atomic
 *    mode).  A new window then starts at req_off and extends to the end
 *    of that stripe or to end_offset, whichever comes first; it is
 *    write-locked (when not in atomic mode) and read from the file.
 *  - As much of the request as fits is copied into the window.
 *  - While the request is not finished, the window is written out, moved
 *    to the following stripe, read from the file, and filled with the
 *    next part of the request.
 *
 * On any I/O error an error code is created, writebuf is freed and the
 * macro executes `return` in the enclosing function.
 */
#define ADIOI_BUFFERED_WRITE \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
if (writebuf_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, \
&status1, error_code); \
if (!(fd->atomicity)) \
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
} \
writebuf_off = req_off; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
if (!(fd->atomicity)) \
ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, \
writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowsrc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
} \
write_sz = (unsigned) (ADIOI_MIN(req_len, \
writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == \
ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf + req_off - writebuf_off, (char *)buf +userbuf_off, write_sz); \
while (write_sz != req_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
if (!(fd->atomicity)) \
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
if (!(fd->atomicity)) \
ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, \
writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowsrc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
} \
}
/* ADIOI_BUFFERED_WRITE_WITHOUT_READ: write the request (req_off, req_len)
 * taken from buf + userbuf_off through a stripe-aligned write buffer.
 *
 * This macro is used when the filetype is contiguous and the buftype is
 * not.  It does not do a read-modify-write and does not lock.
 *
 * It works on variables of the enclosing function: fd, buf, writebuf,
 * writebuf_off, writebuf_len, write_sz, req_off, req_len, userbuf_off,
 * end_offset, stripe_size, status1, myname and error_code.
 *
 * If req_off lies beyond the window currently held in writebuf, the window
 * is written out and a new one starts at req_off, extending to the end of
 * that stripe or to end_offset, whichever comes first.  The request is
 * then copied into the window; whenever the window is full it is written
 * out and moved to the following stripe.  On a write error an error code
 * is created, writebuf is freed and the macro executes `return` in the
 * enclosing function.
 *
 * NOTE(review): unlike ADIOI_BUFFERED_WRITE, the first flush is not
 * guarded by `if (writebuf_len)`, so ADIO_WriteContig is also called when
 * the window is still empty (writebuf_len == 0) -- confirm that this
 * zero-length write is intended.
 */
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, \
error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
writebuf_off = req_off; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
} \
write_sz = (unsigned) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf + req_off - writebuf_off, \
(char *)buf + userbuf_off, write_sz); \
while (write_sz != req_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
} \
}
/*
 * ADIOI_LUSTRE_WriteStrided - independent write of a (possibly)
 * noncontiguous request for the Lustre driver.
 *
 * Parameters:
 *   fd            - open file; its hints, file view (disp, etype_size,
 *                   filetype), atomicity flag and individual file pointer
 *                   are read, and fp_ind / fp_sys_posn are updated
 *   buf           - user buffer holding the data to write
 *   count         - number of 'datatype' elements in buf
 *   datatype      - memory datatype describing the layout of buf
 *   file_ptr_type - ADIO_INDIVIDUAL to start at fd->fp_ind; otherwise
 *                   'offset' is used
 *   offset        - starting offset, in etypes relative to the filetype
 *   status        - filled with the byte count (bufsize) when
 *                   HAVE_STATUS_SET_BYTES is defined
 *   error_code    - MPI_SUCCESS, or the error reported by the underlying
 *                   contiguous write
 *
 * Behaviour:
 *   - If data sieving on writes is disabled by hint, the work is handed to
 *     ADIOI_GEN_WriteStrided_naive.
 *   - A filetype of size zero results in an immediate successful return.
 *   - Otherwise data is staged through 'writebuf' (at most one stripe in
 *     size) and flushed with ADIO_WriteContig, so that individual writes
 *     do not cross a stripe_size boundary.
 *
 * The ADIOI_BUFFERED_WRITE and ADIOI_BUFFERED_WRITE_WITHOUT_READ macros
 * expanded below operate directly on this function's local variables
 * (writebuf, writebuf_off, writebuf_len, write_sz, req_off, req_len,
 * userbuf_off, end_offset, stripe_size, status1, myname), so those names
 * must not be changed.
 */
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status * status,
                               int *error_code)
{
    /* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, sum, size_in_filetype;
    int i, j, k, st_index=0;
    int n_etypes_in_filetype;
    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
    char *writebuf;
    unsigned bufsize, writebuf_len, write_sz;
    ADIO_Status status1;
    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
    int stripe_size;
    static char myname[] = "ADIOI_LUSTRE_WriteStrided";

    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
        /* if user has disabled data sieving on writes, use naive
         * approach instead.
         */
        ADIOI_GEN_WriteStrided_naive(fd,
                                     buf,
                                     count,
                                     datatype,
                                     file_ptr_type,
                                     offset, status, error_code);
        return;
    }

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size(fd->filetype, &filetype_size);
    if (!filetype_size) {
        /* an empty filetype exposes no bytes: nothing to write */
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    /* the request size must be representable in the unsigned 'bufsize' */
    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get striping info */
    stripe_size = fd->hints->striping_unit;

    /* Different buftype to different filetype */
    if (!buftype_is_contig && filetype_is_contig) {
        /* noncontiguous in memory, contiguous in file. */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype)
            flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
            fd->disp + (ADIO_Offset)etype_size * offset;

        start_off = off;
        end_offset = start_off + bufsize - 1;
        /* write stripe size buffer each time */
        writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
        writebuf_off = 0;
        writebuf_len = 0;

        /* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);

        /* walk every contiguous piece of every buftype instance and
         * append it to the staging buffer */
        for (j = 0; j < count; j++) {
            for (i = 0; i < flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j * (ADIO_Offset)buftype_extent +
                    flat_buf->indices[i];
                req_off = off;
                req_len = flat_buf->blocklens[i];
                ADIOI_BUFFERED_WRITE_WITHOUT_READ
                off += flat_buf->blocklens[i];
            }
        }

        /* write the buffer out finally */
        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
                         error_code);

        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
        if (*error_code != MPI_SUCCESS) {
            ADIOI_Free(writebuf);
            return;
        }
        ADIOI_Free(writebuf);
        if (file_ptr_type == ADIO_INDIVIDUAL)
            fd->fp_ind = off;
    } else {
        /* noncontiguous in file */
        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype)
            flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset = fd->fp_ind - disp;
            n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* fwr_size is from offset to the end of block i */
                if (dist == 0) {
                    /* offset sits exactly at the end of block i:
                     * start at the beginning of the following block */
                    i++;
                    offset = flat_file->indices[i];
                    fwr_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    fwr_size = dist;
                    break;
                }
            }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            /* locate the filetype block that contains size_in_filetype */
            sum = 0;
            for (i = 0; i < flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
                abs_off_in_filetype;
        }

        start_off = offset;

        /* Wei-keng Liao: write request is within a single flat_file
         * contig block */
        /* this could happen, for example, with subarray types that are
         * actually fairly contiguous */
        if (buftype_is_contig && bufsize <= fwr_size) {
            req_off = start_off;
            req_len = bufsize;
            end_offset = start_off + bufsize - 1;
            writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
            memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
            writebuf_off = 0;
            writebuf_len = 0;
            userbuf_off = 0;
            ADIOI_BUFFERED_WRITE_WITHOUT_READ
            /* write the buffer out finally */
            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
                             error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte
                 * that can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == fwr_size) {
                    /* the block was consumed completely: advance to the
                     * next block with a nonzero length */
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                        + (ADIO_Offset)n_filetypes*filetype_extent;
                }
            }
            fd->fp_sys_posn = -1;  /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            ADIOI_Free(writebuf);
            return;
        }

        /* Calculate end_offset, the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/

        st_fwr_size = fwr_size;
        st_n_filetypes = n_filetypes;
        i_offset = 0;
        j = st_index;
        off = offset;
        fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += fwr_size;
            end_offset = off + fwr_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }

            off = disp + flat_file->indices[j] +
                n_filetypes*(ADIO_Offset)filetype_extent;
            fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
        }

        /* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        writebuf_off = 0;
        writebuf_len = 0;
        writebuf = (char *) ADIOI_Malloc(stripe_size);
        memset(writebuf, -1, stripe_size);

        if (buftype_is_contig && !filetype_is_contig) {
            /* contiguous in memory, noncontiguous in file. should be the most
               common case. */

            i_offset = 0;
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
            while (i_offset < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       fwr_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/

                    req_off = off;
                    req_len = fwr_size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_WRITE
                }
                i_offset += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] +
                    n_filetypes*(ADIO_Offset)filetype_extent)
                    off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
                    off = disp + flat_file->indices[j] +
                        n_filetypes*(ADIO_Offset)filetype_extent;
                    fwr_size = ADIOI_MIN(flat_file->blocklens[j],
                                         bufsize-i_offset);
                }
            }
        }
        else {
            /* noncontiguous in memory as well as in file */

            ADIOI_Flatten_datatype(datatype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != datatype) flat_buf = flat_buf->next;

            k = num = buf_count = 0;
            i_offset = flat_buf->indices[0];
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = st_fwr_size;
            bwr_size = flat_buf->blocklens[0];

            while (num < bufsize) {
                /* each step moves the largest piece that is contiguous
                 * both in the file and in memory */
                size = ADIOI_MIN(fwr_size, bwr_size);
                if (size) {
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */

                    req_off = off;
                    req_len = size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_WRITE
                }

                new_fwr_size = fwr_size;
                new_bwr_size = bwr_size;

                if (size == fwr_size) {
                    /* reached end of contiguous block in file */
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }

                    off = disp + flat_file->indices[j] +
                        n_filetypes*(ADIO_Offset)filetype_extent;

                    new_fwr_size = flat_file->blocklens[j];
                    if (size != bwr_size) {
                        i_offset += size;
                        new_bwr_size -= size;
                    }
                }

                if (size == bwr_size) {
                    /* reached end of contiguous block in memory */

                    k = (k + 1)%flat_buf->count;
                    buf_count++;
                    i_offset = (ADIO_Offset)buftype_extent *
                        (ADIO_Offset)(buf_count/flat_buf->count) +
                        flat_buf->indices[k];
                    new_bwr_size = flat_buf->blocklens[k];
                    if (size != fwr_size) {
                        off += size;
                        new_fwr_size -= size;
                    }
                }
                num += size;
                fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
            }
        }

        /* write the buffer out finally */
        if (writebuf_len) {
            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET,
                             writebuf_off, &status1, error_code);
            /* NOTE(review): presumably releases a per-buffer lock taken
             * inside ADIOI_BUFFERED_WRITE in non-atomic mode - confirm
             * against the macro definition. */
            if (!(fd->atomicity))
                ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
            if (*error_code != MPI_SUCCESS) {
                /* Do not leak the staging buffer or leave the atomic-mode
                 * region lock held when the final flush fails. */
                if (fd->atomicity)
                    ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
                ADIOI_Free(writebuf);
                return;
            }
        }
        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        ADIOI_Free(writebuf);

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    }

    fd->fp_sys_posn = -1;  /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig)
        ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -24,6 +24,7 @@ libadio_nfs_la_SOURCES = \
ad_nfs.h \ ad_nfs.h \
ad_nfs_done.c \ ad_nfs_done.c \
ad_nfs_fcntl.c \ ad_nfs_fcntl.c \
ad_nfs_features.c \
ad_nfs_getsh.c \ ad_nfs_getsh.c \
ad_nfs_hints.c \ ad_nfs_hints.c \
ad_nfs_iread.c \ ad_nfs_iread.c \

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_NFS_operations = { struct ADIOI_Fns_struct ADIO_NFS_operations = {
ADIOI_NFS_Open, /* Open */ ADIOI_NFS_Open, /* Open */
ADIOI_FAILSAFE_OpenColl, /* OpenColl */
ADIOI_NFS_ReadContig, /* ReadContig */ ADIOI_NFS_ReadContig, /* ReadContig */
ADIOI_NFS_WriteContig, /* WriteContig */ ADIOI_NFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -35,4 +36,5 @@ struct ADIOI_Fns_struct ADIO_NFS_operations = {
ADIOI_GEN_Flush, /* Flush */ ADIOI_GEN_Flush, /* Flush */
ADIOI_NFS_Resize, /* Resize */ ADIOI_NFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_NFS_Feature, /* Features */
}; };

Просмотреть файл

@ -78,5 +78,6 @@ void ADIOI_NFS_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp,
int *error_code); int *error_code);
void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code); void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
int ADIOI_NFS_Feature(ADIO_File fd, int feature_flag);
#endif #endif

Просмотреть файл

@ -0,0 +1,16 @@
#include "adio.h"
#include "ad_nfs.h"
/*
 * ADIOI_NFS_Feature - report whether the NFS driver supports a feature.
 *
 * fd   : file handle; not consulted, the answer does not depend on it
 * flag : feature identifier being queried
 *
 * Returns 1 when flag is ADIO_SHARED_FP, ADIO_LOCKS, ADIO_SEQUENTIAL or
 * ADIO_DATA_SIEVING_WRITES, and 0 for every other value.
 */
int ADIOI_NFS_Feature(ADIO_File fd, int flag)
{
    int supported = (flag == ADIO_SHARED_FP ||
                     flag == ADIO_LOCKS ||
                     flag == ADIO_SEQUENTIAL ||
                     flag == ADIO_DATA_SIEVING_WRITES);

    /* ADIO_SCALABLE_OPEN is intentionally reported as unsupported, just
     * like any flag this driver does not recognise. */
    return supported;
}

Просмотреть файл

@ -59,6 +59,7 @@ int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
struct aiocb *aiocbp; struct aiocb *aiocbp;
ADIOI_AIO_Request *aio_req; ADIOI_AIO_Request *aio_req;
MPI_Status status;
fd_sys = fd->fd_sys; fd_sys = fd->fd_sys;
@ -108,7 +109,7 @@ int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
/* exceeded the max. no. of outstanding requests. /* exceeded the max. no. of outstanding requests.
complete all previous async. requests and try again. */ complete all previous async. requests and try again. */
ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, NULL, &error_code); offset, &status, &error_code);
MPIO_Completed_request_create(&fd, len, &error_code, request); MPIO_Completed_request_create(&fd, len, &error_code, request);
return 0; return 0;
} else { } else {

Просмотреть файл

@ -177,7 +177,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
ADIO_Offset userbuf_off; ADIO_Offset userbuf_off;
ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off; ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
char *readbuf, *tmp_buf, *value; char *readbuf, *tmp_buf, *value;
int flag, st_frd_size, st_n_filetypes, readbuf_len; int st_frd_size, st_n_filetypes, readbuf_len;
int new_brd_size, new_frd_size, err_flag=0, info_flag, max_bufsize; int new_brd_size, new_frd_size, err_flag=0, info_flag, max_bufsize;
static char myname[] = "ADIOI_NFS_READSTRIDED"; static char myname[] = "ADIOI_NFS_READSTRIDED";
@ -201,7 +201,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
/* get max_bufsize from the info object. */ /* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag); &info_flag);
max_bufsize = atoi(value); max_bufsize = atoi(value);
ADIOI_Free(value); ADIOI_Free(value);
@ -278,24 +278,31 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
disp = fd->disp; disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) { if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */ /* Wei-keng reworked type processing to be a bit more efficient */
n_filetypes = -1; offset = fd->fp_ind - disp;
flag = 0; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
while (!flag) { offset -= (ADIO_Offset)n_filetypes * filetype_extent;
n_filetypes++; /* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) { for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] + ADIO_Offset dist;
(ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] if (flat_file->blocklens[i] == 0) continue;
>= offset) { dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
st_index = i; /* frd_size is from offset to the end of block i */
frd_size = (int) (disp + flat_file->indices[i] + if (dist == 0) {
(ADIO_Offset) n_filetypes*filetype_extent i++;
+ flat_file->blocklens[i] - offset); offset = flat_file->indices[i];
flag = 1; frd_size = flat_file->blocklens[i];
break;
}
if (dist > 0 ) {
frd_size = dist;
break; break;
} }
} }
} st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
} }
else { else {
n_etypes_in_filetype = filetype_size/etype_size; n_etypes_in_filetype = filetype_size/etype_size;
@ -316,11 +323,42 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
} }
/* abs. offset in bytes in the file */ /* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
} }
start_off = offset; start_off = offset;
/* Wei-keng Liao: read request is within a single flat_file contig
* block e.g. with subarray types that actually describe the whole
* array */
if (buftype_is_contig && bufsize <= frd_size) {
ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte that
* can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == frd_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed. /* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
@ -333,11 +371,11 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
while (i < bufsize) { while (i < bufsize) {
i += frd_size; i += frd_size;
end_offset = off + frd_size - 1; end_offset = off + frd_size - 1;
j = (j+1) % flat_file->count;
if (j < (flat_file->count - 1)) j++; n_filetypes += (j == 0) ? 1 : 0;
else { while (flat_file->blocklens[j]==0) {
j = 0; j = (j+1) % flat_file->count;
n_filetypes++; n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent;
@ -402,10 +440,11 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
/* did not reach end of contiguous block in filetype. /* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by frd_size. */ no more I/O needed. off is incremented by frd_size. */
else { else {
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent; (ADIO_Offset) n_filetypes*filetype_extent;
@ -445,12 +484,12 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
if (size == frd_size) { if (size == frd_size) {
/* reached end of contiguous block in file */ /* reached end of contiguous block in file */
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent; (ADIO_Offset) n_filetypes*filetype_extent;

Просмотреть файл

@ -10,120 +10,6 @@ void ADIOI_NFS_ReadComplete(ADIO_Request *request, ADIO_Status *status,
int *error_code) int *error_code)
{ {
return; return;
#if 0
#ifdef ROMIO_HAVE_WORKING_AIO
int err;
static char myname[] = "ADIOI_NFS_READCOMPLETE";
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_HANDLE
struct aiocb *tmp1;
#endif
#endif
if (*request == ADIO_REQUEST_NULL) {
*error_code = MPI_SUCCESS;
return;
}
#ifdef ROMIO_HAVE_AIO_SUSPEND_TWO_ARGS
/* old IBM */
if ((*request)->queued) {
do {
#if !defined(_AIO_AIX_SOURCE) && !defined(_NO_PROTO)
err = aio_suspend((*request)->handle,1,NULL);
#else
err = aio_suspend(1, (struct aiocb **) &((*request)->handle));
#endif
} while ((err == -1) && (errno == EINTR));
tmp1 = (struct aiocb *) (*request)->handle;
if (err != -1) {
err = aio_return(tmp1->aio_handle);
(*request)->nbytes = err;
errno = aio_error(tmp1->aio_handle);
}
else (*request)->nbytes = -1;
/* on DEC, it is required to call aio_return to dequeue the request.
IBM man pages don't indicate what function to use for dequeue.
I'm assuming it is aio_return! */
if (err == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}
else *error_code = MPI_SUCCESS; /* if ( (*request)->queued ) */
#ifdef HAVE_STATUS_SET_BYTES
if ((*request)->nbytes != -1)
MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif
#elif defined(ROMIO_HAVE_WORKING_AIO)
/* all other aio types */
if ((*request)->queued) {
do {
err = aio_suspend((const struct aiocb **) &((*request)->handle), 1, 0);
} while ((err == -1) && (errno == EINTR));
if (err != -1) {
err = aio_return((struct aiocb *) (*request)->handle);
(*request)->nbytes = err;
errno = aio_error((struct aiocb *) (*request)->handle);
}
else (*request)->nbytes = -1;
if (err == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}
else *error_code = MPI_SUCCESS; /* if ((*request)->queued) ... */
#ifdef HAVE_STATUS_SET_BYTES
if ((*request)->nbytes != -1)
MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif
#endif
#ifdef ROMIO_HAVE_WORKING_AIO
if ((*request)->queued != -1) {
/* queued = -1 is an internal hack used when the request must
be completed, but the request object should not be
freed. This is used in ADIOI_Complete_async, because the user
will call MPI_Wait later, which would require status to
be filled. Ugly but works. queued = -1 should be used only
in ADIOI_Complete_async.
This should not affect the user in any way. */
/* if request is still queued in the system, it is also there
on ADIOI_Async_list. Delete it from there. */
if ((*request)->queued) ADIOI_Del_req_from_list(request);
(*request)->fd->async_count--;
if ((*request)->handle) ADIOI_Free((*request)->handle);
ADIOI_Free_request((ADIOI_Req_node *) (*request));
*request = ADIO_REQUEST_NULL;
}
#else
/* no aio */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif
(*request)->fd->async_count--;
ADIOI_Free_request((ADIOI_Req_node *) (*request));
*request = ADIO_REQUEST_NULL;
*error_code = MPI_SUCCESS;
#endif
#endif
} }

Просмотреть файл

@ -281,7 +281,7 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
ADIO_Offset userbuf_off; ADIO_Offset userbuf_off;
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
char *writebuf, *value; char *writebuf, *value;
int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz; int st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize; int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize;
static char myname[] = "ADIOI_NFS_WRITESTRIDED"; static char myname[] = "ADIOI_NFS_WRITESTRIDED";
@ -304,7 +304,7 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
/* get max_bufsize from the info object. */ /* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag); &info_flag);
max_bufsize = atoi(value); max_bufsize = atoi(value);
ADIOI_Free(value); ADIOI_Free(value);
@ -381,24 +381,31 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
disp = fd->disp; disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) { if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */ /* Wei-keng reworked type processing to be a bit more efficient */
n_filetypes = -1; offset = fd->fp_ind - disp;
flag = 0; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
while (!flag) { offset -= (ADIO_Offset)n_filetypes * filetype_extent;
n_filetypes++; /* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) { for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] + ADIO_Offset dist;
(ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] if (flat_file->blocklens[i] == 0) continue;
>= offset) { dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
st_index = i; /* fwr_size is from offset to the end of block i */
fwr_size = (int) (disp + flat_file->indices[i] + if (dist == 0) {
(ADIO_Offset) n_filetypes*filetype_extent i++;
+ flat_file->blocklens[i] - offset); offset = flat_file->indices[i];
flag = 1; fwr_size = flat_file->blocklens[i];
break;
}
if (dist > 0) {
fwr_size = dist;
break; break;
} }
} }
} st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
} }
else { else {
n_etypes_in_filetype = filetype_size/etype_size; n_etypes_in_filetype = filetype_size/etype_size;
@ -419,10 +426,40 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
} }
/* abs. offset in bytes in the file */ /* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
} }
start_off = offset; start_off = offset;
/* Wei-keng Liao:write request is within single flat_file contig block*/
/* this could happen, for example, with subarray types that are
* actually fairly contiguous */
if (buftype_is_contig && bufsize <= fwr_size) {
ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte
* that can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == fwr_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ (ADIO_Offset)n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed. /* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
@ -436,14 +473,15 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
while (i < bufsize) { while (i < bufsize) {
i += fwr_size; i += fwr_size;
end_offset = off + fwr_size - 1; end_offset = off + fwr_size - 1;
j = (j+1) % flat_file->count;
if (j < (flat_file->count - 1)) j++; n_filetypes += (j == 0) ? 1 : 0;
else { while (flat_file->blocklens[j]==0) {
j = 0; j = (j+1) % flat_file->count;
n_filetypes++; n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
} }
@ -509,10 +547,11 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
/* did not reach end of contiguous block in filetype. /* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by fwr_size. */ no more I/O needed. off is incremented by fwr_size. */
else { else {
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent; (ADIO_Offset) n_filetypes*filetype_extent;
@ -552,10 +591,11 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count,
if (size == fwr_size) { if (size == fwr_size) {
/* reached end of contiguous block in file */ /* reached end of contiguous block in file */
if (j < (flat_file->count - 1)) j++; j = (j+1) % flat_file->count;
else { n_filetypes += (j == 0) ? 1 : 0;
j = 0; while (flat_file->blocklens[j]==0) {
n_filetypes++; j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
} }
off = disp + flat_file->indices[j] + off = disp + flat_file->indices[j] +

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_NTFS_operations = { struct ADIOI_Fns_struct ADIO_NTFS_operations = {
ADIOI_NTFS_Open, /* Open */ ADIOI_NTFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_NTFS_ReadContig, /* ReadContig */ ADIOI_NTFS_ReadContig, /* ReadContig */
ADIOI_NTFS_WriteContig, /* WriteContig */ ADIOI_NTFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -33,4 +34,5 @@ struct ADIOI_Fns_struct ADIO_NTFS_operations = {
ADIOI_NTFS_Flush, /* Flush */ ADIOI_NTFS_Flush, /* Flush */
ADIOI_NTFS_Resize, /* Resize */ ADIOI_NTFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature /* Features */
}; };

Просмотреть файл

@ -60,7 +60,6 @@ int ADIOI_NTFS_aio_poll_fn(void *extra_state, MPI_Status *status)
/* TODO: unsure how to handle this */ /* TODO: unsure how to handle this */
} }
}else{ }else{
MPIR_Nest_incr();
mpi_errno = MPI_Grequest_complete(aio_req->req); mpi_errno = MPI_Grequest_complete(aio_req->req);
if (mpi_errno != MPI_SUCCESS) { if (mpi_errno != MPI_SUCCESS) {
mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, mpi_errno = MPIO_Err_create_code(MPI_SUCCESS,
@ -69,7 +68,6 @@ int ADIOI_NTFS_aio_poll_fn(void *extra_state, MPI_Status *status)
MPI_ERR_IO, "**mpi_grequest_complete", MPI_ERR_IO, "**mpi_grequest_complete",
0); 0);
} }
MPIR_Nest_decr();
} }
return mpi_errno; return mpi_errno;
} }
@ -111,7 +109,6 @@ int ADIOI_NTFS_aio_wait_fn(int count, void **array_of_states,
aio_reqlist[retObject]->lpOvl, &(aio_reqlist[retObject]->nbytes), aio_reqlist[retObject]->lpOvl, &(aio_reqlist[retObject]->nbytes),
FALSE)){ FALSE)){
/* XXX: mark completed requests as 'done'*/ /* XXX: mark completed requests as 'done'*/
MPIR_Nest_incr();
mpi_errno = MPI_Grequest_complete(aio_reqlist[retObject]->req); mpi_errno = MPI_Grequest_complete(aio_reqlist[retObject]->req);
if (mpi_errno != MPI_SUCCESS) { if (mpi_errno != MPI_SUCCESS) {
mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, mpi_errno = MPIO_Err_create_code(MPI_SUCCESS,
@ -120,7 +117,6 @@ int ADIOI_NTFS_aio_wait_fn(int count, void **array_of_states,
MPI_ERR_IO, "**mpi_grequest_complete", MPI_ERR_IO, "**mpi_grequest_complete",
0); 0);
} }
MPIR_Nest_decr();
}else{ }else{
if(GetLastError() == ERROR_IO_INCOMPLETE){ if(GetLastError() == ERROR_IO_INCOMPLETE){
/* IO in progress */ /* IO in progress */
@ -146,7 +142,6 @@ int ADIOI_NTFS_aio_query_fn(void *extra_state, MPI_Status *status)
MPI_Status_set_elements(status, MPI_BYTE, aio_req->nbytes); MPI_Status_set_elements(status, MPI_BYTE, aio_req->nbytes);
/* do i need to nest_incr/nest_decr here? */
/* can never cancel so always true */ /* can never cancel so always true */
MPI_Status_set_cancelled(status, 0); MPI_Status_set_cancelled(status, 0);

Просмотреть файл

@ -13,6 +13,7 @@
struct ADIOI_Fns_struct ADIO_PANFS_operations = { struct ADIOI_Fns_struct ADIO_PANFS_operations = {
ADIOI_PANFS_Open, /* Open */ ADIOI_PANFS_Open, /* Open */
ADIOI_GEN_OpenColl,
ADIOI_PANFS_ReadContig, /* ReadContig */ ADIOI_PANFS_ReadContig, /* ReadContig */
ADIOI_PANFS_WriteContig, /* WriteContig */ ADIOI_PANFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -39,4 +40,5 @@ struct ADIOI_Fns_struct ADIO_PANFS_operations = {
ADIOI_GEN_Flush, /* Flush */ ADIOI_GEN_Flush, /* Flush */
ADIOI_PANFS_Resize, /* Resize */ ADIOI_PANFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature,
}; };

Просмотреть файл

@ -36,7 +36,7 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "panfs_concurrent_write", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
concurrent_write = strtoul(value,NULL,10); concurrent_write = strtoul(value,NULL,10);
@ -46,10 +46,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_concurrent_write\" must be the same on all processes\n"); FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_concurrent_write\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
MPI_Info_set(fd->info, "panfs_concurrent_write", value); ADIOI_Info_set(fd->info, "panfs_concurrent_write", value);
} }
MPI_Info_get(users_info, "panfs_layout_type", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "panfs_layout_type", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_type = strtoul(value,NULL,10); layout_type = strtoul(value,NULL,10);
@ -59,10 +59,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_type\" must be the same on all processes\n"); FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_type\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
MPI_Info_set(fd->info, "panfs_layout_type", value); ADIOI_Info_set(fd->info, "panfs_layout_type", value);
} }
MPI_Info_get(users_info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_stripe_unit = strtoul(value,NULL,10); layout_stripe_unit = strtoul(value,NULL,10);
@ -72,10 +72,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_stripe_unit\" must be the same on all processes\n"); FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_stripe_unit\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
MPI_Info_set(fd->info, "panfs_layout_stripe_unit", value); ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", value);
} }
MPI_Info_get(users_info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) { if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) {
layout_parity_stripe_width = strtoul(value,NULL,10); layout_parity_stripe_width = strtoul(value,NULL,10);
@ -85,10 +85,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_width\" must be the same on all processes\n"); FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_width\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
MPI_Info_set(fd->info, "panfs_layout_parity_stripe_width", value); ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", value);
} }
MPI_Info_get(users_info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) { if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) {
layout_parity_stripe_depth = strtoul(value,NULL,10); layout_parity_stripe_depth = strtoul(value,NULL,10);
@ -98,10 +98,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_depth\" must be the same on all processes\n"); FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_depth\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
MPI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", value); ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", value);
} }
MPI_Info_get(users_info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_total_num_comps = strtoul(value,NULL,10); layout_total_num_comps = strtoul(value,NULL,10);
@ -111,10 +111,10 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_total_num_comps\" must be the same on all processes\n"); FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_total_num_comps\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
MPI_Info_set(fd->info, "panfs_layout_total_num_comps", value); ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", value);
} }
MPI_Info_get(users_info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE || layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) { if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE || layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
layout_visit_policy = strtoul(value,NULL,10); layout_visit_policy = strtoul(value,NULL,10);
@ -124,7 +124,7 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_visit_policy\" must be the same on all processes\n"); FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_visit_policy\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
MPI_Info_set(fd->info, "panfs_layout_visit_policy", value); ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", value);
} }
ADIOI_Free(value); ADIOI_Free(value);

Просмотреть файл

@ -39,32 +39,32 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
*error_code = MPI_SUCCESS; *error_code = MPI_SUCCESS;
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "panfs_layout_type", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "panfs_layout_type", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_type = strtoul(value,NULL,10); layout_type = strtoul(value,NULL,10);
} }
MPI_Info_get(fd->info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_stripe_unit = strtoul(value,NULL,10); layout_stripe_unit = strtoul(value,NULL,10);
} }
MPI_Info_get(fd->info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_total_num_comps = strtoul(value,NULL,10); layout_total_num_comps = strtoul(value,NULL,10);
} }
MPI_Info_get(fd->info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_parity_stripe_width = strtoul(value,NULL,10); layout_parity_stripe_width = strtoul(value,NULL,10);
} }
MPI_Info_get(fd->info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_parity_stripe_depth = strtoul(value,NULL,10); layout_parity_stripe_depth = strtoul(value,NULL,10);
} }
MPI_Info_get(fd->info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
layout_visit_policy = strtoul(value,NULL,10); layout_visit_policy = strtoul(value,NULL,10);
@ -266,7 +266,7 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
amode = amode | O_EXCL; amode = amode | O_EXCL;
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "panfs_concurrent_write", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
unsigned long int concurrent_write = strtoul(value,NULL,10); unsigned long int concurrent_write = strtoul(value,NULL,10);
@ -291,41 +291,41 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
if (rc < 0) if (rc < 0)
{ {
/* Error - set layout type to unknown */ /* Error - set layout type to unknown */
MPI_Info_set(fd->info, "panfs_layout_type", "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID"); ADIOI_Info_set(fd->info, "panfs_layout_type", "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
} }
else else
{ {
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.agg_type); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.agg_type);
MPI_Info_set(fd->info, "panfs_layout_type", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_type", temp_buffer);
if (file_query_args.layout.layout_is_valid == 1) if (file_query_args.layout.layout_is_valid == 1)
{ {
switch (file_query_args.layout.agg_type) switch (file_query_args.layout.agg_type)
{ {
case PAN_FS_CLIENT_LAYOUT_TYPE__RAID0: case PAN_FS_CLIENT_LAYOUT_TYPE__RAID0:
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid0.stripe_unit); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid0.stripe_unit);
MPI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid0.total_num_comps); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid0.total_num_comps);
MPI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
break; break;
case PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE: case PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE:
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.stripe_unit); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.stripe_unit);
MPI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_width); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_width);
MPI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth);
MPI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.total_num_comps); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.total_num_comps);
MPI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.layout_visit_policy); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.layout_visit_policy);
MPI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
break; break;
case PAN_FS_CLIENT_LAYOUT_TYPE__RAID10: case PAN_FS_CLIENT_LAYOUT_TYPE__RAID10:
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.stripe_unit); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.stripe_unit);
MPI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.total_num_comps); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.total_num_comps);
MPI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.layout_visit_policy); ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.layout_visit_policy);
MPI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer); ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
break; break;
} }
} }

Просмотреть файл

@ -24,7 +24,7 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
str_factor=atoi(value); str_factor=atoi(value);
@ -40,7 +40,7 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
} }
MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
str_unit=atoi(value); str_unit=atoi(value);
@ -56,7 +56,7 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
} }
MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
start_iodev=atoi(value); start_iodev=atoi(value);
@ -119,15 +119,15 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
If so, mark it as true in fd->info and turn it on in If so, mark it as true in fd->info and turn it on in
ADIOI_PFS_Open after the file is opened */ ADIOI_PFS_Open after the file is opened */
MPI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (!strcmp(value, "true"))) if (flag && (!strcmp(value, "true")))
MPI_Info_set(fd->info, "pfs_svr_buf", "true"); ADIOI_Info_set(fd->info, "pfs_svr_buf", "true");
else MPI_Info_set(fd->info, "pfs_svr_buf", "false"); else ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
ADIOI_Free(value); ADIOI_Free(value);
} }
else MPI_Info_set(fd->info, "pfs_svr_buf", "false"); else ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
/* set the values for collective I/O and data sieving parameters */ /* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code); ADIOI_GEN_SetInfo(fd, users_info, error_code);
@ -144,23 +144,23 @@ void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "false"))) { if (flag && (!strcmp(value, "true") || !strcmp(value, "false"))) {
value_in_fd = (char *) value_in_fd = (char *)
ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
value_in_fd, &flag); value_in_fd, &flag);
if (strcmp(value, value_in_fd)) { if (strcmp(value, value_in_fd)) {
if (!strcmp(value, "true")) { if (!strcmp(value, "true")) {
err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE); err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE);
if (!err) if (!err)
MPI_Info_set(fd->info, "pfs_svr_buf", "true"); ADIOI_Info_set(fd->info, "pfs_svr_buf", "true");
} }
else { else {
err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, FALSE); err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, FALSE);
if (!err) if (!err)
MPI_Info_set(fd->info, "pfs_svr_buf", "false"); ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
} }
} }
ADIOI_Free(value_in_fd); ADIOI_Free(value_in_fd);

Просмотреть файл

@ -49,11 +49,11 @@ void ADIOI_PFS_Open(ADIO_File fd, int *error_code)
to ADIOI_PFS_SetInfo. Turn it on now, since we now have a to ADIOI_PFS_SetInfo. Turn it on now, since we now have a
valid file descriptor. */ valid file descriptor. */
MPI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (!strcmp(value, "true"))) { if (flag && (!strcmp(value, "true"))) {
err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE); err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE);
if (err) MPI_Info_set(fd->info, "pfs_svr_buf", "false"); if (err) ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
} }
/* get file striping information and set it in info */ /* get file striping information and set it in info */
@ -61,13 +61,13 @@ void ADIOI_PFS_Open(ADIO_File fd, int *error_code)
if (!err) { if (!err) {
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_sunitsize); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_sunitsize);
MPI_Info_set(fd->info, "striping_unit", value); ADIOI_Info_set(fd->info, "striping_unit", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_sfactor); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_sfactor);
MPI_Info_set(fd->info, "striping_factor", value); ADIOI_Info_set(fd->info, "striping_factor", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_start_sdir); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_start_sdir);
MPI_Info_set(fd->info, "start_iodevice", value); ADIOI_Info_set(fd->info, "start_iodevice", value);
} }
ADIOI_Free(value); ADIOI_Free(value);

Просмотреть файл

@ -25,6 +25,7 @@ libadio_piofs_la_SOURCES = \
ad_piofs.c \ ad_piofs.c \
ad_piofs.h \ ad_piofs.h \
ad_piofs_fcntl.c \ ad_piofs_fcntl.c \
ad_piofs_features.c \
ad_piofs_hints.c \ ad_piofs_hints.c \
ad_piofs_open.c \ ad_piofs_open.c \
ad_piofs_read.c \ ad_piofs_read.c \

Просмотреть файл

@ -33,4 +33,5 @@ struct ADIOI_Fns_struct ADIO_PIOFS_operations = {
ADIOI_GEN_Flush, /* Flush */ ADIOI_GEN_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */ ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_PIOFS_Feature,
}; };

Просмотреть файл

@ -35,4 +35,6 @@ void ADIOI_PIOFS_WriteStrided(ADIO_File fd, void *buf, int count,
*error_code); *error_code);
void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
void ADIOI_PIOFS_Feature(ADIO_File fd, int flag);
#endif #endif

Просмотреть файл

@ -0,0 +1,13 @@
/*
 * ADIOI_PIOFS_Features - report whether the PIOFS driver supports an
 * optional ADIO feature.
 *
 * flag: the ADIO_* feature identifier being queried.
 *
 * Returns 0 for every value of flag, recognized or not.
 *
 * NOTE(review): the PIOFS operations table and ad_piofs.h appear to refer
 * to ADIOI_PIOFS_Feature (singular, also taking an ADIO_File) -- confirm
 * which name and signature are intended before relying on this symbol.
 */
int ADIOI_PIOFS_Features(int flag)
{
    switch (flag) {
    /* The known feature flags are listed explicitly, but every one of
     * them shares the same outcome as an unknown flag. */
    case ADIO_LOCKS:
    case ADIO_SHARED_FP:
    case ADIO_ATOMIC_MODE:
    case ADIO_DATA_SIEVING_WRITES:
    case ADIO_SCALABLE_OPEN:
    default:
        break;
    }

    return 0;
}

Просмотреть файл

@ -25,7 +25,7 @@ void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
str_factor=atoi(value); str_factor=atoi(value);
@ -37,7 +37,7 @@ void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
} }
} }
MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
str_unit=atoi(value); str_unit=atoi(value);
@ -49,7 +49,7 @@ void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
} }
} }
MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
start_iodev=atoi(value); start_iodev=atoi(value);

Просмотреть файл

@ -49,13 +49,13 @@ void ADIOI_PIOFS_Open(ADIO_File fd, int *error_code)
if (!err) { if (!err) {
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_bsu); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_bsu);
MPI_Info_set(fd->info, "striping_unit", value); ADIOI_Info_set(fd->info, "striping_unit", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_cells); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_cells);
MPI_Info_set(fd->info, "striping_factor", value); ADIOI_Info_set(fd->info, "striping_factor", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_base_node); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", piofs_fstat.st_base_node);
MPI_Info_set(fd->info, "start_iodevice", value); ADIOI_Info_set(fd->info, "start_iodevice", value);
} }
ADIOI_Free(value); ADIOI_Free(value);

Просмотреть файл

@ -33,4 +33,5 @@ struct ADIOI_Fns_struct ADIO_PVFS_operations = {
ADIOI_PVFS_Flush, /* Flush */ ADIOI_PVFS_Flush, /* Flush */
ADIOI_PVFS_Resize, /* Resize */ ADIOI_PVFS_Resize, /* Resize */
ADIOI_PVFS_Delete, /* Delete */ ADIOI_PVFS_Delete, /* Delete */
ADIOI_PVFS_Feature, /* Features */
}; };

Просмотреть файл

@ -17,8 +17,8 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* This must be part of the open call. can set striping parameters /* This must be part of the open call. can set striping parameters
if necessary. */ if necessary. */
MPI_Info_create(&(fd->info)); MPI_Info_create(&(fd->info));
MPI_Info_set(fd->info, "romio_pvfs_listio_read", "disable"); ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", "disable");
MPI_Info_set(fd->info, "romio_pvfs_listio_write", "disable"); ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", "disable");
fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE; fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE;
fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE; fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE;
@ -27,7 +27,7 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
str_factor=atoi(value); str_factor=atoi(value);
@ -41,10 +41,10 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
return; return;
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
} }
else MPI_Info_set(fd->info, "striping_factor", value); else ADIOI_Info_set(fd->info, "striping_factor", value);
} }
MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
str_unit=atoi(value); str_unit=atoi(value);
@ -58,10 +58,10 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
return; return;
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
} }
else MPI_Info_set(fd->info, "striping_unit", value); else ADIOI_Info_set(fd->info, "striping_unit", value);
} }
MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
start_iodev=atoi(value); start_iodev=atoi(value);
@ -75,25 +75,25 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
return; return;
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
} }
else MPI_Info_set(fd->info, "start_iodevice", value); else ADIOI_Info_set(fd->info, "start_iodevice", value);
} }
MPI_Info_get(users_info, "romio_pvfs_listio_read", ADIOI_Info_get(users_info, "romio_pvfs_listio_read",
MPI_MAX_INFO_VAL, MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE")) if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{ {
MPI_Info_set(fd->info, "romio_pvfs_listio_read", value); ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_ENABLE; fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_ENABLE;
} else if ( !strcmp(value, "disable") || !strcmp(value, "DISABLE")) } else if ( !strcmp(value, "disable") || !strcmp(value, "DISABLE"))
{ {
MPI_Info_set(fd->info , "romio_pvfs_listio_read", value); ADIOI_Info_set(fd->info , "romio_pvfs_listio_read", value);
fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE; fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE;
} }
else if ( !strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) else if ( !strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{ {
MPI_Info_set(fd->info, "romio_pvfs_listio_read", value); ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_AUTO; fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_AUTO;
} }
tmp_val = fd->hints->fs_hints.pvfs.listio_read; tmp_val = fd->hints->fs_hints.pvfs.listio_read;
@ -107,21 +107,21 @@ void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
} }
} }
MPI_Info_get(users_info, "romio_pvfs_listio_write", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "romio_pvfs_listio_write", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag) { if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE")) if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{ {
MPI_Info_set(fd->info, "romio_pvfs_listio_write", value); ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_ENABLE; fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_ENABLE;
} else if ( !strcmp(value, "disable") || !strcmp(value, "DISABLE")) } else if ( !strcmp(value, "disable") || !strcmp(value, "DISABLE"))
{ {
MPI_Info_set(fd->info, "romio_pvfs_listio_write", value); ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE; fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE;
} }
else if ( !strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) else if ( !strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{ {
MPI_Info_set(fd->info, "romio_pvfs_listio_write", value); ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_AUTO; fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_AUTO;
} }
tmp_val = fd->hints->fs_hints.pvfs.listio_write; tmp_val = fd->hints->fs_hints.pvfs.listio_write;

Просмотреть файл

@ -37,15 +37,15 @@ void ADIOI_PVFS_Open(ADIO_File fd, int *error_code)
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (atoi(value) > 0)) pstat.pcount = atoi(value); if (flag && (atoi(value) > 0)) pstat.pcount = atoi(value);
MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (atoi(value) > 0)) pstat.ssize = atoi(value); if (flag && (atoi(value) > 0)) pstat.ssize = atoi(value);
MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, ADIOI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && (atoi(value) >= 0)) pstat.base = atoi(value); if (flag && (atoi(value) >= 0)) pstat.base = atoi(value);
@ -71,11 +71,11 @@ void ADIOI_PVFS_Open(ADIO_File fd, int *error_code)
if (fd->fd_sys != -1) { if (fd->fd_sys != -1) {
pvfs_ioctl(fd->fd_sys, GETMETA, &pstat); pvfs_ioctl(fd->fd_sys, GETMETA, &pstat);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.pcount); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.pcount);
MPI_Info_set(fd->info, "striping_factor", value); ADIOI_Info_set(fd->info, "striping_factor", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.ssize); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.ssize);
MPI_Info_set(fd->info, "striping_unit", value); ADIOI_Info_set(fd->info, "striping_unit", value);
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.base); ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", pstat.base);
MPI_Info_set(fd->info, "start_iodevice", value); ADIOI_Info_set(fd->info, "start_iodevice", value);
} }
ADIOI_Free(value); ADIOI_Free(value);

Просмотреть файл

@ -43,6 +43,7 @@ void ADIOI_PVFS_ReadContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING #ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif #endif
if (err>0)
fd->fp_sys_posn = offset + err; fd->fp_sys_posn = offset + err;
/* individual file pointer not updated */ /* individual file pointer not updated */
} }
@ -63,6 +64,7 @@ void ADIOI_PVFS_ReadContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING #ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif #endif
if (err > 0)
fd->fp_ind += err; fd->fp_ind += err;
fd->fp_sys_posn = fd->fp_ind; fd->fp_sys_posn = fd->fp_ind;
} }

Просмотреть файл

@ -43,6 +43,7 @@ void ADIOI_PVFS_WriteContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING #ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif #endif
if (err > 0)
fd->fp_sys_posn = offset + err; fd->fp_sys_posn = offset + err;
/* individual file pointer not updated */ /* individual file pointer not updated */
} }
@ -63,6 +64,7 @@ void ADIOI_PVFS_WriteContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING #ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif #endif
if (err > 0)
fd->fp_ind += err; fd->fp_ind += err;
fd->fp_sys_posn = fd->fp_ind; fd->fp_sys_posn = fd->fp_ind;
} }

Просмотреть файл

@ -28,9 +28,15 @@ libadio_pvfs2_la_SOURCES = \
ad_pvfs2_common.c \ ad_pvfs2_common.c \
ad_pvfs2_delete.c \ ad_pvfs2_delete.c \
ad_pvfs2_fcntl.c \ ad_pvfs2_fcntl.c \
ad_pvfs2_features.c \
ad_pvfs2_flush.c \ ad_pvfs2_flush.c \
ad_pvfs2_hints.c \ ad_pvfs2_hints.c \
ad_pvfs2_io.h \
ad_pvfs2_io_dtype.c \
ad_pvfs2_io_list.c \
ad_pvfs2_open.c \ ad_pvfs2_open.c \
ad_pvfs2_read.c \ ad_pvfs2_read.c \
ad_pvfs2_read_list_classic.c \
ad_pvfs2_resize.c \ ad_pvfs2_resize.c \
ad_pvfs2_write.c ad_pvfs2_write.c \
ad_pvfs2_write_list_classic.c

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_PVFS2_operations = { struct ADIOI_Fns_struct ADIO_PVFS2_operations = {
ADIOI_PVFS2_Open, /* Open */ ADIOI_PVFS2_Open, /* Open */
ADIOI_SCALEABLE_OpenColl, /* OpenColl */
ADIOI_PVFS2_ReadContig, /* ReadContig */ ADIOI_PVFS2_ReadContig, /* ReadContig */
ADIOI_PVFS2_WriteContig, /* WriteContig */ ADIOI_PVFS2_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -22,13 +23,8 @@ struct ADIOI_Fns_struct ADIO_PVFS2_operations = {
ADIOI_PVFS2_ReadStrided, /* ReadStrided */ ADIOI_PVFS2_ReadStrided, /* ReadStrided */
ADIOI_PVFS2_WriteStrided, /* WriteStrided */ ADIOI_PVFS2_WriteStrided, /* WriteStrided */
ADIOI_PVFS2_Close, /* Close */ ADIOI_PVFS2_Close, /* Close */
#ifdef ROMIO_HAVE_WORKING_AIO
ADIOI_PVFS2_IReadContig, /* IreadContig */ ADIOI_PVFS2_IReadContig, /* IreadContig */
ADIOI_PVFS2_IWriteContig, /* IwriteContig */ ADIOI_PVFS2_IWriteContig, /* IwriteContig */
#else
ADIOI_FAKE_IreadContig, /* IreadContig */
ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif
ADIOI_FAKE_IODone, /* ReadDone */ ADIOI_FAKE_IODone, /* ReadDone */
ADIOI_FAKE_IODone, /* WriteDone */ ADIOI_FAKE_IODone, /* WriteDone */
ADIOI_FAKE_IOComplete, /* ReadComplete */ ADIOI_FAKE_IOComplete, /* ReadComplete */
@ -38,6 +34,7 @@ struct ADIOI_Fns_struct ADIO_PVFS2_operations = {
ADIOI_PVFS2_Flush, /* Flush */ ADIOI_PVFS2_Flush, /* Flush */
ADIOI_PVFS2_Resize, /* Resize */ ADIOI_PVFS2_Resize, /* Resize */
ADIOI_PVFS2_Delete, /* Delete */ ADIOI_PVFS2_Delete, /* Delete */
ADIOI_PVFS2_Feature,
}; };
/* /*

Просмотреть файл

@ -17,7 +17,6 @@
#include "pvfs2-compat.h" #include "pvfs2-compat.h"
#endif #endif
void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code); void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code);
void ADIOI_PVFS2_Close(ADIO_File fd, int *error_code); void ADIOI_PVFS2_Close(ADIO_File fd, int *error_code);
void ADIOI_PVFS2_ReadContig(ADIO_File fd, void *buf, int count, void ADIOI_PVFS2_ReadContig(ADIO_File fd, void *buf, int count,
@ -42,6 +41,8 @@ void ADIOI_PVFS2_Flush(ADIO_File fd, int *error_code);
void ADIOI_PVFS2_Delete(char *filename, int *error_code); void ADIOI_PVFS2_Delete(char *filename, int *error_code);
void ADIOI_PVFS2_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); void ADIOI_PVFS2_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
int ADIOI_PVFS2_Feature(ADIO_File fd, int flag);
void ADIOI_PVFS2_IReadContig(ADIO_File fd, void *buf, int count, void ADIOI_PVFS2_IReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, MPI_Request *request, ADIO_Offset offset, MPI_Request *request,
@ -54,4 +55,12 @@ void ADIOI_PVFS2_AIO_contig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, MPI_Request *request, ADIO_Offset offset, MPI_Request *request,
int flag, int *error_code); int flag, int *error_code);
void ADIOI_PVFS2_OldWriteStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_PVFS2_OldReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
#endif #endif

Просмотреть файл

@ -17,7 +17,6 @@
#define READ 0 #define READ 0
#define WRITE 1 #define WRITE 1
#ifdef ROMIO_HAVE_WORKING_AIO
static int ADIOI_PVFS2_greq_class = 0; static int ADIOI_PVFS2_greq_class = 0;
int ADIOI_PVFS2_aio_free_fn(void *extra_state); int ADIOI_PVFS2_aio_free_fn(void *extra_state);
int ADIOI_PVFS2_aio_poll_fn(void *extra_state, MPI_Status *status); int ADIOI_PVFS2_aio_poll_fn(void *extra_state, MPI_Status *status);
@ -168,12 +167,10 @@ int ADIOI_PVFS2_aio_poll_fn(void *extra_state, MPI_Status *status)
aio_req = (ADIOI_AIO_Request *)extra_state; aio_req = (ADIOI_AIO_Request *)extra_state;
/* BUG: cannot PVFS_sys_testsome: does not work for a specific request */ /* BUG: cannot PVFS_sys_testsome: does not work for a specific request */
ret = PVFS_sys_wait(aio_req->op_id, __FUNCTION__, &error); ret = PVFS_sys_wait(aio_req->op_id, "ADIOI_PVFS2_aio_poll_fn", &error);
if (ret == 0) { if (ret == 0) {
aio_req->nbytes = aio_req->resp_io.total_completed; aio_req->nbytes = aio_req->resp_io.total_completed;
MPIR_Nest_incr();
MPI_Grequest_complete(aio_req->req); MPI_Grequest_complete(aio_req->req);
MPIR_Nest_decr();
return MPI_SUCCESS; return MPI_SUCCESS;
} else } else
return MPI_UNDEFINED; /* TODO: what's this error? */ return MPI_UNDEFINED; /* TODO: what's this error? */
@ -186,7 +183,7 @@ int ADIOI_PVFS2_aio_wait_fn(int count, void ** array_of_states,
ADIOI_AIO_Request **aio_reqlist; ADIOI_AIO_Request **aio_reqlist;
PVFS_sys_op_id *op_id_array; PVFS_sys_op_id *op_id_array;
int i,j, greq_count; int i,j, greq_count, completed_count=0;
int *error_array; int *error_array;
aio_reqlist = (ADIOI_AIO_Request **)array_of_states; aio_reqlist = (ADIOI_AIO_Request **)array_of_states;
@ -195,25 +192,27 @@ int ADIOI_PVFS2_aio_wait_fn(int count, void ** array_of_states,
error_array = (int *)ADIOI_Calloc(count, sizeof(int)); error_array = (int *)ADIOI_Calloc(count, sizeof(int));
greq_count = count; greq_count = count;
/* PVFS-2.6: testsome actually tests all requests and fills in op_id_array /* PVFS-2.6: testsome actually tests all requests and fills in op_id_array
* with the ones that have completed. count is an in/out parameter. * with the ones that have completed. count is an in/out parameter.
* returns with the number of completed operations. what a mess! */ * returns with the number of completed operations. what a mess! */
while (completed_count < greq_count ) {
count = greq_count;
PVFS_sys_testsome(op_id_array, &count, NULL, error_array, INT_MAX); PVFS_sys_testsome(op_id_array, &count, NULL, error_array, INT_MAX);
completed_count += count;
for (i=0; i< count; i++) { for (i=0; i< count; i++) {
for (j=0; j<greq_count; j++) { for (j=0; j<greq_count; j++) {
if (op_id_array[i] == aio_reqlist[j]->op_id) { if (op_id_array[i] == aio_reqlist[j]->op_id) {
aio_reqlist[j]->nbytes = aio_reqlist[j]->nbytes =
aio_reqlist[j]->resp_io.total_completed; aio_reqlist[j]->resp_io.total_completed;
MPIR_Nest_incr();
MPI_Grequest_complete(aio_reqlist[j]->req); MPI_Grequest_complete(aio_reqlist[j]->req);
MPIR_Nest_decr(); }
} }
} }
} }
return MPI_SUCCESS; /* TODO: no idea how to deal with errors */ return MPI_SUCCESS; /* TODO: no idea how to deal with errors */
} }
#endif
/* /*
* vim: ts=8 sts=4 sw=4 noexpandtab * vim: ts=8 sts=4 sw=4 noexpandtab

Просмотреть файл

@ -42,6 +42,7 @@ int ADIOI_PVFS2_End_call(MPI_Comm comm, int keyval,
{ {
int error_code; int error_code;
ADIOI_PVFS2_End(&error_code); ADIOI_PVFS2_End(&error_code);
MPI_Keyval_free(&keyval);
return error_code; return error_code;
} }
@ -81,7 +82,7 @@ void ADIOI_PVFS2_Init(int *error_code )
&ADIOI_PVFS2_Initialized, (void *)0); &ADIOI_PVFS2_Initialized, (void *)0);
/* just like romio does, we make a dummy attribute so we /* just like romio does, we make a dummy attribute so we
* get cleaned up */ * get cleaned up */
MPI_Attr_put(MPI_COMM_WORLD, ADIOI_PVFS2_Initialized, (void *)0); MPI_Attr_put(MPI_COMM_SELF, ADIOI_PVFS2_Initialized, (void *)0);
} }
void ADIOI_PVFS2_makeattribs(PVFS_sys_attr * attribs) void ADIOI_PVFS2_makeattribs(PVFS_sys_attr * attribs)
@ -107,9 +108,43 @@ void ADIOI_PVFS2_makecredentials(PVFS_credentials * credentials)
int ADIOI_PVFS2_error_convert(int pvfs_error) int ADIOI_PVFS2_error_convert(int pvfs_error)
{ {
switch(pvfs_error)
{
case PVFS_EPERM:
case PVFS_EACCES:
return MPI_ERR_ACCESS;
case PVFS_ENOENT:
case PVFS_ENXIO:
case PVFS_ENODEV:
return MPI_ERR_NO_SUCH_FILE;
case PVFS_EIO:
return MPI_ERR_IO;
case PVFS_EEXIST:
return MPI_ERR_FILE_EXISTS;
case PVFS_ENOTDIR: /* ??? */
case PVFS_EISDIR: /* ??? */
case PVFS_ENAMETOOLONG:
return MPI_ERR_BAD_FILE;
case PVFS_EINVAL:
return MPI_ERR_FILE;
case PVFS_EFBIG: /* ??? */
case PVFS_ENOSPC:
return MPI_ERR_NO_SPACE;
case PVFS_EROFS:
return MPI_ERR_READ_ONLY;
case PVFS_ENOSYS:
return MPI_ERR_UNSUPPORTED_OPERATION;
/* PVFS does not support quotas */
case EDQUOT:
return MPI_ERR_QUOTA;
case PVFS_ENOMEM:
return MPI_ERR_INTERN;
default:
return MPI_UNDEFINED; return MPI_UNDEFINED;
} }
}
/* /*
* vim: ts=8 sts=4 sw=4 noexpandtab * vim: ts=8 sts=4 sw=4 noexpandtab
*/ */

Просмотреть файл

@ -0,0 +1,16 @@
#include "adio.h"
#include "ad_pvfs2.h"
/*
 * ADIOI_PVFS2_Feature - answer a capability query for the PVFS2 driver.
 *
 * fd   - file handle the query is made for (not consulted)
 * flag - one of the ADIO_* feature identifiers
 *
 * Returns 1 when the feature is supported and 0 otherwise.
 */
int ADIOI_PVFS2_Feature(ADIO_File fd, int flag)
{
    /* Scalable open is the only feature reported as available.  Every
     * other identifier -- ADIO_SHARED_FP, ADIO_LOCKS, ADIO_SEQUENTIAL,
     * ADIO_DATA_SIEVING_WRITES and anything unrecognised -- yields 0. */
    if (flag == ADIO_SCALABLE_OPEN)
        return 1;

    return 0;
}

Просмотреть файл

@ -17,20 +17,37 @@ void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if ((fd->info) == MPI_INFO_NULL) { if ((fd->info) == MPI_INFO_NULL) {
/* part of the open call */ /* part of the open call */
MPI_Info_create(&(fd->info)); MPI_Info_create(&(fd->info));
MPI_Info_set(fd->info, "romio_pvfs2_debugmask", "0"); ADIOI_Info_set(fd->info, "romio_pvfs2_debugmask", "0");
fd->hints->fs_hints.pvfs2.debugmask = 0; fd->hints->fs_hints.pvfs2.debugmask = 0;
MPI_Info_set(fd->info, "striping_factor", "0"); ADIOI_Info_set(fd->info, "striping_factor", "0");
fd->hints->striping_factor = 0; fd->hints->striping_factor = 0;
MPI_Info_set(fd->info, "striping_unit", "0"); ADIOI_Info_set(fd->info, "striping_unit", "0");
fd->hints->striping_unit = 0; fd->hints->striping_unit = 0;
/* disable the aggressive strided optimizations by default */
ADIOI_Info_set(fd->info, "romio_pvfs2_posix_read", "disable");
ADIOI_Info_set(fd->info, "romio_pvfs2_posix_write", "disable");
fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_DISABLE;
fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_DISABLE;
ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_read", "disable");
ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_write", "disable");
fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_DISABLE;
fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_DISABLE;
ADIOI_Info_set(fd->info, "romio_pvfs2_listio_read", "disable");
ADIOI_Info_set(fd->info, "romio_pvfs2_listio_write", "disable");
fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_DISABLE;
fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_DISABLE;
/* any user-provided hints? */ /* any user-provided hints? */
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
/* pvfs2 debugging */ /* pvfs2 debugging */
value = (char *) ADIOI_Malloc( (MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc( (MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "romio_pvfs2_debugmask", ADIOI_Info_get(users_info, "romio_pvfs2_debugmask",
MPI_MAX_INFO_VAL, value, &flag); MPI_MAX_INFO_VAL, value, &flag);
if (flag) { if (flag) {
tmp_value = fd->hints->fs_hints.pvfs2.debugmask = tmp_value = fd->hints->fs_hints.pvfs2.debugmask =
@ -46,11 +63,11 @@ void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
} }
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
MPI_Info_set(fd->info, "romio_pvfs2_debugmask", value); ADIOI_Info_set(fd->info, "romio_pvfs2_debugmask", value);
} }
/* the striping factor */ /* the striping factor */
MPI_Info_get(users_info, "striping_factor", ADIOI_Info_get(users_info, "striping_factor",
MPI_MAX_INFO_VAL, value, &flag); MPI_MAX_INFO_VAL, value, &flag);
if (flag) { if (flag) {
tmp_value = fd->hints->striping_factor = atoi(value); tmp_value = fd->hints->striping_factor = atoi(value);
@ -65,11 +82,11 @@ void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
} }
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
MPI_Info_set(fd->info, "striping_factor", value); ADIOI_Info_set(fd->info, "striping_factor", value);
} }
/* the striping unit */ /* the striping unit */
MPI_Info_get(users_info, "striping_unit", ADIOI_Info_get(users_info, "striping_unit",
MPI_MAX_INFO_VAL, value, &flag); MPI_MAX_INFO_VAL, value, &flag);
if (flag) { if (flag) {
tmp_value = fd->hints->striping_unit = atoi(value); tmp_value = fd->hints->striping_unit = atoi(value);
@ -83,16 +100,167 @@ void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
} }
/* --END ERROR HANDLING-- */ /* --END ERROR HANDLING-- */
MPI_Info_set(fd->info, "striping_unit", value); ADIOI_Info_set(fd->info, "striping_unit", value);
} }
/* distribution name */ /* distribution name */
MPI_Info_get(users_info, "romio_pvfs2_distribution_name", ADIOI_Info_get(users_info, "romio_pvfs2_distribution_name",
MPI_MAX_INFO_VAL, value, &flag); MPI_MAX_INFO_VAL, value, &flag);
if (flag) { if (flag) {
} }
/* POSIX read */
ADIOI_Info_get(users_info, "romio_pvfs2_posix_read",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_posix_read", value);
fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_posix_read", value);
fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.posix_read;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.posix_read) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"posix_read",
error_code);
return;
}
}
/* POSIX write */
ADIOI_Info_get(users_info, "romio_pvfs2_posix_write",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_posix_write", value);
fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_posix_write", value);
fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.posix_write;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.posix_write) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"posix_write",
error_code);
return;
}
}
/* Datatype read */
ADIOI_Info_get(users_info, "romio_pvfs2_dtype_read",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_read", value);
fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_dtype_read", value);
fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.dtype_read;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.dtype_read) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"dtype_read",
error_code);
return;
}
}
/* Datatype write */
ADIOI_Info_get(users_info, "romio_pvfs2_dtype_write",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_write", value);
fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_dtype_write", value);
fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.dtype_write;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.dtype_write) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"dtype_write",
error_code);
return;
}
}
/* Listio read */
ADIOI_Info_get(users_info, "romio_pvfs2_listio_read",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_listio_read", value);
fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_listio_read", value);
fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.listio_read;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.listio_read) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"listio_read",
error_code);
return;
}
}
/* Datatype write */
ADIOI_Info_get(users_info, "romio_pvfs2_listio_write",
MPI_MAX_INFO_VAL, value, &flag);
if (flag) {
if ( !strcmp(value, "enable") || !strcmp(value, "ENABLE"))
{
ADIOI_Info_set(fd->info, "romio_pvfs2_listio_write", value);
fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_ENABLE;
}
else if ( !strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))
{
ADIOI_Info_set(fd->info , "romio_pvfs2_listio_write", value);
fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_DISABLE;
}
tmp_value = fd->hints->fs_hints.pvfs2.listio_write;
MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
if (tmp_value != fd->hints->fs_hints.pvfs2.listio_write) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"listio_write",
error_code);
return;
}
}
ADIOI_Free(value); ADIOI_Free(value);
} }
} }
/* set the values for collective I/O and data sieving parameters */ /* set the values for collective I/O and data sieving parameters */

Просмотреть файл

@ -0,0 +1,79 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2006 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/* Guard against multiple inclusion so the header can be pulled in by
 * several PVFS2 sources (and by other headers) without surprises. */
#ifndef AD_PVFS2_IO_H_INCLUDED
#define AD_PVFS2_IO_H_INCLUDED

/* Direction selectors passed as the rw_type argument of the helpers below */
#define READ 0
#define WRITE 1

/* Uncomment to enable the per-method diagnostic output */
/* #define DEBUG_CONTIG */
/* #define DEBUG_LIST */
/* #define DEBUG_DTYPE */

/* Contig I/O helper prototypes */
/* rw_type is READ or WRITE; the remaining arguments mirror the ADIO
 * strided read/write entry points. */
int ADIOI_PVFS2_Contig(ADIO_File fd, void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status,
                       int *error_code, int rw_type);

/* List I/O helper prototypes */
int ADIOI_PVFS2_StridedListIO(ADIO_File fd, void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Status *status,
                              int *error_code, int rw_type);

/* Fills the buf_* and file_* offset/length arrays from the flattened memory
 * and file types.  NOTE(review): the exact contract of the in/out cursor
 * arguments (*_index_p, *_reg_off_p, bytes_completed) lives with the
 * implementation -- confirm there before relying on it. */
int gen_listio_arr(ADIOI_Flatlist_node *flat_buf,
                   int *flat_buf_index_p,
                   int64_t *cur_flat_buf_reg_off_p,
                   int flat_buf_size,
                   int flat_buf_extent,
                   ADIOI_Flatlist_node *flat_file,
                   int *flat_file_index_p,
                   int64_t *cur_flat_file_reg_off_p,
                   int flat_file_size,
                   int flat_file_extent,
                   int max_ol_count,
                   ADIO_Offset disp,
                   int bytes_into_filetype,
                   int64_t *bytes_completed,
                   int64_t total_io_size,
                   int64_t buf_off_arr[],
                   int32_t buf_len_arr[],
                   int32_t *buf_ol_count_p,
                   int64_t file_off_arr[],
                   int32_t file_len_arr[],
                   int32_t *file_ol_count_p);

/* Debug helper: dumps the memory and file offset/length pairs */
void print_buf_file_ol_pairs(int64_t buf_off_arr[],
                             int32_t buf_len_arr[],
                             int32_t buf_ol_count,
                             int64_t file_off_arr[],
                             int32_t file_len_arr[],
                             int32_t file_ol_count,
                             void *buf,
                             int rw_type);

/* Datatype I/O helper prototypes */
/* Returns the PVFS return code of the I/O call (0 on success); a nonzero
 * value tells the caller that datatype I/O did not work. */
int ADIOI_PVFS2_StridedDtypeIO(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status,
                               int *error_code, int rw_type);

/* Maps a predefined (named) MPI datatype onto a PVFS request */
int convert_named(MPI_Datatype *mpi_dtype,
                  PVFS_Request *pvfs_dtype, int combiner);

/* Debug helper: prints the envelope/contents of a datatype */
void print_dtype_info(int combiner,
                      int num_int,
                      int num_addr,
                      int num_dtype,
                      int *arr_int,
                      MPI_Aint *arr_addr,
                      MPI_Datatype *arr_dtype);

/* Recursively converts an MPI datatype into a PVFS request:
 * returns 1 for a leaf (named type), 0 on normal completion, -1 on problems */
int convert_mpi_pvfs2_dtype(MPI_Datatype *mpi_dtype,
                            PVFS_Request *pvfs_dtype);

#endif /* AD_PVFS2_IO_H_INCLUDED */

Просмотреть файл

@ -0,0 +1,720 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2006 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include <assert.h>
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_io.h"
#include "ad_pvfs2_common.h"
/*
 * ADIOI_PVFS2_StridedDtypeIO - perform one strided read or write by
 * converting the MPI memory type and the file view into native PVFS2
 * request types and issuing a single PVFS_sys_read/PVFS_sys_write.
 *
 * fd            - open file; its filetype, etype, disp and fp_ind are used
 * buf           - user buffer
 * count         - number of 'datatype' elements in buf
 * datatype      - MPI memory datatype
 * file_ptr_type - ADIO_INDIVIDUAL uses fd->fp_ind; anything else treats
 *                 'offset' as an explicit offset in etype units
 * offset        - explicit offset (ignored for ADIO_INDIVIDUAL)
 * status        - receives the byte count when HAVE_STATUS_SET_BYTES is set
 * error_code    - MPI_SUCCESS unless the PVFS I/O call itself fails
 * rw_type       - READ or WRITE
 *
 * Returns 0 when the PVFS I/O call succeeded, -1 when the filetype is empty,
 * and otherwise the failing conversion / PVFS return code.
 */
int ADIOI_PVFS2_StridedDtypeIO(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status,
                               int *error_code,
                               int rw_type)
{
    int filetype_size = -1, ret = -1, filetype_is_contig = -1;
    int num_filetypes = 0, cur_flat_file_reg_off = 0;
    PVFS_Request tmp_mem_req, mem_req, tmp_file_req, file_req;
    PVFS_sysresp_io resp_io;
    ADIO_Offset off = -1, bytes_into_filetype = 0;
    MPI_Aint filetype_extent = -1;
    int etype_size = -1, i = -1;
    PVFS_size pvfs_disp = -1;
    ADIOI_Flatlist_node *flat_file_p = ADIOI_Flatlist;

    /* Use for offseting the PVFS2 filetype */
    int pvfs_blk = 1;
    ADIOI_PVFS2_fs *pvfs_fs;
    static char myname[] = "ADIOI_PVFS2_STRIDED_DTYPE";

    memset(&tmp_mem_req, 0, sizeof(PVFS_Request));
    memset(&mem_req, 0, sizeof(PVFS_Request));
    memset(&tmp_file_req, 0, sizeof(PVFS_Request));
    memset(&file_req, 0, sizeof(PVFS_Request));
    /* resp_io.total_completed is read at error_state even when no I/O call
     * was ever issued, so it must start out as zero. */
    memset(&resp_io, 0, sizeof(resp_io));

    pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;

    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* changed below if error */
    *error_code = MPI_SUCCESS;

    /* datatype is the memory type
     * fd->filetype is the file type */
    MPI_Type_size(fd->filetype, &filetype_size);
    if (filetype_size == 0) {
        *error_code = MPI_SUCCESS;
        return -1;
    }
    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(fd->etype, &etype_size);

    /* offset is in units of etype relative to the filetype.  We
     * convert this to off in terms of actual data bytes (the offset
     * minus the number of bytes that are not used).  We are allowed
     * to do this since PVFS2 handles offsets with respect to a
     * file_req in bytes, otherwise we would have to convert into a
     * pure byte offset as is done in other methods.  Explicit offset
     * case is handled by using fd->disp and byte-converted off. */
    pvfs_disp = fd->disp;
    if (file_ptr_type == ADIO_INDIVIDUAL)
    {
        if (filetype_is_contig)
        {
            off = fd->fp_ind - fd->disp;
        }
        else
        {
            int flag = 0;

            /* Should have already been flattened in ADIO_Open*/
            while (flat_file_p->type != fd->filetype)
            {
                flat_file_p = flat_file_p->next;
            }

            /* Walk successive tiles of the filetype until the region
             * holding fp_ind is found, counting the data bytes skipped. */
            num_filetypes = -1;
            while (!flag)
            {
                num_filetypes++;
                for (i = 0; i < flat_file_p->count; i++)
                {
                    /* Start on a non zero-length region */
                    if (flat_file_p->blocklens[i])
                    {
                        if (fd->disp + flat_file_p->indices[i] +
                            (num_filetypes * filetype_extent) +
                            flat_file_p->blocklens[i] > fd->fp_ind &&
                            fd->disp + flat_file_p->indices[i] <=
                            fd->fp_ind)
                        {
                            cur_flat_file_reg_off = fd->fp_ind -
                                (fd->disp + flat_file_p->indices[i] +
                                 (num_filetypes * filetype_extent));
                            flag = 1;
                            break;
                        }
                        else
                            bytes_into_filetype += flat_file_p->blocklens[i];
                    }
                }
            }
            /* Impossible that we don't find it in this datatype */
            assert(i != flat_file_p->count);
            off = bytes_into_filetype + cur_flat_file_reg_off;
        }
    }
    else /* ADIO_EXPLICIT */
    {
        off = etype_size * offset;
    }

#ifdef DEBUG_DTYPE
    fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: (fd->fp_ind=%lld,fd->disp=%lld,"
            " offset=%lld),(pvfs_disp=%lld,off=%lld)\n",
            (long long) fd->fp_ind, (long long) fd->disp,
            (long long) offset, (long long) pvfs_disp, (long long) off);
#endif

    /* Convert the MPI memory and file datatypes into
     * PVFS2 datatypes */
    ret = convert_mpi_pvfs2_dtype(&datatype, &tmp_mem_req);
    if (ret < 0)
    {
        goto error_state;
    }
    ret = convert_mpi_pvfs2_dtype(&(fd->filetype), &tmp_file_req);
    if (ret < 0)
    {
        /* the memory request was already built; do not leak it */
        PVFS_Request_free(&tmp_mem_req);
        goto error_state;
    }

    ret = PVFS_Request_contiguous(count, tmp_mem_req, &mem_req);
    if (ret != 0) /* TODO: convert this to MPIO error handling */
        fprintf(stderr, "ADIOI_PVFS2_stridedDtypeIO: error in final"
                " CONTIG memory type\n");
    PVFS_Request_free(&tmp_mem_req);

    /* pvfs_disp is used to offset the filetype */
    ret = PVFS_Request_hindexed(1, &pvfs_blk, &pvfs_disp,
                                tmp_file_req, &file_req);
    if (ret != 0)
        fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: error in final"
                " HINDEXED file type\n");
    PVFS_Request_free(&tmp_file_req);

    if (rw_type == READ)
        ret = PVFS_sys_read(pvfs_fs->object_ref, file_req, off, buf,
                            mem_req, &(pvfs_fs->credentials), &resp_io);
    else
        ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, off, buf,
                             mem_req, &(pvfs_fs->credentials), &resp_io);

    if (ret != 0) {
        fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: Warning - PVFS_sys_"
                "read/write returned %d and completed %lld bytes.\n",
                ret, (long long) resp_io.total_completed);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           ADIOI_PVFS2_error_convert(ret),
                                           "Error in PVFS_sys_io \n", 0);
        goto error_state;
    }

    if (file_ptr_type == ADIO_INDIVIDUAL)
    {
        /* NOTE(review): 'off' is a data-byte offset relative to fd->disp,
         * while fp_ind is compared against fd->disp-based positions above;
         * confirm this assignment leaves fp_ind in the expected units. */
        fd->fp_ind = off += resp_io.total_completed;
    }

  error_state:
    fd->fp_sys_posn = -1;   /* set it to null. */
    PVFS_Request_free(&mem_req);
    PVFS_Request_free(&file_req);

#ifdef DEBUG_DTYPE
    fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: "
            "resp_io.total_completed=%lld,ret=%d\n",
            (long long) resp_io.total_completed, ret);
#endif

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, (int)resp_io.total_completed);
    /* This is a temporary way of filling in status. The right way is to
     * keep track of how much data was actually acccessed by
     * ADIOI_BUFFERED operations */
#endif
    return ret;
}
/* pvfs2_release_contents_dtype - release a datatype handle obtained from
 * MPI_Type_get_contents.  Predefined (named) types come back as the
 * constant handle and must not be freed, so only derived types are
 * passed to MPI_Type_free. */
static void pvfs2_release_contents_dtype(MPI_Datatype *dtype)
{
    int n_int = 0, n_addr = 0, n_dtype = 0, comb = MPI_COMBINER_NAMED;

    MPI_Type_get_envelope(*dtype, &n_int, &n_addr, &n_dtype, &comb);
    if (comb != MPI_COMBINER_NAMED)
        MPI_Type_free(dtype);
}

/* convert_mpi_pvfs2_dtype - Convert a MPI datatype into
 * a PVFS2 datatype so that we can natively use the PVFS2
 * datatypes in the PVFS2 I/O calls instead of converting
 * all datatypes to the hindexed method
 *
 * mpi_dtype  - datatype to convert (decoded with MPI_Type_get_envelope /
 *              MPI_Type_get_contents and converted recursively)
 * pvfs_dtype - receives the resulting PVFS request; only valid when the
 *              return value is >= 0
 *
 * return 1  - a leaf node
 * return 0  - normal return
 * return -1 - problems */
int convert_mpi_pvfs2_dtype(MPI_Datatype *mpi_dtype,
                            PVFS_Request *pvfs_dtype)
{
    int num_int = -1, num_addr = -1, num_dtype = -1,
        combiner = -1, i = -1, ret = -1, leaf = -1;
    int *arr_int = NULL;
    /* MPI_Type_get_contents writes MPI_Aint values here, so the array must
     * be declared and sized as MPI_Aint, not int. */
    MPI_Aint *arr_addr = NULL;
    MPI_Datatype *arr_dtype = NULL;
    PVFS_Request *old_pvfs_dtype = NULL;
    PVFS_Request *old_pvfs_dtype_arr = NULL;
    int arr_count = -1;
    PVFS_size *pvfs_arr_disp = NULL;
    int *pvfs_arr_len = NULL;

    MPI_Type_get_envelope(*mpi_dtype,
                          &num_int,
                          &num_addr,
                          &num_dtype,
                          &combiner);

    /* Depending on type of datatype do the following
     * operations */
    if (combiner == MPI_COMBINER_NAMED)
    {
        convert_named(mpi_dtype, pvfs_dtype, combiner);
        return 1;
    }

    /* Allocate space for the arrays necessary for
     * MPI_Type_get_contents */
    if ((arr_int = ADIOI_Malloc(sizeof(int)*num_int)) == NULL)
    {
        fprintf(stderr, "Failed to allocate array_int\n");
        return -1;
    }
    if ((arr_addr = ADIOI_Malloc(sizeof(MPI_Aint)*num_addr)) == NULL)
    {
        ADIOI_Free(arr_int);
        fprintf(stderr, "Failed to allocate array_addr\n");
        return -1;
    }
    if ((arr_dtype = ADIOI_Malloc(sizeof(MPI_Datatype)*num_dtype)) == NULL)
    {
        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        fprintf(stderr, "Failed to allocate array_dtypes\n");
        return -1;
    }

    MPI_Type_get_contents(*mpi_dtype,
                          num_int,
                          num_addr,
                          num_dtype,
                          arr_int,
                          arr_addr,
                          arr_dtype);

    /* If it's not a predefined datatype, it is either a
     * derived datatype or a structured datatype */
    if (combiner != MPI_COMBINER_STRUCT)
    {
        const char *unsupported = NULL;
        int supported = 0;

        /* Classify first, so nothing is allocated or converted for a
         * combiner that cannot be expressed as a PVFS request. */
        switch (combiner)
        {
            case MPI_COMBINER_CONTIGUOUS:
            case MPI_COMBINER_VECTOR:
            case MPI_COMBINER_HVECTOR:
            case MPI_COMBINER_INDEXED:
            case MPI_COMBINER_HINDEXED:
            case MPI_COMBINER_DUP:
                supported = 1;
                break;
            case MPI_COMBINER_INDEXED_BLOCK:
                /* No native PVFS2 support for this operation currently */
                unsupported = "INDEXED_BLOCK";
                break;
            case MPI_COMBINER_HINDEXED_INTEGER:
                unsupported = "HINDEXED_INTEGER";
                break;
            case MPI_COMBINER_STRUCT_INTEGER:
                unsupported = "STRUCT_INTEGER";
                break;
            case MPI_COMBINER_SUBARRAY:
                unsupported = "SUBARRAY";
                break;
            case MPI_COMBINER_DARRAY:
                unsupported = "DARRAY";
                break;
            case MPI_COMBINER_F90_REAL:
                unsupported = "F90_REAL";
                break;
            case MPI_COMBINER_F90_COMPLEX:
                unsupported = "F90_COMPLEX";
                break;
            case MPI_COMBINER_F90_INTEGER:
                unsupported = "F90_INTEGER";
                break;
            case MPI_COMBINER_RESIZED:
                unsupported = "RESIZED";
                break;
            default:
                break;
        }

        if (!supported)
        {
            if (unsupported != NULL)
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "%s is unsupported\n", unsupported);
            fprintf(stderr, "Error in PVFS_Request_* "
                    "for a derived datatype\n");
            /* Hand back whatever derived types get_contents created;
             * num_dtype may be 0 (e.g. the F90 combiners). */
            for (i = 0; i < num_dtype; i++)
                pvfs2_release_contents_dtype(&arr_dtype[i]);
            ADIOI_Free(arr_int);
            ADIOI_Free(arr_addr);
            ADIOI_Free(arr_dtype);
            return -1;
        }

        if ((old_pvfs_dtype = ADIOI_Malloc(sizeof(PVFS_Request))) == NULL)
        {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate PVFS_Request\n");
            if (combiner != MPI_COMBINER_DUP)
                pvfs2_release_contents_dtype(&arr_dtype[0]);
            ADIOI_Free(arr_int);
            ADIOI_Free(arr_addr);
            ADIOI_Free(arr_dtype);
            return -1;
        }

        /* Every supported combiner wraps exactly one old type */
        leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);

        /* Only build on top of the child when it was converted; on failure
         * *old_pvfs_dtype is uninitialised and ret stays -1. */
        if (leaf >= 0)
        {
            switch (combiner)
            {
                case MPI_COMBINER_CONTIGUOUS:
                    ret = PVFS_Request_contiguous(arr_int[0],
                                                  *old_pvfs_dtype,
                                                  pvfs_dtype);
                    break;
                case MPI_COMBINER_VECTOR:
                    ret = PVFS_Request_vector(arr_int[0], arr_int[1],
                                              arr_int[2], *old_pvfs_dtype,
                                              pvfs_dtype);
                    break;
                case MPI_COMBINER_HVECTOR:
                    ret = PVFS_Request_hvector(arr_int[0], arr_int[1],
                                               arr_addr[0], *old_pvfs_dtype,
                                               pvfs_dtype);
                    break;
                /* Both INDEXED and HINDEXED types require PVFS_size
                 * address arrays.  Therefore, we need to copy and
                 * convert the data from MPI_get_contents() into
                 * a PVFS_size buffer */
                case MPI_COMBINER_INDEXED:
                    if ((pvfs_arr_disp =
                         ADIOI_Malloc(arr_int[0]*sizeof(PVFS_size))) == NULL)
                    {
                        fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                                "Failed to allocate pvfs_arr_disp\n");
                        break;
                    }
                    /* displacements follow the count and the blocklengths */
                    for (i = 0; i < arr_int[0]; i++)
                    {
                        pvfs_arr_disp[i] =
                            (PVFS_size) arr_int[arr_int[0]+1+i];
                    }
                    ret = PVFS_Request_indexed(arr_int[0], &arr_int[1],
                                               pvfs_arr_disp,
                                               *old_pvfs_dtype, pvfs_dtype);
                    ADIOI_Free(pvfs_arr_disp);
                    break;
                case MPI_COMBINER_HINDEXED:
                    if ((pvfs_arr_disp =
                         ADIOI_Malloc(arr_int[0]*sizeof(PVFS_size))) == NULL)
                    {
                        fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                                "Failed to allocate pvfs_arr_disp\n");
                        break;
                    }
                    for (i = 0; i < arr_int[0]; i++)
                    {
                        pvfs_arr_disp[i] =
                            (PVFS_size) arr_addr[i];
                    }
                    /* pass the converted PVFS_size array, not a cast of
                     * the MPI_Aint array */
                    ret = PVFS_Request_hindexed(arr_int[0], &arr_int[1],
                                                pvfs_arr_disp,
                                                *old_pvfs_dtype, pvfs_dtype);
                    ADIOI_Free(pvfs_arr_disp);
                    break;
                case MPI_COMBINER_DUP:
                    ret = PVFS_Request_contiguous(1,
                                                  *old_pvfs_dtype,
                                                  pvfs_dtype);
                    break;
                default:
                    break;
            }
        }

        if (ret != 0)
            fprintf(stderr, "Error in PVFS_Request_* "
                    "for a derived datatype\n");

#ifdef DEBUG_DTYPE
        print_dtype_info(combiner,
                         num_int,
                         num_addr,
                         num_dtype,
                         arr_int,
                         arr_addr,
                         arr_dtype);
#endif

        /* leaf == 1 means the old type is predefined and must not be freed */
        if (leaf != 1 && combiner != MPI_COMBINER_DUP)
            MPI_Type_free(&arr_dtype[0]);

        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        ADIOI_Free(arr_dtype);

        if (leaf >= 0)
            PVFS_Request_free(old_pvfs_dtype);
        ADIOI_Free(old_pvfs_dtype);

        return ret;
    }
    else /* MPI_COMBINER_STRUCT */
    {
        MPI_Aint mpi_lb = -1, mpi_extent = -1;
        PVFS_offset pvfs_lb = -1;
        PVFS_size pvfs_extent = -1;
        int has_lb_ub = 0;
        int failed = 0;
        int j;

        /* When converting into a PVFS_Request_struct, we no longer
         * can use MPI_LB and MPI_UB.  Therfore, we have to do the
         * following.
         * We simply ignore all the MPI_LB and MPI_UB types and
         * get the lb and extent and pass it on through a
         * PVFS resized_req */

        arr_count = 0;
        for (i = 0; i < arr_int[0]; i++)
        {
            if (arr_dtype[i] != MPI_LB &&
                arr_dtype[i] != MPI_UB)
            {
                arr_count++;
            }
        }

        if (arr_int[0] != arr_count)
        {
            MPI_Type_get_extent(*mpi_dtype, &mpi_lb, &mpi_extent);
            pvfs_lb = mpi_lb;
            pvfs_extent = mpi_extent;
            if ((pvfs_arr_len = ADIOI_Malloc(arr_count*sizeof(int)))
                == NULL)
            {
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "Failed to allocate pvfs_arr_len\n");
                failed = 1;
            }
            has_lb_ub = 1;
        }

        if ((old_pvfs_dtype_arr
             = ADIOI_Malloc(arr_count*sizeof(PVFS_Request))) == NULL)
        {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate PVFS_Requests\n");
            failed = 1;
        }

        if ((pvfs_arr_disp = ADIOI_Malloc(arr_count*sizeof(PVFS_size)))
            == NULL)
        {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate pvfs_arr_disp\n");
            failed = 1;
        }

        if (failed)
        {
            /* nothing converted yet: release the member types and bail */
            for (i = 0; i < num_dtype; i++)
                pvfs2_release_contents_dtype(&arr_dtype[i]);
            arr_count = 0;
        }

        /* Convert every real member, compacting out the LB/UB markers */
        if (!failed)
        {
            arr_count = 0;
            for (i = 0; i < arr_int[0]; i++)
            {
                if (arr_dtype[i] == MPI_LB ||
                    arr_dtype[i] == MPI_UB)
                    continue;

                leaf = convert_mpi_pvfs2_dtype(
                    &arr_dtype[i], &old_pvfs_dtype_arr[arr_count]);
                if (leaf != 1)
                    MPI_Type_free(&arr_dtype[i]);
                if (leaf < 0)
                {
                    /* this slot holds no request; drop the member types
                     * that were not visited yet and stop */
                    for (j = i + 1; j < num_dtype; j++)
                        pvfs2_release_contents_dtype(&arr_dtype[j]);
                    failed = 1;
                    break;
                }
                pvfs_arr_disp[arr_count] =
                    (PVFS_size) arr_addr[i];
                if (has_lb_ub)
                {
                    pvfs_arr_len[arr_count] =
                        arr_int[i+1];
                }
                arr_count++;
            }
        }

        if (failed)
        {
            fprintf(stderr, "Error in PVFS_Request_* "
                    "for a derived datatype\n");
            /* entries [0, arr_count) are the requests built so far */
            for (i = 0; i < arr_count; i++)
                PVFS_Request_free(&old_pvfs_dtype_arr[i]);
            ret = -1;
        }
        /* If a MPI_UB or MPI_LB did exist, we have to
         * resize the datatype */
        else if (has_lb_ub)
        {
            PVFS_Request *tmp_pvfs_dtype = NULL;

            if ((tmp_pvfs_dtype = ADIOI_Malloc(sizeof(PVFS_Request))) == NULL)
            {
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "Failed to allocate PVFS_Request\n");
                ret = -1;
            }
            else
            {
                ret = PVFS_Request_struct(arr_count, pvfs_arr_len,
                                          pvfs_arr_disp,
                                          old_pvfs_dtype_arr, tmp_pvfs_dtype);
                if (ret != 0)
                    fprintf(stderr, "Error in PVFS_Request_struct\n");
            }

            for (i = 0; i < arr_count; i++)
                PVFS_Request_free(&old_pvfs_dtype_arr[i]);

#ifdef DEBUG_DTYPE
            fprintf(stderr, "STRUCT(WITHOUT %d LB or UB)(%d,[",
                    arr_int[0] - arr_count, arr_count);
            for (i = 0; i < arr_count; i++)
                fprintf(stderr, "(%d,%lld) ",
                        pvfs_arr_len[i],
                        (long long) pvfs_arr_disp[i]);
            fprintf(stderr, "]\n");
            fprintf(stderr, "RESIZED(LB = %lld, EXTENT = %lld)\n",
                    (long long) pvfs_lb, (long long) pvfs_extent);
#endif

            if (tmp_pvfs_dtype != NULL)
            {
                /* resize only a struct request that was actually built */
                if (ret == 0)
                {
                    ret = PVFS_Request_resized(*tmp_pvfs_dtype,
                                               pvfs_lb, pvfs_extent,
                                               pvfs_dtype);
                    if (ret != 0)
                        fprintf(stderr, "Error in PVFS_Request_resize\n");

                    PVFS_Request_free(tmp_pvfs_dtype);
                }
                ADIOI_Free(tmp_pvfs_dtype);
            }
        }
        else /* No MPI_LB or MPI_UB datatypes */
        {
            ret = PVFS_Request_struct(arr_int[0], &arr_int[1],
                                      pvfs_arr_disp,
                                      old_pvfs_dtype_arr, pvfs_dtype);
            if (ret != 0)
                fprintf(stderr, "Error in PVFS_Request_struct\n");

            for (i = 0; i < arr_count; i++)
                PVFS_Request_free(&old_pvfs_dtype_arr[i]);

#ifdef DEBUG_DTYPE
            print_dtype_info(combiner,
                             num_int,
                             num_addr,
                             num_dtype,
                             arr_int,
                             arr_addr,
                             arr_dtype);
#endif
        }

        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        ADIOI_Free(arr_dtype);

        /* The work arrays may be NULL (pvfs_arr_len always is when the
         * struct has no LB/UB markers); guard each free in case the
         * allocator wrapper does not accept NULL -- TODO confirm. */
        if (old_pvfs_dtype_arr != NULL)
            ADIOI_Free(old_pvfs_dtype_arr);
        if (pvfs_arr_disp != NULL)
            ADIOI_Free(pvfs_arr_disp);
        if (pvfs_arr_len != NULL)
            ADIOI_Free(pvfs_arr_len);

        return ret;
    }
}
/*
 * convert_named - build the PVFS request that corresponds to a predefined
 * (named) MPI datatype.
 *
 * mpi_dtype  - in: the predefined MPI datatype to translate
 * pvfs_dtype - out: receives a one-element contiguous PVFS request
 * combiner   - not used by this function; kept so existing callers compile
 *
 * Returns the result of PVFS_Request_contiguous() (0 is treated as success)
 * or -1 when the MPI datatype has no mapping in the switch below.
 */
int convert_named(MPI_Datatype *mpi_dtype,
                  PVFS_Request *pvfs_dtype, int combiner)
{
    int ret = -1;
#ifdef DEBUG_DTYPE
    fprintf(stderr, "NAMED");
#endif
    switch (*mpi_dtype)
    {
        case MPI_CHAR:
            ret = PVFS_Request_contiguous(1, PVFS_CHAR, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_CHAR\n");
#endif
            break;
        case MPI_BYTE:
            ret = PVFS_Request_contiguous(1, PVFS_BYTE, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_BYTE\n");
#endif
            break;
        case MPI_SHORT:
            ret = PVFS_Request_contiguous(1, PVFS_SHORT, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_SHORT\n");
#endif
            break;
        case MPI_INT:
            ret = PVFS_Request_contiguous(1, PVFS_INT, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_INT\n");
#endif
            break;
        case MPI_LONG:
            ret = PVFS_Request_contiguous(1, PVFS_LONG, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_LONG\n");
#endif
            break;
        case MPI_FLOAT:
            ret = PVFS_Request_contiguous(1, PVFS_FLOAT, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_FLOAT\n");
#endif
            break;
        case MPI_DOUBLE:
            ret = PVFS_Request_contiguous(1, PVFS_DOUBLE, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_DOUBLE\n");
#endif
            break;
        case MPI_UNSIGNED_CHAR:
            ret = PVFS_Request_contiguous(1, PVFS_UNSIGNED_CHAR, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_UNSIGNED_CHAR\n");
#endif
            break;
        case MPI_UNSIGNED_SHORT:
            /* Must be the short-sized PVFS type: mapping this to
             * PVFS_UNSIGNED would describe an element of the wrong size. */
            ret = PVFS_Request_contiguous(1, PVFS_UNSIGNED_SHORT, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_UNSIGNED_SHORT\n");
#endif
            break;
        case MPI_UNSIGNED:
            ret = PVFS_Request_contiguous(1, PVFS_UNSIGNED, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_UNSIGNED\n");
#endif
            break;
        case MPI_UNSIGNED_LONG:
            ret = PVFS_Request_contiguous(1, PVFS_UNSIGNED_LONG, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_UNSIGNED_LONG\n");
#endif
            break;
        case MPI_LONG_DOUBLE:
            ret = PVFS_Request_contiguous(1, PVFS_LONG_DOUBLE, pvfs_dtype);
#ifdef DEBUG_DTYPE
            fprintf(stderr, "-MPI_LONG_DOUBLE\n");
#endif
            break;
        default:
            /* No PVFS equivalent is known for this predefined type. */
            fprintf(stderr, "convert_named: predefined type not found");
            return -1;
    }
    if (ret != 0)
        fprintf(stderr, "convert_named: Datatype creation failed\n");
    return ret;
}
/*
 * print_dtype_info - debugging aid that writes a one-line description of an
 * MPI datatype's envelope contents to stderr.
 *
 * combiner  - MPI_COMBINER_* value selecting which layout to print
 * arr_int   - integer arguments of the datatype (counts, block lengths, ...)
 * arr_addr  - address arguments of the datatype (byte displacements)
 * num_int, num_addr, num_dtype, arr_dtype - accepted for symmetry with the
 *             datatype envelope but not read by this function
 *
 * 64-bit quantities are cast to long long and printed with %lld so the
 * conversion specifier always matches the argument type.
 */
void print_dtype_info(int combiner,
                      int num_int,
                      int num_addr,
                      int num_dtype,
                      int *arr_int,
                      MPI_Aint *arr_addr,
                      MPI_Datatype *arr_dtype)
{
    int i = -1;
    switch (combiner)
    {
        case MPI_COMBINER_CONTIGUOUS:
            fprintf(stderr, "CONTIG(%d)\n", arr_int[0]);
            break;
        case MPI_COMBINER_VECTOR:
            fprintf(stderr, "VECTOR(%d,%d,%d)\n",
                    arr_int[0], arr_int[1], arr_int[2]);
            break;
        case MPI_COMBINER_HVECTOR:
            /* The stride is an MPI_Aint, which need not be int-sized. */
            fprintf(stderr, "HVECTOR(%d,%d,%lld)\n",
                    arr_int[0], arr_int[1], (long long) arr_addr[0]);
            break;
        case MPI_COMBINER_INDEXED:
            fprintf(stderr, "INDEXED(%d,[",
                    arr_int[0]);
            /* Block lengths start at arr_int[1]; the displacements follow
             * them starting at arr_int[arr_int[0] + 1]. */
            for (i = 0; i < arr_int[0]; i++)
                fprintf(stderr, "(%d,%lld) ",
                        arr_int[1+i],
                        (long long) arr_int[arr_int[0]+1+i]);
            fprintf(stderr, "]\n");
            break;
        case MPI_COMBINER_HINDEXED:
            fprintf(stderr, "HINDEXED(%d,[",
                    arr_int[0]);
            for (i = 0; i < arr_int[0]; i++)
                fprintf(stderr, "(%d,%lld) ",
                        arr_int[1+i],
                        (long long) arr_addr[i]);
            fprintf(stderr, "]\n");
            break;
        case MPI_COMBINER_STRUCT:
            fprintf(stderr, "STRUCT(%d,[",
                    arr_int[0]);
            for (i = 0; i < arr_int[0]; i++)
                fprintf(stderr, "(%d,%lld) ",
                        arr_int[1+i],
                        (long long) arr_addr[i]);
            fprintf(stderr, "]\n");
            break;
        case MPI_COMBINER_DUP:
            fprintf(stderr, "DUP\n");
            break;
        default:
            fprintf(stderr, "no available information on this datatype");
    }
}

Просмотреть файл

@ -0,0 +1,665 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2006 Unknown (TODO: fix this)
*/
#include <assert.h>
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_io.h"
#include "ad_pvfs2_common.h"
#define COALESCE_REGIONS /* TODO: would we ever want to *not* coalesce? */
#define MAX_OL_COUNT 64
/*
 * ADIOI_PVFS2_StridedListIO - noncontiguous read/write through PVFS2 list I/O.
 *
 * The memory datatype and the file view are each reduced to a flattened
 * list of (offset, length) blocks.  gen_listio_arr() then walks both lists
 * in lock step, emitting at most MAX_OL_COUNT offset-length pairs per side
 * per pass; every pass is issued as one PVFS_sys_read() or PVFS_sys_write().
 *
 * fd            - open file; fd->filetype, fd->disp, fd->fp_ind,
 *                 fd->etype_size, fd->atomicity and fd->fs_ptr are read
 * buf           - user buffer; memory offsets are relative to it
 * count         - number of `datatype` elements to transfer
 * datatype      - memory datatype
 * file_ptr_type - ADIO_INDIVIDUAL starts from fd->fp_ind; any other value
 *                 positions the access from `offset`
 * offset        - explicit offset, counted in etypes
 * status        - filled in via MPIR_Status_set_bytes() when
 *                 HAVE_STATUS_SET_BYTES is defined
 * error_code    - out: MPI_SUCCESS or an MPIO error code
 * rw_type       - READ selects PVFS_sys_read(); any other value writes
 *
 * Returns -1 when atomic mode is set or the filetype has size zero, and 0
 * otherwise -- including when a PVFS call fails, so callers must inspect
 * *error_code to detect failures.
 */
int ADIOI_PVFS2_StridedListIO(ADIO_File fd, void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Status *status,
                              int *error_code, int rw_type)
{
    /* list I/O parameters */
    int i = -1, ret = -1;
    int tmp_filetype_size = -1;
    int64_t cur_io_size = 0, io_size = 0;
    int etype_size = -1;
    int num_etypes_in_filetype = -1, num_filetypes = -1;
    int etypes_in_filetype = -1, size_in_filetype = -1;
    int bytes_into_filetype = 0;
    MPI_Offset total_bytes_accessed = 0;

    /* parameters for offset-length pairs arrays */
    int64_t buf_off_arr[MAX_OL_COUNT];
    int32_t buf_len_arr[MAX_OL_COUNT];
    int64_t file_off_arr[MAX_OL_COUNT];
    int32_t file_len_arr[MAX_OL_COUNT];
    int32_t buf_ol_count = 0;
    int32_t file_ol_count = 0;

    /* parameters for flattened memory and file datatypes*/
    int flat_buf_index = 0;
    int flat_file_index = 0;
    int64_t cur_flat_buf_reg_off = 0;
    int64_t cur_flat_file_reg_off = 0;
    ADIOI_Flatlist_node *flat_buf_p, *flat_file_p;
    int buftype_size = -1, filetype_size = -1;
    /* MPI_Type_extent() stores an MPI_Aint; handing it the address of an
     * int would overwrite neighbouring stack where MPI_Aint is wider. */
    MPI_Aint buftype_extent = -1, filetype_extent = -1;
    int buftype_is_contig = -1, filetype_is_contig = -1;

    /* PVFS2 specific parameters */
    PVFS_Request mem_req, file_req;
    ADIOI_PVFS2_fs * pvfs_fs;
    PVFS_sysresp_io resp_io;
    static char myname[] = "ADIOI_PVFS2_STRIDED_LISTIO";

    if (fd->atomicity) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_ARG,
                                           "Atomic noncontiguous writes"
                                           " are not supported by PVFS2", 0);
        return -1;
    }

    MPI_Type_size(fd->filetype, &filetype_size);
    if (filetype_size == 0) {
        *error_code = MPI_SUCCESS;
        return -1;
    }
    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    /* Widen before multiplying so large transfers do not overflow int. */
    io_size = (int64_t) buftype_size * count;

    pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;

    /* Flatten the memory datatype
     * (file datatype has already been flattened in ADIO open
     * unless it is contiguous, then we need to flatten it manually)
     * and set the correct buffers for flat_buf and flat_file */
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
    if (buftype_is_contig == 0)
    {
        ADIOI_Flatten_datatype(datatype);
        flat_buf_p = ADIOI_Flatlist;
        while (flat_buf_p->type != datatype)
            flat_buf_p = flat_buf_p->next;
    }
    else
    {
        /* Build a private one-block flattened node; it is not linked into
         * ADIOI_Flatlist and is freed before returning. */
        flat_buf_p = (ADIOI_Flatlist_node *) ADIOI_Malloc
            (sizeof(ADIOI_Flatlist_node));
        flat_buf_p->blocklens = (ADIO_Offset*)ADIOI_Malloc(sizeof(ADIO_Offset));
        flat_buf_p->indices =
            (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset));
        /* For the buffer, we can optimize the buftype, this is not
         * possible with the filetype since it is tiled */
        buftype_size = buftype_size * count;
        /* The collapsed buffer is one block, so its extent equals its
         * size; multiplying by count a second time would be wrong. */
        buftype_extent = buftype_size;
        flat_buf_p->blocklens[0] = buftype_size;
        flat_buf_p->indices[0] = 0;
        flat_buf_p->count = 1;
    }
    if (filetype_is_contig == 0)
    {
        /* TODO: why does avery say this should already have been
         * flattened in Open, but also says contig types don't get
         * flattened */
        ADIOI_Flatten_datatype(fd->filetype);
        flat_file_p = ADIOI_Flatlist;
        while (flat_file_p->type != fd->filetype)
            flat_file_p = flat_file_p->next;
    }
    else
    {
        /* Private one-block node for a contiguous filetype. */
        flat_file_p = (ADIOI_Flatlist_node *) ADIOI_Malloc
            (sizeof(ADIOI_Flatlist_node));
        flat_file_p->blocklens =(ADIO_Offset*)ADIOI_Malloc(sizeof(ADIO_Offset));
        flat_file_p->indices =
            (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset));
        flat_file_p->blocklens[0] = filetype_size;
        flat_file_p->indices[0] = 0;
        flat_file_p->count = 1;
    }

    /* Find out where we are in the flattened filetype (the block index,
     * how far into the block, and how many bytes_into_filetype)
     * If the file_ptr_type == ADIO_INDIVIDUAL we will use disp, fp_ind
     * to figure this out (offset should always be zero)
     * If file_ptr_type == ADIO_EXPLICIT, we will use disp and offset
     * to figure this out. */
    etype_size = fd->etype_size;
    num_etypes_in_filetype = filetype_size / etype_size;
    if (file_ptr_type == ADIO_INDIVIDUAL)
    {
        int flag = 0;
        /* Should have already been flattened in ADIO_Open*/
        num_filetypes = -1;
        while (!flag)
        {
            num_filetypes++;
            for (i = 0; i < flat_file_p->count; i++)
            {
                /* Start on a non zero-length region */
                if (flat_file_p->blocklens[i])
                {
                    if (fd->disp + flat_file_p->indices[i] +
                        (num_filetypes * filetype_extent) +
                        flat_file_p->blocklens[i] > fd->fp_ind &&
                        fd->disp + flat_file_p->indices[i] <=
                        fd->fp_ind)
                    {
                        flat_file_index = i;
                        cur_flat_file_reg_off = fd->fp_ind -
                            (fd->disp + flat_file_p->indices[i] +
                             (num_filetypes * filetype_extent));
                        flag = 1;
                        break;
                    }
                    else
                        bytes_into_filetype += flat_file_p->blocklens[i];
                }
            }
        }
        /* Impossible that we don't find it in this datatype */
        assert(i != flat_file_p->count);
    }
    else
    {
        num_filetypes = (int) (offset / num_etypes_in_filetype);
        etypes_in_filetype = (int) (offset % num_etypes_in_filetype);
        size_in_filetype = etypes_in_filetype * etype_size;

        tmp_filetype_size = 0;
        for (i=0; i<flat_file_p->count; i++) {
            tmp_filetype_size += flat_file_p->blocklens[i];
            if (tmp_filetype_size > size_in_filetype)
            {
                flat_file_index = i;
                cur_flat_file_reg_off = flat_file_p->blocklens[i] -
                    (tmp_filetype_size - size_in_filetype);
                /* NOTE(review): this expression is kept exactly as it was;
                 * confirm it yields the intended byte position. */
                bytes_into_filetype = offset * filetype_size -
                    flat_file_p->blocklens[i];
                break;
            }
        }
    }
#ifdef DEBUG_LIST
    fprintf(stderr, "ADIOI_PVFS2_StridedListIO: (fd->fp_ind=%lld,fd->disp=%lld,"
            " offset=%lld)\n(flat_file_index=%d,cur_flat_file_reg_off=%lld,"
            "bytes_into_filetype=%d)\n",
            (long long) fd->fp_ind, (long long) fd->disp,
            (long long) offset, flat_file_index,
            (long long) cur_flat_file_reg_off, bytes_into_filetype);
#endif
#ifdef DEBUG_LIST2
    fprintf(stderr, "flat_buf:\n");
    for (i = 0; i < flat_buf_p->count; i++)
        fprintf(stderr, "(offset, length) = (%lld, %lld)\n",
                (long long) flat_buf_p->indices[i],
                (long long) flat_buf_p->blocklens[i]);
    fprintf(stderr, "flat_file:\n");
    for (i = 0; i < flat_file_p->count; i++)
        fprintf(stderr, "(offset, length) = (%lld, %lld)\n",
                (long long) flat_file_p->indices[i],
                (long long) flat_file_p->blocklens[i]);
#endif

    /* total data written */
    cur_io_size = 0;
    while (cur_io_size != io_size)
    {
        /* Initialize the temporarily unrolling lists and
         * and associated variables */
        buf_ol_count = 0;
        file_ol_count = 0;
        for (i = 0; i < MAX_OL_COUNT; i++)
        {
            buf_off_arr[i] = 0;
            buf_len_arr[i] = 0;
            file_off_arr[i] = 0;
            file_len_arr[i] = 0;
        }

        /* Generate the offset-length pairs for a
         * list I/O operation */
        gen_listio_arr(flat_buf_p,
                       &flat_buf_index,
                       &cur_flat_buf_reg_off,
                       buftype_size,
                       (int) buftype_extent,
                       flat_file_p,
                       &flat_file_index,
                       &cur_flat_file_reg_off,
                       filetype_size,
                       (int) filetype_extent,
                       MAX_OL_COUNT,
                       fd->disp,
                       bytes_into_filetype,
                       &cur_io_size,
                       io_size,
                       buf_off_arr,
                       buf_len_arr,
                       &buf_ol_count,
                       file_off_arr,
                       file_len_arr,
                       &file_ol_count);

        assert(buf_ol_count <= MAX_OL_COUNT);
        assert(file_ol_count <= MAX_OL_COUNT);
#ifdef DEBUG_LIST2
        print_buf_file_ol_pairs(buf_off_arr,
                                buf_len_arr,
                                buf_ol_count,
                                file_off_arr,
                                file_len_arr,
                                file_ol_count,
                                buf,
                                rw_type);
#endif
#ifdef DEBUG_LIST2
        do {
            int y, z;
            fprintf(stderr, "ad_pvfs2_io_list.c::\n");
            for (y = 0; y < buf_ol_count; y++)
            {
                for (z = 0; z < buf_len_arr[y]; z++)
                {
                    fprintf(stderr, "buf[%d][%d]=%c\n",
                            y, z, ((char *) buf + buf_off_arr[y])[z]);
                }
            }
        } while (0);
#endif

        /* Run list I/O operation.  Each request is checked on its own so
         * that a failed build is never handed to PVFS_sys_read/write. */
        ret = PVFS_Request_hindexed(buf_ol_count, buf_len_arr,
                                    buf_off_arr, PVFS_BYTE, &mem_req);
        if (ret != 0)
        {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               ADIOI_PVFS2_error_convert(ret),
                                               "Error in PVFS_Request_hindexed (memory)", 0);
            goto error_state;
        }

        ret = PVFS_Request_hindexed(file_ol_count, file_len_arr,
                                    file_off_arr, PVFS_BYTE, &file_req);
        if (ret != 0)
        {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               ADIOI_PVFS2_error_convert(ret),
                                               "Error in PVFS_Request_hindexed (file)", 0);
            /* Only the memory request exists at this point. */
            PVFS_Request_free(&mem_req);
            goto error_state;
        }

        if (rw_type == READ)
        {
            ret = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
                                buf, mem_req,
                                &(pvfs_fs->credentials), &resp_io);
        }
        else
        {
            ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
                                 buf, mem_req,
                                 &(pvfs_fs->credentials), &resp_io);
        }
        if (ret != 0)
        {
            fprintf(stderr, "ADIOI_PVFS2_StridedListIO: Warning - PVFS_sys_"
                    "read/write returned %d and completed %lld bytes.\n",
                    ret, (long long) resp_io.total_completed);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               ADIOI_PVFS2_error_convert(ret),
                                               "Error in PVFS_sys_io \n", 0);
            PVFS_Request_free(&mem_req);
            PVFS_Request_free(&file_req);
            goto error_state;
        }
        total_bytes_accessed += resp_io.total_completed;

        PVFS_Request_free(&mem_req);
        PVFS_Request_free(&file_req);
    }
#ifdef DEBUG_LIST
    fprintf(stderr, "ADIOI_PVFS2_StridedListIO: "
            "total_bytes_accessed=%lld,ret=%d\n",
            (long long) total_bytes_accessed, ret);
#endif

    /* The individual file pointer only advances when every pass succeeded;
     * the error paths jump past this update. */
    if (file_ptr_type == ADIO_INDIVIDUAL)
        fd->fp_ind += total_bytes_accessed;
    *error_code = MPI_SUCCESS;

error_state:
#ifdef HAVE_STATUS_SET_BYTES
    /* TODO: why the cast? */
    MPIR_Status_set_bytes(status, datatype, (int)total_bytes_accessed);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif
    /* Release whichever flattened representation was set up above. */
    if (buftype_is_contig == 0)
        ADIOI_Delete_flattened(datatype);
    else
    {
        ADIOI_Free(flat_buf_p->blocklens);
        ADIOI_Free(flat_buf_p->indices);
        ADIOI_Free(flat_buf_p);
    }

    if (filetype_is_contig == 0)
        ADIOI_Delete_flattened(fd->filetype);
    else
    {
        ADIOI_Free(flat_file_p->blocklens);
        ADIOI_Free(flat_file_p->indices);
        ADIOI_Free(flat_file_p);
    }

    return 0;
}
/* To do: Fix the code to coalesce the offset-length pairs for memory
* and file. */
/* gen_listio_arr - fills in offset-length pairs for memory and file
* for list I/O
*
* Walks the flattened memory list (flat_buf_p) and the flattened file list
* (flat_file_p) together, cutting the transfer into regions that are
* contiguous on both sides, and appends them to the caller's arrays.
*
* The position in each list is passed by pointer (*flat_buf_index_p,
* *cur_flat_buf_reg_off_p, *flat_file_index_p, *cur_flat_file_reg_off_p)
* and is updated in place, as is *bytes_completed, so a later call
* continues from where this one stopped.
*
* flat_buf_size / flat_buf_extent   - size and extent of one memory datatype
* flat_file_size / flat_file_extent - size and extent of one filetype
* max_ol_count         - capacity of each offset/length array
* disp                 - displacement added to every file offset
* bytes_into_filetype  - starting byte position used when counting how many
*                        whole filetypes precede a file offset
* total_io_size        - total number of bytes of the whole operation
* buf_off_arr/buf_len_arr/buf_ol_count_p    - out: memory pairs and count
* file_off_arr/file_len_arr/file_ol_count_p - out: file pairs and count
*
* Both counts must be zero on entry; otherwise -1 is returned and nothing
* is generated. Returns 0 in every other case. */
int gen_listio_arr(ADIOI_Flatlist_node *flat_buf_p,
int *flat_buf_index_p,
int64_t *cur_flat_buf_reg_off_p,
int flat_buf_size,
int flat_buf_extent,
ADIOI_Flatlist_node *flat_file_p,
int *flat_file_index_p,
int64_t *cur_flat_file_reg_off_p,
int flat_file_size,
int flat_file_extent,
int max_ol_count,
ADIO_Offset disp,
int bytes_into_filetype,
int64_t *bytes_completed,
int64_t total_io_size,
int64_t buf_off_arr[],
int32_t buf_len_arr[],
int32_t *buf_ol_count_p,
int64_t file_off_arr[],
int32_t file_len_arr[],
int32_t *file_ol_count_p)
{
/* Size of the piece handled by one loop iteration. */
int region_size = -1;
/* parameters for flattened memory and file datatypes*/
int64_t cur_flat_buf_reg_left = 0;
int64_t cur_flat_file_reg_left = 0;
#ifdef DEBUG_LIST2
fprintf(stderr, "gen_list_arr:\n");
#endif
/* The output arrays are filled from index 0, so refuse non-empty ones. */
if ((*buf_ol_count_p) != 0 ||(*file_ol_count_p) != 0)
{
fprintf(stderr, "buf_ol_count != 0 || file_ol_count != 0\n");
return -1;
}
/* Start on a non-zero memory and file region
* Note this does not affect the bytes_completed
* since no data is in these regions. Initialize the
* first memory and file offsets. */
/* The index wraps around modulo count; the loop only ends once a block
* with non-zero length is found. */
while (flat_buf_p->blocklens[(*flat_buf_index_p)] == 0)
{
(*flat_buf_index_p) = ((*flat_buf_index_p) + 1) %
flat_buf_p->count;
}
/* Memory offset = whole datatypes already consumed * extent
* + start of the current block + position inside that block. */
buf_off_arr[*buf_ol_count_p] =
(*bytes_completed / flat_buf_size) *
flat_buf_extent +
flat_buf_p->indices[*flat_buf_index_p] +
*cur_flat_buf_reg_off_p;
buf_len_arr[*buf_ol_count_p] = 0;
while (flat_file_p->blocklens[(*flat_file_index_p)] == 0)
{
(*flat_file_index_p) = ((*flat_file_index_p) + 1) %
flat_file_p->count;
}
/* File offset is built the same way, plus disp, with
* bytes_into_filetype added before counting whole filetypes. */
file_off_arr[*file_ol_count_p] = disp +
(((bytes_into_filetype + *bytes_completed) / flat_file_size) *
flat_file_extent) +
flat_file_p->indices[*flat_file_index_p] +
*cur_flat_file_reg_off_p;
file_len_arr[*file_ol_count_p] = 0;
#ifdef DEBUG_LIST2
fprintf(stderr, "initial buf_off_arr[%d] = %Ld\n", *buf_ol_count_p,
buf_off_arr[*buf_ol_count_p]);
fprintf(stderr, "initial file_off_arr[%d] = %Ld\n", *file_ol_count_p,
file_off_arr[*file_ol_count_p]);
#endif
/* Keep going until the whole transfer is described or either output
* array is full. */
while (*bytes_completed != total_io_size
&& (*buf_ol_count_p) < max_ol_count
&& (*file_ol_count_p) < max_ol_count)
{
/* How much data is left in the current piece in
* the flattened datatypes */
cur_flat_buf_reg_left = flat_buf_p->blocklens[*flat_buf_index_p]
- *cur_flat_buf_reg_off_p;
cur_flat_file_reg_left = flat_file_p->blocklens[*flat_file_index_p]
- *cur_flat_file_reg_off_p;
#ifdef DEBUG_LIST2
fprintf(stderr,
"flat_buf_index=%d flat_buf->blocklens[%d]=%d\n"
"cur_flat_buf_reg_left=%Ld "
"*cur_flat_buf_reg_off_p=%Ld\n"
"flat_file_index=%d flat_file->blocklens[%d]=%d\n"
"cur_flat_file_reg_left=%Ld "
"*cur_flat_file_reg_off_p=%Ld\n"
"bytes_completed=%Ld\n"
"buf_ol_count=%d file_ol_count=%d\n"
"buf_len_arr[%d]=%d file_len_arr[%d]=%d\n\n",
*flat_buf_index_p, *flat_buf_index_p,
flat_buf_p->blocklens[*flat_buf_index_p],
cur_flat_buf_reg_left,
*cur_flat_buf_reg_off_p,
*flat_file_index_p, *flat_file_index_p,
flat_file_p->blocklens[*flat_file_index_p],
cur_flat_file_reg_left,
*cur_flat_file_reg_off_p,
*bytes_completed,
*buf_ol_count_p, *file_ol_count_p,
*buf_ol_count_p,
buf_len_arr[*buf_ol_count_p],
*file_ol_count_p,
file_len_arr[*file_ol_count_p]);
#endif
/* What is the size of the next contiguous region agreed
* upon by both memory and file regions that does not
* surpass the file size */
/* i.e. the minimum of: bytes left in the memory block, bytes left in
* the file block, and bytes left in the whole transfer. */
if (cur_flat_buf_reg_left > cur_flat_file_reg_left)
region_size = cur_flat_file_reg_left;
else
region_size = cur_flat_buf_reg_left;
if (region_size > total_io_size - *bytes_completed)
region_size = total_io_size - *bytes_completed;
/* Add this piece to both the mem and file arrays
* coalescing offset-length pairs if possible and advance
* the pointers through the flatten mem and file datatypes
* as well Note: no more than a single piece can be done
* since we take the smallest one possible */
if (cur_flat_buf_reg_left == region_size)
{
#ifdef DEBUG_LIST2
fprintf(stderr, "reached end of memory block...\n");
#endif
/* The memory block is used up: move to the next non-empty block. */
(*flat_buf_index_p) = ((*flat_buf_index_p) + 1) %
flat_buf_p->count;
while (flat_buf_p->blocklens[(*flat_buf_index_p)] == 0)
{
(*flat_buf_index_p) = ((*flat_buf_index_p) + 1) %
flat_buf_p->count;
}
*cur_flat_buf_reg_off_p = 0;
#ifdef COALESCE_REGIONS
if (*buf_ol_count_p != 0)
{
/* If the previous pair ends exactly where the current one starts,
* grow the previous pair instead of emitting a new one. */
if (buf_off_arr[(*buf_ol_count_p) - 1] +
buf_len_arr[(*buf_ol_count_p) - 1] ==
buf_off_arr[*buf_ol_count_p])
{
buf_len_arr[(*buf_ol_count_p) - 1] +=
region_size;
}
else
{
buf_len_arr[*buf_ol_count_p] += region_size;
(*buf_ol_count_p)++;
}
}
else
{
#endif
/* First pair (or coalescing compiled out): close the current pair. */
buf_len_arr[*buf_ol_count_p] += region_size;
(*buf_ol_count_p)++;
#ifdef COALESCE_REGIONS
}
#endif
/* Don't prepare for the next piece if we have reached
* the limit or else it will segment fault. */
if ((*buf_ol_count_p) != max_ol_count)
{
/* *bytes_completed has not been advanced yet in this iteration,
* hence the explicit "+ region_size". */
buf_off_arr[*buf_ol_count_p] =
((*bytes_completed + region_size) / flat_buf_size) *
flat_buf_extent +
flat_buf_p->indices[*flat_buf_index_p] +
(*cur_flat_buf_reg_off_p);
buf_len_arr[*buf_ol_count_p] = 0;
}
}
else if (cur_flat_buf_reg_left > region_size)
{
#ifdef DEBUG_LIST2
fprintf(stderr, "advanced %d in memory block...\n",
region_size);
#endif
/* Still inside the same memory block: extend the open pair. */
(*cur_flat_buf_reg_off_p) += region_size;
buf_len_arr[*buf_ol_count_p] += region_size;
}
else
{
/* region_size exceeds what is left in the block; only reported. */
fprintf(stderr, "gen_listio_arr: Error\n");
}
/* To calculate the absolute file offset we need to
* add the disp, how many filetypes we have gone through,
* the relative block offset in the filetype and how far
* into the block we have gone. */
if (cur_flat_file_reg_left == region_size)
{
#ifdef DEBUG_LIST2
fprintf(stderr, "reached end of file block...\n");
#endif
/* The file block is used up: move to the next non-empty block. */
(*flat_file_index_p) = ((*flat_file_index_p) + 1) %
flat_file_p->count;
while (flat_file_p->blocklens[(*flat_file_index_p)] == 0)
{
(*flat_file_index_p) = ((*flat_file_index_p) + 1) %
flat_file_p->count;
}
(*cur_flat_file_reg_off_p) = 0;
#ifdef COALESCE_REGIONS
if (*file_ol_count_p != 0)
{
/* Same coalescing rule as on the memory side. */
if (file_off_arr[(*file_ol_count_p) - 1] +
file_len_arr[(*file_ol_count_p) - 1] ==
file_off_arr[*file_ol_count_p])
{
file_len_arr[(*file_ol_count_p) - 1] +=
region_size;
}
else
{
file_len_arr[*file_ol_count_p] += region_size;
(*file_ol_count_p)++;
}
}
else
{
#endif
file_len_arr[*file_ol_count_p] += region_size;
(*file_ol_count_p)++;
#ifdef COALESCE_REGIONS
}
#endif
/* Don't prepare for the next piece if we have reached
* the limit or else it will segment fault. */
if ((*file_ol_count_p) != max_ol_count)
{
file_off_arr[*file_ol_count_p] = disp +
(((bytes_into_filetype + *bytes_completed + region_size)
/ flat_file_size) *
flat_file_extent) +
flat_file_p->indices[*flat_file_index_p] +
(*cur_flat_file_reg_off_p);
file_len_arr[*file_ol_count_p] = 0;
}
}
else if (cur_flat_file_reg_left > region_size)
{
#ifdef DEBUG_LIST2
fprintf(stderr, "advanced %d in file block...\n",
region_size);
#endif
/* Still inside the same file block: extend the open pair. */
(*cur_flat_file_reg_off_p) += region_size;
file_len_arr[*file_ol_count_p] += region_size;
}
else
{
fprintf(stderr, "gen_listio_arr: Error\n");
}
#ifdef DEBUG_LIST2
fprintf(stderr,
"------------------------------\n\n");
#endif
/* Both sides have been advanced; now account for the bytes. */
*bytes_completed += region_size;
}
/* Increment the count if we stopped in the middle of a
* memory or file region */
/* A non-zero in-block offset means the pair at the current count is
* still open and holds data, so it has to be counted. */
if (*cur_flat_buf_reg_off_p != 0)
(*buf_ol_count_p)++;
if (*cur_flat_file_reg_off_p != 0)
(*file_ol_count_p)++;
return 0;
}
/*
 * print_buf_file_ol_pairs - debugging aid that dumps the memory and file
 * offset-length pair arrays to stderr.
 *
 * buf_off_arr / buf_len_arr   - memory offsets and lengths
 * buf_ol_count                - number of valid memory pairs
 * file_off_arr / file_len_arr - file offsets and lengths
 * file_ol_count               - number of valid file pairs
 * buf, rw_type                - accepted but not read by this function
 *
 * Offsets are cast to long long and printed with %lld so the conversion
 * specifier matches the argument on every platform.
 */
void print_buf_file_ol_pairs(int64_t buf_off_arr[],
                             int32_t buf_len_arr[],
                             int32_t buf_ol_count,
                             int64_t file_off_arr[],
                             int32_t file_len_arr[],
                             int32_t file_ol_count,
                             void *buf,
                             int rw_type)
{
    int i = -1;

    fprintf(stderr, "buf_ol_pairs(offset,length) count = %d\n",
            (int) buf_ol_count);
    for (i = 0; i < buf_ol_count; i++)
    {
        fprintf(stderr, "(%lld, %d) ",
                (long long) buf_off_arr[i], (int) buf_len_arr[i]);
    }
    fprintf(stderr, "\n");

    fprintf(stderr, "file_ol_pairs(offset,length) count = %d\n",
            (int) file_ol_count);
    for (i = 0; i < file_ol_count; i++)
    {
        fprintf(stderr, "(%lld, %d) ",
                (long long) file_off_arr[i], (int) file_len_arr[i]);
    }
    fprintf(stderr, "\n\n");
}

Просмотреть файл

@ -8,7 +8,7 @@
#include "adio.h" #include "adio.h"
#include "adio_extern.h" #include "adio_extern.h"
#include "ad_pvfs2.h" #include "ad_pvfs2.h"
#include "ad_pvfs2_io.h"
#include "ad_pvfs2_common.h" #include "ad_pvfs2_common.h"
void ADIOI_PVFS2_ReadContig(ADIO_File fd, void *buf, int count, void ADIOI_PVFS2_ReadContig(ADIO_File fd, void *buf, int count,
@ -92,898 +92,76 @@ fn_exit:
return; return;
} }
/* Read-direction front end for list I/O: passes every argument through to
 * ADIOI_PVFS2_StridedListIO() with the transfer direction fixed to READ and
 * hands back that call's return value unchanged. */
static int ADIOI_PVFS2_ReadStridedListIO(ADIO_File fd, void *buf, int count,
                                         MPI_Datatype datatype, int file_ptr_type,
                                         ADIO_Offset offset, ADIO_Status *status,
                                         int *error_code)
{
    int rc;

    rc = ADIOI_PVFS2_StridedListIO(fd, buf, count, datatype, file_ptr_type,
                                   offset, status, error_code, READ);
    return rc;
}
/* Read-direction front end for datatype I/O: passes every argument through
 * to ADIOI_PVFS2_StridedDtypeIO() with the transfer direction fixed to READ
 * and hands back that call's return value unchanged. */
static int ADIOI_PVFS2_ReadStridedDtypeIO(ADIO_File fd, void *buf, int count,
                                          MPI_Datatype datatype, int file_ptr_type,
                                          ADIO_Offset offset, ADIO_Status *status,
                                          int *error_code)
{
    int rc;

    rc = ADIOI_PVFS2_StridedDtypeIO(fd, buf, count, datatype, file_ptr_type,
                                    offset, status, error_code, READ);
    return rc;
}
void ADIOI_PVFS2_ReadStrided(ADIO_File fd, void *buf, int count, void ADIOI_PVFS2_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int ADIO_Offset offset, ADIO_Status *status, int
*error_code) *error_code)
{ {
/* offset is in units of etype relative to the filetype. */ /* four ways (to date) that we can carry out strided i/o accesses:
ADIOI_Flatlist_node *flat_buf, *flat_file; * - naive posix
int i, j, k, brd_size, frd_size=0, st_index=0; * - 'true' Datatype (from avery)
int bufsize, sum, n_etypes_in_filetype, size_in_filetype; * - new List I/O (from avery)
int n_filetypes, etype_in_filetype; * - classic List I/O (the one that's always been in ROMIO)
ADIO_Offset abs_off_in_filetype=0; * I imagine we'll keep Datatype as an optional optimization, and afer a
int filetype_size, etype_size, buftype_size; * release or two promote it to the default
MPI_Aint filetype_extent, buftype_extent; */
int buf_count, buftype_is_contig, filetype_is_contig; int ret = -1;
ADIO_Offset off, disp, start_off, initial_off;
int flag, st_frd_size, st_n_filetypes;
int mem_list_count, file_list_count; if (fd->hints->fs_hints.pvfs2.posix_read == ADIOI_HINT_ENABLE) {
PVFS_size *mem_offsets; ADIOI_GEN_ReadStrided(fd, buf, count, datatype,
int64_t *file_offsets; file_ptr_type, offset, status, error_code);
int *mem_lengths;
int32_t *file_lengths;
int total_blks_to_read;
int max_mem_list, max_file_list;
int b_blks_read;
int f_data_read;
int size_read=0, n_read_lists, extra_blks;
int end_brd_size, end_frd_size;
int start_k, start_j, new_file_read, new_buffer_read;
int start_mem_offset;
PVFS_Request mem_req, file_req;
ADIOI_PVFS2_fs * pvfs_fs;
PVFS_sysresp_io resp_io;
int err_flag=0;
MPI_Offset total_bytes_read = 0;
static char myname[] = "ADIOI_PVFS2_ReadStrided";
#define MAX_ARRAY_SIZE 64
*error_code = MPI_SUCCESS; /* changed below if error */
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
/* the HDF5 tests showed a bug in this list processing code (see many many
* lines down below). We added a workaround, but common HDF5 file types
* are actually contiguous and do not need the expensive workarond */
if (!filetype_is_contig) {
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
if (flat_file->count == 1 && !buftype_is_contig)
filetype_is_contig = 1;
}
MPI_Type_size(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
*error_code = MPI_SUCCESS;
return; return;
} }
if (fd->hints->fs_hints.pvfs2.dtype_read == ADIOI_HINT_ENABLE) {
ret = ADIOI_PVFS2_ReadStridedDtypeIO(fd, buf, count,
datatype, file_ptr_type,
offset, status, error_code);
MPI_Type_extent(fd->filetype, &filetype_extent); /* Fall back to list I/O if datatype I/O didn't work */
MPI_Type_size(datatype, &buftype_size); if (ret != 0)
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
bufsize = buftype_size * count;
pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;
if (!buftype_is_contig && filetype_is_contig) {
/* noncontiguous in memory, contiguous in file. */
int64_t file_offsets;
int32_t file_lengths;
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
file_list_count = 1;
file_offsets = off;
file_lengths = 0;
total_blks_to_read = count*flat_buf->count;
b_blks_read = 0;
/* allocate arrays according to max usage */
if (total_blks_to_read > MAX_ARRAY_SIZE)
mem_list_count = MAX_ARRAY_SIZE;
else mem_list_count = total_blks_to_read;
mem_offsets = (PVFS_size*)ADIOI_Malloc(mem_list_count*sizeof(PVFS_size));
mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));
/* TODO: CHECK RESULTS OF MEMORY ALLOCATION */
j = 0;
/* step through each block in memory, filling memory arrays */
while (b_blks_read < total_blks_to_read) {
for (i=0; i<flat_buf->count; i++) {
mem_offsets[b_blks_read % MAX_ARRAY_SIZE] =
/* TODO: fix this compiler warning */
((PVFS_size)buf + j*buftype_extent + flat_buf->indices[i]);
mem_lengths[b_blks_read % MAX_ARRAY_SIZE] =
flat_buf->blocklens[i];
file_lengths += flat_buf->blocklens[i];
b_blks_read++;
if (!(b_blks_read % MAX_ARRAY_SIZE) ||
(b_blks_read == total_blks_to_read)) {
/* in the case of the last read list call,
adjust mem_list_count */
if (b_blks_read == total_blks_to_read) {
mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE;
/* in case last read list call fills max arrays */
if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
}
err_flag = PVFS_Request_hindexed(mem_list_count,
mem_lengths, mem_offsets, PVFS_BYTE, &mem_req);
if (err_flag < 0) break;
err_flag = PVFS_Request_contiguous(file_lengths,
PVFS_BYTE, &file_req);
if (err_flag < 0) break;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req,
file_offsets, PVFS_BOTTOM, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
/* --END ERROR HANDLING-- */
/* in the case of error or the last read list call,
* leave here */
if (err_flag || b_blks_read == total_blks_to_read) break;
file_offsets += file_lengths;
file_lengths = 0;
}
} /* for (i=0; i<flat_buf->count; i++) */
j++;
} /* while (b_blks_read < total_blks_to_read) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
if (file_ptr_type == ADIO_INDIVIDUAL)
fd->fp_ind += total_bytes_read;
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This isa temporary way of filling in status. The right way is to
keep tracke of how much data was actually read adn placed in buf
by ADIOI_BUFFERED_READ. */
#endif
ADIOI_Delete_flattened(datatype);
return;
} /* if (!buftype_is_contig && filetype_is_contig) */
/* know file is noncontiguous from above */
/* noncontiguous in file */
/* filetype already flattened in ADIO_Open */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
initial_off = offset;
/* for each case - ADIO_Individual pointer or explicit, find the file
offset in bytes (offset), n_filetypes (how many filetypes into
file to start), frd_size (remaining amount of data in present
file block), and st_index (start point in terms of blocks in
starting filetype) */
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent +
flat_file->blocklens[i] >= offset) {
st_index = i;
frd_size = (int) (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
break;
}
}
} /* while (!flag) */
} /* if (file_ptr_type == ADIO_INDIVIDUAL) */
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
frd_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent +
abs_off_in_filetype;
} /* else [file_ptr_type != ADIO_INDIVIDUAL] */
start_off = offset;
st_frd_size = frd_size;
st_n_filetypes = n_filetypes;
if (buftype_is_contig && !filetype_is_contig) {
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
int mem_lengths;
char *mem_offsets;
i = 0;
j = st_index;
n_filetypes = st_n_filetypes;
mem_list_count = 1;
/* determine how many blocks in file to read */
f_data_read = ADIOI_MIN(st_frd_size, bufsize);
total_blks_to_read = 1;
if (j < (flat_file->count-1)) j++;
else {
j = 0;
n_filetypes++;
}
while (f_data_read < bufsize) {
f_data_read += flat_file->blocklens[j];
total_blks_to_read++;
if (j<(flat_file->count-1)) j++;
else j = 0;
}
j = st_index;
n_filetypes = st_n_filetypes;
n_read_lists = total_blks_to_read/MAX_ARRAY_SIZE;
extra_blks = total_blks_to_read%MAX_ARRAY_SIZE;
mem_offsets = buf;
mem_lengths = 0;
/* if at least one full readlist, allocate file arrays
at max array size and don't free until very end */
if (n_read_lists) {
file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int32_t));
}
/* if there's no full readlist allocate file arrays according
to needed size (extra_blks) */
else {
file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
sizeof(int32_t));
}
/* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
for (i=0; i<n_read_lists; i++) {
file_list_count = MAX_ARRAY_SIZE;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = st_frd_size;
mem_lengths = st_frd_size;
}
for (k=0; k<MAX_ARRAY_SIZE; k++) {
if (i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent
+ flat_file->indices[j];
file_lengths[k] = flat_file->blocklens[j];
mem_lengths += file_lengths[k];
}
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* PVFS_Request_hindexed already expresses the offsets into the
* file, so we should not pass in an offset if we are using
* hindexed for the file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
mem_offsets += mem_lengths;
mem_lengths = 0;
} /* for (i=0; i<n_read_lists; i++) */
/* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */
if (extra_blks) {
file_list_count = extra_blks;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = st_frd_size;
}
for (k=0; k<extra_blks; k++) {
if(i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent +
flat_file->indices[j];
if (k == (extra_blks - 1)) {
file_lengths[k] = bufsize - (int32_t) mem_lengths
- (int32_t) mem_offsets + (int32_t) buf;
}
else file_lengths[k] = flat_file->blocklens[j];
} /* if(i || k) */
mem_lengths += file_lengths[k];
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<extra_blks; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* as above, use 0 for 'offset' when using hindexed file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req, &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
}
}
else {
/* noncontiguous in memory as well as in file */
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
size_read = 0;
n_filetypes = st_n_filetypes;
frd_size = st_frd_size;
brd_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
max_mem_list = 0;
max_file_list = 0;
/* run through and find max_file_list and max_mem_list so that you
can allocate the file and memory arrays less than MAX_ARRAY_SIZE
if possible */
while (size_read < bufsize) {
k = start_k;
new_buffer_read = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data read and data to be
read in the next immediate read list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k] +
size_read) > bufsize) {
end_brd_size = new_buffer_read +
flat_buf->blocklens[k] - (bufsize - size_read);
new_buffer_read = bufsize - size_read;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
else new_buffer_read = brd_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
j = start_j;
new_file_read = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_read < new_buffer_read)) {
if(file_list_count) {
if((new_file_read + flat_file->blocklens[j]) >
new_buffer_read) {
end_frd_size = new_buffer_read - new_file_read;
new_file_read = new_buffer_read;
j--;
}
else {
new_file_read += flat_file->blocklens[j];
end_frd_size = flat_file->blocklens[j];
}
}
else {
if (frd_size > new_buffer_read) {
new_file_read = new_buffer_read;
frd_size = new_file_read;
}
else new_file_read = frd_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_read < new_buffer_read) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_read = 0;
mem_list_count = 0;
while (new_buffer_read < new_file_read) {
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k]) >
new_file_read) {
end_brd_size = new_file_read - new_buffer_read;
new_buffer_read = new_file_read;
k--;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_read = brd_size;
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_read < new_file_read) */
} /* if ((new_file_read < new_buffer_read) && (file_list_count
== MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
/* fakes filling the readlist arrays of lengths found above */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
if(i) {
if (i == (mem_list_count - 1)) {
if (flat_buf->blocklens[k] == end_brd_size)
brd_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
brd_size = flat_buf->blocklens[k] - end_brd_size;
k--;
buf_count--;
}
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
if (i) {
if (i == (file_list_count - 1)) {
if (flat_file->blocklens[j] == end_frd_size)
frd_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
frd_size = flat_file->blocklens[j] - end_frd_size;
j--;
}
}
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
size_read += new_buffer_read;
start_k = k;
start_j = j;
if (max_mem_list < mem_list_count)
max_mem_list = mem_list_count;
if (max_file_list < file_list_count)
max_file_list = file_list_count;
} /* while (size_read < bufsize) */
/* one last check before we actually carry out the operation:
* this code has hard-to-fix bugs when a noncontiguous file type has
* such large pieces that the sum of the lengths of the memory type is
* not larger than one of those pieces (and vice versa for large memory
* types and many pieces of file types). In these cases, give up and
* fall back to naive reads and writes. The testphdf5 test created a
* type with two very large memory regions and 600 very small file
* regions. The same test also created a type with one very large file
* region and many (700) very small memory regions. Both cases caused
* problems for this code */
if ( ( (file_list_count == 1) &&
(new_file_read < flat_file->blocklens[0] ) ) ||
((mem_list_count == 1) &&
(new_buffer_read < flat_buf->blocklens[0]) ) ||
((file_list_count == MAX_ARRAY_SIZE) &&
(new_file_read < flat_buf->blocklens[0]) ) ||
( (mem_list_count == MAX_ARRAY_SIZE) &&
(new_buffer_read < flat_file->blocklens[0])) )
{ {
fprintf(stderr,
"Falling back to list I/O since datatype I/O failed\n");
ret = ADIOI_PVFS2_ReadStridedListIO(fd, buf, count,
datatype, file_ptr_type,
offset, status, error_code);
}
return;
}
if (fd->hints->fs_hints.pvfs2.listio_read == ADIOI_HINT_ENABLE) {
ret = ADIOI_PVFS2_ReadStridedListIO(fd, buf, count, datatype,
file_ptr_type, offset, status, error_code);
return;
}
/* Use classic list I/O if no hints given base case */
ADIOI_Delete_flattened(datatype); ADIOI_PVFS2_OldReadStrided(fd, buf, count, datatype,
ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code);
file_ptr_type, initial_off, status, error_code);
return; return;
} }
mem_offsets = (PVFS_size*)ADIOI_Malloc(max_mem_list*sizeof(PVFS_size));
mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));
size_read = 0;
n_filetypes = st_n_filetypes;
frd_size = st_frd_size;
brd_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
/* this section calculates mem_list_count and file_list_count
and also finds the possibly odd sized last array elements
in end_frd_size and end_brd_size */
while (size_read < bufsize) {
k = start_k;
new_buffer_read = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data read and data to be
read in the next immediate read list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k] +
size_read) > bufsize) {
end_brd_size = new_buffer_read +
flat_buf->blocklens[k] - (bufsize - size_read);
new_buffer_read = bufsize - size_read;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
else new_buffer_read = brd_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
j = start_j;
new_file_read = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_read < new_buffer_read)) {
if(file_list_count) {
if((new_file_read + flat_file->blocklens[j]) >
new_buffer_read) {
end_frd_size = new_buffer_read - new_file_read;
new_file_read = new_buffer_read;
j--;
}
else {
new_file_read += flat_file->blocklens[j];
end_frd_size = flat_file->blocklens[j];
}
}
else {
if (frd_size > new_buffer_read) {
new_file_read = new_buffer_read;
frd_size = new_file_read;
}
else new_file_read = frd_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_read < new_buffer_read) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_read = 0;
mem_list_count = 0;
while (new_buffer_read < new_file_read) {
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k]) >
new_file_read) {
end_brd_size = new_file_read - new_buffer_read;
new_buffer_read = new_file_read;
k--;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_read = brd_size;
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_read < new_file_read) */
} /* if ((new_file_read < new_buffer_read) && (file_list_count
== MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
/* fills the allocated readlist arrays */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
mem_offsets[i] = ((PVFS_size)buf + buftype_extent*
(buf_count/flat_buf->count) +
(int)flat_buf->indices[k]);
if(!i) {
mem_lengths[0] = brd_size;
mem_offsets[0] += flat_buf->blocklens[k] - brd_size;
}
else {
if (i == (mem_list_count - 1)) {
mem_lengths[i] = end_brd_size;
if (flat_buf->blocklens[k] == end_brd_size)
brd_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
brd_size = flat_buf->blocklens[k] - end_brd_size;
k--;
buf_count--;
}
}
else {
mem_lengths[i] = flat_buf->blocklens[k];
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
file_offsets[i] = disp + flat_file->indices[j] +
((ADIO_Offset)n_filetypes) * filetype_extent;
if (!i) {
file_lengths[0] = frd_size;
file_offsets[0] += flat_file->blocklens[j] - frd_size;
}
else {
if (i == (file_list_count - 1)) {
file_lengths[i] = end_frd_size;
if (flat_file->blocklens[j] == end_frd_size)
frd_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
frd_size = flat_file->blocklens[j] - end_frd_size;
j--;
}
}
else file_lengths[i] = flat_file->blocklens[j];
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
mem_offsets, PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0 ) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (memory)", 0);
goto error_state;
}
/* -- END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* offset will be expressed in memory and file datatypes */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
PVFS_BOTTOM, mem_req, &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
size_read += new_buffer_read;
start_k = k;
start_j = j;
} /* while (size_read < bufsize) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
}
ADIOI_Free(file_offsets);
ADIOI_Free(file_lengths);
/* Other ADIO routines will convert absolute bytes into counts of datatypes */
/* when incrementing fp_ind, need to also take into account the file type:
* consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----|
* if we wrote N elements, offset needs to point at beginning of type, not
* at empty region at offset N+1) */
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* this is closer, but still incorrect for the cases where a small
* amount of a file type is "leftover" after a write */
fd->fp_ind = disp + flat_file->indices[j] +
((ADIO_Offset)n_filetypes)*filetype_extent;
}
if (err_flag == 0) *error_code = MPI_SUCCESS;
error_state:
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually read and placed in buf
by ADIOI_BUFFERED_READ. */
#endif
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/* /*
* vim: ts=8 sts=4 sw=4 noexpandtab * vim: ts=8 sts=4 sw=4 noexpandtab

Просмотреть файл

@ -0,0 +1,909 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_common.h"
void ADIOI_PVFS2_OldReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code)
{
/* offset is in units of etype relative to the filetype. */
ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, brd_size, frd_size=0, st_index=0;
int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset off, disp, start_off, initial_off;
int flag, st_frd_size, st_n_filetypes;
int mem_list_count, file_list_count;
PVFS_size *mem_offsets;
int64_t *file_offsets;
int *mem_lengths;
int32_t *file_lengths;
int total_blks_to_read;
int max_mem_list, max_file_list;
int b_blks_read;
int f_data_read;
int size_read=0, n_read_lists, extra_blks;
int end_brd_size, end_frd_size;
int start_k, start_j, new_file_read, new_buffer_read;
int start_mem_offset;
PVFS_Request mem_req, file_req;
ADIOI_PVFS2_fs * pvfs_fs;
PVFS_sysresp_io resp_io;
int err_flag=0;
MPI_Offset total_bytes_read = 0;
static char myname[] = "ADIOI_PVFS2_ReadStrided";
#define MAX_ARRAY_SIZE 64
*error_code = MPI_SUCCESS; /* changed below if error */
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
/* the HDF5 tests showed a bug in this list processing code (see many many
* lines down below). We added a workaround, but common HDF5 file types
* are actually contiguous and do not need the expensive workaround */
if (!filetype_is_contig) {
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
if (flat_file->count == 1 && !buftype_is_contig)
filetype_is_contig = 1;
}
MPI_Type_size(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
*error_code = MPI_SUCCESS;
return;
}
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
bufsize = buftype_size * count;
pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;
if (!buftype_is_contig && filetype_is_contig) {
/* noncontiguous in memory, contiguous in file. */
int64_t file_offsets;
int32_t file_lengths;
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
file_list_count = 1;
file_offsets = off;
file_lengths = 0;
total_blks_to_read = count*flat_buf->count;
b_blks_read = 0;
/* allocate arrays according to max usage */
if (total_blks_to_read > MAX_ARRAY_SIZE)
mem_list_count = MAX_ARRAY_SIZE;
else mem_list_count = total_blks_to_read;
mem_offsets = (PVFS_size*)ADIOI_Malloc(mem_list_count*sizeof(PVFS_size));
mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));
/* TODO: CHECK RESULTS OF MEMORY ALLOCATION */
j = 0;
/* step through each block in memory, filling memory arrays */
while (b_blks_read < total_blks_to_read) {
for (i=0; i<flat_buf->count; i++) {
mem_offsets[b_blks_read % MAX_ARRAY_SIZE] =
/* TODO: fix this compiler warning */
((PVFS_size)buf + j*buftype_extent + flat_buf->indices[i]);
mem_lengths[b_blks_read % MAX_ARRAY_SIZE] =
flat_buf->blocklens[i];
file_lengths += flat_buf->blocklens[i];
b_blks_read++;
if (!(b_blks_read % MAX_ARRAY_SIZE) ||
(b_blks_read == total_blks_to_read)) {
/* in the case of the last read list call,
adjust mem_list_count */
if (b_blks_read == total_blks_to_read) {
mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE;
/* in case last read list call fills max arrays */
if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
}
err_flag = PVFS_Request_hindexed(mem_list_count,
mem_lengths, mem_offsets, PVFS_BYTE, &mem_req);
if (err_flag < 0) break;
err_flag = PVFS_Request_contiguous(file_lengths,
PVFS_BYTE, &file_req);
if (err_flag < 0) break;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req,
file_offsets, PVFS_BOTTOM, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
/* --END ERROR HANDLING-- */
/* in the case of error or the last read list call,
* leave here */
if (err_flag || b_blks_read == total_blks_to_read) break;
file_offsets += file_lengths;
file_lengths = 0;
}
} /* for (i=0; i<flat_buf->count; i++) */
j++;
} /* while (b_blks_read < total_blks_to_read) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
if (file_ptr_type == ADIO_INDIVIDUAL)
fd->fp_ind += total_bytes_read;
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually read and placed in buf
by ADIOI_BUFFERED_READ. */
#endif
ADIOI_Delete_flattened(datatype);
return;
} /* if (!buftype_is_contig && filetype_is_contig) */
/* know file is noncontiguous from above */
/* noncontiguous in file */
/* filetype already flattened in ADIO_Open */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
initial_off = offset;
/* for each case - ADIO_Individual pointer or explicit, find the file
offset in bytes (offset), n_filetypes (how many filetypes into
file to start), frd_size (remaining amount of data in present
file block), and st_index (start point in terms of blocks in
starting filetype) */
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent +
flat_file->blocklens[i] >= offset) {
st_index = i;
frd_size = (int) (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent
+ flat_file->blocklens[i] - offset);
flag = 1;
break;
}
}
} /* while (!flag) */
} /* if (file_ptr_type == ADIO_INDIVIDUAL) */
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
frd_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent +
abs_off_in_filetype;
} /* else [file_ptr_type != ADIO_INDIVIDUAL] */
start_off = offset;
st_frd_size = frd_size;
st_n_filetypes = n_filetypes;
if (buftype_is_contig && !filetype_is_contig) {
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
int mem_lengths;
char *mem_offsets;
i = 0;
j = st_index;
n_filetypes = st_n_filetypes;
mem_list_count = 1;
/* determine how many blocks in file to read */
f_data_read = ADIOI_MIN(st_frd_size, bufsize);
total_blks_to_read = 1;
if (j < (flat_file->count-1)) j++;
else {
j = 0;
n_filetypes++;
}
while (f_data_read < bufsize) {
f_data_read += flat_file->blocklens[j];
total_blks_to_read++;
if (j<(flat_file->count-1)) j++;
else j = 0;
}
j = st_index;
n_filetypes = st_n_filetypes;
n_read_lists = total_blks_to_read/MAX_ARRAY_SIZE;
extra_blks = total_blks_to_read%MAX_ARRAY_SIZE;
mem_offsets = buf;
mem_lengths = 0;
/* if at least one full readlist, allocate file arrays
at max array size and don't free until very end */
if (n_read_lists) {
file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int32_t));
}
/* if there's no full readlist allocate file arrays according
to needed size (extra_blks) */
else {
file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
sizeof(int32_t));
}
/* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
for (i=0; i<n_read_lists; i++) {
file_list_count = MAX_ARRAY_SIZE;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = st_frd_size;
mem_lengths = st_frd_size;
}
for (k=0; k<MAX_ARRAY_SIZE; k++) {
if (i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent
+ flat_file->indices[j];
file_lengths[k] = flat_file->blocklens[j];
mem_lengths += file_lengths[k];
}
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* PVFS_Request_hindexed already expresses the offsets into the
* file, so we should not pass in an offset if we are using
* hindexed for the file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
mem_offsets += mem_lengths;
mem_lengths = 0;
} /* for (i=0; i<n_read_lists; i++) */
/* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */
if (extra_blks) {
file_list_count = extra_blks;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = ADIOI_MIN(st_frd_size, bufsize);
}
for (k=0; k<extra_blks; k++) {
if(i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent +
flat_file->indices[j];
if (k == (extra_blks - 1)) {
file_lengths[k] = bufsize - (int32_t) mem_lengths
- (int32_t) mem_offsets + (int32_t) buf;
}
else file_lengths[k] = flat_file->blocklens[j];
} /* if(i || k) */
mem_lengths += file_lengths[k];
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<extra_blks; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* as above, use 0 for 'offset' when using hindexed file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req, &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
}
}
else {
/* noncontiguous in memory as well as in file */
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
size_read = 0;
n_filetypes = st_n_filetypes;
frd_size = st_frd_size;
brd_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
max_mem_list = 0;
max_file_list = 0;
/* run through and find max_file_list and max_mem_list so that you
can allocate the file and memory arrays less than MAX_ARRAY_SIZE
if possible */
while (size_read < bufsize) {
k = start_k;
new_buffer_read = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data read and data to be
read in the next immediate read list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k] +
size_read) > bufsize) {
end_brd_size = new_buffer_read +
flat_buf->blocklens[k] - (bufsize - size_read);
new_buffer_read = bufsize - size_read;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
else new_buffer_read = brd_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
j = start_j;
new_file_read = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_read < new_buffer_read)) {
if(file_list_count) {
if((new_file_read + flat_file->blocklens[j]) >
new_buffer_read) {
end_frd_size = new_buffer_read - new_file_read;
new_file_read = new_buffer_read;
j--;
}
else {
new_file_read += flat_file->blocklens[j];
end_frd_size = flat_file->blocklens[j];
}
}
else {
if (frd_size > new_buffer_read) {
new_file_read = new_buffer_read;
frd_size = new_file_read;
}
else new_file_read = frd_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_read < new_buffer_read) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_read = 0;
mem_list_count = 0;
while (new_buffer_read < new_file_read) {
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k]) >
new_file_read) {
end_brd_size = new_file_read - new_buffer_read;
new_buffer_read = new_file_read;
k--;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_read = brd_size;
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_read < new_file_read) */
} /* if ((new_file_read < new_buffer_read) && (file_list_count
== MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
/* fakes filling the readlist arrays of lengths found above */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
if(i) {
if (i == (mem_list_count - 1)) {
if (flat_buf->blocklens[k] == end_brd_size)
brd_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
brd_size = flat_buf->blocklens[k] - end_brd_size;
k--;
buf_count--;
}
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
if (i) {
if (i == (file_list_count - 1)) {
if (flat_file->blocklens[j] == end_frd_size)
frd_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
frd_size = flat_file->blocklens[j] - end_frd_size;
j--;
}
}
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
size_read += new_buffer_read;
start_k = k;
start_j = j;
if (max_mem_list < mem_list_count)
max_mem_list = mem_list_count;
if (max_file_list < file_list_count)
max_file_list = file_list_count;
} /* while (size_read < bufsize) */
/* one last check before we actually carry out the operation:
* this code has hard-to-fix bugs when a noncontiguous file type has
* such large pieces that the sum of the lengths of the memory type is
* not larger than one of those pieces (and vice versa for large memory
* types and many pieces of file types). In these cases, give up and
* fall back to naive reads and writes. The testphdf5 test created a
* type with two very large memory regions and 600 very small file
* regions. The same test also created a type with one very large file
* region and many (700) very small memory regions. both cases caused
* problems for this code */
if ( ( (file_list_count == 1) &&
(new_file_read < flat_file->blocklens[0] ) ) ||
((mem_list_count == 1) &&
(new_buffer_read < flat_buf->blocklens[0]) ) ||
((file_list_count == MAX_ARRAY_SIZE) &&
(new_file_read < flat_buf->blocklens[0]) ) ||
( (mem_list_count == MAX_ARRAY_SIZE) &&
(new_buffer_read < flat_file->blocklens[0])) )
{
ADIOI_Delete_flattened(datatype);
ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype,
file_ptr_type, initial_off, status, error_code);
return;
}
mem_offsets = (PVFS_size*)ADIOI_Malloc(max_mem_list*sizeof(PVFS_size));
mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));
size_read = 0;
n_filetypes = st_n_filetypes;
frd_size = st_frd_size;
brd_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
/* this section calculates mem_list_count and file_list_count
and also finds the possibly odd sized last array elements
in end_frd_size and end_brd_size */
while (size_read < bufsize) {
k = start_k;
new_buffer_read = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data read and data to be
read in the next immediate read list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k] +
size_read) > bufsize) {
end_brd_size = new_buffer_read +
flat_buf->blocklens[k] - (bufsize - size_read);
new_buffer_read = bufsize - size_read;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
else new_buffer_read = brd_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
j = start_j;
new_file_read = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_read < new_buffer_read)) {
if(file_list_count) {
if((new_file_read + flat_file->blocklens[j]) >
new_buffer_read) {
end_frd_size = new_buffer_read - new_file_read;
new_file_read = new_buffer_read;
j--;
}
else {
new_file_read += flat_file->blocklens[j];
end_frd_size = flat_file->blocklens[j];
}
}
else {
if (frd_size > new_buffer_read) {
new_file_read = new_buffer_read;
frd_size = new_file_read;
}
else new_file_read = frd_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_read < new_buffer_read) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_read = 0;
mem_list_count = 0;
while (new_buffer_read < new_file_read) {
if(mem_list_count) {
if((new_buffer_read + flat_buf->blocklens[k]) >
new_file_read) {
end_brd_size = new_file_read - new_buffer_read;
new_buffer_read = new_file_read;
k--;
}
else {
new_buffer_read += flat_buf->blocklens[k];
end_brd_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_read = brd_size;
if (brd_size > (bufsize - size_read)) {
new_buffer_read = bufsize - size_read;
brd_size = new_buffer_read;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_read < new_file_read) */
} /* if ((new_file_read < new_buffer_read) && (file_list_count
== MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_read < bufsize-size_read)) */
/* fills the allocated readlist arrays */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
mem_offsets[i] = ((PVFS_size)buf + buftype_extent*
(buf_count/flat_buf->count) +
(int)flat_buf->indices[k]);
if(!i) {
mem_lengths[0] = brd_size;
mem_offsets[0] += flat_buf->blocklens[k] - brd_size;
}
else {
if (i == (mem_list_count - 1)) {
mem_lengths[i] = end_brd_size;
if (flat_buf->blocklens[k] == end_brd_size)
brd_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
brd_size = flat_buf->blocklens[k] - end_brd_size;
k--;
buf_count--;
}
}
else {
mem_lengths[i] = flat_buf->blocklens[k];
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
file_offsets[i] = disp + flat_file->indices[j] +
((ADIO_Offset)n_filetypes) * filetype_extent;
if (!i) {
file_lengths[0] = frd_size;
file_offsets[0] += flat_file->blocklens[j] - frd_size;
}
else {
if (i == (file_list_count - 1)) {
file_lengths[i] = end_frd_size;
if (flat_file->blocklens[j] == end_frd_size)
frd_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
frd_size = flat_file->blocklens[j] - end_frd_size;
j--;
}
}
else file_lengths[i] = flat_file->blocklens[j];
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
mem_offsets, PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0 ) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (memory)", 0);
goto error_state;
}
/* -- END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* offset will be expressed in memory and file datatypes */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
PVFS_BOTTOM, mem_req, &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_read", 0);
}
/* --END ERROR HANDLING-- */
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
total_bytes_read += resp_io.total_completed;
size_read += new_buffer_read;
start_k = k;
start_j = j;
} /* while (size_read < bufsize) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
}
/* Other ADIO routines will convert absolute bytes into counts of datatypes */
/* when incrementing fp_ind, need to also take into account the file type:
* consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----|
* if we wrote N elements, offset needs to point at beginning of type, not
* at empty region at offset N+1)
*
* As we discussed on mpich-discuss in may/june 2009, the code below might
* look weird, but by putting fp_ind at the last byte written, the next
* time we run through the strided code we'll update the fp_ind to the
* right location. */
if (file_ptr_type == ADIO_INDIVIDUAL) {
fd->fp_ind = file_offsets[file_list_count-1]+
file_lengths[file_list_count-1];
}
ADIOI_Free(file_offsets);
ADIOI_Free(file_lengths);
if (err_flag == 0) *error_code = MPI_SUCCESS;
error_state:
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually read and placed in buf
by ADIOI_BUFFERED_READ. */
#endif
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,963 @@
/* -*- Mode: C; c-basic-offset:4 ; -*-
* vim: ts=8 sts=4 sw=4 noexpandtab
*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_extern.h"
#include "ad_pvfs2.h"
#include "ad_pvfs2_common.h"
void ADIOI_PVFS2_OldWriteStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code)
{
/* as with all the other WriteStrided functions, offset is in units of
* etype relative to the filetype */
/* Since PVFS2 does not support file locking, can't do buffered writes
as on Unix */
ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, bwr_size, fwr_size=0, st_index=0;
int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset off, disp, start_off, initial_off;
int flag, st_fwr_size, st_n_filetypes;
int err_flag=0;
int mem_list_count, file_list_count;
PVFS_size * mem_offsets;
int64_t *file_offsets;
int *mem_lengths;
int32_t *file_lengths;
int total_blks_to_write;
int max_mem_list, max_file_list;
int b_blks_wrote;
int f_data_wrote;
int size_wrote=0, n_write_lists, extra_blks;
int end_bwr_size, end_fwr_size;
int start_k, start_j, new_file_write, new_buffer_write;
int start_mem_offset;
PVFS_Request mem_req, file_req;
ADIOI_PVFS2_fs * pvfs_fs;
PVFS_sysresp_io resp_io;
MPI_Offset total_bytes_written=0;
static char myname[] = "ADIOI_PVFS2_WRITESTRIDED";
/* note: don't increase this: several parts of PVFS2 now
* assume this limit*/
#define MAX_ARRAY_SIZE 64
/* --BEGIN ERROR HANDLING-- */
if (fd->atomicity) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
MPI_ERR_ARG,
"Atomic noncontiguous writes are not supported by PVFS2", 0);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
/* the HDF5 tests showed a bug in this list processing code (see many many
* lines down below). We added a workaround, but common HDF5 file types
* are actually contiguous and do not need the expensive workaround */
if (!filetype_is_contig) {
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
if (flat_file->count == 1 && !buftype_is_contig)
filetype_is_contig = 1;
}
MPI_Type_size(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
*error_code = MPI_SUCCESS;
return;
}
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
bufsize = buftype_size * count;
pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;
if (!buftype_is_contig && filetype_is_contig) {
/* noncontiguous in memory, contiguous in file. */
int64_t file_offsets;
int32_t file_lengths;
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
off = fd->disp + etype_size * offset;
}
else off = fd->fp_ind;
file_list_count = 1;
file_offsets = off;
file_lengths = 0;
total_blks_to_write = count*flat_buf->count;
b_blks_wrote = 0;
/* allocate arrays according to max usage */
if (total_blks_to_write > MAX_ARRAY_SIZE)
mem_list_count = MAX_ARRAY_SIZE;
else mem_list_count = total_blks_to_write;
mem_offsets = (PVFS_size*)ADIOI_Malloc(mem_list_count*sizeof(PVFS_size));
mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));
j = 0;
/* step through each block in memory, filling memory arrays */
while (b_blks_wrote < total_blks_to_write) {
for (i=0; i<flat_buf->count; i++) {
mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] =
/* TODO: fix this warning by casting to an integer that's
* the same size as a char * and /then/ casting to
* PVFS_size */
((PVFS_size)buf + j*buftype_extent + flat_buf->indices[i]);
mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] =
flat_buf->blocklens[i];
file_lengths += flat_buf->blocklens[i];
b_blks_wrote++;
if (!(b_blks_wrote % MAX_ARRAY_SIZE) ||
(b_blks_wrote == total_blks_to_write)) {
/* in the case of the last write list call,
adjust mem_list_count */
if (b_blks_wrote == total_blks_to_write) {
mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE;
/* in case last write list call fills max arrays */
if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
}
err_flag = PVFS_Request_hindexed(mem_list_count,
mem_lengths, mem_offsets,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (memory)", 0);
break;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_contiguous(file_lengths,
PVFS_BYTE, &file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (file)", 0);
break;
}
/* --END ERROR HANDLING-- */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req,
file_offsets, PVFS_BOTTOM,
mem_req,
&(pvfs_fs->credentials),
&resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
total_bytes_written += resp_io.total_completed;
/* in the case of error or the last write list call,
* leave here */
/* --BEGIN ERROR HANDLING-- */
if (err_flag) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_write", 0);
break;
}
/* --END ERROR HANDLING-- */
if (b_blks_wrote == total_blks_to_write) break;
file_offsets += file_lengths;
file_lengths = 0;
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
}
} /* for (i=0; i<flat_buf->count; i++) */
j++;
} /* while (b_blks_wrote < total_blks_to_write) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
if (file_ptr_type == ADIO_INDIVIDUAL)
fd->fp_ind += total_bytes_written;
if (!err_flag) *error_code = MPI_SUCCESS;
fd->fp_sys_posn = -1; /* clear this. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif
ADIOI_Delete_flattened(datatype);
return;
} /* if (!buftype_is_contig && filetype_is_contig) */
/* already know that file is noncontiguous from above */
/* noncontiguous in file */
/* filetype already flattened in ADIO_Open */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
initial_off = offset;
/* for each case - ADIO_Individual pointer or explicit, find offset
(file offset in bytes), n_filetypes (how many filetypes into file
to start), fwr_size (remaining amount of data in present file
block), and st_index (start point in terms of blocks in starting
filetype) */
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind; /* in bytes */
n_filetypes = -1;
flag = 0;
while (!flag) {
n_filetypes++;
for (i=0; i<flat_file->count; i++) {
if (disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent +
flat_file->blocklens[i] >= offset) {
st_index = i;
fwr_size = disp + flat_file->indices[i] +
((ADIO_Offset) n_filetypes)*filetype_extent
+ flat_file->blocklens[i] - offset;
flag = 1;
break;
}
}
} /* while (!flag) */
} /* if (file_ptr_type == ADIO_INDIVIDUAL) */
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
fwr_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent +
abs_off_in_filetype;
} /* else [file_ptr_type != ADIO_INDIVIDUAL] */
start_off = offset;
st_fwr_size = fwr_size;
st_n_filetypes = n_filetypes;
if (buftype_is_contig && !filetype_is_contig) {
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
int mem_lengths;
char *mem_offsets;
i = 0;
j = st_index;
off = offset;
n_filetypes = st_n_filetypes;
mem_list_count = 1;
/* determine how many blocks in file to write */
f_data_wrote = ADIOI_MIN(st_fwr_size, bufsize);
total_blks_to_write = 1;
if (j < (flat_file->count -1)) j++;
else {
j = 0;
n_filetypes++;
}
while (f_data_wrote < bufsize) {
f_data_wrote += flat_file->blocklens[j];
total_blks_to_write++;
if (j<(flat_file->count-1)) j++;
else j = 0;
}
j = st_index;
n_filetypes = st_n_filetypes;
n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE;
extra_blks = total_blks_to_write%MAX_ARRAY_SIZE;
mem_offsets = buf;
mem_lengths = 0;
/* if at least one full writelist, allocate file arrays
at max array size and don't free until very end */
if (n_write_lists) {
file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
sizeof(int32_t));
}
/* if there's no full writelist allocate file arrays according
to needed size (extra_blks) */
else {
file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
sizeof(int64_t));
file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
sizeof(int32_t));
}
/* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
for (i=0; i<n_write_lists; i++) {
file_list_count = MAX_ARRAY_SIZE;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = st_fwr_size;
mem_lengths = st_fwr_size;
}
for (k=0; k<MAX_ARRAY_SIZE; k++) {
if (i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent
+ flat_file->indices[j];
file_lengths[k] = flat_file->blocklens[j];
mem_lengths += file_lengths[k];
}
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* PVFS_Request_hindexed already expresses the offsets into the
* file, so we should not pass in an offset if we are using
* hindexed for the file type */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_write", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
total_bytes_written += resp_io.total_completed;
mem_offsets += mem_lengths;
mem_lengths = 0;
PVFS_Request_free(&file_req);
PVFS_Request_free(&mem_req);
} /* for (i=0; i<n_write_lists; i++) */
/* for file arrays smaller than MAX_ARRAY_SIZE (last write_list call) */
if (extra_blks) {
file_list_count = extra_blks;
if(!i) {
file_offsets[0] = offset;
file_lengths[0] = ADIOI_MIN(st_fwr_size, bufsize);
}
for (k=0; k<extra_blks; k++) {
if(i || k) {
file_offsets[k] = disp +
((ADIO_Offset)n_filetypes)*filetype_extent +
flat_file->indices[j];
if (k == (extra_blks - 1)) {
file_lengths[k] = bufsize - (int32_t) mem_lengths
- (int32_t) mem_offsets + (int32_t) buf;
}
else file_lengths[k] = flat_file->blocklens[j];
} /* if(i || k) */
mem_lengths += file_lengths[k];
if (j<(flat_file->count - 1)) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (k=0; k<extra_blks; k++) */
err_flag = PVFS_Request_contiguous(mem_lengths,
PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_contiguous (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed(file)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* as above, use 0 for 'offset' when using hindexed file type*/
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
mem_offsets, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_write", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
total_bytes_written += resp_io.total_completed;
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
}
}
else {
/* noncontiguous in memory as well as in file */
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
size_wrote = 0;
n_filetypes = st_n_filetypes;
fwr_size = st_fwr_size;
bwr_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
max_mem_list = 0;
max_file_list = 0;
/* run through and find max_file_list and max_mem_list so that you
can allocate the file and memory arrays less than MAX_ARRAY_SIZE
if possible */
while (size_wrote < bufsize) {
k = start_k;
new_buffer_write = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data written and data to be
written in the next immediate write list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_write + flat_buf->blocklens[k] +
size_wrote) > bufsize) {
end_bwr_size = new_buffer_write +
flat_buf->blocklens[k] - (bufsize - size_wrote);
new_buffer_write = bufsize - size_wrote;
}
else {
new_buffer_write += flat_buf->blocklens[k];
end_bwr_size = flat_buf->blocklens[k];
}
}
else {
if (bwr_size > (bufsize - size_wrote)) {
new_buffer_write = bufsize - size_wrote;
bwr_size = new_buffer_write;
}
else new_buffer_write = bwr_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) */
j = start_j;
new_file_write = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_write < new_buffer_write)) {
if(file_list_count) {
if((new_file_write + flat_file->blocklens[j]) >
new_buffer_write) {
end_fwr_size = new_buffer_write - new_file_write;
new_file_write = new_buffer_write;
j--;
}
else {
new_file_write += flat_file->blocklens[j];
end_fwr_size = flat_file->blocklens[j];
}
}
else {
if (fwr_size > new_buffer_write) {
new_file_write = new_buffer_write;
fwr_size = new_file_write;
}
else new_file_write = fwr_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_write < new_buffer_write) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_write = 0;
mem_list_count = 0;
while (new_buffer_write < new_file_write) {
if(mem_list_count) {
if((new_buffer_write + flat_buf->blocklens[k]) >
new_file_write) {
end_bwr_size = new_file_write -
new_buffer_write;
new_buffer_write = new_file_write;
k--;
}
else {
new_buffer_write += flat_buf->blocklens[k];
end_bwr_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_write = bwr_size;
if (bwr_size > (bufsize - size_wrote)) {
new_buffer_write = bufsize - size_wrote;
bwr_size = new_buffer_write;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_write < new_file_write) */
} /* if ((new_file_write < new_buffer_write) &&
(file_list_count == MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) */
/* fakes filling the writelist arrays of lengths found above */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
if(i) {
if (i == (mem_list_count - 1)) {
if (flat_buf->blocklens[k] == end_bwr_size)
bwr_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
bwr_size = flat_buf->blocklens[k] - end_bwr_size;
k--;
buf_count--;
}
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
if (i) {
if (i == (file_list_count - 1)) {
if (flat_file->blocklens[j] == end_fwr_size)
fwr_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
fwr_size = flat_file->blocklens[j] - end_fwr_size;
j--;
}
}
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
size_wrote += new_buffer_write;
start_k = k;
start_j = j;
if (max_mem_list < mem_list_count)
max_mem_list = mem_list_count;
if (max_file_list < file_list_count)
max_file_list = file_list_count;
} /* while (size_wrote < bufsize) */
/* one last check before we actually carry out the operation:
* this code has hard-to-fix bugs when a noncontiguous file type has
* such large pieces that the sum of the lengths of the memory type is
* not larger than one of those pieces (and vice versa for large memory
* types and many pieces of file types). In these cases, give up and
* fall back to naive reads and writes. The testphdf5 test created a
* type with two very large memory regions and 600 very small file
* regions. The same test also created a type with one very large file
* region and many (700) very small memory regions. both cases caused
* problems for this code */
if ( ( (file_list_count == 1) &&
(new_file_write < flat_file->blocklens[0] ) ) ||
((mem_list_count == 1) &&
(new_buffer_write < flat_buf->blocklens[0]) ) ||
((file_list_count == MAX_ARRAY_SIZE) &&
(new_file_write < flat_buf->blocklens[0]) ) ||
( (mem_list_count == MAX_ARRAY_SIZE) &&
(new_buffer_write < flat_file->blocklens[0])) )
{
ADIOI_Delete_flattened(datatype);
ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype,
file_ptr_type, initial_off, status, error_code);
return;
}
mem_offsets = (PVFS_size*)ADIOI_Malloc(max_mem_list*sizeof(PVFS_size));
mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));
size_wrote = 0;
n_filetypes = st_n_filetypes;
fwr_size = st_fwr_size;
bwr_size = flat_buf->blocklens[0];
buf_count = 0;
start_mem_offset = 0;
start_k = k = 0;
start_j = st_index;
/* this section calculates mem_list_count and file_list_count
and also finds the possibly odd sized last array elements
in new_fwr_size and new_bwr_size */
while (size_wrote < bufsize) {
k = start_k;
new_buffer_write = 0;
mem_list_count = 0;
while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) {
/* find mem_list_count and file_list_count such that both are
less than MAX_ARRAY_SIZE, the sum of their lengths are
equal, and the sum of all the data written and data to be
written in the next immediate write list is less than
bufsize */
if(mem_list_count) {
if((new_buffer_write + flat_buf->blocklens[k] +
size_wrote) > bufsize) {
end_bwr_size = new_buffer_write +
flat_buf->blocklens[k] - (bufsize - size_wrote);
new_buffer_write = bufsize - size_wrote;
}
else {
new_buffer_write += flat_buf->blocklens[k];
end_bwr_size = flat_buf->blocklens[k];
}
}
else {
if (bwr_size > (bufsize - size_wrote)) {
new_buffer_write = bufsize - size_wrote;
bwr_size = new_buffer_write;
}
else new_buffer_write = bwr_size;
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) */
j = start_j;
new_file_write = 0;
file_list_count = 0;
while ((file_list_count < MAX_ARRAY_SIZE) &&
(new_file_write < new_buffer_write)) {
if(file_list_count) {
if((new_file_write + flat_file->blocklens[j]) >
new_buffer_write) {
end_fwr_size = new_buffer_write - new_file_write;
new_file_write = new_buffer_write;
j--;
}
else {
new_file_write += flat_file->blocklens[j];
end_fwr_size = flat_file->blocklens[j];
}
}
else {
if (fwr_size > new_buffer_write) {
new_file_write = new_buffer_write;
fwr_size = new_file_write;
}
else new_file_write = fwr_size;
}
file_list_count++;
if (j < (flat_file->count - 1)) j++;
else j = 0;
k = start_k;
if ((new_file_write < new_buffer_write) &&
(file_list_count == MAX_ARRAY_SIZE)) {
new_buffer_write = 0;
mem_list_count = 0;
while (new_buffer_write < new_file_write) {
if(mem_list_count) {
if((new_buffer_write + flat_buf->blocklens[k]) >
new_file_write) {
end_bwr_size = new_file_write -
new_buffer_write;
new_buffer_write = new_file_write;
k--;
}
else {
new_buffer_write += flat_buf->blocklens[k];
end_bwr_size = flat_buf->blocklens[k];
}
}
else {
new_buffer_write = bwr_size;
if (bwr_size > (bufsize - size_wrote)) {
new_buffer_write = bufsize - size_wrote;
bwr_size = new_buffer_write;
}
}
mem_list_count++;
k = (k + 1)%flat_buf->count;
} /* while (new_buffer_write < new_file_write) */
} /* if ((new_file_write < new_buffer_write) &&
(file_list_count == MAX_ARRAY_SIZE)) */
} /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
(new_buffer_write < bufsize-size_wrote)) */
/* fills the allocated writelist arrays */
k = start_k;
j = start_j;
for (i=0; i<mem_list_count; i++) {
/* TODO: fix this warning by casting to an integer that's the
* same size as a char * and /then/ casting to PVFS_size */
mem_offsets[i] = ((PVFS_size)buf + buftype_extent*
(buf_count/flat_buf->count) +
(int)flat_buf->indices[k]);
if(!i) {
mem_lengths[0] = bwr_size;
mem_offsets[0] += flat_buf->blocklens[k] - bwr_size;
}
else {
if (i == (mem_list_count - 1)) {
mem_lengths[i] = end_bwr_size;
if (flat_buf->blocklens[k] == end_bwr_size)
bwr_size = flat_buf->blocklens[(k+1)%
flat_buf->count];
else {
bwr_size = flat_buf->blocklens[k] - end_bwr_size;
k--;
buf_count--;
}
}
else {
mem_lengths[i] = flat_buf->blocklens[k];
}
}
buf_count++;
k = (k + 1)%flat_buf->count;
} /* for (i=0; i<mem_list_count; i++) */
for (i=0; i<file_list_count; i++) {
file_offsets[i] = disp + flat_file->indices[j] +
((ADIO_Offset)n_filetypes) * filetype_extent;
if (!i) {
file_lengths[0] = fwr_size;
file_offsets[0] += flat_file->blocklens[j] - fwr_size;
}
else {
if (i == (file_list_count - 1)) {
file_lengths[i] = end_fwr_size;
if (flat_file->blocklens[j] == end_fwr_size)
fwr_size = flat_file->blocklens[(j+1)%
flat_file->count];
else {
fwr_size = flat_file->blocklens[j] - end_fwr_size;
j--;
}
}
else file_lengths[i] = flat_file->blocklens[j];
}
if (j < flat_file->count - 1) j++;
else {
j = 0;
n_filetypes++;
}
} /* for (i=0; i<file_list_count; i++) */
err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
mem_offsets, PVFS_BYTE, &mem_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0 ) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed (memory)", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
file_offsets, PVFS_BYTE,
&file_req);
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_Request_hindexed", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
/* offset will be expressed in memory and file datatypes */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0,
PVFS_BOTTOM, mem_req,
&(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
/* --BEGIN ERROR HANDLING-- */
if (err_flag != 0) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
ADIOI_PVFS2_error_convert(err_flag),
"Error in PVFS_sys_write", 0);
goto error_state;
}
/* --END ERROR HANDLING-- */
size_wrote += new_buffer_write;
total_bytes_written += resp_io.total_completed;
start_k = k;
start_j = j;
PVFS_Request_free(&mem_req);
PVFS_Request_free(&file_req);
} /* while (size_wrote < bufsize) */
ADIOI_Free(mem_offsets);
ADIOI_Free(mem_lengths);
}
/* when incrementing fp_ind, need to also take into account the file type:
* consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----|
* if we wrote N elements, offset needs to point at beginning of type, not
* at empty region at offset N+1).
*
* As we discussed on mpich-discuss in may/june 2009, the code below might
* look weird, but by putting fp_ind at the last byte written, the next
* time we run through the strided code we'll update the fp_ind to the
* right location. */
if (file_ptr_type == ADIO_INDIVIDUAL) {
fd->fp_ind = file_offsets[file_list_count-1]+
file_lengths[file_list_count-1];
}
ADIOI_Free(file_offsets);
ADIOI_Free(file_lengths);
*error_code = MPI_SUCCESS;
error_state:
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_TESTFS_operations = { struct ADIOI_Fns_struct ADIO_TESTFS_operations = {
ADIOI_TESTFS_Open, /* Open */ ADIOI_TESTFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_TESTFS_ReadContig, /* ReadContig */ ADIOI_TESTFS_ReadContig, /* ReadContig */
ADIOI_TESTFS_WriteContig, /* WriteContig */ ADIOI_TESTFS_WriteContig, /* WriteContig */
ADIOI_TESTFS_ReadStridedColl, /* ReadStridedColl */ ADIOI_TESTFS_ReadStridedColl, /* ReadStridedColl */
@ -33,4 +34,5 @@ struct ADIOI_Fns_struct ADIO_TESTFS_operations = {
ADIOI_TESTFS_Flush, /* Flush */ ADIOI_TESTFS_Flush, /* Flush */
ADIOI_TESTFS_Resize, /* Resize */ ADIOI_TESTFS_Resize, /* Resize */
ADIOI_TESTFS_Delete, /* Delete */ ADIOI_TESTFS_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
}; };

Просмотреть файл

@ -7,7 +7,9 @@
#include "ad_testfs.h" #include "ad_testfs.h"
#include "adioi.h" #include "adioi.h"
#ifdef ROMIO_BGL
#include "../ad_bgl/ad_bgl.h"
#endif
void ADIOI_TESTFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) void ADIOI_TESTFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{ {
int myrank, nprocs; int myrank, nprocs;
@ -21,5 +23,10 @@ void ADIOI_TESTFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
FPRINTF(stdout, "[%d/%d] calling ADIOI_GEN_SetInfo\n", FPRINTF(stdout, "[%d/%d] calling ADIOI_GEN_SetInfo\n",
myrank, nprocs); myrank, nprocs);
#ifdef ROMIO_BGL /* BlueGene support for pvfs through ufs */
/* BlueGene hack: force testfs to mimic BlueGene hints */
ADIOI_BGL_SetInfo(fd, users_info, error_code);
#else
ADIOI_GEN_SetInfo(fd, users_info, error_code); ADIOI_GEN_SetInfo(fd, users_info, error_code);
#endif
} }

Просмотреть файл

@ -26,10 +26,6 @@ void ADIOI_TESTFS_ReadContig(ADIO_File fd, void *buf, int count,
offset = fd->fp_ind; offset = fd->fp_ind;
fd->fp_ind += datatype_size * count; fd->fp_ind += datatype_size * count;
fd->fp_sys_posn = fd->fp_ind; fd->fp_sys_posn = fd->fp_ind;
#if 0
FPRINTF(stdout, "[%d/%d] new file position is %lld\n", myrank,
nprocs, (long long) fd->fp_ind);
#endif
} }
else { else {
fd->fp_sys_posn = offset + datatype_size * count; fd->fp_sys_posn = offset + datatype_size * count;

Просмотреть файл

@ -26,8 +26,8 @@ ADIO_Offset ADIOI_TESTFS_SeekIndividual(ADIO_File fd, ADIO_Offset offset,
ADIO_Offset off; ADIO_Offset off;
ADIOI_Flatlist_node *flat_file; ADIOI_Flatlist_node *flat_file;
int i, n_etypes_in_filetype, n_filetypes, etype_in_filetype; int i, n_etypes_in_filetype, n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0; ADIO_Offset abs_off_in_filetype=0, sum;
int size_in_filetype, sum; int size_in_filetype;
int filetype_size, etype_size, filetype_is_contig; int filetype_size, etype_size, filetype_is_contig;
MPI_Aint filetype_extent; MPI_Aint filetype_extent;
@ -54,6 +54,7 @@ ADIO_Offset ADIOI_TESTFS_SeekIndividual(ADIO_File fd, ADIO_Offset offset,
} }
n_etypes_in_filetype = filetype_size/etype_size; n_etypes_in_filetype = filetype_size/etype_size;
ADIOI_Assert((offset / n_etypes_in_filetype) == (int) (offset / n_etypes_in_filetype));
n_filetypes = (int) (offset / n_etypes_in_filetype); n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size; size_in_filetype = etype_in_filetype * etype_size;
@ -70,7 +71,7 @@ ADIO_Offset ADIOI_TESTFS_SeekIndividual(ADIO_File fd, ADIO_Offset offset,
} }
/* abs. offset in bytes in the file */ /* abs. offset in bytes in the file */
off = fd->disp + (ADIO_Offset) n_filetypes * filetype_extent + off = fd->disp + (ADIO_Offset)n_filetypes * (ADIO_Offset)filetype_extent +
abs_off_in_filetype; abs_off_in_filetype;
} }

Просмотреть файл

@ -23,7 +23,7 @@ void ADIOI_TESTFS_WriteContig(ADIO_File fd, void *buf, int count,
nprocs, fd->filename); nprocs, fd->filename);
FPRINTF(stdout, "[%d/%d] writing (buf = %p, loc = %lld, sz = %lld)\n", FPRINTF(stdout, "[%d/%d] writing (buf = %p, loc = %lld, sz = %lld)\n",
myrank, nprocs, buf, (long long) offset, myrank, nprocs, buf, (long long) offset,
(long long) datatype_size * count); (long long)datatype_size * (long long)count);
if (file_ptr_type != ADIO_EXPLICIT_OFFSET) if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
{ {

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_UFS_operations = { struct ADIOI_Fns_struct ADIO_UFS_operations = {
ADIOI_UFS_Open, /* Open */ ADIOI_UFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_GEN_ReadContig, /* ReadContig */ ADIOI_GEN_ReadContig, /* ReadContig */
ADIOI_GEN_WriteContig, /* WriteContig */ ADIOI_GEN_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -38,4 +39,5 @@ struct ADIOI_Fns_struct ADIO_UFS_operations = {
ADIOI_GEN_Flush, /* Flush */ ADIOI_GEN_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */ ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
}; };

Просмотреть файл

@ -22,13 +22,9 @@ noinst_LTLIBRARIES = libadio_xfs.la
libadio_xfs_la_SOURCES = \ libadio_xfs_la_SOURCES = \
ad_xfs.c \ ad_xfs.c \
ad_xfs.h \ ad_xfs.h \
ad_xfs_done.c \
ad_xfs_fcntl.c \ ad_xfs_fcntl.c \
ad_xfs_hints.c \ ad_xfs_hints.c \
ad_xfs_iread.c \
ad_xfs_iwrite.c \
ad_xfs_open.c \ ad_xfs_open.c \
ad_xfs_read.c \ ad_xfs_read.c \
ad_xfs_resize.c \ ad_xfs_resize.c \
ad_xfs_wait.c \
ad_xfs_write.c ad_xfs_write.c

Просмотреть файл

@ -12,6 +12,7 @@
struct ADIOI_Fns_struct ADIO_XFS_operations = { struct ADIOI_Fns_struct ADIO_XFS_operations = {
ADIOI_XFS_Open, /* Open */ ADIOI_XFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_XFS_ReadContig, /* ReadContig */ ADIOI_XFS_ReadContig, /* ReadContig */
ADIOI_XFS_WriteContig, /* WriteContig */ ADIOI_XFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -22,15 +23,21 @@ struct ADIOI_Fns_struct ADIO_XFS_operations = {
ADIOI_GEN_ReadStrided, /* ReadStrided */ ADIOI_GEN_ReadStrided, /* ReadStrided */
ADIOI_GEN_WriteStrided, /* WriteStrided */ ADIOI_GEN_WriteStrided, /* WriteStrided */
ADIOI_GEN_Close, /* Close */ ADIOI_GEN_Close, /* Close */
ADIOI_XFS_IreadContig, /* IreadContig */ #if defined(ROMIO_HAVE_WORKING_AIO)
ADIOI_XFS_IwriteContig, /* IwriteContig */ ADIOI_GEN_IreadContig, /* IreadContig */
ADIOI_XFS_ReadDone, /* ReadDone */ ADIOI_GEN_IwriteContig, /* IwriteContig */
ADIOI_XFS_WriteDone, /* WriteDone */ #else
ADIOI_XFS_ReadComplete, /* ReadComplete */ ADIOI_FAKE_IreadContig, /* IreadContig */
ADIOI_XFS_WriteComplete, /* WriteComplete */ ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif /* ROMIO_HAVE_WORKING_AIO */
ADIOI_GEN_IODone, /* ReadDone */
ADIOI_GEN_IODone, /* WriteDone */
ADIOI_GEN_IOComplete, /* ReadComplete */
ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */ ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */ ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_GEN_Flush, /* Flush */ ADIOI_GEN_Flush, /* Flush */
ADIOI_XFS_Resize, /* Resize */ ADIOI_XFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */ ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
}; };

Просмотреть файл

@ -8,20 +8,19 @@
#ifndef AD_XFS_INCLUDE #ifndef AD_XFS_INCLUDE
#define AD_XFS_INCLUDE #define AD_XFS_INCLUDE
#define _XOPEN_SOURCE 500
#include <unistd.h> #include <unistd.h>
#include <sys/types.h> #include <sys/types.h>
#include <fcntl.h> #include <fcntl.h>
#include "adio.h" #include "adio.h"
#include <aio.h>
int ADIOI_XFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset, #if defined(MPISGI)
int wr, void *handle); #include "xfs/xfs_fs.h"
#ifndef __USE_LARGEFILE64
#if (defined(HAVE_PREAD64) && (_ABIO32 == 1)) #define __USE_LARGEFILE64
# define pread pread64 #endif
# define pwrite pwrite64 typedef struct aiocb64 aiocb64_t;
#endif #endif
/* above needed for IRIX 6.5 */
void ADIOI_XFS_Open(ADIO_File fd, int *error_code); void ADIOI_XFS_Open(ADIO_File fd, int *error_code);
void ADIOI_XFS_Close(ADIO_File fd, int *error_code); void ADIOI_XFS_Close(ADIO_File fd, int *error_code);
@ -33,22 +32,6 @@ void ADIOI_XFS_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type, MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int ADIO_Offset offset, ADIO_Status *status, int
*error_code); *error_code);
void ADIOI_XFS_IwriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Request *request, int
*error_code);
void ADIOI_XFS_IreadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Request *request, int
*error_code);
int ADIOI_XFS_ReadDone(ADIO_Request *request, ADIO_Status *status, int
*error_code);
int ADIOI_XFS_WriteDone(ADIO_Request *request, ADIO_Status *status, int
*error_code);
void ADIOI_XFS_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
*error_code);
void ADIOI_XFS_WriteComplete(ADIO_Request *request, ADIO_Status *status,
int *error_code);
void ADIOI_XFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int void ADIOI_XFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
*error_code); *error_code);
void ADIOI_XFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); void ADIOI_XFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);

Просмотреть файл

@ -1,69 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_xfs.h"
/*
 * ADIOI_XFS_ReadDone - nonblocking completion test for an XFS request.
 *
 * request    - in/out; reset to ADIO_REQUEST_NULL once the operation is done
 * status     - filled with the byte count when HAVE_STATUS_SET_BYTES is
 *              defined and the request recorded a byte count other than -1
 * error_code - MPI_SUCCESS, or an MPI_ERR_IO class code if the asynchronous
 *              operation failed
 *
 * Returns 1 if the request is complete (or was already ADIO_REQUEST_NULL),
 * 0 if the asynchronous operation is still in progress.
 */
int ADIOI_XFS_ReadDone(ADIO_Request *request, ADIO_Status *status,
                       int *error_code)
{
    int err, aio_status, done=0;
    static char myname[] = "ADIOI_XFS_READDONE";

    if (*request == ADIO_REQUEST_NULL) {
        *error_code = MPI_SUCCESS;
        return 1;
    }

    if ((*request)->queued) {
        /* Fetch the error status exactly once, BEFORE aio_return64():
         * querying the control block again after its return status has
         * been retrieved is undefined, so the value obtained here is the
         * one used for error reporting below. */
        aio_status = aio_error64((const aiocb64_t *) (*request)->handle);
        errno = aio_status;  /* keep errno mirroring the status, as before */

        if (aio_status == EINPROGRESS) {
            done = 0;
            *error_code = MPI_SUCCESS;
        }
        else {
            err = aio_return64((aiocb64_t *) (*request)->handle);
            (*request)->nbytes = err;
            done = 1;

            if (err == -1) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                MPIR_ERR_RECOVERABLE, myname,
                                __LINE__, MPI_ERR_IO, "**io",
                                "**io %s", strerror(aio_status));
            }
            else *error_code = MPI_SUCCESS;
        }
    } /* if ((*request)->queued) */
    else {
        /* nothing was handed to the system, so there is nothing to wait for */
        done = 1;
        *error_code = MPI_SUCCESS;
    }

#ifdef HAVE_STATUS_SET_BYTES
    if (done && ((*request)->nbytes != -1))
        MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif

    if (done) {
        /* if request is still queued in the system, it is also there
           on ADIOI_Async_list. Delete it from there. */
        if ((*request)->queued) ADIOI_Del_req_from_list(request);

        (*request)->fd->async_count--;
        if ((*request)->handle) ADIOI_Free((*request)->handle);
        ADIOI_Free_request((ADIOI_Req_node *) (*request));
        *request = ADIO_REQUEST_NULL;
        /* status to be filled */
    }
    return done;
}
/*
 * ADIOI_XFS_WriteDone - completion test for a write request.
 *
 * Forwards to ADIOI_XFS_ReadDone() with the same arguments and hands its
 * result back to the caller unchanged.
 */
int ADIOI_XFS_WriteDone(ADIO_Request *request, ADIO_Status *status, int *error_code)
{
    int finished;

    finished = ADIOI_XFS_ReadDone(request, status, error_code);
    return finished;
}

Просмотреть файл

@ -7,6 +7,11 @@
#include "ad_xfs.h" #include "ad_xfs.h"
#include "adio_extern.h" #include "adio_extern.h"
#include <sys/ioctl.h>
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
void ADIOI_XFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code) void ADIOI_XFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code)
{ {
@ -37,7 +42,7 @@ void ADIOI_XFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *er
fl.l_len = fcntl_struct->diskspace; fl.l_len = fcntl_struct->diskspace;
#if defined(LINUX) && defined(MPISGI) #if defined(LINUX) && defined(MPISGI)
err = fcntl(fd->fd_sys, XFS_IOC_RESVSP64, &fl); err = ioctl(fd->fd_sys, XFS_IOC_RESVSP64, &fl);
#else #else
err = fcntl(fd->fd_sys, F_RESVSP64, &fl); err = fcntl(fd->fd_sys, F_RESVSP64, &fl);
#endif #endif

Просмотреть файл

@ -8,36 +8,76 @@
#include "ad_xfs.h" #include "ad_xfs.h"
#include "adio_extern.h" #include "adio_extern.h"
static unsigned xfs_direct_read_chunk_size;
static unsigned xfs_direct_write_chunk_size;
void ADIOI_XFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) void ADIOI_XFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{ {
char *value; char *value, * c;
int flag; int flag;
static char xfs_initialized = 0;
if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info)); if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
/* the nightly builds say somthing is calling MPI_Info_set w/ a null info, ADIOI_Info_set(fd->info, "direct_read", "false");
* so protect the calls to MPI_Info_set */ ADIOI_Info_set(fd->info, "direct_write", "false");
if (fd->info != MPI_INFO_NULL ) {
MPI_Info_set(fd->info, "direct_read", "false");
MPI_Info_set(fd->info, "direct_write", "false");
fd->direct_read = fd->direct_write = 0; fd->direct_read = fd->direct_write = 0;
if (!xfs_initialized) {
xfs_initialized = 1;
c = getenv("MPIO_DIRECT_READ_CHUNK_SIZE");
if (c) {
int io;
io = atoi(c);
if (io <= 0) {
fprintf(stderr,
"MPI: Ignoring an invalid setting for MPIO_DIRECT_READ_CHUNK_SIZE.\n"
" It must be set to a positive integer value.\n");
} else {
xfs_direct_read_chunk_size = io;
}
} else {
xfs_direct_read_chunk_size = 0;
} }
/* has user specified values for keys "direct_read" and "direct wirte"? */ c = getenv("MPIO_DIRECT_WRITE_CHUNK_SIZE");
if (c) {
int io;
io = atoi(c);
if (io <= 0) {
fprintf(stderr,
"MPI: Ignoring an invalid setting for MPIO_DIRECT_WRITE_CHUNK_SIZE.\n"
" It must be set to a positive integer value.\n");
} else {
xfs_direct_write_chunk_size = io;
}
} else {
xfs_direct_write_chunk_size = 0;
}
}
if (!fd->hints->initialized) {
fd->hints->fs_hints.xfs.read_chunk_sz =
xfs_direct_read_chunk_size;
fd->hints->fs_hints.xfs.write_chunk_sz =
xfs_direct_write_chunk_size;
}
/* has user specified values for keys "direct_read" and "direct write"? */
if (users_info != MPI_INFO_NULL) { if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && !strcmp(value, "true")) { if (flag && !strcmp(value, "true")) {
MPI_Info_set(fd->info, "direct_read", "true"); ADIOI_Info_set(fd->info, "direct_read", "true");
fd->direct_read = 1; fd->direct_read = 1;
} }
MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
value, &flag); value, &flag);
if (flag && !strcmp(value, "true")) { if (flag && !strcmp(value, "true")) {
MPI_Info_set(fd->info, "direct_write", "true"); ADIOI_Info_set(fd->info, "direct_write", "true");
fd->direct_write = 1; fd->direct_write = 1;
} }
@ -47,8 +87,10 @@ void ADIOI_XFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
/* set the values for collective I/O and data sieving parameters */ /* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code); ADIOI_GEN_SetInfo(fd, users_info, error_code);
/* Environment variables override MPI_Info hints */
if (ADIOI_Direct_read) fd->direct_read = 1; if (ADIOI_Direct_read) fd->direct_read = 1;
if (ADIOI_Direct_write) fd->direct_write = 1; if (ADIOI_Direct_write) fd->direct_write = 1;
/* environment variables checked in ADIO_Init */ /* environment variables checked in ADIO_Init */
*error_code = MPI_SUCCESS; *error_code = MPI_SUCCESS;

Просмотреть файл

@ -1,42 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_xfs.h"
/*
 * ADIOI_XFS_IreadContig - start a nonblocking contiguous read.
 *
 * Allocates a request, computes the transfer length as count times the
 * size of datatype, and submits it through ADIOI_XFS_aio() with wr == 0.
 * With ADIO_INDIVIDUAL the transfer starts at fd->fp_ind (the offset
 * argument is ignored) and fp_ind is advanced by the transfer length.
 *
 * NOTE(review): the request is marked queued and added to the async list
 * even when ADIOI_XFS_aio() reports a failure; only the async_count
 * increment is skipped in that case.
 */
void ADIOI_XFS_IreadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Request *request, int *error_code)
{
    static char myname[] = "ADIOI_XFS_IREADCONTIG";
    const int use_individual_fp = (file_ptr_type == ADIO_INDIVIDUAL);
    int type_bytes, nbytes, rc = 0;
    ADIO_Request req;

    req = ADIOI_Malloc_request();
    *request = req;
    req->optype = ADIOI_READ;
    req->fd = fd;
    req->datatype = datatype;

    MPI_Type_size(datatype, &type_bytes);
    nbytes = count * type_bytes;

    if (use_individual_fp) {
        offset = fd->fp_ind;
    }
    rc = ADIOI_XFS_aio(fd, buf, nbytes, offset, 0, &(req->handle));
    if (use_individual_fp) {
        fd->fp_ind += nbytes;
    }

    req->queued = 1;
    ADIOI_Add_req_to_list(request);

    /* the system file position is no longer known */
    fd->fp_sys_posn = -1;

    /* --BEGIN ERROR HANDLING-- */
    if (rc != 0) {
        MPIO_ERR_CREATE_CODE_ERRNO(myname, rc, error_code);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
    fd->async_count++;
}

Просмотреть файл

@ -1,145 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_xfs.h"
/*
 * ADIOI_XFS_IwriteContig - start a nonblocking contiguous write.
 *
 * Allocates a request, computes the transfer length as count times the
 * size of datatype, and submits it through ADIOI_XFS_aio() with wr == 1.
 * With ADIO_INDIVIDUAL the transfer starts at fd->fp_ind (the offset
 * argument is ignored) and fp_ind is advanced by the transfer length.
 *
 * NOTE(review): the request is marked queued and added to the async list
 * even when ADIOI_XFS_aio() reports a failure; only the async_count
 * increment is skipped in that case.
 */
void ADIOI_XFS_IwriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Request *request,
                            int *error_code)
{
    static char myname[] = "ADIOI_XFS_IWRITECONTIG";
    const int use_individual_fp = (file_ptr_type == ADIO_INDIVIDUAL);
    int type_bytes, nbytes, rc = 0;
    ADIO_Request req;

    req = ADIOI_Malloc_request();
    *request = req;
    req->optype = ADIOI_WRITE;
    req->fd = fd;
    req->datatype = datatype;

    MPI_Type_size(datatype, &type_bytes);
    nbytes = count * type_bytes;

    if (use_individual_fp) {
        offset = fd->fp_ind;
    }
    rc = ADIOI_XFS_aio(fd, buf, nbytes, offset, 1, &(req->handle));
    if (use_individual_fp) {
        fd->fp_ind += nbytes;
    }

    req->queued = 1;
    ADIOI_Add_req_to_list(request);

    /* the system file position is no longer known */
    fd->fp_sys_posn = -1;

    /* --BEGIN ERROR HANDLING-- */
    if (rc != 0) {
        MPIO_ERR_CREATE_CODE_ERRNO(myname, rc, error_code);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
    fd->async_count++;
}
/*
 * ADIOI_XFS_IwriteStrided - "nonblocking" strided write.
 *
 * The transfer is performed right away with the blocking
 * ADIO_WriteStrided(); the request handed back is only a record of it
 * (queued == 0, handle == 0). When HAVE_STATUS_SET_BYTES is defined and
 * the write succeeded, the request's byte count is set to count times the
 * size of datatype.
 */
void ADIOI_XFS_IwriteStrided(ADIO_File fd, void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Request *request, int
                             *error_code)
{
    ADIO_Status blocking_status;
    ADIO_Request req;
#ifdef HAVE_STATUS_SET_BYTES
    int elem_bytes;
#endif

    req = ADIOI_Malloc_request();
    *request = req;
    req->optype = ADIOI_WRITE;
    req->fd = fd;
    req->datatype = datatype;
    req->queued = 0;   /* nothing is handed to the system */
    req->handle = 0;

    /* call the blocking version. It is faster because it does data sieving. */
    ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                      offset, &blocking_status, error_code);

    fd->async_count++;

#ifdef HAVE_STATUS_SET_BYTES
    if (*error_code != MPI_SUCCESS) {
        return;
    }
    MPI_Type_size(datatype, &elem_bytes);
    req->nbytes = count * elem_bytes;
#endif
}
/* This function is for implementation convenience. It is not user-visible.
* It takes care of the differences in the interface for nonblocking I/O
* on various Unix machines! If wr==1 write, wr==0 read.
*
* Returns 0 on success, -errno on failure.
*/
int ADIOI_XFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
int wr, void *handle)
{
int err, error_code;
aiocb64_t *aiocbp;
aiocbp = (aiocb64_t *) ADIOI_Calloc(sizeof(aiocb64_t), 1);
if (((wr && fd->direct_write) || (!wr && fd->direct_read))
&& !(((long) buf) % fd->d_mem) && !(offset % fd->d_miniosz) &&
!(len % fd->d_miniosz) && (len >= fd->d_miniosz) &&
(len <= fd->d_maxiosz))
aiocbp->aio_fildes = fd->fd_direct;
else aiocbp->aio_fildes = fd->fd_sys;
aiocbp->aio_offset = offset;
aiocbp->aio_buf = buf;
aiocbp->aio_nbytes = len;
aiocbp->aio_reqprio = 0;
#ifdef AIO_SIGNOTIFY_NONE
/* SGI IRIX 6 */
aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE;
#else
aiocbp->aio_sigevent.sigev_signo = 0;
#endif
if (wr) err = aio_write64(aiocbp);
else err = aio_read64(aiocbp);
if (err != 0) {
if (errno == EAGAIN) {
/* exceeded the max. no. of outstanding requests.
complete all previous async. requests and try again. */
/* ADIOI_Complete_async(&error_code); */
if (error_code != MPI_SUCCESS) return -EIO;
if (wr) err = aio_write64(aiocbp);
else err = aio_read64(aiocbp);
while (err != 0) {
if (errno == EAGAIN) {
/* sleep and try again */
sleep(1);
if (wr) err = aio_write64(aiocbp);
else err = aio_read64(aiocbp);
}
else {
return -errno;
}
}
}
else {
return -errno;
}
}
*((aiocb64_t **) handle) = aiocbp;
return 0;
}

Просмотреть файл

@ -5,22 +5,26 @@
* See COPYRIGHT notice in top-level directory. * See COPYRIGHT notice in top-level directory.
*/ */
#define _GNU_SOURCE // for O_DIRECT
#include "ad_xfs.h" #include "ad_xfs.h"
#include <sys/ioctl.h>
#ifdef HAVE_STDDEF_H #ifdef HAVE_STDDEF_H
#include <stddef.h> #include <stddef.h>
#endif #endif
#if defined(MPISGI) #ifndef HAVE_LSEEK64
#include <mpitypedefs.h> #define lseek64 lseek
#include <mpifunctions.h>
#endif #endif
void ADIOI_XFS_Open(ADIO_File fd, int *error_code) void ADIOI_XFS_Open(ADIO_File fd, int *error_code)
{ {
int perm, amode, amode_direct; int perm, amode, amode_direct, factor;
unsigned int old_mask; unsigned int old_mask;
struct dioattr st; struct dioattr st;
static char myname[] = "ADIOI_XFS_OPEN"; static char myname[] = "ADIOI_XFS_OPEN";
unsigned read_chunk_sz = fd->hints->fs_hints.xfs.read_chunk_sz;
unsigned write_chunk_sz = fd->hints->fs_hints.xfs.write_chunk_sz;
if (fd->perm == ADIO_PERM_NULL) { if (fd->perm == ADIO_PERM_NULL) {
old_mask = umask(022); old_mask = umask(022);
@ -49,7 +53,7 @@ void ADIOI_XFS_Open(ADIO_File fd, int *error_code)
fd->fd_direct = open(fd->filename, amode_direct, perm); fd->fd_direct = open(fd->filename, amode_direct, perm);
if (fd->fd_direct != -1) { if (fd->fd_direct != -1) {
#if defined(LINUX) && defined(MPISGI) #if defined(MPISGI)
ioctl(fd->fd_direct, XFS_IOC_DIOINFO, &st); ioctl(fd->fd_direct, XFS_IOC_DIOINFO, &st);
#else #else
fcntl(fd->fd_direct, F_DIOINFO, &st); fcntl(fd->fd_direct, F_DIOINFO, &st);
@ -57,7 +61,34 @@ void ADIOI_XFS_Open(ADIO_File fd, int *error_code)
fd->d_mem = st.d_mem; fd->d_mem = st.d_mem;
fd->d_miniosz = st.d_miniosz; fd->d_miniosz = st.d_miniosz;
fd->d_maxiosz = st.d_maxiosz;
if (read_chunk_sz == 0) {
fd->hints->fs_hints.xfs.read_chunk_sz = st.d_maxiosz;
} else {
/*
* MPIO_DIRECT_READ_CHUNK_SIZE was set.
* Make read_chunk_sz a multiple of d_miniosz.
*/
factor = read_chunk_sz / fd->d_miniosz;
if (factor == 0 || read_chunk_sz != fd->d_miniosz * factor) {
fd->hints->fs_hints.xfs.read_chunk_sz =
fd->d_miniosz * (factor + 1);
}
}
if (write_chunk_sz == 0) {
fd->hints->fs_hints.xfs.write_chunk_sz = st.d_maxiosz;
} else {
/*
* MPIO_DIRECT_WRITE_CHUNK_SIZE was set.
* Make write_chunk_sz a multiple of d_miniosz.
*/
factor = write_chunk_sz / fd->d_miniosz;
if (factor == 0 || write_chunk_sz != fd->d_miniosz * factor) {
fd->hints->fs_hints.xfs.write_chunk_sz =
fd->d_miniosz * (factor + 1);
}
}
if (fd->d_mem > XFS_MEMALIGN) { if (fd->d_mem > XFS_MEMALIGN) {
FPRINTF(stderr, "MPI: Run-time Direct-IO memory alignment, %d, does not match compile-time value, %d.\n", FPRINTF(stderr, "MPI: Run-time Direct-IO memory alignment, %d, does not match compile-time value, %d.\n",

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше