1
1

ROMIO 3.1.4 refresh: import romio from mpich 3.1.4 tarball

Этот коммит содержится в:
Gilles Gouaillardet 2015-04-30 18:53:03 +09:00
родитель e2e91142d5
Коммит eacd434a02
543 изменённых файлов: 90149 добавлений и 0 удалений

55
ompi/mca/io/romio314/romio/.codingcheck Обычный файл
Просмотреть файл

@ -0,0 +1,55 @@
# Here are names that at least at one point were used within ROMIO.
# We should look at these and decide which we wish to allow and which
# should be replaced with something more ROMIO-specific.
# Each entry maps a cpp-define name (or regex) to the "romio" category
# for the coding-style checker's define audit.
%romioDefines = ( 'ROMIO_[A-Za-z0-9_]+' => romio,
                  'PROFILE' => romio,
                  'PRINT_ERR_MSG' => romio,
                  'HPUX' => romio,
                  'SPPUX' => romio,
                  'SX4' => romio,
                  'AIO_SUN' => romio,
                  'AIO_HANDLE_IN_AIOCB' => romio,
                  'NO_FD_IN_AIOCB' => romio,
                  'NO_AIO' => romio,
                  'AIO_PRIORITY_DEFAULT' => romio,
                  'AIO_SIGNOTIFY_NONE' => romio,
                  'MPISGI' => romio,
                  'CRAY' => romio,
                  'PARAGON' => romio,
                  'FREEBSD' => romio,
                  'LINUX' => romio,
                  'tflops' => romio,
                  'NFS' => romio,
                  'XFS' => romio,
                  'CB_CONFIG_LIST_DEBUG' => romio,
                  'SFS' => romio,
                  'HFS' => romio,
                  'UFS' => romio,
                  'PVFS_.+' => romio,
                  'MPI_hpux' => romio,
                  'FORTRANCAPS' => romio,
                  'MPILAM' => romio,
                  'NEEDS_ADIOCB_T' => romio,
                  'AGG_DEBUG' => romio,
                  'SOLARIS' => romio,
                  'IRIX' => romio,
                  'AIX' => romio,
                  'DEC' => romio,
                  'NEEDS_MPI_TEST' => romio,
                  'PFS' => romio,
                  'PIOFS' => romio,
                  # duplicate 'MPICH' key removed (leftover of the global
                  # MPICH2 -> MPICH rename; a repeated hash key is dead code)
                  'MPICH' => romio,
                  'MPI_OFFSET_IS_INT' => romio,
                  'MPI_COMBINER_NAMED' => romio,
                  '_UNICOS' => romio,
                  'MPIHP' => romio,
    );
# Only invoke this function if the function is defined (in case the
# user removed the cpp defines check with -rmchecks=cppdefines)
if (defined(&PushDefinesNames)) {
    &PushDefinesNames( "romioDefines", "tree", "add" );
}
1;

38
ompi/mca/io/romio314/romio/.config_params Обычный файл
Просмотреть файл

@ -0,0 +1,38 @@
__sun4_
__rs6000_
__paragon_
__solaris_
__solaris86_
__tflop_
__tflops_
__hpux_
__sppux_
__SX4_
__sgi_
__sgi5_
__IRIX_
__IRIX32_
__IRIXN32_
__IRIX64_
__alpha_
__ALPHA_
__freebsd_
__netbsd_
__LINUX_
__LINUX_ALPHA_
__CRAY_
__Darwin_
__nfs_
__ufs_
__pfs_
__piofs_
__pvfs_
__testfs_
__xfs_
__hfs_
__sfs_
__mpich_mpi
__sgi_mpi
__hp_mpi
__cray_mpi
__lam_mpi

41
ompi/mca/io/romio314/romio/COPYRIGHT Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
COPYRIGHT
The following is a notice of limited availability of the code and
disclaimer, which must be included in the prologue of the code and in
all source listings of the code.
Copyright (C) 1997 University of Chicago
Permission is hereby granted to use, reproduce, prepare derivative
works, and to redistribute to others.
The University of Chicago makes no representations as to the suitability,
operability, accuracy, or correctness of this software for any purpose.
It is provided "as is" without express or implied warranty.
This software was authored by:
Rajeev Thakur: (630) 252-1682; thakur@mcs.anl.gov
Mathematics and Computer Science Division
Argonne National Laboratory, Argonne IL 60439, USA
GOVERNMENT LICENSE
Portions of this material resulted from work developed under a U.S.
Government Contract and are subject to the following license: the
Government is granted for itself and others acting on its behalf a
paid-up, nonexclusive, irrevocable worldwide license in this computer
software to reproduce, prepare derivative works, and perform publicly
and display publicly.
DISCLAIMER
This computer code material was prepared, in part, as an account of
work sponsored by an agency of the United States Government. Neither
the United States Government, nor the University of Chicago, nor any
of their employees, makes any warranty express or implied, or assumes
any legal liability or responsibility for the accuracy, completeness,
or usefulness of any information, apparatus, product, or process
disclosed, or represents that its use would not infringe privately
owned rights.

172
ompi/mca/io/romio314/romio/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,172 @@
# -*- Mode: Makefile; -*-
#
# (C) 2011 by Argonne National Laboratory.
# See COPYRIGHT in top-level directory.
#
## TODO: need to write an automakefile that handles two primary cases:
## 1) that ROMIO is being embedded within the MPI library, as in MPICH or Open
## MPI
## 2) that ROMIO is being built standalone, old-school style. This case is
## basically unused in modern practice.
# help autoreconf and friends realize where the macros live
ACLOCAL_AMFLAGS = -I confdb
# empty variable initializations so that later code can append (+=)
include_HEADERS =
nodist_include_HEADERS =
noinst_HEADERS =
AM_CPPFLAGS =
EXTRA_DIST =
SUFFIXES =
doc1_src_txt =
# ------------------------------------------------------------------------
# variables to be populated by the included Makefile.mk fragments:
# These are files that contain MPI routines (e.g., MPI_File_open).
# In MPICH these will have an MPI_ and a PMPI_ version. Other implementations
# (like OMPI) only want these to be MPI_ routines, possibly with some
# name-shifting prefix.
romio_mpi_sources =
# regular old source files that implement ROMIO, such as ADIO code
romio_other_sources =
# code that may need to be "up" called from the MPI library and/or is
# MPI-implementation-specific in some way
glue_sources =
# ------------------------------------------------------------------------
# when building under MPICH we must be able to find mpi.h
AM_CPPFLAGS += $(MPI_H_INCLUDE)
# ------------------------------------------------------------------------
# handle the "include" directory here
AM_CPPFLAGS += -I$(top_builddir)/include -I$(top_srcdir)/include
# nodist_ b/c these are created by config.status and should not be distributed
nodist_include_HEADERS += include/mpio.h include/mpiof.h
# ------------------------------------------------------------------------
# SUBDIRS stays empty: nothing is built by recursion. DIST_SUBDIRS ensures
# the test directories still ship in "make dist" tarballs.
SUBDIRS =
DIST_SUBDIRS = test test-internal
# for the sake of parallel make and avoiding an excessive number of convenience
# libs, we use a subdir automake fragment strategy
# (these fragments append to the romio_*_sources/glue_sources vars above)
include mpi-io/Makefile.mk
include adio/Makefile.mk
EXTRA_DIST += autogen.sh
if BUILD_ROMIO_EMBEDDED
# Build a libtool convenience library that the enclosing MPI implementation can
# use by adding it to the right _LIBADD variable.
noinst_LTLIBRARIES = libromio.la
libromio_la_SOURCES = $(romio_mpi_sources) $(romio_other_sources) $(glue_sources)
## NOTE: ROMIO's old build system builds a bunch of _foo.o objects that contain
## PMPI_ implementations as well as calls to only other PMPI routines. In
## MPICH, these are the objects that need to go into libmpi, while the foo.o
## objects should go into libpmpi. Furthermore, the -D option for ROMIO's
## source files is different and inverted (in the boolean sense) compared with
## MPICH's definition. And ROMIO was dumping all of the symbols into the main
## libmpi library, regardless of the separate profiling library's existence.
##
## Annoying, right?
if BUILD_PROFILING_LIB
# The current best strategy for now is to build the PMPI symbols as a separate
# convenience lib to permit adding the special "-D..." argument for all objects.
# MPICH will then link in both convenience library into libmpi, since it
# won't work very well the other way around.
noinst_LTLIBRARIES += libpromio.la
libpromio_la_SOURCES = $(romio_mpi_sources)
libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
endif BUILD_PROFILING_LIB
else !BUILD_ROMIO_EMBEDDED
# standalone build: install a real library instead of a convenience lib
lib_LTLIBRARIES = libromio.la
libromio_la_SOURCES = $(romio_mpi_sources) $(romio_other_sources) $(glue_sources)
if BUILD_PROFILING_LIB
# BUGFIX: libpromio.la was previously never declared in the standalone case,
# so the following _SOURCES/_CPPFLAGS assignments were dead and the profiling
# library was silently not built or installed.
lib_LTLIBRARIES += libpromio.la
libpromio_la_SOURCES = $(romio_mpi_sources)
libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
endif BUILD_PROFILING_LIB
endif !BUILD_ROMIO_EMBEDDED
# --------------------------------------------------------------------------
.PHONY: coverage
# BUGFIX: this previously read $(libmpl_la_SOURCES), copied from MPL's
# Makefile.am. No libmpl target exists in this file, so "make coverage"
# iterated over an empty list and produced no reports.
gcov_sources = $(libromio_la_SOURCES)
# assumes that these sources were compiled appropriately ("-fprofile-arcs"
# and "-ftest-coverage")
# For each source file, run gcov next to its objects and move the report back
# beside the source; then recurse into SUBDIRS (the trailing "-" is a sentinel
# so the loop body is never empty).
coverage:
	@for file in $(gcov_sources) ; do \
	    dir=`dirname $$file` ; \
	    bname=`basename $$file` ; \
	    aux=`echo $$bname | sed -e 's,\.*$$,,'` ; \
	    echo "( $(GCOV) -b -f -o $$file $$file && mv $${bname}.gcov $$dir )" ; \
	    ( $(GCOV) -b -f -o $$file $$file && mv $${bname}.gcov $$dir ) ; \
	    rm -f *.gcov ; \
	done
	for subdir in $(SUBDIRS) - ; do \
	    if test $$subdir = "-" ; then break ; fi ; \
	    ( cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) coverage ) ; \
	done
# --------------------------------------------------------------------------
# Documentation generation: man pages and HTML pages are extracted from the
# structured comments in the MPI-level sources via the DOCTEXT tool, using
# fake ".man-phony"/".html-phony" suffix rules so automake's suffix-rule
# machinery drives DOCTEXT over each source file.
.PHONY: mandoc mandoc-local htmldoc htmldoc-local
SUFFIXES += .man-phony .html-phony .man1-phony .html1-phony .txt
# "make V=1" support for our documentation recipes
# (standard automake silent-rules idiom: V=1 prints the full command,
#  default prints a short "DOCTEXTMAN"/"DOCTEXTHTML" tag)
doctextman_verbose = $(doctextman_verbose_$(V))
doctextman_verbose_ = $(doctextman_verbose_$(AM_DEFAULT_VERBOSITY))
doctextman_verbose_0 = @echo "  DOCTEXTMAN " $@;
doctexthtml_verbose = $(doctexthtml_verbose_$(V))
doctexthtml_verbose_ = $(doctexthtml_verbose_$(AM_DEFAULT_VERBOSITY))
doctexthtml_verbose_0 = @echo "  DOCTEXTHTML " $@;
# Build dir paths where the man pages will be created. Will usually be
# overridden by MPICH make.
mandoc_path1=$(abs_top_builddir)/man/man1
mandoc_path3=$(abs_top_builddir)/man/man3
htmldoc_path1=$(abs_top_builddir)/www/www1
htmldoc_path3=$(abs_top_builddir)/www/www3
doctext_docnotes=
# Provide an easily replaced url root for the generated index file.
# You can override this with URL desired in the index file generated by doctext.
# You can ignore this if you don't use mapnames or tohtml to add links
# to the MPI manual pages to documents.
htmldoc_root3="--your-url-here--"
# .c sources produce section-3 pages; .txt sources produce section-1 pages.
.c.man-phony:
	$(doctextman_verbose)$(DOCTEXT) -man -mpath $(mandoc_path3) -ext 3 \
	-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
.c.html-phony:
	$(doctexthtml_verbose)$(DOCTEXT) -html -mpath $(htmldoc_path3) \
	-heading MPI -quotefmt -nolocation \
	-index $(htmldoc_path3)/mpi.cit -indexdir $(htmldoc_root3) \
	$(doctext_docnotes) $<
.txt.man1-phony:
	$(doctextman_verbose)$(DOCTEXT) -man -mpath $(mandoc_path1) -ext 1 \
	-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
.txt.html1-phony:
	$(doctexthtml_verbose)$(DOCTEXT) -html -mpath $(htmldoc_path1) \
	-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
# use mandoc-local target to force directory creation before running DOCTEXT
mandoc:
	test -d $(mandoc_path1) || $(MKDIR_P) $(mandoc_path1)
	test -d $(mandoc_path3) || $(MKDIR_P) $(mandoc_path3)
	$(MAKE) $(AM_MAKEFLAGS) mandoc-local
mandoc-local: $(romio_mpi_sources:.c=.man-phony) $(doc1_src_txt:.txt=.man1-phony)
# use htmldoc-local target to force directory creation before running DOCTEXT
htmldoc:
	test -d $(top_builddir)/www/www1 || $(MKDIR_P) $(top_builddir)/www/www1
	test -d $(top_builddir)/www/www3 || $(MKDIR_P) $(top_builddir)/www/www3
	$(MAKE) $(AM_MAKEFLAGS) htmldoc-local
htmldoc-local: $(romio_mpi_sources:.c=.html-phony) $(doc1_src_txt:.txt=.html1-phony)
# --------------------------------------------------------------------------

660
ompi/mca/io/romio314/romio/README Обычный файл
Просмотреть файл

@ -0,0 +1,660 @@
ROMIO: A High-Performance, Portable MPI-IO Implementation
Version 2008-03-09
Major Changes in this version:
------------------------------
* Fixed performance problems with the darray and subarray datatypes
when using MPICH.
* Better support for building against existing MPICH-1 and MPICH2 versions.
When building against an existing MPICH-1 installation, use the
"--with-mpi=mpich1" option to ROMIO configure. For MPICH2, use the
"--with-mpi=mpich2" option. These will allow ROMIO to take advantage
of internal features of these implementations.
* Deprecation of SFS, HFS, and PIOFS implementations.
These are no longer actively supported, although the code will continue
to be distributed for now.
* Initial support for the Panasas PanFS filesystem.
PanFS allows users to specify the layout of a file at file-creation time.
Layout information includes the number of StorageBlades (SB)
across which the data is stored, the number of SBs across which a
parity stripe is written, and the number of consecutive stripes that
are placed on the same set of SBs. The panfs_layout_* hints are only
used if supplied at file-creation time.
panfs_layout_type - Specifies the layout of a file:
2 = RAID0
3 = RAID5 Parity Stripes
panfs_layout_stripe_unit - The size of the stripe unit in bytes
panfs_layout_total_num_comps - The total number of StorageBlades a file
is striped across.
panfs_layout_parity_stripe_width - If the layout type is RAID5 Parity
Stripes, this hint specifies the
number of StorageBlades in a parity
stripe.
panfs_layout_parity_stripe_depth - If the layout type is RAID5 Parity
Stripes, this hint specifies the
number of contiguous parity stripes written
across the same set of SBs.
panfs_layout_visit_policy - If the layout type is RAID5 Parity Stripes,
the policy used to determine the parity
stripe a given file offset is written to:
1 = Round Robin
PanFS supports the "concurrent write" (CW) mode, where groups of cooperating
clients can disable the PanFS consistency mechanisms and use their own
consistency protocol. Clients participating in concurrent write mode use
application specific information to improve performance while maintaining
file consistency. All clients accessing the file(s) must enable concurrent
write mode. If any client does not enable concurrent write mode, then the
PanFS consistency protocol will be invoked. Once a file is opened in CW mode
on a machine, attempts to open a file in non-CW mode will fail with
EACCES. If a file is already opened in non-CW mode, attempts to open
the file in CW mode will fail with EACCES. The following hint is
used to enable concurrent write mode.
panfs_concurrent_write - If set to 1 at file open time, the file
is opened using the PanFS concurrent write
mode flag. Concurrent write mode is not a
persistent attribute of the file.
Below is an example PanFS layout using the following parameters:
- panfs_layout_type = 3
- panfs_layout_total_num_comps = 100
- panfs_layout_parity_stripe_width = 10
- panfs_layout_parity_stripe_depth = 8
- panfs_layout_visit_policy = 1
Parity Stripe Group 1 Parity Stripe Group 2 . . . Parity Stripe Group 10
---------------------- ---------------------- --------------------
SB1 SB2 ... SB10 SB11 SB12 ... SB20 ... SB91 SB92 ... SB100
----------------------- ----------------------- ---------------------
D1 D2 ... D10 D91 D92 ... D100 D181 D182 ... D190
D11 D12 D20 D101 D102 D110 D191 D192 D193
D21 D22 D30 . . . . . .
D31 D32 D40
D41 D42 D50
D51 D52 D60
D61 D62 D70
D71 D72 D80
D81 D82 D90 D171 D172 D180 D261 D262 D270
D271 D272 D273 . . . . . .
...
* Initial support for the Globus GridFTP filesystem. Work contributed by Troy
Baer (troy@osc.edu).
Major Changes in Version 1.2.5:
------------------------------
* Initial support for MPICH-2
* fix for a bug in which ROMIO would get confused for some permutations
of the aggregator list
* direct io on IRIX's XFS should work now
* fixed an issue with the Fortran bindings that would cause them to fail
when some compilers tried to build them.
* Initial support for deferred opens
Major Changes in Version 1.2.4:
------------------------------
* Added section describing ROMIO MPI_FILE_SYNC and MPI_FILE_CLOSE behavior to
User's Guide
* Bug removed from PVFS ADIO implementation regarding resize operations
* Added support for PVFS listio operations, including hints to control use
Major Changes in Version 1.2.3:
-------------------------------
* Enhanced aggregation control via cb_config_list, romio_cb_read,
and romio_cb_write hints
* Asynchronous IO can be enabled under Linux with the --enable-aio argument
to configure
* Additional PVFS support
* Additional control over data sieving with romio_ds_read hint
* NTFS ADIO implementation integrated into source tree
* testfs ADIO implementation added for debugging purposes
Major Changes in Version 1.0.3:
-------------------------------
* When used with MPICH 1.2.1, the MPI-IO functions return proper error codes
and classes, and the status object is filled in.
* On SGI's XFS file system, ROMIO can use direct I/O even if the
user's request does not meet the various restrictions needed to use
direct I/O. ROMIO does this by doing part of the request with
buffered I/O (until all the restrictions are met) and doing the rest
with direct I/O. (This feature hasn't been tested rigorously. Please
check for errors.)
By default, ROMIO will use only buffered I/O. Direct I/O can be
enabled either by setting the environment variables MPIO_DIRECT_READ
and/or MPIO_DIRECT_WRITE to TRUE, or on a per-file basis by using
the info keys "direct_read" and "direct_write".
Direct I/O will result in higher performance only if you are
accessing a high-bandwidth disk system. Otherwise, buffered I/O is
better and is therefore used as the default.
* Miscellaneous bug fixes.
Major Changes Version 1.0.2:
---------------------------
* Implemented the shared file pointer functions and
split collective I/O functions. Therefore, the main
components of the MPI I/O chapter not yet implemented are
file interoperability and error handling.
* Added support for using "direct I/O" on SGI's XFS file system.
Direct I/O is an optional feature of XFS in which data is moved
directly between the user's buffer and the storage devices, bypassing
the file-system cache. This can improve performance significantly on
systems with high disk bandwidth. Without high disk bandwidth,
regular I/O (that uses the file-system cache) performs better.
ROMIO, therefore, does not use direct I/O by default. The user can
turn on direct I/O (separately for reading and writing) either by
using environment variables or by using MPI's hints mechanism (info).
To use the environment-variables method, do
setenv MPIO_DIRECT_READ TRUE
setenv MPIO_DIRECT_WRITE TRUE
To use the hints method, the two keys are "direct_read" and "direct_write".
By default their values are "false". To turn on direct I/O, set the values
to "true". The environment variables have priority over the info keys.
In other words, if the environment variables are set to TRUE, direct I/O
will be used even if the info keys say "false", and vice versa.
Note that direct I/O must be turned on separately for reading
and writing.
The environment-variables method assumes that the environment
variables can be read by each process in the MPI job. This is
not guaranteed by the MPI Standard, but it works with SGI's MPI
and the ch_shmem device of MPICH.
* Added support (new ADIO device, ad_pvfs) for the PVFS parallel
file system for Linux clusters, developed at Clemson University
(see http://www.parl.clemson.edu/pvfs ). To use it, you must first install
PVFS and then when configuring ROMIO, specify "-file_system=pvfs" in
addition to any other options to "configure". (As usual, you can configure
for multiple file systems by using "+"; for example,
"-file_system=pvfs+ufs+nfs".) You will need to specify the path
to the PVFS include files via the "-cflags" option to configure,
for example, "configure -cflags=-I/usr/pvfs/include". You
will also need to specify the full path name of the PVFS library.
The best way to do this is via the "-lib" option to MPICH's
configure script (assuming you are using ROMIO from within MPICH).
* Uses weak symbols (where available) for building the profiling version,
i.e., the PMPI routines. As a result, the size of the library is reduced
considerably.
* The Makefiles use "virtual paths" if supported by the make utility. GNU make
supports it, for example. This feature allows you to untar the
distribution in some directory, say a slow NFS directory,
and compile the library (the .o files) in another
directory, say on a faster local disk. For example, if the tar file
has been untarred in an NFS directory called /home/thakur/romio,
one can compile it in a different directory, say /tmp/thakur, as follows:
cd /tmp/thakur
/home/thakur/romio/configure
make
The .o files will be created in /tmp/thakur; the library will be created in
/home/thakur/romio/lib/$ARCH/libmpio.a .
This method works only if the make utility supports virtual paths.
If the default make does not, you can install GNU make which does,
and specify it to configure as
/home/thakur/romio/configure -make=/usr/gnu/bin/gmake (or whatever)
* Lots of miscellaneous bug fixes and other enhancements.
* This version is included in MPICH 1.2.0. If you are using MPICH, you
need not download ROMIO separately; it gets built as part of MPICH.
The previous version of ROMIO is included in LAM, HP MPI, SGI MPI, and
NEC MPI. NEC has also implemented the MPI-IO functions missing
in ROMIO, and therefore NEC MPI has a complete implementation
of MPI-IO.
Major Changes in Version 1.0.1:
------------------------------
* This version is included in MPICH 1.1.1 and HP MPI 1.4.
* Added support for NEC SX-4 and created a new device ad_sfs for
NEC SFS file system.
* New devices ad_hfs for HP/Convex HFS file system and ad_xfs for
SGI XFS file system.
* Users no longer need to prefix the filename with the type of
file system; ROMIO determines the file-system type on its own.
* Added support for 64-bit file sizes on IBM PIOFS, SGI XFS,
HP/Convex HFS, and NEC SFS file systems.
* MPI_Offset is an 8-byte integer on machines that support 8-byte integers.
It is of type "long long" in C and "integer*8" in Fortran.
With a Fortran 90 compiler, you can use either integer*8 or
integer(kind=MPI_OFFSET_KIND).
If you printf an MPI_Offset in C, remember to use %lld
or %ld as required by your compiler. (See what is used in the test
program romio/test/misc.c.)
* On some machines, ROMIO detects at configure time that "long long" is
either not supported by the C compiler or it doesn't work properly.
In such cases, configure sets MPI_Offset to long in C and integer in
Fortran. This happens on Intel Paragon, Sun4, and FreeBSD.
* Added support for passing hints to the implementation via the MPI_Info
parameter. ROMIO understands the following hints (keys in MPI_Info object):
/* on all file systems */
cb_buffer_size - buffer size for collective I/O
cb_nodes - no. of processes that actually perform I/O in collective I/O
ind_rd_buffer_size - buffer size for data sieving in independent reads
/* on all file systems except IBM PIOFS */
ind_wr_buffer_size - buffer size for data sieving in independent writes
/* ind_wr_buffer_size is ignored on PIOFS because data sieving
cannot be done for writes since PIOFS doesn't support file locking */
/* on Intel PFS and IBM PIOFS only. These hints are understood only if
supplied at file-creation time. */
striping_factor - no. of I/O devices to stripe the file across
striping_unit - the striping unit in bytes
start_iodevice - the number of the I/O device from which to start
striping (between 0 and (striping_factor-1))
/* on Intel PFS only. */
pfs_svr_buf - turn on or off PFS server buffering by setting the value
to "true" or "false", case-sensitive.
If ROMIO doesn't understand a hint, or if the value is invalid, the hint
will be ignored. The values of hints being used by ROMIO at any time
can be obtained via MPI_File_get_info.
General Information
-------------------
ROMIO is a high-performance, portable implementation of MPI-IO (the
I/O chapter in MPI). ROMIO's home page is at
http://www.mcs.anl.gov/romio . The MPI standard is available at
http://www.mpi-forum.org/docs/docs.html .
This version of ROMIO includes everything defined in the MPI I/O
chapter except support for file interoperability and
user-defined error handlers for files. The subarray and
distributed array datatype constructor functions from Chapter 4
(Sec. 4.14.4 & 4.14.5) have been implemented. They are useful for
accessing arrays stored in files. The functions MPI_File_f2c and
MPI_File_c2f (Sec. 4.12.4) are also implemented.
C, Fortran, and profiling interfaces are provided for all functions
that have been implemented.
Please read the limitations of this version of ROMIO that are listed
below (e.g., MPIO_Request object, restriction to homogeneous
environments).
This version of ROMIO runs on at least the following machines: IBM SP;
Intel Paragon; HP Exemplar; SGI Origin2000; Cray T3E; NEC SX-4; other
symmetric multiprocessors from HP, SGI, DEC, Sun, and IBM; and networks of
workstations (Sun, SGI, HP, IBM, DEC, Linux, and FreeBSD). Supported
file systems are IBM PIOFS, Intel PFS, HP/Convex HFS, SGI XFS, NEC
SFS, PVFS, NFS, and any Unix file system (UFS).
This version of ROMIO is included in MPICH 1.2.3; an earlier version
is included in at least the following MPI implementations: LAM, HP
MPI, SGI MPI, and NEC MPI.
Note that proper I/O error codes and classes are returned and the
status variable is filled only when used with MPICH 1.2.1 or later.
You can open files on multiple file systems in the same program. The
only restriction is that the directory where the file is to be opened
must be accessible from the process opening the file. For example, a
process running on one workstation may not be able to access a
directory on the local disk of another workstation, and therefore
ROMIO will not be able to open a file in such a directory. NFS-mounted
files can be accessed.
An MPI-IO file created by ROMIO is no different than any other file
created by the underlying file system. Therefore, you may use any of
the commands provided by the file system to access the file, e.g., ls,
mv, cp, rm, ftp.
Using ROMIO on NFS
------------------
To use ROMIO on NFS, file locking with fcntl must work correctly on
the NFS installation. On some installations, fcntl locks don't work.
To get them to work, you need to use Version 3 of NFS, ensure that the
lockd daemon is running on all the machines, and have the system
administrator mount the NFS file system with the "noac" option (no
attribute caching). Turning off attribute caching may reduce
performance, but it is necessary for correct behavior.
The following are some instructions we received from Ian Wells of HP
for setting the noac option on NFS. We have not tried them
ourselves. We are including them here because you may find
them useful. Note that some of the steps may be specific to HP
systems, and you may need root permission to execute some of the
commands.
>1. first confirm you are running nfs version 3
>
>rpcinfo -p `hostname` | grep nfs
>
>ie
> goedel >rpcinfo -p goedel | grep nfs
> 100003 2 udp 2049 nfs
> 100003 3 udp 2049 nfs
>
>
>2. then edit /etc/fstab for each nfs directory read/written by MPIO
> on each machine used for multihost MPIO.
>
> Here is an example of a correct fstab entry for /epm1:
>
> ie grep epm1 /etc/fstab
>
> ROOOOT 11>grep epm1 /etc/fstab
> gershwin:/epm1 /rmt/gershwin/epm1 nfs bg,intr,noac 0 0
>
> if the noac option is not present, add it
> and then remount this directory
> on each of the machines that will be used to share MPIO files
>
>ie
>
>ROOOOT >umount /rmt/gershwin/epm1
>ROOOOT >mount /rmt/gershwin/epm1
>
>3. Confirm that the directory is mounted noac:
>
>ROOOOT >grep gershwin /etc/mnttab
>gershwin:/epm1 /rmt/gershwin/epm1 nfs
>noac,acregmin=0,acregmax=0,acdirmin=0,acdirmax=0 0 0 899911504
ROMIO Installation Instructions
-------------------------------
Since ROMIO is included in MPICH, LAM, HP MPI, SGI MPI, and NEC MPI,
you don't need to install it separately if you are using any of these
MPI implementations. If you are using some other MPI, you can
configure and build ROMIO as follows:
Untar the tar file as
gunzip -c romio.tar.gz | tar xvf -
OR
zcat romio.tar.Z | tar xvf -
THEN
cd romio
./configure
make
Some example programs and a Makefile are provided in the romio/test directory.
Run the examples the way you would run any MPI program. Each program takes
the filename as a command-line argument "-fname filename".
The configure script by default configures ROMIO for the file systems
most likely to be used on the given machine. If you wish, you can
explicitly specify the file systems by using the "-file_system" option
to configure. Multiple file systems can be specified by using "+" as a
separator. For example,
./configure -file_system=xfs+nfs
For the entire list of options to configure do
./configure -h | more
After building a specific version as above, you can install it in a
particular directory with
make install PREFIX=/usr/local/romio (or whatever directory you like)
or just
make install (if you used -prefix at configure time)
If you intend to leave ROMIO where you built it, you should NOT install it
(install is used only to move the necessary parts of a built ROMIO to
another location). The installed copy will have the include files,
libraries, man pages, and a few other odds and ends, but not the whole
source tree. It will have a test directory for testing the
installation and a location-independent Makefile built during
installation, which users can copy and modify to compile and link
against the installed copy.
To rebuild ROMIO with a different set of configure options, do
make distclean
to clean everything including the Makefiles created by configure.
Then run configure again with the new options, followed by make.
Testing ROMIO
-------------
To test if the installation works, do
make testing
in the romio/test directory. This calls a script that runs the test
programs and compares the results with what they should be. By
default, "make testing" causes the test programs to create files in
the current directory and use whatever file system that corresponds
to. To test with other file systems, you need to specify a filename in
a directory corresponding to that file system as follows:
make testing TESTARGS="-fname=/foo/piofs/test"
Compiling and Running MPI-IO Programs
-------------------------------------
If ROMIO is not already included in the MPI implementation, you need
to include the file mpio.h for C or mpiof.h for Fortran in your MPI-IO
program.
Note that on HP machines running HPUX and on NEC SX-4, you need to
compile Fortran programs with mpifort, because the f77 compilers on
these machines don't support 8-byte integers.
With MPICH, HP MPI, or NEC MPI, you can compile MPI-IO programs as
mpicc foo.c
or
mpif77 foo.f
or
mpifort foo.f
As mentioned above, mpifort is preferred over mpif77 on HPUX and NEC
because the f77 compilers on those machines do not support 8-byte integers.
With SGI MPI, you can compile MPI-IO programs as
cc foo.c -lmpi
or
f77 foo.f -lmpi
or
f90 foo.f -lmpi
With LAM, you can compile MPI-IO programs as
hcc foo.c -lmpi
or
hf77 foo.f -lmpi
If you have built ROMIO with some other MPI implementation, you can
compile MPI-IO programs by explicitly giving the path to the include
file mpio.h or mpiof.h and explicitly specifying the path to the
library libmpio.a, which is located in $(ROMIO_HOME)/lib/$(ARCH)/libmpio.a .
Run the program as you would run any MPI program on the machine. If
you use mpirun, make sure you use the correct mpirun for the MPI
implementation you are using. For example, if you are using MPICH on
an SGI machine, make sure that you use MPICH's mpirun and not SGI's
mpirun.
The Makefile in the romio/test directory illustrates how to compile
and link MPI-IO programs.
Limitations of this version of ROMIO
------------------------------------
* When used with any MPI implementation other than MPICH 1.2.1 (or later),
the "status" argument is not filled in any MPI-IO function. Consequently,
MPI_Get_count and MPI_Get_elements will not work when passed the status
object from an MPI-IO operation.
* All nonblocking I/O functions use a ROMIO-defined "MPIO_Request"
object instead of the usual "MPI_Request" object. Accordingly, two
functions, MPIO_Test and MPIO_Wait, are provided to wait and test on
these MPIO_Request objects. They have the same semantics as MPI_Test
and MPI_Wait.
int MPIO_Test(MPIO_Request *request, int *flag, MPI_Status *status);
int MPIO_Wait(MPIO_Request *request, MPI_Status *status);
The usual functions MPI_Test, MPI_Wait, MPI_Testany, etc., will not
work for nonblocking I/O.
* This version works only on a homogeneous cluster of machines,
and only the "native" file data representation is supported.
* When used with any MPI implementation other than MPICH 1.2.1 (or later),
all MPI-IO functions return only two possible error codes---MPI_SUCCESS
on success and MPI_ERR_UNKNOWN on failure.
* Shared file pointers are not supported on PVFS and IBM PIOFS file
systems because they don't support fcntl file locks, and ROMIO uses
that feature to implement shared file pointers.
* On HP machines running HPUX and on NEC SX-4, you need to compile
Fortran programs with mpifort instead of mpif77, because the f77
compilers on these machines don't support 8-byte integers.
* The file-open mode MPI_MODE_EXCL does not work on Intel PFS file system,
due to a bug in PFS.
Usage Tips
----------
* When using ROMIO with SGI MPI, you may sometimes get an error
message from SGI MPI: ``MPI has run out of internal datatype
entries. Please set the environment variable MPI_TYPE_MAX for
additional space.'' If you get this error message, add this line to
your .cshrc file:
setenv MPI_TYPE_MAX 65536
Use a larger number if you still get the error message.
* If a Fortran program uses a file handle created using ROMIO's C
interface, or vice-versa, you must use the functions MPI_File_c2f
or MPI_File_f2c. Such a situation occurs,
for example, if a Fortran program uses an I/O library written in C
with MPI-IO calls. Similar functions MPIO_Request_f2c and
MPIO_Request_c2f are also provided.
* For Fortran programs on the Intel Paragon, you may need
to provide the complete path to mpif.h in the include statement, e.g.,
include '/usr/local/mpich/include/mpif.h'
instead of
include 'mpif.h'
This is because the -I option to the Paragon Fortran compiler if77
doesn't work correctly. It always looks in the default directories first
and, therefore, picks up Intel's mpif.h, which is actually the
mpif.h of an older version of MPICH.
ROMIO Users Mailing List
------------------------
Please register your copy of ROMIO with us by sending email
to majordomo@mcs.anl.gov with the message
subscribe romio-users
This will enable us to notify you of new releases of ROMIO as well as
bug fixes.
Reporting Bugs
--------------
If you have trouble, first check the users guide (in
romio/doc/users-guide.ps.gz). Then check the on-line list of known
bugs and patches at http://www.mcs.anl.gov/romio .
Finally, if you still have problems, send a detailed message containing:
The type of system (often, uname -a)
The output of configure
The output of make
Any programs or tests
to romio-maint@mcs.anl.gov .
ROMIO Internals
---------------
A key component of ROMIO that enables such a portable MPI-IO
implementation is an internal abstract I/O device layer called
ADIO. Most users of ROMIO will not need to deal with the ADIO layer at
all. However, ADIO is useful to those who want to port ROMIO to some
other file system. The ROMIO source code and the ADIO paper
(see doc/README) will help you get started.
MPI-IO implementation issues are discussed in our IOPADS '99 paper,
"On Implementing MPI-IO Portably and with High Performance."
All ROMIO-related papers are available online from
http://www.mcs.anl.gov/romio.
Learning MPI-IO
---------------
The book "Using MPI-2: Advanced Features of the Message-Passing
Interface," published by MIT Press, provides a tutorial introduction to
all aspects of MPI-2, including parallel I/O. It has lots of example
programs. See http://www.mcs.anl.gov/mpi/usingmpi2 for further
information about the book.

46
ompi/mca/io/romio314/romio/adio/Makefile.mk Обычный файл
Просмотреть файл

@ -0,0 +1,46 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##

## Make the ADIO headers visible to every source file in the tree
## (both the build and the source directory, for VPATH builds).
AM_CPPFLAGS += -I$(top_builddir)/adio/include -I$(top_srcdir)/adio/include

## Internal ADIO headers; noinst_ because they are private to ROMIO and
## never installed.
noinst_HEADERS += \
adio/include/adio.h \
adio/include/adio_cb_config_list.h \
adio/include/adio_extern.h \
adio/include/adioi.h \
adio/include/adioi_errmsg.h \
adio/include/adioi_error.h \
adio/include/adioi_fs_proto.h \
adio/include/heap-sort.h \
adio/include/mpio_error.h \
adio/include/mpipr.h \
adio/include/mpiu_greq.h \
adio/include/nopackage.h \
adio/include/mpiu_external32.h \
adio/include/hint_fns.h

## Per-file-system driver fragments; each one contributes its sources only
## when the corresponding BUILD_AD_* automake conditional is enabled.
include $(top_srcdir)/adio/ad_gpfs/Makefile.mk
include $(top_srcdir)/adio/ad_gpfs/bg/Makefile.mk
include $(top_srcdir)/adio/ad_gpfs/pe/Makefile.mk
include $(top_srcdir)/adio/ad_gridftp/Makefile.mk
include $(top_srcdir)/adio/ad_hfs/Makefile.mk
include $(top_srcdir)/adio/ad_lustre/Makefile.mk
include $(top_srcdir)/adio/ad_nfs/Makefile.mk
## NTFS builds are handled entirely by the separate Windows build system
##include $(top_srcdir)/adio/ad_ntfs/Makefile.mk
include $(top_srcdir)/adio/ad_panfs/Makefile.mk
include $(top_srcdir)/adio/ad_pfs/Makefile.mk
include $(top_srcdir)/adio/ad_piofs/Makefile.mk
include $(top_srcdir)/adio/ad_pvfs/Makefile.mk
include $(top_srcdir)/adio/ad_pvfs2/Makefile.mk
include $(top_srcdir)/adio/ad_sfs/Makefile.mk
include $(top_srcdir)/adio/ad_testfs/Makefile.mk
include $(top_srcdir)/adio/ad_ufs/Makefile.mk
include $(top_srcdir)/adio/ad_xfs/Makefile.mk
include $(top_srcdir)/adio/ad_zoidfs/Makefile.mk
include $(top_srcdir)/adio/common/Makefile.mk

11
ompi/mca/io/romio314/romio/adio/ad_gpfs/.gitignore поставляемый Обычный файл
Просмотреть файл

@ -0,0 +1,11 @@
# Build artifacts generated in adio/ad_gpfs/ (automake output, libtool
# objects, gcov/profiling data); never commit these.
/Makefile
/.deps
/*.bb
/*.bbg
/*.gcda
/*.gcno
/.libs
/.libstamp*
/*.lo
/.*-cache
/.state-cache

Просмотреть файл

@ -0,0 +1,26 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2012 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##

## Everything below is compiled only when configure enabled GPFS support.
if BUILD_AD_GPFS

## Driver-internal headers (not installed).
noinst_HEADERS += \
adio/ad_gpfs/ad_gpfs_aggrs.h \
adio/ad_gpfs/ad_gpfs.h \
adio/ad_gpfs/ad_gpfs_tuning.h

## Sources folded into the ROMIO convenience library for the GPFS driver.
romio_other_sources += \
adio/ad_gpfs/ad_gpfs_aggrs.c \
adio/ad_gpfs/ad_gpfs_close.c \
adio/ad_gpfs/ad_gpfs_flush.c \
adio/ad_gpfs/ad_gpfs_tuning.c \
adio/ad_gpfs/ad_gpfs.c \
adio/ad_gpfs/ad_gpfs_open.c \
adio/ad_gpfs/ad_gpfs_hints.c \
adio/ad_gpfs/ad_gpfs_rdcoll.c \
adio/ad_gpfs/ad_gpfs_wrcoll.c
endif BUILD_AD_GPFS

Просмотреть файл

@ -0,0 +1,61 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gpfs.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
/* Dispatch table of ADIO operations for the GPFS driver.  ROMIO selects
 * this table when a file is opened on a GPFS file system.  Entries named
 * ADIOI_GPFS_* are implemented by this driver; ADIOI_GEN_* entries fall
 * back to the generic (common) implementations, and ADIOI_FAKE_* entries
 * are blocking stand-ins used when no working AIO is available.
 * NOTE: the initializer is positional — the entry order must match the
 * field order of ADIOI_Fns_struct declared in adioi.h. */
struct ADIOI_Fns_struct ADIO_GPFS_operations = {
ADIOI_GPFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* Collective open */
ADIOI_GEN_ReadContig, /* ReadContig */
ADIOI_GEN_WriteContig, /* WriteContig */
ADIOI_GPFS_ReadStridedColl, /* ReadStridedColl */
ADIOI_GPFS_WriteStridedColl, /* WriteStridedColl */
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
ADIOI_GEN_Fcntl, /* Fcntl */
#if defined(BGQPLATFORM) || defined(PEPLATFORM)
ADIOI_GPFS_SetInfo, /* SetInfo for BlueGene or PE */
#else
ADIOI_GEN_SetInfo, /* SetInfo for any platform besides BlueGene or PE */
#endif
ADIOI_GEN_ReadStrided, /* ReadStrided */
ADIOI_GEN_WriteStrided, /* WriteStrided */
ADIOI_GPFS_Close, /* Close */
#ifdef ROMIO_HAVE_WORKING_AIO
#warning Consider BG support for NFS before enabling this.
ADIOI_GEN_IreadContig, /* IreadContig */
ADIOI_GEN_IwriteContig, /* IwriteContig */
#else
ADIOI_FAKE_IreadContig, /* IreadContig */
ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif
ADIOI_GEN_IODone, /* ReadDone */
ADIOI_GEN_IODone, /* WriteDone */
ADIOI_GEN_IOComplete, /* ReadComplete */
ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_GPFS_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
/* last entry: human-readable name of the file-system driver */
#ifdef BGQPLATFORM
"GPFS+BGQ: IBM GPFS for Blue Gene",
#elif PEPLATFORM
"GPFS+PE: IBM GPFS for PE",
#else
"GPFS: IBM GPFS"
#endif
};

Просмотреть файл

@ -0,0 +1,71 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs.h
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_GPFS_INCLUDE
#define AD_GPFS_INCLUDE
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#ifdef HAVE_AIO_H
#include <aio.h>
#endif
/* Entry points implemented by the GPFS ADIO driver and wired into the
 * ADIO_GPFS_operations dispatch table (ad_gpfs.c).  All functions report
 * success/failure through *error_code (an MPI error code); none return a
 * value directly.  Signatures follow the common ADIOI_* conventions
 * declared in adioi.h. */
void ADIOI_GPFS_Open(ADIO_File fd, int *error_code);
void ADIOI_GPFS_Close(ADIO_File fd, int *error_code);
/* Contiguous read/write of `count` elements of `datatype` at `offset`
 * (or at the individual file pointer, per file_ptr_type). */
void ADIOI_GPFS_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_GPFS_WriteContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
/* Process GPFS-specific hints from users_info into fd->info. */
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
/* Independent strided (noncontiguous) I/O. */
void ADIOI_GPFS_WriteStrided(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_GPFS_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
/* Collective strided I/O (two-phase aggregation tuned for GPFS). */
void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code);
#include "ad_gpfs_tuning.h"
#endif

Просмотреть файл

@ -0,0 +1,846 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_aggrs.c
* \brief The externally used function from this file is declared in ad_gpfs_aggrs.h
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997-2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_cb_config_list.h"
#include "ad_gpfs.h"
#include "ad_gpfs_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
#ifndef TRACE_ERR
# define TRACE_ERR(format...)
#endif
/* Comments copied from common:
* This file contains four functions:
*
* ADIOI_Calc_aggregator()
* ADIOI_Calc_file_domains()
* ADIOI_Calc_my_req()
* ADIOI_Calc_others_req()
*
* The last three of these were originally in ad_read_coll.c, but they are
* also shared with ad_write_coll.c. I felt that they were better kept with
* the rest of the shared aggregation code.
*/
/* Discussion of values available from above:
*
* ADIO_Offset st_offsets[0..nprocs-1]
* ADIO_Offset end_offsets[0..nprocs-1]
* These contain a list of start and end offsets for each process in
* the communicator. For example, an access at loc 10, size 10 would
* have a start offset of 10 and end offset of 19.
* int nprocs
* number of processors in the collective I/O communicator
* ADIO_Offset min_st_offset
* ADIO_Offset fd_start[0..nprocs_for_coll-1]
* starting location of "file domain"; region that a given process will
* perform aggregation for (i.e. actually do I/O)
* ADIO_Offset fd_end[0..nprocs_for_coll-1]
* start + size - 1 roughly, but it can be less, or 0, in the case of
* uneven distributions
*/
/* Description from common/ad_aggregate.c. (Does it completely apply to bg?)
* ADIOI_Calc_aggregator()
*
* The intention here is to implement a function which provides basically
* the same functionality as in Rajeev's original version of
* ADIOI_Calc_my_req(). He used a ceiling division approach to assign the
* file domains, and we use the same approach here when calculating the
* location of an offset/len in a specific file domain. Further we assume
* this same distribution when calculating the rank_index, which is later
* used to map to a specific process rank in charge of the file domain.
*
* A better (i.e. more general) approach would be to use the list of file
* domains only. This would be slower in the case where the
* original ceiling division was used, but it would allow for arbitrary
* distributions of regions to aggregators. We'd need to know the
* nprocs_for_coll in that case though, which we don't have now.
*
* Note a significant difference between this function and Rajeev's old code:
* this code doesn't necessarily return a rank in the range
* 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a
* result of the rank mapping; any set of ranks in the communicator could be
* used now.
*
* Returns an integer representing a rank in the collective I/O communicator.
*
* The "len" parameter is also modified to indicate the amount of data
* actually available in this file domain.
*/
/*
* This is more general aggregator search function which does not base on the assumption
* that each aggregator hosts the file domain with the same size
*/
/* Map a file offset to the I/O aggregator that owns it.
 *
 * fd       - open file; fd->hints->cb_nodes aggregators are configured and
 *            fd->hints->ranklist maps an aggregator index to a rank
 * off      - file offset whose owning file domain is sought
 * min_off  - smallest start offset of the whole collective access
 * len      - in: length of the contiguous request beginning at off;
 *            out: trimmed to the bytes available in the owning file domain
 * fd_size  - nominal file-domain size (not used by the search itself)
 * fd_start/fd_end - per-aggregator file-domain bounds, in ascending order
 *
 * Returns the rank (in the collective I/O communicator) of the aggregator
 * whose file domain contains off.  Unlike a simple ceiling-division
 * mapping, this bisects fd_start/fd_end, so it handles unevenly sized
 * file domains (the GPFS block-aligned domains built in
 * ADIOI_GPFS_Calc_file_domains). */
int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
ADIO_Offset off,
ADIO_Offset min_off,
ADIO_Offset *len,
ADIO_Offset fd_size,
ADIO_Offset *fd_start,
ADIO_Offset *fd_end)
{
int rank_index, rank;
ADIO_Offset avail_bytes;
TRACE_ERR("Entering ADIOI_GPFS_Calc_aggregator\n");
/* NOTE(review): the bisection below terminates only when off lies inside
 * some [fd_start[i], fd_end[i]] interval; this assert is what guarantees
 * that precondition. */
ADIOI_Assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
/* binary search --> rank_index is returned */
int ub = fd->hints->cb_nodes;
int lb = 0;
/* get an index into our array of aggregators */
/* Common code for striping - bg doesn't use it but it's
here to make diff'ing easier.
rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);
if (fd->hints->striping_unit > 0) {
* wkliao: implementation for file domain alignment
fd_start[] and fd_end[] have been aligned with file lock
boundaries when returned from ADIOI_Calc_file_domains() so cannot
just use simple arithmetic as above *
rank_index = 0;
while (off > fd_end[rank_index]) rank_index++;
}
bg does its own striping below
*/
rank_index = fd->hints->cb_nodes / 2;
while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
if ( off > fd_end [rank_index] ) {
lb = rank_index;
rank_index = (rank_index + ub) / 2;
}
else
if ( off < fd_start[rank_index] ) {
ub = rank_index;
rank_index = (rank_index + lb) / 2;
}
}
/* we index into fd_end with rank_index, and fd_end was allocated to be no
 * bigger than fd->hints->cb_nodes. If we ever violate that, we're
 * overrunning arrays. Obviously, we should never ever hit this abort */
if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
rank_index,fd->hints->cb_nodes,fd_size,off);
MPI_Abort(MPI_COMM_WORLD, 1);
}
/* DBG_FPRINTF ("ADIOI_GPFS_Calc_aggregator: rank_index = %d\n",
rank_index ); */
/*
 * remember here that even in Rajeev's original code it was the case that
 * different aggregators could end up with different amounts of data to
 * aggregate. here we use fd_end[] to make sure that we know how much
 * data this aggregator is working with.
 *
 * the +1 is to take into account the end vs. length issue.
 */
avail_bytes = fd_end[rank_index] + 1 - off;
if (avail_bytes < *len && avail_bytes > 0) {
/* this file domain only has part of the requested contig. region */
*len = avail_bytes;
}
/* map our index to a rank */
/* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
rank = fd->hints->ranklist[rank_index];
TRACE_ERR("Leaving ADIOI_GPFS_Calc_aggregator\n");
return rank;
}
/*
* Compute a dynamic access range based file domain partition among I/O aggregators,
* which align to the GPFS block size
* Divide the I/O workload among "nprocs_for_coll" processes. This is
* done by (logically) dividing the file into file domains (FDs); each
* process may directly access only its own file domain.
* Additional effort is to make sure that each I/O aggregator get
* a file domain that aligns to the GPFS block size. So, there will
* not be any false sharing of GPFS file blocks among multiple I/O nodes.
*
* The common version of this now accepts a min_fd_size and striping_unit.
* It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
* (e.g. we could pass striping unit instead of using fs_ptr->blksize).
*/
/* Partition the aggregate access range [min(st_offsets), max(end_offsets)]
 * into per-aggregator "file domains" whose boundaries are aligned to the
 * GPFS block size, so no two aggregators falsely share a GPFS block.
 *
 * st_offsets/end_offsets - per-process first/last byte of the collective
 *                          access (nprocs entries each)
 * nprocs                 - size of the collective communicator
 * nprocs_for_coll        - number of I/O aggregators
 * min_st_offset_ptr      - out: smallest start offset over all processes
 * fd_start_ptr/fd_end_ptr- out: malloc'd arrays (nprocs_for_coll entries)
 *                          of file-domain bounds; caller frees
 * fd_size_ptr            - out: size of the FIRST file domain only (the
 *                          domains are generally uneven)
 * fs_ptr                 - unused here (kept for interface compatibility)
 */
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr,
void *fs_ptr)
{
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
int i, aggr;
TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");
blksize_t blksize;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5004, 0, NULL);
#endif
# if AGG_DEBUG
static char myname[] = "ADIOI_GPFS_Calc_file_domains";
DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
myname,__LINE__,nprocs_for_coll);
# endif
if (fd->blksize <= 0)
/* default to 1M if blksize unset */
fd->blksize = 1048576;
blksize = fd->blksize;
# if AGG_DEBUG
DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
# endif
/* find min of start offsets and max of end offsets of all processes */
min_st_offset = st_offsets [0];
max_end_offset = end_offsets[0];
for (i=1; i<nprocs; i++) {
min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
}
/* DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_
= %qd, %qd\n", min_st_offset, max_end_offset );*/
/* determine the "file domain (FD)" of each process, i.e., the portion of
the file that will be "owned" by each process */
/* round the overall [lb, ub] range outward to GPFS block boundaries; the
 * *_rdoff values record how far the rounding moved each end, so the
 * first/last domain can be shrunk back to the真 access range below */
ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1;
ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
int naggs = nprocs_for_coll;
/* Tweak the file domains so that no fd is smaller than a threshold. We
 * have to strike a balance between efficiency and parallelism: somewhere
 * between 10k processes sending 32-byte requests and one process sending a
 * 320k request is a (system-dependent) sweet spot
This is from the common code - the new min_fd_size parm that we didn't implement.
(And common code uses a different declaration of fd_size so beware)
if (fd_size < min_fd_size)
fd_size = min_fd_size;
*/
fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
*fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
fd_start = *fd_start_ptr;
fd_end = *fd_end_ptr;
/* each process will have a file domain of some number of gpfs blocks, but
 * the division of blocks is not likely to be even. Some file domains will
 * be "large" and others "small"
 *
 * Example: consider 17 blocks distributed over 3 aggregators.
 * nb_cn_small = 17/3 = 5
 * naggs_large = 17 - 3*(17/3) = 17 - 15 = 2
 * naggs_small = 3 - 2 = 1
 *
 * and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
 *
 * what about (relatively) small files? say, a file of 1000 blocks
 * distributed over 2064 aggregators:
 * nb_cn_small = 1000/2064 = 0
 * naggs_large = 1000 - 2064*(1000/2064) = 1000
 * naggs_small = 2064 - 1000 = 1064
 * and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
 *
 * it might be a good idea instead of having all the zeros up front, to
 * "mix" those zeros into the fd_size array. that way, no pset/bridge-set
 * is left with zero work. In fact, even if the small file domains aren't
 * zero, it's probably still a good idea to mix the "small" file domains
 * across the fd_size array to keep the io nodes in balance */
ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
ADIO_Offset naggs_small = naggs - naggs_large;
#ifdef BGQPLATFORM
if (gpfsmpio_balancecontig == 1) {
/* File domains blocks are assigned to aggregators in a breadth-first
 * fashion relative to the ions - additionally, file domains on the
 * aggregators sharing the same bridgeset and ion have contiguous
 * offsets. */
// initialize everything to small
for (i=0; i<naggs; i++)
fd_size[i] = nb_cn_small * blksize;
// go thru and distribute the large across the bridges
/* bridelistoffset: agg rank list offsets using the bridgelist - each
 * entry is created by adding up the indexes for the aggs from all
 * previous bridges */
int *bridgelistoffset =
(int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
/* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
 * decremented to keep track of bridge assignments during the actual
 * large block assignments to the agg rank list*/
int *tmpbridgelistnum =
(int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
int j;
for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
int k, bridgerankoffset = 0;
for (k=0;k<j;k++) {
bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
}
bridgelistoffset[j] = bridgerankoffset;
}
for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
int bridgeiter = 0;
/* distribute the large blocks across the aggs going breadth-first
 * across the bridgelist - this distributes the fd sizes across the
 * ions, so later in the file domain assignment when it iterates thru
 * the ranklist the offsets will be contiguous within the bridge and
 * ion as well */
for (j=0;j<naggs_large;j++) {
int foundbridge = 0;
int numbridgelistpasses = 0;
while (!foundbridge) {
if (tmpbridgelistnum[bridgeiter] > 0) {
foundbridge = 1;
/*
printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
*/
int currentbridgelistnum =
(fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-
tmpbridgelistnum[bridgeiter]);
int currentfdsizeindex = bridgelistoffset[bridgeiter] +
currentbridgelistnum;
fd_size[currentfdsizeindex] = (nb_cn_small+1) * blksize;
tmpbridgelistnum[bridgeiter]--;
}
if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1)) {
/* guard against infinite loop - should only ever make 1 pass
 * thru bridgelist */
ADIOI_Assert(numbridgelistpasses == 0);
numbridgelistpasses++;
bridgeiter = 0;
}
else
bridgeiter++;
}
}
ADIOI_Free(tmpbridgelistnum);
ADIOI_Free(bridgelistoffset);
} else {
/* BG/L- and BG/P-style distribution of file domains: simple allocation of
 * file domins to each aggregator */
for (i=0; i<naggs; i++) {
if (i < naggs_large) {
fd_size[i] = (nb_cn_small+1) * blksize;
} else {
fd_size[i] = nb_cn_small * blksize;
}
}
}
#ifdef balancecontigtrace
int myrank;
MPI_Comm_rank(fd->comm,&myrank);
if (myrank == 0) {
fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
for (i=0; i<naggs; i++) {
fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
}
}
#endif
#else // not BGQ platform
/* simple distribution: the first naggs_large aggregators get one extra
 * GPFS block each */
for (i=0; i<naggs; i++) {
if (i < naggs_large) {
fd_size[i] = (nb_cn_small+1) * blksize;
} else {
fd_size[i] = nb_cn_small * blksize;
}
}
#endif
# if AGG_DEBUG
DBG_FPRINTF(stderr,"%s(%d): "
"gpfs_ub %llu, "
"gpfs_lb %llu, "
"gpfs_ub_rdoff %llu, "
"gpfs_lb_rdoff %llu, "
"fd_gpfs_range %llu, "
"n_gpfs_blk %llu, "
"nb_cn_small %llu, "
"naggs_large %llu, "
"naggs_small %llu, "
"\n",
myname,__LINE__,
gpfs_ub ,
gpfs_lb ,
gpfs_ub_rdoff,
gpfs_lb_rdoff,
fd_gpfs_range,
n_gpfs_blk ,
nb_cn_small ,
naggs_large ,
naggs_small
);
# endif
/* pull the first and last domains back in to the actual access range
 * (they were extended outward by the block-boundary rounding above) */
fd_size[0] -= gpfs_lb_rdoff;
fd_size[naggs-1] -= gpfs_ub_rdoff;
/* compute the file domain for each aggr */
ADIO_Offset offset = min_st_offset;
for (aggr=0; aggr<naggs; aggr++) {
fd_start[aggr] = offset;
fd_end [aggr] = offset + fd_size[aggr] - 1;
offset += fd_size[aggr];
}
*fd_size_ptr = fd_size[0];
*min_st_offset_ptr = min_st_offset;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5005, 0, NULL);
#endif
ADIOI_Free (fd_size);
TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
}
/*
* ADIOI_GPFS_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation
* is specific for static file domain partitioning.
*
* ADIOI_Calc_my_req() - calculate what portions of the access requests
* of this process are located in the file domains of various processes
* (including this one)
*/
/* Determine which portions of this process's access list fall in each
 * aggregator's file domain.  Two passes: the first counts requests per
 * target process (so exact allocations can be made), the second fills in
 * the per-process offset/length lists.
 *
 * offset_list/len_list   - this process's contiguous accesses
 * contig_access_count    - number of entries in those lists
 * min_st_offset, fd_start, fd_end, fd_size - file-domain layout from
 *                          ADIOI_GPFS_Calc_file_domains()
 * count_my_req_procs_ptr - out: number of processes whose file domain this
 *                          process's request touches
 * count_my_req_per_proc_ptr - out: calloc'd array[nprocs] of per-process
 *                          contiguous-request counts
 * my_req_ptr             - out: malloc'd array[nprocs] of ADIOI_Access
 *                          describing the pieces destined for each process
 * buf_idx_ptr            - out: malloc'd array[nprocs]; buf_idx[i] is the
 *                          user-buffer index for data exchanged with proc i
 *                          (only meaningful for contiguous buffer types) */
void ADIOI_GPFS_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size,
int nprocs,
int *count_my_req_procs_ptr,
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int **buf_idx_ptr)
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets?
They are used as memory buffer indices so it seems like the 2G limit is in effect */
{
int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
int i, l, proc;
ADIO_Offset fd_len, rem_len, curr_idx, off;
ADIOI_Access *my_req;
TRACE_ERR("Entering ADIOI_GPFS_Calc_my_req\n");
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5024, 0, NULL);
#endif
*count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
count_my_req_per_proc = *count_my_req_per_proc_ptr;
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
process in process i's file domain. calloc initializes to zero.
I'm allocating memory of size nprocs, so that I can do an
MPI_Alltoall later on.*/
buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
/* buf_idx is relevant only if buftype_is_contig.
buf_idx[i] gives the index into user_buf where data received
from proc. i should be placed. This allows receives to be done
without extra buffer. This can't be done if buftype is not contig. */
/* initialize buf_idx to -1 */
for (i=0; i < nprocs; i++) buf_idx[i] = -1;
/* one pass just to calculate how much space to allocate for my_req;
 * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
 */
for (i=0; i < contig_access_count; i++) {
/* short circuit offset/len processing if len == 0
 * (zero-byte read/write) */
if (len_list[i] == 0)
continue;
off = offset_list[i];
fd_len = len_list[i];
/* note: we set fd_len to be the total size of the access. then
 * ADIOI_Calc_aggregator() will modify the value to return the
 * amount that was available from the file domain that holds the
 * first part of the access.
 */
/* BES */
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
fd_start, fd_end);
count_my_req_per_proc[proc]++;
/* figure out how much data is remaining in the access (i.e. wasn't
 * part of the file domain that had the starting byte); we'll take
 * care of this data (if there is any) in the while loop below.
 */
rem_len = len_list[i] - fd_len;
while (rem_len > 0) {
off += fd_len; /* point to first remaining byte */
fd_len = rem_len; /* save remaining size, pass to calc */
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
fd_size, fd_start, fd_end);
count_my_req_per_proc[proc]++;
rem_len -= fd_len; /* reduce remaining length by amount from fd */
}
}
/* now allocate space for my_req, offset, and len */
*my_req_ptr = (ADIOI_Access *)
ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
my_req = *my_req_ptr;
count_my_req_procs = 0;
for (i=0; i < nprocs; i++) {
if (count_my_req_per_proc[i]) {
my_req[i].offsets = (ADIO_Offset *)
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
my_req[i].lens =
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
count_my_req_procs++;
}
my_req[i].count = 0; /* will be incremented where needed
later */
}
/* now fill in my_req */
curr_idx = 0;
for (i=0; i<contig_access_count; i++) {
/* short circuit offset/len processing if len == 0
 * (zero-byte read/write) */
if (len_list[i] == 0)
continue;
off = offset_list[i];
fd_len = len_list[i];
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
fd_start, fd_end);
/* for each separate contiguous access from this process */
if (buf_idx[proc] == -1)
{
ADIOI_Assert(curr_idx == (int) curr_idx);
buf_idx[proc] = (int) curr_idx;
}
l = my_req[proc].count;
curr_idx += fd_len;
rem_len = len_list[i] - fd_len;
/* store the proc, offset, and len information in an array
 * of structures, my_req. Each structure contains the
 * offsets and lengths located in that process's FD,
 * and the associated count.
 */
my_req[proc].offsets[l] = off;
my_req[proc].lens[l] = fd_len;
my_req[proc].count++;
while (rem_len > 0) {
off += fd_len;
fd_len = rem_len;
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
fd_size, fd_start, fd_end);
if (buf_idx[proc] == -1)
{
ADIOI_Assert(curr_idx == (int) curr_idx);
buf_idx[proc] = (int) curr_idx;
}
l = my_req[proc].count;
curr_idx += fd_len;
rem_len -= fd_len;
my_req[proc].offsets[l] = off;
my_req[proc].lens[l] = fd_len;
my_req[proc].count++;
}
}
#ifdef AGG_DEBUG
for (i=0; i<nprocs; i++) {
if (count_my_req_per_proc[i] > 0) {
DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
my_req[i].count);
for (l=0; l < my_req[i].count; l++) {
DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %lld\n", l,
my_req[i].offsets[l], l, my_req[i].lens[l]);
}
}
DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
}
#endif
*count_my_req_procs_ptr = count_my_req_procs;
*buf_idx_ptr = buf_idx;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5025, 0, NULL);
#endif
TRACE_ERR("Leaving ADIOI_GPFS_Calc_my_req\n");
}
/*
* ADIOI_Calc_others_req (copied to bg and switched to all to all for performance)
*
* param[in] count_my_req_procs Number of processes whose file domain my
* request touches.
* param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
* contig. requests of this process in
* process i's file domain.
* param[in] my_req A structure defining my request
* param[in] nprocs Number of nodes in the block
* param[in] myrank Rank of this node
* param[out] count_others_req_proc_ptr Number of processes whose requests lie in
* my process's file domain (including my
* process itself)
* param[out] others_req_ptr Array of other process' requests that lie
* in my process's file domain
*/
/* Build, on every rank, the list of I/O requests from *other* ranks that
 * fall within this rank's file domain.  This is the all-to-all variant of
 * the common-code ADIOI_Calc_others_req(): request counts are exchanged
 * with one MPI_Alltoall, and the offset/length arrays with two
 * MPI_Alltoallv calls, instead of point-to-point isend/irecv pairs.
 *
 * fd                        - open file handle; fd->comm is the communicator used
 * count_my_req_procs        - number of processes my request touches (unused here
 *                             beyond documentation; counts come from the array)
 * count_my_req_per_proc[i]  - no. of my contiguous requests in process i's domain
 * my_req[i]                 - my requests destined for process i's file domain
 * nprocs, myrank            - size/rank within fd->comm
 * count_others_req_procs_ptr (out) - no. of processes with requests in my domain
 * others_req_ptr (out)      - per-process arrays of those requests (allocated here;
 *                             mem_ptrs is allocated but left unfilled -- presumably
 *                             populated later by the collective I/O code; confirm
 *                             against the callers) */
void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
int *count_my_req_per_proc,
ADIOI_Access *my_req,
int nprocs, int myrank,
int *count_others_req_procs_ptr,
ADIOI_Access **others_req_ptr)
{
TRACE_ERR("Entering ADIOI_GPFS_Calc_others_req\n");
/* determine what requests of other processes lie in this process's
file domain */
/* count_others_req_procs = number of processes whose requests lie in
this process's file domain (including this process itself)
count_others_req_per_proc[i] indicates how many separate contiguous
requests of proc. i lie in this process's file domain. */
int *count_others_req_per_proc, count_others_req_procs;
int i;
ADIOI_Access *others_req;
/* Parameters for MPI_Alltoallv */
int *scounts, *sdispls, *rcounts, *rdispls;
/* Parameters for MPI_Alltoallv. These are the buffers, which
* are later computed to be the lowest address of all buffers
* to be sent/received for offsets and lengths. Initialize to
* the highest possible address which is the current minimum.
*/
/* NOTE(review): the all-ones sentinel assumes 64-bit pointers; on a
* 32-bit build the constant is truncated by the cast -- confirm whether
* 32-bit targets are supported before relying on this. */
void *sendBufForOffsets=(void*)0xFFFFFFFFFFFFFFFF,
*sendBufForLens =(void*)0xFFFFFFFFFFFFFFFF,
*recvBufForOffsets=(void*)0xFFFFFFFFFFFFFFFF,
*recvBufForLens =(void*)0xFFFFFFFFFFFFFFFF;
/* first find out how much to send/recv and from/to whom */
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5026, 0, NULL);
#endif
/* Send 1 int to each process. count_my_req_per_proc[i] is the number of
* requests that my process will do to the file domain owned by process[i].
* Receive 1 int from each process. count_others_req_per_proc[i] is the number of
* requests that process[i] will do to the file domain owned by my process.
*/
count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
/* cora2a1=timebase(); */
/*for(i=0;i<nprocs;i++) ?*/
MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
count_others_req_per_proc, 1, MPI_INT, fd->comm);
/* total_cora2a+=timebase()-cora2a1; */
/* Allocate storage for an array of other nodes' accesses of our
* node's file domain. Also allocate storage for the alltoallv
* parameters.
*/
*others_req_ptr = (ADIOI_Access *)
ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
others_req = *others_req_ptr;
scounts = ADIOI_Malloc(nprocs*sizeof(int));
sdispls = ADIOI_Malloc(nprocs*sizeof(int));
rcounts = ADIOI_Malloc(nprocs*sizeof(int));
rdispls = ADIOI_Malloc(nprocs*sizeof(int));
/* If process[i] has any requests in my file domain,
* initialize an ADIOI_Access structure that will describe each request
* from process[i]. The offsets, lengths, and buffer pointers still need
* to be obtained to complete the setting of this structure.
*/
count_others_req_procs = 0;
for (i=0; i<nprocs; i++) {
if (count_others_req_per_proc[i])
{
others_req[i].count = count_others_req_per_proc[i];
others_req[i].offsets = (ADIO_Offset *)
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
others_req[i].lens =
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
/* track the lowest allocated address: the Alltoallv below sends from a
* single base buffer with per-destination element displacements */
if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
recvBufForOffsets = others_req[i].offsets;
if ( (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens )
recvBufForLens = others_req[i].lens;
others_req[i].mem_ptrs = (MPI_Aint *)
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint));
count_others_req_procs++;
}
else
{
others_req[i].count = 0;
others_req[i].offsets = NULL;
others_req[i].lens = NULL;
}
}
/* If no recv buffer was allocated in the loop above, make it NULL */
if ( recvBufForOffsets == (void*)0xFFFFFFFFFFFFFFFF) recvBufForOffsets = NULL;
if ( recvBufForLens == (void*)0xFFFFFFFFFFFFFFFF) recvBufForLens = NULL;
/* Now send the calculated offsets and lengths to respective processes */
/************************/
/* Exchange the offsets */
/************************/
/* Determine the lowest sendBufForOffsets/Lens */
for (i=0; i<nprocs; i++)
{
if ( (my_req[i].count) &&
((MPIR_Upint)my_req[i].offsets <= (MPIR_Upint)sendBufForOffsets) )
{
sendBufForOffsets = my_req[i].offsets;
}
if ( (my_req[i].count) &&
((MPIR_Upint)my_req[i].lens <= (MPIR_Upint)sendBufForLens) )
{
sendBufForLens = my_req[i].lens;
}
}
/* If no send buffer was found in the loop above, make it NULL */
if ( sendBufForOffsets == (void*)0xFFFFFFFFFFFFFFFF) sendBufForOffsets = NULL;
if ( sendBufForLens == (void*)0xFFFFFFFFFFFFFFFF) sendBufForLens = NULL;
/* Calculate the displacements from the sendBufForOffsets/Lens */
/* Displacements are expressed in ADIO_Offset elements relative to the
* lowest buffer address found above (Alltoallv datatype is ADIO_OFFSET) */
for (i=0; i<nprocs; i++)
{
/* Send these offsets to process i.*/
scounts[i] = count_my_req_per_proc[i];
if ( scounts[i] == 0 )
sdispls[i] = 0;
else
sdispls[i] = (int)
( ( (MPIR_Upint)my_req[i].offsets -
(MPIR_Upint)sendBufForOffsets ) /
(MPIR_Upint)sizeof(ADIO_Offset) );
/* Receive these offsets from process i.*/
rcounts[i] = count_others_req_per_proc[i];
if ( rcounts[i] == 0 )
rdispls[i] = 0;
else
rdispls[i] = (int)
( ( (MPIR_Upint)others_req[i].offsets -
(MPIR_Upint)recvBufForOffsets ) /
(MPIR_Upint)sizeof(ADIO_Offset) );
}
/* Exchange the offsets */
MPI_Alltoallv(sendBufForOffsets,
scounts, sdispls, ADIO_OFFSET,
recvBufForOffsets,
rcounts, rdispls, ADIO_OFFSET,
fd->comm);
/************************/
/* Exchange the lengths */
/************************/
for (i=0; i<nprocs; i++)
{
/* Send these lengths to process i.*/
scounts[i] = count_my_req_per_proc[i];
if ( scounts[i] == 0 )
sdispls[i] = 0;
else
sdispls[i] = (int)
( ( (MPIR_Upint)my_req[i].lens -
(MPIR_Upint)sendBufForLens ) /
(MPIR_Upint) sizeof(ADIO_Offset) );
/* Receive these offsets from process i. */
rcounts[i] = count_others_req_per_proc[i];
if ( rcounts[i] == 0 )
rdispls[i] = 0;
else
rdispls[i] = (int)
( ( (MPIR_Upint)others_req[i].lens -
(MPIR_Upint)recvBufForLens ) /
(MPIR_Upint) sizeof(ADIO_Offset) );
}
/* Exchange the lengths */
MPI_Alltoallv(sendBufForLens,
scounts, sdispls, ADIO_OFFSET,
recvBufForLens,
rcounts, rdispls, ADIO_OFFSET,
fd->comm);
/* Clean up */
ADIOI_Free(count_others_req_per_proc);
ADIOI_Free (scounts);
ADIOI_Free (sdispls);
ADIOI_Free (rcounts);
ADIOI_Free (rdispls);
*count_others_req_procs_ptr = count_others_req_procs;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5027, 0, NULL);
#endif
TRACE_ERR("Leaving ADIOI_GPFS_Calc_others_req\n");
}

Просмотреть файл

@ -0,0 +1,86 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_aggrs.h
* \brief ???
*/
/*
* File: ad_gpfs_aggrs.h
*
* Declares functions optimized specifically for GPFS parallel I/O solution.
*
*/
#ifndef AD_GPFS_AGGRS_H_
#define AD_GPFS_AGGRS_H_
#include "adio.h"
#include <sys/stat.h>
#ifdef HAVE_GPFS_H
#include <gpfs.h>
#endif
/* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr,
void *fs_ptr);
/* overriding ADIOI_Calc_aggregator() for the default implementation is specific for
static file domain partitioning */
int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
ADIO_Offset off,
ADIO_Offset min_off,
ADIO_Offset *len,
ADIO_Offset fd_size,
ADIO_Offset *fd_start,
ADIO_Offset *fd_end);
/* overriding ADIOI_Calc_my_req for the default implementation is specific for
static file domain partitioning */
void ADIOI_GPFS_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size,
int nprocs,
int *count_my_req_procs_ptr,
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int **buf_idx_ptr);
/*
* ADIOI_Calc_others_req
*
* param[in] count_my_req_procs Number of processes whose file domain my
* request touches.
* param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
* contig. requests of this process in
* process i's file domain.
* param[in] my_req A structure defining my request
* param[in] nprocs Number of nodes in the block
* param[in] myrank Rank of this node
* param[out] count_others_req_procs_ptr Number of processes whose requests lie in
* my process's file domain (including my
* process itself)
* param[out] others_req_ptr Array of other process' requests that lie
* in my process's file domain
*/
void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
int *count_my_req_per_proc,
ADIOI_Access *my_req,
int nprocs, int myrank,
int *count_others_req_procs_ptr,
ADIOI_Access **others_req_ptr);
#endif /* AD_GPFS_AGGRS_H_ */

Просмотреть файл

@ -0,0 +1,57 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_close.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gpfs.h"
#include "ad_gpfs_tuning.h"
#include <unistd.h>
/* Close all file descriptors associated with the ADIO file handle and
 * release the fs-specific state.
 *
 * fd         - the ADIO file handle being closed
 * error_code - (out) MPI_SUCCESS, or an "**io" error code if closing either
 *              the primary or the direct-I/O descriptor failed
 *
 * Bug fix: errno from a failing close() was previously read only at the
 * point where the error string was built, by which time the second close()
 * or ADIOI_Free() could have overwritten it.  Capture errno immediately
 * after each failing close() instead. */
void ADIOI_GPFS_Close(ADIO_File fd, int *error_code)
{
    int err, derr = 0;
    int save_errno = 0;   /* errno captured right after a failing close() */
    static char myname[] = "ADIOI_GPFS_CLOSE";

#ifdef PROFILE
    MPE_Log_event(9, 0, "start close");
#endif
    /* /dev/null shadow descriptor (GPFSMPIO_DEVNULLIO): best-effort close,
     * never reported to the caller */
    if (fd->null_fd >= 0)
        close(fd->null_fd);

    err = close(fd->fd_sys);
    if (err == -1)
        save_errno = errno;
    if (fd->fd_direct >= 0)
    {
        derr = close(fd->fd_direct);
        if (derr == -1)
            save_errno = errno;
    }
#ifdef PROFILE
    MPE_Log_event(10, 0, "end close");
#endif

    /* FPRINTF(stderr,"%s(%d):'%s'. Free %#X\n",myname,__LINE__,fd->filename,(int)fd->fs_ptr);*/
    if (fd->fs_ptr != NULL) {
        ADIOI_Free(fd->fs_ptr);
        fd->fs_ptr = NULL;
    }
    fd->fd_sys = -1;
    fd->fd_direct = -1;

    if (err == -1 || derr == -1)
    {
        /* use the errno saved at the point of failure, not the current one */
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(save_errno));
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,68 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_flush.c
* \brief Scalable flush for GPFS
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gpfs.h"
/* Scalable flush: barrier so every rank has finished writing, have a single
 * process (the first I/O aggregator) fsync the file, then broadcast the
 * result so all ranks report the same error status.
 *
 * fd         - the ADIO file handle to flush
 * error_code - (out) MPI_SUCCESS, or an "**io" error built from the fsync
 *              errno (identical on every rank thanks to the broadcast) */
void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code)
{
    int err = 0;
    static char myname[] = "ADIOI_GPFS_FLUSH";
    int rank;

    MPI_Comm_rank(fd->comm, &rank);

    /* the old logic about who is an fsync aggregator and who is not fell down
     * when deferred open was enabled. Instead, make this look more like
     * ad_pvfs2_flush. If one day the I/O aggregators have something they need
     * to flush, we can consult the 'fd->hints->ranklist[]' array. For now, a
     * flush from one process should suffice */

    /* ensure all other processes are done writing. On many platforms MPI_Reduce
     * is fastest because it has the lightest constraints. On Blue Gene, BARRIER
     * is optimized */
    MPI_Barrier(fd->comm);

    if (rank == fd->hints->ranklist[0]) {
        err = fsync(fd->fd_sys);
        DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
        /* We want errno, not the return code if it failed */
        if (err == -1) err = errno;
        else err = 0;
    }

    /* err is a (signed) int, so broadcast it as MPI_INT; the previous
     * MPI_UNSIGNED datatype did not match the C type of the buffer */
    MPI_Bcast(&err, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
    DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);

    if (err) /* if it's non-zero, it must be an errno */
    {
        errno = err;
        err = -1;
    }

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1)
    {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(errno));
        DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,288 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_hints.c
* \brief GPFS hint processing - for now, only used for BlueGene and PE platforms
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_extern.h"
#include "hint_fns.h"
#include "ad_gpfs.h"
#define ADIOI_GPFS_CB_BUFFER_SIZE_DFLT "16777216"
#define ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT "4194304"
#define ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT "4194304"
#ifdef BGQPLATFORM
#define ADIOI_BG_NAGG_IN_PSET_HINT_NAME "bg_nodes_pset"
#endif
/** \page mpiio_vars MPIIO Configuration
*
* GPFS MPIIO configuration and performance tuning. Used by ad_gpfs ADIO.
*
* Used for BlueGene and PE platforms, which each have their own aggregator selection
* algorithms that ignore user provided cb_config_list.
*
* \section hint_sec Hints
* - bg_nodes_pset - BlueGene only - specify how many aggregators to use per pset.
* This hint will override the cb_nodes hint based on BlueGene psets.
* - N - Use N nodes per pset as aggregators.
* - Default is based on partition configuration and cb_nodes.
*
* The following default key/value pairs may differ from other platform defaults.
*
* - key = cb_buffer_size value = 16777216
* - key = romio_cb_read value = enable
* - key = romio_cb_write value = enable
* - key = ind_rd_buffer_size value = 4194304
* - key = ind_wr_buffer_size value = 4194304
*/
#ifdef BGQPLATFORM
/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
extern int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
#elif PEPLATFORM
extern int
ADIOI_PE_gen_agg_ranklist(ADIO_File fd);
#endif
/* Install GPFS-specific default hints on first use of fd->info/fd->hints,
 * then layer any user-supplied hints on top, resolving interactions between
 * romio_cb_{read,write} and romio_no_indep_rw, and finally (re)generate the
 * platform aggregator rank list (BGQ or PE) when anything relevant changed.
 *
 * fd         - open (or opening) ADIO file handle; fd->hints pre-allocated
 * users_info - user's MPI_Info, or MPI_INFO_NULL
 * error_code - (out) MPI_SUCCESS, or an error set by the install helpers */
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
/* if fd->info is null, create a new info object.
Initialize fd->info to default values.
Initialize fd->hints to default values.
Examine the info object passed by the user. If it contains values that
ROMIO understands, override the default. */
MPI_Info info;
char *value;
/* NOTE(review): nprocs_is_valid is set below but never read in this
 * function -- appears to be vestigial */
int flag, intval, nprocs=0, nprocs_is_valid = 0;
static char myname[] = "ADIOI_GPFS_SETINFO";
int did_anything = 0;
if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
info = fd->info;
/* Note that fd->hints is allocated at file open time; thus it is
* not necessary to allocate it, or check for allocation, here.
*/
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
ADIOI_Assert ((value != NULL));
/* initialize info and hints to default values if they haven't been
* previously initialized
*/
if (!fd->hints->initialized) {
ad_gpfs_get_env_vars();
did_anything = 1;
/* buffer size for collective I/O */
ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
/* default is to let romio automatically decide when to use
* collective buffering
*/
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->cb_read = ADIOI_HINT_ENABLE;
ADIOI_Info_set(info, "romio_cb_write", "enable");
fd->hints->cb_write = ADIOI_HINT_ENABLE;
/* platform aggregator selection ignores any user cb_config_list */
if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
fd->hints->cb_config_list = NULL;
/* number of processes that perform I/O in collective I/O */
MPI_Comm_size(fd->comm, &nprocs);
nprocs_is_valid = 1;
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
ADIOI_Info_set(info, "cb_nodes", value);
/* -1 marks cb_nodes as "not user-set"; the platform ranklist
 * generation below picks the real value */
fd->hints->cb_nodes = -1;
/* hint indicating that no indep. I/O will be performed on this file */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
/* gpfs is not implementing file realms (ADIOI_IOStridedColl),
initialize it to disabled. */
/* hint instructing the use of persistent file realms */
ADIOI_Info_set(info, "romio_cb_pfr", "disable");
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
/* hint guiding the assignment of persistent file realms */
ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
fd->hints->cb_fr_type = ADIOI_FR_AAR;
/* hint to align file realms with a certain byte value */
ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
fd->hints->cb_fr_alignment = 1;
/* hint to set a threshold percentage for a datatype's size/extent at
* which data sieving should be done in collective I/O */
ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
fd->hints->cb_ds_threshold = 0;
/* hint to switch between point-to-point or all-to-all for two-phase */
ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
/* deferred_open derived from no_indep_rw and cb_{read,write} */
fd->hints->deferred_open = 0;
/* buffer size for data sieving in independent reads */
ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
/* buffer size for data sieving in independent writes */
ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
ADIOI_Info_set(info, "romio_ds_read", "automatic");
fd->hints->ds_read = ADIOI_HINT_AUTO;
ADIOI_Info_set(info, "romio_ds_write", "automatic");
fd->hints->ds_write = ADIOI_HINT_AUTO;
/* still to do: tune this a bit for a variety of file systems. there's
* no good default value so just leave it unset */
fd->hints->min_fdomain_size = 0;
fd->hints->striping_unit = 0;
fd->hints->initialized = 1;
}
/* add in user's info if supplied */
if (users_info != MPI_INFO_NULL) {
ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
&(fd->hints->cb_buffer_size), myname, error_code);
/* new hints for enabling/disabling coll. buffering on
* reads/writes
*/
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
&(fd->hints->cb_read), myname, error_code);
if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
/* romio_cb_read overrides no_indep_rw */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
/* NOTE(review): assigns ADIOI_HINT_DISABLE where the field otherwise
 * holds 0/1 -- works only if ADIOI_HINT_DISABLE == 0; confirm */
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
&(fd->hints->cb_write), myname, error_code);
if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
/* romio_cb_write overrides no_indep_rw */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
/* Has the user indicated all I/O will be done collectively? */
ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
&(fd->hints->no_indep_rw), myname, error_code);
if (fd->hints->no_indep_rw == 1) {
/* if 'no_indep_rw' set, also hint that we will do
* collective buffering: if we aren't doing independent io,
* then we have to do collective */
ADIOI_Info_set(info, "romio_cb_write", "enable");
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->cb_read = 1;
fd->hints->cb_write = 1;
}
/* new hints for enabling/disabling data sieving on
* reads/writes
*/
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
&(fd->hints->ds_read), myname, error_code);
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
&(fd->hints->ds_write), myname, error_code);
ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
&(fd->hints->ind_wr_buffer_size), myname, error_code);
ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
&(fd->hints->ind_rd_buffer_size), myname, error_code);
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "romio_min_fdomain_size", value);
fd->hints->min_fdomain_size = intval;
}
/* Now we use striping unit in common code so we should
process hints for it. */
ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
&(fd->hints->striping_unit), myname, error_code);
#ifdef BGQPLATFORM
/* BlueGene/Q only: bg_nodes_pset overrides cb_nodes */
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
did_anything = 1;
ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
fd->hints->cb_nodes = intval;
}
#endif
}
/* special CB aggregator assignment */
if (did_anything) {
#ifdef BGQPLATFORM
ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
#elif PEPLATFORM
ADIOI_PE_gen_agg_ranklist(fd);
#endif
}
/* deferred_open won't be set by callers, but if the user doesn't
* explicitly disable collective buffering (two-phase) and does hint that
* io w/o independent io is going on, we'll set this internal hint as a
* convenience */
if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE) \
&& (fd->hints->cb_write != ADIOI_HINT_DISABLE)\
&& fd->hints->no_indep_rw ) ) {
fd->hints->deferred_open = 1;
} else {
/* setting romio_no_indep_rw enable and romio_cb_{read,write}
* disable at the same time doesn't make sense. honor
* romio_cb_{read,write} and force the no_indep_rw hint to
* 'disable' */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
fd->hints->deferred_open = 0;
}
/* BobC commented this out, but since hint processing runs on both bg and
* bglockless, we need to keep DS writes enabled on gpfs and disabled on
* PVFS */
if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
/* disable data sieving for fs that do not
support file locking */
ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
/* get rid of this value if it is set */
ADIOI_Info_delete(info, "ind_wr_buffer_size");
}
/* note: leave ind_wr_buffer_size alone; used for other cases
* as well. -- Rob Ross, 04/22/2003
*/
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
ADIOI_Free(value);
*error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,156 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_open.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gpfs.h"
#include "ad_gpfs_tuning.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef HAVE_GPFS_H
#include <gpfs.h>
#endif
#ifdef HAVE_GPFS_FCNTL_H
#include <gpfs_fcntl.h>
#endif
#ifdef HAVE_GPFS_FCNTL_H
/* Ask GPFS to release the byte-range tokens this process holds on the given
 * descriptor.  The request uses GPFS_FREE_RANGE with start=0/length=0 --
 * presumably "all ranges"; confirm against the gpfs_fcntl documentation.
 * A failure is only logged, never propagated. */
static void gpfs_free_all_locks(int fd)
{
    struct {
        gpfsFcntlHeader_t header;
        gpfsFreeRange_t release;
    } cmd;
    int rc;

    cmd.header.totalLength   = sizeof(cmd);
    cmd.header.fcntlVersion  = GPFS_FCNTL_CURRENT_VERSION;
    cmd.header.fcntlReserved = 0;

    cmd.release.structLen  = sizeof(cmd.release);
    cmd.release.structType = GPFS_FREE_RANGE;
    cmd.release.start      = 0;
    cmd.release.length     = 0;

    rc = gpfs_fcntl(fd, &cmd);
    if (rc != 0) {
        DBGV_FPRINTF(stderr,"GPFS fcntl release failed with rc=%d, errno=%d\n",
                rc,errno);
    }
}
#endif
/* Open the file named by fd->filename with flags derived from fd->access_mode,
 * then (on one rank, or on every rank under MPI_COMM_SELF) stat the file to
 * learn the underlying GPFS block size.  ADIOI_GEN_Opencoll broadcasts the
 * block size to the other ranks afterwards.
 *
 * fd         - ADIO file handle; fd_sys/fd_direct/null_fd/blksize/fp_ind are set
 * error_code - (out) MPI_SUCCESS, or an ADIOI error built from open()'s errno */
void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, rank, rc;
    static char myname[] = "ADIOI_GPFS_OPEN";

    /* set internal variables for tuning environment variables */
    ad_gpfs_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* probe the process umask without changing it */
        old_mask = umask(022);
        umask(old_mask);
        /* Bug fix: the previous 'old_mask ^ 0666' XOR could *set* mode bits
         * the umask was supposed to clear (e.g. umask 0111 yielded 0777,
         * adding execute bits).  Clear the masked bits instead. */
        perm = ~old_mask & 0666;
    }
    else perm = fd->perm;

    /* translate ADIO access-mode bits to POSIX open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
    fd->fd_direct = -1;

    /* GPFSMPIO_DEVNULLIO: shadow descriptor that swallows the actual I/O */
    if (gpfsmpio_devnullio == 1) {
        fd->null_fd = open("/dev/null", O_RDWR);
    } else {
        fd->null_fd = -1;
    }

    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {
        fd->blksize = 1048576; /* default to 1M */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        /* in this fs-specific routine, we might not be called over entire
         * communicator (deferred open). Collect statistics on one process.
         * ADIOI_GEN_Opencoll (common-code caller) will take care of the
         * broadcast */
        MPI_Comm_rank(fd->comm, &rank);
        if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
            struct stat64 gpfs_statbuf;
            /* Get the (real) underlying file system block size */
            rc = stat64(fd->filename, &gpfs_statbuf);
            if (rc >= 0)
            {
                fd->blksize = gpfs_statbuf.st_blksize;
                DBGV_FPRINTF(stderr,"Successful stat '%s'. Blocksize=%ld\n",
                        fd->filename,gpfs_statbuf.st_blksize);
            }
            else
            {
                DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                        fd->filename,rc,errno);
            }
        }
        /* all other ranks have incorrect fd->blocksize, but ADIOI_GEN_Opencoll
         * will take care of that in both standard and deferred-open case */
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif

#ifdef HAVE_GPFS_FCNTL_H
        /* in parallel workload, might be helpful to immediately release block
         * tokens. Or, system call overhead will outweigh any benefits... */
        if (getenv("ROMIO_GPFS_FREE_LOCKS")!=NULL)
            gpfs_free_all_locks(fd->fd_sys);
#endif
    }

    if (fd->fd_sys == -1) {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
    }
    else *error_code = MPI_SUCCESS;
}
/*
*vim: ts=8 sts=4 sw=4 noexpandtab
*/

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,277 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_tuning.c
* \brief Defines ad_gpfs performance tuning
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*---------------------------------------------------------------------
* ad_gpfs_tuning.c
*
* defines global variables and functions for performance tuning and
* functional debugging.
*---------------------------------------------------------------------*/
#include "ad_gpfs_tuning.h"
#include "mpi.h"
#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif
int gpfsmpio_timing;
int gpfsmpio_timing2;
int gpfsmpio_timing_cw_level;
int gpfsmpio_comm;
int gpfsmpio_tunegather;
int gpfsmpio_tuneblocking;
long bglocklessmpio_f_type;
int gpfsmpio_bg_nagg_pset;
int gpfsmpio_pthreadio;
int gpfsmpio_p2pcontig;
int gpfsmpio_balancecontig;
int gpfsmpio_devnullio;
int gpfsmpio_bridgeringagg;
double gpfsmpio_prof_cw [GPFSMPIO_CIO_LAST+1];
double gpfsmpio_prof_cr [GPFSMPIO_CIO_LAST+1];
/* set internal variables for tuning environment variables */
/** \page mpiio_vars MPIIO Configuration
\section env_sec Environment Variables
* - GPFSMPIO_COMM - Define how data is exchanged on collective
* reads and writes. Possible values:
* - 0 - Use MPI_Alltoallv.
* - 1 - Use MPI_Isend/MPI_Irecv.
* - Default is 0.
*
* - GPFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
* Possible values:
* - 0 - Do not collect/report timing.
* - 1 - Collect/report timing.
* - Default is 0.
*
* - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
* for aggregator collective i/o. Possible values:
* - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
* - Default is 1.
*
* - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
* calculated (block size). Possible values:
* - 0 - Evenly calculate file domains across aggregators. Also use
* MPI_Isend/MPI_Irecv to exchange domain information.
* - 1 - Align file domains with the underlying file system's block size. Also use
* MPI_Alltoallv to exchange domain information.
* - Default is 1.
*
* - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
* the ad_bglockless driver. NOTE: Using romio prefixes (such as
* "bg:" or "bglockless:") on a file name will override this environment
* variable. Possible values:
* - 0xnnnnnnnn - Any valid file system type (or "magic number") from
* statfs() field f_type.
* - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
*
* - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
* compute group (compute nodes + i/o nodes). Possible values:
* - any integer
* - Default is 8
*
* - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous io where a
* pthread is spawned to do the posix writes while the main thread does the
* data aggregation - useful for large files where multiple rounds are
* required (more than the cb_buffer_size of data per aggregator). User
* must ensure there is hw resource available for the thread to run. I
* am sure there is a better way to do this involving comm threads - this is
* just a start. NOTE: For some reason the stats collected when this is
* enabled misses some of the data so the data sizes are off a bit - this is
* a statistical issue only, the data is still accurately written out
*
* - GPFSMPIO_P2PCONTIG - Does simple point-to-point communication between the
* aggregator and the procs that feed it. Performance could be enhanced by a
* one-sided put algorithm. Current implementation allows only 1 round of
* data. Useful/allowed only when:
* 1.) The datatype is contiguous.
* 2.) The offsets are increasing in rank-order.
* 3.) There are no gaps between the offsets.
* 4.) No single rank has a data size which spans multiple file domains.
*
* - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ. File domain blocks are assigned
* to aggregators in a breadth-first fashion relative to the ions - additionally,
* file domains on the aggregators sharing the same bridgeset and ion have contiguous
* offsets. The breadth-first assignment improves performance in the case of
* a relatively small file of size less than the gpfs block size multiplied
* by the number of ions. Files: ad_gpfs_aggrs.c ad_bg_aggrs.c. Possible Values
* - 0 - assign file domain blocks in the traditional manner
* - 1 - if there are variable sized file domain blocks, spread them out
* (balance) across bridge nodes
*
* - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
* system. When experimenting with different two-phase I/O strategies, it's
* helpful to remove the highly variable file system from the experiment.
* - 0 (disabled) or 1 (enabled)
* - Default is 0
*
* - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ. Aggregator placement
* optimization which forms a 5-d ring around the bridge node starting at
* GPFSMPIO_BRIDGERINGAGG hops away. Experimental performance results
* suggest best value is 1 and only in conjunction with GPFSMPIO_P2PCONTIG
* and GPFSMPIO_BALANCECONTIG. The number of aggregators selected is still
* GPFSMPIO_NAGG_PSET however the bridge node itself is NOT selected.
*
*/
/* Fetch one integer tuning knob from the environment, falling back to a
 * default when the variable is unset. */
static int ad_gpfs_env_int(const char *name, int dflt)
{
    const char *v = getenv(name);
    return (v != NULL) ? atoi(v) : dflt;
}

/* Populate the gpfsmpio_* / bglocklessmpio_* tuning globals from the
 * environment (see the variable reference in the comment block above). */
void ad_gpfs_get_env_vars() {
    const char *v;
    char *endp;

    gpfsmpio_comm          = ad_gpfs_env_int( "GPFSMPIO_COMM", 0 );
    gpfsmpio_timing        = ad_gpfs_env_int( "GPFSMPIO_TIMING", 0 );
    gpfsmpio_tunegather    = ad_gpfs_env_int( "GPFSMPIO_TUNEGATHER", 1 );
    gpfsmpio_tuneblocking  = ad_gpfs_env_int( "GPFSMPIO_TUNEBLOCKING", 1 );

    /* f_type accepts any strtol base (hex magic numbers are typical) */
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
    v = getenv( "BGLOCKLESSMPIO_F_TYPE" );
    if (v) bglocklessmpio_f_type = strtol(v,&endp,0);
    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
            bglocklessmpio_f_type,bglocklessmpio_f_type);

    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
     * when we know a bit more about what "largest possible value" and
     * "smallest possible value" should be */
    gpfsmpio_bg_nagg_pset  = ad_gpfs_env_int( "GPFSMPIO_NAGG_PSET",
                                              ADIOI_BG_NAGG_PSET_DFLT );

    gpfsmpio_pthreadio     = ad_gpfs_env_int( "GPFSMPIO_PTHREADIO", 0 );
    gpfsmpio_p2pcontig     = ad_gpfs_env_int( "GPFSMPIO_P2PCONTIG", 0 );
    gpfsmpio_balancecontig = ad_gpfs_env_int( "GPFSMPIO_BALANCECONTIG", 0 );
    gpfsmpio_devnullio     = ad_gpfs_env_int( "GPFSMPIO_DEVNULLIO", 0 );
    gpfsmpio_bridgeringagg = ad_gpfs_env_int( "GPFSMPIO_BRIDGERINGAGG", 0 );
}
/* report timing breakdown for MPI I/O collective call
 *
 * rw     : 0 = read-side counters (gpfsmpio_prof_cr), nonzero = write-side
 *          counters (gpfsmpio_prof_cw)
 * fd     : open ADIO file handle; fd->comm is the collective communicator and
 *          fd->is_agg selects which ranks join the aggregator sub-communicator
 * myrank : unused in this implementation (kept for the macro call signature)
 * nprocs : unused in this implementation (kept for the macro call signature)
 *
 * No-op unless the GPFSMPIO_TIMING environment knob enabled gpfsmpio_timing.
 * All ranks in fd->comm must call this: MPI_Comm_split and the reductions are
 * collective over fd->comm / agg_comm respectively.
 */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    int i;

    if (gpfsmpio_timing) {
	/* Timing across the whole communicator is a little bit interesting,
	 * but what is *more* interesting is if we single out the aggregators
	 * themselves.  non-aggregators spend a lot of time in "exchange" not
	 * exchanging data, but blocked because they are waiting for
	 * aggregators to finish writing.  If we focus on just the aggregator
	 * processes we will get a more clear picture about the data exchange
	 * vs. i/o time breakdown */

	/* if deferred open enabled, we could use the aggregator communicator */
	MPI_Comm agg_comm;
	int nr_aggs, agg_rank;
	/* non-aggregators pass MPI_UNDEFINED and get MPI_COMM_NULL back */
	MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
	if(agg_comm != MPI_COMM_NULL) {
	    MPI_Comm_size(agg_comm, &nr_aggs);
	    MPI_Comm_rank(agg_comm, &agg_rank);
	}

	/* select the read or write counter array based on rw */
	double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
	if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw;

	double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ];
	double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ];

	/* reduce over aggregators only; root is agg rank 0 */
	if( agg_comm != MPI_COMM_NULL) {
	    MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
	    MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
	}
	if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {
	    /* turn the SUMs into averages over the aggregator count */
	    for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs;

	    /* derive bandwidth figures: total bytes / slowest-aggregator time */
	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW  ] =
		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW  ];
	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW  ] =
		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW  ];

	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] =
		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ];

	    /* single CSV-ish line on stderr; one record per collective call.
	     * NOTE(review): "%12.4lld" applies a precision to an integer
	     * conversion, which is unusual but harmless -- confirm intent */
	    fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
	    fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs));
	    fprintf(stderr,"SEEK-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ]     );
	    fprintf(stderr,"SEEK-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ]     );
	    fprintf(stderr,"LOCAL-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ]    );
	    fprintf(stderr,"GATHER-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ]   );
	    fprintf(stderr,"PATTERN-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ]   );
	    fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ]  );
	    fprintf(stderr,"MYREQ-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ]    );
	    fprintf(stderr,"OTHERREQ-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ]   );
	    fprintf(stderr,"EXCHANGE-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ]    );
	    fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH]  );
	    fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP]  );
	    fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET]  );
	    fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT]  );
	    fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE]  );
	    fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ]  );
	    fprintf(stderr,"POSIX-TIME-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ]  );
	    fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ]  );
	    fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] );
	    fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ]  );
	    fprintf(stderr,"MPI-BW-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ]  );
	    fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] );
	}
	if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
    }
}

Просмотреть файл

@ -0,0 +1,114 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                               */
/* ---------------------------------------------------------------- */
/**
 * \file ad_gpfs_tuning.h
 * \brief Tuning knobs (environment-variable driven) and timing
 *        instrumentation for the GPFS ADIO driver.
 */

/*---------------------------------------------------------------------
 * ad_gpfs_tuning.h
 *
 * declares global variables and macros for performance tuning and
 * functional debugging.
 *---------------------------------------------------------------------*/

#ifndef AD_GPFS_TUNING_H_
#define AD_GPFS_TUNING_H_

#include "adio.h"

/*-----------------------------------------
 * Global variables for the control of
 * 1.  timing
 * 2.  select specific optimizations
 *-----------------------------------------*/

/* timing fields
 * Indexes into the gpfsmpio_prof_cr / gpfsmpio_prof_cw counter arrays;
 * T_* entries hold elapsed seconds, B_* entries hold derived bandwidths,
 * DATA_SIZE holds a byte count. */
enum {
    GPFSMPIO_CIO_DATA_SIZE=0,
    GPFSMPIO_CIO_T_SEEK,
    GPFSMPIO_CIO_T_LCOMP,	/* time for ADIOI_Calc_my_off_len(), local */
    GPFSMPIO_CIO_T_GATHER,	/* time for previous MPI_Allgather, now Allreduce */
    GPFSMPIO_CIO_T_PATANA,	/* time for a quick test if access is contiguous or not, local */
    GPFSMPIO_CIO_T_FD_PART,	/* time for file domain partitioning, local */
    GPFSMPIO_CIO_T_MYREQ,	/* time for ADIOI_Calc_my_req(), local */
    GPFSMPIO_CIO_T_OTHREQ,	/* time for ADIOI_Calc_others_req(), short Alltoall */
    GPFSMPIO_CIO_T_DEXCH,	/* time for I/O data exchange */
    /* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
    GPFSMPIO_CIO_T_DEXCH_RECV_EXCH,/* time for each process to exchange receive
				      size info with everyone else */
    GPFSMPIO_CIO_T_DEXCH_SETUP,	/* time for setup portion of I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_NET,	/* time for network portion of I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_SORT,	/* time to sort requests in I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_SIEVE,	/* time for read portion of RMW in two phase */
    GPFSMPIO_CIO_T_POSI_RW,
    GPFSMPIO_CIO_B_POSI_RW,
    GPFSMPIO_CIO_T_MPIO_RW,	/* time for ADIOI_WriteContig() */
    GPFSMPIO_CIO_B_MPIO_RW,
    GPFSMPIO_CIO_T_MPIO_CRW,	/* time for ADIOI_GPFS_WriteStridedColl() */
    GPFSMPIO_CIO_B_MPIO_CRW,
    GPFSMPIO_CIO_LAST
};

/* +1 because GPFSMPIO_CIO_LAST is actually used to say "zero this counter" */
extern double	gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
extern double	gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];

/* corresponds to environment variables to select optimizations and timing level */
extern int 	gpfsmpio_timing;
extern int      gpfsmpio_timing_cw_level;
extern int 	gpfsmpio_comm;
extern int 	gpfsmpio_tunegather;
extern int 	gpfsmpio_tuneblocking;
extern long	bglocklessmpio_f_type;
extern int      gpfsmpio_pthreadio;
extern int      gpfsmpio_p2pcontig;
extern int      gpfsmpio_balancecontig;
extern int      gpfsmpio_devnullio;
extern int      gpfsmpio_bridgeringagg;

/* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
 * i/o node and all compute nodes wired to it.  On Blue Gene /Q that
 * relationship is a lot more fluid.  There are still I/O nodes, and compute
 * nodes are assigned to an i/o node, but there are two routes to the i/o node,
 * via compute nodes designated as "bridge nodes".  In this code, what we used
 * to call a "pset" is actually "compute nodes associated with and including a
 * bridge node".  So, "nAgg" is roughly "number of aggregators per bridge", but
 * look closely at ADIOI_BG_persInfo_init() for the details */

#define ADIOI_BG_NAGG_PSET_DFLT 16

extern int     gpfsmpio_bg_nagg_pset;

/* set internal variables for tuning environment variables */
void ad_gpfs_get_env_vars(void);

/* report timing breakdown for MPI I/O collective call */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );

/* note:
 *   T := timing;
 * CIO := collective I/O
 */
/* zero all counters in the selected (read/write) profile array */
#define GPFSMPIO_T_CIO_RESET( RW ) \
	{ \
	  int _i; \
	  for ( _i = 0; _i < GPFSMPIO_CIO_LAST; _i ++ ) \
	    gpfsmpio_prof_c##RW [ _i ] = 0; \
	}

/* NOTE(review): the trailing backslash below splices the following (blank)
 * line into the macro; harmless as long as a blank line follows -- confirm */
#define GPFSMPIO_T_CIO_REPORT( RW, FD, MYRANK, NPROCS ) \
	ad_gpfs_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \

/* Timestamp helper: ISSET stores MPI_Wtime() into slot VAR1; ISGET converts
 * slot VAR2 from a start-timestamp into an elapsed-time delta.  Both may be
 * requested in one call; ISSET/ISGET are evaluated once each. */
#define GPFSMPIO_T_CIO_SET_GET(RW, ISSET, ISGET, VAR1, VAR2 ) \
	{\
	 double temp = MPI_Wtime(); \
	 if ( ISSET ) gpfsmpio_prof_c##RW [ VAR1 ] = temp; \
	 if ( ISGET ) gpfsmpio_prof_c##RW [ VAR2 ] = temp - gpfsmpio_prof_c##RW [ VAR2 ] ;\
	}

#endif  /* AD_GPFS_TUNING_H_ */

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,18 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2012 by Argonne National Laboratory.
##     See COPYRIGHT in top-level directory.
##
## Blue Gene specific portion of the GPFS ADIO driver.  Only built when
## configure enabled the BUILD_AD_BG conditional; the lists below append to
## the header/source variables defined by the parent romio makefile.

if BUILD_AD_BG

noinst_HEADERS +=             \
    adio/ad_gpfs/bg/ad_bg_aggrs.h \
    adio/ad_gpfs/bg/ad_bg_pset.h

romio_other_sources +=        \
    adio/ad_gpfs/bg/ad_bg_aggrs.c \
    adio/ad_gpfs/bg/ad_bg_pset.c

endif BUILD_AD_BG

Просмотреть файл

@ -0,0 +1,675 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_aggrs.c
* \brief The externally used function from this file is is declared in ad_bg_aggrs.h
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997-2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*#define TRACE_ON */
// Uncomment this line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
// #define balancecontigtrace 1
// #define bridgeringaggtrace 1
#include "adio.h"
#include "adio_cb_config_list.h"
#include "../ad_gpfs.h"
#include "ad_bg_pset.h"
#include "ad_bg_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
#ifndef TRACE_ERR
# define TRACE_ERR(format...)
#endif
/* Comments copied from common:
* This file contains four functions:
*
* ADIOI_Calc_aggregator()
* ADIOI_Calc_file_domains()
* ADIOI_Calc_my_req()
* ADIOI_Calc_others_req()
*
* The last three of these were originally in ad_read_coll.c, but they are
* also shared with ad_write_coll.c. I felt that they were better kept with
* the rest of the shared aggregation code.
*/
/* Discussion of values available from above:
*
* ADIO_Offset st_offsets[0..nprocs-1]
* ADIO_Offset end_offsets[0..nprocs-1]
* These contain a list of start and end offsets for each process in
* the communicator. For example, an access at loc 10, size 10 would
* have a start offset of 10 and end offset of 19.
* int nprocs
* number of processors in the collective I/O communicator
* ADIO_Offset min_st_offset
* ADIO_Offset fd_start[0..nprocs_for_coll-1]
* starting location of "file domain"; region that a given process will
* perform aggregation for (i.e. actually do I/O)
* ADIO_Offset fd_end[0..nprocs_for_coll-1]
* start + size - 1 roughly, but it can be less, or 0, in the case of
* uneven distributions
*/
/* forward declaration */
static void
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
const ADIOI_BG_ConfInfo_t *confInfo,
ADIOI_BG_ProcInfo_t *all_procInfo);
/*
 * Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
 * The parameters are
 *  . the number of aggregators (proxies) : fd->hints->cb_nodes
 *  . the ranks of the aggregators        : fd->hints->ranklist
 * By computing these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of
 *	ADIO can work more efficiently.
 *
 * Collective over fd->comm (gather + broadcast inside).  Always returns 0.
 * n_aggrs_per_pset is forwarded to ADIOI_BG_persInfo_init().
 */
int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
{
    int r, s;
    ADIOI_BG_ProcInfo_t  *procInfo, *all_procInfo;
    ADIOI_BG_ConfInfo_t  *confInfo;
    TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");

    MPI_Comm_size( fd->comm, &s );
    MPI_Comm_rank( fd->comm, &r );

    /* Collect individual BG personality information */
    confInfo = ADIOI_BG_ConfInfo_new ();
    procInfo = ADIOI_BG_ProcInfo_new ();
    ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);

    /* Gather BG personality information onto process 0.
     * NOTE: all_procInfo is allocated (and the struct gathered as raw bytes)
     * on every rank even though only rank 0 reads it. */
    /*   if (r == 0) */
    all_procInfo  = ADIOI_BG_ProcInfo_new_n (s);

    MPI_Gather( (void *)procInfo,     sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
		(void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
		0,
		fd->comm );

    /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
    if (r == 0) {
	ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo);
	/* ADIOI_BG_ProcInfo_free (all_procInfo);*/
    }
    ADIOI_BG_ProcInfo_free (all_procInfo);

    /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
       Declared in adio_cb_config_list.h */
    ADIOI_cb_bcast_rank_map(fd);
    if (gpfsmpio_balancecontig == 1) { /* additionally need to send bridgelist,
					  bridgelistnum and numbridges to all
					  ranks */
	if (r != 0) {
	    fd->hints->fs_hints.bg.bridgelist =
		ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
	    if (fd->hints->fs_hints.bg.bridgelist == NULL) {
		/* NEED TO HANDLE ENOMEM */
	    }
	}
	MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0,
		fd->comm);

	if (r != 0) {
	    fd->hints->fs_hints.bg.bridgelistnum =
		ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
	    if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
		/* NEED TO HANDLE ENOMEM */
	    }
	}
	MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes,
		MPI_INT, 0, fd->comm);

	MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0,
		fd->comm);
    }

    ADIOI_BG_persInfo_free( confInfo, procInfo );
    TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
    return 0;
}
/* There are some number of bridge nodes (randomly) distributed through the job
* We need to split the nodes among the bridge nodes */
/* Maybe find which bridge node is closer (manhattan distance) and try to
* distribute evenly.
*/
/*
* Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
* The first order of tmp_ranklist is : PSET number
* The secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and thus adjustable.
*/
/* (rank, bridge-rank) pair used to sort MPI ranks by their bridge node */
typedef struct
{
    int rank;
    int bridge;
} sortstruct;

/* running count of aggregators assigned to one bridge node */
typedef struct
{
    int bridgeRank;
    int numAggsAssigned;
} bridgeAggAssignment;

/* qsort comparator: orders sortstruct entries by ascending bridge rank.
 * Implemented as a sign comparison rather than the classic
 * "i1->bridge - i2->bridge" subtraction, which can overflow (undefined
 * behavior) when the two values are near opposite ends of the int range.
 * Returns <0, 0, >0 as qsort requires. */
static int intsort(const void *p1, const void *p2)
{
    const sortstruct *i1 = (const sortstruct *)p1;
    const sortstruct *i2 = (const sortstruct *)p2;
    return (i1->bridge > i2->bridge) - (i1->bridge < i2->bridge);
}
/* Core aggregator-selection algorithm, run on rank 0 only.
 *
 * confInfo     : partition-wide configuration (nProcs, bridge count, ratios)
 * all_procInfo : per-rank personality info gathered from every process
 * tmp_ranklist : out; caller-allocated (nProcs ints), receives the chosen
 *                aggregator ranks
 * returns      : number of aggregators written into tmp_ranklist
 *
 * Two strategies: when gpfsmpio_bridgeringagg > 0, aggregators are picked by
 * growing "rings" of increasing manhattan distance around each bridge node;
 * otherwise (default) aggregators are evenly spaced midpoints in the
 * bridge-sorted rank list, plus the bridge nodes themselves.
 */
static int
ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
					 ADIOI_BG_ProcInfo_t       *all_procInfo,
					 int *tmp_ranklist)
{
    TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
    /* BES: This should be done in the init routines probably. */
    int i, j;
    int aggTotal;
    int *aggList;

    if (gpfsmpio_bridgeringagg > 0) {

	int numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
	/* the number of aggregators is (numAggs per bridgenode) */
	if(numAggs == 1)
	    aggTotal = 1;
	else
	    aggTotal = confInfo->numBridgeRanks * numAggs;

	aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
	if(aggTotal == 1) { /* special case when we only have one bridge node */

	    sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
	    for(i=0; i < confInfo->nProcs; i++)
	    {
		bridgelist[i].bridge = all_procInfo[i].bridgeRank;
		bridgelist[i].rank = i;
		TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
	    }

	    /* This list contains rank->bridge info. Now, we need to sort this list. */
	    qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);

	    /* the sole aggregator is the (single) bridge node itself */
	    aggList[0] = bridgelist[0].bridge;
	    ADIOI_Free(bridgelist);
	}
	else { // aggTotal > 1

	    int currentAggListSize = 0;
	    int numBridgesWithAggAssignments = 0;
	    /* one entry per bridge that has received at least one aggregator */
	    bridgeAggAssignment *aggAssignments = (bridgeAggAssignment *)ADIOI_Malloc(confInfo->numBridgeRanks * sizeof(bridgeAggAssignment));

	    int partitionSize = all_procInfo[0].numNodesInPartition;
	    /* one flag per compute node: at most one aggregator per node */
	    int *nodesAssigned = (int *)ADIOI_Malloc(partitionSize * sizeof(int));
	    for (i=0;i<partitionSize;i++)
		nodesAssigned[i] = 0;

	    int currentNumHops = gpfsmpio_bridgeringagg;
	    int allAggsAssigned = 0;

	    /* Iterate thru the process infos and select aggregators starting at currentNumHops
	       away.  Increase the currentNumHops until all bridges have numAggs assigned to them.
	    */
	    while (!allAggsAssigned) {

		/* track whether any aggs are selected during this round */
		int startingCurrentAggListSize = currentAggListSize;
		int numIterForHopsWithNoAggs = 0;

		for (i=0;i<confInfo->nProcs;i++) {
		    if (all_procInfo[i].manhattanDistanceToBridge == currentNumHops) {
			if (nodesAssigned[all_procInfo[i].nodeRank] == 0) { // node is not assigned as an agg yet
			    int foundBridge = 0;
			    /* look for an existing assignment record for this rank's bridge */
			    for (j=0;(j<numBridgesWithAggAssignments && !foundBridge);j++) {
				if (aggAssignments[j].bridgeRank == all_procInfo[i].bridgeRank) {
				    foundBridge = 1;
				    if (aggAssignments[j].numAggsAssigned < numAggs) {
					aggAssignments[j].numAggsAssigned++;
					nodesAssigned[all_procInfo[i].nodeRank] = 1;
					aggList[currentAggListSize] = all_procInfo[i].rank;
					currentAggListSize++;
#ifdef bridgeringaggtrace
					printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
#endif
				    }
				}
			    }
			    if (!foundBridge) {
				/* first aggregator for this bridge: create its record */
				aggAssignments[numBridgesWithAggAssignments].bridgeRank = all_procInfo[i].bridgeRank;
				aggAssignments[numBridgesWithAggAssignments].numAggsAssigned = 1;
				numBridgesWithAggAssignments++;
				nodesAssigned[all_procInfo[i].nodeRank] = 1;
				aggList[currentAggListSize] = all_procInfo[i].rank;
				currentAggListSize++;
#ifdef bridgeringaggtrace
				printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
#endif
			    }
			}
		    }
		}

		/* done only when every bridge has a record AND each is full */
		if (numBridgesWithAggAssignments == confInfo->numBridgeRanks) {
		    allAggsAssigned = 1;
		    for (i=0;(i<numBridgesWithAggAssignments && allAggsAssigned);i++) {
			if (aggAssignments[i].numAggsAssigned < numAggs)
			    allAggsAssigned = 0;
		    }
		}
		currentNumHops++;

		/* If 3 rounds go by without selecting an agg abort to avoid
		   infinite loop.
		*/
		if (startingCurrentAggListSize == currentAggListSize)
		    numIterForHopsWithNoAggs++;
		else
		    numIterForHopsWithNoAggs = 0;
		ADIOI_Assert(numIterForHopsWithNoAggs <= 3);
	    }

	    ADIOI_Free(aggAssignments);
	    ADIOI_Free(nodesAssigned);

	} // else aggTotal > 1

	memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
    } // gpfsmpio_bridgeringagg > 0

    else { // gpfsmpio_bridgeringagg unset - default code

	int distance, numAggs;

	/* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
	 * bridge node */

	sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
	for(i=0; i < confInfo->nProcs; i++)
	{
	    bridgelist[i].bridge = all_procInfo[i].bridgeRank;
	    bridgelist[i].rank = i;
	    TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
	}

	/* This list contains rank->bridge info. Now, we need to sort this list. */
	qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);

	/* In this array, we can pick an appropriate number of midpoints based on
	 * our bridgenode index and the number of aggregators */
	numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
	if(numAggs == 1)
	    aggTotal = 1;
	else
	    /* the number of aggregators is (numAggs per bridgenode) plus each
	     * bridge node is an aggregator */
	    aggTotal = confInfo->numBridgeRanks * (numAggs+1);

	if(aggTotal>confInfo->nProcs) aggTotal=confInfo->nProcs;

	TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d/%d numAggs: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks, confInfo->ioMinSize, confInfo->ioMaxSize /*virtualPsetSize*/, numAggs, aggTotal);
	aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));

	/* For each bridge node, determine who the aggregators will be */
	/* basically, the n*distance and bridge node */
	if(aggTotal == 1) /* special case when we only have one bridge node */
	    aggList[0] = bridgelist[0].bridge;
	else
	{
	    /* walk the sorted list backwards, one bridge group ("pset") at a time */
	    int lastBridge = bridgelist[confInfo->nProcs-1].bridge;
	    int nextBridge = 0, nextAggr = confInfo->numBridgeRanks;
	    int psetSize = 0;
	    int procIndex;
	    for(procIndex=confInfo->nProcs-1; procIndex>=0; procIndex--)
	    {
		TRACE_ERR("bridgelist[%d].bridge %u/rank %u\n",procIndex,  bridgelist[procIndex].bridge, bridgelist[procIndex].rank);
		if(lastBridge == bridgelist[procIndex].bridge)
		{
		    psetSize++;
		    if(procIndex) continue;
		    else procIndex--;/* procIndex == 0 */
		}
		/* Sets up a list of nodes which will act as aggregators. numAggs
		 * per bridge node total. The list of aggregators is
		 * bridgeNode 0
		 * bridgeNode 1
		 * bridgeNode ...
		 * bridgeNode N
		 * bridgeNode[0]aggr[0]
		 * bridgeNode[0]aggr[1]...
		 * bridgeNode[0]aggr[N]...
		 * ...
		 * bridgeNode[N]aggr[0]..
		 * bridgeNode[N]aggr[N]
		 */
		aggList[nextBridge]=lastBridge;
		distance = psetSize/numAggs;
		TRACE_ERR("nextBridge %u is bridge %u, distance %u, size %u\n",nextBridge, aggList[nextBridge],distance,psetSize);
		if(numAggs>1)
		{
		    for(j = 0; j < numAggs; j++)
		    {
			ADIOI_Assert(nextAggr<aggTotal);
			aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
			TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
			if(aggList[nextAggr]==lastBridge) /* can't have bridge in the list twice */
			{
			    aggList[nextAggr] = bridgelist[procIndex+psetSize].rank; /* take the last one in the pset */
			    TRACE_ERR("replacement agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+psetSize,aggList[nextAggr]);
			}
			nextAggr++;
		    }
		}
		if(procIndex<0) break;
		lastBridge = bridgelist[procIndex].bridge;
		psetSize = 1;
		nextBridge++;
	    }
	}

	TRACE_ERR("memcpy(tmp_ranklist, aggList, (numAggs(%u)*confInfo->numBridgeRanks(%u)+numAggs(%u)) (%u) %u*sizeof(int))\n",numAggs,confInfo->numBridgeRanks,numAggs,(numAggs*confInfo->numBridgeRanks+numAggs),aggTotal);
	memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
	for(i=0;i<aggTotal;i++)
	{
	    TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
	}

	ADIOI_Free (bridgelist);

	TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
    }

    ADIOI_Free (aggList);
    return aggTotal;
}
/*
 * compute aggregators ranklist and put it into fd->hints struct
 *
 * Runs on rank 0 only (called from ADIOI_BG_gen_agg_ranklist).  Delegates the
 * actual selection to ADIOI_BG_compute_agg_ranklist_serial_do(), then stores
 * the result in fd->hints->ranklist / cb_nodes.  When gpfsmpio_balancecontig
 * is enabled the ranklist is additionally re-sorted by I/O node and bridge
 * rank, and the bridge bookkeeping arrays are stored in
 * fd->hints->fs_hints.bg for later file-domain assignment.
 */
static void
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
				       const ADIOI_BG_ConfInfo_t *confInfo,
				       ADIOI_BG_ProcInfo_t *all_procInfo)
{
    TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
    int i;
    int naggs;
    int size;
    int *tmp_ranklist;

    /* compute the ranklist of IO aggregators and put into tmp_ranklist */
    tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));

#   if AGG_DEBUG
    for (i=0; i<confInfo->nProcs; i++) {
	DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].coreID, all_procInfo[i].rank );
    }
#   endif

    naggs=
	ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, tmp_ranklist);

#   define VERIFY 1
#   if VERIFY
    DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, tsize: %3d, ratio: %.4f; naggs = %d\n",
	    confInfo->ioMinSize        ,
	    confInfo->ioMaxSize        ,
	    confInfo->nAggrs           ,
	    confInfo->numBridgeRanks   ,
	    confInfo->nProcs           ,
	    confInfo->ioMaxSize /*virtualPsetSize*/,
	    confInfo->cpuIDsize,
	    confInfo->aggRatio         ,
	    naggs );
#   endif
    MPI_Comm_size( fd->comm, &size );
    /* This fix is for when the bridgenode rnk is not part of the particular
     * subcomm associated with this MPI File operation. I don't know if
     * this is the best/right answer but it passes the test cases at least.
     * I don't know how common file IO in subcomms is anyway... */
    for(i=0;i<naggs;i++)
    {
	/* NOTE(review): valid ranks are 0..size-1, so this test misses the
	 * tmp_ranklist[i] == size case; presumably should be >= -- confirm */
	if(tmp_ranklist[i] > size)
	{
	    TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
		    i, tmp_ranklist[i], fd->comm);
	    tmp_ranklist[i] = 0;
	}
    }

#   if AGG_DEBUG
    for (i=0; i<naggs; i++) {
	DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
    }
#   endif
    if (gpfsmpio_balancecontig == 1) {
	/* what comes out of this code block is the agg ranklist sorted by
	 * bridge set and ion id with associated bridge info stored in the
	 * hints structure for later access during file domain assignment */

	// sort the agg ranklist by ions and bridges

	int *interleavedbridgeranklist = (int *) ADIOI_Malloc (naggs * sizeof(int)); // resorted agg rank list
	/* list of all bridge ranks */
	int *bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));

	/* each entry here is the number of aggregators associated with the
	 * bridge rank of the same index in bridgelist */
	int *bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));

	/* list of all ion IDs corresponding with bridgelist entries of same index */
	int *ionlist = (int *) ADIOI_Malloc (naggs * sizeof(int));

	int numbridges = 0;

	for (i=0;i<naggs;i++)
	    bridgelistnum[i] = 0;

	/* Each entry in this list corresponds with the bridgelist and will contain the lowest bridge
	 * agg rank on that ion. */
	int *summarybridgeminionaggrank = (int *) ADIOI_Malloc (naggs * sizeof(int));
	for (i=0;i<naggs;i++)
	    summarybridgeminionaggrank[i] = -1;

	/* build the bridgelist, ionlist and bridgelistnum data by going thru each agg
	 * entry and find the associated bridge list index - at the end we will
	 * know how many aggs belong to each bridge in each ion */
	for (i=0;i<naggs;i++) {
	    int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
	    int aggionid = all_procInfo[tmp_ranklist[i]].ionID;
	    int foundrank = 0;
	    int summaryranklistbridgeindex = 0;
	    int j;
	    for (j=0;(j<numbridges && !foundrank);j++) {
		if (bridgelist[j] == aggbridgerank) {
		    foundrank = 1;
		    summaryranklistbridgeindex = j;
		}
		else
		    summaryranklistbridgeindex++;
	    }
	    if (!foundrank) {
		/* first aggregator seen for this bridge: record it */
		bridgelist[summaryranklistbridgeindex] = aggbridgerank;
		ionlist[summaryranklistbridgeindex] = aggionid;
		if (summarybridgeminionaggrank[summaryranklistbridgeindex] == -1)
		    summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
		else if (summarybridgeminionaggrank[summaryranklistbridgeindex] > aggbridgerank)
		    summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
		numbridges++;
	    }

	    bridgelistnum[summaryranklistbridgeindex]++;
	}

	/* at this point summarybridgeminionaggrank has the agg rank of the bridge for entries,
	 * need to make each entry the minimum bridge rank for the entire ion. */
	for (i=0;i<numbridges;i++) {
	    int aggIonId = ionlist[i];
	    int j;
	    for (j=0;j<numbridges;j++) {
		if (ionlist[j] == aggIonId) {
		    if (summarybridgeminionaggrank[j] < summarybridgeminionaggrank[i])
			summarybridgeminionaggrank[i] = summarybridgeminionaggrank[j];
		}
	    }
	}

	// resort by io node minimum bridge rank (simple bubble sort of 4 parallel arrays)
	int x;
	for (x=0;x<numbridges;x++) {
	    for (i=0;i<(numbridges-1);i++) {
		if (summarybridgeminionaggrank[i] > summarybridgeminionaggrank[i+1]) {
		    int tmpminionaggrank = summarybridgeminionaggrank[i];
		    summarybridgeminionaggrank[i] = summarybridgeminionaggrank[i+1];
		    summarybridgeminionaggrank[i+1] = tmpminionaggrank;
		    int tmpionid = ionlist[i];
		    ionlist[i] = ionlist[i+1];
		    ionlist[i+1] = tmpionid;
		    int tmpbridgerank = bridgelist[i];
		    bridgelist[i] = bridgelist[i+1];
		    bridgelist[i+1] = tmpbridgerank;
		    int tmpbridgeranknum = bridgelistnum[i];
		    bridgelistnum[i] = bridgelistnum[i+1];
		    bridgelistnum[i+1] = tmpbridgeranknum;
		}
	    }
	}

	// for each io node make sure bridgelist is in rank order
	int startSortIndex = -1;
	int endSortIndex = -1;
	int currentBridgeIndex = 0;

	while (currentBridgeIndex < numbridges) {
	    int currentIonId = ionlist[currentBridgeIndex];
	    startSortIndex = currentBridgeIndex;
	    while (ionlist[currentBridgeIndex] == currentIonId)
		currentBridgeIndex++;
	    endSortIndex = currentBridgeIndex-1;
	    for (x=startSortIndex;x<=endSortIndex;x++) {
		for (i=startSortIndex;i<endSortIndex;i++) {
		    if (bridgelist[i] > bridgelist[i+1]) {
			int tmpbridgerank = bridgelist[i];
			bridgelist[i] = bridgelist[i+1];
			bridgelist[i+1] = tmpbridgerank;
			int tmpbridgeranknum = bridgelistnum[i];
			bridgelistnum[i] = bridgelistnum[i+1];
			bridgelistnum[i+1] = tmpbridgeranknum;
		    }
		}
	    }
	}

	/* populate interleavedbridgeranklist - essentially the agg rank list
	 * is now sorted by the ion minimum bridge rank and bridge node */
	int currentrankoffset = 0;
	for (i=0;i<numbridges;i++) {
	    int *thisBridgeAggList = (int *) ADIOI_Malloc (naggs * sizeof(int));
	    int numAggsForThisBridge = 0;
	    int k;
	    for (k=0;k<naggs;k++) {
		int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
		if (aggbridgerank == bridgelist[i]) {
		    thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
		    numAggsForThisBridge++;
		}
	    }

	    // sort thisBridgeAggList
	    for (x=0;x<numAggsForThisBridge;x++) {
		int n;
		for (n=0;n<(numAggsForThisBridge-1);n++) {
		    if (thisBridgeAggList[n] > thisBridgeAggList[n+1]) {
			int tmpthisBridgeAggList = thisBridgeAggList[n];
			thisBridgeAggList[n] = thisBridgeAggList[n+1];
			thisBridgeAggList[n+1] = tmpthisBridgeAggList;
		    }
		}
	    }
	    int n;
	    for (n=0;n<numAggsForThisBridge;n++) {
		interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
		currentrankoffset++;
	    }
	    ADIOI_Free(thisBridgeAggList);
	}
#ifdef balancecontigtrace
	fprintf(stderr,"Interleaved aggregator list:\n");
	for (i=0;i<naggs;i++) {
	    fprintf(stderr,"Agg: %d Agg rank: %d with bridge rank %d and ion ID %d\n",i,interleavedbridgeranklist[i],all_procInfo[interleavedbridgeranklist[i]].bridgeRank,all_procInfo[interleavedbridgeranklist[i]].ionID);
	}
	fprintf(stderr,"Bridges list:\n");
	for (i=0;i<numbridges;i++) {
	    fprintf(stderr,"bridge %d ion min rank %d rank %d number of aggs %d ion id %d\n",i,summarybridgeminionaggrank[i],bridgelist[i],bridgelistnum[i],ionlist[i]);
	}
#endif
	/* copy the ranklist of IO aggregators to fd->hints */
	if(fd->hints->ranklist != NULL)
	    ADIOI_Free (fd->hints->ranklist);
	if(fd->hints->fs_hints.bg.bridgelist != NULL)
	    ADIOI_Free (fd->hints->fs_hints.bg.bridgelist);
	if(fd->hints->fs_hints.bg.bridgelistnum != NULL)
	    ADIOI_Free (fd->hints->fs_hints.bg.bridgelistnum);

	fd->hints->cb_nodes = naggs;
	fd->hints->fs_hints.bg.numbridges = numbridges;
	fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
	memcpy( fd->hints->ranklist, interleavedbridgeranklist, naggs*sizeof(int) );

	fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
	memcpy( fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs*sizeof(int) );

	fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
	memcpy( fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs*sizeof(int) );

	ADIOI_Free(summarybridgeminionaggrank);
	ADIOI_Free( tmp_ranklist );
	ADIOI_Free( bridgelistnum );
	ADIOI_Free( bridgelist );
	ADIOI_Free( interleavedbridgeranklist );
	ADIOI_Free(ionlist);

    } else {
	/* classic topology-agnostic copy of the ranklist of IO aggregators to
	 * fd->hints */
	if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);

	fd->hints->cb_nodes = naggs;

	fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
	memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );

	ADIOI_Free( tmp_ranklist );
    }
    TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
    return;
}

Просмотреть файл

@ -0,0 +1,33 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                               */
/* ---------------------------------------------------------------- */
/**
 * \file ad_bg_aggrs.h
 * \brief Blue Gene aggregator-selection interface for the GPFS ADIO driver
 */

/*
 *
 * Declares functions specific for the BlueGene platform within the GPFS
 * parallel I/O solution.  Implements aligned file-domain partitioning
 * (7/28/2005); persistent file domain work not implemented
 *
 */

#ifndef AD_BG_AGGRS_H_
#define AD_BG_AGGRS_H_

#include "adio.h"
#include <sys/stat.h>

#ifdef HAVE_GPFS_H
#include <gpfs.h>
#endif
/* f_type magic number used to recognize a GPFS mount when gpfs.h is absent */
#if !defined(GPFS_SUPER_MAGIC)
#define GPFS_SUPER_MAGIC (0x47504653)
#endif

/* generate a list of I/O aggregators that utilizes BG-PSET organization. */
int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);

#endif  /* AD_BG_AGGRS_H_ */

Просмотреть файл

@ -0,0 +1,377 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_pset.c
* \brief Definition of functions associated to structs ADIOI_BG_ProcInfo_t and ADIOI_BG_ConfInfo_t
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/* #define TRACE_ON */
// #define bridgeringaggtrace 1
#include <stdlib.h>
#include "../ad_gpfs.h"
#include "ad_bg_pset.h"
#include <spi/include/kernel/process.h>
#include <firmware/include/personality.h>
#ifdef HAVE_MPIX_H
#include <mpix.h>
#endif
#ifndef TRACE_ERR
# define TRACE_ERR(fmt...)
#endif
/* Allocate a single ADIOI_BG_ProcInfo_t record.
 * Aborts via ADIOI_Assert if the allocation fails, so a non-NULL pointer is
 * always returned.  Pair with ADIOI_BG_ProcInfo_free(). */
ADIOI_BG_ProcInfo_t *
ADIOI_BG_ProcInfo_new()
{
    ADIOI_BG_ProcInfo_t *info;

    info = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ProcInfo_t));
    ADIOI_Assert ((info != NULL));
    return info;
}
/* Allocate an array of n ADIOI_BG_ProcInfo_t records.
 * Aborts via ADIOI_Assert if the allocation fails, so a non-NULL pointer is
 * always returned.  Pair with ADIOI_BG_ProcInfo_free(). */
ADIOI_BG_ProcInfo_t *
ADIOI_BG_ProcInfo_new_n( int n )
{
    ADIOI_BG_ProcInfo_t *array;

    array = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (n * sizeof(ADIOI_BG_ProcInfo_t));
    ADIOI_Assert ((array != NULL));
    return array;
}
/* Release storage obtained from ADIOI_BG_ProcInfo_new{,_n}().
 * A NULL argument is silently ignored. */
void
ADIOI_BG_ProcInfo_free( ADIOI_BG_ProcInfo_t *info )
{
    if (info == NULL)
	return;
    ADIOI_Free (info);
}
/* Allocate a single ADIOI_BG_ConfInfo_t record.
 * Aborts via ADIOI_Assert if the allocation fails, so a non-NULL pointer is
 * always returned.  Pair with ADIOI_BG_ConfInfo_free(). */
ADIOI_BG_ConfInfo_t *
ADIOI_BG_ConfInfo_new ()
{
    ADIOI_BG_ConfInfo_t *conf;

    conf = (ADIOI_BG_ConfInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ConfInfo_t));
    ADIOI_Assert ((conf != NULL));
    return conf;
}
/* Release storage obtained from ADIOI_BG_ConfInfo_new().
 * A NULL argument is silently ignored. */
void
ADIOI_BG_ConfInfo_free( ADIOI_BG_ConfInfo_t *info )
{
    if (info == NULL)
	return;
    ADIOI_Free (info);
}
/* (rank, bridge-coordinate) pair used to sort MPI ranks by bridge node */
typedef struct
{
    int rank;
    int bridgeCoord;
} sortstruct;

/* qsort comparator: orders sortstruct entries by ascending bridgeCoord.
 * Implemented as a sign comparison rather than the classic
 * "i1->bridgeCoord - i2->bridgeCoord" subtraction, which can overflow
 * (undefined behavior) when the two values are near opposite ends of the
 * int range.  Returns <0, 0, >0 as qsort requires. */
static int intsort(const void *p1, const void *p2)
{
    const sortstruct *i1 = (const sortstruct *)p1;
    const sortstruct *i2 = (const sortstruct *)p2;
    return (i1->bridgeCoord > i2->bridgeCoord) - (i1->bridgeCoord < i2->bridgeCoord);
}
/* Per-dimension partition extent and torus (wrap-around) flags, filled
 * in by ADIOI_BG_persInfo_init from MPIX_Hardware(). */
unsigned torusSize[MPIX_TORUS_MAX_DIMS];
unsigned dimTorus[MPIX_TORUS_MAX_DIMS];

/* This function computes the number of hops between the torus coordinates of the
 * aggCoords and bridgeCoords parameters.
 *
 * BUGFIX: coordinates are 0-based (0 .. torusSize[i]-1, as evidenced by
 * the nodeRank computation from hw.Coords/hw.Size in persInfo_init), so
 * the original comparisons against torusSize[i] could never be true and
 * the wrap-around branches were dead code; compare against
 * torusSize[i]-1 instead so a torus link at the partition edge is
 * actually considered.
 */
static unsigned procManhattanDistance(unsigned *aggCoords, unsigned *bridgeCoords) {

    unsigned totalDistance = 0;
    int i;
    for (i = 0; i < MPIX_TORUS_MAX_DIMS; i++) {
        unsigned dimDistance = abs((int)aggCoords[i] - (int)bridgeCoords[i]);
        if (dimDistance > 0) { // could torus make it closer?
            if (dimTorus[i]) {
                if (aggCoords[i] == (torusSize[i]-1)) { // is wrap-around closer
                    if ((bridgeCoords[i]+1) < dimDistance) // assume will use torus link
                        dimDistance = bridgeCoords[i]+1;
                }
                else if (bridgeCoords[i] == (torusSize[i]-1)) { // is wrap-around closer
                    if ((aggCoords[i]+1) < dimDistance) // assume will use torus link
                        dimDistance = aggCoords[i]+1;
                }
            }
        } /* else: dimDistance == 0, meaning aggCoords[i] and bridgeCoords[i] are
             the same and there's no closer point to pick */
        totalDistance += dimDistance;
    }
    return totalDistance;
}
/*
 * Populate proc (on every rank) and conf (consumed on rank 0 only) with
 * Blue Gene/Q bridge-node topology for GPFS aggregator selection:
 * each rank finds the bridge node its I/O is routed through, one rank
 * per bridge node is elected "bridge rank" via an allgather+sort, and
 * rank 0 sizes the aggregator set from the smallest pset.
 *
 * Parameters: size/rank describe comm; n_aggrs <= 0 requests the
 * gpfsmpio_bg_nagg_pset default.
 */
void
ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
                       ADIOI_BG_ProcInfo_t *proc,
                       int size, int rank, int n_aggrs, MPI_Comm comm)
{
    int i, iambridge=0, bridgerank = -1, bridgeIndex;
    int countPset;
    sortstruct *bridges;
    int commsize;

    TRACE_ERR("Entering BG_persInfo_init, size: %d, rank: %d, n_aggrs: %d, comm: %d\n", size, rank, n_aggrs, (int)comm);

    Personality_t pers;
    MPIX_Hardware_t hw;
    MPIX_Hardware(&hw);
    TRACE_ERR("BG_persInfo_init, my coords{%u,%u,%u,%u,%u} rankInPset %u,sizeOfPset %u,idOfPset %u\n",hw.Coords[0],hw.Coords[1],hw.Coords[2],hw.Coords[3],hw.Coords[4],hw.rankInPset,hw.sizeOfPset,hw.idOfPset);

    Kernel_GetPersonality(&pers, sizeof(pers));

    proc->rank = rank;
    proc->coreID = hw.coreID;

    if (gpfsmpio_bridgeringagg > 0) {
#ifdef bridgeringaggtrace
        if (rank == 0)
            fprintf(stderr,"Block dimensions:\n");
#endif

        /* Set the numNodesInPartition and nodeRank for this proc:
         * nodeRank linearizes the 5-D torus coordinates (dimension 0
         * fastest-varying), and also captures the per-dimension sizes
         * and torus flags into the file-scope torusSize/dimTorus used
         * by procManhattanDistance(). */
        proc->numNodesInPartition = 1;
        proc->nodeRank = 0;
        for (i=0;i<MPIX_TORUS_MAX_DIMS;i++) {
            torusSize[i] = hw.Size[i];
            dimTorus[i] = hw.isTorus[i];
            proc->numNodesInPartition *= hw.Size[i];
            int baseNum = 1, j;
            for (j=0;j<i;j++)
                baseNum *= hw.Size[j];
            proc->nodeRank += (hw.Coords[i] * baseNum);
#ifdef bridgeringaggtrace
            if (rank == 0)
                fprintf(stderr,"Dimension %d has %d elements wrap-around value is %d\n",i,torusSize[i],dimTorus[i]);
#endif
        }
    }

    MPI_Comm_size(comm, &commsize);
    proc->ionID = MPIX_IO_node_id ();

    /* Trivial case: a single rank is its own bridge and sole aggregator. */
    if(size == 1)
    {
        proc->iamBridge = 1;
        proc->bridgeRank = rank;
        if (gpfsmpio_bridgeringagg > 0) {
            proc->manhattanDistanceToBridge = 0;
        }

        /* Set up the other parameters */
        proc->myIOSize = size;
        proc->ioNodeIndex = 0;
        conf->ioMinSize = size;
        conf->ioMaxSize = size;
        conf->numBridgeRanks = 1;
        conf->nProcs = size;
        conf->cpuIDsize = hw.ppn;
        /*conf->virtualPsetSize = conf->ioMaxSize * conf->cpuIDsize;*/
        conf->nAggrs = 1;
        conf->aggRatio = 1. * conf->nAggrs / conf->ioMinSize /*virtualPsetSize*/;
        if(conf->aggRatio > 1) conf->aggRatio = 1.;
        TRACE_ERR("I am (single) Bridge rank\n");
        return;
    }

    /* Find the nearest bridge node coords.  We don't know the
       rank in our comm so we will collective find/pick a bridge
       rank later.
     */
    /* Pack the 5 bridge coordinates into one int32 (6 bits each, E
       shifted by 2) so a single integer compare/sort identifies nodes
       sharing a bridge. */
    int32_t bridgeCoords;
    bridgeCoords = pers.Network_Config.cnBridge_A << 24 |
        pers.Network_Config.cnBridge_B << 18 |
        pers.Network_Config.cnBridge_C << 12 |
        pers.Network_Config.cnBridge_D << 6 |
        pers.Network_Config.cnBridge_E << 2;
    ADIOI_Assert((bridgeCoords >= 0)); /* A dim is < 6 bits or sorting won't work */

    if((hw.Coords[0] == pers.Network_Config.cnBridge_A) &&
       (hw.Coords[1] == pers.Network_Config.cnBridge_B) &&
       (hw.Coords[2] == pers.Network_Config.cnBridge_C) &&
       (hw.Coords[3] == pers.Network_Config.cnBridge_D) &&
       (hw.Coords[4] == pers.Network_Config.cnBridge_E)) {
        iambridge = 1; /* I am bridge */
        if (gpfsmpio_bridgeringagg > 0) {
            proc->manhattanDistanceToBridge = 0;
        }
    }
    else { // calculate manhattan distance to bridge if gpfsmpio_bridgeringagg is set
        if (gpfsmpio_bridgeringagg > 0) {
            unsigned aggCoords[MPIX_TORUS_MAX_DIMS],manhattanBridgeCoords[MPIX_TORUS_MAX_DIMS];
            aggCoords[0] = hw.Coords[0];
            manhattanBridgeCoords[0] = pers.Network_Config.cnBridge_A;
            aggCoords[1] = hw.Coords[1];
            manhattanBridgeCoords[1] = pers.Network_Config.cnBridge_B;
            aggCoords[2] = hw.Coords[2];
            manhattanBridgeCoords[2] = pers.Network_Config.cnBridge_C;
            aggCoords[3] = hw.Coords[3];
            manhattanBridgeCoords[3] = pers.Network_Config.cnBridge_D;
            aggCoords[4] = hw.Coords[4];
            manhattanBridgeCoords[4] = pers.Network_Config.cnBridge_E;

            proc->manhattanDistanceToBridge= procManhattanDistance(aggCoords, manhattanBridgeCoords);
#ifdef bridgeringaggtrace
            fprintf(stderr,"agg coords are %u %u %u %u %u bridge coords are %u %u %u %u %u distance is %u\n",aggCoords[0],aggCoords[1],aggCoords[2],aggCoords[3],aggCoords[4],manhattanBridgeCoords[0],manhattanBridgeCoords[1],manhattanBridgeCoords[2],manhattanBridgeCoords[3],manhattanBridgeCoords[4], proc->manhattanDistanceToBridge);
#endif
        }
    }

    TRACE_ERR("Bridge coords(%8.8X): %d %d %d %d %d, %d. iambridge %d\n",bridgeCoords, pers.Network_Config.cnBridge_A,pers.Network_Config.cnBridge_B,pers.Network_Config.cnBridge_C,pers.Network_Config.cnBridge_D,pers.Network_Config.cnBridge_E,0, iambridge);

    /* Allgather the ranks and bridgeCoords to determine the bridge
       rank and how many ranks belong to each bridge rank*/
    bridges = (sortstruct *) ADIOI_Malloc(sizeof(sortstruct) * size);

    /* We're going to sort this structure by bridgeCoord:

       typedef struct
       {
       int rank;
       int bridgeCoord;
       } sortstruct;

       and I want the rank that IS the bridge to sort first, so
       OR in '1' on non-bridge ranks that use a bridge coord.
       (bit 0 of the packed coords is unused -- E was shifted by 2.)
     */

    /* My input to the collective.  NOTE(review): relies on sortstruct
       being exactly two ints with no padding, matching the 2 x MPI_INT
       allgather below. */
    bridges[rank].rank = rank;
    bridges[rank].bridgeCoord = bridgeCoords;
    if(!iambridge)
        bridges[rank].bridgeCoord |= 1; /* I am not bridge, turn on bit */

    MPI_Allgather(MPI_IN_PLACE, 2, MPI_INT, bridges, 2, MPI_INT, comm);

    qsort(bridges, size, sizeof(sortstruct), intsort);

    /* Once the list is sorted walk through it to setup bridge
       info and find bridge ranks, etc. */

    int tempCoords, tempRank, mincompute, maxcompute;
    tempCoords = bridges[0].bridgeCoord & ~1;
    tempRank = bridges[0].rank;

    countPset=1;
    bridgeIndex = 0;
    mincompute = size+1;
    maxcompute = 1;

    /* Each run of equal (masked) bridgeCoords is one pset; the bridge
       rank itself sorts first within its run because its low bit is 0. */
    for(i=1; i<size; i++)
    {
        if((bridges[i].bridgeCoord & ~1) == tempCoords)
            countPset++; /* same bridge (pset), count it */
        else /* new bridge found */
        {
#ifdef TRACE_ON
            if(rank == 0)
                TRACE_ERR("Bridge set %u, bridge rank %d (%#8.8X) has %d ranks\n",
                          bridgeIndex, tempRank, tempCoords, countPset);
#endif
            if(countPset > maxcompute)
                maxcompute = countPset;
            if(countPset < mincompute)
                mincompute = countPset;

            /* Was this my bridge we finished? */
            if(tempCoords == bridgeCoords)
            {
                /* Am I the bridge rank? */
                if(tempRank == rank)
                    iambridge = 1;
                else
                    iambridge = 0; /* Another rank on my node may have taken over */
                TRACE_ERR("Rank %u, bridge set %u, bridge rank %d (%#8.8X) has %d ranks, iambridge %u\n",
                          rank, bridgeIndex, tempRank, tempCoords, countPset,iambridge);
                bridgerank = tempRank;
                proc->myIOSize = countPset;
                proc->ioNodeIndex = bridgeIndex;
            }
            /* Setup next bridge */
            tempCoords = bridges[i].bridgeCoord & ~1;
            tempRank = bridges[i].rank;
            bridgeIndex++;
            countPset = 1;
        }
    }

    /* Process last bridge (the loop above only closes out a run when a
       new one starts). */

#ifdef TRACE_ON
    if(rank == 0)
        TRACE_ERR("Bridge set %u, bridge rank %d (%#8.8X) has %d ranks\n",
                  bridgeIndex, tempRank, tempCoords, countPset);
#endif
    if(countPset > maxcompute)
        maxcompute = countPset;
    if(countPset < mincompute)
        mincompute = countPset;

    /* Was this my bridge? */
    if(tempCoords == bridgeCoords)
    {
        /* Am I the bridge rank? */
        if(tempRank == rank)
            iambridge = 1;
        else
            iambridge = 0; /* Another rank on my node may have taken over */
        bridgerank = tempRank;
        proc->myIOSize = countPset;
        proc->ioNodeIndex = bridgeIndex;
    }

    if(rank == 0)
    {
        /* Only rank 0 has a conf structure, fill in stuff as appropriate */
        conf->ioMinSize = mincompute;
        conf->ioMaxSize = maxcompute; /* equivalent to pset size */
        conf->numBridgeRanks = bridgeIndex+1;
        conf->nProcs = size;
        conf->cpuIDsize = hw.ppn;
        /*conf->virtualPsetSize = maxcompute * conf->cpuIDsize;*/

        conf->nAggrs = n_aggrs;
        /* First pass gets nAggrs = -1 */
        if(conf->nAggrs <=0)
            conf->nAggrs = gpfsmpio_bg_nagg_pset;
        if(conf->ioMinSize <= conf->nAggrs)
            conf->nAggrs = ADIOI_MAX(1,conf->ioMinSize-1); /* not including bridge itself */
/*      if(conf->nAggrs > conf->numBridgeRanks)
            conf->nAggrs = conf->numBridgeRanks;
*/
        conf->aggRatio = 1. * conf->nAggrs / conf->ioMinSize /*virtualPsetSize*/;
/*      if(conf->aggRatio > 1) conf->aggRatio = 1.; */
        TRACE_ERR("n_aggrs %zd, conf->nProcs %zu, conf->ioMaxSize %zu, ADIOI_BG_NAGG_PSET_DFLT %zu,conf->numBridgeRanks %zu,conf->nAggrs %zu\n",(size_t)n_aggrs, (size_t)conf->nProcs, (size_t)conf->ioMaxSize, (size_t)ADIOI_BG_NAGG_PSET_DFLT,(size_t)conf->numBridgeRanks,(size_t)conf->nAggrs);
        TRACE_ERR("Maximum ranks under a bridge rank: %d, minimum: %d, nAggrs: %d, numBridgeRanks: %d pset dflt: %d naggrs: %d ratio: %f\n", maxcompute, mincompute, conf->nAggrs, conf->numBridgeRanks, ADIOI_BG_NAGG_PSET_DFLT, conf->nAggrs, conf->aggRatio);
    }

    ADIOI_Assert((bridgerank != -1));
    proc->bridgeRank = bridgerank;
    proc->iamBridge = iambridge;
    TRACE_ERR("Rank %d has bridge set index %d (bridge rank: %d) with %d other ranks, ioNodeIndex: %d\n", rank, proc->ioNodeIndex, bridgerank, proc->myIOSize, proc->ioNodeIndex);

    ADIOI_Free(bridges);
}
/* Release the conf/proc pair built by ADIOI_BG_persInfo_init; the
 * underlying helpers each tolerate NULL, and the two frees are
 * independent of one another. */
void
ADIOI_BG_persInfo_free( ADIOI_BG_ConfInfo_t *conf, ADIOI_BG_ProcInfo_t *proc )
{
    ADIOI_BG_ProcInfo_free( proc );
    ADIOI_BG_ConfInfo_free( conf );
}

Просмотреть файл

@ -0,0 +1,83 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_pset.h
* \brief ???
*/
/* File: ad_bg_pset.h
*
* Defines two structures that keep BlueGene PSET specific information and their public interfaces:
* . ADIOI_BG_ProcInfo_t object keeps specific information to each process
* . ADIOI_BG_ConfInfo_t object keeps general information for the whole communicator, only kept
* on process 0.
*/
#ifndef AD_BG_PSET_H_
#define AD_BG_PSET_H_

#ifdef HAVE_MPIX_H
#include <mpix.h>
#endif

/* Keeps specific information to each process, will be exchanged among processes */
typedef struct {
    int ioNodeIndex;      /* similar to psetNum on BGL/BGP */
    int rank;             /* my rank */
    int ionID;            /* ion id this cn is using */
/*   int myCoords[5]; */
    int bridgeRank;       /* my bridge node (or proxy) rank */
    unsigned char coreID;
    unsigned char threadID; /* unlikely to be useful but better than just padding */
    unsigned char __cpad[2];
    int myIOSize;         /* number of ranks sharing my bridge/IO
                             node, i.e. psetsize*/
    int iamBridge;        /* am *I* the bridge rank? */
    int __ipad[2];
    unsigned nodeRank;    /* torus coords converted to an integer for use with gpfsmpio_bridgeringagg */
    unsigned numNodesInPartition; /* number of physical nodes in the job partition */
    unsigned manhattanDistanceToBridge; /* number of hops between this rank and the bridge node */
} ADIOI_BG_ProcInfo_t __attribute__((aligned(16)));

/* Keeps general information for the whole communicator, only on process 0 */
typedef struct {
    int ioMinSize;        /* Smallest number of ranks sharing 1 bridge node */
    int ioMaxSize;        /* Largest number of ranks sharing 1 bridge node */
                          /* ioMaxSize will be the "psetsize" */
    int nAggrs;
    int numBridgeRanks;
/*int virtualPsetSize;       ppn * pset size */
    int nProcs;
    int cpuIDsize;        /* num ppn */
    float aggRatio;       /* nAggrs / ioMinSize */
} ADIOI_BG_ConfInfo_t __attribute__((aligned(16)));

#undef MIN
#define MIN(a,b) (((a)<(b) ? (a) : (b)))

/* public funcs for ADIOI_BG_ProcInfo_t objects */
ADIOI_BG_ProcInfo_t * ADIOI_BG_ProcInfo_new();
ADIOI_BG_ProcInfo_t * ADIOI_BG_ProcInfo_new_n( int n );
void ADIOI_BG_ProcInfo_free( ADIOI_BG_ProcInfo_t *info );

/* public funcs for ADIOI_BG_ConfInfo_t objects */
ADIOI_BG_ConfInfo_t * ADIOI_BG_ConfInfo_new ();
void ADIOI_BG_ConfInfo_free( ADIOI_BG_ConfInfo_t *info );

/* public funcs for a pair of ADIOI_BG_ConfInfo_t and ADIOI_BG_ProcInfo_t objects */
void ADIOI_BG_persInfo_init( ADIOI_BG_ConfInfo_t *conf,
                             ADIOI_BG_ProcInfo_t *proc,
                             int s, int r, int n_aggrs, MPI_Comm comm);
void ADIOI_BG_persInfo_free( ADIOI_BG_ConfInfo_t *conf,
                             ADIOI_BG_ProcInfo_t *proc );

#endif /* AD_BG_PSET_H_ */

Просмотреть файл

@ -0,0 +1,16 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2012 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
## Compile the PE (IBM Parallel Environment) aggregator-selection code
## into the GPFS ADIO driver only when configure enabled BUILD_AD_PE.
if BUILD_AD_PE

noinst_HEADERS += \
    adio/ad_gpfs/pe/ad_pe_aggrs.h

romio_other_sources += \
    adio/ad_gpfs/pe/ad_pe_aggrs.c

endif BUILD_AD_PE

Просмотреть файл

@ -0,0 +1,276 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_pe_aggrs.c
* \brief The externally used function from this file is is declared in ad_pe_aggrs.h
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997-2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*#define TRACE_ON */
#include "adio.h"
#include "adio_cb_config_list.h"
#include "../ad_gpfs.h"
#include "ad_pe_aggrs.h"
#include "mpiimpl.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
#ifndef TRACE_ERR
# define TRACE_ERR(format...)
#endif
/*
* Compute the aggregator-related parameters that are required in 2-phase
* collective IO of ADIO.
* The parameters are
* . the number of aggregators (proxies) : fd->hints->cb_nodes
* . the ranks of the aggregators : fd->hints->ranklist
* If MP_IONODEFILE is defined, POE determines all tasks on every node listed
* in the node file and defines MP_IOTASKLIST with them, making them all
* aggregators. Alternatively, the user can explictly set MP_IOTASKLIST
* themselves. The format of the MP_IOTASKLIST is a colon-delimited list of
* task ids, the first entry being the total number of aggregators, for example
* to specify 4 aggregators on task ids 0,8,16,24 the value would be:
* 4:0:8:16:24. If there is no MP_IONODEFILE, or MP_IOTASKLIST, then the
* default aggregator selection is 1 task per node for every node of the job -
* additionally, an environment variable MP_IOAGGR_CNT can be specified, which
* defines the total number of aggregators, spread evenly across all the nodes.
* The romio_cb_nodes and romio_cb_config_list hint user settings are ignored.
*/
/*
 * Build fd->hints->ranklist / fd->hints->cb_nodes from the PE
 * environment: MP_IOTASKLIST ("<count>:<id>:<id>:...") when set and
 * valid, otherwise MP_IOAGGR_CNT ("ALL", a positive count, or anything
 * else meaning 1 aggregator per node) spread round-robin across nodes,
 * otherwise the first rank on each node.  MP_I_SHOW_AGGRS makes rank 0
 * print the resulting list.  Always returns 0; aborts on intercomms.
 */
int
ADIOI_PE_gen_agg_ranklist(ADIO_File fd)
{
    int numAggs = 0;
    char *ioTaskList = getenv( "MP_IOTASKLIST" );
    char *ioAggrCount = getenv("MP_IOAGGR_CNT");
    int i,j;
    int inTERcommFlag = 0;

    int myRank,commSize;
    MPI_Comm_rank(fd->comm, &myRank);
    MPI_Comm_size(fd->comm, &commSize);

    MPI_Comm_test_inter(fd->comm, &inTERcommFlag);
    if (inTERcommFlag) {
        FPRINTF(stderr,"ERROR: ATTENTION: inTERcomms are not supported in MPI-IO - aborting....\n");
        perror("ADIOI_PE_gen_agg_ranklist:");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    if (ioTaskList) {
        int ioTaskListLen = strlen(ioTaskList);
        int ioTaskListPos = 0;
        char tmpBuf[8];   /* Big enough for 1M tasks (7 digits task ID). */
        tmpBuf[7] = '\0';
        /* Parse the leading "<count>:" token. */
        for (i=0; i<7; i++) {
            tmpBuf[i] = *ioTaskList++;      /* Maximum is 7 digits for 1 million. */
            ioTaskListPos++;
            if (*ioTaskList == ':') {       /* If the next char is a ':' ends it. */
                tmpBuf[i+1] = '\0';
                break;
            }
        }
        numAggs = atoi(tmpBuf);
        if (numAggs == 0)
            FPRINTF(stderr,"ERROR: ATTENTION: Number of aggregators specified in MP_IOTASKLIST set at 0 - default aggregator selection will be used.\n");
        else if (!((numAggs > 0 ) && (numAggs <= commSize))) {
            FPRINTF(stderr,"ERROR: ATTENTION: The number of aggregators (%s) specified in MP_IOTASKLIST is outside the communicator task range of %d.\n",tmpBuf,commSize);
            numAggs = commSize;
        }
        fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int));

        /* Parse each ":<id>" token; ignore non-numeric, out-of-range and
           duplicate ids. */
        int aggIndex = 0;
        while (aggIndex < numAggs) {
            ioTaskList++;                /* Advance past the ':' */
            ioTaskListPos++;
            int allDigits=1;
            for (i=0; i<7; i++) {
                if (*ioTaskList < '0' || *ioTaskList > '9')
                    allDigits=0;
                tmpBuf[i] = *ioTaskList++;
                ioTaskListPos++;
                if ( (*ioTaskList == ':') || (*ioTaskList == '\0') ) {
                    tmpBuf[i+1] = '\0';
                    break;
                }
            }
            if (allDigits) {
                int newAggRank = atoi(tmpBuf);
                if (!((newAggRank >= 0 ) && (newAggRank < commSize))) {
                    FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not within the communicator task range of 0 to %d - it will be ignored.\n",tmpBuf,commSize-1);
                }
                else {
                    int aggAlreadyAdded = 0;
                    for (i=0;i<aggIndex;i++)
                        if (fd->hints->ranklist[i] == newAggRank) {
                            aggAlreadyAdded = 1;
                            break;
                        }
                    if (!aggAlreadyAdded)
                        fd->hints->ranklist[aggIndex++] = newAggRank;
                    else
                        FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%d' is specified multiple times in MP_IOTASKLIST - duplicates are ignored.\n",newAggRank);
                }
            }
            else {
                FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not a valid integer task id - it will be ignored.\n",tmpBuf);
            }
            /* At the end check whether the list is shorter than specified. */
            if (ioTaskListPos == ioTaskListLen) {
                if (aggIndex == 0) {
                    FPRINTF(stderr,"ERROR: ATTENTION: No aggregators were correctly specified in MP_IOTASKLIST - default aggregator selection will be used.\n");
                    ADIOI_Free(fd->hints->ranklist);
                }
                else if (aggIndex < numAggs)
                    FPRINTF(stderr,"ERROR: ATTENTION: %d aggregators were specified in MP_IOTASKLIST but only %d were correctly specified - setting the number of aggregators to %d.\n",numAggs, aggIndex,aggIndex);
                numAggs = aggIndex;
            }
        }
    }
    /* No (usable) MP_IOTASKLIST: derive the list from node topology. */
    if (numAggs == 0) {
        MPID_Comm *mpidCommData;

        MPID_Comm_get_ptr(fd->comm,mpidCommData);
        int localSize = mpidCommData->local_size;

        // get my node rank
        int myNodeRank = mpidCommData->intranode_table[mpidCommData->rank];

        /* NOTE(review): allNodeRanks is sized localSize but indexed by
           myRank -- assumes local_size == commSize here; confirm. */
        int *allNodeRanks = (int *) ADIOI_Malloc (localSize * sizeof(int));
        allNodeRanks[myRank] = myNodeRank;
        MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, allNodeRanks, 1, MPI_INT, fd->comm);
#ifdef AGG_DEBUG
        printf("MPID_Comm data: local_size is %d\nintranode_table entries:\n",mpidCommData->local_size);
        for (i=0;i<localSize;i++) {
            printf("%d ",mpidCommData->intranode_table[i]);
        }
        printf("\ninternode_table entries:\n");
        for (i=0;i<localSize;i++) {
            printf("%d ",mpidCommData->internode_table[i]);
        }
        printf("\n");
        printf("\nallNodeRanks entries:\n");
        for (i=0;i<localSize;i++) {
            printf("%d ",allNodeRanks[i]);
        }
        printf("\n");
#endif
        if (ioAggrCount) {
            int cntType = -1;

            if ( strcasecmp(ioAggrCount, "ALL") ) {
                if ( (cntType = atoi(ioAggrCount)) <= 0 ) {
                    /* Input is other non-digit or less than 1 then assume */
                    /* 1 aggregator per node.  Note: atoi(-1) returns -1.  */
                    /* No warning message given here -- done earlier.      */
                    cntType = -1;
                }
            }
            else {
                /* ALL is specified set aggr count to localSize */
                cntType = -2;
            }
            switch(cntType) {
            case -1:
                /* 1 aggr/node case */
                {
                    int rankListIndex = 0;
                    fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
                    for (i=0;i<localSize;i++) {
                        if (allNodeRanks[i] == 0) {
                            fd->hints->ranklist[rankListIndex++] = i;
                            numAggs++;
                        }
                    }
                }
                break;
            case -2:
                /* ALL tasks case */
                fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
                for (i=0;i<localSize;i++) {
                    fd->hints->ranklist[i] = i;
                    numAggs++;
                }
                break;
            default:
                /* Specific aggr count case -- MUST be less than localSize, otherwise set to localSize */
                if (cntType > localSize)
                    cntType = localSize;
                numAggs = cntType;
                // Round-robin thru allNodeRanks - pick the 0's, then the 1's, etc
                int currentNodeRank = 0;  // node rank currently being selected as aggregator
                int rankListIndex = 0;
                int currentAllNodeIndex = 0;

                fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int));

                while (rankListIndex < numAggs) {
                    int foundEntry = 0;
                    while (!foundEntry && (currentAllNodeIndex < localSize)) {
                        if (allNodeRanks[currentAllNodeIndex] == currentNodeRank) {
                            fd->hints->ranklist[rankListIndex++] = currentAllNodeIndex;
                            foundEntry = 1;
                        }
                        currentAllNodeIndex++;
                    }
                    if (!foundEntry) {
                        currentNodeRank++;
                        currentAllNodeIndex = 0;
                    }
                } // while
                break;
            } // switch(cntType)
        } // if (ioAggrCount)

        else { // default is 1 aggregator per node
            // take the 0 entries from allNodeRanks
            int rankListIndex = 0;
            fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
            for (i=0;i<localSize;i++) {
                if (allNodeRanks[i] == 0) {
                    fd->hints->ranklist[rankListIndex++] = i;
                    numAggs++;
                }
            }
        }
        ADIOI_Free(allNodeRanks);
    }
    if ( getenv("MP_I_SHOW_AGGRS") ) {
        if (myRank == 0) {
            printf("Agg rank list of %d generated:\n", numAggs);
            for (i=0;i<numAggs;i++) {
                printf("%d ",fd->hints->ranklist[i]);
            }
            printf("\n");
        }
    }
    fd->hints->cb_nodes = numAggs;

    return 0;
}

Просмотреть файл

@ -0,0 +1,30 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_pe_aggrs.h
* \brief ???
*/
/*
*
* Declares functions specific for the PE platform within the GPFS
* parallel I/O solution. For now simply processes the MP_IOTASKLIST
* env var.
*
*/
#ifndef AD_PE_AGGRS_H_
#define AD_PE_AGGRS_H_

#include "adio.h"
#include <sys/stat.h>

/* statfs f_type magic identifying a GPFS mount ("GPFS" in ASCII);
   provide it ourselves when the system headers do not. */
#if !defined(GPFS_SUPER_MAGIC)
#define GPFS_SUPER_MAGIC (0x47504653)
#endif

/* generate a list of I/O aggregators following a methodology specific for PE */
int ADIOI_PE_gen_agg_ranklist(ADIO_File fd);

#endif /* AD_PE_AGGRS_H_ */

Просмотреть файл

@ -0,0 +1,27 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
## Compile the GridFTP ADIO driver only when configure enabled the
## BUILD_AD_GRIDFTP conditional (requires the Globus FTP client libs).
if BUILD_AD_GRIDFTP

noinst_HEADERS += adio/ad_gridftp/ad_gridftp.h

romio_other_sources += \
    adio/ad_gridftp/ad_gridftp_close.c \
    adio/ad_gridftp/ad_gridftp_open.c \
    adio/ad_gridftp/ad_gridftp_read.c \
    adio/ad_gridftp/ad_gridftp_write.c \
    adio/ad_gridftp/ad_gridftp_fcntl.c \
    adio/ad_gridftp/ad_gridftp_flush.c \
    adio/ad_gridftp/ad_gridftp_resize.c \
    adio/ad_gridftp/ad_gridftp_hints.c \
    adio/ad_gridftp/ad_gridftp_delete.c \
    adio/ad_gridftp/ad_gridftp.c \
    adio/ad_gridftp/globus_routines.c \
    adio/ad_gridftp/ad_gridftp_features.c

endif BUILD_AD_GRIDFTP

Просмотреть файл

@ -0,0 +1,37 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
/* Dispatch table for the GridFTP ADIO driver.  The initializer is
 * positional, so the entry order must match ADIOI_Fns_struct exactly;
 * nonblocking and done/complete slots fall back to the ADIOI_FAKE_*
 * (blocking-emulation) helpers. */
struct ADIOI_Fns_struct ADIO_GRIDFTP_operations = {
    ADIOI_GRIDFTP_Open, /* Open */
    ADIOI_GRIDFTP_ReadContig, /* ReadContig */
    ADIOI_GRIDFTP_WriteContig, /* WriteContig */
    ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
    ADIOI_GEN_SeekIndividual, /* SeekIndividual */
    ADIOI_GRIDFTP_Fcntl, /* Fcntl */
    ADIOI_GRIDFTP_SetInfo, /* SetInfo */
    ADIOI_GRIDFTP_ReadStrided, /* ReadStrided */
    ADIOI_GRIDFTP_WriteStrided, /* WriteStrided */
    ADIOI_GRIDFTP_Close, /* Close */
    ADIOI_FAKE_IreadContig, /* IreadContig */
    ADIOI_FAKE_IwriteContig, /* IwriteContig */
    ADIOI_FAKE_IODone, /* ReadDone */
    ADIOI_FAKE_IODone, /* WriteDone */
    ADIOI_FAKE_IOComplete, /* ReadComplete */
    ADIOI_FAKE_IOComplete, /* WriteComplete */
    ADIOI_FAKE_IreadStrided, /* IreadStrided */
    ADIOI_FAKE_IwriteStrided, /* IwriteStrided */
    ADIOI_GRIDFTP_Flush, /* Flush */
    ADIOI_GRIDFTP_Resize, /* Resize */
    ADIOI_GRIDFTP_Delete, /* Delete */
    ADIOI_GRIDFTP_Feature, /* Features */
};

Просмотреть файл

@ -0,0 +1,96 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_GRIDFTP_INCLUDE
#define AD_GRIDFTP_INCLUDE

#include <unistd.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <fcntl.h>
#include "adio.h"
#include <globus_ftp_client.h>

/* These live in globus_routines.c */
extern int num_gridftp_handles;
#ifndef ADIO_GRIDFTP_HANDLES_MAX
#define ADIO_GRIDFTP_HANDLES_MAX 200
#endif /* ! ADIO_GRIDFTP_HANDLES_MAX */
/* Per-open-file globus client handles and operation attributes,
   indexed by fd->fd_sys. */
extern globus_ftp_client_handle_t gridftp_fh[ADIO_GRIDFTP_HANDLES_MAX];
extern globus_ftp_client_operationattr_t oattr[ADIO_GRIDFTP_HANDLES_MAX];

/* TODO: weed out the now-unused prototypes (the dispatch table in
   ad_gridftp.c wires the nonblocking slots to the ADIOI_FAKE_*
   fallbacks, not to the ADIOI_GRIDFTP_I* declarations below). */
void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code);
void ADIOI_GRIDFTP_Close(ADIO_File fd, int *error_code);
void ADIOI_GRIDFTP_ReadContig(ADIO_File fd, void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Status *status, int
                              *error_code);
void ADIOI_GRIDFTP_WriteContig(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status, int
                               *error_code);
void ADIOI_GRIDFTP_IwriteContig(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, ADIO_Request *request, int
                                *error_code);
void ADIOI_GRIDFTP_IreadContig(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Request *request, int
                               *error_code);
int ADIOI_GRIDFTP_ReadDone(ADIO_Request *request, ADIO_Status *status, int
                           *error_code);
int ADIOI_GRIDFTP_WriteDone(ADIO_Request *request, ADIO_Status *status, int
                            *error_code);
void ADIOI_GRIDFTP_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
                                *error_code);
void ADIOI_GRIDFTP_WriteComplete(ADIO_Request *request, ADIO_Status *status,
                                 int *error_code);
void ADIOI_GRIDFTP_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
                         int *error_code);
void ADIOI_GRIDFTP_WriteStrided(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, ADIO_Status *status,
                                int *error_code);
void ADIOI_GRIDFTP_ReadStrided(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status, int
                               *error_code);
void ADIOI_GRIDFTP_WriteStridedColl(ADIO_File fd, void *buf, int count,
                                    MPI_Datatype datatype, int file_ptr_type,
                                    ADIO_Offset offset, ADIO_Status *status, int
                                    *error_code);
void ADIOI_GRIDFTP_ReadStridedColl(ADIO_File fd, void *buf, int count,
                                   MPI_Datatype datatype, int file_ptr_type,
                                   ADIO_Offset offset, ADIO_Status *status, int
                                   *error_code);
void ADIOI_GRIDFTP_IreadStrided(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, ADIO_Request *request, int
                                *error_code);
void ADIOI_GRIDFTP_IwriteStrided(ADIO_File fd, void *buf, int count,
                                 MPI_Datatype datatype, int file_ptr_type,
                                 ADIO_Offset offset, ADIO_Request *request, int
                                 *error_code);
void ADIOI_GRIDFTP_Flush(ADIO_File fd, int *error_code);
void ADIOI_GRIDFTP_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
void ADIOI_GRIDFTP_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
void ADIOI_GRIDFTP_Get_shared_fp(ADIO_File fd, int size,
                                 ADIO_Offset *shared_fp,
                                 int *error_code);
void ADIOI_GRIDFTP_Set_shared_fp(ADIO_File fd, ADIO_Offset offset,
                                 int *error_code);
void ADIOI_GRIDFTP_Delete(char *filename, int *error_code);
void globus_err_handler(const char *routine, const char *caller,
                        globus_result_t result);
#endif

Просмотреть файл

@ -0,0 +1,50 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
#include "adioi.h"
/*
 * Close a gridftp "file": destroy the per-file globus operation
 * attributes and client handle (both indexed by fd->fd_sys), then
 * invalidate the fd bookkeeping fields and decrement the open-handle
 * count.  On a globus failure *error_code is set to an MPI_ERR_IO
 * class code and the fd fields are left untouched.
 *
 * Change vs. original: removed the unused local `int err;`.
 */
void ADIOI_GRIDFTP_Close(ADIO_File fd, int *error_code)
{
    static char myname[]="ADIOI_GRIDFTP_Close";
    globus_result_t result;

    /* All ranks that opened the file rendezvous before teardown. */
    MPI_Barrier(fd->comm);

    /* Destroy the ftp handle and opattr */
    result = globus_ftp_client_operationattr_destroy(&(oattr[fd->fd_sys]));
    if (result != GLOBUS_SUCCESS )
	{
	    globus_err_handler("globus_ftp_client_operationattr_destroy",
			       myname,result);
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					       myname, __LINE__, MPI_ERR_IO,
					       "**io",
					       "**io %s",globus_object_printable_to_string(globus_error_get(result)));
	    return;
	}
    result=globus_ftp_client_handle_destroy(&(gridftp_fh[fd->fd_sys]));
    if (result != GLOBUS_SUCCESS )
	{
	    globus_err_handler("globus_ftp_client_handle_destroy",
			       myname,result);
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					       myname, __LINE__, MPI_ERR_IO,
					       "**io",
					       "**io %s", globus_object_printable_to_string(globus_error_get(result)));
	    return;
	}

    /* Mark the slot dead and reset file-pointer state. */
    fd->fd_sys = -1;
    fd->fp_ind=0;
    fd->fp_sys_posn=0;
    num_gridftp_handles--;
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,95 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
#include "adioi.h"
/* Synchronization state shared between ADIOI_GRIDFTP_Delete and its
 * completion callback. */
static globus_mutex_t lock;
static globus_cond_t cond;
static globus_bool_t delete_done, delete_success;

/*
 * Completion callback for globus_ftp_client_delete(): record success
 * and wake the waiter.
 *
 * BUGFIX: the original set the flags without holding the mutex and
 * never signalled the condition variable, so in a threaded globus
 * build the waiter in ADIOI_GRIDFTP_Delete could block forever.  Now
 * follows the same lock/set/signal/unlock pattern as fcntl_size_cb in
 * ad_gridftp_fcntl.c.
 */
static void delete_cb(void *myarg, globus_ftp_client_handle_t *handle, globus_object_t *error)
{
    if (error)
	{
	    FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
	}
    globus_mutex_lock(&lock);
    if (!error)
	delete_success=GLOBUS_TRUE;
    delete_done=GLOBUS_TRUE;
    globus_cond_signal(&cond);
    globus_mutex_unlock(&lock);
}
/*
 * Delete a file via the globus FTP client.  Creates a transient client
 * handle, issues an asynchronous delete, and waits on the shared
 * lock/cond pair until delete_cb reports completion.  Sets *error_code
 * to an MPI_ERR_IO class code on any failure.
 *
 * BUGFIX: the static `lock`/`cond` pair was waited on without ever
 * being initialized; they are now created with globus_mutex_init /
 * globus_cond_init and destroyed on the success path, mirroring what
 * ADIOI_GRIDFTP_Fcntl does for its fcntl_size_lock/cond pair.
 */
void ADIOI_GRIDFTP_Delete(char *filename, int *error_code)
{
    char myname[]="ADIOI_GRIDFTP_Delete";
    int myrank, nprocs;
    globus_ftp_client_handle_t handle;
    globus_result_t result;

    *error_code = MPI_SUCCESS;

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    globus_module_activate(GLOBUS_FTP_CLIENT_MODULE);
    globus_mutex_init(&lock, GLOBUS_NULL);
    globus_cond_init(&cond, GLOBUS_NULL);
    result=globus_ftp_client_handle_init(&handle,GLOBUS_NULL);
    if (result != GLOBUS_SUCCESS )
	{
	    globus_err_handler("globus_ftp_client_handle_init",myname,result);
	    *error_code= MPIO_Err_create_code(MPI_SUCCESS,
					      MPIR_ERR_RECOVERABLE,
					      myname, __LINE__,
					      MPI_ERR_IO,
					      "**io", "**io %s",
					      globus_object_printable_to_string(globus_error_get(result)));
	    return;
	}

    delete_done=GLOBUS_FALSE;
    delete_success=GLOBUS_FALSE;
    result=globus_ftp_client_delete(&handle,filename,GLOBUS_NULL,delete_cb,GLOBUS_NULL);
    if (result != GLOBUS_SUCCESS )
	{
	    globus_err_handler("globus_ftp_client_delete",myname,result);
	    *error_code= MPIO_Err_create_code(MPI_SUCCESS,
					      MPIR_ERR_RECOVERABLE,
					      myname, __LINE__,
					      MPI_ERR_IO,
					      "**io", "**io %s",
					      globus_object_printable_to_string(globus_error_get(result)));
	    return;
	}
    /* Wait for delete_cb; globus_cond_wait also drives the event loop
       in non-threaded globus builds. */
    globus_mutex_lock(&lock);
    while ( delete_done!=GLOBUS_TRUE )
	globus_cond_wait(&cond,&lock);
    globus_mutex_unlock(&lock);
    result=globus_ftp_client_handle_destroy(&handle);
    if (result != GLOBUS_SUCCESS )
	{
	    globus_err_handler("globus_ftp_client_handle_destroy",myname,result);
	    *error_code= MPIO_Err_create_code(MPI_SUCCESS,
					      MPIR_ERR_RECOVERABLE,
					      myname, __LINE__,
					      MPI_ERR_IO,
					      "**io", "**io %s",
					      globus_object_printable_to_string(globus_error_get(result)));
	    return;
	}
    globus_cond_destroy(&cond);
    globus_mutex_destroy(&lock);

    if ( delete_success!=GLOBUS_TRUE )
	{
	    /* NOTE: `result` is GLOBUS_SUCCESS here; the real cause was
	       already printed by delete_cb. */
	    *error_code= MPIO_Err_create_code(MPI_SUCCESS,
					      MPIR_ERR_RECOVERABLE,
					      myname, __LINE__,
					      MPI_ERR_IO,
					      "**io", "**io %s",
					      globus_object_printable_to_string(globus_error_get(result)));
	}
}

Просмотреть файл

@ -0,0 +1,91 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
#include "adioi.h"
#include "adio_extern.h"
/* Synchronization state shared between ADIOI_GRIDFTP_Fcntl's
 * GET_FSIZE path and its globus_ftp_client_size completion callback;
 * the lock/cond pair is (re)initialized per query in Fcntl. */
globus_mutex_t fcntl_size_lock;
globus_cond_t fcntl_size_cond;
globus_bool_t fcntl_size_done;

/* Completion callback: log any error, then mark done and wake the
 * waiter while holding the mutex. */
void fcntl_size_cb(void *myargs, globus_ftp_client_handle_t *handle,
		   globus_object_t *error)
{
    if (error)
	{
	    FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
	}
    globus_mutex_lock(&fcntl_size_lock);
    fcntl_size_done=GLOBUS_TRUE;
    globus_cond_signal(&fcntl_size_cond);
    globus_mutex_unlock(&fcntl_size_lock);
}
/*
 * ADIO fcntl for the gridftp driver.  GET_FSIZE asynchronously queries
 * the remote size via globus_ftp_client_size and waits on
 * fcntl_size_lock/cond; SET_DISKSPACE delegates to the generic
 * preallocation; everything else (including SET_ATOMICITY) is rejected
 * with an MPI_ERR_ARG **flag error.
 *
 * BUGFIX: the wait loop called
 *     globus_cond_wait(&fcntl_size_lock,&fcntl_size_cond);
 * with the mutex and condition variable swapped -- the globus API takes
 * (cond, mutex), as used correctly in ADIOI_GRIDFTP_Delete.  Also
 * removed eight unused locals (copy_etype, copy_filetype, combiner, i,
 * j, k, filetype_is_contig, err, flat_file).
 */
void ADIOI_GRIDFTP_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
			 int *error_code)
{
    char myname[]="ADIOI_GRIDFTP_Fcntl";
    int myrank, nprocs;

    *error_code = MPI_SUCCESS;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    switch(flag) {
    case ADIO_FCNTL_GET_FSIZE:
	{
	    globus_result_t result;
	    globus_off_t fsize=0;

	    globus_mutex_init(&fcntl_size_lock,GLOBUS_NULL);
	    globus_cond_init(&fcntl_size_cond,GLOBUS_NULL);
	    fcntl_size_done=GLOBUS_FALSE;
	    if ( (result=globus_ftp_client_size(&(gridftp_fh[fd->fd_sys]),
						fd->filename,
						&(oattr[fd->fd_sys]),
						&(fsize),
						fcntl_size_cb,
						GLOBUS_NULL))!=GLOBUS_SUCCESS )
		{
		    /* NOTE: lock/cond are not destroyed on this error
		       path (as in the original success-less flow). */
		    globus_err_handler("globus_ftp_client_size",myname,result);
		    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
						       MPIR_ERR_RECOVERABLE,
						       myname, __LINE__, MPI_ERR_IO,
						       "**io", "**io %s",
						       globus_object_printable_to_string(globus_error_get(result)));
		    return;
		}
	    globus_mutex_lock(&fcntl_size_lock);
	    while ( fcntl_size_done!=GLOBUS_TRUE )
		globus_cond_wait(&fcntl_size_cond,&fcntl_size_lock);
	    globus_mutex_unlock(&fcntl_size_lock);
	    globus_mutex_destroy(&fcntl_size_lock);
	    globus_cond_destroy(&fcntl_size_cond);
	    fcntl_struct->fsize=fsize;
	}
	*error_code = MPI_SUCCESS;
	break;

    case ADIO_FCNTL_SET_DISKSPACE:
	ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
	break;

    case ADIO_FCNTL_SET_ATOMICITY:
    default:
	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
					   MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   MPI_ERR_ARG,
					   "**flag", "**flag %d", flag);
    }
}

Просмотреть файл

@ -0,0 +1,18 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
/* Capability query for the gridftp ADIO driver.  None of the optional
   features (scalable open, shared file pointers, locking, sequential
   mode, data-sieving writes) are available over gridftp, so every
   query — known flag or not — answers 0. */
int ADIOI_GRIDFTP_Feature (ADIO_File fd, int flag)
{
    (void) fd;
    (void) flag;
    return 0;
}

Просмотреть файл

@ -0,0 +1,19 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
#include "adioi.h"
/* GridFTP doesn't give you a way to cache writes on the client side, so
is essentially a no-op */
/* if there is a mechanism where we can ask the server to flush data to disk we
* should do it here. I'll leave that up to Troy */
/* Flush for gridftp files.  The globus ftp client offers no client-side
   write cache to drain, so there is deliberately nothing to do here;
   *error_code is left untouched, exactly as in the original. */
void ADIOI_GRIDFTP_Flush(ADIO_File fd, int *error_code)
{
    (void) fd;
    (void) error_code;
}

Просмотреть файл

@ -0,0 +1,68 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
/*
Valid hints for ftp:// and gsiftp:// URLs (aside from the std. ones):
ftp_control_mode extended|block|stream|compressed
(default extended for gsiftp:// URLs and stream for ftp:// URLs)
parallelism integer number of simultaneous threads connecting to
ftp server (default 1)
striped_ftp true|false or enable|disable; enables gsiftp striped data transfer
tcp_buffer integer size of tcp stream buffers in bytes
transfer_type ascii or binary (default binary)
These *must* be specified at open time currently.
*/
#include "ad_gridftp.h"
#include "adioi.h"
/* Merge the user's MPI_Info hints into fd->info and then run the generic
   ROMIO hint processing.  The gridftp-specific hints documented above
   must be supplied at open time. */
void ADIOI_GRIDFTP_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    if (!(fd->info))
    {
        /* First call (part of open): start from the user's info object,
           or from an empty one when none was supplied. */
        if (users_info == MPI_INFO_NULL)
            MPI_Info_create(&(fd->info));
        else
            MPI_Info_dup(users_info, &(fd->info));
    }
    else if (users_info != MPI_INFO_NULL)
    {
        /* Subsequent calls: fold each user-supplied key into fd->info. */
        int k, nkeys, valuelen, flag;
        char key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL];

        MPI_Info_get_nkeys(users_info, &nkeys);
        for (k = 0; k < nkeys; k++)
        {
            MPI_Info_get_nthkey(users_info, k, key);
            MPI_Info_get_valuelen(users_info, key, &valuelen, &flag);
            if (!flag)
                continue;
            ADIOI_Info_get(users_info, key, valuelen, value, &flag);
            if (flag)
                ADIOI_Info_set(fd->info, key, value);
        }
    }

    /* let the generic ROMIO and MPI-I/O stuff happen... */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);
}

Просмотреть файл

@ -0,0 +1,343 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
#include "adioi.h"
/* Shared synchronization state for the open-time callbacks below. */
static globus_mutex_t lock;
static globus_cond_t cond;
static globus_bool_t file_exists,exists_done;

/* Completion callback for globus_ftp_client_exists(): no error means the
 * URL exists.  Wakes the thread blocked in ADIOI_GRIDFTP_Open().
 *
 * BUGFIX: the original set exists_done without holding the mutex and
 * never signalled the condition variable the opener waits on, so a
 * threaded Globus build could block forever in the exists wait loop.
 * Now uses the same lock/signal protocol as touch_ctl_cb().
 */
static void exists_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error)
{
    if (error)
    {
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    }
    else
    {
        file_exists=GLOBUS_TRUE;
    }
    globus_mutex_lock(&lock);
    exists_done=GLOBUS_TRUE;
    globus_cond_signal(&cond);
    globus_mutex_unlock(&lock);
}
static globus_bool_t touch_ctl_done;

/* Control callback for the zero-byte "touch" put issued when creating a
   file in ADIOI_GRIDFTP_Open(): flags completion and wakes the waiter. */
static void touch_ctl_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error)
{
    if (error)
    {
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    }

    globus_mutex_lock(&lock);
    touch_ctl_done = GLOBUS_TRUE;
    globus_cond_signal(&cond);
    globus_mutex_unlock(&lock);
}
/* Data callback for the zero-byte "touch" transfer in ADIOI_GRIDFTP_Open().
 *
 * CONSISTENCY FIX: re-register the buffer only while the transfer is
 * still in progress.  The original re-registered unconditionally, even
 * after eof, unlike the otherwise-identical resize_wrdata_cb() in
 * ad_gridftp_resize.c which guards with (!eof).
 */
static void touch_data_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error,
                          globus_byte_t *buffer, globus_size_t length, globus_off_t offset,
                          globus_bool_t eof)
{
    if (error)
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    if (!eof)
        globus_ftp_client_register_read(handle,buffer,length,touch_data_cb,myargs);
    return;
}
/* ADIOI_GRIDFTP_Open - open (and possibly create) a file over gridftp.
 *
 * Side effects: allocates the next slot of the module-global gridftp_fh[]
 * and oattr[] arrays (indexed by fd->fd_sys), initializes handle and
 * operation attributes from the open hints, checks on rank 0 whether the
 * URL exists, and — when creating — writes a single NUL byte to bring the
 * file into existence.  On failure fd->fd_sys is set to -1 and *error_code
 * carries an MPI error code.
 */
void ADIOI_GRIDFTP_Open(ADIO_File fd, int *error_code)
{
    static char myname[]="ADIOI_GRIDFTP_Open";
    int myrank, nprocs, keyfound;
    char hintval[MPI_MAX_INFO_VAL+1];
    globus_ftp_client_handleattr_t hattr;
    globus_result_t result;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* activate Globus ftp client module -- can be called multiple times, so
       it's safest to call once per file/connection */
    globus_module_activate(GLOBUS_FTP_CLIENT_MODULE);
    /* fd_sys is an index into the driver's gridftp_fh[]/oattr[] arrays,
       not an OS file descriptor */
    fd->fd_sys = num_gridftp_handles;
    /* No shared file pointers for now */
    fd->shared_fp_fname = NULL;
    *error_code = MPI_SUCCESS;

    /* Access modes here mean something very different here than they
       would on a "real" filesystem...  As a result, the amode and hint
       processing here is intermingled and a little weird because many
       of them have to do with the connection rather than the file itself.
       The thing that sucks about this is that read and write ops will
       have to check themselves if the file is being accessed rdonly, rdwr,
       or wronly. */
    result=globus_ftp_client_handleattr_init(&hattr);
    if ( result != GLOBUS_SUCCESS )
    {
        globus_err_handler("globus_ftp_client_handleattr_init",
                           myname,result);
        fd->fd_sys = -1;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", globus_object_printable_to_string(globus_error_get(result)));
        return;
    }
    result = globus_ftp_client_operationattr_init(&(oattr[fd->fd_sys]));
    if ( result != GLOBUS_SUCCESS )
    {
        globus_err_handler("globus_ftp_client_operationattr_init",
                           myname,result);
        fd->fd_sys = -1;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", globus_object_printable_to_string(globus_error_get(result)));
        return;
    }

    /* Always use connection caching unless told otherwise */
    result=globus_ftp_client_handleattr_set_cache_all(&hattr,GLOBUS_TRUE);
    if ( result !=GLOBUS_SUCCESS )
        globus_err_handler("globus_ftp_client_handleattr_set_cache_all",myname,result);

    /* Assume that it's safe to cache a file if it's read-only */
    if ( (fd->access_mode&ADIO_RDONLY) &&
         (result=globus_ftp_client_handleattr_add_cached_url(&hattr,fd->filename))!=GLOBUS_SUCCESS )
        globus_err_handler("globus_ftp_client_handleattr_add_cached_url",myname,result);

    /* Since we're (almost by definition) doing things that FTP S (stream)
       control mode can't handle, default to E (extended block) control mode
       for gsiftp:// URLs.  ftp:// URLs use standard stream control mode
       by default.  This behavior can be overridden by the ftp_control_mode
       hint.
       NOTE(review): this defaulting code is disabled; control mode is only
       set via the ftp_control_mode hint below. */
    /*
    if ( !strncmp(fd->filename,"gsiftp:",7) &&
         (result=globus_ftp_client_operationattr_set_mode(&(oattr[fd->fd_sys]),GLOBUS_FTP_CONTROL_MODE_EXTENDED_BLOCK))!=GLOBUS_SUCCESS )
        globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result);
    else if ( !strncmp(fd->filename,"ftp:",4) &&
              (result=globus_ftp_client_operationattr_set_mode(&(oattr[fd->fd_sys]),GLOBUS_FTP_CONTROL_MODE_STREAM))!=GLOBUS_SUCCESS )
        globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result);
    */

    /* Set append mode if necessary */
    if ( (fd->access_mode&ADIO_APPEND) &&
         ((result=globus_ftp_client_operationattr_set_append(&(oattr[fd->fd_sys]),GLOBUS_TRUE))!=GLOBUS_SUCCESS) )
        globus_err_handler("globus_ftp_client_operationattr_set_append",myname,result);

    /* Other hint and amode processing that would affect hattr and/or
       oattr[] (eg. parallelism, striping, etc.) goes here */
    if ( fd->info!=MPI_INFO_NULL )
    {
        /* ftp_control_mode: extended|block|stream|compressed */
        ADIOI_Info_get(fd->info,"ftp_control_mode",MPI_MAX_INFO_VAL,hintval,&keyfound);
        if ( keyfound )
        {
            if ( ( !strcmp(hintval,"extended") || !strcmp(hintval,"extended_block") ) &&
                 (result=globus_ftp_client_operationattr_set_mode(&(oattr[fd->fd_sys]),GLOBUS_FTP_CONTROL_MODE_EXTENDED_BLOCK))!=GLOBUS_SUCCESS )
                globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result);
            else if ( !strcmp(hintval,"block") &&
                      (result=globus_ftp_client_operationattr_set_mode(&(oattr[fd->fd_sys]),GLOBUS_FTP_CONTROL_MODE_BLOCK))!=GLOBUS_SUCCESS )
                globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result);
            else if ( !strcmp(hintval,"compressed") &&
                      (result=globus_ftp_client_operationattr_set_mode(&(oattr[fd->fd_sys]),GLOBUS_FTP_CONTROL_MODE_COMPRESSED))!=GLOBUS_SUCCESS )
                globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result);
            else if ( !strcmp(hintval,"stream") &&
                      (result=globus_ftp_client_operationattr_set_mode(&(oattr[fd->fd_sys]),GLOBUS_FTP_CONTROL_MODE_STREAM))!=GLOBUS_SUCCESS )
                globus_err_handler("globus_ftp_client_operationattr_set_mode",myname,result);
        }

        /* parallelism: number of simultaneous ftp connections */
        ADIOI_Info_get(fd->info,"parallelism",MPI_MAX_INFO_VAL,hintval,&keyfound);
        if ( keyfound )
        {
            int nftpthreads;

            if ( sscanf(hintval,"%d",&nftpthreads)==1 )
            {
                globus_ftp_control_parallelism_t parallelism;

                parallelism.mode = GLOBUS_FTP_CONTROL_PARALLELISM_FIXED;
                parallelism.fixed.size = nftpthreads;
                if ( (result=globus_ftp_client_operationattr_set_parallelism(&(oattr[fd->fd_sys]),
                                                                             &parallelism))!=GLOBUS_SUCCESS )
                    globus_err_handler("globus_ftp_client_operationattr_set_parallelism",myname,result);
            }
        }

        /* striped_ftp + striping_factor: round-robin block layout */
        ADIOI_Info_get(fd->info,"striped_ftp",MPI_MAX_INFO_VAL,hintval,&keyfound);
        if ( keyfound )
        {
            /* if set to "true" or "enable", set up round-robin block layout */
            if ( !strncmp("true",hintval,4) || !strncmp("TRUE",hintval,4) ||
                 !strncmp("enable",hintval,4) || !strncmp("ENABLE",hintval,4) )
            {
                ADIOI_Info_get(fd->info,"striping_factor",MPI_MAX_INFO_VAL,hintval,&keyfound);
                if ( keyfound )
                {
                    int striping_factor;

                    if ( sscanf(hintval,"%d",&striping_factor)==1 )
                    {
                        globus_ftp_control_layout_t layout;

                        layout.mode = GLOBUS_FTP_CONTROL_STRIPING_BLOCKED_ROUND_ROBIN;
                        layout.round_robin.block_size = striping_factor;
                        if ( (result=globus_ftp_client_operationattr_set_layout(&(oattr[fd->fd_sys]),
                                                                                &layout))!=GLOBUS_SUCCESS )
                            globus_err_handler("globus_ftp_client_operationattr_set_layout",
                                               myname,result);
                    }
                }
            }
        }

        /* tcp_buffer: fixed tcp stream buffer size in bytes */
        ADIOI_Info_get(fd->info,"tcp_buffer",MPI_MAX_INFO_VAL,hintval,&keyfound);
        if ( keyfound )
        {
            /* set tcp buffer size */
            int buffer_size;

            if ( sscanf(hintval,"%d",&buffer_size)==1 )
            {
                globus_ftp_control_tcpbuffer_t tcpbuf;

                tcpbuf.mode = GLOBUS_FTP_CONTROL_TCPBUFFER_FIXED;
                tcpbuf.fixed.size = buffer_size;
                if ( (result=globus_ftp_client_operationattr_set_tcp_buffer(&(oattr[fd->fd_sys]),
                                                                            &tcpbuf))!=GLOBUS_SUCCESS )
                    globus_err_handler("globus_ftp_client_operationattr_set_tcp_buffer",myname,result);
            }
        }

        /* transfer_type: ascii or binary (anything not "ascii" is binary) */
        ADIOI_Info_get(fd->info,"transfer_type",MPI_MAX_INFO_VAL,hintval,&keyfound);
        if ( keyfound )
        {
            globus_ftp_control_type_t filetype;

            /* set transfer type (i.e. ASCII or binary) */
            if ( !strcmp("ascii",hintval) || !strcmp("ASCII",hintval) )
            {
                filetype=GLOBUS_FTP_CONTROL_TYPE_ASCII;
            }
            else
            {
                filetype=GLOBUS_FTP_CONTROL_TYPE_IMAGE;
            }
            if ( (result=globus_ftp_client_operationattr_set_type(&(oattr[fd->fd_sys]),filetype))!=GLOBUS_SUCCESS )
                globus_err_handler("globus_ftp_client_operationattr_set_type",myname,result);
        }
    }
    else
        FPRINTF(stderr,"no MPI_Info object associated with %s\n",fd->filename);

    /* Create the ftp handle */
    result=globus_ftp_client_handle_init(&(gridftp_fh[fd->fd_sys]),&hattr);
    if ( result != GLOBUS_SUCCESS )
    {
        globus_err_handler("globus_ftp_client_handle_init",myname,result);
        fd->fd_sys = -1;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", globus_object_printable_to_string(globus_error_get(result)));
        return;
    }

    /* Check for existence of the file; only rank 0 talks to the server,
       the result is broadcast below */
    globus_mutex_init(&lock, GLOBUS_NULL);
    globus_cond_init(&cond, GLOBUS_NULL);
    file_exists=GLOBUS_FALSE;
    exists_done=GLOBUS_FALSE;
    if ( myrank==0 )
    {
        if ( (result=globus_ftp_client_exists(&(gridftp_fh[fd->fd_sys]),
                                              fd->filename,
                                              &(oattr[fd->fd_sys]),
                                              exists_cb,
                                              GLOBUS_NULL))!=GLOBUS_SUCCESS )
        {
            globus_err_handler("globus_ftp_client_exists",myname,result);
            fd->fd_sys = -1;
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io", "**io %s",
                                               globus_object_printable_to_string(globus_error_get(result)));
            return;
        }
        /* wait till the callback completes */
        globus_mutex_lock(&lock);
        while ( exists_done!=GLOBUS_TRUE )
            globus_cond_wait(&cond,&lock);
        globus_mutex_unlock(&lock);
    }
    MPI_Barrier(fd->comm);
    /* NOTE(review): file_exists is a globus_bool_t broadcast as MPI_INT --
       assumes globus_bool_t is int-sized; confirm against globus_common.h */
    MPI_Bcast(&file_exists,1,MPI_INT,0,fd->comm);

    /* It turns out that this is handled by MPI_File_open() directly */
    if ( (file_exists!=GLOBUS_TRUE) && (fd->access_mode&ADIO_CREATE) &&
         !(fd->access_mode&ADIO_EXCL) && !(fd->access_mode&ADIO_RDONLY) )
    {
        if ( myrank==0 )
        {
            /* if the file doesn't exist, write a single NULL to it */
            globus_byte_t touchbuf=(globus_byte_t)'\0';

            touch_ctl_done=GLOBUS_FALSE;
            if ( (result=globus_ftp_client_put(&(gridftp_fh[fd->fd_sys]),
                                               fd->filename,
                                               &(oattr[fd->fd_sys]),
                                               GLOBUS_NULL,
                                               touch_ctl_cb,
                                               GLOBUS_NULL))!=GLOBUS_SUCCESS )
            {
                globus_err_handler("globus_ftp_client_put",myname,result);
                fd->fd_sys = -1;
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_IO,
                                                   "**io", "**io %s",
                                                   globus_object_printable_to_string(globus_error_get(result)));
                return;
            }
            /* register a zero-length write with eof set so the put completes */
            result=globus_ftp_client_register_write(&(gridftp_fh[fd->fd_sys]),
                                                    (globus_byte_t *)&touchbuf, 0,
                                                    (globus_off_t)0, GLOBUS_TRUE,
                                                    touch_data_cb, GLOBUS_NULL);
            if ( result != GLOBUS_SUCCESS )
            {
                globus_err_handler("globus_ftp_client_register_write",myname,result);
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_IO,
                                                   "**io", "**io %s",
                                                   globus_object_printable_to_string(globus_error_get(result)));
                return;
            }
            globus_mutex_lock(&lock);
            while ( touch_ctl_done!=GLOBUS_TRUE )
                globus_cond_wait(&cond,&lock);
            globus_mutex_unlock(&lock);
        }
        MPI_Barrier(fd->comm);
    }
    else if ( (fd->access_mode&ADIO_EXCL) && (file_exists==GLOBUS_TRUE) )
    {
        /* exclusive create of an existing file is an error */
        fd->fd_sys = -1;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", 0);
        return;
    }
    else if ( (fd->access_mode&ADIO_RDONLY) && (file_exists!=GLOBUS_TRUE) )
    {
        /* reads will presumably fail later; warn but do not error here */
        if ( myrank==0 )
        {
            FPRINTF(stderr,"WARNING:  read-only file %s does not exist!\n",fd->filename);
        }
    }
    num_gridftp_handles++;
}

Просмотреть файл

@ -0,0 +1,468 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
#include "adioi.h"
#include "adio_extern.h"
/* Synchronization state for the blocking wait in ADIOI_GRIDFTP_ReadContig. */
static globus_mutex_t readcontig_ctl_lock;
static globus_cond_t readcontig_ctl_cond;
static globus_bool_t readcontig_ctl_done;

/* Control (completion) callback for the partial get issued by
   ADIOI_GRIDFTP_ReadContig(): flags completion and wakes the waiter. */
static void readcontig_ctl_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error)
{
    if (error)
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));

    globus_mutex_lock(&readcontig_ctl_lock);
    if (readcontig_ctl_done != GLOBUS_TRUE)
        readcontig_ctl_done = GLOBUS_TRUE;
    globus_cond_signal(&readcontig_ctl_cond);
    globus_mutex_unlock(&readcontig_ctl_lock);
}
/* Data callback for the contiguous-read path.  myargs points at a
   globus_size_t byte counter owned by ADIOI_GRIDFTP_ReadContig(); each
   invocation adds the delivered length and, until eof, re-registers a
   further read. */
static void readcontig_data_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error,
                               globus_byte_t *buffer, globus_size_t length, globus_off_t offset,
                               globus_bool_t eof)
{
    globus_size_t *bytes_read;

    bytes_read=(globus_size_t *)myargs;
    if (error)
    {
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    }
    *bytes_read+=length;
    /* I don't understand why the data callback has to keep recalling register_read,
       but everything I've done and all the examples I've seen seem to require
       that behavior to work... */
    /*
     * NOTE(review): re-registering at buffer+length seems to work but may
     * not be the correct solution -- it advances past the caller's buffer
     * on each callback; confirm against the globus_ftp_client examples.
     * Original author's trace for a big read of 256kB chunks:
     readcontig_data_cb: buffer 0x404e0008 length 0 offset 31719424 eof 1
     readcontig_data_cb: buffer 0x404a0008 length 65536 offset 31981568 eof 0
     readcontig_data_cb: buffer 0x404b0008 length 65536 offset 32047104 eof 0
     readcontig_data_cb: buffer 0x404c0008 length 65536 offset 32112640 eof 0
     readcontig_data_cb: buffer 0x404d0008 length 65536 offset 32178176 eof 0
    */
    if ( !eof )
        globus_ftp_client_register_read(handle,
                                        buffer+length,
                                        length,
                                        readcontig_data_cb,
                                        (void *)(bytes_read));
    return;
}
/* Synchronization state for the blocking wait in ADIOI_GRIDFTP_ReadDiscontig. */
static globus_mutex_t readdiscontig_ctl_lock;
static globus_cond_t readdiscontig_ctl_cond;
static globus_bool_t readdiscontig_ctl_done;

/* Control (completion) callback for the partial get issued by
   ADIOI_GRIDFTP_ReadDiscontig(): flags completion and wakes the waiter. */
static void readdiscontig_ctl_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error)
{
    if (error)
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));

    globus_mutex_lock(&readdiscontig_ctl_lock);
    if (readdiscontig_ctl_done != GLOBUS_TRUE)
        readdiscontig_ctl_done = GLOBUS_TRUE;
    globus_cond_signal(&readdiscontig_ctl_cond);
    globus_mutex_unlock(&readdiscontig_ctl_lock);
}
/* Data callback for the discontiguous-read path.  myargs points at a
   globus_size_t byte counter owned by the caller; each invocation adds
   the delivered length and, until eof, re-registers the same buffer.
   (Globus appears to require the data callback to keep re-registering
   reads until eof; all the client examples follow this pattern.) */
static void readdiscontig_data_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error,
                                  globus_byte_t *buffer, globus_size_t length, globus_off_t offset,
                                  globus_bool_t eof)
{
    globus_size_t *nread = (globus_size_t *)myargs;

    if (error)
    {
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    }

    *nread += length;
    if (!eof)
        globus_ftp_client_register_read(handle,
                                        buffer,
                                        length,
                                        readdiscontig_data_cb,
                                        (void *)nread);
}
/* ADIOI_GRIDFTP_ReadContig - read count elements of datatype into the
 * contiguous buffer buf, at the explicit offset or at the individual
 * file pointer.
 *
 * Issues a globus partial get covering exactly the requested byte range,
 * registers buf as the receive buffer, and blocks until
 * readcontig_ctl_cb() reports completion.  On return the file pointer /
 * fp_sys_posn are advanced by the number of bytes actually delivered.
 */
void ADIOI_GRIDFTP_ReadContig(ADIO_File fd, void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Status *status, int
                              *error_code)
{
    static char myname[]="ADIOI_GRIDFTP_ReadContig";
    int myrank, nprocs;
    MPI_Count datatype_size;
    globus_size_t len,bytes_read=0;
    globus_off_t goff;
    globus_result_t result;

    /* reads are forbidden on a file opened write-only */
    if ( fd->access_mode&ADIO_WRONLY )
    {
        *error_code=MPIR_ERR_MODE_WRONLY;
        return;
    }

    *error_code = MPI_SUCCESS;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
    MPI_Type_size_x(datatype, &datatype_size);

    /* anything but an explicit offset means read at the individual pointer */
    if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
    {
        offset = fd->fp_ind;
    }

    /* Do the gridftp I/O transfer */
    goff = (globus_off_t)offset;
    len = ((globus_size_t)datatype_size)*((globus_size_t)count);

    globus_mutex_init(&readcontig_ctl_lock, GLOBUS_NULL);
    globus_cond_init(&readcontig_ctl_cond, GLOBUS_NULL);
    readcontig_ctl_done=GLOBUS_FALSE;
    if ( (result=globus_ftp_client_partial_get(&(gridftp_fh[fd->fd_sys]),
                                               fd->filename,
                                               &(oattr[fd->fd_sys]),
                                               GLOBUS_NULL,
                                               goff,
                                               goff+(globus_off_t)len,
                                               readcontig_ctl_cb,
                                               GLOBUS_NULL))!=GLOBUS_SUCCESS )
    {
        globus_err_handler("globus_ftp_client_partial_get",myname,result);
        *error_code=MPI_ERR_IO;
        ADIOI_Error(fd,*error_code,myname);
        return;
    }

    /* bytes_read is accumulated by readcontig_data_cb as data arrives */
    result=globus_ftp_client_register_read(&(gridftp_fh[fd->fd_sys]),
                                           (globus_byte_t *)buf, len, readcontig_data_cb,
                                           (void *)(&bytes_read));
    if ( result != GLOBUS_SUCCESS )
    {
        globus_err_handler("globus_ftp_client_register_read",myname,result);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE, myname, __LINE__,
                                           MPI_ERR_IO, "**io", "**io %s",
                                           globus_object_printable_to_string(globus_error_get(result)));
        return;
    }

    /* The ctl callback won't start till the data callbacks complete, so it's
       safe to wait on just the ctl callback */
    globus_mutex_lock(&readcontig_ctl_lock);
    while ( readcontig_ctl_done!=GLOBUS_TRUE )
        globus_cond_wait(&readcontig_ctl_cond,&readcontig_ctl_lock);
    globus_mutex_unlock(&readcontig_ctl_lock);
    globus_mutex_destroy(&readcontig_ctl_lock);
    globus_cond_destroy(&readcontig_ctl_cond);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bytes_read);
#endif
    if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
    {
        fd->fp_ind += bytes_read;
        fd->fp_sys_posn = fd->fp_ind;
    }
    else {
        fd->fp_sys_posn = offset + bytes_read;
    }
}
/* ADIOI_GRIDFTP_ReadDiscontig - read a file region described by a
 * discontiguous filetype into a contiguous user buffer.
 *
 * Strategy: compute the full extent spanned by the wanted blocks, pull
 * that whole byte range into a temporary buffer with one partial get,
 * then copy the wanted pieces out according to the flattened filetype.
 *
 * Fixes relative to the original:
 *   - bytes_read was passed to readdiscontig_data_cb() as an accumulator
 *     without being initialized (undefined behavior; the status byte
 *     count was garbage).  It is now zero-initialized, matching
 *     ADIOI_GRIDFTP_ReadContig().
 *   - btype_size/ftype_size are now MPI_Count, the pointer type that
 *     MPI_Type_size_x() actually takes (they were MPI_Aint).
 *   - the extent-error fprintf used %d for wide values; now %ld with casts.
 */
void ADIOI_GRIDFTP_ReadDiscontig(ADIO_File fd, void *buf, int count,
                                 MPI_Datatype datatype, int file_ptr_type,
                                 ADIO_Offset offset, ADIO_Status *status, int
                                 *error_code)
{
    char myname[]="ADIOI_GRIDFTP_ReadDiscontig";
    int myrank,nprocs;
    /* size and extent of buffer in memory */
    MPI_Count btype_size;
    MPI_Aint btype_extent;
    /* size and extent of file record layout */
    MPI_Count ftype_size;
    MPI_Aint ftype_extent;
    /* size of file elemental type; seeks are done in units of this */
    MPI_Aint etype_size;
    MPI_Aint extent;
    ADIOI_Flatlist_node *flat_file;
    int i,buf_contig,boff,nblks;
    globus_off_t start,end,goff;
    globus_size_t bytes_read=0;   /* accumulated by readdiscontig_data_cb */
    globus_result_t result;
    globus_byte_t *tmp;

    /* reads are forbidden on a file opened write-only */
    if ( fd->access_mode&ADIO_WRONLY )
    {
        *error_code=MPIR_ERR_MODE_WRONLY;
        return;
    }

    *error_code=MPI_SUCCESS;

    MPI_Comm_rank(fd->comm,&myrank);
    MPI_Comm_size(fd->comm,&nprocs);

    etype_size=fd->etype_size;
    MPI_Type_size_x(fd->filetype,&ftype_size);
    MPI_Type_extent(fd->filetype,&ftype_extent);
    /* This is arguably unnecessary, as this routine assumes that the
       buffer in memory is contiguous */
    MPI_Type_size_x(datatype,&btype_size);
    MPI_Type_extent(datatype,&btype_extent);
    ADIOI_Datatype_iscontig(datatype,&buf_contig);

    if ( ( btype_extent!=btype_size ) || ( ! buf_contig ) )
    {
        FPRINTF(stderr,"[%d/%d] %s called with discontigous memory buffer\n",
                myrank,nprocs,myname);
        fflush(stderr);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE, myname, __LINE__,
                                           MPI_ERR_IO, "**io", 0 );
        return;
    }
    /* from here we can assume btype_extent==btype_size */

    /* Flatten out fd->filetype so we know which blocks to skip */
    ADIOI_Flatten_datatype(fd->filetype);
    flat_file = ADIOI_Flatlist;
    while (flat_file->type != fd->filetype && flat_file->next!=NULL)
        flat_file = flat_file->next;

    /* Figure out how big the area to read is */
    start=(globus_off_t)(offset*etype_size);
    goff=start;
    boff=0;
    extent=0;
    nblks=0;
    while ( boff < (count*btype_size) )
    {
        int blklen=0;

        for (i=0;i<flat_file->count;i++)
        {
            /* find the length of the next block */
            if ( (boff+flat_file->blocklens[i]) < (count*btype_size) )
                blklen=flat_file->blocklens[i];
            else
                blklen=(count*btype_size)-boff;
            /* increment buffer size to be used */
            boff+=blklen;
            /* compute extent -- the nblks*ftype_extent bit is
               there so we remember how many ftypes we've already
               been through */
            extent=MAX(extent,nblks*ftype_extent+flat_file->indices[i]+blklen);
            if ( boff>=(count*btype_size) )
                break;
        }
        nblks++;
    }
    if ( extent < count*btype_size )
    {
        FPRINTF(stderr,"[%d/%d] %s error in computing extent -- extent %ld is smaller than total bytes requested %ld!\n",
                myrank,nprocs,myname,(long)extent,(long)(count*btype_size));
        fflush(stderr);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE, myname, __LINE__,
                                           MPI_ERR_IO, "**io", 0);
        return;
    }
    end=start+(globus_off_t)extent;
    tmp=(globus_byte_t *)ADIOI_Malloc((size_t)extent*sizeof(globus_byte_t));

    /* start up the globus partial read */
    globus_mutex_init(&readdiscontig_ctl_lock, GLOBUS_NULL);
    globus_cond_init(&readdiscontig_ctl_cond, GLOBUS_NULL);
    readdiscontig_ctl_done=GLOBUS_FALSE;
    if ( (result=globus_ftp_client_partial_get(&(gridftp_fh[fd->fd_sys]),
                                               fd->filename,
                                               &(oattr[fd->fd_sys]),
                                               GLOBUS_NULL,
                                               start,
                                               end,
                                               readdiscontig_ctl_cb,
                                               GLOBUS_NULL))!=GLOBUS_SUCCESS )
    {
        globus_err_handler("globus_ftp_client_partial_get",myname,result);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE, myname, __LINE__,
                                           MPI_ERR_IO, "**io", "**io %s",
                                           globus_object_printable_to_string(globus_error_get(result)));
        return;
    }

    /* Do all the actual I/Os */
    /* Since globus_ftp_client_register_read() is brain-dead and doesn't
       let you specify an offset, we have to slurp the entire extent into
       memory and then parse out the pieces we want...  Sucks, doesn't it?
       This should probably be done in chunks (preferably of a size
       set using a file hint), but that'll have to come later.
       --TB */
    if ( (result=globus_ftp_client_register_read(&(gridftp_fh[fd->fd_sys]),
                                                 tmp,
                                                 (globus_size_t)extent,
                                                 readdiscontig_data_cb,
                                                 (void *)(&bytes_read)))!=GLOBUS_SUCCESS )
    {
        globus_err_handler("globus_ftp_client_register_read",myname,result);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", globus_object_printable_to_string(globus_error_get(result)));
        return;
    }

    /* The ctl callback won't start till the data callbacks complete, so it's
       safe to wait on just the ctl callback */
    globus_mutex_lock(&readdiscontig_ctl_lock);
    while ( readdiscontig_ctl_done!=GLOBUS_TRUE )
        globus_cond_wait(&readdiscontig_ctl_cond,&readdiscontig_ctl_lock);
    globus_mutex_unlock(&readdiscontig_ctl_lock);
    globus_mutex_destroy(&readdiscontig_ctl_lock);
    globus_cond_destroy(&readdiscontig_ctl_cond);

    /* Scatter the wanted blocks out of tmp into the user's buffer */
    boff=0;
    nblks=0;
    goff=0;
    while ( boff < (count*btype_size) )
    {
        int i,blklen;

        for (i=0;i<flat_file->count;i++)
        {
            if ( (boff+flat_file->blocklens[i]) < (count*btype_size) )
                blklen=flat_file->blocklens[i];
            else
                blklen=(count*btype_size)-boff;
            if ( blklen > 0 )
            {
                goff=nblks*ftype_extent+flat_file->indices[i];
                memcpy((globus_byte_t *)buf+boff,tmp+goff,(size_t)blklen);
                boff+=blklen;
                if ( boff>=(count*btype_size) )
                    break;
            }
        }
        nblks++;
    }
    ADIOI_Free(tmp);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bytes_read);
#endif
    if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
    {
        fd->fp_ind += extent;
        fd->fp_sys_posn = fd->fp_ind;
    }
    else {
        fd->fp_sys_posn = offset + extent;
    }
}
/* ADIOI_GRIDFTP_ReadStrided - dispatch a possibly-noncontiguous read.
 *
 * Chooses among four cases based on the contiguity of the memory
 * datatype and the file type:
 *   contig mem / discontig file  -> ReadDiscontig straight into buf
 *   discontig mem / contig file  -> ReadContig into a scratch buffer,
 *                                   then MPI_Unpack into buf
 *   discontig mem / discontig file -> ReadDiscontig into a scratch
 *                                   buffer, then MPI_Unpack into buf
 *   contig mem / contig file     -> plain ReadContig
 *
 * Cleanup relative to the original: removed the dead commented-out
 * fallback to ADIOI_GEN_ReadStrided and several unused locals, and made
 * btype_size an MPI_Count to match MPI_Type_size_x().
 */
void ADIOI_GRIDFTP_ReadStrided(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status, int
                               *error_code)
{
    int myrank, nprocs;
    int buf_contig,file_contig;
    MPI_Count btype_size;
    MPI_Aint bufsize;
    globus_byte_t *intermediate;

    *error_code = MPI_SUCCESS;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
    MPI_Type_size_x(datatype,&btype_size);
    bufsize=count*btype_size;
    ADIOI_Datatype_iscontig(fd->filetype,&file_contig);
    ADIOI_Datatype_iscontig(datatype,&buf_contig);

    if ( buf_contig && !file_contig )
    {
        /* Contiguous in memory, discontig in file */
        ADIOI_GRIDFTP_ReadDiscontig(fd, buf, count, datatype,
                                    file_ptr_type, offset, status, error_code);
    }
    else if ( !buf_contig && file_contig )
    {
        /* Discontiguous in mem, contig in file -- comparatively easy */
        int posn=0;

        /* read contiguous data into intermediate buffer */
        intermediate=(globus_byte_t *)ADIOI_Malloc((size_t)bufsize);
        ADIOI_GRIDFTP_ReadContig(fd, intermediate, bufsize, MPI_BYTE,
                                 file_ptr_type, offset, status, error_code);

        /* explode contents of intermediate buffer into main buffer */
        MPI_Unpack(intermediate,bufsize,&posn,buf,count,datatype,fd->comm);
        ADIOI_Free(intermediate);
    }
    else if ( !buf_contig && !file_contig )
    {
        /* Discontig in both mem and file -- the hardest case */
        int posn=0;

        /* read discontiguous data into intermediate buffer */
        intermediate=(globus_byte_t *)ADIOI_Malloc((size_t)bufsize);
        ADIOI_GRIDFTP_ReadDiscontig(fd, intermediate, bufsize, MPI_BYTE,
                                    file_ptr_type, offset, status, error_code);

        /* explode contents of intermediate buffer into main buffer */
        MPI_Unpack(intermediate,bufsize,&posn,buf,count,datatype,fd->comm);
        ADIOI_Free(intermediate);
    }
    else
    {
        /* fully contiguous on both sides: a plain contiguous read */
        ADIOI_GRIDFTP_ReadContig(fd, buf, count, datatype,
                                 file_ptr_type, offset, status, error_code);
    }
}

Просмотреть файл

@ -0,0 +1,241 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
#include "adioi.h"
/* Synchronization state shared by the ftp-client operations issued from
   ADIOI_GRIDFTP_Resize(). */
static globus_mutex_t resize_lock;
static globus_cond_t resize_cond;
static globus_bool_t resize_done;
static globus_bool_t resize_success;

/* Completion callback shared by all of the resize-path ftp operations:
   records success/failure and wakes the waiting thread. */
void resize_cb(void *myargs, globus_ftp_client_handle_t *handle,
               globus_object_t *error)
{
    if (error)
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));

    globus_mutex_lock(&resize_lock);
    resize_success = error ? GLOBUS_FALSE : GLOBUS_TRUE;
    resize_done = GLOBUS_TRUE;
    globus_cond_signal(&resize_cond);
    globus_mutex_unlock(&resize_lock);
}
/* Data callback for the zero-byte write used to extend a file in
   ADIOI_GRIDFTP_Resize(): re-registers the buffer until eof. */
static void resize_wrdata_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error,
                             globus_byte_t *buffer, globus_size_t length, globus_off_t offset,
                             globus_bool_t eof)
{
    if (error)
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));

    if (!eof)
        globus_ftp_client_register_read(handle, buffer, length,
                                        resize_wrdata_cb, myargs);
}
void ADIOI_GRIDFTP_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
{
int myrank, nprocs;
char myname[]="ADIOI_GRIDFTP_Resize";
globus_off_t fsize;
globus_result_t result;
*error_code = MPI_SUCCESS;
MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &myrank);
/* Sanity check */
if ( fd->access_mode&ADIO_RDONLY )
{
FPRINTF(stderr,"%s: attempt to resize read-only file %s!\n",
myname,fd->filename);
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
myname, __LINE__, MPI_ERR_IO,
"**io", 0);
return;
}
/* This routine is supposed to do the moral equivalent of truncate(),
but there's not an equivalent operation in the globus_ftp_client API. */
globus_mutex_init(&resize_lock,GLOBUS_NULL);
globus_cond_init(&resize_cond,GLOBUS_NULL);
resize_done=GLOBUS_FALSE;
if ( (result=globus_ftp_client_size(&(gridftp_fh[fd->fd_sys]),
fd->filename,
&(oattr[fd->fd_sys]),
&(fsize),
resize_cb,
GLOBUS_NULL))!=GLOBUS_SUCCESS )
{
globus_err_handler("globus_ftp_client_size",myname,result);
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
myname, __LINE__, MPI_ERR_IO,
"**io",
"**io %s", globus_object_printable_to_string(globus_error_get(result)));
return;
}
globus_mutex_lock(&resize_lock);
while ( resize_done!=GLOBUS_TRUE )
globus_cond_wait(&resize_lock,&resize_cond);
if ( fsize < (globus_off_t)size )
{
/* The file is smaller than the requested size, so
do a zero-byte write to where the new EOF should be. */
globus_byte_t touchbuf=(globus_byte_t)'\0';
resize_done=GLOBUS_FALSE;
if ( (result=globus_ftp_client_partial_put(&(gridftp_fh[fd->fd_sys]),
fd->filename,
&(oattr[fd->fd_sys]),
GLOBUS_NULL,
(globus_off_t)size,
(globus_off_t)size,
resize_cb,
GLOBUS_NULL))!=GLOBUS_SUCCESS )
{
globus_err_handler("globus_ftp_client_partial_put",myname,result);
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s",
globus_object_printable_to_string(globus_error_get(result)));
return;
}
if ( (result=globus_ftp_client_register_write(&(gridftp_fh[fd->fd_sys]),
(globus_byte_t *)&touchbuf,
0,
(globus_off_t)0,
GLOBUS_TRUE,
resize_wrdata_cb,
GLOBUS_NULL))!=GLOBUS_SUCCESS )
{
globus_err_handler("globus_ftp_client_register_write",myname,result);
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s",
globus_object_printable_to_string(globus_error_get(result)));
return;
}
globus_mutex_lock(&resize_lock);
while ( resize_done!=GLOBUS_TRUE )
globus_cond_wait(&resize_cond,&resize_lock);
globus_mutex_unlock(&resize_lock);
}
else if ( fsize > (globus_off_t)size )
{
/* The file is bigger than the requested size, so
we'll abuse globus_ftp_client_third_party_partial_put()
into truncating it for us. */
char *urlold;
size_t urllen;
urllen=strlen(fd->filename);
urlold=(char *)ADIOI_Malloc(urllen+5);
ADIOI_Snprintf(urlold,urllen+5,"%s.old",fd->filename);
resize_done=GLOBUS_FALSE;
resize_success=GLOBUS_FALSE;
if ( (result=globus_ftp_client_move(&(gridftp_fh[fd->fd_sys]),
fd->filename,
urlold,
&(oattr[fd->fd_sys]),
resize_cb,
GLOBUS_NULL))!=GLOBUS_SUCCESS )
{
globus_err_handler("globus_ftp_client_move",myname,result);
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s",
globus_object_printable_to_string(globus_error_get(result)));
return;
}
globus_mutex_lock(&resize_lock);
while ( resize_done!=GLOBUS_TRUE )
globus_cond_wait(&resize_cond,&resize_lock);
globus_mutex_unlock(&resize_lock);
if ( resize_success!=GLOBUS_TRUE )
{
*error_code = MPI_ERR_IO;
return;
}
resize_done=GLOBUS_FALSE;
if ( (result=globus_ftp_client_partial_third_party_transfer(&(gridftp_fh[fd->fd_sys]),
urlold,
&(oattr[fd->fd_sys]),
fd->filename,
&(oattr[fd->fd_sys]),
GLOBUS_NULL,
0,
(globus_off_t)size,
resize_cb,
GLOBUS_NULL))!=GLOBUS_SUCCESS )
{
globus_err_handler("globus_ftp_client_partial_third_party_transfer",myname,result);
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s",
globus_object_printable_to_string(globus_error_get(result)));
return;
}
globus_mutex_lock(&resize_lock);
while ( resize_done!=GLOBUS_TRUE )
globus_cond_wait(&resize_cond,&resize_lock);
globus_mutex_unlock(&resize_lock);
if ( resize_success!=GLOBUS_TRUE )
{
*error_code = MPI_ERR_IO;
ADIOI_Error(fd,*error_code,myname);
return;
}
resize_done=GLOBUS_FALSE;
if ( (result=globus_ftp_client_delete(&(gridftp_fh[fd->fd_sys]),
urlold,
&(oattr[fd->fd_sys]),
resize_cb,
GLOBUS_NULL))!=GLOBUS_SUCCESS )
{
globus_err_handler("globus_ftp_client_delete",myname,result);
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s",
globus_object_printable_to_string(globus_error_get(result)));
return;
}
globus_mutex_lock(&resize_lock);
while ( resize_done!=GLOBUS_TRUE )
globus_cond_wait(&resize_cond,&resize_lock);
globus_mutex_unlock(&resize_lock);
if ( resize_success!=GLOBUS_TRUE )
{
*error_code = MPI_ERR_IO;
ADIOI_Error(fd,*error_code,myname);
return;
}
ADIOI_Free(urlold);
}
globus_mutex_destroy(&resize_lock);
globus_cond_destroy(&resize_cond);
}

Просмотреть файл

@ -0,0 +1,473 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gridftp.h"
#include "adioi.h"
#include "adio_extern.h"
static globus_mutex_t writecontig_ctl_lock;
static globus_cond_t writecontig_ctl_cond;
static globus_bool_t writecontig_ctl_done;
/* Control (completion) callback for the contiguous-write transfer.
 * Invoked by the Globus FTP client when the partial put finishes: it
 * sets writecontig_ctl_done under writecontig_ctl_lock and signals the
 * thread blocked in ADIOI_GRIDFTP_WriteContig's wait loop.
 * An error object is reported to stderr but does not change the flag,
 * so the waiter always wakes up. */
static void writecontig_ctl_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error)
{
    if (error)
    {
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    }
    /* flag completion and wake the waiter; flag is only ever set here */
    globus_mutex_lock(&writecontig_ctl_lock);
    if ( writecontig_ctl_done!=GLOBUS_TRUE )
        writecontig_ctl_done=GLOBUS_TRUE;
    globus_cond_signal(&writecontig_ctl_cond);
    globus_mutex_unlock(&writecontig_ctl_lock);
#ifdef PRINT_ERR_MSG
    FPRINTF(stderr,"finished with contig write transaction\n");
#endif /* PRINT_ERR_MSG */
    return;
}
/* Data callback for the contiguous write: accumulates the number of
 * bytes transferred into the globus_size_t that was registered as the
 * callback argument, and re-arms the write until EOF is signalled. */
static void writecontig_data_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error,
                                globus_byte_t *buffer, globus_size_t length, globus_off_t offset,
                                globus_bool_t eof)
{
    /* running byte counter threaded through from the caller */
    globus_size_t *nbytes = (globus_size_t *)myargs;

    if (error)
    {
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    }

    *nbytes += length;

    /* The Globus examples all re-register the buffer from inside the
       data callback; the transfer stalls without this re-arm, odd as
       it looks. */
    if ( !eof )
    {
        globus_ftp_client_register_write(handle, buffer, length, offset,
                                         GLOBUS_TRUE, writecontig_data_cb,
                                         (void *)nbytes);
    }
#ifdef PRINT_ERR_MSG
    FPRINTF(stderr,"wrote %Ld bytes...",(long long)length);
#endif /* PRINT_ERR_MSG */
    return;
}
static globus_mutex_t writediscontig_ctl_lock;
static globus_cond_t writediscontig_ctl_cond;
static globus_bool_t writediscontig_ctl_done;
/* Control (completion) callback for the discontiguous-write transfer:
 * sets writediscontig_ctl_done under its lock and signals the thread
 * waiting in ADIOI_GRIDFTP_WriteDiscontig.  Errors are reported to
 * stderr but the flag is set regardless, so the waiter is released. */
static void writediscontig_ctl_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error)
{
    if (error)
    {
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    }
    globus_mutex_lock(&writediscontig_ctl_lock);
    if ( writediscontig_ctl_done!=GLOBUS_TRUE )
        writediscontig_ctl_done=GLOBUS_TRUE;
    globus_cond_signal(&writediscontig_ctl_cond);
    globus_mutex_unlock(&writediscontig_ctl_lock);
    return;
}
/* Data callback for the discontiguous write: accumulates the bytes
 * transferred into the globus_size_t pointed to by myargs and re-arms
 * the write until EOF.
 *
 * Fix vs. the original: the per-callback debug FPRINTF is now guarded
 * by PRINT_ERR_MSG, matching writecontig_data_cb above -- previously it
 * printed on every data callback in all builds. */
static void writediscontig_data_cb(void *myargs, globus_ftp_client_handle_t *handle, globus_object_t *error,
                                   globus_byte_t *buffer, globus_size_t length, globus_off_t offset,
                                   globus_bool_t eof)
{
    globus_size_t *bytes_written;
    bytes_written=(globus_size_t *)myargs;
    if (error)
    {
        FPRINTF(stderr, "%s\n", globus_object_printable_to_string(error));
    }
    *bytes_written+=length;
    /* I don't understand why the data callback has to keep recalling
       register_write, but everything I've done and all the examples I've
       seen seem to require that behavior to work... */
    if ( !eof )
        globus_ftp_client_register_write(handle,
                                         buffer,
                                         length,
                                         offset,
                                         /* NOTE(review): inside !eof this is
                                            GLOBUS_FALSE, while the contig
                                            version passes GLOBUS_TRUE --
                                            confirm which is intended */
                                         eof,
                                         writediscontig_data_cb,
                                         (void *)(bytes_written));
#ifdef PRINT_ERR_MSG
    FPRINTF(stderr,"wrote %Ld bytes...",(long long)length);
#endif /* PRINT_ERR_MSG */
    return;
}
/* ADIOI_GRIDFTP_WriteContig: write `count` elements of `datatype` from
 * the contiguous buffer `buf` to the gridftp file, either at the
 * explicit `offset` or at the individual file pointer.  Blocks until
 * the Globus transfer's control callback fires.  On success the file
 * position fields on fd are advanced by the bytes actually written and
 * *error_code is MPI_SUCCESS; failures produce an MPI_ERR_IO (or
 * MPI_ERR_AMODE for a read-only handle). */
void ADIOI_GRIDFTP_WriteContig(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status, int
                               *error_code)
{
    char myname[]="ADIOI_GRIDFTP_WriteContig";
    int myrank, nprocs;
    MPI_Count datatype_size;
    globus_size_t len,bytes_written=0;
    globus_off_t goff;
    globus_result_t result;

    /* writes are not permitted on a read-only handle */
    if ( fd->access_mode&ADIO_RDONLY )
        {
            *error_code=MPI_ERR_AMODE;
            return;
        }
    *error_code = MPI_SUCCESS;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
    MPI_Type_size_x(datatype, &datatype_size);

    /* individual-file-pointer mode: start at the current pointer */
    if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
        {
            offset = fd->fp_ind;
        }

    /* Do the gridftp I/O transfer */
    goff = (globus_off_t)offset;
    len = ((globus_size_t)datatype_size)*((globus_size_t)count);

    /* one-shot lock/cond/flag used to wait for writecontig_ctl_cb */
    globus_mutex_init(&writecontig_ctl_lock, GLOBUS_NULL);
    globus_cond_init(&writecontig_ctl_cond, GLOBUS_NULL);
    writecontig_ctl_done=GLOBUS_FALSE;

    /* start a partial put covering exactly [goff, goff+len) */
    if ( (result=globus_ftp_client_partial_put(&(gridftp_fh[fd->fd_sys]),
                                               fd->filename,
                                               &(oattr[fd->fd_sys]),
                                               GLOBUS_NULL,
                                               goff,
                                               goff+(globus_off_t)len,
                                               writecontig_ctl_cb,
                                               GLOBUS_NULL))!=GLOBUS_SUCCESS )
        {
            globus_err_handler("globus_ftp_client_partial_put",myname,result);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io",
                                               "**io %s", globus_object_printable_to_string(globus_error_get(result)));
            return;
        }

    /* hand the user buffer to Globus; writecontig_data_cb accumulates
       the actual transfer count into bytes_written */
    if ( (result=globus_ftp_client_register_write(&(gridftp_fh[fd->fd_sys]),
                                                  (globus_byte_t *)buf,
                                                  len,
                                                  goff,
                                                  GLOBUS_TRUE,
                                                  writecontig_data_cb,
                                                  (void *)(&bytes_written)))!=GLOBUS_SUCCESS )
        {
            globus_err_handler("globus_ftp_client_register_write",myname,result);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io",
                                               "**io %s", globus_object_printable_to_string(globus_error_get(result)));
            return;
        }

    /* The ctl callback won't start till the data callbacks complete, so it's
       safe to wait on just the ctl callback */
    globus_mutex_lock(&writecontig_ctl_lock);
    while ( writecontig_ctl_done!=GLOBUS_TRUE )
        globus_cond_wait(&writecontig_ctl_cond,&writecontig_ctl_lock);
    globus_mutex_unlock(&writecontig_ctl_lock);

    globus_mutex_destroy(&writecontig_ctl_lock);
    globus_cond_destroy(&writecontig_ctl_cond);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bytes_written);
#endif

    /* advance the file position by what was actually transferred */
    if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
        {
            offset = fd->fp_ind;
            fd->fp_ind += bytes_written;
            fd->fp_sys_posn = fd->fp_ind;
        }
    else {
        fd->fp_sys_posn = offset + bytes_written;
    }
}
/* ADIOI_GRIDFTP_WriteDiscontig: write a *contiguous* user buffer into
 * the discontiguous file region described by fd->filetype.  The file
 * type is flattened, the total file extent of the request is computed,
 * one Globus partial put spanning that extent is opened, and each
 * flattened block is registered as a separate write.  Blocks until the
 * control callback fires.
 *
 * Documented assumptions (from the original): the memory datatype is
 * contiguous (checked below), and ftype_size is an integer multiple of
 * btype_size or vice versa.
 *
 * Fixes vs. the original:
 *  - bytes_written was uninitialized although the data callbacks
 *    accumulate into it; it now starts at 0.
 *  - the two early error paths called globus_error_get(result) while
 *    `result` was still uninitialized; they now use literal messages.
 *  - FPRINTF format specifiers for MPI_Aint values used %d; they now
 *    use %ld with explicit casts.
 */
void ADIOI_GRIDFTP_WriteDiscontig(ADIO_File fd, void *buf, int count,
                                  MPI_Datatype datatype, int file_ptr_type,
                                  ADIO_Offset offset, ADIO_Status *status, int
                                  *error_code)
{
    char myname[]="ADIOI_GRIDFTP_WriteDiscontig";
    int myrank,nprocs;
    /* NOTE(review): MPI_Type_size_x takes an MPI_Count*; MPI_Aint and
       MPI_Count match on common ABIs but this should be confirmed. */
    MPI_Aint btype_size,btype_extent;
    MPI_Aint ftype_size,ftype_extent;
    MPI_Aint etype_size;
    MPI_Aint extent;
    ADIOI_Flatlist_node *flat_file;
    int buf_contig,boff,i,nblks;
    globus_off_t start,end,goff;
    globus_size_t bytes_written=0;    /* accumulated by writediscontig_data_cb */
    globus_result_t result;

    MPI_Comm_rank(fd->comm,&myrank);
    MPI_Comm_size(fd->comm,&nprocs);
    etype_size=fd->etype_size;
    MPI_Type_size_x(fd->filetype,&ftype_size);
    MPI_Type_extent(fd->filetype,&ftype_extent);
    /* This is arguably unnecessary, as this routine assumes that the
       buffer in memory is contiguous */
    MPI_Type_size_x(datatype,&btype_size);
    MPI_Type_extent(datatype,&btype_extent);
    ADIOI_Datatype_iscontig(datatype,&buf_contig);

    if ( ( btype_extent!=btype_size ) || ( ! buf_contig ) )
        {
            FPRINTF(stderr,"[%d/%d] %s called with discontigous memory buffer\n",
                    myrank,nprocs,myname);
            fflush(stderr);
            /* do NOT consult `result` here -- no Globus call has been
               made yet, so it carries no value */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io",
                                               "**io %s", "discontiguous memory buffer not supported");
            return;
        }
    /* from here we can assume btype_extent==btype_size */

    /* Flatten out fd->filetype so we know which blocks to skip */
    ADIOI_Flatten_datatype(fd->filetype);
    flat_file = ADIOI_Flatlist;
    while (flat_file->type != fd->filetype && flat_file->next!=NULL)
        flat_file = flat_file->next;

    /* Figure out how big the area to write is.
       ASSUMPTION: ftype_size is an integer multiple of btype_size or
       vice versa. */
    start=(globus_off_t)(offset*etype_size);
    goff=start;
    boff=0;
    extent=0;
    nblks=0;
    while ( boff < (count*btype_size) )
        {
            int blklen;

            for (i=0;i<flat_file->count;i++)
                {
                    /* clip the last block to the bytes remaining */
                    if ( (boff+flat_file->blocklens[i]) < (count*btype_size) )
                        blklen=flat_file->blocklens[i];
                    else
                        blklen=(count*btype_size)-boff;
                    boff+=blklen;
                    extent=MAX(extent,nblks*ftype_extent+flat_file->indices[i]+blklen);
                    if ( boff>=(count*btype_size) )
                        break;
                }
            nblks++;
        }
    if ( extent < count*btype_size )
        {
            FPRINTF(stderr,"[%d/%d] %s error in computing extent -- extent %ld is smaller than total bytes requested %ld!\n",
                    myrank,nprocs,myname,(long)extent,(long)(count*btype_size));
            fflush(stderr);
            /* again: `result` holds nothing meaningful at this point */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io",
                                               "**io %s", "computed file extent smaller than requested write size");
            return;
        }
    end=start+(globus_off_t)extent;
    FPRINTF(stderr,"[%d/%d] %s writing %ld bytes into extent of %ld bytes starting at offset %Ld\n",
            myrank,nprocs,myname,(long)(count*btype_size),(long)extent,(long long)start);
    fflush(stderr);

    /* start up the globus partial write */
    globus_mutex_init(&writediscontig_ctl_lock, GLOBUS_NULL);
    globus_cond_init(&writediscontig_ctl_cond, GLOBUS_NULL);
    writediscontig_ctl_done=GLOBUS_FALSE;
    if ( (result=globus_ftp_client_partial_put(&(gridftp_fh[fd->fd_sys]),
                                               fd->filename,
                                               &(oattr[fd->fd_sys]),
                                               GLOBUS_NULL,
                                               start,
                                               end,
                                               writediscontig_ctl_cb,
                                               GLOBUS_NULL))!=GLOBUS_SUCCESS )
        {
            globus_err_handler("globus_ftp_client_partial_get",myname,result);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io",
                                               "**io %s", globus_object_printable_to_string(globus_error_get(result)));
            return;
        }

    /* Do all the actual I/Os: walk the flattened file type again and
       register one write per (clipped) block */
    boff=0;
    nblks=0;
    while ( boff < (count*btype_size) )
        {
            int i,blklen;

            for (i=0;i<flat_file->count;i++)
                {
                    if ( (boff+flat_file->blocklens[i]) < (count*btype_size) )
                        blklen=flat_file->blocklens[i];
                    else
                        blklen=(count*btype_size)-boff;
                    if ( blklen > 0 )
                        {
                            goff=start+nblks*ftype_extent+((globus_off_t)flat_file->indices[i]);
                            /*
                            FPRINTF(stderr,"[%d/%d] %s writing %d bytes from boff=%d at goff=%Ld\n",myrank,nprocs,myname,blklen,boff,goff);
                            */
                            if ( (result=globus_ftp_client_register_write(&(gridftp_fh[fd->fd_sys]),
                                                                          ((globus_byte_t *)buf)+boff,
                                                                          (globus_size_t)blklen,
                                                                          goff,
                                                                          GLOBUS_TRUE,
                                                                          writediscontig_data_cb,
                                                                          (void *)(&bytes_written)))!=GLOBUS_SUCCESS )
                                {
                                    globus_err_handler("globus_ftp_client_register_write",myname,result);
                                    *error_code=MPI_ERR_IO;
                                    ADIOI_Error(fd,*error_code,myname);
                                    return;
                                }
                            boff+=blklen;
                            if ( boff>=(count*btype_size) )
                                break;
                        }
                }
            nblks++;
        }

    /* The ctl callback won't start till the data callbacks complete, so it's
       safe to wait on just the ctl callback */
    globus_mutex_lock(&writediscontig_ctl_lock);
    while ( writediscontig_ctl_done!=GLOBUS_TRUE )
        globus_cond_wait(&writediscontig_ctl_cond,&writediscontig_ctl_lock);
    globus_mutex_unlock(&writediscontig_ctl_lock);
    globus_mutex_destroy(&writediscontig_ctl_lock);
    globus_cond_destroy(&writediscontig_ctl_cond);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bytes_written);
#endif

    /* advance the file position by the file extent touched */
    if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
        {
            fd->fp_ind += extent;
            fd->fp_sys_posn = fd->fp_ind;
        }
    else {
        fd->fp_sys_posn = offset + extent;
    }
}
/* When defined (the default), strided writes are delegated to the
 * generic ADIOI_GEN_WriteStrided implementation; the hand-rolled
 * pack-and-forward code below is kept only as an unbuilt alternative. */
#define GRIDFTP_USE_GENERIC_STRIDED
/* ADIOI_GRIDFTP_WriteStrided: strided-write entry point for the
 * gridftp driver.  Active branch: access-mode check, then delegation
 * to ADIOI_GEN_WriteStrided.  The #else branch dispatches on buffer/
 * file contiguity, MPI_Pack-ing discontiguous memory into an
 * intermediate buffer before calling the Contig/Discontig writers. */
void ADIOI_GRIDFTP_WriteStrided(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, ADIO_Status *status,
                                int *error_code)
{
#ifdef GRIDFTP_USE_GENERIC_STRIDED
    int myrank, nprocs;

    /* writes are not permitted on a read-only handle */
    if ( fd->access_mode&ADIO_RDONLY )
        {
            *error_code=MPI_ERR_AMODE;
            return;
        }
    *error_code = MPI_SUCCESS;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
    /* hand off to the generic strided implementation */
    ADIOI_GEN_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset,
                           status, error_code);
    return;
#else
    char myname[]="ADIOI_GRIDFTP_WriteStrided";
    int myrank, nprocs;
    int buf_contig,file_contig;
    MPI_Aint btype_size,bufsize;
    globus_byte_t *intermediate;

    *error_code = MPI_SUCCESS;
    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
    MPI_Type_size_x(datatype,&btype_size);
    bufsize=count*btype_size;
    ADIOI_Datatype_iscontig(fd->filetype,&file_contig);
    ADIOI_Datatype_iscontig(datatype,&buf_contig);
    if ( buf_contig && !file_contig )
        {
            /* Contiguous in memory, discontig in file */
            FPRINTF(stderr,"[%d/%d] %s called w/ contig mem, discontig file\n",
                    myrank,nprocs,myname);
            fflush(stderr);
            ADIOI_GRIDFTP_WriteDiscontig(fd, buf, count, datatype,
                                         file_ptr_type, offset, status, error_code);
        }
    else if ( !buf_contig && file_contig )
        {
            /* Discontiguous in mem, contig in file -- comparatively easy */
            int posn=0;

            FPRINTF(stderr,"[%d/%d] %s called w/ discontig mem, contig file\n",
                    myrank,nprocs,myname);
            fflush(stderr);
            /* squeeze contents of main buffer into intermediate buffer*/
            intermediate=(globus_byte_t *)ADIOI_Malloc((size_t)bufsize);
            MPI_Pack(buf,count,datatype,intermediate,bufsize,&posn,fd->comm);
            /* write contiguous data from intermediate buffer */
            ADIOI_GRIDFTP_WriteContig(fd, intermediate, bufsize, MPI_BYTE,
                                      file_ptr_type, offset, status, error_code);
            ADIOI_Free(intermediate);
        }
    else if ( !buf_contig && !file_contig )
        {
            /* Discontig in both mem and file -- the hardest case */
            int posn=0;

            FPRINTF(stderr,"[%d/%d] %s called w/ discontig mem, discontig file\n",
                    myrank,nprocs,myname);
            fflush(stderr);
            /* squeeze contents of main buffer into intermediate buffer*/
            intermediate=(globus_byte_t *)ADIOI_Malloc((size_t)bufsize);
            MPI_Pack(buf,count,datatype,intermediate,bufsize,&posn,fd->comm);
            /* write contiguous data from intermediate buffer */
            ADIOI_GRIDFTP_WriteDiscontig(fd, intermediate, bufsize, MPI_BYTE,
                                         file_ptr_type, offset, status, error_code);
            ADIOI_Free(intermediate);
        }
    else
        {
            /* Why did you bother calling WriteStrided?!?!?! */
            FPRINTF(stderr,"[%d/%d] Why the heck did you call %s with contiguous buffer *and* file types?\n",
                    myrank,nprocs,myname);
            ADIOI_GRIDFTP_WriteContig(fd, buf, count, datatype,
                                      file_ptr_type, offset, status, error_code);
        }
#endif /* ! GRIDFTP_USE_GENERIC_STRIDED */
}

Просмотреть файл

@ -0,0 +1,36 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2003 University of Chicago, Ohio Supercomputer Center.
* See COPYRIGHT notice in top-level directory.
*/
#include <unistd.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <fcntl.h>
#include "adio.h"
#include <globus_ftp_client.h>
/* Here are the canonical definitions of the extern's referenced by
ad_gridftp.h */
int num_gridftp_handles=0;
#ifndef ADIO_GRIDFTP_HANDLES_MAX
#define ADIO_GRIDFTP_HANDLES_MAX 200
#endif /* ! ADIO_GRIDFTP_HANDLES_MAX */
/* having to keep not one but two big global tables sucks... */
globus_ftp_client_handle_t gridftp_fh[ADIO_GRIDFTP_HANDLES_MAX];
globus_ftp_client_operationattr_t oattr[ADIO_GRIDFTP_HANDLES_MAX];
/* Report a failed Globus call to stderr, tagged with this process's
 * rank/size in MPI_COMM_WORLD, the failing routine, the printable
 * Globus error text, and the calling function's name. */
void globus_err_handler(const char *routine, const char *caller,
                        globus_result_t result)
{
    int rank, size;
    globus_object_t *errobj;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    errobj = globus_error_get(result);
    FPRINTF(stderr, "[%d/%d] %s error \"%s\", called from %s\n",
            rank, size, routine,
            globus_object_printable_to_string(errobj), caller);
}

Просмотреть файл

@ -0,0 +1,21 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
## HFS ADIO driver (legacy; upstream marks it "no longer supported").
## Compiled only when configure enables the BUILD_AD_HFS conditional.
if BUILD_AD_HFS
noinst_HEADERS += adio/ad_hfs/ad_hfs.h
romio_other_sources += \
    adio/ad_hfs/ad_hfs_read.c \
    adio/ad_hfs/ad_hfs_open.c \
    adio/ad_hfs/ad_hfs_write.c \
    adio/ad_hfs/ad_hfs_fcntl.c \
    adio/ad_hfs/ad_hfs_resize.c \
    adio/ad_hfs/ad_hfs.c
endif BUILD_AD_HFS

Просмотреть файл

@ -0,0 +1 @@
This code is no longer supported.

Просмотреть файл

@ -0,0 +1,36 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_hfs.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
/* Dispatch table wiring the generic ADIO interface to the HFS driver.
 * HFS supplies its own open, contiguous read/write, fcntl, and resize;
 * all other entries fall back to the generic (ADIOI_GEN_*) or no-op
 * nonblocking (ADIOI_FAKE_*) implementations.
 * NOTE(review): unlike the Lustre table in this same refresh, no
 * OpenColl/Feature/fs-prefix entries appear here -- presumably an older
 * table layout for this unsupported driver; confirm against the
 * ADIOI_Fns_struct declaration in adioi.h. */
struct ADIOI_Fns_struct ADIO_HFS_operations = {
    ADIOI_HFS_Open, /* Open */
    ADIOI_HFS_ReadContig, /* ReadContig */
    ADIOI_HFS_WriteContig, /* WriteContig */
    ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
    ADIOI_GEN_SeekIndividual, /* SeekIndividual */
    ADIOI_HFS_Fcntl, /* Fcntl */
    ADIOI_GEN_SetInfo, /* SetInfo */
    ADIOI_GEN_ReadStrided, /* ReadStrided */
    ADIOI_GEN_WriteStrided, /* WriteStrided */
    ADIOI_GEN_Close, /* Close */
    ADIOI_FAKE_IreadContig, /* IreadContig */
    ADIOI_FAKE_IwriteContig, /* IwriteContig */
    ADIOI_FAKE_IODone, /* ReadDone */
    ADIOI_FAKE_IODone, /* WriteDone */
    ADIOI_FAKE_IOComplete, /* ReadComplete */
    ADIOI_FAKE_IOComplete, /* WriteComplete */
    ADIOI_FAKE_IreadStrided, /* IreadStrided */
    ADIOI_FAKE_IwriteStrided, /* IwriteStrided */
    ADIOI_GEN_Flush, /* Flush */
    ADIOI_HFS_Resize, /* Resize */
    ADIOI_GEN_Delete, /* Delete */
};

Просмотреть файл

@ -0,0 +1,34 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_HFS_INCLUDE
#define AD_HFS_INCLUDE

/* Prototypes and system headers for the HP-UX/SPPUX "HFS" ADIO driver
 * (marked "no longer supported" by the accompanying README). */

#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"

#ifdef SPPUX
/* SPPUX builds pull in the Convex/Exemplar headers used by the
   pread/pwrite-based I/O paths */
# include <sys/cnx_types.h>
# include <sys/puio.h>
# include <cnx_unistd.h>
#endif

/* open the file named in fd->filename; sets fd->fd_sys and *error_code */
void ADIOI_HFS_Open(ADIO_File fd, int *error_code);
/* contiguous read at an explicit offset or the individual file pointer */
void ADIOI_HFS_ReadContig(ADIO_File fd, void *buf, int count,
                          MPI_Datatype datatype, int file_ptr_type,
                          ADIO_Offset offset, ADIO_Status *status, int
                          *error_code);
/* contiguous write at an explicit offset or the individual file pointer */
void ADIOI_HFS_WriteContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int
                           *error_code);
/* file-control operations: get size, preallocate, set atomicity */
void ADIOI_HFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
                     *error_code);
/* truncate/extend the file to exactly `size` bytes */
void ADIOI_HFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);

#endif

Просмотреть файл

@ -0,0 +1,113 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_hfs.h"
#include "adio_extern.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
/* ADIOI_HFS_Fcntl: service ADIO file-control requests for the HFS driver.
 *
 * flag selects the operation:
 *   ADIO_FCNTL_GET_FSIZE     - report the file size via lseek64(SEEK_END)
 *   ADIO_FCNTL_SET_DISKSPACE - preallocate space (prealloc64 on HPUX,
 *                              prealloc on SPPUX for <2GB, generic
 *                              ADIOI_GEN_Prealloc fallback otherwise)
 *   ADIO_FCNTL_SET_ATOMICITY - record the atomicity flag on fd
 * Any other flag aborts the job.
 *
 * Fix vs. the original: removed "ADIOI_Free(buf);" -- buf was declared
 * but never allocated or initialized, so freeing it was undefined
 * behavior.  The unused locals (i, ntimes, curr_fsize, alloc_size,
 * size, len, done, status, buf) were dropped with it.  The original
 * preprocessor/brace structure is otherwise preserved verbatim.
 */
void ADIOI_HFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code)
{
    int err;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_HFS_FCNTL";
#endif

    switch(flag) {
    case ADIO_FCNTL_GET_FSIZE:
	fcntl_struct->fsize = lseek64(fd->fd_sys, 0, SEEK_END);
#ifdef HPUX
	/* restore the system file position disturbed by the size probe */
	if (fd->fp_sys_posn != -1)
	    lseek64(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
	/* not required in SPPUX since there we use pread/pwrite */
#endif
	if (fcntl_struct->fsize == -1) {
#ifdef MPICH
	    *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io",
		"**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
	    *error_code = MPI_ERR_UNKNOWN;
#else /* MPICH-1 */
	    *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
			  myname, "I/O Error", "%s", strerror(errno));
	    ADIOI_Error(fd, *error_code, myname);
#endif
	}
	else *error_code = MPI_SUCCESS;
	break;

    case ADIO_FCNTL_SET_DISKSPACE:
	/* will be called by one process only */
#ifdef HPUX
	err = prealloc64(fd->fd_sys, fcntl_struct->diskspace);
	/* prealloc64 works only if file is of zero length */
	if (err && (errno != ENOTEMPTY)) {
#ifdef MPICH
	    *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io",
		"**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
	    *error_code = MPI_ERR_UNKNOWN;
#else
	    *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
			  myname, "I/O Error", "%s", strerror(errno));
	    ADIOI_Error(fd, *error_code, myname);
#endif
	    return;
	}
	if (err && (errno == ENOTEMPTY)) {
#endif
#ifdef SPPUX
	/* SPPUX has no prealloc64. therefore, use prealloc
	   if size < (2GB - 1), otherwise use long method. */
	if (fcntl_struct->diskspace <= 2147483647) {
	    err = prealloc(fd->fd_sys, (off_t) fcntl_struct->diskspace);
	    if (err && (errno != ENOTEMPTY)) {
#ifdef MPICH
		*error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io",
		    "**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
		*error_code = MPI_ERR_UNKNOWN;
#else
		*error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
			      myname, "I/O Error", "%s", strerror(errno));
		ADIOI_Error(fd, *error_code, myname);
#endif
		return;
	    }
	}
	if ((fcntl_struct->diskspace > 2147483647) ||
	    (err && (errno == ENOTEMPTY))) {
#endif
	    /* generic fallback preallocation */
	    ADIOI_GEN_Prealloc(fd,fcntl_struct->diskspace, error_code);
	}
#ifdef HPUX
	if (fd->fp_sys_posn != -1)
	    lseek64(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
	/* not required in SPPUX since there we use pread/pwrite */
#endif
	}
	*error_code = MPI_SUCCESS;
	break;

    case ADIO_FCNTL_SET_ATOMICITY:
	fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
	*error_code = MPI_SUCCESS;
	break;

    default:
	FPRINTF(stderr, "Unknown flag passed to ADIOI_HFS_Fcntl\n");
	MPI_Abort(MPI_COMM_WORLD, 1);
    }
}

Просмотреть файл

@ -0,0 +1,67 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
/* Open (and optionally create) the file named by fd->filename with
 * open64(2), translating the ADIO access mode into POSIX flags.
 * Sets fd->fd_sys (or -1 on failure) and *error_code; in append mode
 * the individual file pointer is positioned at end-of-file. */
void ADIOI_HFS_Open(ADIO_File fd, int *error_code)
{
    int file_perm, saved_mask, open_flags;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_HFS_OPEN";
#endif

    /* Pick creation permissions: either the caller's, or the process
       default derived by probing (and restoring) the umask. */
    if (fd->perm == ADIO_PERM_NULL) {
	saved_mask = umask(022);
	umask(saved_mask);
	file_perm = saved_mask ^ 0666;
    }
    else {
	file_perm = fd->perm;
    }

    /* Map the ADIO access mode onto open(2) flags. */
    open_flags = 0;
    if (fd->access_mode & ADIO_CREATE)
	open_flags |= O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
	open_flags |= O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
	open_flags |= O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
	open_flags |= O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
	open_flags |= O_EXCL;

    fd->fd_sys = open64(fd->filename, open_flags, file_perm);
    fd->fd_direct = -1;

    /* Append mode: park the individual file pointer at EOF. */
    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) {
	fd->fp_ind = lseek64(fd->fd_sys, 0, SEEK_END);
#ifdef HPUX
	fd->fp_sys_posn = fd->fp_ind;
#endif
    }

#ifdef SPPUX
    fd->fp_sys_posn = -1;	/* set it to null bec. we use pread, pwrite*/
#endif

    if (fd->fd_sys == -1 ) {
#ifdef MPICH
	*error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io",
	    "**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
	*error_code = MPI_ERR_UNKNOWN;
#else /* MPICH-1 */
	*error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
		      myname, "I/O Error", "%s", strerror(errno));
	ADIOI_Error(ADIO_FILE_NULL, *error_code, myname);
#endif
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,71 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
/* ADIOI_HFS_ReadContig: contiguous read of count*sizeof(datatype) bytes,
 * either at the explicit `offset` or at the individual file pointer.
 * SPPUX builds use pread64 (no seek, fp_sys_posn stays -1); HPUX builds
 * seek-then-read and track fp_sys_posn.  On success the transferred
 * byte count is recorded in `status` (when supported) and *error_code
 * is MPI_SUCCESS. */
void ADIOI_HFS_ReadContig(ADIO_File fd, void *buf, int count,
                          MPI_Datatype datatype, int file_ptr_type,
                          ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    MPI_Count err=-1, datatype_size, len;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_HFS_READCONTIG";
#endif

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

#ifdef SPPUX
    fd->fp_sys_posn = -1;	/* set it to null, since we are using pread */
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET)
        err = pread64(fd->fd_sys, buf, len, offset);
    else {	/* read from curr. location of ind. file pointer */
        err = pread64(fd->fd_sys, buf, len, fd->fp_ind);
        fd->fp_ind += err;
    }
#endif

#ifdef HPUX
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        /* seek only when the kernel position differs from the target */
        if (fd->fp_sys_posn != offset)
            lseek64(fd->fd_sys, offset, SEEK_SET);
        err = read(fd->fd_sys, buf, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else {	/* read from curr. location of ind. file pointer */
        if (fd->fp_sys_posn != fd->fp_ind)
            lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
        err = read(fd->fd_sys, buf, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }
#endif

#ifdef HAVE_STATUS_SET_BYTES
    if (err != -1) MPIR_Status_set_bytes(status, datatype, err);
#endif

    if (err == -1 ) {
#ifdef MPICH
        *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io",
            "**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
        *error_code = (err == -1) ? MPI_ERR_UNKNOWN : MPI_SUCCESS;
#else /* MPICH-1 */
        *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
                      myname, "I/O Error", "%s", strerror(errno));
        ADIOI_Error(fd, *error_code, myname);
#endif
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,31 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_hfs.h"
/* Truncate or extend the file backing fd to exactly `size` bytes via
 * ftruncate64(2); *error_code reports success or the build-specific
 * MPI error code. */
void ADIOI_HFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
{
    int rc;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_HFS_RESIZE";
#endif

    rc = ftruncate64(fd->fd_sys, size);
    if (rc != -1) {
	*error_code = MPI_SUCCESS;
	return;
    }

    /* ftruncate64 failed: report according to the build flavor */
#ifdef MPICH
    *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io",
	"**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
    *error_code = MPI_ERR_UNKNOWN;
#else /* MPICH-1 */
    *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
		  myname, "I/O Error", "%s", strerror(errno));
    ADIOI_Error(fd, *error_code, myname);
#endif
}

Просмотреть файл

@ -0,0 +1,70 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_hfs.h"
#ifndef HAVE_LSEEK64
#define lseek64 lseek
#endif
/* ADIOI_HFS_WriteContig: contiguous write of count*sizeof(datatype)
 * bytes, either at the explicit `offset` or at the individual file
 * pointer.  SPPUX builds use pwrite64; HPUX builds seek-then-write and
 * track fp_sys_posn.
 *
 * Fix vs. the original: in the PRINT_ERR_MSG build, the err == -1
 * error branch set *error_code = MPI_SUCCESS, silently masking write
 * failures; it now reports MPI_ERR_UNKNOWN, matching
 * ADIOI_HFS_ReadContig. */
void ADIOI_HFS_WriteContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    MPI_Count err=-1, datatype_size, len;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_HFS_WRITECONTIG";
#endif

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

#ifdef SPPUX
    fd->fp_sys_posn = -1;	/* set it to null, since we are using pwrite */
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET)
        err = pwrite64(fd->fd_sys, buf, len, offset);
    else {	/* write from curr. location of ind. file pointer */
        err = pwrite64(fd->fd_sys, buf, len, fd->fp_ind);
        fd->fp_ind += err;
    }
#endif

#ifdef HPUX
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        /* seek only when the kernel position differs from the target */
        if (fd->fp_sys_posn != offset)
            lseek64(fd->fd_sys, offset, SEEK_SET);
        err = write(fd->fd_sys, buf, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else {	/* write from curr. location of ind. file pointer */
        if (fd->fp_sys_posn != fd->fp_ind)
            lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
        err = write(fd->fd_sys, buf, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }
#endif

#ifdef HAVE_STATUS_SET_BYTES
    if (err != -1) MPIR_Status_set_bytes(status, datatype, err);
#endif

    if (err == -1) {
#ifdef MPICH
        *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io",
            "**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
        *error_code = MPI_ERR_UNKNOWN;	/* was MPI_SUCCESS, masking the failure */
#else /* MPICH-1 */
        *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
                      myname, "I/O Error", "%s", strerror(errno));
        ADIOI_Error(fd, *error_code, myname);
#endif
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,22 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
## Lustre ADIO driver sources, compiled only when configure enables
## the BUILD_AD_LUSTRE conditional.
if BUILD_AD_LUSTRE
noinst_HEADERS += adio/ad_lustre/ad_lustre.h
romio_other_sources += \
    adio/ad_lustre/ad_lustre.c \
    adio/ad_lustre/ad_lustre_open.c \
    adio/ad_lustre/ad_lustre_rwcontig.c \
    adio/ad_lustre/ad_lustre_wrcoll.c \
    adio/ad_lustre/ad_lustre_wrstr.c \
    adio/ad_lustre/ad_lustre_hints.c \
    adio/ad_lustre/ad_lustre_aggregate.c
endif BUILD_AD_LUSTRE

Просмотреть файл

@ -0,0 +1,55 @@
Upcoming soon:
o Hierarchical striping as described in the paper from CCGrid2007
http://ft.ornl.gov/projects/io/pubs/CCGrid-2007-file-joining.pdf
Further out:
o To post the code for ParColl (Partitioned collective IO)
-----------------------------------------------------
V05:
-----------------------------------------------------
Improved data redistribution
o Improve I/O pattern identification. Besides checking interleaving,
if request I/O size is small, collective I/O will be performed.
The hint bigsize can be used to define the req size value.
o Provide hint CO for load balancing to control the number of
IO clients for each OST
o Produce stripe-contiguous I/O pattern that Lustre prefers
o Control read-modify-write in data sieving in collective IO
by hint ds_in_coll.
o Reduce extent lock conflicts by making each OST accessed by one or
  more fixed clients.
-----------------------------------------------------
V04:
-----------------------------------------------------
o Direct IO and Lockless IO support
-----------------------------------------------------
V03:
-----------------------------------------------------
o Correct detection of fs_type when lustre: prefix is not given
o Further fix on stripe alignment
o Tested/Enabled striping hints over Cray XT (Catamount and CNL)
-----------------------------------------------------
V02:
-----------------------------------------------------
The Lustre ADIO driver has been cleaned up quite a lot. Compared
to the initial posting, here are the changes:
o Removal of dead/redundant code
o Removal of asynchronous IO piece as it appears outdated
o Bug fixes for setting Lustre Hints
o Bug fixes for data sieving
o Improved Setsize operation with one process calling ftruncate
o Improved collective IO with domain partitioning on
Lustre stripe boundary
Contributing:
o You may contribute via many different ways, such as
testing results, bug reports, and new feature patches.
o We appreciate any courtesy reference of this work.
o Disclaimer: you are welcome to try the code, but at your own risk.
Contact info:
For more info, visit http://ft.ornl.gov/projects/io/

Просмотреть файл

@ -0,0 +1,44 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
/* Dispatch table for the Lustre ADIO driver.  Entries prefixed
 * ADIOI_LUSTRE_* are implemented in this directory; ADIOI_GEN_* fall back
 * to the generic ROMIO implementations.  The order of entries must match
 * the field order of struct ADIOI_Fns_struct declared in adioi.h. */
struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
ADIOI_LUSTRE_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_LUSTRE_ReadContig, /* ReadContig */
ADIOI_LUSTRE_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
ADIOI_GEN_Fcntl, /* Fcntl */
ADIOI_LUSTRE_SetInfo, /* SetInfo */
ADIOI_GEN_ReadStrided, /* ReadStrided */
ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
ADIOI_GEN_Close, /* Close */
/* Cray XT Lustre lacked working AIO; fall back to the blocking "fake"
 * nonblocking implementations there. */
#if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
ADIOI_GEN_IreadContig, /* IreadContig */
ADIOI_GEN_IwriteContig, /* IwriteContig */
#else
ADIOI_FAKE_IreadContig, /* IreadContig */
ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif
ADIOI_GEN_IODone, /* ReadDone */
ADIOI_GEN_IODone, /* WriteDone */
ADIOI_GEN_IOComplete, /* ReadComplete */
ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_GEN_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
/* prefix string used by ROMIO's filesystem-type detection */
"LUSTRE:",
};

Просмотреть файл

@ -0,0 +1,91 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
/* NOTE(review): the guard name AD_UNIX_INCLUDE was apparently inherited
 * from ad_unix; a Lustre-specific name would be less collision-prone —
 * left unchanged here since renaming it is a code change. */
#ifndef AD_UNIX_INCLUDE
#define AD_UNIX_INCLUDE
/* temp*/
#define HAVE_ASM_TYPES_H 1
#include <unistd.h>
#include <linux/types.h>
#ifdef __linux__
# include <sys/ioctl.h> /* necessary for: */
# include <time.h>
# define __USE_GNU /* O_DIRECT and */
# include <fcntl.h> /* IO operations */
# undef __USE_GNU
#endif /* __linux__ */
/*#include <fcntl.h>*/
#include <sys/ioctl.h>
#include <lustre/lustre_user.h>
#include "adio.h"
/*#include "adioi.h"*/
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#ifdef HAVE_AIO_H
#include <aio.h>
#ifdef HAVE_SYS_AIO_H
#include <sys/aio.h>
#endif
#endif /* End of HAVE_SYS_AIO_H */
/* Driver entry points (wired into ADIO_LUSTRE_operations in ad_lustre.c).
 * All of them report status through *error_code (MPI_SUCCESS on success). */
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
int *error_code);
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
/* the lustre utilities: */
/* decide whether collective I/O is worthwhile for this access pattern */
int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
ADIO_Offset *len_list, int nprocs);
/* return {stripe_size, stripe_count, avail_cb_nodes} (caller frees) */
void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
int mode);
/* split this process's requests across aggregator file domains */
void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
ADIO_Offset *len_list, int contig_access_count,
int *striping_info, int nprocs,
int *count_my_req_procs_ptr,
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int ***buf_idx_ptr);
/* map offset -> owning aggregator rank; clips *len at stripe boundary */
int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
ADIO_Offset *len, int *striping_info);
#endif /* End of AD_UNIX_INCLUDE */

Просмотреть файл

@ -0,0 +1,322 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
#undef AGG_DEBUG
/* ADIOI_LUSTRE_Get_striping_info - gather the Lustre striping parameters
 * relevant for collective I/O into a freshly allocated int[3]:
 *   [0] stripe_size     (striping_unit hint)
 *   [1] stripe_count    (striping_factor hint)
 *   [2] avail_cb_nodes  (number of I/O aggregators to use)
 * mode == 0 selects the collective-read policy (one client per OST);
 * nonzero mode uses the user-supplied CO ratio (collective write).
 * The caller owns *striping_info_ptr and must ADIOI_Free() it.
 */
void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
                                    int mode)
{
    int unit_size, ost_count, clients_per_ost;
    int aggregators, d, procs_for_coll;
    int *info;

    /* stripe_size and stripe_count were validated in ADIOI_LUSTRE_Open() */
    unit_size = fd->hints->striping_unit;
    ost_count = fd->hints->striping_factor;
    procs_for_coll = fd->hints->cb_nodes;

    if (!mode) {
        /* Collective read: letting several clients hit the same OST at
         * once costs disk seeks, so restrict to one client per OST.
         * (There may be better policies for reads.) */
        clients_per_ost = 1;
    } else {
        /* CO was validated (>0) in ADIOI_LUSTRE_Open() */
        clients_per_ost = fd->hints->fs_hints.lustre.co_ratio;
    }

    /* Choose the aggregator count (algorithm courtesy Pascal Deveze,
     * pascal.deveze@bull.net).  To avoid extent-lock conflicts it must
     * either be a multiple of stripe_count or divide it exactly, so each
     * OST is touched by at most clients_per_ost constant clients. */
    if (procs_for_coll >= ost_count) {
        /* Multiple of stripe_count, with procs-per-OST capped at
         * clients_per_ost.  e.g. stripe_count=20, nprocs=42, CO=3
         * yields 40 aggregators. */
        int per_ost = procs_for_coll / ost_count;
        if (per_ost > clients_per_ost)
            per_ost = clients_per_ost;
        aggregators = ost_count * per_ost;
    } else {
        /* Fewer procs than OSTs: pick the largest divisor of
         * stripe_count that does not exceed procs_for_coll.
         * e.g. stripe_count=60, nprocs=8 yields 6.  Equivalent to
         * "while (stripe_count % n) n--;" but cheaper for large values. */
        aggregators = 1;
        for (d = 2; ost_count >= d * d; d++) {
            if (ost_count % d != 0)
                continue;
            if (ost_count / d <= procs_for_coll) {
                /* smallest divisor => largest acceptable quotient: done */
                aggregators = ost_count / d;
                break;
            }
            if (d <= procs_for_coll)
                /* d itself works, but keep scanning for a larger value */
                aggregators = d;
        }
    }

    info = (int *) ADIOI_Malloc(3 * sizeof(int));
    info[0] = unit_size;
    info[1] = ost_count;
    info[2] = aggregators;
    *striping_info_ptr = info;
}
/* ADIOI_LUSTRE_Calc_aggregator - map a file offset to the rank of the
 * aggregator that owns it, producing the stripe-contiguous access pattern
 * Lustre prefers.  On return *len is clipped to the bytes available before
 * the next stripe boundary.  Returns the MPI rank of the aggregator. */
int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
                                 ADIO_Offset *len, int *striping_info)
{
    int stripe_size = striping_info[0];
    int avail_cb_nodes = striping_info[2];
    ADIO_Offset stripe_end, remaining;
    int idx;

    /* which aggregator slot owns the stripe containing 'off' */
    idx = (int) ((off / stripe_size) % avail_cb_nodes);

    /* fd_end elsewhere is sized by fd->hints->cb_nodes; indexing past it
     * would overrun arrays, so this must never happen */
    if (idx >= fd->hints->cb_nodes)
        MPI_Abort(MPI_COMM_WORLD, 1);

    /* bytes between 'off' and the end of its stripe */
    stripe_end = (off / (ADIO_Offset) stripe_size + 1) * (ADIO_Offset) stripe_size;
    remaining = stripe_end - off;
    if (remaining < *len) {
        /* this aggregator holds only part of the requested region */
        *len = remaining;
    }

    /* translate the slot index into an actual rank
     * NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
    return fd->hints->ranklist[idx];
}
/* ADIOI_LUSTRE_Calc_my_req() - calculate what portions of the access requests
* of this process are located in the file domains of various processes
* (including this one)
*
* Two-pass algorithm: the first pass only counts requests per aggregator so
* that exact allocations can be made; the second pass fills in the
* offset/length lists.  All outputs are allocated here and owned by the
* caller:
*   *count_my_req_procs_ptr    - number of procs this process has data for
*   *count_my_req_per_proc_ptr - per-proc contiguous request counts (nprocs)
*   *my_req_ptr                - per-proc offset/length lists
*   *buf_idx_ptr               - per-proc start indices into a contiguous
*                                user buffer (meaningful only when the
*                                buffer datatype is contiguous)
*/
void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
ADIO_Offset *len_list, int contig_access_count,
int *striping_info, int nprocs,
int *count_my_req_procs_ptr,
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int ***buf_idx_ptr)
{
/* Nothing different from ADIOI_Calc_my_req(), except calling
* ADIOI_Lustre_Calc_aggregator() instead of the old one */
int *count_my_req_per_proc, count_my_req_procs, **buf_idx;
int i, l, proc;
ADIO_Offset avail_len, rem_len, curr_idx, off;
ADIOI_Access *my_req;
*count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
count_my_req_per_proc = *count_my_req_per_proc_ptr;
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
* process in process i's file domain. calloc initializes to zero.
* I'm allocating memory of size nprocs, so that I can do an
* MPI_Alltoall later on.
*/
buf_idx = (int **) ADIOI_Malloc(nprocs * sizeof(int*));
/* one pass just to calculate how much space to allocate for my_req;
* contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
*/
for (i = 0; i < contig_access_count; i++) {
/* short circuit offset/len processing if len == 0
* (zero-byte read/write
*/
if (len_list[i] == 0)
continue;
off = offset_list[i];
avail_len = len_list[i];
/* note: we set avail_len to be the total size of the access.
* then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
* the amount that was available.
*/
proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
count_my_req_per_proc[proc]++;
/* figure out how many data is remaining in the access
* we'll take care of this data (if there is any)
* in the while loop below.
*/
rem_len = len_list[i] - avail_len;
while (rem_len != 0) {
off += avail_len; /* point to first remaining byte */
avail_len = rem_len; /* save remaining size, pass to calc */
proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
count_my_req_per_proc[proc]++;
rem_len -= avail_len; /* reduce remaining length by amount from fd */
}
}
/* buf_idx is relevant only if buftype_is_contig.
* buf_idx[i] gives the index into user_buf where data received
* from proc 'i' should be placed. This allows receives to be done
* without extra buffer. This can't be done if buftype is not contig.
*/
/* initialize buf_idx vectors */
for (i = 0; i < nprocs; i++) {
/* add one to count_my_req_per_proc[i] to avoid zero size malloc */
buf_idx[i] = (int *) ADIOI_Malloc((count_my_req_per_proc[i] + 1)
* sizeof(int));
}
/* now allocate space for my_req, offset, and len */
*my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
my_req = *my_req_ptr;
count_my_req_procs = 0;
for (i = 0; i < nprocs; i++) {
if (count_my_req_per_proc[i]) {
my_req[i].offsets = (ADIO_Offset *)
ADIOI_Malloc(count_my_req_per_proc[i] *
sizeof(ADIO_Offset));
my_req[i].lens = ADIOI_Malloc(count_my_req_per_proc[i] *
sizeof(ADIO_Offset));
count_my_req_procs++;
}
my_req[i].count = 0; /* will be incremented where needed later */
}
/* now fill in my_req */
curr_idx = 0;
for (i = 0; i < contig_access_count; i++) {
/* short circuit offset/len processing if len == 0
* (zero-byte read/write */
if (len_list[i] == 0)
continue;
off = offset_list[i];
avail_len = len_list[i];
proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
l = my_req[proc].count;
/* buf_idx entries are int; assert the offset still fits */
ADIOI_Assert(curr_idx == (int) curr_idx);
ADIOI_Assert(l < count_my_req_per_proc[proc]);
buf_idx[proc][l] = (int) curr_idx;
curr_idx += avail_len;
rem_len = len_list[i] - avail_len;
/* store the proc, offset, and len information in an array
* of structures, my_req. Each structure contains the
* offsets and lengths located in that process's FD,
* and the associated count.
*/
my_req[proc].offsets[l] = off;
ADIOI_Assert(avail_len == (int) avail_len);
my_req[proc].lens[l] = (int) avail_len;
my_req[proc].count++;
/* same splitting across stripe boundaries as in the counting pass */
while (rem_len != 0) {
off += avail_len;
avail_len = rem_len;
proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
striping_info);
l = my_req[proc].count;
ADIOI_Assert(curr_idx == (int) curr_idx);
ADIOI_Assert(l < count_my_req_per_proc[proc]);
buf_idx[proc][l] = (int) curr_idx;
curr_idx += avail_len;
rem_len -= avail_len;
my_req[proc].offsets[l] = off;
ADIOI_Assert(avail_len == (int) avail_len);
my_req[proc].lens[l] = (int) avail_len;
my_req[proc].count++;
}
}
#ifdef AGG_DEBUG
for (i = 0; i < nprocs; i++) {
if (count_my_req_per_proc[i] > 0) {
FPRINTF(stdout, "data needed from %d (count = %d):\n",
i, my_req[i].count);
for (l = 0; l < my_req[i].count; l++) {
FPRINTF(stdout, " off[%d] = %lld, len[%d] = %d\n",
l, my_req[i].offsets[l], l, my_req[i].lens[l]);
}
}
}
#endif
*count_my_req_procs_ptr = count_my_req_procs;
*buf_idx_ptr = buf_idx;
}
/* ADIOI_LUSTRE_Docollect - decide whether collective I/O should be used.
 * When accesses are non-interleaved and the global average request size
 * exceeds the romio_lustre_coll_threshold hint, independent I/O is likely
 * cheaper.  Collective over fd->comm (every process must call it).
 * Returns 1 to perform collective I/O, 0 to fall back. */
int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
                           ADIO_Offset *len_list, int nprocs)
{
    int i, docollect = 1, big_req_size = 0;
    ADIO_Offset req_size = 0, total_req_size;
    int avg_req_size, total_access_count;

    /* sum the local request sizes, then reduce globally */
    for (i = 0; i < contig_access_count; i++)
        req_size += len_list[i];
    MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
                  fd->comm);
    MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM,
                  fd->comm);

    /* If no process has any contiguous access (all zero-length requests),
     * there is nothing to average; the original code divided by zero here.
     * Keep the default decision in that degenerate case. */
    if (total_access_count == 0)
        return docollect;

    /* estimate average req_size */
    avg_req_size = (int) (total_req_size / total_access_count);
    /* get hint of big_req_size */
    big_req_size = fd->hints->fs_hints.lustre.coll_threshold;
    /* Don't perform collective I/O if there are big requests */
    if ((big_req_size > 0) && (avg_req_size > big_req_size))
        docollect = 0;

    return docollect;
}

Просмотреть файл

@ -0,0 +1,97 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*/
#include "ad_lustre.h"
#include "adio_extern.h"
/* ADIOI_LUSTRE_Fcntl - service file-control requests:
 *   ADIO_FCNTL_GET_FSIZE     - report current file size (restores fp_sys_posn)
 *   ADIO_FCNTL_SET_DISKSPACE - preallocate by read-back/rewrite + zero-fill
 *   ADIO_FCNTL_SET_ATOMICITY - record the atomicity flag
 * *error_code receives MPI_SUCCESS or an MPI error code; unknown flags abort. */
void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code)
{
int i, ntimes;
ADIO_Offset curr_fsize, alloc_size, size, len, done;
ADIO_Status status;
char *buf;
#if defined(MPICH) || !defined(PRINT_ERR_MSG)
static char myname[] = "ADIOI_LUSTRE_FCNTL";
#endif
switch(flag) {
case ADIO_FCNTL_GET_FSIZE:
/* size = seek to end; then restore the system file position */
fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
if (fd->fp_sys_posn != -1)
lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
if (fcntl_struct->fsize == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
break;
case ADIO_FCNTL_SET_DISKSPACE:
/* will be called by one process only */
/* On file systems with no preallocation function, I have to
explicitly write
to allocate space. Since there could be holes in the file,
I need to read up to the current file size, write it back,
and then write beyond that depending on how much
preallocation is needed.
read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */
curr_fsize = lseek(fd->fd_sys, 0, SEEK_END);
alloc_size = fcntl_struct->diskspace;
/* phase 1: rewrite the existing bytes (up to the requested size)
so that any holes become real allocated blocks */
size = ADIOI_MIN(curr_fsize, alloc_size);
ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ);
done = 0;
for (i=0; i<ntimes; i++) {
len = ADIOI_MIN(size-done, ADIOI_PREALLOC_BUFSZ);
ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, done,
&status, error_code);
if (*error_code != MPI_SUCCESS) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname, __LINE__,
MPI_ERR_IO, "**io", "**io %s", strerror(errno));
return;
}
ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
done, &status, error_code);
if (*error_code != MPI_SUCCESS) return;
done += len;
}
/* phase 2: extend past the old EOF with zero-filled writes;
'done' continues from where phase 1 left off */
if (alloc_size > curr_fsize) {
memset(buf, 0, ADIOI_PREALLOC_BUFSZ);
size = alloc_size - curr_fsize;
ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
for (i=0; i<ntimes; i++) {
len = ADIOI_MIN(alloc_size-done, ADIOI_PREALLOC_BUFSZ);
ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
done, &status, error_code);
if (*error_code != MPI_SUCCESS) return;
done += len;
}
}
ADIOI_Free(buf);
/* restore the caller-visible system file position */
if (fd->fp_sys_posn != -1)
lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
*error_code = MPI_SUCCESS;
break;
case ADIO_FCNTL_SET_ATOMICITY:
/* normalize to 0/1 */
fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
*error_code = MPI_SUCCESS;
break;
default:
FPRINTF(stderr, "Unknown flag passed to ADIOI_LUSTRE_Fcntl\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
}

Просмотреть файл

@ -0,0 +1,180 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
#include "hint_fns.h"
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
/* ADIOI_LUSTRE_SetInfo - apply MPI_Info hints to an (opening or open) file.
 * On first call (fd->info == MPI_INFO_NULL, i.e. during open) it also
 * validates that the striping hints agree across all processes and lets
 * rank 0 create the file with the requested stripe layout via ioctl.
 * Collective over fd->comm (Bcast/Barrier inside).  Lustre-specific hints
 * handled here; everything else is delegated to ADIOI_GEN_SetInfo(). */
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
char *value;
int flag;
ADIO_Offset stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
struct lov_user_md lum = { 0 };
int err, myrank, fd_sys, perm, amode, old_mask;
static char myname[] = "ADIOI_LUSTRE_SETINFO";
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
if ( (fd->info) == MPI_INFO_NULL) {
/* This must be part of the open call. can set striping parameters
if necessary. */
MPI_Info_create(&(fd->info));
ADIOI_Info_set(fd->info, "direct_read", "false");
ADIOI_Info_set(fd->info, "direct_write", "false");
fd->direct_read = fd->direct_write = 0;
/* initialize lustre hints */
ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
fd->hints->fs_hints.lustre.co_ratio = 1;
ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
fd->hints->fs_hints.lustre.coll_threshold = 0;
ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;
/* has user specified striping or server buffering parameters
and do they have the same value on all processes? */
if (users_info != MPI_INFO_NULL) {
/* striping information */
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag)
str_unit=atoll(value);
ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag);
if (flag)
str_factor=atoll(value);
ADIOI_Info_get(users_info, "romio_lustre_start_iodevice",
MPI_MAX_INFO_VAL, value, &flag);
if (flag)
start_iodev=atoll(value);
/* direct read and write */
ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
ADIOI_Info_set(fd->info, "direct_read", "true");
fd->direct_read = 1;
}
ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
ADIOI_Info_set(fd->info, "direct_write", "true");
fd->direct_write = 1;
}
}
/* set striping information with ioctl */
/* broadcast rank 0's values and verify every rank parsed the same
hints; a mismatch is a user error and aborts */
MPI_Comm_rank(fd->comm, &myrank);
if (myrank == 0) {
stripe_val[0] = str_factor;
stripe_val[1] = str_unit;
stripe_val[2] = start_iodev;
}
MPI_Bcast(stripe_val, 3, MPI_OFFSET, 0, fd->comm);
if (stripe_val[0] != str_factor
|| stripe_val[1] != str_unit
|| stripe_val[2] != start_iodev) {
FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
"-striping_factor:striping_unit:start_iodevice "
"need to be identical across all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
} else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
/* if user has specified striping info, process 0 tries to set it */
if (!myrank) {
/* derive permission bits from umask if none were specified */
if (fd->perm == ADIO_PERM_NULL) {
old_mask = umask(022);
umask(old_mask);
perm = old_mask ^ 0666;
}
else perm = fd->perm;
amode = 0;
if (fd->access_mode & ADIO_CREATE)
amode = amode | O_CREAT;
if (fd->access_mode & ADIO_RDONLY)
amode = amode | O_RDONLY;
if (fd->access_mode & ADIO_WRONLY)
amode = amode | O_WRONLY;
if (fd->access_mode & ADIO_RDWR)
amode = amode | O_RDWR;
if (fd->access_mode & ADIO_EXCL)
amode = amode | O_EXCL;
/* we need to create file so ensure this is set */
amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
fd_sys = open(fd->filename, amode, perm);
if (fd_sys == -1) {
/* EEXIST means the file (and its layout) already exists;
striping can only be set at creation, so that is benign */
if (errno != EEXIST)
fprintf(stderr,
"Failure to open file %s %d %d\n",strerror(errno), amode, perm);
} else {
lum.lmm_magic = LOV_USER_MAGIC;
lum.lmm_pattern = 0;
lum.lmm_stripe_size = str_unit;
/* crude check for overflow of lustre internal datatypes.
* Silently cap to large value if user provides a value
* larger than lustre supports */
if (lum.lmm_stripe_size != str_unit) {
lum.lmm_stripe_size = UINT_MAX;
}
lum.lmm_stripe_count = str_factor;
if ( lum.lmm_stripe_count != str_factor) {
lum.lmm_stripe_count = USHRT_MAX;
}
lum.lmm_stripe_offset = start_iodev;
if (lum.lmm_stripe_offset != start_iodev) {
lum.lmm_stripe_offset = USHRT_MAX;
}
err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
if (err == -1 && errno != EEXIST) {
fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
}
close(fd_sys);
}
} /* End of striping parameters validation */
}
/* make sure rank 0 has finished creating the file before anyone
else proceeds to open it */
MPI_Barrier(fd->comm);
}
/* get other hint */
if (users_info != MPI_INFO_NULL) {
/* CO: IO Clients/OST,
* to keep the load balancing between clients and OSTs */
ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_co_ratio",
&(fd->hints->fs_hints.lustre.co_ratio), myname, error_code );
/* coll_threshold:
* if the req size is bigger than this, collective IO may not be performed.
*/
ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_coll_threshold",
&(fd->hints->fs_hints.lustre.coll_threshold), myname, error_code );
/* ds_in_coll: disable data sieving in collective IO */
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_lustre_ds_in_coll",
&(fd->hints->fs_hints.lustre.ds_in_coll), myname, error_code );
}
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);
/* environment-level overrides for direct I/O */
if (ADIOI_Direct_read) fd->direct_read = 1;
if (ADIOI_Direct_write) fd->direct_write = 1;
ADIOI_Free(value);
*error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,113 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
/* what is the basis for this define?
 * what happens if there are more than 1k UUIDs? */
#define MAX_LOV_UUID_COUNT 1000

/* ADIOI_LUSTRE_Open - open (and possibly create) the file, read its Lustre
 * striping parameters back into the hints/info object, and optionally open
 * a second O_DIRECT descriptor when the direct_read/direct_write hints are
 * set.  *error_code receives MPI_SUCCESS or an MPI error code. */
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, amode_direct;
    int lumlen;
    struct lov_user_md *lum = NULL;
    char *value;
#if defined(MPICH) || !defined(PRINT_ERR_MSG)
    static char myname[] = "ADIOI_LUSTRE_OPEN";
#endif

    /* derive permission bits from the process umask if none were given */
    if (fd->perm == ADIO_PERM_NULL) {
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }
    else perm = fd->perm;

    /* translate the ADIO access mode into open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;
    amode_direct = amode | O_DIRECT;

    fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);

    if (fd->fd_sys != -1) {
        int err;

        /* get file striping information and set it in info */
        /* odd malloc here because lov_user_md contains some fixed data and
         * then a list of 'lmm_objects' representing stripe */
        lumlen = sizeof(struct lov_user_md) +
            MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
        /* furthermore, Pascal Deveze reports that, even though we pass a
         * "GETSTRIPE" (read) flag to the ioctl, if some of the values of this
         * struct are uninitialized, the call can give an error. calloc in case
         * there are other members that must be initialized and in case
         * lov_user_md struct changes in future */
        lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen);
        lum->lmm_magic = LOV_USER_MAGIC;
        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum);
        if (!err) {
            /* publish the actual striping back into hints + info */
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

            fd->hints->striping_unit = lum->lmm_stripe_size;
            sprintf(value, "%d", lum->lmm_stripe_size);
            ADIOI_Info_set(fd->info, "striping_unit", value);

            fd->hints->striping_factor = lum->lmm_stripe_count;
            sprintf(value, "%d", lum->lmm_stripe_count);
            ADIOI_Info_set(fd->info, "striping_factor", value);

            fd->hints->fs_hints.lustre.start_iodevice = lum->lmm_stripe_offset;
            sprintf(value, "%d", lum->lmm_stripe_offset);
            ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);

            ADIOI_Free(value);
        }
        ADIOI_Free(lum);
    }

    /* in append mode, position both file pointers at the current end of
     * file.  (The original code performed this identical lseek twice,
     * once inside the block above and once here; once is sufficient.) */
    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    fd->fd_direct = -1;
    if (fd->direct_write || fd->direct_read) {
        fd->fd_direct = open(fd->filename, amode_direct, perm);
        if (fd->fd_direct != -1) {
            /* O_DIRECT alignment requirements: 4 KiB memory and file */
            fd->d_mem = fd->d_miniosz = (1<<12);
        } else {
            perror("cannot open file with O_Direct");
            fd->direct_write = fd->direct_read = 0;
        }
    }

    /* --BEGIN ERROR HANDLING-- */
    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
        (fd->direct_write || fd->direct_read))) {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
    }
    /* --END ERROR HANDLING-- */
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,208 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#ifdef _STDC_C99
#define _XOPEN_SOURCE 600
#else
#define _XOPEN_SOURCE 500
#endif
#include <unistd.h>
#include <stdlib.h>
#include <malloc.h>
#include "ad_lustre.h"
#define LUSTRE_MEMALIGN (1<<12) /* to use page_shift */
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, const void *buf, int len,
                                                ADIO_Offset offset, int *err);

/* Write 'len' bytes at 'offset' from an already memory-aligned buffer.
 * Full multiples of the direct-I/O chunk size (fd->d_miniosz) go through
 * the O_DIRECT descriptor; a short remainder falls back to the buffered
 * descriptor.  *err receives the combined byte count (or a pwrite error). */
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, const void *buf, int len,
                                                ADIO_Offset offset, int *err)
{
    int tail, aligned_len, written;

    if (len < fd->d_miniosz) {
        /* too small for direct I/O: buffered path only */
        *err = pwrite(fd->fd_sys, buf, len, offset);
        return;
    }
    tail = len % fd->d_miniosz;
    if (tail == 0) {
        /* perfectly chunk-aligned length: single direct write */
        *err = pwrite(fd->fd_direct, buf, len, offset);
        return;
    }
    /* direct write for the aligned body, buffered write for the tail */
    aligned_len = len - tail;
    written = pwrite(fd->fd_direct, buf, aligned_len, offset);
    written += pwrite(fd->fd_sys, ((char *)buf) + aligned_len, tail,
                      offset + aligned_len);
    *err = written;
}
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, const void *buf, int len,
                                               ADIO_Offset offset, int *err);

/* Read 'len' bytes at 'offset' into an already memory-aligned buffer.
 * Chunk-aligned spans use the O_DIRECT descriptor; a short remainder is
 * serviced through the buffered descriptor.  *err receives the combined
 * byte count (or a pread error). */
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, const void *buf, int len,
                                               ADIO_Offset offset, int *err)
{
    int tail, aligned_len, got;

    if (len < fd->d_miniosz) {
        /* too small for direct I/O: buffered path only */
        *err = pread(fd->fd_sys, (void *)buf, len, offset);
        return;
    }
    tail = len % fd->d_miniosz;
    if (tail == 0) {
        /* perfectly chunk-aligned length: single direct read */
        *err = pread(fd->fd_direct, (void *)buf, len, offset);
        return;
    }
    /* direct read for the aligned body, buffered read for the tail */
    aligned_len = len - tail;
    got = pread(fd->fd_direct, (void *)buf, aligned_len, offset);
    got += pread(fd->fd_sys, ((char *)buf) + aligned_len, tail,
                 offset + aligned_len);
    *err = got;
}
static int ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, int len,
                                 off_t offset, int rw);

/* ADIOI_LUSTRE_Directio - perform a direct-I/O transfer of 'len' bytes at
 * 'offset' (rw nonzero = write, zero = read).  An unaligned file-offset
 * prefix goes through the buffered descriptor; the remainder is done with
 * O_DIRECT, copying through a page-aligned bounce buffer when the user
 * buffer itself is not aligned to fd->d_mem.  Returns the number of bytes
 * transferred (or a negative pread/pwrite error). */
static int ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, int len,
                                 off_t offset, int rw)
{
    /* diff must start at 0: if 'offset' is already aligned and len == 0 we
     * return 'diff' below, which was previously left uninitialized (UB). */
    int err = -1, diff = 0, size = len, nbytes = 0;
    void *newbuf;

    /* handle an unaligned file-offset prefix via the buffered descriptor */
    if (offset % fd->d_miniosz) {
        diff = fd->d_miniosz - (offset % fd->d_miniosz);
        diff = ADIOI_MIN(diff, len);
        if (rw)
            nbytes = pwrite(fd->fd_sys, (void *)buf, diff, offset);
        else
            nbytes = pread(fd->fd_sys, (void *)buf, diff, offset);
        buf = ((char *) buf) + diff;
        offset += diff;
        size = len - diff;
    }
    /* nothing left after the prefix (also covers len == 0) */
    if (!size) {
        return diff;
    }
    if (rw) { /* direct I/O enabled */
        if (!(((long) buf) % fd->d_mem)) {
            /* user buffer already memory-aligned */
            ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, buf, size, offset, &err);
            nbytes += err;
        } else {
            /* stage through an aligned bounce buffer */
            newbuf = (void *) memalign(LUSTRE_MEMALIGN, size);
            if (newbuf) {
                memcpy(newbuf, buf, size);
                ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, newbuf, size, offset, &err);
                nbytes += err;
                ADIOI_Free(newbuf);
            }
            /* allocation failed: fall back to buffered I/O */
            else nbytes += pwrite(fd->fd_sys, buf, size, offset);
        }
        err = nbytes;
    } else {
        if (!(((long) buf) % fd->d_mem)) {
            ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, buf, size, offset, &err);
            nbytes += err;
        } else {
            newbuf = (void *) memalign(LUSTRE_MEMALIGN, size);
            if (newbuf) {
                ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, newbuf, size, offset, &err);
                if (err > 0) memcpy((void *)buf, newbuf, err);
                nbytes += err;
                ADIOI_Free(newbuf);
            }
            else nbytes += pread(fd->fd_sys, (void *)buf, size, offset);
        }
        err = nbytes;
    }
    return err;
}
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int io_mode, int *error_code);
/* ADIOI_LUSTRE_IOContig - common contiguous read/write path.
 * io_mode nonzero = write, zero = read.  Routes the transfer either
 * through plain read/write on fd_sys or through the direct-I/O helper,
 * then updates fp_sys_posn (and fp_ind for individual file pointers).
 * On failure fp_sys_posn is invalidated and *error_code set. */
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int io_mode, int *error_code)
{
int err=-1;
MPI_Count datatype_size, len;
static char myname[] = "ADIOI_LUSTRE_IOCONTIG";
MPI_Type_size_x(datatype, &datatype_size);
len = datatype_size * count;
/* individual file pointer: ignore the passed-in offset */
if (file_ptr_type == ADIO_INDIVIDUAL) {
offset = fd->fp_ind;
}
if (!(fd->direct_read || fd->direct_write)) {
/* buffered path: seek only if the descriptor is not already there */
if (fd->fp_sys_posn != offset) {
err = lseek(fd->fd_sys, offset, SEEK_SET);
if (err == -1) goto ioerr;
}
if (io_mode) {
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
#endif
err = write(fd->fd_sys, buf, len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
#endif
} else {
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
#endif
err = read(fd->fd_sys, (void *)buf, len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
#endif
}
} else {
/* direct-I/O path uses pread/pwrite internally; no lseek needed */
err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
}
if (err == -1) goto ioerr;
/* advance bookkeeping by the number of bytes actually transferred */
fd->fp_sys_posn = offset + err;
if (file_ptr_type == ADIO_INDIVIDUAL) {
fd->fp_ind += err;
}
#ifdef HAVE_STATUS_SET_BYTES
if (status) MPIR_Status_set_bytes(status, datatype, err);
#endif
*error_code = MPI_SUCCESS;
/* fall through to ioerr: harmless here since err != -1 */
ioerr:
/* --BEGIN ERROR HANDLING-- */
if (err == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
fd->fp_sys_posn = -1;
return;
}
/* --END ERROR HANDLING-- */
}
/* Contiguous-write entry point: delegate to the shared contiguous I/O
 * routine with io_mode = 1 (write). */
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, const void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    const int write_mode = 1;

    ADIOI_LUSTRE_IOContig(fd, buf, count, datatype, file_ptr_type,
                          offset, status, write_mode, error_code);
}
/* Contiguous-read entry point: delegate to the shared contiguous I/O
 * routine with io_mode = 0 (read). */
void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    const int read_mode = 0;

    ADIOI_LUSTRE_IOContig(fd, buf, count, datatype, file_ptr_type,
                          offset, status, read_mode, error_code);
}

Просмотреть файл

@ -0,0 +1,989 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
/* prototypes of functions used for collective writes only. */
/* two-phase exchange-and-write driver: each aggregator collects the data
 * for its file domain from all ranks and writes it out */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
MPI_Datatype datatype, int nprocs,
int myrank,
ADIOI_Access *others_req,
ADIOI_Access *my_req,
ADIO_Offset *offset_list,
ADIO_Offset *len_list,
int contig_access_count,
int *striping_info,
int **buf_idx, int *error_code);
/* pack the pieces of a non-contiguous user buffer destined for each
 * aggregator into per-process send buffers */
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
ADIOI_Flatlist_node *flat_buf,
char **send_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
MPI_Request *requests,
int *sent_to_proc, int nprocs,
int myrank, int contig_access_count,
int *striping_info,
int *send_buf_idx,
int *curr_to_proc,
int *done_to_proc, int iter,
MPI_Aint buftype_extent);
/* one iteration of data exchange: post sends/receives and merge received
 * data into this aggregator's write buffer */
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
char *write_buf,
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off,
int size, int *count,
int *start_pos,
int *sent_to_proc, int nprocs,
int myrank, int buftype_is_contig,
int contig_access_count,
int *striping_info,
ADIOI_Access *others_req,
int *send_buf_idx,
int *curr_to_proc,
int *done_to_proc, int *hole,
int iter, MPI_Aint buftype_extent,
int *buf_idx,
ADIO_Offset **srt_off, int **srt_len, int *srt_num,
int *error_code);
/* shared ROMIO helper (defined elsewhere): merge the per-process request
 * lists into a single sorted offset/length list */
void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
ADIO_Offset *srt_off, int *srt_len, int *start_pos,
int nprocs, int nprocs_recv, int total_elements);
/* Collective (two-phase) write entry point for Lustre.
 *
 * Decides between independent and collective I/O based on the cb_write hint
 * and the access pattern, computes which portions of each process's request
 * land on which aggregator (stripe-aligned), runs the exchange/write loop,
 * and finally synchronizes error codes across the communicator.
 *
 * Interface is the standard ADIO WriteStridedColl contract: offset is in
 * etype units relative to the filetype when file_ptr_type is
 * ADIO_EXPLICIT_OFFSET; *error_code receives MPI_SUCCESS or an MPIO error.
 */
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
                                   MPI_Datatype datatype,
                                   int file_ptr_type, ADIO_Offset offset,
                                   ADIO_Status *status, int *error_code)
{
    /* Uses a generalized version of the extended two-phase method described
     * in "An Extended Two-Phase Method for Accessing Sections of
     * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
     * Scientific Programming, (5)4:301--317, Winter 1996.
     * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
     */
    ADIOI_Access *my_req;
    /* array of nprocs access structures, one for each other process has
       this process's request */
    ADIOI_Access *others_req;
    /* array of nprocs access structures, one for each other process
       whose request is written by this process. */
    int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
    int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset orig_fp, start_offset, end_offset, off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
    ADIO_Offset *len_list = NULL;
    int **buf_idx = NULL, *striping_info = NULL;
    int old_error, tmp_error;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* Remember the individual file pointer so it can be restored if we fall
     * back to independent I/O below. */
    orig_fp = fd->fp_ind;

    /* IO patten identification if cb_write isn't disabled */
    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
        /* For this process's request, calculate the list of offsets and
           lengths in the file and determine the start and end offsets. */
        /* Note: end_offset points to the last byte-offset that will be accessed.
         * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
         */
        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &offset_list, &len_list, &start_offset,
                              &end_offset, &contig_access_count);

        /* each process communicates its start and end offsets to other
         * processes. The result is an array each of start and end offsets
         * stored in order of process rank.
         */
        st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
        end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        /* are the accesses of different processes interleaved? */
        for (i = 1; i < nprocs; i++)
            if ((st_offsets[i] < end_offsets[i-1]) &&
                (st_offsets[i] <= end_offsets[i]))
                interleave_count++;
        /* This is a rudimentary check for interleaving, but should suffice
           for the moment. */

        /* Two typical access patterns can benefit from collective write.
         *   1) the processes are interleaved, and
         *   2) the req size is small.
         */
        if (interleave_count > 0) {
            do_collect = 1;
        } else {
            do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
                                                len_list, nprocs);
        }
    }
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    /* Decide if collective I/O should be done */
    if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
        fd->hints->cb_write == ADIOI_HINT_DISABLE) {

        /* use independent accesses */
        if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
            /* the offset/length lists were only built in the pattern
             * identification phase above */
            ADIOI_Free(offset_list);
            ADIOI_Free(len_list);
            ADIOI_Free(st_offsets);
            ADIOI_Free(end_offsets);
        }

        fd->fp_ind = orig_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
                ADIO_WriteContig(fd, buf, count, datatype,
                                 ADIO_EXPLICIT_OFFSET,
                                 off, status, error_code);
            } else
                ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                 0, status, error_code);
        } else {
            ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                              offset, status, error_code);
        }
        return;
    }

    /* Get Lustre hints information */
    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);

    /* calculate what portions of the access requests of this process are
     * located in which process
     */
    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
                             striping_info, nprocs, &count_my_req_procs,
                             &count_my_req_per_proc, &my_req,
                             &buf_idx);

    /* based on everyone's my_req, calculate what requests of other processes
     * will be accessed by this process.
     * count_others_req_procs = number of processes whose requests (including
     * this process itself) will be accessed by this process
     * count_others_req_per_proc[i] indicates how many separate contiguous
     * requests of proc. i will be accessed by this process.
     */
    ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
                          my_req, nprocs, myrank, &count_others_req_procs,
                          &others_req);
    ADIOI_Free(count_my_req_per_proc);

    /* exchange data and write in sizes of no more than stripe_size. */
    ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
                                others_req, my_req, offset_list, len_list,
                                contig_access_count, striping_info,
                                buf_idx, error_code);

    /* If this collective write is followed by an independent write,
     * it's possible to have those subsequent writes on other processes
     * race ahead and sneak in before the read-modify-write completes.
     * We carry out a collective communication at the end here so no one
     * can start independent i/o before collective I/O completes.
     *
     * need to do some gymnastics with the error codes so that if something
     * went wrong, all processes report error, but if a process has a more
     * specific error code, we can still have that process report the
     * additional information */
    old_error = *error_code;
    if (*error_code != MPI_SUCCESS)
        *error_code = MPI_ERR_IO;

    /* optimization: if only one process performing i/o, we can perform
     * a less-expensive Bcast */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
#endif
    if (fd->hints->cb_nodes == 1)
        MPI_Bcast(error_code, 1, MPI_INT,
                  fd->hints->ranklist[0], fd->comm);
    else {
        tmp_error = *error_code;
        MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
                      MPI_MAX, fd->comm);
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
#endif

    /* restore a more specific error code collapsed to MPI_ERR_IO above */
    if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
        *error_code = old_error;

    if (!buftype_is_contig)
        ADIOI_Delete_flattened(datatype);

    /* free all memory allocated for collective I/O */
    /* free others_req */
    for (i = 0; i < nprocs; i++) {
        if (others_req[i].count) {
            ADIOI_Free(others_req[i].offsets);
            ADIOI_Free(others_req[i].lens);
            ADIOI_Free(others_req[i].mem_ptrs);
        }
    }
    ADIOI_Free(others_req);
    /* free my_req here */
    for (i = 0; i < nprocs; i++) {
        if (my_req[i].count) {
            ADIOI_Free(my_req[i].offsets);
            ADIOI_Free(my_req[i].lens);
        }
    }
    ADIOI_Free(my_req);
    for (i = 0; i < nprocs; i++) {
        ADIOI_Free(buf_idx[i]);
    }
    ADIOI_Free(buf_idx);
    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(striping_info);

#ifdef HAVE_STATUS_SET_BYTES
    if (status) {
        MPI_Count bufsize, size;
        /* Don't set status if it isn't needed */
        MPI_Type_size_x(datatype, &size);
        bufsize = size * count;
        MPIR_Status_set_bytes(status, datatype, bufsize);
    }
    /* This is a temporary way of filling in status. The right way is to
     * keep track of how much data was actually written during collective I/O.
     */
#endif

    fd->fp_sys_posn = -1;       /* set it to null. */
}
/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error
* code is created and returned in error_code.
*/
/* Drive the exchange/write phases of the Lustre collective write.
 *
 * Each of max_ntimes iterations covers at most step_size =
 * avail_cb_nodes * stripe_size bytes of the aggregate file region; every
 * process takes part in every iteration so the communication phases stay in
 * lockstep.  On success *error_code is MPI_SUCCESS; only I/O errors are
 * currently reported.
 */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
                                        MPI_Datatype datatype, int nprocs,
                                        int myrank, ADIOI_Access *others_req,
                                        ADIOI_Access *my_req,
                                        ADIO_Offset *offset_list,
                                        ADIO_Offset *len_list,
                                        int contig_access_count,
                                        int *striping_info, int **buf_idx,
                                        int *error_code)
{
    /* Send data to appropriate processes and write in sizes of no more
     * than lustre stripe_size.
     * The idea is to reduce the amount of extra memory required for
     * collective I/O. If all data were written all at once, which is much
     * easier, it would require temp space more than the size of user_buf,
     * which is often unacceptable. For example, to write a distributed
     * array to a file, where each local array is 8Mbytes, requiring
     * at least another 8Mbytes of temp space is unacceptable.
     */

    int hole, i, j, m, flag, ntimes = 1, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
    ADIO_Offset max_size, step_size = 0;
    int real_size, req_len, send_len;
    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
    int *send_curr_offlen_ptr, *send_size;
    int *sent_to_proc, *recv_start_pos;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    int *this_buf_idx;
    char *write_buf = NULL;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
    int data_sieving = 0;
    ADIO_Offset *srt_off = NULL;
    int *srt_len = NULL;
    int srt_num = 0;
    ADIO_Offset block_offset;
    int block_len;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of writes of stripe size to be done.
     * That gives the no. of communication phases as well.
     * Note:
     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
     *   each process has the same no. of communication phases.
     */

    /* seed st_loc/end_loc from the first process that sends us anything... */
    for (i = 0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }
    /* ...then widen to cover every region this aggregator must write */
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
                                          others_req[i].lens[j] - 1));
        }
    }

    /* this process does no writing. */
    if ((st_loc == -1) && (end_loc == -1))
        ntimes = 0;
    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
    /* avoid min_st_loc be -1 */
    if (st_loc == -1)
        st_loc = max_end_loc;
    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
    /* align downward */
    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;

    /* Each time, only avail_cb_nodes number of IO clients perform IO,
     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
     * and ntimes=whole_file_portion/step_size
     */
    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
    max_ntimes = (max_end_loc - min_st_loc + 1) / step_size
        + (((max_end_loc - min_st_loc + 1) % step_size) ? 1 : 0);
    /* max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); */
    if (ntimes)
        write_buf = (char *) ADIOI_Malloc(stripe_size);

    /* calculate the start offset for each iteration; initialized to
     * max_end_loc so ADIOI_MIN below picks the first real request offset */
    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
    for (m = 0; m < max_ntimes; m ++)
        off_list[m] = max_end_loc;
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j ++) {
            req_off = others_req[i].offsets[j];
            m = (int)((req_off - min_st_loc) / step_size);
            off_list[m] = ADIOI_MIN(off_list[m], req_off);
        }
    }

    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* their use is explained below. calloc initializes to 0. */

    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer */

    this_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));

    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* used to store the starting value of recv_curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        /* locate the previously flattened representation of the buftype */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype)
            flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);
    /* I need to check if there are any outstanding nonblocking writes to
     * the file, which could potentially interfere with the writes taking
     * place in this collective write call. Since this is not likely to be
     * common, let me do the simplest thing possible here: Each process
     * completes all pending nonblocking operations before completing.
     */
    /*ADIOI_Complete_async(error_code);
      if (*error_code != MPI_SUCCESS) return;
      MPI_Barrier(fd->comm);
    */

    iter_st_off = min_st_loc;

    /* Although we have recognized the data according to OST index,
     * a read-modify-write will be done if there is a hole between the data.
     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
     * then rank0 will collect data [0, 30] and [60, 90] then write. There
     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
     *
     * To reduce its impact on the performance, we can disable data sieving
     * by hint "ds_in_coll".
     */
    /* check the hint for data sieving */
    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;

    for (m = 0; m < max_ntimes; m++) {
        /* go through all others_req and my_req to check which will be received
         * and sent in this iteration.
         */

        /* Note that MPI guarantees that displacements in filetypes are in
           monotonically nondecreasing order and that, for writes, the
           filetypes cannot specify overlapping regions in the file. This
           simplifies implementation a bit compared to reads. */

        /*
           off         = start offset in the file for the data to be written in
                         this iteration
           iter_st_off = start offset of this iteration
           real_size   = size of data written (bytes) corresponding to off
           max_size    = possible maximum size of data written in this iteration
           req_off     = offset in the file for a particular contiguous request minus
                         what was satisfied in previous iteration
           send_off    = offset the request needed by other processes in this iteration
           req_len     = size corresponding to req_off
           send_len    = size corresponding to send_off
         */

        /* first calculate what should be communicated */
        for (i = 0; i < nprocs; i++)
            recv_count[i] = recv_size[i] = send_size[i] = 0;

        off = off_list[m];
        max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
        /* clamp to the end of the current stripe and to end_loc */
        real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size -
                                    off,
                                    end_loc - off + 1);

        for (i = 0; i < nprocs; i++) {
            if (my_req[i].count) {
                this_buf_idx[i] = buf_idx[i][send_curr_offlen_ptr[i]];
                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
                    send_off = my_req[i].offsets[j];
                    send_len = my_req[i].lens[j];
                    if (send_off < iter_st_off + max_size) {
                        send_size[i] += send_len;
                    } else {
                        break;
                    }
                }
                send_curr_offlen_ptr[i] = j;
            }
            if (others_req[i].count) {
                recv_start_pos[i] = recv_curr_offlen_ptr[i];
                for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
                    if (req_off < iter_st_off + max_size) {
                        recv_count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
                        MPI_Address(write_buf + req_off - off,
                                    &(others_req[i].mem_ptrs[j]));
                        recv_size[i] += req_len;
                    } else {
                        break;
                    }
                }
                recv_curr_offlen_ptr[i] = j;
            }
        }
        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
        hole = data_sieving;
        ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                                     len_list, send_size, recv_size, off, real_size,
                                     recv_count, recv_start_pos,
                                     sent_to_proc, nprocs, myrank,
                                     buftype_is_contig, contig_access_count,
                                     striping_info, others_req, send_buf_idx,
                                     curr_to_proc, done_to_proc, &hole, m,
                                     buftype_extent, this_buf_idx,
                                     &srt_off, &srt_len, &srt_num, error_code);

        if (*error_code != MPI_SUCCESS)
            goto over;

        flag = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_count[i]) {
                flag = 1;
                break;
            }
        if (flag) {
            /* check whether to do data sieving */
            if(data_sieving == ADIOI_HINT_ENABLE) {
                ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                 ADIO_EXPLICIT_OFFSET, off, &status,
                                 error_code);
            } else {
                /* if there is no hole, write data in one time;
                 * otherwise, write data in several times */
                if (!hole) {
                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, off, &status,
                                     error_code);
                } else {
                    /* write each contiguous run of the sorted offset list
                     * separately, coalescing adjacent entries */
                    block_offset = -1;
                    block_len = 0;
                    for (i = 0; i < srt_num; ++i) {
                        if (srt_off[i] < off + real_size &&
                            srt_off[i] >= off) {
                            if (block_offset == -1) {
                                block_offset = srt_off[i];
                                block_len = srt_len[i];
                            } else {
                                if (srt_off[i] == block_offset + block_len) {
                                    block_len += srt_len[i];
                                } else {
                                    ADIO_WriteContig(fd,
                                                     write_buf + block_offset - off,
                                                     block_len,
                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                                     block_offset, &status,
                                                     error_code);
                                    if (*error_code != MPI_SUCCESS)
                                        goto over;
                                    block_offset = srt_off[i];
                                    block_len = srt_len[i];
                                }
                            }
                        }
                    }
                    if (block_offset != -1) {
                        ADIO_WriteContig(fd,
                                         write_buf + block_offset - off,
                                         block_len,
                                         MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                         block_offset, &status,
                                         error_code);
                        if (*error_code != MPI_SUCCESS)
                            goto over;
                    }
                }
            }
            if (*error_code != MPI_SUCCESS)
                goto over;
        }
        iter_st_off += max_size;
    }
over:
    /* goto-based cleanup: free everything allocated above on all paths */
    if (srt_off)
        ADIOI_Free(srt_off);
    if (srt_len)
        ADIOI_Free(srt_len);
    if (ntimes)
        ADIOI_Free(write_buf);
    ADIOI_Free(recv_curr_offlen_ptr);
    ADIOI_Free(send_curr_offlen_ptr);
    ADIOI_Free(recv_count);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(recv_start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
    ADIOI_Free(this_buf_idx);
    ADIOI_Free(off_list);
}
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
* in the case of error.
*/
/* One communication phase of the collective write.
 *
 * Builds hindexed receive datatypes addressed into write_buf (absolute
 * displacements, received with MPI_BOTTOM), merges the incoming offset lists
 * to detect holes, optionally pre-reads write_buf for read-modify-write when
 * data sieving is enabled, then posts sends (directly from the user buffer
 * when it is contiguous, otherwise via Fill_send_buffer) and waits for
 * completion.  On entry *hole carries the ds_in_coll hint value; on exit it
 * reports whether the merged data has gaps.  srt_off/srt_len/srt_num are
 * owned by the caller and reused/reallocated across iterations.
 * Sets *error_code to MPI_SUCCESS on success, or an MPIO error code.
 */
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
                                         char *write_buf,
                                         ADIOI_Flatlist_node *flat_buf,
                                         ADIO_Offset *offset_list,
                                         ADIO_Offset *len_list, int *send_size,
                                         int *recv_size, ADIO_Offset off,
                                         int size, int *count,
                                         int *start_pos,
                                         int *sent_to_proc, int nprocs,
                                         int myrank, int buftype_is_contig,
                                         int contig_access_count,
                                         int *striping_info,
                                         ADIOI_Access *others_req,
                                         int *send_buf_idx,
                                         int *curr_to_proc, int *done_to_proc,
                                         int *hole, int iter,
                                         MPI_Aint buftype_extent,
                                         int *buf_idx,
                                         ADIO_Offset **srt_off, int **srt_len, int *srt_num,
                                         int *error_code)
{
    int i, j, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int sum_recv;
    int data_sieving = *hole;   /* incoming value is the ds_in_coll hint */
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

    /* create derived datatypes for recv */
    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++)
        if (recv_size[i])
            nprocs_recv++;

    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
                                               sizeof(MPI_Datatype));
    /* +1 to avoid a 0-size malloc */

    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (recv_size[i]) {
            ADIOI_Type_create_hindexed_x(count[i],
                                         &(others_req[i].lens[start_pos[i]]),
                                         &(others_req[i].mem_ptrs[start_pos[i]]),
                                         MPI_BYTE, recv_types + j);
            /* absolute displacements; use MPI_BOTTOM in recv */
            MPI_Type_commit(recv_types + j);
            j++;
        }
    }

    /* To avoid a read-modify-write,
     * check if there are holes in the data to be written.
     * For this, merge the (sorted) offset lists others_req using a heap-merge.
     */

    *srt_num = 0;
    for (i = 0; i < nprocs; i++)
        *srt_num += count[i];
    /* grow (or first-allocate) the caller-owned merged lists */
    if (*srt_off)
        *srt_off = (ADIO_Offset *) ADIOI_Realloc(*srt_off, (*srt_num + 1) * sizeof(ADIO_Offset));
    else
        *srt_off = (ADIO_Offset *) ADIOI_Malloc((*srt_num + 1) * sizeof(ADIO_Offset));
    if (*srt_len)
        *srt_len = (int *) ADIOI_Realloc(*srt_len, (*srt_num + 1) * sizeof(int));
    else
        *srt_len = (int *) ADIOI_Malloc((*srt_num + 1) * sizeof(int));
    /* +1 to avoid a 0-size malloc */

    ADIOI_Heap_merge(others_req, count, *srt_off, *srt_len, start_pos,
                     nprocs, nprocs_recv, *srt_num);

    /* check if there are any holes */
    *hole = 0;
    for (i = 0; i < *srt_num - 1; i++) {
        if ((*srt_off)[i] + (*srt_len)[i] < (*srt_off)[i + 1]) {
            *hole = 1;
            break;
        }
    }
    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
     * between aggregation, nominally contiguous regions, and cb_buffer_size
     * should be handled with a read-modify-write (otherwise we will write out
     * more data than we receive from everyone else (inclusive), so override
     * hole detection
     */
    if (*hole == 0) {
        sum_recv = 0;
        for (i = 0; i < nprocs; i++)
            sum_recv += recv_size[i];
        if (size > sum_recv)
            *hole = 1;
    }
    /* check the hint for data sieving */
    if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
        /* pre-read the region so unwritten gaps keep their old contents */
        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
        /* --BEGIN ERROR HANDLING-- */
        if (err != MPI_SUCCESS) {
            *error_code = MPIO_Err_create_code(err,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "**ioRMWrdwr", 0);
            ADIOI_Free(recv_types);
            return;
        }
        /* --END ERROR HANDLING-- */
    }

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++)
        if (send_size[i])
            nprocs_send++;

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        /* atomic mode: receives are done with blocking MPI_Recv below, so
         * only send requests are needed */
        requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
                                                sizeof(MPI_Request));
        send_req = requests;
    } else {
        requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
                                                sizeof(MPI_Request));
        /* +1 to avoid a 0-size malloc */

        /* post receives */
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (recv_size[i]) {
                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
                          myrank + i + 100 * iter, fd->comm, requests + j);
                j++;
            }
        }
        send_req = requests + nprocs_recv;
    }

    /* post sends.
     * if buftype_is_contig, data can be directly sent from
     * user buf at location given by buf_idx. else use send_buf.
     */
    if (buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (send_size[i]) {
                ADIOI_Assert(buf_idx[i] != -1);
                MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
                          MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
                          send_req + j);
                j++;
            }
    } else
        if (nprocs_send) {
            /* buftype is not contig */
            send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
            for (i = 0; i < nprocs; i++)
                if (send_size[i])
                    send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
            ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
                                          len_list, send_size, send_req,
                                          sent_to_proc, nprocs, myrank,
                                          contig_access_count, striping_info,
                                          send_buf_idx, curr_to_proc, done_to_proc,
                                          iter, buftype_extent);
            /* the send is done in ADIOI_Fill_send_buffer */
        }

    /* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity) {
        j = 0;
        for (i = 0; i < nprocs; i++) {
            MPI_Status wkl_status;
            if (recv_size[i]) {
                MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
                         myrank + i + 100 * iter, fd->comm, &wkl_status);
                j++;
            }
        }
    }

    for (i = 0; i < nprocs_recv; i++)
        MPI_Type_free(recv_types + i);
    ADIOI_Free(recv_types);

    /* bug fix from Wei-keng Liao and Kenin Coloma */
    /* +1 to avoid a 0-size malloc */
    if (fd->atomicity) {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
                                               sizeof(MPI_Status));
    } else {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
                                               sizeof(MPI_Status));
    }

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        while (!i)
            MPI_Testall(nprocs_send, send_req, &i, statuses);
    } else {
        while (!i)
            MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
    }
#else
    /* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity)
        MPI_Waitall(nprocs_send, send_req, statuses);
    else
        MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
#endif
    ADIOI_Free(statuses);
    ADIOI_Free(requests);
    if (!buftype_is_contig && nprocs_send) {
        for (i = 0; i < nprocs; i++)
            if (send_size[i])
                ADIOI_Free(send_buf[i]);
        ADIOI_Free(send_buf);
    }
}
/* Advance the walk through the flattened user buffer by buf_incr bytes
 * WITHOUT copying any data.  Expects the following variables in the caller's
 * scope (see ADIOI_LUSTRE_Fill_send_buffer): buf_incr, size_in_buf,
 * flat_buf_sz, flat_buf_idx, user_buf_idx, n_buftypes, flat_buf,
 * buftype_extent.  When a flattened segment is exhausted it moves to the
 * next one, wrapping into the next buftype repetition. */
#define ADIOI_BUF_INCR \
{ \
    while (buf_incr) { \
        size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
        user_buf_idx += size_in_buf; \
        flat_buf_sz -= size_in_buf; \
        if (!flat_buf_sz) { \
            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
            else { \
                flat_buf_idx = 0; \
                n_buftypes++; \
            } \
            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
                (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
        } \
        buf_incr -= size_in_buf; \
    } \
}
/* Copy `size` bytes from the (noncontiguous) user buffer into
 * send_buf[p], advancing the flattened-buffer walk as it goes; then
 * consume whatever remains of buf_incr via ADIOI_BUF_INCR.  Uses the same
 * caller-scope variables as ADIOI_BUF_INCR plus size, p, send_buf and
 * send_buf_idx (see ADIOI_LUSTRE_Fill_send_buffer). */
#define ADIOI_BUF_COPY \
{ \
    while (size) { \
        size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + user_buf_idx) == (ADIO_Offset)(MPIR_Upint)((MPIR_Upint)buf + user_buf_idx)); \
        ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \
        memcpy(&(send_buf[p][send_buf_idx[p]]), \
               ((char *) buf) + user_buf_idx, size_in_buf); \
        send_buf_idx[p] += size_in_buf; \
        user_buf_idx += size_in_buf; \
        flat_buf_sz -= size_in_buf; \
        if (!flat_buf_sz) { \
            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
            else { \
                flat_buf_idx = 0; \
                n_buftypes++; \
            } \
            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
                (ADIO_Offset)n_buftypes*(ADIO_Offset)buftype_extent; \
            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
        } \
        size -= size_in_buf; \
        buf_incr -= size_in_buf; \
    } \
    ADIOI_BUF_INCR \
}
/* Pack data from a NONcontiguous user buffer into per-aggregator send
 * buffers and post each MPI_Isend as soon as its buffer is full.
 *
 * Walks the file-offset list (offset_list/len_list) in tandem with the
 * flattened buffer type (via the ADIOI_BUF_INCR / ADIOI_BUF_COPY macros,
 * which operate on the local variables declared below).  Only data that
 * falls in the current iteration's window (bounded by send_size[p]) is
 * copied; the done_to_proc/curr_to_proc bookkeeping skips data already sent
 * in earlier iterations.  Requests are stored into `requests`, which the
 * caller waits on.
 */
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
                                          ADIOI_Flatlist_node *flat_buf,
                                          char **send_buf,
                                          ADIO_Offset *offset_list,
                                          ADIO_Offset *len_list, int *send_size,
                                          MPI_Request *requests,
                                          int *sent_to_proc, int nprocs,
                                          int myrank,
                                          int contig_access_count,
                                          int *striping_info,
                                          int *send_buf_idx,
                                          int *curr_to_proc,
                                          int *done_to_proc, int iter,
                                          MPI_Aint buftype_extent)
{
    /* this function is only called if buftype is not contig */
    int i, p, flat_buf_idx, size;
    int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
    ADIO_Offset off, len, rem_len, user_buf_idx;

    /* curr_to_proc[p] = amount of data sent to proc. p that has already
     *                   been accounted for so far
     * done_to_proc[p] = amount of data already sent to proc. p in
     *                   previous iterations
     * user_buf_idx    = current location in user buffer
     * send_buf_idx[p] = current location in send_buf of proc. p
     */

    for (i = 0; i < nprocs; i++) {
        send_buf_idx[i] = curr_to_proc[i] = 0;
        done_to_proc[i] = sent_to_proc[i];
    }
    jj = 0;   /* index of the next request slot to use */

    user_buf_idx = flat_buf->indices[0];
    flat_buf_idx = 0;
    n_buftypes = 0;
    flat_buf_sz = flat_buf->blocklens[0];

    /* flat_buf_idx = current index into flattened buftype
     * flat_buf_sz  = size of current contiguous component in flattened buf
     */
    for (i = 0; i < contig_access_count; i++) {
        off = offset_list[i];
        rem_len = (ADIO_Offset) len_list[i];

        /*this request may span to more than one process */
        while (rem_len != 0) {
            len = rem_len;
            /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
             * longer than the single region that processor "p" is responsible
             * for.
             */
            p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);

            if (send_buf_idx[p] < send_size[p]) {
                if (curr_to_proc[p] + len > done_to_proc[p]) {
                    if (done_to_proc[p] > curr_to_proc[p]) {
                        /* part of this piece was sent in a previous
                         * iteration: skip that part, copy the rest */
                        size = (int) ADIOI_MIN(curr_to_proc[p] + len -
                                               done_to_proc[p],
                                               send_size[p] -
                                               send_buf_idx[p]);
                        buf_incr = done_to_proc[p] - curr_to_proc[p];
                        ADIOI_BUF_INCR
                        ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
                        buf_incr = (int) (curr_to_proc[p] + len -
                                          done_to_proc[p]);
                        ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
                        curr_to_proc[p] = done_to_proc[p] + size;
                        ADIOI_BUF_COPY
                    } else {
                        size = (int) ADIOI_MIN(len, send_size[p] -
                                               send_buf_idx[p]);
                        buf_incr = (int) len;
                        ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
                        curr_to_proc[p] += size;
                        ADIOI_BUF_COPY
                    }
                    /* send as soon as this destination's buffer is full */
                    if (send_buf_idx[p] == send_size[p]) {
                        MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
                                  myrank + p + 100 * iter, fd->comm,
                                  requests + jj);
                        jj++;
                    }
                } else {
                    /* entirely sent in a previous iteration: just advance */
                    ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
                    curr_to_proc[p] += (int) len;
                    buf_incr = (int) len;
                    ADIOI_BUF_INCR
                }
            } else {
                /* nothing (more) goes to p this iteration: just advance */
                buf_incr = (int) len;
                ADIOI_BUF_INCR
            }
            off += len;
            rem_len -= len;
        }
    }
    for (i = 0; i < nprocs; i++)
        if (send_size[i])
            sent_to_proc[i] = curr_to_proc[i];
}

Просмотреть файл

@ -0,0 +1,533 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
*
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#include "ad_lustre.h"
#include "adio_extern.h"
#define ADIOI_BUFFERED_WRITE \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
if (writebuf_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, \
&status1, error_code); \
if (!(fd->atomicity)) \
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
} \
writebuf_off = req_off; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
if (!(fd->atomicity)) \
ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, \
writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowsrc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
} \
write_sz = (unsigned) (ADIOI_MIN(req_len, \
writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == \
ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf + req_off - writebuf_off, (char *)buf +userbuf_off, write_sz); \
while (write_sz != req_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
if (!(fd->atomicity)) \
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
if (!(fd->atomicity)) \
ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, \
writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowsrc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
} \
}
/* this macro is used when filetype is contig and buftype is not contig.
it does not do a read-modify-write and does not lock*/
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, \
error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, \
myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
writebuf_off = req_off; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
} \
write_sz = (unsigned) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf + req_off - writebuf_off, \
(char *)buf + userbuf_off, write_sz); \
while (write_sz != req_len) { \
ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
if (*error_code != MPI_SUCCESS) { \
*error_code = MPIO_Err_create_code(*error_code, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**iowswc", 0); \
ADIOI_Free(writebuf); \
return; \
} \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
/* stripe_size alignment */ \
writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \
(writebuf_off / stripe_size + 1) * \
stripe_size - writebuf_off); \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
} \
}
/* Strided write for Lustre: data sieving in stripe-aligned, stripe-sized
 * chunks so each staged write stays within a single Lustre stripe.
 *
 * fd            - open ADIO file handle; fd->hints->striping_unit supplies
 *                 the stripe size used for buffer alignment
 * buf/count/datatype - description of the user buffer
 * file_ptr_type - ADIO_INDIVIDUAL (use fd->fp_ind) or ADIO_EXPLICIT_OFFSET
 * offset        - in units of etype relative to the filetype (ignored for
 *                 ADIO_INDIVIDUAL)
 * status        - bytes-written recorded when HAVE_STATUS_SET_BYTES
 * error_code    - set to MPI_SUCCESS or an MPI error code
 *
 * Fix vs. original: the final-flush error path in the noncontiguous-in-file
 * branch returned without freeing writebuf (memory leak); it now frees the
 * buffer like every other error path in this file.
 */
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status * status,
                               int *error_code)
{
    /* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, sum, size_in_filetype;
    int i, j, k, st_index = 0;
    int n_etypes_in_filetype;
    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
    ADIO_Offset abs_off_in_filetype = 0;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
    char *writebuf;
    unsigned bufsize, writebuf_len, write_sz;
    ADIO_Status status1;
    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size = 0, bwr_size, req_len;
    int stripe_size;
    static char myname[] = "ADIOI_LUSTRE_WriteStrided";

    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
        /* if user has disabled data sieving on writes, use naive
         * approach instead.
         */
        ADIOI_GEN_WriteStrided_naive(fd,
                                     buf,
                                     count,
                                     datatype,
                                     file_ptr_type,
                                     offset, status, error_code);
        return;
    }

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
    MPI_Type_size_x(fd->filetype, &filetype_size);
    if (!filetype_size) {
        /* zero-size filetype: nothing to write */
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get striping info */
    stripe_size = fd->hints->striping_unit;

    /* Different buftype to different filetype */
    if (!buftype_is_contig && filetype_is_contig) {
        /* noncontiguous in memory, contiguous in file. */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype)
            flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
              fd->disp + (ADIO_Offset)etype_size * offset;

        start_off = off;
        end_offset = start_off + bufsize - 1;
        /* write stripe size buffer each time */
        writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
        writebuf_off = 0;
        writebuf_len = 0;

        /* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);

        for (j = 0; j < count; j++) {
            for (i = 0; i < flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j * (ADIO_Offset)buftype_extent +
                              flat_buf->indices[i];
                req_off = off;
                req_len = flat_buf->blocklens[i];
                ADIOI_BUFFERED_WRITE_WITHOUT_READ
                off += flat_buf->blocklens[i];
            }
        }

        /* write the buffer out finally */
        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
                         error_code);

        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
        if (*error_code != MPI_SUCCESS) {
            ADIOI_Free(writebuf);
            return;
        }
        ADIOI_Free(writebuf);
        if (file_ptr_type == ADIO_INDIVIDUAL)
            fd->fp_ind = off;
    } else {
        /* noncontiguous in file */
        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype)
            flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset = fd->fp_ind - disp;
            n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i = 0; i < flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* fwr_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset = flat_file->indices[i];
                    fwr_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    fwr_size = dist;
                    break;
                }
            }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes * filetype_extent;
        }
        else {
            n_etypes_in_filetype = filetype_size / etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i = 0; i < flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes * filetype_extent +
                     abs_off_in_filetype;
        }

        start_off = offset;

        /* Wei-keng Liao:write request is within single flat_file
         * contig block*/
        /* this could happen, for example, with subarray types that are
         * actually fairly contiguous */
        if (buftype_is_contig && bufsize <= fwr_size) {
            req_off = start_off;
            req_len = bufsize;
            end_offset = start_off + bufsize - 1;
            writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
            memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
            writebuf_off = 0;
            writebuf_len = 0;
            userbuf_off = 0;
            ADIOI_BUFFERED_WRITE_WITHOUT_READ
            /* write the buffer out finally */
            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
                             error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte
                 * that can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == fwr_size) {
                    /* consumed the whole block: advance to the next
                     * nonempty flat_file block (may wrap to next filetype) */
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                               + (ADIO_Offset)n_filetypes * filetype_extent;
                }
            }
            fd->fp_sys_posn = -1;  /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            ADIOI_Free(writebuf);
            return;
        }

        /* Calculate end_offset, the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
        st_fwr_size = fwr_size;
        st_n_filetypes = n_filetypes;
        i_offset = 0;
        j = st_index;
        off = offset;
        fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += fwr_size;
            end_offset = off + fwr_size - 1;

            j = (j + 1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j] == 0) {
                j = (j + 1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }

            off = disp + flat_file->indices[j] +
                  n_filetypes * (ADIO_Offset)filetype_extent;
            fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i_offset);
        }

        /* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);

        writebuf_off = 0;
        writebuf_len = 0;
        writebuf = (char *) ADIOI_Malloc(stripe_size);
        memset(writebuf, -1, stripe_size);

        if (buftype_is_contig && !filetype_is_contig) {
            /* contiguous in memory, noncontiguous in file. should be the most
               common case. */
            i_offset = 0;
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
            while (i_offset < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       fwr_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/

                    req_off = off;
                    req_len = fwr_size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_WRITE
                }
                i_offset += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] +
                    n_filetypes * (ADIO_Offset)filetype_extent)
                    off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
                    j = (j + 1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j] == 0) {
                        j = (j + 1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
                    off = disp + flat_file->indices[j] +
                          n_filetypes * (ADIO_Offset)filetype_extent;
                    fwr_size = ADIOI_MIN(flat_file->blocklens[j],
                                         bufsize - i_offset);
                }
            }
        }
        else {
            /* noncontiguous in memory as well as in file */
            ADIOI_Flatten_datatype(datatype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != datatype) flat_buf = flat_buf->next;

            k = num = buf_count = 0;
            i_offset = flat_buf->indices[0];
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = st_fwr_size;
            bwr_size = flat_buf->blocklens[0];

            while (num < bufsize) {
                size = ADIOI_MIN(fwr_size, bwr_size);
                if (size) {
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */

                    req_off = off;
                    req_len = size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_WRITE
                }

                new_fwr_size = fwr_size;
                new_bwr_size = bwr_size;

                if (size == fwr_size) {
                    /* reached end of contiguous block in file */
                    j = (j + 1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j] == 0) {
                        j = (j + 1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }

                    off = disp + flat_file->indices[j] +
                          n_filetypes * (ADIO_Offset)filetype_extent;

                    new_fwr_size = flat_file->blocklens[j];
                    if (size != bwr_size) {
                        i_offset += size;
                        new_bwr_size -= size;
                    }
                }

                if (size == bwr_size) {
                    /* reached end of contiguous block in memory */
                    k = (k + 1) % flat_buf->count;
                    buf_count++;
                    i_offset = (ADIO_Offset)buftype_extent *
                               (ADIO_Offset)(buf_count / flat_buf->count) +
                               flat_buf->indices[k];
                    new_bwr_size = flat_buf->blocklens[k];
                    if (size != fwr_size) {
                        off += size;
                        new_fwr_size -= size;
                    }
                }
                num += size;
                fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
            }
        }

        /* write the buffer out finally */
        if (writebuf_len) {
            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET,
                             writebuf_off, &status1, error_code);
            if (!(fd->atomicity))
                ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
            if (*error_code != MPI_SUCCESS) {
                /* fix: free the staging buffer before returning (the
                 * original leaked writebuf on this path) */
                ADIOI_Free(writebuf);
                return;
            }
        }
        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);

        ADIOI_Free(writebuf);

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    }

    fd->fp_sys_posn = -1;  /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig)
        ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -0,0 +1,28 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
##     See COPYRIGHT in top-level directory.
##
## Compiles the NFS ADIO driver into ROMIO only when configure enabled it
## (BUILD_AD_NFS conditional); the sources are appended to the shared
## romio_other_sources variable defined by the top-level Makefile.
if BUILD_AD_NFS

noinst_HEADERS += adio/ad_nfs/ad_nfs.h

romio_other_sources +=        \
    adio/ad_nfs/ad_nfs_read.c \
    adio/ad_nfs/ad_nfs_open.c \
    adio/ad_nfs/ad_nfs_write.c \
    adio/ad_nfs/ad_nfs_done.c \
    adio/ad_nfs/ad_nfs_fcntl.c \
    adio/ad_nfs/ad_nfs_iread.c \
    adio/ad_nfs/ad_nfs_iwrite.c \
    adio/ad_nfs/ad_nfs_wait.c \
    adio/ad_nfs/ad_nfs_setsh.c \
    adio/ad_nfs/ad_nfs_getsh.c \
    adio/ad_nfs/ad_nfs.c \
    adio/ad_nfs/ad_nfs_resize.c \
    adio/ad_nfs/ad_nfs_features.c

endif BUILD_AD_NFS

Просмотреть файл

@ -0,0 +1,41 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
/* Dispatch table binding the generic ADIO interface to the NFS driver.
 * Entries marked ADIOI_GEN_* / ADIOI_FAKE_* fall back to generic or
 * blocking implementations. */
struct ADIOI_Fns_struct ADIO_NFS_operations = {
    ADIOI_NFS_Open, /* Open */
    ADIOI_FAILSAFE_OpenColl, /* OpenColl */
    ADIOI_NFS_ReadContig, /* ReadContig */
    ADIOI_NFS_WriteContig, /* WriteContig */
    ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
    ADIOI_GEN_SeekIndividual, /* SeekIndividual */
    ADIOI_NFS_Fcntl, /* Fcntl */
    ADIOI_GEN_SetInfo, /* SetInfo */
    ADIOI_NFS_ReadStrided, /* ReadStrided */
    ADIOI_NFS_WriteStrided, /* WriteStrided */
    ADIOI_GEN_Close, /* Close */
    /* Even with lockd running and NFS mounted 'noac', we have been unable to
     * guarantee correct behavior over NFS with asynchronous I/O operations,
     * so nonblocking contig I/O is routed to the blocking FAKE variants. */
    ADIOI_FAKE_IreadContig, /* IreadContig */
    ADIOI_FAKE_IwriteContig, /* IwriteContig */
    ADIOI_NFS_ReadDone, /* ReadDone */
    ADIOI_NFS_WriteDone, /* WriteDone */
    ADIOI_NFS_ReadComplete, /* ReadComplete */
    ADIOI_NFS_WriteComplete, /* WriteComplete */
    ADIOI_GEN_IreadStrided, /* IreadStrided */
    ADIOI_GEN_IwriteStrided, /* IwriteStrided */
    ADIOI_GEN_Flush, /* Flush */
    ADIOI_NFS_Resize, /* Resize */
    ADIOI_GEN_Delete, /* Delete */
    ADIOI_NFS_Feature, /* Features */
    "NFS:" /* fsname: just a string */
};

Просмотреть файл

@ -0,0 +1,83 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *   Copyright (C) 1997 University of Chicago.
 *   See COPYRIGHT notice in top-level directory.
 */

/* Public interface of the NFS ADIO driver: prototypes for the NFS-specific
 * implementations referenced from ADIO_NFS_operations in ad_nfs.c, plus the
 * system headers and aio workarounds the driver needs. */
#ifndef AD_NFS_INCLUDE
#define AD_NFS_INCLUDE

#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"

#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_AIO_H
#include <aio.h>
#endif
#ifdef HAVE_SYS_AIO_H
#include <sys/aio.h>
#endif

/* Workaround for incomplete set of definitions if __REDIRECT is not
   defined and large file support is used in aio.h */
#if !defined(__REDIRECT) && defined(__USE_FILE_OFFSET64)
#define aiocb aiocb64
#endif

/* Common nonblocking-I/O helper shared by the iread/iwrite paths;
 * wr==1 write, wr==0 read.  Defined in ad_nfs_iwrite.c. */
int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
                  int wr, MPI_Request *request);

#ifdef SX4
#define lseek llseek
#endif

void ADIOI_NFS_Open(ADIO_File fd, int *error_code);
void ADIOI_NFS_ReadContig(ADIO_File fd, void *buf, int count,
                          MPI_Datatype datatype, int file_ptr_type,
                          ADIO_Offset offset, ADIO_Status *status, int
                          *error_code);
void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int
                           *error_code);
void ADIOI_NFS_IwriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Request *request, int
                            *error_code);
void ADIOI_NFS_IreadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Request *request, int
                           *error_code);
int ADIOI_NFS_ReadDone(ADIO_Request *request, ADIO_Status *status, int
                       *error_code);
int ADIOI_NFS_WriteDone(ADIO_Request *request, ADIO_Status *status, int
                        *error_code);
void ADIOI_NFS_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
                            *error_code);
void ADIOI_NFS_WriteComplete(ADIO_Request *request, ADIO_Status *status,
                             int *error_code);
void ADIOI_NFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
                     *error_code);
void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status, int
                            *error_code);
void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int
                           *error_code);
void ADIOI_NFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
/* Shared-file-pointer support: kept in a separate hidden file, guarded by
 * fcntl byte-range locks. */
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset *shared_fp,
                             int *error_code);
void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
int ADIOI_NFS_Feature(ADIO_File fd, int feature_flag);

#endif

Просмотреть файл

@ -0,0 +1,19 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
/* Completion test for nonblocking reads.  NFS routes nonblocking contig I/O
 * through the blocking FAKE variants (see ad_nfs.c), so by the time a
 * request exists it is already complete: always report done/success.
 * request and status are unused. */
int ADIOI_NFS_ReadDone(ADIO_Request *request, ADIO_Status *status,
                       int *error_code)
{
    *error_code = MPI_SUCCESS;
    return 1;
}
/* Completion test for nonblocking writes: identical semantics to the read
 * case, so simply delegate. */
int ADIOI_NFS_WriteDone(ADIO_Request *request, ADIO_Status *status,
                        int *error_code)
{
    return ADIOI_NFS_ReadDone(request, status, error_code);
}

Просмотреть файл

@ -0,0 +1,65 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
#include "adio_extern.h"
/* #ifdef MPISGI
#include "mpisgi2.h"
#endif */
/* ADIO fcntl handler for NFS.
 *
 * flag selects the operation:
 *   ADIO_FCNTL_GET_FSIZE     - report file size in fcntl_struct->fsize
 *                              (lseek to end under a read lock, then restore
 *                              the kernel file position)
 *   ADIO_FCNTL_SET_DISKSPACE - preallocate via the generic implementation
 *   ADIO_FCNTL_SET_ATOMICITY - record the atomicity setting on fd
 * Any other flag produces an MPI_ERR_ARG error code. */
void ADIOI_NFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code)
{
    static char myname[] = "ADIOI_NFS_FCNTL";

    switch(flag) {
    case ADIO_FCNTL_GET_FSIZE:
        /* lock defeats NFS attribute caching so the size is current */
        ADIOI_READ_LOCK(fd, 0, SEEK_SET, 1);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
        fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
        ADIOI_UNLOCK(fd, 0, SEEK_SET, 1);
        if (fd->fp_sys_posn != -1) {
            /* restore the system file position clobbered by the SEEK_END */
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
            lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
        }
        if (fcntl_struct->fsize == -1) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
        break;

    case ADIO_FCNTL_SET_DISKSPACE:
        ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
        break;

    case ADIO_FCNTL_SET_ATOMICITY:
        /* normalize to 0/1 */
        fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
        *error_code = MPI_SUCCESS;
        break;

    default:
        /* --BEGIN ERROR HANDLING-- */
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_ARG,
                                           "**flag", "**flag %d", flag);
        return;
        /* --END ERROR HANDLING-- */
    }
}

Просмотреть файл

@ -0,0 +1,24 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "adio.h"
#include "ad_nfs.h"
/* Capability query for the NFS driver: returns 1 when the feature named by
 * `flag` is supported on NFS, 0 otherwise.  fd is unused. */
int ADIOI_NFS_Feature(ADIO_File fd, int flag)
{
    /* NFS supports shared file pointers, byte-range locking, sequential
     * access mode, and data sieving for writes. */
    if (flag == ADIO_SHARED_FP ||
        flag == ADIO_LOCKS ||
        flag == ADIO_SEQUENTIAL ||
        flag == ADIO_DATA_SIEVING_WRITES)
        return 1;

    /* Everything else (scalable open, unlink-after-close, scalable
     * resize, unknown flags) is unsupported. */
    return 0;
}

Просмотреть файл

@ -0,0 +1,105 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
/* returns the current location of the shared_fp in terms of the
no. of etypes relative to the current view, and also increments the
shared_fp by the number of etypes to be accessed (incr) in the read
or write following this function. */
/* Returns the current location of the shared_fp in terms of the no. of
 * etypes relative to the current view, and also increments the shared_fp by
 * the number of etypes to be accessed (incr) in the read or write following
 * this function.
 *
 * The shared pointer lives in a separate one-word file (fd->shared_fp_fname)
 * opened lazily on first use; the whole read-modify-write is performed under
 * an fcntl write lock on that file so concurrent processes serialize. */
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
                             int *error_code)
{
    ADIO_Offset new_fp;
    ssize_t err;
    MPI_Comm dupcommself;
    static char myname[] = "ADIOI_NFS_GET_SHARED_FP";

    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
        /* first access: create/open the shared-pointer file on this
         * process only (MPI_COMM_SELF) and take the lock before reading */
        MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
        fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
                                     fd->shared_fp_fname,
                                     fd->file_system,
                                     fd->fns,
                                     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
                                     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
                                     ADIO_PERM_NULL, error_code);
        if (*error_code != MPI_SUCCESS) return;
        *shared_fp = 0;
        ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
        err = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
        /* if the file is empty, the above read may return error
           (reading beyond end of file). In that case, shared_fp = 0,
           set above, is the correct value. */
    }
    else {
        ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
        err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
        if (err == 0) {
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
            err = read(fd->shared_fp_fd->fd_sys, shared_fp,
                       sizeof(ADIO_Offset));
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
        }
        if (err == -1) {
            /* release the lock before reporting the read/seek failure */
            ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
            return;
        }
    }

    if (incr == 0) {goto done;}

    /* write back the advanced pointer while still holding the lock */
    new_fp = *shared_fp + incr;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
    err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
    if (err == 0) {
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
        err = write(fd->shared_fp_fd->fd_sys, &new_fp, sizeof(ADIO_Offset));
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
    }
done:
    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,13 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
/* NFS has no filesystem-specific hints: apply the user's MPI_Info through
 * the generic hint processor unchanged. */
void ADIOI_NFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    ADIOI_GEN_SetInfo(fd, users_info, error_code);
}

Просмотреть файл

@ -0,0 +1,37 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
#ifdef ROMIO_HAVE_WORKING_AIO
/* nearly identical to ADIOI_GEN_IreadContig, except we lock around I/O */
/* Nonblocking contiguous read: computes the byte length from the datatype,
 * resolves ADIO_INDIVIDUAL offsets from fd->fp_ind (advancing it
 * immediately), and hands off to ADIOI_NFS_aio with wr=0.  Only compiled
 * when the platform has a working POSIX aio. */
void ADIOI_NFS_IreadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Request *request,
                           int *error_code)
{
    MPI_Count len, typesize;
    int aio_errno = 0;
    static char myname[] = "ADIOI_NFS_IREADCONTIG";

    MPI_Type_size_x(datatype, &typesize);
    len = count * typesize;

    if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;
    aio_errno = ADIOI_NFS_aio(fd, buf, len, offset, 0, request);
    if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len;

    /* kernel file position is unknown after async I/O */
    fd->fp_sys_posn = -1;

    if (aio_errno != 0) {
        /* --BEGIN ERROR HANDLING-- */
        MPIO_ERR_CREATE_CODE_ERRNO(myname, aio_errno, error_code);
        return;
        /* --END ERROR HANDLING-- */
    }
    else *error_code = MPI_SUCCESS;
}
#endif

Просмотреть файл

@ -0,0 +1,130 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
#include "../../mpi-io/mpioimpl.h"
#include "../../mpi-io/mpioprof.h"
#include "mpiu_greq.h"
#include <string.h>
#ifdef ROMIO_HAVE_WORKING_AIO
static MPIX_Grequest_class ADIOI_GEN_greq_class = 0;
/* this routine is nearly identical to ADIOI_GEN_IwriteContig, except we lock
 * around I/O */
/* Nonblocking contiguous write: mirror of ADIOI_NFS_IreadContig with wr=1.
 * ADIO_INDIVIDUAL offsets come from fd->fp_ind, which is advanced
 * immediately (before the aio completes). */
void ADIOI_NFS_IwriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Request *request, int *error_code)
{
    MPI_Count len, typesize;
    int aio_errno = 0;
    static char myname[] = "ADIOI_NFS_IWRITECONTIG";

    MPI_Type_size_x(datatype, &typesize);
    len = count * typesize;

    if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;
    aio_errno = ADIOI_NFS_aio(fd, buf, len, offset, 1, request);
    if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len;

    /* kernel file position is unknown after async I/O */
    fd->fp_sys_posn = -1;

    if (aio_errno != 0) {
        /* --BEGIN ERROR HANDLING-- */
        MPIO_ERR_CREATE_CODE_ERRNO(myname, aio_errno, error_code);
        return;
        /* --END ERROR HANDLING-- */
    }
    else *error_code = MPI_SUCCESS;

    return;
}
#endif
/* This function is for implementation convenience. It is not user-visible.
* It takes care of the differences in the interface for nonblocking I/O
* on various Unix machines! If wr==1 write, wr==0 read.
*
* Returns 0 on success, -errno on failure.
*/
#ifdef ROMIO_HAVE_WORKING_AIO
/* This function is for implementation convenience. It is not user-visible.
 * It takes care of the differences in the interface for nonblocking I/O
 * on various Unix machines! If wr==1 write, wr==0 read.
 *
 * The file region is byte-range locked while the aio is submitted.  On
 * EAGAIN (too many outstanding requests) the operation is completed
 * synchronously and an already-completed request is returned.
 *
 * Returns 0 on success, -errno on failure.
 *
 * Fixes vs. original: (1) the EAGAIN fallback issued ADIO_WriteContig even
 * for reads (wr==0), silently turning a read into a write — it now picks
 * read/write by wr; (2) aio_req/aiocbp are freed on the EAGAIN and error
 * paths instead of leaking.
 */
int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
                  int wr, MPI_Request *request)
{
    int err = -1, fd_sys;

    int error_code, this_errno;

    struct aiocb *aiocbp;
    ADIOI_AIO_Request *aio_req;
    MPI_Status status;

    fd_sys = fd->fd_sys;

    aio_req = (ADIOI_AIO_Request*)ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);
    aiocbp = (struct aiocb *) ADIOI_Calloc(sizeof(struct aiocb), 1);
    aiocbp->aio_offset = offset;
    aiocbp->aio_buf    = buf;
    aiocbp->aio_nbytes = len;

#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_WHENCE
    aiocbp->aio_whence = SEEK_SET;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_FILDES
    aiocbp->aio_fildes = fd_sys;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_SIGEVENT
# ifdef AIO_SIGNOTIFY_NONE
    aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE;
# endif
    aiocbp->aio_sigevent.sigev_signo = 0;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_REQPRIO
# ifdef AIO_PRIO_DFL
    aiocbp->aio_reqprio = AIO_PRIO_DFL;   /* not needed in DEC Unix 4.0 */
# else
    aiocbp->aio_reqprio = 0;
# endif
#endif

    /* lock the region for the duration of the submission */
    if (wr) ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
    else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);

#ifndef ROMIO_HAVE_AIO_CALLS_NEED_FILEDES
    if (wr) err = aio_write(aiocbp);
    else err = aio_read(aiocbp);
#else
    /* Broken IBM interface */
    if (wr) err = aio_write(fd_sys, aiocbp);
    else err = aio_read(fd_sys, aiocbp);
#endif

    this_errno = errno;
    ADIOI_UNLOCK(fd, offset, SEEK_SET, len);

    if (err == -1) {
        if (this_errno == EAGAIN) {
            /* exceeded the max. no. of outstanding requests.
               complete this request synchronously and return a
               pre-completed MPI request.  Pick the direction from wr:
               the original unconditionally wrote, corrupting reads. */
            if (wr)
                ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                 offset, &status, &error_code);
            else
                ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                offset, &status, &error_code);

            MPIO_Completed_request_create(&fd, len, &error_code, request);
            /* the aio control structures are unused on this path */
            ADIOI_Free(aiocbp);
            ADIOI_Free(aio_req);
            return 0;
        } else {
            /* submission failed for a real reason: clean up and report */
            ADIOI_Free(aiocbp);
            ADIOI_Free(aio_req);
            return -this_errno;
        }
    }

    /* submission succeeded: wrap the in-flight aio in a generalized request */
    aio_req->aiocbp = aiocbp;
    if (ADIOI_GEN_greq_class == 0) {
        MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn,
                                   ADIOI_GEN_aio_free_fn, MPIU_Greq_cancel_fn,
                                   ADIOI_GEN_aio_poll_fn, ADIOI_GEN_aio_wait_fn,
                                   &ADIOI_GEN_greq_class);
    }
    MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, aio_req, request);
    memcpy(&(aio_req->req), request, sizeof(MPI_Request));
    return 0;
}
#endif

Просмотреть файл

@ -0,0 +1,58 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
/* Opens fd->filename with open(2), translating ADIO access-mode bits to
 * POSIX flags.  When no permissions were requested (ADIO_PERM_NULL) the
 * process umask is probed (set-then-restore) and 0666 masked accordingly.
 * ADIO_APPEND seeks to end-of-file after a successful open.  Sets
 * *error_code to MPI_SUCCESS or an errno-derived ADIO error. */
void ADIOI_NFS_Open(ADIO_File fd, int *error_code)
{
    int perm, amode;
    mode_t old_mask;
    static char myname[] = "ADIOI_NFS_OPEN";

    if (fd->perm == ADIO_PERM_NULL) {
        /* umask() both reads and sets the mask, so restore it immediately */
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }
    else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_open_a, 0, NULL );
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_open_b, 0, NULL );
#endif
    fd->fd_direct = -1;

    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) {
        /* append mode: start both pointers at end-of-file */
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
    }

    if (fd->fd_sys == -1) {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,553 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
#include "adio_extern.h"
/* Contiguous read for NFS.  Seeks only when the cached kernel position
 * (fd->fp_sys_posn) differs from the target, then read(2)s under a byte-range
 * lock: a write lock when atomic mode is on, otherwise a read lock (the lock
 * also defeats NFS client caching).  ADIO_INDIVIDUAL reads advance fd->fp_ind.
 * NOTE(review): err is int while read() returns ssize_t and len is MPI_Count —
 * presumably reads here never exceed INT_MAX; confirm for >2 GiB requests. */
void ADIOI_NFS_ReadContig(ADIO_File fd, void *buf, int count,
                          MPI_Datatype datatype, int file_ptr_type,
                          ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    int err = -1;
    MPI_Count datatype_size, len;
    static char myname[] = "ADIOI_NFS_READCONTIG";

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        if (fd->fp_sys_posn != offset) {
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
            lseek(fd->fd_sys, offset, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
        }
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
        err = read(fd->fd_sys, buf, len);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else {  /* read from curr. location of ind. file pointer */
        offset = fd->fp_ind;
        if (fd->fp_sys_posn != fd->fp_ind) {
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
            lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
        }
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
        err = read(fd->fd_sys, buf, len);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", strerror(errno));
        return;
    }
    /* --END ERROR HANDLING-- */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, err);
#endif

    *error_code = MPI_SUCCESS;
}
/* ADIOI_BUFFERED_READ: data-sieving helper expanded inside
 * ADIOI_NFS_ReadStrided.  It is a statement block, not a function, and
 * relies on the following names being in scope at the expansion site:
 *
 *   req_off, req_len, userbuf_off  - the current request: file offset,
 *                                    length, and destination offset
 *                                    within the user buffer "buf"
 *   readbuf, readbuf_off, readbuf_len - the staging buffer and the file
 *                                    region it currently holds
 *   max_bufsize, end_offset        - sieving-buffer size limit and last
 *                                    byte that will ever be accessed
 *   fd, buf, err, err_flag, partial_read, tmp_buf
 *
 * Behavior: if the request starts beyond the buffered region, the buffer
 * is refilled starting at req_off.  While the request extends past the
 * end of the buffer, the unread tail is preserved (copied through
 * tmp_buf into a freshly allocated buffer) and more data is appended
 * from the file; finally the requested bytes are memcpy'd to the user
 * buffer.  Any failed read() sets err_flag for the caller to report.
 * Per-read locks are taken only when fd->atomicity is false, because in
 * atomic mode the caller already holds an exclusive lock on the whole
 * accessed range.
 *
 * Two textually identical variants follow: with and without MPE logging
 * events around lseek()/read().
 */
#ifdef ADIOI_MPE_LOGGING
#define ADIOI_BUFFERED_READ \
{ \
if (req_off >= readbuf_off + readbuf_len) { \
readbuf_off = req_off; \
readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); \
lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); \
err = read(fd->fd_sys, readbuf, readbuf_len);\
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
if (err == -1) err_flag = 1; \
} \
while (req_len > readbuf_off + readbuf_len - req_off) { \
partial_read = (int) (readbuf_off + readbuf_len - req_off); \
tmp_buf = (char *) ADIOI_Malloc(partial_read); \
memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
ADIOI_Free(readbuf); \
readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
memcpy(readbuf, tmp_buf, partial_read); \
ADIOI_Free(tmp_buf); \
readbuf_off += readbuf_len-partial_read; \
readbuf_len = (int) (partial_read + ADIOI_MIN(max_bufsize, \
end_offset-readbuf_off+1)); \
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); \
lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); \
err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read);\
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
if (err == -1) err_flag = 1; \
} \
memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
}
#else
/* Same logic as above, minus the MPE logging events. */
#define ADIOI_BUFFERED_READ \
{ \
if (req_off >= readbuf_off + readbuf_len) { \
readbuf_off = req_off; \
readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
err = read(fd->fd_sys, readbuf, readbuf_len);\
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
if (err == -1) err_flag = 1; \
} \
while (req_len > readbuf_off + readbuf_len - req_off) { \
partial_read = (int) (readbuf_off + readbuf_len - req_off); \
tmp_buf = (char *) ADIOI_Malloc(partial_read); \
memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
ADIOI_Free(readbuf); \
readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
memcpy(readbuf, tmp_buf, partial_read); \
ADIOI_Free(tmp_buf); \
readbuf_off += readbuf_len-partial_read; \
readbuf_len = (int) (partial_read + ADIOI_MIN(max_bufsize, \
end_offset-readbuf_off+1)); \
lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read);\
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
if (err == -1) err_flag = 1; \
} \
memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
}
#endif
/* ADIOI_NFS_ReadStrided: noncontiguous (strided) read for NFS,
 * implemented by data sieving: large contiguous chunks of the file are
 * staged through "readbuf" (sized by the "ind_rd_buffer_size" hint) and
 * the requested pieces are copied out to the user buffer via the
 * ADIOI_BUFFERED_READ macro.
 *
 * offset is in units of etype relative to the filetype and is consulted
 * only when file_ptr_type != ADIO_INDIVIDUAL; otherwise the individual
 * file pointer fd->fp_ind supplies the starting position.
 *
 * On success *error_code is set to MPI_SUCCESS; if any system read
 * failed it is built from errno.  In atomic mode the entire accessed
 * byte range is locked exclusively for the duration of the call.
 *
 * Fix vs. the imported version: the fp_ind update in the
 * single-contiguous-block fast path now casts n_filetypes to
 * ADIO_Offset before multiplying by filetype_extent, matching every
 * other offset computation in this function (and the same code in
 * ADIOI_NFS_WriteStrided).  Without the cast the product is computed in
 * MPI_Aint width, which can overflow for large files on platforms where
 * MPI_Aint is 32 bits.
 */
void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code)
{
/* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, err=-1, brd_size, frd_size=0, st_index=0;
    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    int req_len, partial_read;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
    char *readbuf, *tmp_buf, *value;
    int st_frd_size, st_n_filetypes, readbuf_len;
    int new_brd_size, new_frd_size, err_flag=0, info_flag, max_bufsize;

    static char myname[] = "ADIOI_NFS_READSTRIDED";

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* a zero-size filetype means there is nothing to read */
    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS;
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

    /* get max_bufsize (the data-sieving buffer size) from the info object. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
                   &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);

    if (!buftype_is_contig && filetype_is_contig) {
	/* noncontiguous in memory, contiguous in file. */

	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
	while (flat_buf->type != datatype) flat_buf = flat_buf->next;

	off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
	    fd->disp + etype_size * offset;

	start_off = off;
	end_offset = off + bufsize - 1;
	readbuf_off = off;
	readbuf = (char *) ADIOI_Malloc(max_bufsize);
	readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));

	/* if atomicity is true, lock (exclusive) the region to be accessed */
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	/* prime the sieving buffer with the first chunk */
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
	lseek(fd->fd_sys, readbuf_off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
	if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
	err = read(fd->fd_sys, readbuf, readbuf_len);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
	if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
	if (err == -1) err_flag = 1;

	/* copy each flattened buffer piece out of the staged file data */
	for (j=0; j<count; j++)
	    for (i=0; i<flat_buf->count; i++) {
		userbuf_off = j*buftype_extent + flat_buf->indices[i];
		req_off = off;
		req_len = flat_buf->blocklens[i];
		ADIOI_BUFFERED_READ
		off += flat_buf->blocklens[i];
	    }

	if (fd->atomicity)
	    ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	ADIOI_Free(readbuf); /* malloced in the buffered_read macro */

	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    else {  /* noncontiguous in file */

	/* filetype already flattened in ADIO_Open */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
	disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
	    /* Wei-keng reworked type processing to be a bit more efficient */
	    offset = fd->fp_ind - disp;
	    n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
	    offset -= (ADIO_Offset)n_filetypes * filetype_extent;
	    /* now offset is local to this extent */

	    /* find the block where offset is located, skip blocklens[i]==0 */
	    for (i=0; i<flat_file->count; i++) {
		ADIO_Offset dist;
		if (flat_file->blocklens[i] == 0) continue;
		dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
		/* frd_size is from offset to the end of block i */
		if (dist == 0) {
		    i++;
		    offset = flat_file->indices[i];
		    frd_size = flat_file->blocklens[i];
		    break;
		}
		if (dist > 0 ) {
		    frd_size = dist;
		    break;
		}
	    }
	    st_index = i;  /* starting index in flat_file->indices[] */
	    offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
	}
	else {
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = (int) (offset / n_etypes_in_filetype);
	    etype_in_filetype = (int) (offset % n_etypes_in_filetype);
	    size_in_filetype = etype_in_filetype * etype_size;

	    sum = 0;
	    for (i=0; i<flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    frd_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
		abs_off_in_filetype;
	}

	start_off = offset;

	/* Wei-keng Liao: read request is within a single flat_file contig
	 * block e.g. with subarray types that actually describe the whole
	 * array */
	if (buftype_is_contig && bufsize <= frd_size) {
	    ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
			    offset, status, error_code);

	    if (file_ptr_type == ADIO_INDIVIDUAL) {
		/* update MPI-IO file pointer to point to the first byte that
		 * can be accessed in the fileview. */
		fd->fp_ind = offset + bufsize;
		if (bufsize == frd_size) {
		    do {
			st_index++;
			if (st_index == flat_file->count) {
			    st_index = 0;
			    n_filetypes++;
			}
		    } while (flat_file->blocklens[st_index] == 0);
		    /* cast n_filetypes to ADIO_Offset before the multiply so
		     * the product is computed in 64 bits even when MPI_Aint
		     * is 32 bits; matches the other offset computations in
		     * this function and ADIOI_NFS_WriteStrided. */
		    fd->fp_ind = disp + flat_file->indices[st_index]
			+ (ADIO_Offset)n_filetypes*filetype_extent;
		}
	    }
	    fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
	    MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
	    return;
	}

	/* Calculate end_offset, the last byte-offset that will be accessed.
	   e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

	st_frd_size = frd_size;
	st_n_filetypes = n_filetypes;
	i = 0;
	j = st_index;
	off = offset;
	frd_size = ADIOI_MIN(st_frd_size, bufsize);
	while (i < bufsize) {
	    i += frd_size;
	    end_offset = off + frd_size - 1;

	    j = (j+1) % flat_file->count;
	    n_filetypes += (j == 0) ? 1 : 0;
	    while (flat_file->blocklens[j]==0) {
		j = (j+1) % flat_file->count;
		n_filetypes += (j == 0) ? 1 : 0;
	    }

	    off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent;
	    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
	}

	/* if atomicity is true, lock (exclusive) the region to be accessed */
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	/* initial read into readbuf */
	readbuf_off = offset;
	readbuf = (char *) ADIOI_Malloc(max_bufsize);
	readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));

#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
	lseek(fd->fd_sys, offset, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
	if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
	err = read(fd->fd_sys, readbuf, readbuf_len);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
	if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len);

	if (err == -1) err_flag = 1;

	if (buftype_is_contig && !filetype_is_contig) {
	    /* contiguous in memory, noncontiguous in file. should be the most
	       common case. */

	    i = 0;
	    j = st_index;
	    off = offset;
	    n_filetypes = st_n_filetypes;
	    frd_size = ADIOI_MIN(st_frd_size, bufsize);
	    while (i < bufsize) {
		if (frd_size) {
		    /* TYPE_UB and TYPE_LB can result in
		       frd_size = 0. save system call in such cases */
		    /* lseek(fd->fd_sys, off, SEEK_SET);
		       err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/

		    req_off = off;
		    req_len = frd_size;
		    userbuf_off = i;
		    ADIOI_BUFFERED_READ
		}
		i += frd_size;

		if (off + frd_size < disp + flat_file->indices[j] +
		    flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent)
		    off += frd_size;
		/* did not reach end of contiguous block in filetype.
		   no more I/O needed. off is incremented by frd_size. */
		else {
		    j = (j+1) % flat_file->count;
		    n_filetypes += (j == 0) ? 1 : 0;
		    while (flat_file->blocklens[j]==0) {
			j = (j+1) % flat_file->count;
			n_filetypes += (j == 0) ? 1 : 0;
		    }
		    off = disp + flat_file->indices[j] +
			(ADIO_Offset) n_filetypes*filetype_extent;
		    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
		}
	    }
	}
	else {
	    /* noncontiguous in memory as well as in file */

	    ADIOI_Flatten_datatype(datatype);
	    flat_buf = ADIOI_Flatlist;
	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;

	    k = num = buf_count = 0;
	    i = (int) (flat_buf->indices[0]);
	    j = st_index;
	    off = offset;
	    n_filetypes = st_n_filetypes;
	    frd_size = st_frd_size;
	    brd_size = flat_buf->blocklens[0];

	    /* advance through file blocks and buffer blocks in lockstep,
	       transferring the overlap of the current pair each iteration */
	    while (num < bufsize) {
		size = ADIOI_MIN(frd_size, brd_size);
		if (size) {
		    /* lseek(fd->fd_sys, off, SEEK_SET);
		       err = read(fd->fd_sys, ((char *) buf) + i, size); */

		    req_off = off;
		    req_len = size;
		    userbuf_off = i;
		    ADIOI_BUFFERED_READ
		}

		new_frd_size = frd_size;
		new_brd_size = brd_size;

		if (size == frd_size) {
		    /* reached end of contiguous block in file */
		    j = (j+1) % flat_file->count;
		    n_filetypes += (j == 0) ? 1 : 0;
		    while (flat_file->blocklens[j]==0) {
			j = (j+1) % flat_file->count;
			n_filetypes += (j == 0) ? 1 : 0;
		    }

		    off = disp + flat_file->indices[j] +
			(ADIO_Offset) n_filetypes*filetype_extent;

		    new_frd_size = flat_file->blocklens[j];
		    if (size != brd_size) {
			i += size;
			new_brd_size -= size;
		    }
		}

		if (size == brd_size) {
		    /* reached end of contiguous block in memory */

		    k = (k + 1)%flat_buf->count;
		    buf_count++;
		    i = (int) (buftype_extent*(buf_count/flat_buf->count) +
			       flat_buf->indices[k]);
		    new_brd_size = flat_buf->blocklens[k];
		    if (size != frd_size) {
			off += size;
			new_frd_size -= size;
		    }
		}
		num += size;
		frd_size = new_frd_size;
		brd_size = new_brd_size;
	    }
	}

	if (fd->atomicity)
	    ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	ADIOI_Free(readbuf); /* malloced in the buffered_read macro */

	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually read and placed in buf
       by ADIOI_BUFFERED_READ. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -0,0 +1,35 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2004 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
/* NFS resize
*
* Note: we resize on all processors to guarantee that all processors
* will have updated cache values. This used to be the generic
* implementation used by the majority of the ADIO implementations.
*/
/* ADIOI_NFS_Resize: set the file length to "size" bytes via ftruncate().
 *
 * Called on every process (see the note above) so that each NFS client
 * ends up with consistent cached size information.  On failure the
 * system errno is converted into an ADIO error code; on success
 * *error_code is MPI_SUCCESS.
 */
void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
{
    static char myname[] = "ADIOI_NFS_RESIZE";

    /* --BEGIN ERROR HANDLING-- */
    if (ftruncate(fd->fd_sys, size) != 0) {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,74 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
/* set the shared file pointer to "offset" etypes relative to the current
view */
/*
This looks very similar to ADIOI_GEN_Set_shared_fp, except this
function avoids locking the file twice. The generic version does
Write lock
ADIO_WriteContig
Unlock
For NFS, ADIOI_NFS_WriteContig does a lock before writing to disable
caching. To avoid the lock being called twice, this version for NFS does
Write lock
Lseek
Write
Unlock
*/
/* ADIOI_NFS_Set_shared_fp: store "offset" (etypes relative to the current
 * view) as the new shared file pointer value.
 *
 * The shared pointer lives in a separate hidden file (fd->shared_fp_fname),
 * opened lazily on first use with ADIO_DELETE_ON_CLOSE.  The single
 * ADIO_Offset value at the start of that file is overwritten under an
 * exclusive fcntl lock; as described in the comment above this function,
 * the lock/lseek/write/unlock sequence is done inline here (instead of
 * calling ADIO_WriteContig) to avoid locking the file twice on NFS.
 *
 * On exit *error_code is MPI_SUCCESS, an open-failure code from ADIO_Open,
 * or an MPI_ERR_IO code built from errno if the write failed.
 */
void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
{
    ssize_t err;
    MPI_Comm dupcommself;
    static char myname[] = "ADIOI_NFS_SET_SHARED_FP";

    /* first use: open the hidden shared-fp file (self communicator,
     * deleted automatically on close) */
    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
				     fd->shared_fp_fname,
				     fd->file_system, fd->fns,
				     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
				     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
				     ADIO_PERM_NULL, error_code);
    }

    if (*error_code != MPI_SUCCESS) return;

    /* exclusive lock over the stored ADIO_Offset while rewriting it */
    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
    lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
    err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));

    if (err == -1) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__, MPI_ERR_IO,
					   "**io",
					   "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,20 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
/* ADIOI_NFS_ReadComplete: completion hook for NFS read requests.
 * Intentionally a no-op: there is no outstanding asynchronous state to
 * drain, so nothing is done with request, status, or error_code. */
void ADIOI_NFS_ReadComplete(ADIO_Request *request, ADIO_Status *status,
			    int *error_code)
{
    /* nothing to do */
}
/* ADIOI_NFS_WriteComplete: completion hook for NFS write requests.
 * Delegates to ADIOI_NFS_ReadComplete so both completion paths share
 * one implementation. */
void ADIOI_NFS_WriteComplete(ADIO_Request *request, ADIO_Status *status,
			     int *error_code)
{
    ADIOI_NFS_ReadComplete(request, status, error_code);
}

Просмотреть файл

@ -0,0 +1,679 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_nfs.h"
#include "adio_extern.h"
/* ADIOI_NFS_WriteContig: contiguous write for NFS.
 *
 * Writes count elements of datatype from buf, either at the explicit
 * byte offset (ADIO_EXPLICIT_OFFSET) or at the individual file pointer.
 * The write is always performed under an exclusive fcntl lock (on NFS
 * the lock also disables client-side caching, per the design of this
 * ADIO module).
 *
 * On success the cached system position fd->fp_sys_posn (and, in the
 * individual-pointer case, fd->fp_ind) is advanced by the number of
 * bytes written and *error_code is MPI_SUCCESS; on failure an
 * MPI_ERR_IO code is built from errno and the file pointers are left
 * untouched.
 *
 * Fix vs. the imported version: the file-pointer updates are now
 * guarded by (err != -1).  Previously a failed write() still executed
 * "fd->fp_sys_posn = offset + err" / "fd->fp_ind += err", which
 * corrupted the cached positions by -1 before the error return.
 */
void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    int err=-1;
    MPI_Count datatype_size, len;
    static char myname[] = "ADIOI_NFS_WRITECONTIG";

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
	/* seek only when the kernel file position is not already there */
	if (fd->fp_sys_posn != offset) {
#ifdef ADIOI_MPE_LOGGING
	    MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
	    lseek(fd->fd_sys, offset, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
	    MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
	}
	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
	err = write(fd->fd_sys, buf, len);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	/* advance the cached system position only on success; a failed
	 * write must not disturb it (err would be -1 here) */
	if (err != -1)
	    fd->fp_sys_posn = offset + err;
	/* individual file pointer not updated */
    }
    else {  /* write from curr. location of ind. file pointer */
	offset = fd->fp_ind;
	if (fd->fp_sys_posn != fd->fp_ind) {
#ifdef ADIOI_MPE_LOGGING
	    MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
	    MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
	}
	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
	err = write(fd->fd_sys, buf, len);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	/* advance both pointers only on success (see note above) */
	if (err != -1) {
	    fd->fp_ind += err;
	    fd->fp_sys_posn = fd->fp_ind;
	}
    }

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__, MPI_ERR_IO,
					   "**io",
					   "**io %s", strerror(errno));
	return;
    }
    /* --END ERROR HANDLING-- */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, err);
#endif

    *error_code = MPI_SUCCESS;
}
/* ADIOI_BUFFERED_WRITE: read-modify-write data-sieving helper expanded
 * inside ADIOI_NFS_WriteStrided.  It is a statement block, not a
 * function, and expects req_off/req_len/userbuf_off, the staging buffer
 * writebuf/writebuf_off/writebuf_len, max_bufsize, end_offset, fd, buf,
 * err, err_flag and write_sz to be in scope, plus error_code/myname and
 * an fn_exit label for the fatal path.
 *
 * If the request starts beyond the buffered region, the current buffer
 * contents are flushed with write() and the buffer is re-read at
 * req_off.  Requests larger than the buffer are transferred in
 * writebuf-sized pieces, flushing and re-reading as needed.  A failed
 * flush sets err_flag (reported later by the caller); a failed re-read
 * reports "**ioRMWrdwr" immediately and jumps to fn_exit.  Per-chunk
 * locks are taken only when fd->atomicity is false.
 *
 * Two variants follow: with and without MPE logging events.
 */
#ifdef ADIOI_MPE_LOGGING
#define ADIOI_BUFFERED_WRITE \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); \
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
writebuf_off = req_off; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); \
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); \
err = read(fd->fd_sys, writebuf, writebuf_len); \
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); \
if (err == -1) { \
*error_code = MPIO_Err_create_code(MPI_SUCCESS, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
goto fn_exit; \
} \
} \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); \
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); \
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); \
err = read(fd->fd_sys, writebuf, writebuf_len); \
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); \
if (err == -1) { \
*error_code = MPIO_Err_create_code(MPI_SUCCESS, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
goto fn_exit; \
} \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
} \
}
#else
/* Same logic as above, minus the MPE logging events. */
#define ADIOI_BUFFERED_WRITE \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
writebuf_off = req_off; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = read(fd->fd_sys, writebuf, writebuf_len); \
if (err == -1) { \
*error_code = MPIO_Err_create_code(MPI_SUCCESS, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
goto fn_exit; \
} \
} \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = read(fd->fd_sys, writebuf, writebuf_len); \
if (err == -1) { \
*error_code = MPIO_Err_create_code(MPI_SUCCESS, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
goto fn_exit; \
} \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
} \
}
#endif
/* ADIOI_BUFFERED_WRITE_WITHOUT_READ: data-sieving write helper used when
 * the filetype is contiguous and the buftype is not.  Because the file
 * region is written densely in that case, no read-modify-write is
 * needed: the staging buffer is simply flushed and repositioned.  Locks
 * (when fd->atomicity is false) are taken only around each flush, not
 * for a preceding read.  Expects the same names in scope as
 * ADIOI_BUFFERED_WRITE (req_off, req_len, userbuf_off, writebuf,
 * writebuf_off, writebuf_len, max_bufsize, end_offset, fd, buf, err,
 * err_flag, write_sz).  A failed write() sets err_flag for the caller.
 * Two variants: with and without MPE logging. */
#ifdef ADIOI_MPE_LOGGING
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
writebuf_off = req_off; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
} \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
} \
}
#else
/* Same logic as above, minus the MPE logging events. */
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
writebuf_off = req_off; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
} \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
} \
}
#endif
void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code)
{
/* offset is in units of etype relative to the filetype. */
ADIOI_Flatlist_node *flat_buf, *flat_file;
int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int req_len;
MPI_Count filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset userbuf_off;
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
char *writebuf=NULL, *value;
int st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize;
static char myname[] = "ADIOI_NFS_WRITESTRIDED";
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
MPI_Type_size_x(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, 0);
#endif
*error_code = MPI_SUCCESS;
return;
}
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size_x(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
bufsize = buftype_size * count;
/* get max_bufsize from the info object. */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
&info_flag);
max_bufsize = atoi(value);
ADIOI_Free(value);
if (!buftype_is_contig && filetype_is_contig) {
/* noncontiguous in memory, contiguous in file. */
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
fd->disp + etype_size * offset;
start_off = off;
end_offset = off + bufsize - 1;
writebuf_off = off;
writebuf = (char *) ADIOI_Malloc(max_bufsize);
writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
/* if atomicity is true, lock the region to be accessed */
if (fd->atomicity)
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
for (j=0; j<count; j++)
for (i=0; i<flat_buf->count; i++) {
userbuf_off = j*buftype_extent + flat_buf->indices[i];
req_off = off;
req_len = flat_buf->blocklens[i];
ADIOI_BUFFERED_WRITE_WITHOUT_READ
off += flat_buf->blocklens[i];
}
/* write the buffer out finally */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err = write(fd->fd_sys, writebuf, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
if (err == -1) err_flag = 1;
if (fd->atomicity)
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
if (err_flag) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}
else { /* noncontiguous in file */
/* filetype already flattened in ADIO_Open */
flat_file = ADIOI_Flatlist;
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
disp = fd->disp;
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* Wei-keng reworked type processing to be a bit more efficient */
offset = fd->fp_ind - disp;
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
/* now offset is local to this extent */
/* find the block where offset is located, skip blocklens[i]==0 */
for (i=0; i<flat_file->count; i++) {
ADIO_Offset dist;
if (flat_file->blocklens[i] == 0) continue;
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
/* fwr_size is from offset to the end of block i */
if (dist == 0) {
i++;
offset = flat_file->indices[i];
fwr_size = flat_file->blocklens[i];
break;
}
if (dist > 0) {
fwr_size = dist;
break;
}
}
st_index = i; /* starting index in flat_file->indices[] */
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
}
else {
n_etypes_in_filetype = filetype_size/etype_size;
n_filetypes = (int) (offset / n_etypes_in_filetype);
etype_in_filetype = (int) (offset % n_etypes_in_filetype);
size_in_filetype = etype_in_filetype * etype_size;
sum = 0;
for (i=0; i<flat_file->count; i++) {
sum += flat_file->blocklens[i];
if (sum > size_in_filetype) {
st_index = i;
fwr_size = sum - size_in_filetype;
abs_off_in_filetype = flat_file->indices[i] +
size_in_filetype - (sum - flat_file->blocklens[i]);
break;
}
}
/* abs. offset in bytes in the file */
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
abs_off_in_filetype;
}
start_off = offset;
/* Wei-keng Liao:write request is within single flat_file contig block*/
/* this could happen, for example, with subarray types that are
* actually fairly contiguous */
if (buftype_is_contig && bufsize <= fwr_size) {
ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
offset, status, error_code);
if (file_ptr_type == ADIO_INDIVIDUAL) {
/* update MPI-IO file pointer to point to the first byte
* that can be accessed in the fileview. */
fd->fp_ind = offset + bufsize;
if (bufsize == fwr_size) {
do {
st_index++;
if (st_index == flat_file->count) {
st_index = 0;
n_filetypes++;
}
} while (flat_file->blocklens[st_index] == 0);
fd->fp_ind = disp + flat_file->indices[st_index]
+ (ADIO_Offset)n_filetypes*filetype_extent;
}
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
return;
}
/* Calculate end_offset, the last byte-offset that will be accessed.
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
st_fwr_size = fwr_size;
st_n_filetypes = n_filetypes;
i = 0;
j = st_index;
off = offset;
fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
while (i < bufsize) {
i += fwr_size;
end_offset = off + fwr_size - 1;
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
}
/* if atomicity is true, lock the region to be accessed */
if (fd->atomicity)
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
/* initial read for the read-modify-write */
writebuf_off = offset;
writebuf = (char *) ADIOI_Malloc(max_bufsize);
writebuf_len = (int)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
err = read(fd->fd_sys, writebuf, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
if (err == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
MPI_ERR_IO,
"ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
goto fn_exit;
}
if (buftype_is_contig && !filetype_is_contig) {
/* contiguous in memory, noncontiguous in file. should be the most
common case. */
i = 0;
j = st_index;
off = offset;
n_filetypes = st_n_filetypes;
fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
while (i < bufsize) {
if (fwr_size) {
/* TYPE_UB and TYPE_LB can result in
fwr_size = 0. save system call in such cases */
/* lseek(fd->fd_sys, off, SEEK_SET);
err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);*/
req_off = off;
req_len = fwr_size;
userbuf_off = i;
ADIOI_BUFFERED_WRITE
}
i += fwr_size;
if (off + fwr_size < disp + flat_file->indices[j] +
flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent)
off += fwr_size;
/* did not reach end of contiguous block in filetype.
no more I/O needed. off is incremented by fwr_size. */
else {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
}
}
}
else {
/* noncontiguous in memory as well as in file */
ADIOI_Flatten_datatype(datatype);
flat_buf = ADIOI_Flatlist;
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
k = num = buf_count = 0;
i = (int) (flat_buf->indices[0]);
j = st_index;
off = offset;
n_filetypes = st_n_filetypes;
fwr_size = st_fwr_size;
bwr_size = flat_buf->blocklens[0];
while (num < bufsize) {
size = ADIOI_MIN(fwr_size, bwr_size);
if (size) {
/* lseek(fd->fd_sys, off, SEEK_SET);
err = write(fd->fd_sys, ((char *) buf) + i, size); */
req_off = off;
req_len = size;
userbuf_off = i;
ADIOI_BUFFERED_WRITE
}
new_fwr_size = fwr_size;
new_bwr_size = bwr_size;
if (size == fwr_size) {
/* reached end of contiguous block in file */
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
while (flat_file->blocklens[j]==0) {
j = (j+1) % flat_file->count;
n_filetypes += (j == 0) ? 1 : 0;
}
off = disp + flat_file->indices[j] +
(ADIO_Offset) n_filetypes*filetype_extent;
new_fwr_size = flat_file->blocklens[j];
if (size != bwr_size) {
i += size;
new_bwr_size -= size;
}
}
if (size == bwr_size) {
/* reached end of contiguous block in memory */
k = (k + 1)%flat_buf->count;
buf_count++;
i = (int) (buftype_extent*(buf_count/flat_buf->count) +
flat_buf->indices[k]);
new_bwr_size = flat_buf->blocklens[k];
if (size != fwr_size) {
off += size;
new_fwr_size -= size;
}
}
num += size;
fwr_size = new_fwr_size;
bwr_size = new_bwr_size;
}
}
/* write the buffer out finally */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
err = write(fd->fd_sys, writebuf, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
if (!(fd->atomicity))
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
if (err == -1) err_flag = 1;
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
if (err_flag) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}
fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
fn_exit:
if (writebuf != NULL) ADIOI_Free(writebuf);
return;
}

Просмотреть файл

@ -0,0 +1,38 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
/* Dispatch table binding the generic ADIO file-system interface to the
 * NTFS (Win32) implementations.  Entries named ADIOI_GEN_* fall back to
 * the generic, file-system-independent routines; ADIOI_FAKE_* entries
 * emulate nonblocking strided I/O using the blocking versions; and
 * ADIOI_FAILSAFE_OpenColl provides the conservative collective open. */
struct ADIOI_Fns_struct ADIO_NTFS_operations = {
    ADIOI_NTFS_Open, /* Open */
    ADIOI_FAILSAFE_OpenColl, /* OpenColl */
    ADIOI_NTFS_ReadContig, /* ReadContig */
    ADIOI_NTFS_WriteContig, /* WriteContig */
    ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
    ADIOI_GEN_SeekIndividual, /* SeekIndividual */
    ADIOI_NTFS_Fcntl, /* Fcntl */
    ADIOI_GEN_SetInfo, /* SetInfo */
    ADIOI_GEN_ReadStrided, /* ReadStrided */
    ADIOI_GEN_WriteStrided, /* WriteStrided */
    ADIOI_NTFS_Close, /* Close */
    ADIOI_NTFS_IreadContig, /* IreadContig */
    ADIOI_NTFS_IwriteContig, /* IwriteContig */
    ADIOI_NTFS_ReadDone, /* ReadDone */
    ADIOI_NTFS_WriteDone, /* WriteDone */
    ADIOI_NTFS_ReadComplete, /* ReadComplete */
    ADIOI_NTFS_WriteComplete, /* WriteComplete */
    ADIOI_FAKE_IreadStrided, /* IreadStrided */
    ADIOI_FAKE_IwriteStrided, /* IwriteStrided */
    ADIOI_NTFS_Flush, /* Flush */
    ADIOI_NTFS_Resize, /* Resize */
    ADIOI_GEN_Delete, /* Delete */
    ADIOI_NTFS_Feature /* Features */
};

Просмотреть файл

@ -0,0 +1,68 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_NTFS_INCLUDE
#define AD_NTFS_INCLUDE

#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"

#ifdef HAVE_INT64
/* Helpers for splitting a 64-bit file offset into the low/high DWORD
 * pair used by Win32 (OVERLAPPED, SetFilePointer) and for rebuilding a
 * 64-bit offset from such a pair.  Every macro argument is parenthesized
 * so expression operands expand safely. */
#define DWORDLOW(x)       ( (DWORD) ( (x) & (__int64) 0xFFFFFFFF ) )
#define DWORDHIGH(x)      ( (DWORD) ( ((x) >> 32) & (__int64) 0xFFFFFFFF ) )
/* Bug fix: the previous expansion contained "(__int64 x)", which is not
 * a valid cast expression and failed to compile wherever the macro was
 * instantiated; cast each operand separately instead. */
#define DWORDTOINT64(x,y) ( (__int64) ( ( ((__int64) (x)) << 32 ) + (__int64) (y) ) )
#else
/* Without 64-bit support, offsets fit in 32 bits: the high half is 0. */
#define DWORDLOW(x)       (x)
#define DWORDHIGH(x)      0
#define DWORDTOINT64(x,y) (x)
#endif

/* Common async helper: start an overlapped read (wr==0) or write (wr==1)
 * and hand back a generalized request through 'handle'. */
int ADIOI_NTFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
                   int wr, void *handle);

void ADIOI_NTFS_Open(ADIO_File fd, int *error_code);
void ADIOI_NTFS_Close(ADIO_File fd, int *error_code);
void ADIOI_NTFS_ReadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int
                           *error_code);
void ADIOI_NTFS_WriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status, int
                            *error_code);
void ADIOI_NTFS_IwriteContig(ADIO_File fd, void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Request *request, int
                             *error_code);
void ADIOI_NTFS_IreadContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Request *request, int
                            *error_code);
int ADIOI_NTFS_ReadDone(ADIO_Request *request, ADIO_Status *status, int
                        *error_code);
int ADIOI_NTFS_WriteDone(ADIO_Request *request, ADIO_Status *status, int
                         *error_code);
void ADIOI_NTFS_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
                             *error_code);
void ADIOI_NTFS_WriteComplete(ADIO_Request *request, ADIO_Status *status,
                              int *error_code);
void ADIOI_NTFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
                      *error_code);
void ADIOI_NTFS_IwriteStrided(ADIO_File fd, void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Request *request, int
                              *error_code);
void ADIOI_NTFS_Flush(ADIO_File fd, int *error_code);
void ADIOI_NTFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);

/* Buffer size used for FormatMessage output in ADIOI_NTFS_Strerror. */
#define FORMAT_MESSAGE_MIN_SIZE 100
#define ADIOI_NTFS_ERR_MSG_MAX FORMAT_MESSAGE_MIN_SIZE
/* Fill errMsg (capacity errMsgLen) with the text for Win32 code 'error'. */
void ADIOI_NTFS_Strerror(int error, char *errMsg, int errMsgLen);

#endif

Просмотреть файл

@ -0,0 +1,30 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
/* Close the Win32 file handle backing fd.  On failure, *error_code is
 * set to an MPI I/O error built from the Win32 error text. */
void ADIOI_NTFS_Close(ADIO_File fd, int *error_code)
{
    static char myname[] = "ADIOI_NTFS_Close";

    /* CloseHandle() returns FALSE on failure, with the reason available
     * from GetLastError(). */
    if (CloseHandle(fd->fd_sys) == FALSE)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        int winerr = GetLastError();

        ADIOI_NTFS_Strerror(winerr, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", errMsg);
        return;
    }
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,20 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
/* Completion-test entry point for nonblocking reads.  The NTFS driver
 * tracks completion through MPI generalized requests instead, so this
 * stub always returns 0 and leaves status/*error_code untouched. */
int ADIOI_NTFS_ReadDone(ADIO_Request *request, ADIO_Status *status,
                        int *error_code)
{
    return 0;
}
/* Completion-test entry point for nonblocking writes.  Like ReadDone,
 * this is a stub: completion is handled by the generalized-request
 * machinery, so it always returns 0 without touching its arguments. */
int ADIOI_NTFS_WriteDone(ADIO_Request *request, ADIO_Status *status,
                         int *error_code)
{
    return 0;
}

Просмотреть файл

@ -0,0 +1,76 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
#include "adio_extern.h"
/* Implement the ADIO fcntl operations for the NTFS driver:
 *   ADIO_FCNTL_GET_FSIZE     - query the current file size,
 *   ADIO_FCNTL_SET_DISKSPACE - preallocate (delegated to generic code),
 *   ADIO_FCNTL_SET_ATOMICITY - toggle atomic-mode flag.
 * Any other flag produces an MPI_ERR_ARG error. */
void ADIOI_NTFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code)
{
    DWORD err;
    LONG dwTemp;
    static char myname[] = "ADIOI_NTFS_FCNTL";
    switch(flag)
    {
    case ADIO_FCNTL_GET_FSIZE:
        /* Seek to the end of the file; the returned position is the size.
         * NOTE(review): lpDistanceToMoveHigh is NULL here, so only the
         * low 32 bits of the size are returned -- fsize is truncated for
         * files of 4 GB or more.  Confirm whether large files matter. */
        fcntl_struct->fsize = SetFilePointer(fd->fd_sys, 0, 0, FILE_END);
        /* If a system file position is cached, restore it so the seek
         * above is not observable to subsequent I/O. */
        if (fd->fp_sys_posn != -1)
        {
            dwTemp = DWORDHIGH(fd->fp_sys_posn);
            if (SetFilePointer(fd->fd_sys, DWORDLOW(fd->fp_sys_posn), &dwTemp, FILE_BEGIN) == INVALID_SET_FILE_POINTER)
            {
                char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
                err = GetLastError();
                ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
                /* INVALID_SET_FILE_POINTER can also be the legal low
                 * DWORD of a large 64-bit position, so it is an error
                 * only when GetLastError() reports one. */
                if (err != NO_ERROR)
                {
                    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                       myname, __LINE__, MPI_ERR_IO,
                                                       "**io", "**io %s", errMsg);
                    return;
                }
            }
        }
        /* --BEGIN ERROR HANDLING-- */
        if (fcntl_struct->fsize == INVALID_SET_FILE_POINTER)
        {
            /* NOTE(review): GetLastError() here may already have been
             * overwritten by the restore-seek above -- verify. */
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            dwTemp = GetLastError();
            ADIOI_NTFS_Strerror(dwTemp, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", errMsg);
            return;
        }
        /* --END ERROR HANDLING-- */
        *error_code = MPI_SUCCESS;
        break;
    case ADIO_FCNTL_SET_DISKSPACE:
        /* Preallocation is file-system independent; use the generic code. */
        ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
        break;
    case ADIO_FCNTL_SET_ATOMICITY:
        /* Normalize the requested atomicity to 0/1. */
        fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
        *error_code = MPI_SUCCESS;
        /*
        fd->atomicity = 0;
        *error_code = MPI_ERR_UNSUPPORTED_OPERATION;
        */
        break;
    default:
        /* --BEGIN ERROR HANDLING-- */
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_ARG,
                                           "**flag", "**flag %d", flag);
        return;
        /* --END ERROR HANDLING-- */
    }
}

Просмотреть файл

@ -0,0 +1,26 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "adio.h"
/* Report which optional ADIO capabilities the NTFS driver provides.
 * Returns 1 for supported features, 0 for everything else. */
int ADIOI_NTFS_Feature(ADIO_File fd, int flag)
{
    /* Supported: byte-range locks, shared file pointers, atomic mode,
     * and data sieving for writes. */
    if (flag == ADIO_LOCKS ||
        flag == ADIO_SHARED_FP ||
        flag == ADIO_ATOMIC_MODE ||
        flag == ADIO_DATA_SIEVING_WRITES)
    {
        return 1;
    }
    /* Unsupported (ADIO_SCALABLE_OPEN, ADIO_UNLINK_AFTER_CLOSE, ...). */
    return 0;
}

Просмотреть файл

@ -0,0 +1,32 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
/* Flush buffered writes for fd to disk.  Read-only files have nothing
 * to flush and are treated as an immediate success. */
void ADIOI_NTFS_Flush(ADIO_File fd, int *error_code)
{
    static char myname[] = "ADIOI_NTFS_Flush";
    int ok;

    ok = (fd->access_mode & ADIO_RDONLY) ? TRUE :
         FlushFileBuffers(fd->fd_sys);

    /* --BEGIN ERROR HANDLING-- */
    if (ok == FALSE)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        int winerr = GetLastError();

        ADIOI_NTFS_Strerror(winerr, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", errMsg);
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,42 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
/* Start a nonblocking contiguous read of count elements of datatype at
 * offset (or at the individual file pointer), returning a generalized
 * request through *request. */
void ADIOI_NTFS_IreadContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Request *request, int *error_code)
{
    static char myname[] = "ADIOI_NTFS_IreadContig";
    MPI_Count typesize, len;
    int err;

    MPI_Type_size_x(datatype, &typesize);
    len = count * typesize;

    /* Individual file pointer: the caller-supplied offset is ignored. */
    if (file_ptr_type == ADIO_INDIVIDUAL)
        offset = fd->fp_ind;

    err = ADIOI_NTFS_aio(fd, buf, len, offset, 0, request);

    if (file_ptr_type == ADIO_INDIVIDUAL)
        fd->fp_ind += len;

    /* --BEGIN ERROR HANDLING-- */
    if (err != MPI_SUCCESS)
    {
        *error_code = MPIO_Err_create_code(err, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", 0);
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
    fd->fp_sys_posn = -1; /* system file position no longer known */
}

Просмотреть файл

@ -0,0 +1,303 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
#include "../../mpi-io/mpioimpl.h"
#include "../../mpi-io/mpioprof.h"
#include "mpiu_greq.h"
static MPIX_Grequest_class ADIOI_NTFS_greq_class = 0;
/* Fills the input buffer, errMsg, with the error message
corresponding to error code, error */
/* Fill the caller-supplied buffer, errMsg (capacity errMsgLen bytes),
 * with the Win32 message text for 'error'.  On lookup failure the
 * buffer becomes the empty string.  The result is always
 * NUL-terminated (provided errMsgLen > 0). */
void ADIOI_NTFS_Strerror(int error, char *errMsg, int errMsgLen)
{
    LPTSTR str;
    int num_bytes;

    /* Nothing sensible can be written into an empty/absent buffer. */
    if (errMsg == NULL || errMsgLen <= 0)
        return;

    num_bytes = FormatMessage(
        FORMAT_MESSAGE_FROM_SYSTEM |
        FORMAT_MESSAGE_ALLOCATE_BUFFER,
        NULL,
        error,
        0,
        &str,
        FORMAT_MESSAGE_MIN_SIZE,
        0);
    if (num_bytes == 0)
    {
        errMsg[0] = '\0';
    }
    else
    {
        /* Bug fix: strncpy does not NUL-terminate when the source is at
         * least as long as the destination, which left errMsg unbounded
         * for long system messages; terminate explicitly. */
        strncpy(errMsg, str, errMsgLen - 1);
        errMsg[errMsgLen - 1] = '\0';
        LocalFree(str);
    }
}
/* poll for completion of a single outstanding AIO request */
/* poll_fn of the NTFS generalized-request class: nonblocking check for
 * completion of a single outstanding AIO request.  When the overlapped
 * operation has finished, the generalized request is marked complete. */
int ADIOI_NTFS_aio_poll_fn(void *extra_state, MPI_Status *status)
{
    ADIOI_AIO_Request *aio_req;
    int mpi_errno = MPI_SUCCESS;

    /* FIXME: Validate the args -- has it already been done by the
       caller ? */
    aio_req = (ADIOI_AIO_Request *)extra_state;

    /* bWait == FALSE: return immediately whether or not the transfer is
     * done; the byte count lands in aio_req->nbytes on completion. */
    if(!GetOverlappedResult( aio_req->fd, aio_req->lpOvl,
                             &(aio_req->nbytes), FALSE)){
        if(GetLastError() == ERROR_IO_INCOMPLETE){
            /* IO in progress */
            /* TODO: need to diddle with status somehow */
        }else{
            /* Error occured */
            /* TODO: unsure how to handle this */
        }
    }else{
        /* Transfer finished; tell MPI the generalized request is done. */
        mpi_errno = MPI_Grequest_complete(aio_req->req);
        if (mpi_errno != MPI_SUCCESS) {
            mpi_errno = MPIO_Err_create_code(MPI_SUCCESS,
                                             MPIR_ERR_RECOVERABLE,
                                             "ADIOI_NTFS_aio_poll_fn", __LINE__,
                                             MPI_ERR_IO, "**mpi_grequest_complete",
                                             0);
        }
    }
    return mpi_errno;
}
/* Wait for completion of one of the outstanding AIO requests */
/* wait_fn of the NTFS generalized-request class: block until at least
 * one of 'count' outstanding AIO requests completes (or the timeout
 * expires), then mark that request complete. */
int ADIOI_NTFS_aio_wait_fn(int count, void **array_of_states,
                           double timeout, MPI_Status *status)
{
    int i, mpi_errno = MPI_SUCCESS;
    ADIOI_AIO_Request **aio_reqlist;
    LPHANDLE lpHandles;
    DWORD retObject=0;

    /* FIXME: Validate the args -- has it already been done by the
       caller ? */
    aio_reqlist = (ADIOI_AIO_Request **)array_of_states;

    lpHandles = (LPHANDLE) ADIOI_Calloc(count, sizeof(HANDLE));
    if (lpHandles == NULL)
    {
        mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                         "ADIOI_NTFS_aio_wait_fn", __LINE__, MPI_ERR_IO,
                                         "**nomem", "**nomem %s", "Event handles");
        return mpi_errno;
    }

    /* Gather the completion-event handle of every outstanding request. */
    for(i=0; i<count; i++){
        lpHandles[i] = (aio_reqlist[i])->lpOvl->hEvent;
    }

    /* XXX: wait for one request to complete */
    /* FIXME: Is the timeout in seconds ? */
    /* Non-positive timeout means wait forever.  NOTE(review): the double
     * result is implicitly narrowed to DWORD milliseconds when passed to
     * WaitForMultipleObjects -- confirm units and conversion. */
    timeout = (timeout <= 0) ? INFINITE : (timeout * 1000);

    if((retObject = WaitForMultipleObjects(count, lpHandles,
                                           FALSE, timeout)) != WAIT_FAILED){
        /* Convert the wait result into an index into aio_reqlist. */
        retObject = retObject - WAIT_OBJECT_0;
        if(GetOverlappedResult( aio_reqlist[retObject]->fd,
                                aio_reqlist[retObject]->lpOvl, &(aio_reqlist[retObject]->nbytes),
                                FALSE)){
            /* Transfer finished; mark the generalized request complete. */
            mpi_errno = MPI_Grequest_complete(aio_reqlist[retObject]->req);
            if (mpi_errno != MPI_SUCCESS) {
                mpi_errno = MPIO_Err_create_code(MPI_SUCCESS,
                                                 MPIR_ERR_RECOVERABLE,
                                                 "ADIOI_NTFS_aio_wait_fn", __LINE__,
                                                 MPI_ERR_IO, "**mpi_grequest_complete",
                                                 0);
            }
        }else{
            if(GetLastError() == ERROR_IO_INCOMPLETE){
                /* IO in progress */
                /* TODO: need to diddle with status somehow */
            }else{
                /* Error occured */
                /* TODO: not sure how to handle this */
            }
        }
    }else{
        /* TODO: How to handle error while waiting ? */
    }

    ADIOI_Free(lpHandles);
    return mpi_errno;
}
/* query_fn of the NTFS generalized-request class: populate the MPI
 * status for a completed AIO request.  Never fails. */
int ADIOI_NTFS_aio_query_fn(void *extra_state, MPI_Status *status)
{
    ADIOI_AIO_Request *req = (ADIOI_AIO_Request *) extra_state;

    /* Report the number of bytes transferred by this request. */
    MPI_Status_set_elements(status, MPI_BYTE, req->nbytes);

    /* These requests can never be cancelled, so always report false. */
    MPI_Status_set_cancelled(status, 0);

    /* Source and tag carry no meaning for a generalized I/O request. */
    status->MPI_SOURCE = MPI_UNDEFINED;
    status->MPI_TAG = MPI_UNDEFINED;

    return MPI_SUCCESS;
}
/* free_fn of the NTFS generalized-request class: release the per-request
 * state once MPI is done with the request.  The event handle must be
 * closed before the OVERLAPPED structure that owns it is freed. */
int ADIOI_NTFS_aio_free_fn(void *extra_state)
{
    ADIOI_AIO_Request *aio_req;
    /* FIXME: Validate the args -- has it already been done by the
       caller ? */
    aio_req = (ADIOI_AIO_Request*)extra_state;

    CloseHandle(aio_req->lpOvl->hEvent);
    ADIOI_Free(aio_req->lpOvl);
    ADIOI_Free(aio_req);
    return MPI_SUCCESS;
}
/* Start a nonblocking contiguous write of count elements of datatype at
 * offset (or at the individual file pointer), returning a generalized
 * request through *request. */
void ADIOI_NTFS_IwriteContig(ADIO_File fd, void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Request *request,
                             int *error_code)
{
    static char myname[] = "ADIOI_NTFS_IwriteContig";
    MPI_Count typesize, len;
    int err;

    MPI_Type_size_x(datatype, &typesize);
    len = count * typesize;

    /* Individual file pointer: the caller-supplied offset is ignored. */
    if (file_ptr_type == ADIO_INDIVIDUAL)
        offset = fd->fp_ind;

    err = ADIOI_NTFS_aio(fd, buf, len, offset, 1, request);

    if (file_ptr_type == ADIO_INDIVIDUAL)
        fd->fp_ind += len;

    /* --BEGIN ERROR HANDLING-- */
    if (err != MPI_SUCCESS)
    {
        *error_code = MPIO_Err_create_code(err, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", 0);
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
    fd->fp_sys_posn = -1; /* system file position no longer known */
}
/* This function is for implementation convenience. It is not user-visible.
* If wr==1 write, wr==0 read.
*
* Returns MPI_SUCCESS on success, mpi_errno on failure.
*/
/* Start an overlapped (asynchronous) transfer on fd and return an MPI
 * generalized request that tracks it.  wr==1 writes, wr==0 reads.  This
 * function is for implementation convenience; it is not user-visible.
 *
 * Returns MPI_SUCCESS on success, an MPI error code on failure. */
int ADIOI_NTFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
                   int wr, MPI_Request *request)
{
    static char myname[] = "ADIOI_NTFS_aio";
    ADIOI_AIO_Request *aio_req;
    /* Deliberately static: with overlapped I/O the transfer count may be
     * stored asynchronously, after this function has returned, so these
     * must not live on the stack.  They are never read here; completion
     * counts come from GetOverlappedResult() later. */
    static DWORD dwNumWritten, dwNumRead;
    BOOL ret_val = FALSE;
    FDTYPE fd_sys;
    int mpi_errno = MPI_SUCCESS;
    DWORD err;

    fd_sys = fd->fd_sys;

    /* Per-request bookkeeping: an OVERLAPPED block plus a manual-reset
     * event that Windows signals on completion. */
    aio_req = (ADIOI_AIO_Request *)ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);
    if (aio_req == NULL)
    {
        mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                         myname, __LINE__, MPI_ERR_IO,
                                         "**nomem", "**nomem %s", "AIO_REQ");
        return mpi_errno;
    }
    aio_req->lpOvl = (LPOVERLAPPED ) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
    if (aio_req->lpOvl == NULL)
    {
        mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                         myname, __LINE__, MPI_ERR_IO,
                                         "**nomem", "**nomem %s", "OVERLAPPED");
        ADIOI_Free(aio_req);
        return mpi_errno;
    }
    aio_req->lpOvl->hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
    if (aio_req->lpOvl->hEvent == NULL)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        err = GetLastError();
        ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                         myname, __LINE__, MPI_ERR_IO,
                                         "**io", "**io %s", errMsg);
        ADIOI_Free(aio_req->lpOvl);
        ADIOI_Free(aio_req);
        return mpi_errno;
    }
    aio_req->lpOvl->Offset = DWORDLOW(offset);
    aio_req->lpOvl->OffsetHigh = DWORDHIGH(offset);
    aio_req->fd = fd_sys;

    /* Initiate the asynchronous transfer. */
    if (wr)
    {
        ret_val = WriteFile(fd_sys, buf, len, &dwNumWritten, aio_req->lpOvl);
    }
    else
    {
        ret_val = ReadFile(fd_sys, buf, len, &dwNumRead, aio_req->lpOvl);
    }

    /* --BEGIN ERROR HANDLING-- */
    if (ret_val == FALSE)
    {
        mpi_errno = GetLastError();
        if (mpi_errno != ERROR_IO_PENDING)
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            ADIOI_NTFS_Strerror(mpi_errno, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                             myname, __LINE__, MPI_ERR_IO,
                                             "**io",
                                             "**io %s", errMsg);
            /* Bug fix: release the request state allocated above; the
             * previous code leaked the event handle, the OVERLAPPED
             * block and the request structure on this path. */
            CloseHandle(aio_req->lpOvl->hEvent);
            ADIOI_Free(aio_req->lpOvl);
            ADIOI_Free(aio_req);
            return mpi_errno;
        }
        /* ERROR_IO_PENDING merely means the operation is in flight. */
        mpi_errno = MPI_SUCCESS;
    }
    /* --END ERROR HANDLING-- */

    /* Lazily create the generalized-request class shared by all NTFS AIO
     * requests, then allocate a request of that class for this transfer. */
    if (ADIOI_NTFS_greq_class == 0) {
        mpi_errno = MPIX_Grequest_class_create(ADIOI_NTFS_aio_query_fn,
                                               ADIOI_NTFS_aio_free_fn, MPIU_Greq_cancel_fn,
                                               ADIOI_NTFS_aio_poll_fn, ADIOI_NTFS_aio_wait_fn,
                                               &ADIOI_NTFS_greq_class);
        if(mpi_errno != MPI_SUCCESS){
            /* FIXME: Pass appropriate error code to user */
        }
    }
    mpi_errno = MPIX_Grequest_class_allocate(ADIOI_NTFS_greq_class, aio_req, request);
    if(mpi_errno != MPI_SUCCESS){
        /* FIXME: Pass appropriate error code to user */
    }
    /* Remember our own request handle so poll/wait can complete it. */
    memcpy(&(aio_req->req), request, sizeof(MPI_Request));
    return mpi_errno;
}

Просмотреть файл

@ -0,0 +1,101 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
/* Open (or create) fd->filename with Win32 CreateFile, translating the
 * ADIO access-mode bits into Windows access, creation-disposition and
 * attribute flags.  On success the handle is stored in fd->fd_sys and
 * *error_code is MPI_SUCCESS; on failure an MPI I/O error is built from
 * the Win32 error text. */
void ADIOI_NTFS_Open(ADIO_File fd, int *error_code)
{
    int err;
    int cmode, amode, attrib;
    static char myname[] = "ADIOI_NTFS_Open";

    /* Defaults: open an existing file. */
    amode = 0;
    cmode = OPEN_EXISTING;
#ifdef USE_WIN_THREADED_IO
    attrib = FILE_FLAG_OVERLAPPED;
#else
    attrib = FILE_ATTRIBUTE_NORMAL;
#endif
    /* Creation disposition: CREATE maps to OPEN_ALWAYS; EXCL overrides
     * it with CREATE_NEW (fails if the file already exists). */
    if (fd->access_mode & ADIO_CREATE)
    {
        cmode = OPEN_ALWAYS;
    }
    if (fd->access_mode & ADIO_EXCL)
    {
        cmode = CREATE_NEW;
    }
    /* Desired access rights. */
    if (fd->access_mode & ADIO_RDONLY)
    {
        amode = GENERIC_READ;
    }
    if (fd->access_mode & ADIO_WRONLY)
    {
        amode = GENERIC_WRITE;
    }
    if (fd->access_mode & ADIO_RDWR)
    {
        amode = GENERIC_READ | GENERIC_WRITE;
    }
    /* Extra attribute flags derived from the ADIO mode. */
    if (fd->access_mode & ADIO_DELETE_ON_CLOSE)
    {
        attrib = attrib | FILE_FLAG_DELETE_ON_CLOSE;
    }
    if (fd->access_mode & ADIO_SEQUENTIAL)
    {
        attrib = attrib | FILE_FLAG_SEQUENTIAL_SCAN;
    }
    else
    {
        attrib = attrib | FILE_FLAG_RANDOM_ACCESS;
    }
    fd->fd_sys = CreateFile(fd->filename,
                            amode,
                            FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
                            NULL,
                            cmode,
                            attrib,
                            NULL);
    fd->fd_direct = -1;
    /* In append mode, position the individual file pointer at the end of
     * the (successfully opened) file.
     * NOTE(review): SetFilePointer with a NULL high-DWORD pointer
     * returns only the low 32 bits of the position, so fp_ind may be
     * truncated when appending to files >= 4 GB -- confirm. */
    if ((fd->fd_sys != INVALID_HANDLE_VALUE) && (fd->access_mode & ADIO_APPEND))
    {
        fd->fp_ind = fd->fp_sys_posn = SetFilePointer(fd->fd_sys, 0, NULL, FILE_END);
        if (fd->fp_ind == INVALID_SET_FILE_POINTER)
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            err = GetLastError();
            ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            /* INVALID_SET_FILE_POINTER is only an error if
             * GetLastError() reports one. */
            if (err != NO_ERROR)
            {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_IO,
                                                   "**io", "**io %s", errMsg);
                return;
            }
        }
    }
    /* --BEGIN ERROR HANDLING-- */
    if (fd->fd_sys == INVALID_HANDLE_VALUE)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        err = GetLastError();
        ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", errMsg);
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,259 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
/* Blocking contiguous read of count elements of datatype into buf,
 * either at an explicit offset or at the individual file pointer.
 * Implemented with an overlapped ReadFile followed by a blocking
 * GetOverlappedResult.  EOF conditions from Windows are tolerated (a
 * short read is reported through status, not as an error).  Note: the
 * variable 'err' holds both BOOL results and Win32 error codes at
 * different points. */
void ADIOI_NTFS_ReadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status,
                           int *error_code)
{
    LONG dwTemp;
    DWORD dwNumRead = 0;
    int err=-1;
    MPI_Count datatype_size, len;
    static char myname[] = "ADIOI_NTFS_ReadContig";
    OVERLAPPED *pOvl;

    /* If file pointer is of type ADIO_INDIVIDUAL ignore the offset
       and use the current location of file pointer */
    if(file_ptr_type == ADIO_INDIVIDUAL){
        offset = fd->fp_ind;
    }

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

    /* One OVERLAPPED per call; its event starts signaled (manual-reset,
     * initial state TRUE). */
    pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
    if (pOvl == NULL)
    {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**nomem", "**nomem %s", "OVERLAPPED");
        return;
    }
    pOvl->hEvent = CreateEvent(NULL, TRUE, TRUE, NULL);
    if (pOvl->hEvent == NULL)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        err = GetLastError();
        ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", errMsg);
        ADIOI_Free(pOvl);
        return;
    }
    pOvl->Offset = DWORDLOW(offset);
    pOvl->OffsetHigh = DWORDHIGH(offset);

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET)
    {
        /* Seek only if the cached system position differs from the
         * requested offset. */
        if (fd->fp_sys_posn != offset)
        {
            dwTemp = DWORDHIGH(offset);
            if (SetFilePointer(fd->fd_sys, DWORDLOW(offset), &dwTemp, FILE_BEGIN) == INVALID_SET_FILE_POINTER)
            {
                char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
                err = GetLastError();
                ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
                /* INVALID_SET_FILE_POINTER is an error only when
                 * GetLastError() reports one. */
                if (err != NO_ERROR)
                {
                    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                       myname, __LINE__, MPI_ERR_IO,
                                                       "**io", "**io %s", errMsg);
                    CloseHandle(pOvl->hEvent);
                    ADIOI_Free(pOvl);
                    return;
                }
            }
        }
        /*
        {
        ADIO_Fcntl_t fcntl_struct;
        int error_code;
        ADIO_Fcntl(fd, ADIO_FCNTL_GET_FSIZE, &fcntl_struct, &error_code);
        printf("File size b: %d\n", fcntl_struct.fsize);
        }
        printf("ReadFile(%d bytes)\n", len);fflush(stdout);
        */
        /* Start the overlapped read; completion is collected below. */
        err = ReadFile(fd->fd_sys, buf, len, &dwNumRead, pOvl);
        /* --BEGIN ERROR HANDLING-- */
        if (err == FALSE)
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            err = GetLastError();
            ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            switch (err)
            {
            case ERROR_IO_PENDING:
                /* Normal overlapped case: I/O is in flight. */
                break;
            case ERROR_HANDLE_EOF:
                /*printf("EOF error\n");fflush(stdout);*/
                /* Reading at/after EOF: signal the event ourselves so
                 * the wait below does not block forever. */
                SetEvent(pOvl->hEvent);
                break;
            default:
                *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_IO,
                                                   "**io",
                                                   "**io %s", errMsg);
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
        /* --END ERROR HANDLING-- */
        /* Block (bWait == TRUE) until the transfer finishes; dwNumRead
         * receives the byte count. */
        err = GetOverlappedResult(fd->fd_sys, pOvl, &dwNumRead, TRUE);
        /* --BEGIN ERROR HANDLING-- */
        if (err == FALSE)
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            err = GetLastError();
            ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            if (err != ERROR_HANDLE_EOF) /* Ignore EOF errors */
            {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE, myname,
                                                   __LINE__, MPI_ERR_IO, "**io",
                                                   "**io %s", errMsg);
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
        /* --END ERROR HANDLING-- */
        if (!CloseHandle(pOvl->hEvent))
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            err = GetLastError();
            ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io", "**io %s", errMsg);
            /* NOTE(review): CloseHandle already failed above, yet it is
             * called again here (the ADIO_INDIVIDUAL branch below does
             * not); looks like a copy/paste slip -- confirm. */
            CloseHandle(pOvl->hEvent);
            ADIOI_Free(pOvl);
            return;
        }
        ADIOI_Free(pOvl);

        fd->fp_sys_posn = offset + (ADIO_Offset)dwNumRead;
        /* individual file pointer not updated */
    }
    else
    {
        /* read from curr. location of ind. file pointer */
        /* Seek only if the cached system position differs from fp_ind. */
        if (fd->fp_sys_posn != fd->fp_ind)
        {
            dwTemp = DWORDHIGH(fd->fp_ind);
            if (SetFilePointer(fd->fd_sys, DWORDLOW(fd->fp_ind), &dwTemp, FILE_BEGIN) == INVALID_SET_FILE_POINTER)
            {
                char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
                err = GetLastError();
                ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
                if (err != NO_ERROR)
                {
                    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                       myname, __LINE__, MPI_ERR_IO,
                                                       "**io", "**io %s", errMsg);
                    CloseHandle(pOvl->hEvent);
                    ADIOI_Free(pOvl);
                    return;
                }
            }
        }
        /*
        {
        ADIO_Fcntl_t fcntl_struct;
        int error_code;
        ADIO_Fcntl(fd, ADIO_FCNTL_GET_FSIZE, &fcntl_struct, &error_code);
        printf("File size c: %d\n", fcntl_struct.fsize);
        }
        printf("ReadFile(%d bytes)\n", len);fflush(stdout);
        */
        /* Start the overlapped read; completion is collected below. */
        err = ReadFile(fd->fd_sys, buf, len, &dwNumRead, pOvl);
        /* --BEGIN ERROR HANDLING-- */
        if (err == FALSE)
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            err = GetLastError();
            ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            switch (err)
            {
            case ERROR_IO_PENDING:
                /* Normal overlapped case: I/O is in flight. */
                break;
            case ERROR_HANDLE_EOF:
                /*printf("EOF error\n");fflush(stdout);*/
                /* Reading at/after EOF: signal the event ourselves so
                 * the wait below does not block forever. */
                SetEvent(pOvl->hEvent);
                break;
            default:
                *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_IO,
                                                   "**io",
                                                   "**io %s", errMsg);
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
        /* --END ERROR HANDLING-- */
        /* Block until the transfer finishes. */
        err = GetOverlappedResult(fd->fd_sys, pOvl, &dwNumRead, TRUE);
        /* --BEGIN ERROR HANDLING-- */
        if (err == FALSE)
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            err = GetLastError();
            ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            if (err != ERROR_HANDLE_EOF) /* Ignore EOF errors */
            {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE, myname,
                                                   __LINE__, MPI_ERR_IO, "**io",
                                                   "**io %s", errMsg);
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
        /* --END ERROR HANDLING-- */
        if (!CloseHandle(pOvl->hEvent))
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            err = GetLastError();
            ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io", "**io %s", errMsg);
            ADIOI_Free(pOvl);
            return;
        }
        ADIOI_Free(pOvl);

        /* Advance the individual file pointer past the bytes read and
         * keep the cached system position in sync. */
        fd->fp_ind = fd->fp_ind + (ADIO_Offset)dwNumRead;
        fd->fp_sys_posn = fd->fp_ind;
    }

#ifdef HAVE_STATUS_SET_BYTES
    if (err != FALSE)
    {
        MPIR_Status_set_bytes(status, datatype, dwNumRead);
    }
#endif

    /* --BEGIN ERROR HANDLING-- */
    /* err can still be FALSE here only for the tolerated EOF case above;
     * this block then reports it. */
    if (err == FALSE)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        err = GetLastError();
        ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", errMsg);
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,51 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
/* Resize the file to 'size' bytes: seek to the requested position
 * (64-bit, via the low/high DWORD pair) and truncate or extend the file
 * there with SetEndOfFile. */
void ADIOI_NTFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
{
    static char myname[] = "ADIOI_NTFS_Resize";
    LONG posHigh;
    DWORD ret;

    posHigh = DWORDHIGH(size);
    ret = SetFilePointer(fd->fd_sys, DWORDLOW(size), &posHigh, FILE_BEGIN);
    /* --BEGIN ERROR HANDLING-- */
    if (ret == INVALID_SET_FILE_POINTER)
    {
        /* INVALID_SET_FILE_POINTER can be a legal low DWORD of a large
         * 64-bit position; only GetLastError() decides failure. */
        DWORD winerr = GetLastError();
        if (winerr != NO_ERROR)
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            ADIOI_NTFS_Strerror(winerr, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io",
                                               "**io %s", errMsg);
            return;
        }
    }
    /*printf("setting file length to %d\n", size);fflush(stdout);*/
    /* --END ERROR HANDLING-- */

    /* --BEGIN ERROR HANDLING-- */
    if (SetEndOfFile(fd->fd_sys) == FALSE)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        DWORD winerr = GetLastError();

        ADIOI_NTFS_Strerror(winerr, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", errMsg);
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,20 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
void ADIOI_NTFS_ReadComplete(ADIO_Request *request, ADIO_Status *status,
                             int *error_code)
{
    /* Intentionally a no-op. */
}
void ADIOI_NTFS_WriteComplete(ADIO_Request *request, ADIO_Status *status,
                              int *error_code)
{
    /* Intentionally a no-op. */
}

Просмотреть файл

@ -0,0 +1,222 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_ntfs.h"
void ADIOI_NTFS_WriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status,
                            int *error_code)
{
    static char myname[] = "ADIOI_NTFS_WriteContig";
    LONG dwTemp;
    DWORD dwNumWritten = 0;
    MPI_Count err = -1, datatype_size, len;
    OVERLAPPED *pOvl;

    /* In ADIO_INDIVIDUAL mode the caller-supplied offset is ignored and the
     * current individual file pointer is used instead.  From here on,
     * 'offset' holds the target position for *either* mode, which lets a
     * single code path replace the two duplicated branches of the original
     * implementation (they differed only in their final pointer updates). */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind;
    }

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

    /* The write goes through an OVERLAPPED structure so the explicit file
     * offset can be handed to WriteFile. */
    pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
    if (pOvl == NULL)
    {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**nomem", "**nomem %s", "OVERLAPPED");
        return;
    }
    pOvl->hEvent = CreateEvent(NULL, TRUE, TRUE, NULL);
    if (pOvl->hEvent == NULL)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        err = GetLastError();
        ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", errMsg);
        ADIOI_Free(pOvl);
        return;
    }
    pOvl->Offset = DWORDLOW(offset);
    pOvl->OffsetHigh = DWORDHIGH(offset);

    /* Seek only when the OS file position is not already where we want it. */
    if (fd->fp_sys_posn != offset)
    {
        dwTemp = DWORDHIGH(offset);
        if (SetFilePointer(fd->fd_sys, DWORDLOW(offset), &dwTemp, FILE_BEGIN) == INVALID_SET_FILE_POINTER)
        {
            /* INVALID_SET_FILE_POINTER can be a legitimate low dword of a
             * large offset; only GetLastError() != NO_ERROR is a failure. */
            err = GetLastError();
            if (err != NO_ERROR)
            {
                char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
                ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
                *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_IO,
                                                   "**io", "**io %s", errMsg);
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
    }
    /*printf("WriteFile(%d bytes)\n", len);fflush(stdout);*/
    err = WriteFile(fd->fd_sys, buf, len, &dwNumWritten, pOvl);
    /* --BEGIN ERROR HANDLING-- */
    if (err == FALSE)
    {
        /* ERROR_IO_PENDING simply means the overlapped write is in flight;
         * anything else is a hard failure. */
        err = GetLastError();
        if (err != ERROR_IO_PENDING)
        {
            char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
            ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io",
                                               "**io %s", errMsg);
            CloseHandle(pOvl->hEvent);
            ADIOI_Free(pOvl);
            return;
        }
    }
    /* --END ERROR HANDLING-- */
    /* Block until the overlapped write completes. */
    err = GetOverlappedResult(fd->fd_sys, pOvl, &dwNumWritten, TRUE);
    /* --BEGIN ERROR HANDLING-- */
    if (err == FALSE)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        err = GetLastError();
        ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE, myname,
                                           __LINE__, MPI_ERR_IO, "**io",
                                           "**io %s", errMsg);
        CloseHandle(pOvl->hEvent);
        ADIOI_Free(pOvl);
        return;
    }
    /* --END ERROR HANDLING-- */
    if (!CloseHandle(pOvl->hEvent))
    {
        /* BUGFIX: the explicit-offset branch of the original code called
         * CloseHandle() a second time here on the very handle whose close
         * had just failed; that redundant call has been removed (matching
         * what the individual-pointer branch already did). */
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        err = GetLastError();
        ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", errMsg);
        ADIOI_Free(pOvl);
        return;
    }
    ADIOI_Free(pOvl);

    /* Update file-position bookkeeping for the mode in use. */
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET)
    {
        fd->fp_sys_posn = offset + dwNumWritten;
        /* individual file pointer not updated */
    }
    else
    {
        fd->fp_ind = fd->fp_ind + dwNumWritten;
        fd->fp_sys_posn = fd->fp_ind;
    }
#ifdef HAVE_STATUS_SET_BYTES
    if (err != FALSE)
    {
        MPIR_Status_set_bytes(status, datatype, dwNumWritten);
    }
#endif
    /* --BEGIN ERROR HANDLING-- */
    if (err == FALSE)
    {
        char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
        err = GetLastError();
        ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", errMsg);
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,21 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
if BUILD_AD_PANFS

## Public header for the PanFS ADIO driver.
noinst_HEADERS += adio/ad_panfs/ad_panfs.h

## Sources compiled only when the PanFS driver is enabled at configure time.
romio_other_sources += \
    adio/ad_panfs/ad_panfs.c \
    adio/ad_panfs/ad_panfs_open.c \
    adio/ad_panfs/ad_panfs_hints.c \
    adio/ad_panfs/ad_panfs_read.c \
    adio/ad_panfs/ad_panfs_resize.c \
    adio/ad_panfs/ad_panfs_write.c

endif BUILD_AD_PANFS

Просмотреть файл

@ -0,0 +1,45 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* ad_panfs.c
*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_panfs.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
/* Dispatch table for the PanFS ADIO driver.  Entries are positional (see
 * ADIOI_Fns_struct in adioi.h), so the order below must not change.
 * PanFS-specific implementations are used for Open, ReadContig,
 * WriteContig, SetInfo and Resize; everything else falls back to the
 * generic (ADIOI_GEN_*) routines. */
struct ADIOI_Fns_struct ADIO_PANFS_operations = {
    ADIOI_PANFS_Open, /* Open */
    ADIOI_GEN_OpenColl, /* OpenColl */
    ADIOI_PANFS_ReadContig, /* ReadContig */
    ADIOI_PANFS_WriteContig, /* WriteContig */
    ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
    ADIOI_GEN_SeekIndividual, /* SeekIndividual */
    ADIOI_GEN_Fcntl, /* Fcntl */
    ADIOI_PANFS_SetInfo, /* SetInfo */
    ADIOI_GEN_ReadStrided, /* ReadStrided */
    ADIOI_GEN_WriteStrided, /* WriteStrided */
    ADIOI_GEN_Close, /* Close */
    /* Nonblocking contiguous I/O needs a working POSIX AIO; otherwise the
     * "fake" (blocking) implementations are substituted. */
#ifdef ROMIO_HAVE_WORKING_AIO
    ADIOI_GEN_IreadContig, /* IreadContig */
    ADIOI_GEN_IwriteContig, /* IwriteContig */
#else
    ADIOI_FAKE_IreadContig, /* IreadContig */
    ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif
    ADIOI_GEN_IODone, /* ReadDone */
    ADIOI_GEN_IODone, /* WriteDone */
    ADIOI_GEN_IOComplete, /* ReadComplete */
    ADIOI_GEN_IOComplete, /* WriteComplete */
    ADIOI_GEN_IreadStrided, /* IreadStrided */
    ADIOI_GEN_IwriteStrided, /* IwriteStrided */
    ADIOI_GEN_Flush, /* Flush */
    ADIOI_PANFS_Resize, /* Resize */
    ADIOI_GEN_Delete, /* Delete */
    ADIOI_GEN_Feature, /* Feature */
    "PANFS: Panasas PanFS"
};

Просмотреть файл

@ -0,0 +1,62 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* ad_panfs.h
*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_PANFS_INCLUDE
#define AD_PANFS_INCLUDE

#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"

/* Async-I/O declarations: Sun has its own header, everyone else gets the
 * POSIX <aio.h>; defining NO_AIO disables the whole lot. */
#ifndef NO_AIO
#ifdef AIO_SUN
#include <sys/asynch.h>
#else
#include <aio.h>
#ifdef NEEDS_ADIOCB_T
typedef struct adiocb adiocb_t;
#endif
#endif
#endif

/* PanFS implementations of the ADIO driver entry points; these are wired
 * into the ADIO_PANFS_operations dispatch table in ad_panfs.c. */
void ADIOI_PANFS_Open(ADIO_File fd, int *error_code);
void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
void ADIOI_PANFS_ReadContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status,
                            int *error_code);
void ADIOI_PANFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
void ADIOI_PANFS_WriteContig(ADIO_File fd, const void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Status *status,
                             int *error_code);

/* TODO: move this to common code and have all routines retry. */
/* TODO: also check for EWOULDBLOCK */
#if defined(NEEDS_USLEEP_DECL)
int usleep(useconds_t usec);
#endif

/* Delay 1 ms between retries of an I/O call that failed with EAGAIN. */
#define AD_PANFS_RETRY_DELAY 1000

/* Re-issue _op_ while it fails with EAGAIN, sleeping between attempts;
 * the final result lands in _rc_.  Gives up if usleep() itself fails.
 * NOTE(review): deliberately a plain braced block rather than the usual
 * do { } while (0) idiom -- some call sites (e.g. ad_panfs_read.c,
 * ad_panfs_write.c) invoke this macro without a trailing semicolon, and
 * do/while(0) would not compile there. */
#define AD_PANFS_RETRY(_op_,_rc_) \
{ \
    _rc_ = (_op_); \
    while(_rc_ == -1 && errno == EAGAIN) \
    { \
        if(usleep(AD_PANFS_RETRY_DELAY) == -1) \
        { \
            break; \
        } \
        _rc_ = (_op_); \
    } \
}

#endif

Просмотреть файл

@ -0,0 +1,72 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* ad_panfs_hints.c
*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_panfs.h"
#include <pan_fs_client_cw_mode.h>
#include "hint_fns.h"
void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
#if defined(MPICH) || !defined(PRINT_ERR_MSG)
    static char myname[] = "ADIOI_PANFS_SETINFO";
#endif
    int gen_code;

    *error_code = MPI_SUCCESS;

    if (fd->info == MPI_INFO_NULL) {
        /* First call (part of open): create the info object and seed the
         * PanFS default of anticipating concurrent writes. */
        MPI_Info_create(&(fd->info));
        ADIOI_Info_set(fd->info, "panfs_concurrent_write", "1");

        /* Install any user-supplied integer hints.  Each must have the
         * same value on all processes; the check-and-install helper
         * enforces that.  (Historical note: some of these once guarded on
         * a layout_type that was never actually set, so no conditional
         * handling is needed here.) */
        if (users_info != MPI_INFO_NULL) {
            static const char *const int_hints[] = {
                "panfs_concurrent_write",
                "panfs_layout_type",
                "panfs_layout_stripe_unit",
                "panfs_layout_parity_stripe_width",
                "panfs_layout_parity_stripe_depth",
                "panfs_layout_total_num_comps",
                "panfs_layout_visit_policy"
            };
            size_t i;
            for (i = 0; i < sizeof(int_hints) / sizeof(int_hints[0]); i++) {
                ADIOI_Info_check_and_install_int(fd, users_info,
                                                 (char *) int_hints[i],
                                                 NULL, myname, error_code);
            }
        }
    }

    ADIOI_GEN_SetInfo(fd, users_info, &gen_code);
    /* Report the generic-layer result only if nothing above failed. */
    if (*error_code == MPI_SUCCESS)
    {
        *error_code = gen_code;
    }
}

Просмотреть файл

@ -0,0 +1,348 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* ad_panfs_open.c
*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_panfs.h"
#include <string.h>
#include <pan_fs_client_cw_mode.h>
#define TEMP_BUFFER_SIZE 64
void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
{
    char *value;
    int perm, old_mask, amode, flag;
    static char myname[] = "ADIOI_PANFS_OPEN";

    /* Creation permissions: ADIO_PERM_NULL means "derive from the process
     * umask", otherwise use the permissions recorded on the handle. */
    if (fd->perm == ADIO_PERM_NULL) {
        old_mask = umask(022);
        umask(old_mask);
        perm = ~old_mask & 0666;
    }
    else
        perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
    {
        pan_fs_client_layout_agg_type_t layout_type = PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT;
        unsigned long int layout_stripe_unit = 0;
        unsigned long int layout_parity_stripe_width = 0;
        unsigned long int layout_parity_stripe_depth = 0;
        unsigned long int layout_total_num_comps = 0;
        pan_fs_client_layout_visit_t layout_visit_policy = PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN;

        /* NOTE(review): an MPI_Comm_rank() call used to sit here, but the
         * resulting rank was never used (the caller already routes
         * ADIO_CREATE to rank 0 only), so it has been removed. */

        *error_code = MPI_SUCCESS;

        /* Read the layout hints (all unsigned integers); absent hints keep
         * their defaults from above. */
        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
        ADIOI_Info_get(fd->info, "panfs_layout_type", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag) {
            layout_type = strtoul(value,NULL,10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag) {
            layout_stripe_unit = strtoul(value,NULL,10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag) {
            layout_total_num_comps = strtoul(value,NULL,10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag) {
            layout_parity_stripe_width = strtoul(value,NULL,10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag) {
            layout_parity_stripe_depth = strtoul(value,NULL,10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag) {
            layout_visit_policy = strtoul(value,NULL,10);
        }
        ADIOI_Free(value);

        amode = amode | O_CREAT;

        /* Validate the hint combination before attempting creation; an
         * invalid combination is treated as a fatal usage error. */
        if ((layout_type < PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT) ||
            (layout_type > PAN_FS_CLIENT_LAYOUT_TYPE__RAID10))
        {
            FPRINTF(stderr, "%s: panfs_layout_type is not a valid value: %u.\n", myname, layout_type);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if ((layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0) &&
            ((layout_stripe_unit == 0) || (layout_total_num_comps == 0)))
        {
            if(layout_stripe_unit == 0)
            {
                FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID0 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
            }
            if(layout_total_num_comps == 0)
            {
                FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID0 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
            }
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)
        {
            if ((layout_stripe_unit == 0) ||
                (layout_parity_stripe_width == 0) ||
                (layout_parity_stripe_depth == 0) ||
                (layout_total_num_comps == 0))
            {
                if(layout_stripe_unit == 0)
                {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                if(layout_total_num_comps == 0)
                {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                if(layout_parity_stripe_width == 0)
                {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_parity_stripe_width hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                if(layout_parity_stripe_depth == 0)
                {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_parity_stripe_depth hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
            if ((layout_visit_policy < PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN) ||
                (layout_visit_policy > PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN_WITH_HASHED_OFFSET))
            {
                FPRINTF(stderr, "%s: panfs_layout_visit_policy is not a valid value: %u.\n", myname, layout_visit_policy);
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
        }
        if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)
        {
            if ((layout_stripe_unit == 0) || (layout_total_num_comps == 0))
            {
                if(layout_stripe_unit == 0)
                {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID10 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                if(layout_total_num_comps == 0)
                {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID10 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
            if ((layout_visit_policy < PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN) ||
                (layout_visit_policy > PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN_WITH_HASHED_OFFSET))
            {
                FPRINTF(stderr, "%s: panfs_layout_visit_policy is not a valid value: %u.\n", myname, layout_visit_policy);
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
        }
        /* Create the file via ioctl() or open(). ADIOI_PANFS_Open's caller
         * already optimizes performance by only calling this function with
         * ADIO_CREATE on rank 0. Therefore, we don't need to worry about
         * implementing that optimization here. */
        if((layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0) || (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)
            || (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
            pan_fs_client_layout_create_args_t file_create_args;
            int fd_dir;
            char* slash;
            struct stat stat_buf;
            int err;
            char *path;

            /* Check that the file does not exist before
             * trying to create it. The ioctl itself should
             * be able to handle this condition. Currently,
             * the ioctl will return successfully if the file
             * has been previously created. Filed bug 33862
             * to track the problem. */
            err = stat(fd->filename,&stat_buf);
            if((err == -1) && (errno != ENOENT))
            {
                FPRINTF(stderr,"%s: Unexpected I/O Error calling stat() on PanFS file: %s.\n", myname, strerror(errno));
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
            else if (err == 0)
            {
                FPRINTF(stderr,"%s: Cannot create PanFS file with ioctl when file already exists.\n", myname);
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
            else
            {
                /* (err == -1) && (errno == ENOENT): file does not exist.
                 * Split the pathname into parent directory and basename. */
                path = ADIOI_Strdup(fd->filename);
                slash = strrchr(path, '/');
                if (!slash)
                    ADIOI_Strncpy(path, ".", 2);
                else {
                    if (slash == path)
                        *(path + 1) = '\0';
                    else *slash = '\0';
                }
                /* Create the PanFS object through an ioctl on the parent
                 * directory. */
                memset(&file_create_args,0,sizeof(pan_fs_client_layout_create_args_t));
                fd_dir = open(path, O_RDONLY);
                if (fd_dir < 0) {
                    FPRINTF(stderr, "%s: I/O Error opening parent directory to create PanFS file using ioctl: %s.\n", myname, strerror(errno));
                    MPI_Abort(MPI_COMM_WORLD, 1);
                }
                else
                {
                    char *file_name_ptr = fd->filename;
                    slash = strrchr(fd->filename, '/');
                    if (slash)
                    {
                        file_name_ptr = slash + 1;
                    }
                    file_create_args.mode = perm;
                    file_create_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
                    file_create_args.flags = PAN_FS_CLIENT_LAYOUT_CREATE_F__NONE;
                    /* Copy exactly the basename (the original bounded the
                     * copy by the full pathname length, which is longer
                     * than needed). */
                    ADIOI_Strncpy(file_create_args.filename, file_name_ptr, strlen(file_name_ptr)+1);
                    file_create_args.layout.agg_type = layout_type;
                    file_create_args.layout.layout_is_valid = 1;
                    if(layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)
                    {
                        file_create_args.layout.u.raid1_5_parity_stripe.total_num_comps = layout_total_num_comps;
                        file_create_args.layout.u.raid1_5_parity_stripe.parity_stripe_width = layout_parity_stripe_width;
                        file_create_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth = layout_parity_stripe_depth;
                        file_create_args.layout.u.raid1_5_parity_stripe.stripe_unit = layout_stripe_unit;
                        file_create_args.layout.u.raid1_5_parity_stripe.layout_visit_policy = layout_visit_policy;
                    }
                    else if(layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0)
                    {
                        file_create_args.layout.u.raid0.total_num_comps = layout_total_num_comps;
                        file_create_args.layout.u.raid0.stripe_unit = layout_stripe_unit;
                    }
                    else if(layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)
                    {
                        file_create_args.layout.u.raid10.total_num_comps = layout_total_num_comps;
                        file_create_args.layout.u.raid10.stripe_unit = layout_stripe_unit;
                        file_create_args.layout.u.raid10.layout_visit_policy = layout_visit_policy;
                    }
                    err = ioctl(fd_dir, PAN_FS_CLIENT_LAYOUT_CREATE_FILE, &file_create_args);
                    if (err < 0) {
                        FPRINTF(stderr, "%s: I/O Error doing ioctl on parent directory to create PanFS file using ioctl: %s.\n", myname, strerror(errno));
                        MPI_Abort(MPI_COMM_WORLD, 1);
                    }
                    err = close(fd_dir);
                }
                ADIOI_Free(path);
            }
        }
        else
        {
            /* No explicit layout requested: a plain open(O_CREAT) suffices. */
            int create_fd = open(fd->filename,amode,perm);
            if(create_fd != -1)
            {
                close(create_fd);
            }
            else
            {
                FPRINTF(stderr, "%s: I/O Error creating PanFS file using open: %s.\n", myname, strerror(errno));
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
        }
    }

    /* Translate the ADIO access mode into open(2) flags. */
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

    /* Honor the panfs_concurrent_write hint with the PanFS-specific
     * O_CONCURRENT_WRITE open flag. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
                   value, &flag);
    if (flag) {
        unsigned long int concurrent_write = strtoul(value,NULL,10);
        if(concurrent_write == 1)
        {
            amode = amode | O_CONCURRENT_WRITE;
        }
    }
    ADIOI_Free(value);

    fd->fd_sys = open(fd->filename, amode, perm);
    fd->fd_direct = -1;

    /* Query the actual layout and publish it back into the info object so
     * users can see what they got. */
    if (fd->fd_sys != -1)
    {
        int rc;
        char temp_buffer[TEMP_BUFFER_SIZE];
        pan_fs_client_layout_query_args_t file_query_args;
        memset(&file_query_args,0,sizeof(pan_fs_client_layout_query_args_t));
        file_query_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
        rc = ioctl(fd->fd_sys, PAN_FS_CLIENT_LAYOUT_QUERY_FILE, &file_query_args);
        if (rc < 0)
        {
            /* Query failed - report the layout type as unknown. */
            ADIOI_Info_set(fd->info, "panfs_layout_type", "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
        }
        else
        {
            ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.agg_type);
            ADIOI_Info_set(fd->info, "panfs_layout_type", temp_buffer);
            if (file_query_args.layout.layout_is_valid == 1)
            {
                switch (file_query_args.layout.agg_type)
                {
                    case PAN_FS_CLIENT_LAYOUT_TYPE__RAID0:
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid0.stripe_unit);
                        ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid0.total_num_comps);
                        ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
                        break;
                    case PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE:
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.stripe_unit);
                        ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_width);
                        ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer);
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth);
                        ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer);
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.total_num_comps);
                        ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid1_5_parity_stripe.layout_visit_policy);
                        ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
                        break;
                    case PAN_FS_CLIENT_LAYOUT_TYPE__RAID10:
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.stripe_unit);
                        ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.total_num_comps);
                        ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
                        ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.layout_visit_policy);
                        ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
                        break;
                    case PAN_FS_CLIENT_LAYOUT_TYPE__INVALID:
                    case PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT:
                        /* CONSISTENCY FIX: use ADIOI_Info_set like every
                         * other branch (was MPI_Info_set), and break
                         * explicitly instead of falling into default. */
                        ADIOI_Info_set(fd->info, "panfs_layout_type",
                                       "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
                        break;
                    default:
                        break;
                }
            }
        }
    }

    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if (fd->fd_sys == -1) {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,68 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_panfs.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
/* Contiguous read for PanFS: plain POSIX lseek/read with EAGAIN retry.
 * In ADIO_INDIVIDUAL mode the caller's offset is ignored in favor of the
 * individual file pointer, which is advanced by the bytes actually read.
 * On error, *error_code gets an MPI I/O error and fd->fp_sys_posn is
 * invalidated (-1). */
void ADIOI_PANFS_ReadContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status,
                            int *error_code)
{
    MPI_Count err = -1, datatype_size, len;
    static char myname[] = "ADIOI_PANFS_READCONTIG";

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind;
    }

    /* Seek only when the system file position is stale. */
    if (fd->fp_sys_posn != offset) {
        err = lseek(fd->fd_sys, offset, SEEK_SET);
        /* --BEGIN ERROR HANDLING-- */
        if (err == -1) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
            fd->fp_sys_posn = -1;
            return;
        }
        /* --END ERROR HANDLING-- */
    }

    AD_PANFS_RETRY(read(fd->fd_sys, buf, len),err)
    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_IO, "**io",
                                           "**io %s", strerror(errno));
        fd->fp_sys_posn = -1;
        return;
    }
    /* --END ERROR HANDLING-- */

    fd->fp_sys_posn = offset + err;
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        fd->fp_ind += err;
    }
#ifdef HAVE_STATUS_SET_BYTES
    /* FIX: guard against a NULL status, matching ADIOI_PANFS_WriteContig. */
    if (err != -1 && status) MPIR_Status_set_bytes(status, datatype, err);
#endif
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,49 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2004 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_panfs.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
/* Collective resize for PanFS.  Rank 0 performs the ftruncate and then
 * releases the other ranks via a barrier; the other ranks wait on the
 * barrier and then fstat to verify they observe the new size (a mismatch
 * indicates a client-cache coherency problem and aborts). */
void ADIOI_PANFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
{
    int err;
    int myrank;
    struct stat stat_buf;
    static char myname[] = "ADIOI_PANFS_RESIZE";

    MPI_Comm_rank(fd->comm, &myrank);
    if (!myrank)
    {
        AD_PANFS_RETRY(ftruncate(fd->fd_sys,size),err);
        MPI_Barrier(fd->comm);
    }
    else
    {
        MPI_Barrier(fd->comm);
        AD_PANFS_RETRY(fstat(fd->fd_sys,&stat_buf),err);
        if(((ADIO_Offset)stat_buf.st_size) != size)
        {
            /* This should never happen otherwise there is a coherency problem. */
            /* FIX: both %llu arguments are now explicitly cast to unsigned
             * long long; passing ADIO_Offset uncast to %llu is undefined
             * behavior when the types differ. */
            FPRINTF(stderr, "%s: Rank %d: Resize failed: requested=%llu actual=%llu.\n",
                    myname, myrank, (unsigned long long)size,
                    (unsigned long long)stat_buf.st_size);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }
    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", strerror(errno));
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,68 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2004 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_panfs.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
/* Contiguous write for PanFS: POSIX lseek/write with EAGAIN retry.  In
 * ADIO_INDIVIDUAL mode the caller's offset is ignored in favor of the
 * individual file pointer, which is advanced by the bytes written. */
void ADIOI_PANFS_WriteContig(ADIO_File fd, const void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Status *status,
                             int *error_code)
{
    static char myname[] = "ADIOI_PANFS_WRITECONTIG";
    MPI_Count rc = -1;
    MPI_Count dtype_size, nbytes;

    MPI_Type_size_x(datatype, &dtype_size);
    nbytes = dtype_size * count;

    if (file_ptr_type == ADIO_INDIVIDUAL)
        offset = fd->fp_ind;

    /* Seek only when the system file position is stale. */
    if (fd->fp_sys_posn != offset) {
        rc = lseek(fd->fd_sys, offset, SEEK_SET);
        /* --BEGIN ERROR HANDLING-- */
        if (rc == -1) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
            fd->fp_sys_posn = -1;
            return;
        }
        /* --END ERROR HANDLING-- */
    }

    AD_PANFS_RETRY(write(fd->fd_sys, buf, nbytes),rc)
    /* --BEGIN ERROR HANDLING-- */
    if (rc == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_IO, "**io",
                                           "**io %s", strerror(errno));
        fd->fp_sys_posn = -1;
        return;
    }
    /* --END ERROR HANDLING-- */

    fd->fp_sys_posn = offset + rc;
    if (file_ptr_type == ADIO_INDIVIDUAL)
        fd->fp_ind += rc;

#ifdef HAVE_STATUS_SET_BYTES
    if (rc != -1 && status)
        MPIR_Status_set_bytes(status, datatype, rc);
#endif
    *error_code = MPI_SUCCESS;
}

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше